diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 47be8b85388..4c3288df842 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -151,10 +151,10 @@ if(GPU_API STREQUAL "CUDA") endif() cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS ${CUDA_REQUEST_PIC} - -DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DNV_KERNEL -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES}) + -DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -allow-unsupported-compiler -DNV_KERNEL -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES}) cuda_compile(GPU_OBJS ${GPU_LIB_CUDPP_CU} OPTIONS ${CUDA_REQUEST_PIC} - -DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES}) + -DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -allow-unsupported-compiler -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING} -DLAMMPS_${LAMMPS_SIZES}) foreach(CU_OBJ ${GPU_GEN_OBJS}) get_filename_component(CU_NAME ${CU_OBJ} NAME_WE) diff --git a/cmake/presets/gpu-cuda.cmake b/cmake/presets/gpu-cuda.cmake new file mode 100644 index 00000000000..2ac6bd9ea65 --- /dev/null +++ b/cmake/presets/gpu-cuda.cmake @@ -0,0 +1,11 @@ +# preset that enables GPU and selects CUDA API + +set(PKG_GPU ON CACHE BOOL "Build GPU package" FORCE) +set(GPU_API "cuda" CACHE STRING "APU used by GPU package" FORCE) +set(GPU_PREC "mixed" CACHE STRING "" FORCE) + +set(CUDA_NVCC_FLAGS "-allow-unsupported-compiler" CACHE STRING "" FORCE) +set(CUDA_NVCC_FLAGS_DEBUG "-allow-unsupported-compiler" CACHE STRING "" FORCE) +set(CUDA_NVCC_FLAGS_MINSIZEREL "-allow-unsupported-compiler" CACHE STRING "" FORCE) +set(CUDA_NVCC_FLAGS_RELWITHDEBINFO "-allow-unsupported-compiler" CACHE STRING "" FORCE) +set(CUDA_NVCC_FLAGS_RELEASE "-allow-unsupported-compiler" CACHE STRING "" FORCE) diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index 828f0b10d94..e7761e7bee7 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -87,7 +87,7 @@ OPT. * :doc:`coul/long/soft (o) ` * :doc:`coul/msm (o) ` * :doc:`coul/slater/cut ` - * :doc:`coul/slater/long ` + * :doc:`coul/slater/long (g) ` * :doc:`coul/shield ` * :doc:`coul/streitz ` * :doc:`coul/tt ` @@ -110,7 +110,7 @@ OPT. * :doc:`eam/he ` * :doc:`edip (o) ` * :doc:`edip/multi ` - * :doc:`edpd ` + * :doc:`edpd (g) ` * :doc:`eff/cut ` * :doc:`eim (o) ` * :doc:`exp6/rx (k) ` @@ -158,14 +158,14 @@ OPT. * :doc:`lj/cut (gikot) ` * :doc:`lj/cut/coul/cut (gko) ` * :doc:`lj/cut/coul/cut/dielectric (o) ` - * :doc:`lj/cut/coul/cut/soft (o) ` + * :doc:`lj/cut/coul/cut/soft (go) ` * :doc:`lj/cut/coul/debye (gko) ` * :doc:`lj/cut/coul/debye/dielectric (o) ` * :doc:`lj/cut/coul/dsf (gko) ` * :doc:`lj/cut/coul/long (gikot) ` * :doc:`lj/cut/coul/long/cs ` * :doc:`lj/cut/coul/long/dielectric (o) ` - * :doc:`lj/cut/coul/long/soft (o) ` + * :doc:`lj/cut/coul/long/soft (go) ` * :doc:`lj/cut/coul/msm (go) ` * :doc:`lj/cut/coul/msm/dielectric ` * :doc:`lj/cut/coul/wolf (o) ` @@ -202,7 +202,7 @@ OPT. * :doc:`lubricate/poly (o) ` * :doc:`lubricateU ` * :doc:`lubricateU/poly ` - * :doc:`mdpd ` + * :doc:`mdpd (g) ` * :doc:`mdpd/rhosum ` * :doc:`meam (k) ` * :doc:`meam/ms (k) ` @@ -268,11 +268,11 @@ OPT. * :doc:`smtbq ` * :doc:`snap (ik) ` * :doc:`soft (go) ` - * :doc:`sph/heatconduction ` + * :doc:`sph/heatconduction (g) ` * :doc:`sph/idealgas ` - * :doc:`sph/lj ` + * :doc:`sph/lj (g) ` * :doc:`sph/rhosum ` - * :doc:`sph/taitwater ` + * :doc:`sph/taitwater (g) ` * :doc:`sph/taitwater/morris ` * :doc:`spin/dipole/cut ` * :doc:`spin/dipole/long ` diff --git a/doc/src/pair_coul_slater.rst b/doc/src/pair_coul_slater.rst index 443de4262b8..bde14276db6 100644 --- a/doc/src/pair_coul_slater.rst +++ b/doc/src/pair_coul_slater.rst @@ -1,6 +1,7 @@ .. index:: pair_style coul/slater .. index:: pair_style coul/slater/cut .. index:: pair_style coul/slater/long +.. index:: pair_style coul/slater/long/gpu pair_style coul/slater command ============================== @@ -11,6 +12,8 @@ pair_style coul/slater/cut command pair_style coul/slater/long command =================================== +Accelerator Variants: *coul/slater/long/gpu* + Syntax """""" diff --git a/doc/src/pair_fep_soft.rst b/doc/src/pair_fep_soft.rst index 400ad0cc4a4..20e17ce0b42 100644 --- a/doc/src/pair_fep_soft.rst +++ b/doc/src/pair_fep_soft.rst @@ -1,8 +1,10 @@ .. index:: pair_style lj/cut/soft .. index:: pair_style lj/cut/soft/omp .. index:: pair_style lj/cut/coul/cut/soft +.. index:: pair_style lj/cut/coul/cut/soft/gpu .. index:: pair_style lj/cut/coul/cut/soft/omp .. index:: pair_style lj/cut/coul/long/soft +.. index:: pair_style lj/cut/coul/long/soft/gpu .. index:: pair_style lj/cut/coul/long/soft/omp .. index:: pair_style lj/cut/tip4p/long/soft .. index:: pair_style lj/cut/tip4p/long/soft/omp @@ -27,12 +29,12 @@ Accelerator Variants: *lj/cut/soft/omp* pair_style lj/cut/coul/cut/soft command ======================================= -Accelerator Variants: *lj/cut/coul/cut/soft/omp* +Accelerator Variants: *lj/cut/coul/cut/soft/gpu*, *lj/cut/coul/cut/soft/omp* pair_style lj/cut/coul/long/soft command ======================================== -Accelerator Variants: *lj/cut/coul/long/soft/omp* +Accelerator Variants: *lj/cut/coul/long/soft/gpu*, *lj/cut/coul/long/soft/omp* pair_style lj/cut/tip4p/long/soft command ========================================= diff --git a/doc/src/pair_mesodpd.rst b/doc/src/pair_mesodpd.rst index 5d244f3b1d1..28a398754ff 100644 --- a/doc/src/pair_mesodpd.rst +++ b/doc/src/pair_mesodpd.rst @@ -1,14 +1,20 @@ .. index:: pair_style edpd +.. index:: pair_style edpd/gpu .. index:: pair_style mdpd +.. index:: pair_style mdpd/gpu .. index:: pair_style mdpd/rhosum .. index:: pair_style tdpd pair_style edpd command ======================= +Accelerator Variants: *edpd/gpu* + pair_style mdpd command ======================= +Accelerator Variants: *mdpd/gpu* + pair_style mdpd/rhosum command ============================== diff --git a/doc/src/pair_sph_heatconduction.rst b/doc/src/pair_sph_heatconduction.rst index 4716ed54fb5..e9004cb5a48 100644 --- a/doc/src/pair_sph_heatconduction.rst +++ b/doc/src/pair_sph_heatconduction.rst @@ -1,8 +1,11 @@ .. index:: pair_style sph/heatconduction +.. index:: pair_style sph/heatconduction/gpu pair_style sph/heatconduction command ===================================== +Accelerator Variants: *sph/heatconduction/gpu* + Syntax """""" diff --git a/doc/src/pair_sph_lj.rst b/doc/src/pair_sph_lj.rst index b5c02c41ff4..5ac7ab9c6b2 100644 --- a/doc/src/pair_sph_lj.rst +++ b/doc/src/pair_sph_lj.rst @@ -1,8 +1,11 @@ .. index:: pair_style sph/lj +.. index:: pair_style sph/lj/gpu pair_style sph/lj command ========================= +Accelerator Variants: *sph/lj/gpu* + Syntax """""" diff --git a/doc/src/pair_sph_taitwater.rst b/doc/src/pair_sph_taitwater.rst index 34eb65f0051..79972660c42 100644 --- a/doc/src/pair_sph_taitwater.rst +++ b/doc/src/pair_sph_taitwater.rst @@ -1,8 +1,11 @@ .. index:: pair_style sph/taitwater +.. index:: pair_style sph/taitwater/gpu pair_style sph/taitwater command ================================ +Accelerator Variants: *sph/taitwater/gpu* + Syntax """""" diff --git a/examples/PACKAGES/dpd-meso/mdpd/in.mdpd b/examples/PACKAGES/dpd-meso/mdpd/in.mdpd index b0740c82276..2c740f41275 100644 --- a/examples/PACKAGES/dpd-meso/mdpd/in.mdpd +++ b/examples/PACKAGES/dpd-meso/mdpd/in.mdpd @@ -16,6 +16,7 @@ neighbor 0.3 bin neigh_modify every 1 delay 0 check yes atom_style mdpd +comm_modify vel yes region mdpd block -25 25 -10 10 -10 10 units box create_box 1 mdpd diff --git a/lib/gpu/Makefile.linux_multi b/lib/gpu/Makefile.linux_multi index 3299bbec3a4..005f6590794 100644 --- a/lib/gpu/Makefile.linux_multi +++ b/lib/gpu/Makefile.linux_multi @@ -65,7 +65,7 @@ CUDA_PRECISION = -D_SINGLE_DOUBLE CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs -CUDA_OPTS = -DUNIX -O3 --use_fast_math $(LMP_INC) -Xcompiler -fPIC +CUDA_OPTS = -DUNIX -O3 --use_fast_math $(LMP_INC) -Xcompiler -fPIC -allow-unsupported-compiler CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC -std=c++11 CUDR_OPTS = -O2 $(LMP_INC) # -xHost -no-prec-div -ansi-alias diff --git a/lib/gpu/geryon/ocl_mat.h b/lib/gpu/geryon/ocl_mat.h index 3135594dc3d..66ca6ab5275 100644 --- a/lib/gpu/geryon/ocl_mat.h +++ b/lib/gpu/geryon/ocl_mat.h @@ -54,6 +54,6 @@ namespace ucl_opencl { #include "ucl_print.h" #undef UCL_PRINT_ALLOW -} // namespace ucl_cudart +} // namespace ucl_opencl #endif diff --git a/lib/gpu/lal_amoeba.cpp b/lib/gpu/lal_amoeba.cpp index 5e199979135..805c4c4b26b 100644 --- a/lib/gpu/lal_amoeba.cpp +++ b/lib/gpu/lal_amoeba.cpp @@ -281,13 +281,7 @@ int AmoebaT::polar_real(const int eflag, const int vflag) { const int BX=this->block_size(); const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - /* - const int cus = this->device->gpu->cus(); - while (GX < cus && GX > 1) { - BX /= 2; - GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - } - */ + this->time_pair.start(); // Build the short neighbor list if not done yet diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index e103699d40b..0ddd24d21ed 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -56,7 +56,8 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_name, const int onetype) { + const char *k_name, const int onetype, + const int extra_fields) { screen=_screen; int gpu_nbor=0; @@ -75,7 +76,8 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, bool charge = false; bool rot = false; bool vel = true; - int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel); + _extra_fields = extra_fields; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4); if (success!=0) return success; diff --git a/lib/gpu/lal_base_dpd.h b/lib/gpu/lal_base_dpd.h index 9eb56993afa..64ec725d95a 100644 --- a/lib/gpu/lal_base_dpd.h +++ b/lib/gpu/lal_base_dpd.h @@ -53,7 +53,7 @@ class BaseDPD { const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, const void *pair_program, const char *k_name, - const int onetype=0); + const int onetype=0, const int extra_fields=0); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(); @@ -167,7 +167,6 @@ class BaseDPD { /// Atom Data Atom *atom; - // ------------------------ FORCE/ENERGY DATA ----------------------- Answer *ans; @@ -199,7 +198,7 @@ class BaseDPD { protected: bool _compiled; - int _block_size, _threads_per_atom, _onetype; + int _block_size, _threads_per_atom, _onetype, _extra_fields; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; diff --git a/lib/gpu/lal_base_sph.cpp b/lib/gpu/lal_base_sph.cpp new file mode 100644 index 00000000000..f373c0ebb6b --- /dev/null +++ b/lib/gpu/lal_base_sph.cpp @@ -0,0 +1,362 @@ +/*************************************************************************** + base_sph.cpp + ------------------- + Trung Nguyen (U Chicago) + + Base class for SPH pair styles needing per-particle data for position, + velocity, and type. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include "lal_base_sph.h" +namespace LAMMPS_AL { +#define BaseSPHT BaseSPH + +extern Device global_device; + +template +BaseSPHT::BaseSPH() : _compiled(false), _max_bytes(0) { + device=&global_device; + ans=new Answer(); + nbor=new Neighbor(); + pair_program=nullptr; + ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif +} + +template +BaseSPHT::~BaseSPH() { + delete ans; + delete nbor; + k_pair_fast.clear(); + k_pair.clear(); + if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif +} + +template +int BaseSPHT::bytes_per_atom_atomic(const int max_nbors) const { + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); +} + +template +int BaseSPHT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name, const int onetype, + const int extra_fields) { + screen=_screen; + + int gpu_nbor=0; + if (device->gpu_mode()==Device::GPU_NEIGH) + gpu_nbor=1; + else if (device->gpu_mode()==Device::GPU_HYB_NEIGH) + gpu_nbor=2; + + int _gpu_host=0; + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); + if (host_nlocal>0) + _gpu_host=1; + + _threads_per_atom=device->threads_per_atom(); + + bool charge = false; + bool rot = false; + bool vel = true; + _extra_fields = extra_fields; + int success=device->init(*ans,charge,rot,nlocal,nall,maxspecial,vel,_extra_fields/4); + if (success!=0) + return success; + + if (ucl_device!=device->gpu) _compiled=false; + + ucl_device=device->gpu; + atom=&device->atom; + + _block_size=device->pair_block_size(); + compile_kernels(*ucl_device,pair_program,k_name,onetype); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; + + // Initialize host-device load balancer + hd_balancer.init(device,gpu_nbor,gpu_split); + + // Initialize timers for the selected GPU + time_pair.init(*ucl_device); + time_pair.zero(); + + pos_tex.bind_float(atom->x,4); + vel_tex.bind_float(atom->v,4); + + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + + return success; +} + +template +void BaseSPHT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +} + +template +void BaseSPHT::clear_atomic() { + // Output any timing information + acc_timers(); + double avg_split=hd_balancer.all_avg_split(); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); + + time_pair.clear(); + hd_balancer.clear(); + + nbor->clear(); + ans->clear(); +} + +// --------------------------------------------------------------------------- +// Copy neighbor list from host +// --------------------------------------------------------------------------- +template +int * BaseSPHT::reset_nbors(const int nall, const int inum, int *ilist, + int *numj, int **firstneigh, bool &success) { + success=true; + + int mn=nbor->max_nbor_loop(inum,numj,ilist); + resize_atom(inum,nall,success); + resize_local(inum,mn,success); + if (!success) + return nullptr; + + nbor->get_host(inum,ilist,numj,firstneigh,block_size()); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + + return ilist; +} + +// --------------------------------------------------------------------------- +// Build neighbor list on device +// --------------------------------------------------------------------------- +template +inline void BaseSPHT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + bool &success) { + success=true; + resize_atom(inum,nall,success); + resize_local(inum,host_inum,nbor->max_nbors(),success); + if (!success) + return; + atom->cast_copy_x(host_x,host_type); + + int mn; + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); + + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; +} + +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. +// --------------------------------------------------------------------------- +template +void BaseSPHT::compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const int nlocal) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return; + } + + int ago=hd_balancer.ago_first(f_ago); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); + host_start=inum; + + if (ago==0) { + reset_nbors(nall, inum, ilist, numj, firstneigh, success); + if (!success) + return; + } + + atom->cast_x_data(host_x,host_type); + atom->cast_v_data(host_v,tag); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + atom->add_v_data(host_v,tag); + + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); +} + +// --------------------------------------------------------------------------- +// Reneighbor on GPU if necessary and then compute forces, virials, energies +// --------------------------------------------------------------------------- +template +int** BaseSPHT::compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v) { + acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); + if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); + zero_timers(); + return nullptr; + } + + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); + host_start=inum; + + // Build neighbor list on GPU if necessary + if (ago==0) { + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + sublo, subhi, tag, nspecial, special, success); + if (!success) + return nullptr; + atom->cast_v_data(host_v,tag); + hd_balancer.start_timer(); + } else { + atom->cast_x_data(host_x,host_type); + atom->cast_v_data(host_v,tag); + hd_balancer.start_timer(); + atom->add_x_data(host_x,host_type); + } + atom->add_v_data(host_v,tag); + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); + device->add_ans_object(ans); + hd_balancer.stop_timer(); + + return nbor->host_jlist.begin()-host_start; +} + +template +double BaseSPHT::host_memory_usage_atomic() const { + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(BaseSPH); +} + +template +void BaseSPHT::compile_kernels(UCL_Device &dev, const void *pair_str, + const char *kname, const int onetype) { + if (_compiled && _onetype==onetype) + return; + + _onetype=onetype; + + std::string s_fast=std::string(kname)+"_fast"; + if (pair_program) delete pair_program; + pair_program=new UCL_Program(dev); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_fast.set_function(*pair_program,s_fast.c_str()); + k_pair.set_function(*pair_program,kname); + pos_tex.get_texture(*pair_program,"pos_tex"); + vel_tex.get_texture(*pair_program,"vel_tex"); + + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + if (pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.has_subgroup_support()) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > (int)mx_subgroup_sz) _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + +} + +template class BaseSPH; +} diff --git a/lib/gpu/lal_base_sph.h b/lib/gpu/lal_base_sph.h new file mode 100644 index 00000000000..e1e57315732 --- /dev/null +++ b/lib/gpu/lal_base_sph.h @@ -0,0 +1,209 @@ +/*************************************************************************** + base_sph.h + ------------------- + Trung Nguyen (U Chicago) + + Base class for SPH pair styles needing per-particle data for position, + velocity, and type. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#ifndef LAL_BASE_SPH_H +#define LAL_BASE_DPD_H + +#include "lal_device.h" +#include "lal_balance.h" +#include "mpi.h" + +#ifdef USE_OPENCL +#include "geryon/ocl_texture.h" +#elif defined(USE_HIP) +#include "geryon/hip_texture.h" +#else +#include "geryon/nvd_texture.h" +#endif + +namespace LAMMPS_AL { + +template +class BaseSPH { + public: + BaseSPH(); + virtual ~BaseSPH(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * \param k_name name for the kernel for force calculation + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + const void *pair_program, const char *k_name, + const int onetype=0, const int extra_fields=0); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(); + + /// Check if there is enough storage for atom arrays and realloc if not + /** \param success set to false if insufficient memory **/ + inline void resize_atom(const int inum, const int nall, bool &success) { + if (atom->resize(nall, success)) { + pos_tex.bind_float(atom->x,4); + vel_tex.bind_float(atom->v,4); + } + ans->resize(inum,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note olist_size=total number of local particles **/ + inline void resize_local(const int inum, const int max_nbors, bool &success) { + nbor->resize(inum,max_nbors,success); + } + + /// Check if there is enough storage for neighbors and realloc if not + /** \param nlocal number of particles whose nbors must be stored on device + * \param host_inum number of particles whose nbors need to copied to host + * \param current maximum number of neighbors + * \note host_inum is 0 if the host is performing neighboring + * \note nlocal+host_inum=total number local particles + * \note olist_size=0 **/ + inline void resize_local(const int inum, const int host_inum, + const int max_nbors, bool &success) { + nbor->resize(inum,host_inum,max_nbors,success); + } + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear_atomic(); + + /// Returns memory usage on device per atom + int bytes_per_atom_atomic(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage_atomic() const; + + /// Accumulate timers + inline void acc_timers() { + if (device->time_device()) { + nbor->acc_timers(screen); + time_pair.add_to_total(); + atom->acc_timers(); + ans->acc_timers(); + } + } + + /// Zero timers + inline void zero_timers() { + time_pair.zero(); + atom->zero_timers(); + ans->zero_timers(); + } + + /// Copy neighbor list from host + int * reset_nbors(const int nall, const int inum, int *ilist, int *numj, + int **firstneigh, bool &success); + + /// Build neighbor list on device + void build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, bool &success); + + /// Pair loop with host neighboring + void compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **v, const int nlocal); + + /// Pair loop with device neighboring + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success, + double **v); + + // -------------------------- DEVICE DATA ------------------------- + + /// Device Properties and Atom and Neighbor storage + Device *device; + + /// Geryon device + UCL_Device *ucl_device; + + /// Device Timers + UCL_Timer time_pair; + + /// Host device load balancer + Balance hd_balancer; + + /// LAMMPS pointer for screen output + FILE *screen; + + // --------------------------- ATOM DATA -------------------------- + + /// Atom Data + Atom *atom; + + // ------------------------ FORCE/ENERGY DATA ----------------------- + + Answer *ans; + + // --------------------------- NBOR DATA ---------------------------- + + /// Neighbor data + Neighbor *nbor; + + // ------------------------- DEVICE KERNELS ------------------------- + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; + inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } + + + // --------------------------- TEXTURES ----------------------------- + UCL_Texture pos_tex; + UCL_Texture vel_tex; + + // ------------------------- COMMON VARS ---------------------------- + + protected: + bool _compiled; + int _block_size, _threads_per_atom, _onetype, _extra_fields; + double _max_bytes, _max_an_bytes; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec *_nbor_data; + + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *k, const int onetype); + virtual int loop(const int eflag, const int vflag) = 0; +}; + +} + +#endif diff --git a/lib/gpu/lal_coul_slater_long.cpp b/lib/gpu/lal_coul_slater_long.cpp new file mode 100644 index 00000000000..42eb86e8ffb --- /dev/null +++ b/lib/gpu/lal_coul_slater_long.cpp @@ -0,0 +1,150 @@ +/*************************************************************************** + coul_slater_long_ext.cpp + ------------------------ + Trung Nguyen (U Chicago) + + Class for acceleration of the coul/slater/long pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "coul_slater_long_cl.h" +#elif defined(USE_CUDART) +const char *coul_slater_long=0; +#else +#include "coul_slater_long_cubin.h" +#endif + +#include "lal_coul_slater_long.h" +#include +namespace LAMMPS_AL { +#define CoulSlaterLongT CoulSlaterLong + +extern Device pair_gpu_device; + +template +CoulSlaterLongT::CoulSlaterLong() : BaseCharge(), _allocated(false) { +} + +template +CoulSlaterLongT::~CoulSlaterLong() { + clear(); +} + +template +int CoulSlaterLongT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int CoulSlaterLongT::init(const int ntypes, double **host_scale, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald, double lamda) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,coul_slater_long,"k_coul_slater_long"); + if (success!=0) + return success; + + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,scale,host_write,host_scale); + + sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_coul[i]; + } + ucl_copy(sp_cl,host_write,4,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _g_ewald=g_ewald; + _lamda=lamda; + + _allocated=true; + this->_max_bytes=scale.row_bytes()+sp_cl.row_bytes(); + return 0; +} + +template +void CoulSlaterLongT::reinit(const int ntypes, double **host_scale) { + UCL_H_Vec hscale(_lj_types*_lj_types,*(this->ucl_device), + UCL_WRITE_ONLY); + this->atom->type_pack1(ntypes,_lj_types,scale,hscale,host_scale); +} + +template +void CoulSlaterLongT::clear() { + if (!_allocated) + return; + _allocated=false; + + scale.clear(); + sp_cl.clear(); + this->clear_atomic(); +} + +template +double CoulSlaterLongT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(CoulSlaterLong); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int CoulSlaterLongT::loop(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &scale, &sp_cl, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + &this->atom->q, &_cut_coulsq, &_qqrd2e, &_g_ewald, + &_lamda, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &scale, &_lj_types, &sp_cl, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->q, &_cut_coulsq, + &_qqrd2e, &_g_ewald, &_lamda, &this->_threads_per_atom); + } + this->time_pair.stop(); + return GX; +} + +template class CoulSlaterLong; +} diff --git a/lib/gpu/lal_coul_slater_long.cu b/lib/gpu/lal_coul_slater_long.cu new file mode 100644 index 00000000000..1fc8ab8be4e --- /dev/null +++ b/lib/gpu/lal_coul_slater_long.cu @@ -0,0 +1,250 @@ +// ************************************************************************** +// coul_slater_long.cu +// ------------------- +// Trung Nguyen (U Chicago) +// +// Device code for acceleration of the coul/slater/long pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : September 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_coul_slater_long(const __global numtyp4 *restrict x_, + const __global numtyp *restrict scale, + const int lj_types, + const __global numtyp *restrict sp_cl_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const numtyp lamda, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + + sp_cl[0]=sp_cl_in[0]; + sp_cl[1]=sp_cl_in[1]; + sp_cl[2]=sp_cl_in[2]; + sp_cl[3]=sp_cl_in[3]; + + acctyp3 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp e_coul, virial[6]; + if (EVFLAG) { + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + + if (ii (numtyp)0) force -= factor_coul*prefactor*((numtyp)1.0-slater_term); + force *= r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + numtyp e_slater = ((numtyp)1.0 + rlamdainv)*exprlmdainv; + numtyp e = prefactor*(_erfc-e_slater); + if (factor_coul > (numtyp)0) e -= factor_coul*prefactor*((numtyp)1.0 - e_slater); + e_coul += e; + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); +} + +__kernel void k_coul_slater_long_fast(const __global numtyp4 *restrict x_, + const __global numtyp *restrict scale_in, + const __global numtyp *restrict sp_cl_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const numtyp lamda, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + + if (tid<4) + sp_cl[tid]=sp_cl_in[tid]; + if (tid (numtyp)0) force -= factor_coul*prefactor*((numtyp)1.0-slater_term); + force *= r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + numtyp e_slater = ((numtyp)1.0 + rlamdainv)*exprlmdainv; + numtyp e = prefactor*(_erfc-e_slater); + if (factor_coul > (numtyp)0) e -= factor_coul*prefactor*((numtyp)1.0 - e_slater); + e_coul += e; + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); +} + diff --git a/lib/gpu/lal_coul_slater_long.h b/lib/gpu/lal_coul_slater_long.h new file mode 100644 index 00000000000..8950fd81ef1 --- /dev/null +++ b/lib/gpu/lal_coul_slater_long.h @@ -0,0 +1,82 @@ +/*************************************************************************** + coul_slater_long.h + ------------------- + Trung Nguyen (U Chicago) + + Class for acceleration of the coul/slater/long pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#ifndef LAL_Coul_Slater_Long_H +#define LAL_Coul_Slater_Long_H + +#include "lal_base_charge.h" + +namespace LAMMPS_AL { + +template +class CoulSlaterLong : public BaseCharge { + public: + CoulSlaterLong(); + ~CoulSlaterLong(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **scale, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald, const double lamda); + + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **scale); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// scale + UCL_D_Vec scale; + /// Special Coul values [0-3] + UCL_D_Vec sp_cl; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut_coulsq, _qqrd2e, _g_ewald, _lamda; + + protected: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_coul_slater_long_ext.cpp b/lib/gpu/lal_coul_slater_long_ext.cpp new file mode 100644 index 00000000000..8c34cc55529 --- /dev/null +++ b/lib/gpu/lal_coul_slater_long_ext.cpp @@ -0,0 +1,145 @@ +/*************************************************************************** + coul_slater_long_ext.cpp + ------------------------ + Trung Nguyen (U Chicago) + + Functions for LAMMPS access to coul/slater/long acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_coul_slater_long.h" + +using namespace std; +using namespace LAMMPS_AL; + +static CoulSlaterLong CSLMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int csl_gpu_init(const int ntypes, double **host_scale, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald, const double lamda) { + CSLMF.clear(); + gpu_mode=CSLMF.device->gpu_mode(); + double gpu_split=CSLMF.device->particle_split(); + int first_gpu=CSLMF.device->first_device(); + int last_gpu=CSLMF.device->last_device(); + int world_me=CSLMF.device->world_me(); + int gpu_rank=CSLMF.device->gpu_rank(); + int procs_per_gpu=CSLMF.device->procs_per_gpu(); + + CSLMF.device->init_message(screen,"coul/slater/long",first_gpu,last_gpu); + + bool message=false; + if (CSLMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=CSLMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen, host_cut_coulsq, + host_special_coul, qqrd2e, g_ewald, lamda); + + CSLMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + CSLMF.estimate_gpu_overhead(); + return init_ok; +} + +// --------------------------------------------------------------------------- +// Copy updated coeffs from host to device +// --------------------------------------------------------------------------- +void csl_gpu_reinit(const int ntypes, double **host_scale) { + int world_me=CSLMF.device->world_me(); + int gpu_rank=CSLMF.device->gpu_rank(); + int procs_per_gpu=CSLMF.device->procs_per_gpu(); + + if (world_me==0) + CSLMF.reinit(ntypes, host_scale); + + CSLMF.device->world_barrier(); + + for (int i=0; iserialize_init(); + } +} + +void csl_gpu_clear() { + CSLMF.clear(); +} + +int** csl_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CSLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void csl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + CSLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double csl_gpu_bytes() { + return CSLMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 70ba373a653..e9ef2294b2d 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -364,6 +364,12 @@ int DeviceT::init_device(MPI_Comm /*world*/, MPI_Comm replica, const int ngpu, } else _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,_simd_size); + #ifndef LAL_USE_OLD_NEIGHBOR + _use_old_nbor_build = 0; + #else + _use_old_nbor_build = 1; + #endif + return flag; } @@ -510,9 +516,13 @@ int DeviceT::init(Answer &ans, const bool charge, gpu_nbor=1; else if (_gpu_mode==Device::GPU_HYB_NEIGH) gpu_nbor=2; + + // NOTE: enforce the hybrid mode (binning on the CPU) + // when not using sorting on the device #if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT) if (gpu_nbor==1) gpu_nbor=2; #endif + // or when the device supports subgroups #ifndef LAL_USE_OLD_NEIGHBOR if (gpu_nbor==1) gpu_nbor=2; #endif @@ -886,19 +896,31 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, } if (times[5] > 0.0) fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size); - fprintf(screen,"Average split: %.4f.\n",avg_split); - fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom); - fprintf(screen,"Vector width: %d.\n", simd_size()); - fprintf(screen,"Prefetch mode: "); - if (_nbor_prefetch==2) fprintf(screen,"Intrinsics.\n"); - else if (_nbor_prefetch==1) fprintf(screen,"API.\n"); - else fprintf(screen,"None.\n"); - fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); if (nbor.gpu_nbor()==2) fprintf(screen,"CPU Neighbor: %.4f s.\n",times[8]/_replica_size); fprintf(screen,"CPU Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size); + fprintf(screen,"Average split: %.4f.\n",avg_split); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + fprintf(screen,"Prefetch mode: "); + if (_nbor_prefetch==2) fprintf(screen,"Intrinsics.\n"); + else if (_nbor_prefetch==1) fprintf(screen,"API.\n"); + else fprintf(screen,"None.\n"); + fprintf(screen,"Vector width: %d.\n", simd_size()); + fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom); + fprintf(screen,"Pair block: %d.\n",_block_pair); + fprintf(screen,"Neigh block: %d.\n",_block_nbor_build); + if (nbor.gpu_nbor()==2) { + fprintf(screen,"Neigh mode: Hybrid (binning on host)"); + if (_use_old_nbor_build == 1) fprintf(screen," - legacy\n"); + else fprintf(screen," with subgroup support\n"); + } else if (nbor.gpu_nbor()==1) { + fprintf(screen,"Neigh mode: Device"); + if (_use_old_nbor_build == 1) fprintf(screen," - legacy\n"); + else fprintf(screen," - with subgroup support\n"); + } else if (nbor.gpu_nbor()==0) + fprintf(screen,"Neigh mode: Host\n"); fprintf(screen,"-------------------------------------"); fprintf(screen,"--------------------------------\n\n"); diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index ba693e551a5..d6b52484f1a 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -347,6 +347,7 @@ class Device { int _pppm_block, _block_nbor_build, _block_cell_2d, _block_cell_id; int _max_shared_types, _max_bio_shared_types, _pppm_max_spline; int _nbor_prefetch; + int _use_old_nbor_build; UCL_Program *dev_program; UCL_Kernel k_zero, k_info; diff --git a/lib/gpu/lal_edpd.cpp b/lib/gpu/lal_edpd.cpp new file mode 100644 index 00000000000..c03591b9ed5 --- /dev/null +++ b/lib/gpu/lal_edpd.cpp @@ -0,0 +1,285 @@ +/*************************************************************************** + edpd.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the edpd pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "edpd_cl.h" +#elif defined(USE_CUDART) +const char *edpd=0; +#else +#include "edpd_cubin.h" +#endif + +#include "lal_edpd.h" +#include +namespace LAMMPS_AL { +#define EDPDT EDPD + +extern Device device; + +template +EDPDT::EDPD() : BaseDPD(), _allocated(false) { + _max_q_size = 0; +} + +template +EDPDT::~EDPD() { + clear(); +} + +template +int EDPDT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int EDPDT::init(const int ntypes, + double **host_cutsq, double **host_a0, + double **host_gamma, double **host_cut, + double **host_power, double **host_kappa, + double **host_powerT, double **host_cutT, + double ***host_sc, double ***host_kc, double *host_mass, + double *host_special_lj, + const int power_flag, const int kappa_flag, + const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + + int success; + int extra_fields = 4; // round up to accomodate quadruples of numtyp values + // T and cv + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,edpd,"k_edpd",onetype,extra_fields); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_a0,host_gamma, + host_cut); + + coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_power,host_kappa, + host_powerT,host_cutT); + + UCL_H_Vec dview_mass(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < ntypes; i++) + dview_mass[i] = host_mass[i]; + mass.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(mass,dview_mass,false); + + if (host_sc) { + UCL_H_Vec dview(lj_types*lj_types,*(this->ucl_device),UCL_WRITE_ONLY);; + sc.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + int n = 0; + for (int i = 1; i < ntypes; i++) + for (int j = 1; j < ntypes; j++) { + dview[n].x = host_sc[i][j][0]; + dview[n].y = host_sc[i][j][1]; + dview[n].z = host_sc[i][j][2]; + dview[n].w = host_sc[i][j][3]; + n++; + } + ucl_copy(sc,dview,false); + } + + if (host_kc) { + UCL_H_Vec dview(lj_types*lj_types,*(this->ucl_device),UCL_WRITE_ONLY);; + kc.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + int n = 0; + for (int i = 1; i < ntypes; i++) + for (int j = 1; j < ntypes; j++) { + dview[n].x = host_kc[i][j][0]; + dview[n].y = host_kc[i][j][1]; + dview[n].z = host_kc[i][j][2]; + dview[n].w = host_kc[i][j][3]; + n++; + } + ucl_copy(kc,dview,false); + } + + UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), + UCL_WRITE_ONLY); + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq); + + double special_sqrt[4]; + special_sqrt[0] = sqrt(host_special_lj[0]); + special_sqrt[1] = sqrt(host_special_lj[1]); + special_sqrt[2] = sqrt(host_special_lj[2]); + special_sqrt[3] = sqrt(host_special_lj[3]); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + sp_sqrt.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(special_sqrt,4,*(this->ucl_device)); + ucl_copy(sp_sqrt,dview,false); + + _power_flag = power_flag; + _kappa_flag = kappa_flag; + + // allocate per-atom array Q + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_q_size=static_cast(static_cast(ef_nall)*1.10); + Q.alloc(_max_q_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+coeff2.row_bytes()+Q.row_bytes()+ + sc.row_bytes()+kc.row_bytes()+mass.row_bytes()+cutsq.row_bytes()+sp_lj.row_bytes()+sp_sqrt.row_bytes(); + return 0; +} + +template +void EDPDT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + coeff2.clear(); + sc.clear(); + kc.clear(); + Q.clear(); + mass.clear(); + cutsq.clear(); + sp_lj.clear(); + sp_sqrt.clear(); + this->clear_atomic(); +} + +template +double EDPDT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(EDPD); +} + +template +void EDPDT::update_flux(void **flux_ptr) { + *flux_ptr=Q.host.begin(); + Q.update_host(_max_q_size,false); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int EDPDT::loop(const int eflag, const int vflag) { + + int nall = this->atom->nall(); + + // Resize Q array if necessary + if (nall > _max_q_size) { + _max_q_size=static_cast(static_cast(nall)*1.10); + Q.resize(_max_q_size); + } + + // signal that we need to transfer extra data from the host + + this->atom->extra_data_unavail(); + + numtyp4 *pextra=reinterpret_cast(&(this->atom->extra[0])); + + int n = 0; + int nstride = 1; + for (int i = 0; i < nall; i++) { + int idx = n+i*nstride; + numtyp4 v; + v.x = edpd_temp[i]; + v.y = edpd_cv[i]; + v.z = 0; + v.w = 0; + pextra[idx] = v; + } + this->atom->add_extra_data(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, &mass, + &sc, &kc, &sp_lj, &sp_sqrt, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &Q, &eflag, &vflag, + &_power_flag, &_kappa_flag, &ainum, &nbor_pitch, + &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, + &this->_timestep, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, &mass, + &sc, &kc, &_lj_types, &sp_lj, &sp_sqrt, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &Q, &eflag, &vflag, + &_power_flag, &_kappa_flag, &ainum, &nbor_pitch, + &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, + &this->_timestep, &this->_threads_per_atom); + } + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Get the extra data pointers from host +// --------------------------------------------------------------------------- + +template +void EDPDT::get_extra_data(double *host_T, double *host_cv) { + edpd_temp = host_T; + edpd_cv = host_cv; +} + +template class EDPD; +} diff --git a/lib/gpu/lal_edpd.cu b/lib/gpu/lal_edpd.cu new file mode 100644 index 00000000000..9662d15aea3 --- /dev/null +++ b/lib/gpu/lal_edpd.cu @@ -0,0 +1,619 @@ +// ************************************************************************** +// edpd.cu +// ------------------- +// Trung Dac Nguyen (U Chicago) +// +// Device code for acceleration of the edpd pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : September 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( vel_tex,float4); +#else +_texture_2d( pos_tex,int4); +_texture_2d( vel_tex,int4); +#endif +#else +#define pos_tex x_ +#define vel_tex v_ +#endif + +#define EPSILON (numtyp)1.0e-10 + +//#define _USE_UNIFORM_SARU_LCG +//#define _USE_UNIFORM_SARU_TEA8 +//#define _USE_GAUSSIAN_SARU_LCG + +#if !defined(_USE_UNIFORM_SARU_LCG) && !defined(_USE_UNIFORM_SARU_TEA8) && !defined(_USE_GAUSSIAN_SARU_LCG) +#define _USE_UNIFORM_SARU_LCG +#endif + +// References: +// 1. Y. Afshar, F. Schmid, A. Pishevar, S. Worley, Comput. Phys. Comm. 184 (2013), 1119–1128. +// 2. C. L. Phillips, J. A. Anderson, S. C. Glotzer, Comput. Phys. Comm. 230 (2011), 7191-7201. +// PRNG period = 3666320093*2^32 ~ 2^64 ~ 10^19 + +#define LCGA 0x4beb5d59 /* Full period 32 bit LCG */ +#define LCGC 0x2600e1f7 +#define oWeylPeriod 0xda879add /* Prime period 3666320093 */ +#define oWeylOffset 0x8009d14b +#define TWO_N32 0.232830643653869628906250e-9f /* 2^-32 */ + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] +// using the inherent LCG, then multiply u with sqrt(3) to "match" +// with a normal random distribution. +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +// Curly brackets to make variables local to the scope. +#ifdef _USE_UNIFORM_SARU_LCG +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 +// then multiply u with sqrt(3) to "match" with a normal random distribution +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +#ifdef _USE_UNIFORM_SARU_TEA8 +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define k0 0xA341316C +#define k1 0xC8013EA4 +#define k2 0xAD90777D +#define k3 0x7E95761E +#define delta 0x9e3779b9 +#define rounds 8 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int sum = 0; \ + for (int i=0; i < rounds; i++) { \ + sum += delta; \ + state += ((wstate<<4) + k0)^(wstate + sum)^((wstate>>5) + k1); \ + wstate += ((state<<4) + k2)^(state + sum)^((state>>5) + k3); \ + } \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], +// and uses the polar method (Marsaglia's) to transform to a normal random value +// This is used to compared with CPU DPD using RandMars::gaussian() +#ifdef _USE_GAUSSIAN_SARU_LCG +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state=0x12345678; \ + unsigned int wstate=12345678; \ + state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int v, s; \ + numtyp r1, r2, rsq; \ + while (1) { \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r1 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r2 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + rsq = r1 * r1 + r2 * r2; \ + if (rsq < (numtyp)1.0) break; \ + } \ + numtyp fac = ucl_sqrt((numtyp)-2.0*log(rsq)/rsq); \ + randnum = r2*fac; \ +} +#endif + +#if (SHUFFLE_AVAIL == 0) + +#define store_heatflux(Qi, ii, inum, tid, t_per_atom, offset, Q) \ + if (t_per_atom>1) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, Qi); \ + } \ + if (offset==0 && ii1) { \ + simd_reduce_add1(t_per_atom,Qi); \ + } \ + if (offset==0 && ii tag2) { + tag1 = jtag; tag2 = itag; + } + + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + numtyp T_ij=(numtyp)0.5*(Ti+Tj); + numtyp4 T_pow; + T_pow.x = T_ij - (numtyp)1.0; + T_pow.y = T_pow.x*T_pow.x; + T_pow.z = T_pow.x*T_pow.y; + T_pow.w = T_pow.x*T_pow.z; + + numtyp coeff2x = coeff2[mtype].x; //power[itype][jtype] + numtyp coeff2y = coeff2[mtype].y; //kappa[itype][jtype] + numtyp coeff2z = coeff2[mtype].z; //powerT[itype][jtype] + numtyp coeff2w = coeff2[mtype].w; //cutT[itype][jtype] + numtyp power_d = coeff2x; + if (power_flag) { + numtyp factor = (numtyp)1.0; + factor += sc[mtype].x*T_pow.x + sc[mtype].y*T_pow.y + + sc[mtype].z*T_pow.z + sc[mtype].w*T_pow.w; + power_d *= factor; + } + + power_d = MAX((numtyp)0.01,power_d); + numtyp wc = (numtyp)1.0 - r/coeffz; // cut[itype][jtype] + wc = MAX((numtyp)0.0,MIN((numtyp)1.0,wc)); + numtyp wr = ucl_pow(wc, (numtyp)0.5*power_d); + + numtyp kboltz = (numtyp)1.0; + numtyp GammaIJ = coeffy; // gamma[itype][jtype] + numtyp SigmaIJ = (numtyp)4.0*GammaIJ*kboltz*Ti*Tj/(Ti+Tj); + SigmaIJ = ucl_sqrt(SigmaIJ); + + numtyp force = coeffx*T_ij*wc; // a0[itype][jtype] + force -= GammaIJ *wr*wr *dot*rinv; + force += SigmaIJ * wr *randnum * dtinvsqrt; + force *= factor_dpd*rinv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + // heat transfer + + if (r < coeff2w) { + numtyp wrT = (numtyp)1.0 - r/coeff2w; + wrT = MAX((numtyp)0.0,MIN((numtyp)1.0,wrT)); + wrT = ucl_pow(wrT, (numtyp)0.5*coeff2z); // powerT[itype][jtype] + numtyp randnumT = (numtyp)0; + saru(tag1, tag2, seed+tag1+tag2, timestep, randnumT); // randomT->gaussian(); + randnumT = MAX((numtyp)-5.0,MIN(randnum,(numtyp)5.0)); + + numtyp kappaT = coeff2y; // kappa[itype][jtype] + if (kappa_flag) { + numtyp factor = (numtyp)1.0; + factor += kc[mtype].x*T_pow.x + kc[mtype].y*T_pow.y + + kc[mtype].z*T_pow.z + kc[mtype].w*T_pow.w; + kappaT *= factor; + } + + numtyp kij = cvi*cvj*kappaT * T_ij*T_ij; + numtyp alphaij = ucl_sqrt((numtyp)2.0*kboltz*kij); + + numtyp dQc = kij * wrT*wrT * (Tj - Ti)/(Ti*Tj); + numtyp dQd = wr*wr*( GammaIJ * vijeij*vijeij - SigmaIJ*SigmaIJ/mass_itype ) - SigmaIJ * wr *vijeij *randnum; + dQd /= (cvi+cvj); + numtyp dQr = alphaij * wrT * dtinvsqrt * randnumT; + Qi += (dQc + dQd + dQr ); + } + + if (EVFLAG && eflag) { + numtyp e = (numtyp)0.5*coeffx*T_ij*coeffz * wc*wc; + energy+=factor_dpd*e; + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + } // for nbor + } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); + store_heatflux(Qi,ii,inum,tid,t_per_atom,offset,Q); +} + +__kernel void k_edpd_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict mass, + const __global numtyp4 *restrict sc_in, + const __global numtyp4 *restrict kc_in, + const __global numtyp *restrict sp_lj_in, + const __global numtyp *restrict sp_sqrt_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + __global acctyp *restrict Q, + const int eflag, const int vflag, + const int power_flag, const int kappa_flag, + const int inum, const int nbor_pitch, + const __global numtyp4 *restrict v_, + const __global numtyp *restrict cutsq, + const numtyp dtinvsqrt, const int seed, + const int timestep, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + #ifndef ONETYPE + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 sc[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 kc[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + __local numtyp sp_sqrt[4]; + if (tid<4) { + sp_lj[tid]=sp_lj_in[tid]; + sp_sqrt[tid]=sp_sqrt_in[tid]; + } + if (tid tag2) { + tag1 = jtag; tag2 = itag; + } + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + numtyp T_ij=(numtyp)0.5*(Ti+Tj); + numtyp4 T_pow; + T_pow.x = T_ij - (numtyp)1.0; + T_pow.y = T_pow.x*T_pow.x; + T_pow.z = T_pow.x*T_pow.y; + T_pow.w = T_pow.x*T_pow.z; + + numtyp power_d = coeff2x; // power[itype][jtype] + if (power_flag) { + numtyp factor = (numtyp)1.0; + factor += scx*T_pow.x + scy*T_pow.y + scz*T_pow.z + scw*T_pow.w; + power_d *= factor; + } + + power_d = MAX((numtyp)0.01,power_d); + numtyp wc = (numtyp)1.0 - r/coeffz; // cut[itype][jtype] + wc = MAX((numtyp)0.0,MIN((numtyp)1.0,wc)); + numtyp wr = ucl_pow((numtyp)wc, (numtyp)0.5*power_d); + + numtyp kboltz = (numtyp)1.0; + numtyp GammaIJ = coeffy; // gamma[itype][jtype] + numtyp SigmaIJ = (numtyp)4.0*GammaIJ*kboltz*Ti*Tj/(Ti+Tj); + SigmaIJ = ucl_sqrt(SigmaIJ); + + numtyp force = coeffx*T_ij*wc; // a0[itype][jtype] + force -= GammaIJ *wr*wr *dot*rinv; + force += SigmaIJ* wr *randnum * dtinvsqrt; + #ifndef ONETYPE + force *= factor_dpd*rinv; + #else + force *= rinv; + #endif + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + // heat transfer + + if (r < coeff2w) { + numtyp wrT = (numtyp)1.0 - r/coeff2w; + wrT = MAX((numtyp)0.0,MIN((numtyp)1.0,wrT)); + wrT = ucl_pow(wrT, (numtyp)0.5*coeff2z); // powerT[itype][jtype] + numtyp randnumT = (numtyp)0; + saru(tag1, tag2, seed+tag1+tag2, timestep, randnumT); // randomT->gaussian(); + randnumT = MAX((numtyp)-5.0,MIN(randnum,(numtyp)5.0)); + + numtyp kappaT = coeff2y; // kappa[itype][jtype] + if (kappa_flag) { + numtyp factor = (numtyp)1.0; + factor += kcx*T_pow.x + kcy*T_pow.y + kcz*T_pow.z + kcw*T_pow.w; + kappaT *= factor; + } + + numtyp kij = cvi*cvj*kappaT * T_ij*T_ij; + numtyp alphaij = ucl_sqrt((numtyp)2.0*kboltz*kij); + + numtyp dQc = kij * wrT*wrT * (Tj - Ti )/(Ti*Tj); + numtyp dQd = wr*wr*( GammaIJ * vijeij*vijeij - SigmaIJ*SigmaIJ/mass_itype ) - SigmaIJ * wr *vijeij *randnum; + dQd /= (cvi+cvj); + numtyp dQr = alphaij * wrT * dtinvsqrt * randnumT; + Qi += (dQc + dQd + dQr ); + } + + if (EVFLAG && eflag) { + numtyp e = (numtyp)0.5*coeffx*T_ij*coeffz * wc*wc; + #ifndef ONETYPE + energy+=factor_dpd*e; + #else + energy+=e; + #endif + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + + } + } // for nbor + } // if ii + + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv); + store_heatflux(Qi,ii,inum,tid,t_per_atom,offset,Q); +} + diff --git a/lib/gpu/lal_edpd.h b/lib/gpu/lal_edpd.h new file mode 100644 index 00000000000..e5f7b0633bf --- /dev/null +++ b/lib/gpu/lal_edpd.h @@ -0,0 +1,102 @@ +/*************************************************************************** + edpd.h + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the edpd pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#ifndef LAL_EDPD_H +#define LAL_EDPD_H + +#include "lal_base_dpd.h" + +namespace LAMMPS_AL { + +template +class EDPD : public BaseDPD { + public: + EDPD(); + ~EDPD(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_a0, + double **host_gamma, double **host_cut, double **host_power, + double **host_kappa, double **host_powerT, double **host_cutT, + double ***host_sc, double ***host_kc, double *host_mass, + double *host_special_lj, const int power_flag, const int kappa_flag, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, + FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + void get_extra_data(double *host_T, double *host_cv); + + /// copy Q (flux) from device to host + void update_flux(void **flux_ptr); + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = a0, coeff.y = gamma, coeff.z = cut + UCL_D_Vec coeff; + /// coeff2.x = power, coeff2.y = kappa, coeff2.z = powerT, coeff2.w = cutT + UCL_D_Vec coeff2; + + UCL_D_Vec kc, sc; + UCL_D_Vec cutsq; + + /// per-type array + UCL_D_Vec mass; + + /// Special LJ values + UCL_D_Vec sp_lj, sp_sqrt; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + /// Per-atom arrays + UCL_Vector Q; + int _max_q_size; + + int _power_flag, _kappa_flag; + + /// pointer to host data + double *edpd_temp, *edpd_cv; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_edpd_ext.cpp b/lib/gpu/lal_edpd_ext.cpp new file mode 100644 index 00000000000..a9f60c39411 --- /dev/null +++ b/lib/gpu/lal_edpd_ext.cpp @@ -0,0 +1,142 @@ +/*************************************************************************** + edpd_ext.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Functions for LAMMPS access to edpd acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_edpd.h" + +using namespace std; +using namespace LAMMPS_AL; + +static EDPD EDPDMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int edpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, + double **host_gamma, double **host_cut, double **host_power, + double **host_kappa, double **host_powerT, double **host_cutT, + double ***host_sc, double ***host_kc, double *host_mass, + double *special_lj, const int power_flag, const int kappa_flag, + const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + EDPDMF.clear(); + gpu_mode=EDPDMF.device->gpu_mode(); + double gpu_split=EDPDMF.device->particle_split(); + int first_gpu=EDPDMF.device->first_device(); + int last_gpu=EDPDMF.device->last_device(); + int world_me=EDPDMF.device->world_me(); + int gpu_rank=EDPDMF.device->gpu_rank(); + int procs_per_gpu=EDPDMF.device->procs_per_gpu(); + + EDPDMF.device->init_message(screen,"edpd",first_gpu,last_gpu); + + bool message=false; + if (EDPDMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=EDPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_cut, + host_power, host_kappa, host_powerT, + host_cutT, host_sc, host_kc, host_mass, + special_lj, power_flag, kappa_flag, + inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen); + + EDPDMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + EDPDMF.estimate_gpu_overhead(); + return init_ok; +} + +void edpd_gpu_clear() { + EDPDMF.clear(); +} + +int ** edpd_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + double *boxlo, double *prd) { + return EDPDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_v, dtinvsqrt, seed, timestep, boxlo, prd); +} + +void edpd_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + const int nlocal, double *boxlo, double *prd) { + EDPDMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, + tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); +} + +void edpd_gpu_get_extra_data(double *host_T, double *host_cv) { + EDPDMF.get_extra_data(host_T, host_cv); +} + +void edpd_gpu_update_flux(void **flux_ptr) { + EDPDMF.update_flux(flux_ptr); +} + +double edpd_gpu_bytes() { + return EDPDMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_hippo.cpp b/lib/gpu/lal_hippo.cpp index 8d6ad5dfb29..3511d82b000 100644 --- a/lib/gpu/lal_hippo.cpp +++ b/lib/gpu/lal_hippo.cpp @@ -603,13 +603,7 @@ int HippoT::polar_real(const int eflag, const int vflag) { const int BX=this->block_size(); const int GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - /* - const int cus = this->device->gpu->cus(); - while (GX < cus && GX > 1) { - BX /= 2; - GX=static_cast(ceil(static_cast(ainum)/(BX/this->_threads_per_atom))); - } - */ + this->time_pair.start(); // Build the short neighbor list if not done yet diff --git a/lib/gpu/lal_lj_coul_long.h b/lib/gpu/lal_lj_coul_long.h index bc4fce40a54..ace5a263393 100644 --- a/lib/gpu/lal_lj_coul_long.h +++ b/lib/gpu/lal_lj_coul_long.h @@ -78,7 +78,7 @@ class LJCoulLong : public BaseCharge { numtyp _cut_coulsq, _qqrd2e, _g_ewald; - private: +protected: bool _allocated; int loop(const int eflag, const int vflag); }; diff --git a/lib/gpu/lal_lj_coul_long_soft.cpp b/lib/gpu/lal_lj_coul_long_soft.cpp new file mode 100644 index 00000000000..80eaaca94a0 --- /dev/null +++ b/lib/gpu/lal_lj_coul_long_soft.cpp @@ -0,0 +1,174 @@ +/*************************************************************************** + lj_coul_long_soft.cpp + ------------------- + Trung Nguyen (U Chicago) + + Class for acceleration of the lj/cut/coul/long/soft pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "lj_coul_long_soft_cl.h" +#elif defined(USE_CUDART) +const char *lj_coul_long_soft=0; +#else +#include "lj_coul_long_soft_cubin.h" +#endif + +#include "lal_lj_coul_long_soft.h" +#include +namespace LAMMPS_AL { +#define LJCoulLongSoftT LJCoulLongSoft + +extern Device device; + +template +LJCoulLongSoftT::LJCoulLongSoft() : BaseCharge(), + _allocated(false) { +} + +template +LJCoulLongSoftT::~LJCoulLongSoft() { + clear(); +} + +template +int LJCoulLongSoftT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int LJCoulLongSoftT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double **host_epsilon, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_coul_long_soft,"k_lj_coul_long_soft"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, host_cut_ljsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset, host_epsilon); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _cut_coulsq=host_cut_coulsq; + _qqrd2e=qqrd2e; + _g_ewald=g_ewald; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void LJCoulLongSoftT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double **host_epsilon, double **host_cut_ljsq) { + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; i<_lj_types*_lj_types; i++) + host_write[i]=0.0; + + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, host_cut_ljsq); + this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset, host_epsilon); +} + +template +void LJCoulLongSoftT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJCoulLongSoftT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJCoulLongSoft); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int LJCoulLongSoftT::loop(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch, &this->atom->q, &_cut_coulsq, + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); + } + this->time_pair.stop(); + return GX; +} + +template class LJCoulLongSoft; +} diff --git a/lib/gpu/lal_lj_coul_long_soft.cu b/lib/gpu/lal_lj_coul_long_soft.cu new file mode 100644 index 00000000000..e311bb5d3b3 --- /dev/null +++ b/lib/gpu/lal_lj_coul_long_soft.cu @@ -0,0 +1,290 @@ +// ************************************************************************** +// lj_coul_long_soft.cu +// ------------------- +// Trung Nguyen (U Chicago) +// +// Device code for acceleration of the lj/cut/coul/long/soft pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_lj_coul_long_soft(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp3 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + + if (ii +class LJCoulLongSoft : public BaseCharge { + public: + LJCoulLongSoft(); + ~LJCoulLongSoft(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double **host_epsilon, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); + + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double **host_epsilon, double **host_cut_ljsq); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cutsq_vdw + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset, lj3.w = epsilon + UCL_D_Vec lj3; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _cut_coulsq, _qqrd2e, _g_ewald; + +protected: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj_coul_long_soft_ext.cpp b/lib/gpu/lal_lj_coul_long_soft_ext.cpp new file mode 100644 index 00000000000..cb2657c03bd --- /dev/null +++ b/lib/gpu/lal_lj_coul_long_soft_ext.cpp @@ -0,0 +1,151 @@ +/*************************************************************************** + lj_coul_long_soft_ext.cpp + ------------------------- + Trung Nguyen (U Chicago) + + Functions for LAMMPS access to lj/cut/coul/long/soft acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_lj_coul_long_soft.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJCoulLongSoft LJCLSMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ljcls_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **epsilon, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + LJCLSMF.clear(); + gpu_mode=LJCLSMF.device->gpu_mode(); + double gpu_split=LJCLSMF.device->particle_split(); + int first_gpu=LJCLSMF.device->first_device(); + int last_gpu=LJCLSMF.device->last_device(); + int world_me=LJCLSMF.device->world_me(); + int gpu_rank=LJCLSMF.device->gpu_rank(); + int procs_per_gpu=LJCLSMF.device->procs_per_gpu(); + + LJCLSMF.device->init_message(screen,"lj/cut/coul/long/soft",first_gpu,last_gpu); + + bool message=false; + if (LJCLSMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJCLSMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, epsilon, special_lj, inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); + + LJCLSMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJCLSMF.estimate_gpu_overhead(); + return init_ok; +} + +// --------------------------------------------------------------------------- +// Copy updated coeffs from host to device +// --------------------------------------------------------------------------- +void ljcls_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **epsilon, double **host_cut_ljsq) { + int world_me=LJCLSMF.device->world_me(); + int gpu_rank=LJCLSMF.device->gpu_rank(); + int procs_per_gpu=LJCLSMF.device->procs_per_gpu(); + + if (world_me==0) + LJCLSMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, epsilon, host_cut_ljsq); + LJCLSMF.device->world_barrier(); + + for (int i=0; igpu_barrier(); + } +} + +void ljcls_gpu_clear() { + LJCLSMF.clear(); +} + +int** ljcls_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCLSMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void ljcls_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCLSMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, + host_q,nlocal,boxlo,prd); +} + +double ljcls_gpu_bytes() { + return LJCLSMF.host_memory_usage(); +} + diff --git a/lib/gpu/lal_lj_coul_soft.cpp b/lib/gpu/lal_lj_coul_soft.cpp new file mode 100644 index 00000000000..9ee6486817e --- /dev/null +++ b/lib/gpu/lal_lj_coul_soft.cpp @@ -0,0 +1,157 @@ +/*************************************************************************** + lj_coul_soft.cpp + ------------------- + Trung Nguyen (U Chicago) + + Class for acceleration of the lj/cut/coul/cut/soft pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : ndtrung@uchicago.edu + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "lj_coul_soft_cl.h" +#elif defined(USE_CUDART) +const char *lj_coul_soft=0; +#else +#include "lj_coul_soft_cubin.h" +#endif + +#include "lal_lj_coul_soft.h" +#include +namespace LAMMPS_AL { +#define LJCoulSoftT LJCoulSoft + +extern Device device; + +template +LJCoulSoftT::LJCoulSoft() : BaseCharge(), + _allocated(false) { +} + +template +LJCoulSoftT::~LJCoulSoft() { + clear(); +} + +template +int LJCoulSoftT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int LJCoulSoftT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double **host_epsilon, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_coul_soft,"k_lj_coul_soft"); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cut_ljsq, host_cut_coulsq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset, host_epsilon); + + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_write,host_cutsq); + + sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); + for (int i=0; i<4; i++) { + host_write[i]=host_special_lj[i]; + host_write[i+4]=host_special_coul[i]; + } + ucl_copy(sp_lj,host_write,8,false); + + _qqrd2e=qqrd2e; + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ + sp_lj.row_bytes(); + return 0; +} + +template +void LJCoulSoftT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + cutsq.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJCoulSoftT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJCoulSoft); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int LJCoulSoftT::loop(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &cutsq, &_qqrd2e, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, + &cutsq, &_qqrd2e, &this->_threads_per_atom); + } + this->time_pair.stop(); + return GX; +} + +template class LJCoulSoft; +} diff --git a/lib/gpu/lal_lj_coul_soft.cu b/lib/gpu/lal_lj_coul_soft.cu new file mode 100644 index 00000000000..1fc564bde67 --- /dev/null +++ b/lib/gpu/lal_lj_coul_soft.cu @@ -0,0 +1,276 @@ +// ************************************************************************** +// lj_coul_soft.cu +// ------------------- +// Trung Nguyen (U Chicago) +// +// Device code for acceleration of the lj/coul/cut/soft pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : ndtrung@uchicago.edu +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( q_tex,float); +#else +_texture_2d( pos_tex,int4); +_texture( q_tex,int2); +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_lj_coul_soft(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const int lj_types, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp *restrict q_, + const __global numtyp *restrict cutsq, + const numtyp qqrd2e, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + + sp_lj[0]=sp_lj_in[0]; + sp_lj[1]=sp_lj_in[1]; + sp_lj[2]=sp_lj_in[2]; + sp_lj[3]=sp_lj_in[3]; + sp_lj[4]=sp_lj_in[4]; + sp_lj[5]=sp_lj_in[5]; + sp_lj[6]=sp_lj_in[6]; + sp_lj[7]=sp_lj_in[7]; + + acctyp3 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + + if (ii +class LJCoulSoft : public BaseCharge { + public: + LJCoulSoft(); + ~LJCoulSoft(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double **host_epsilon, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset, lj3.w = epsilon + UCL_D_Vec lj3; + /// cutsq + UCL_D_Vec cutsq; + /// Special LJ values [0-3] and Special Coul values [4-7] + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + numtyp _qqrd2e; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj_coul_soft_ext.cpp b/lib/gpu/lal_lj_coul_soft_ext.cpp new file mode 100644 index 00000000000..02d367b3c74 --- /dev/null +++ b/lib/gpu/lal_lj_coul_soft_ext.cpp @@ -0,0 +1,128 @@ +/*************************************************************************** + lj_coul_soft_ext.cpp + ------------------- + Trung Nguyen (U Chicago) + + Functions for LAMMPS access to lj/cut/coul/cut/soft acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : ndtrung@uchicago.edu + ***************************************************************************/ + +#include +#include +#include + +#include "lal_lj_coul_soft.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJCoulSoft LJCSMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ljcs_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **epsilon, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + LJCSMF.clear(); + gpu_mode=LJCSMF.device->gpu_mode(); + double gpu_split=LJCSMF.device->particle_split(); + int first_gpu=LJCSMF.device->first_device(); + int last_gpu=LJCSMF.device->last_device(); + int world_me=LJCSMF.device->world_me(); + int gpu_rank=LJCSMF.device->gpu_rank(); + int procs_per_gpu=LJCSMF.device->procs_per_gpu(); + + LJCSMF.device->init_message(screen,"lj/cut/coul/cut/soft",first_gpu,last_gpu); + + bool message=false; + if (LJCSMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJCSMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, epsilon, special_lj, inum, nall, max_nbors, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); + + LJCSMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJCSMF.estimate_gpu_overhead(); + return init_ok; +} + +void ljcs_gpu_clear() { + LJCSMF.clear(); +} + +int** ljcs_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCSMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); +} + +void ljcs_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCSMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); +} + +double ljcs_gpu_bytes() { + return LJCSMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_mdpd.cpp b/lib/gpu/lal_mdpd.cpp new file mode 100644 index 00000000000..16cf926df86 --- /dev/null +++ b/lib/gpu/lal_mdpd.cpp @@ -0,0 +1,218 @@ +/*************************************************************************** + mdpd.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the mdpd pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "mdpd_cl.h" +#elif defined(USE_CUDART) +const char *mdpd=0; +#else +#include "mdpd_cubin.h" +#endif + +#include "lal_mdpd.h" +#include +namespace LAMMPS_AL { +#define MDPDT MDPD + +extern Device device; + +template +MDPDT::MDPD() : BaseDPD(), _allocated(false) { +} + +template +MDPDT::~MDPD() { + clear(); +} + +template +int MDPDT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int MDPDT::init(const int ntypes, + double **host_cutsq, double **host_A_att, double **host_B_rep, + double **host_gamma, double **host_sigma, + double **host_cut, double **host_cut_r, + double *host_special_lj, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + + int success; + int extra_fields = 4; // round up to accomodate quadruples of numtyp values + // rho + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,mdpd,"k_mdpd",onetype,extra_fields); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_A_att,host_B_rep, + host_gamma,host_sigma); + + coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff2,host_write,host_cut,host_cut_r, + host_cutsq); + + UCL_H_Vec host_rsq(lj_types*lj_types,*(this->ucl_device), + UCL_WRITE_ONLY); + cutsq.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,lj_types,cutsq,host_rsq,host_cutsq); + + double special_sqrt[4]; + special_sqrt[0] = sqrt(host_special_lj[0]); + special_sqrt[1] = sqrt(host_special_lj[1]); + special_sqrt[2] = sqrt(host_special_lj[2]); + special_sqrt[3] = sqrt(host_special_lj[3]); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + sp_sqrt.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(special_sqrt,4,*(this->ucl_device)); + ucl_copy(sp_sqrt,dview,false); + + // allocate per-atom array Q + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+coeff2.row_bytes()+cutsq.row_bytes()+ + sp_lj.row_bytes()+sp_sqrt.row_bytes(); + return 0; +} + +template +void MDPDT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + coeff2.clear(); + cutsq.clear(); + sp_lj.clear(); + sp_sqrt.clear(); + this->clear_atomic(); +} + +template +double MDPDT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(MDPD); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int MDPDT::loop(const int eflag, const int vflag) { + + int nall = this->atom->nall(); + + // signal that we need to transfer extra data from the host + + this->atom->extra_data_unavail(); + + numtyp4 *pextra=reinterpret_cast(&(this->atom->extra[0])); + + int n = 0; + int nstride = 1; + for (int i = 0; i < nall; i++) { + int idx = n+i*nstride; + numtyp4 v; + v.x = mdpd_rho[i]; + v.y = 0; + v.z = 0; + v.w = 0; + pextra[idx] = v; + } + this->atom->add_extra_data(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, + &sp_lj, &sp_sqrt, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, + &this->_timestep, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, + &_lj_types, &sp_lj, &sp_sqrt, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &cutsq, &this->_dtinvsqrt, &this->_seed, + &this->_timestep, &this->_threads_per_atom); + } + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Get the extra data pointers from host +// --------------------------------------------------------------------------- + +template +void MDPDT::get_extra_data(double *host_rho) { + mdpd_rho = host_rho; +} + +template class MDPD; +} diff --git a/lib/gpu/lal_mdpd.cu b/lib/gpu/lal_mdpd.cu new file mode 100644 index 00000000000..6230cb24961 --- /dev/null +++ b/lib/gpu/lal_mdpd.cu @@ -0,0 +1,475 @@ +// ************************************************************************** +// mdpd.cu +// ------------------- +// Trung Dac Nguyen (ORNL) +// +// Device code for acceleration of the mdpd pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : December 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( vel_tex,float4); +#else +_texture_2d( pos_tex,int4); +_texture_2d( vel_tex,int4); +#endif +#else +#define pos_tex x_ +#define vel_tex v_ +#endif + +#define EPSILON (numtyp)1.0e-10 + +//#define _USE_UNIFORM_SARU_LCG +//#define _USE_UNIFORM_SARU_TEA8 +//#define _USE_GAUSSIAN_SARU_LCG + +#if !defined(_USE_UNIFORM_SARU_LCG) && !defined(_USE_UNIFORM_SARU_TEA8) && !defined(_USE_GAUSSIAN_SARU_LCG) +#define _USE_UNIFORM_SARU_LCG +#endif + +// References: +// 1. Y. Afshar, F. Schmid, A. Pishevar, S. Worley, Comput. Phys. Comm. 184 (2013), 1119–1128. +// 2. C. L. Phillips, J. A. Anderson, S. C. Glotzer, Comput. Phys. Comm. 230 (2011), 7191-7201. +// PRNG period = 3666320093*2^32 ~ 2^64 ~ 10^19 + +#define LCGA 0x4beb5d59 /* Full period 32 bit LCG */ +#define LCGC 0x2600e1f7 +#define oWeylPeriod 0xda879add /* Prime period 3666320093 */ +#define oWeylOffset 0x8009d14b +#define TWO_N32 0.232830643653869628906250e-9f /* 2^-32 */ + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] +// using the inherent LCG, then multiply u with sqrt(3) to "match" +// with a normal random distribution. +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +// Curly brackets to make variables local to the scope. +#ifdef _USE_UNIFORM_SARU_LCG +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns uniformly distributed random numbers u in [-1.0;1.0] using TEA8 +// then multiply u with sqrt(3) to "match" with a normal random distribution +// Afshar et al. mutlplies u in [-0.5;0.5] with sqrt(12) +#ifdef _USE_UNIFORM_SARU_TEA8 +#define SQRT3 (numtyp)1.7320508075688772935274463 +#define k0 0xA341316C +#define k1 0xC8013EA4 +#define k2 0xAD90777D +#define k3 0x7E95761E +#define delta 0x9e3779b9 +#define rounds 8 +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + unsigned int wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int sum = 0; \ + for (int i=0; i < rounds; i++) { \ + sum += delta; \ + state += ((wstate<<4) + k0)^(wstate + sum)^((wstate>>5) + k1); \ + wstate += ((state<<4) + k2)^(state + sum)^((state>>5) + k3); \ + } \ + unsigned int v = (state ^ (state>>26)) + wstate; \ + unsigned int s = (signed int)((v^(v>>20))*0x6957f5a7); \ + randnum = SQRT3*(s*TWO_N32*(numtyp)2.0-(numtyp)1.0); \ +} +#endif + +// specifically implemented for steps = 1; high = 1.0; low = -1.0 +// returns two uniformly distributed random numbers r1 and r2 in [-1.0;1.0], +// and uses the polar method (Marsaglia's) to transform to a normal random value +// This is used to compared with CPU DPD using RandMars::gaussian() +#ifdef _USE_GAUSSIAN_SARU_LCG +#define saru(seed1, seed2, seed, timestep, randnum) { \ + unsigned int seed3 = seed + timestep; \ + seed3^=(seed1<<7)^(seed2>>6); \ + seed2+=(seed1>>4)^(seed3>>15); \ + seed1^=(seed2<<9)+(seed3<<8); \ + seed3^=0xA5366B4D*((seed2>>11) ^ (seed1<<1)); \ + seed2+=0x72BE1579*((seed1<<4) ^ (seed3>>16)); \ + seed1^=0x3F38A6ED*((seed3>>5) ^ (((signed int)seed2)>>22)); \ + seed2+=seed1*seed3; \ + seed1+=seed3 ^ (seed2>>2); \ + seed2^=((signed int)seed2)>>17; \ + unsigned int state=0x12345678; \ + unsigned int wstate=12345678; \ + state = 0x79dedea3*(seed1^(((signed int)seed1)>>14)); \ + wstate = (state + seed2) ^ (((signed int)state)>>8); \ + state = state + (wstate*(wstate^0xdddf97f5)); \ + wstate = 0xABCB96F7 + (wstate>>1); \ + unsigned int v, s; \ + numtyp r1, r2, rsq; \ + while (1) { \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r1 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + state = LCGA*state + LCGC; \ + wstate = wstate + oWeylOffset+((((signed int)wstate)>>31) & oWeylPeriod); \ + v = (state ^ (state>>26)) + wstate; \ + s = (signed int)((v^(v>>20))*0x6957f5a7); \ + r2 = s*TWO_N32*(numtyp)2.0-(numtyp)1.0; \ + rsq = r1 * r1 + r2 * r2; \ + if (rsq < (numtyp)1.0) break; \ + } \ + numtyp fac = ucl_sqrt((numtyp)-2.0*log(rsq)/rsq); \ + randnum = r2*fac; \ +} +#endif + +#define MIN(A,B) ((A) < (B) ? (A) : (B)) +#define MAX(A,B) ((A) < (B) ? (B) : (A)) + +// coeff.x = A_att, coeff.y = B_rep, coeff.z = gamma, coeff.w = sigma +// coeff2.x = cut, coeff2.y = cut_r, coeff2.z = cutsq + +__kernel void k_mdpd(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff, + const __global numtyp4 *restrict coeff2, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global numtyp *restrict sp_sqrt, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp4 *restrict v_, + const __global numtyp *restrict cutsq, + const numtyp dtinvsqrt, const int seed, + const int timestep, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_pair(); + + acctyp3 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + + if (ii tag2) { + tag1 = jtag; tag2 = itag; + } + + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + // conservative force = A_att * wc + B_rep*(rhoi+rhoj)*wc_r + // drag force = -gamma * wr^2 * (delx dot delv) / r + // random force = sigma * wr * rnd * dtinvsqrt; + + numtyp force = A_attij*wc + B_repij*(rhoi+rhoj)*wc_r; + force -= gammaij*wr*wr*dot*rinv; + force += sigmaij*wr*randnum*dtinvsqrt; + force *= factor_dpd*rinv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + // unshifted eng of conservative term: + // eng shifted to 0.0 at cutoff + numtyp e = (numtyp)0.5*A_attij*cutij * wr*wr + (numtyp)0.5*B_repij*cut_rij*(rhoi+rhoj)*wc_r*wc_r; + energy+=factor_dpd*e; + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); +} + +__kernel void k_mdpd_fast(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff_in, + const __global numtyp4 *restrict coeff2_in, + const __global numtyp *restrict sp_lj_in, + const __global numtyp *restrict sp_sqrt_in, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, + const __global numtyp4 *restrict v_, + const __global numtyp *restrict cutsq, + const numtyp dtinvsqrt, const int seed, + const int timestep, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + #ifndef ONETYPE + __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + __local numtyp sp_lj[4]; + __local numtyp sp_sqrt[4]; + if (tid<4) { + sp_lj[tid]=sp_lj_in[tid]; + sp_sqrt[tid]=sp_sqrt_in[tid]; + } + if (tid tag2) { + tag1 = jtag; tag2 = itag; + } + + numtyp randnum = (numtyp)0.0; + saru(tag1, tag2, seed, timestep, randnum); + + // conservative force = A_att * wc + B_rep*(rhoi+rhoj)*wc_r + // drag force = -gamma * wr^2 * (delx dot delv) / r + // random force = sigma * wr * rnd * dtinvsqrt; + + numtyp force = A_attij*wc + B_repij*(rhoi+rhoj)*wc_r; + force -= gammaij*wr*wr*dot*rinv; + force += sigmaij*wr*randnum*dtinvsqrt; + #ifndef ONETYPE + force *= factor_dpd*rinv; + #else + force*=rinv; + #endif + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + // unshifted eng of conservative term: + // eng shifted to 0.0 at cutoff + numtyp e = (numtyp)0.5*A_attij*cutij * wr*wr + (numtyp)0.5*B_repij*cut_rij*(rhoi+rhoj)*wc_r*wc_r; + #ifndef ONETYPE + energy+=factor_dpd*e; + #else + energy+=e; + #endif + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); +} + diff --git a/lib/gpu/lal_mdpd.h b/lib/gpu/lal_mdpd.h new file mode 100644 index 00000000000..0e95185714d --- /dev/null +++ b/lib/gpu/lal_mdpd.h @@ -0,0 +1,88 @@ +/*************************************************************************** + mdpd.h + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the mdpd pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#ifndef LAL_MDPD_H +#define LAL_MDPD_H + +#include "lal_base_dpd.h" + +namespace LAMMPS_AL { + +template +class MDPD : public BaseDPD { + public: + MDPD(); + ~MDPD(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_A_att, double **host_B_rep, + double **host_gamma, double **host_sigma, + double **host_cut, double **host_cut_r, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, + FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + void get_extra_data(double *host_rho); + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = A_att, coeff.x = B_rep, coeff.z = gamma, coeff.w = sigma + UCL_D_Vec coeff; + /// coeff2.x = cut, coeff2.y = cut_r, coeff2.z = cutsq + UCL_D_Vec coeff2; + + UCL_D_Vec cutsq; + + /// Special LJ values + UCL_D_Vec sp_lj, sp_sqrt; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + /// pointer to host data + double *mdpd_rho; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_mdpd_ext.cpp b/lib/gpu/lal_mdpd_ext.cpp new file mode 100644 index 00000000000..def6adb1f60 --- /dev/null +++ b/lib/gpu/lal_mdpd_ext.cpp @@ -0,0 +1,133 @@ +/*************************************************************************** + mdpd_ext.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Functions for LAMMPS access to mdpd acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_mdpd.h" + +using namespace std; +using namespace LAMMPS_AL; + +static MDPD MDPDMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int mdpd_gpu_init(const int ntypes, double **cutsq, + double **host_A_att, double **host_B_rep, + double **host_gamma, double **host_sigma, + double **host_cut, double **host_cut_r, + double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + MDPDMF.clear(); + gpu_mode=MDPDMF.device->gpu_mode(); + double gpu_split=MDPDMF.device->particle_split(); + int first_gpu=MDPDMF.device->first_device(); + int last_gpu=MDPDMF.device->last_device(); + int world_me=MDPDMF.device->world_me(); + int gpu_rank=MDPDMF.device->gpu_rank(); + int procs_per_gpu=MDPDMF.device->procs_per_gpu(); + + MDPDMF.device->init_message(screen,"mdpd",first_gpu,last_gpu); + + bool message=false; + if (MDPDMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=MDPDMF.init(ntypes, cutsq, host_A_att, host_B_rep, host_gamma, host_sigma, + host_cut, host_cut_r, special_lj, inum, nall, max_nbors, + maxspecial, cell_size, gpu_split, screen); + + MDPDMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + MDPDMF.estimate_gpu_overhead(); + return init_ok; +} + +void mdpd_gpu_clear() { + MDPDMF.clear(); +} + +int ** mdpd_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + double *boxlo, double *prd) { + return MDPDMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_v, dtinvsqrt, seed, timestep, boxlo, prd); +} + +void mdpd_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, + const int nlocal, double *boxlo, double *prd) { + MDPDMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, + tag, host_v, dtinvsqrt, seed, timestep, nlocal, boxlo, prd); +} + +void mdpd_gpu_get_extra_data(double *host_rho) { + MDPDMF.get_extra_data(host_rho); +} + +double mdpd_gpu_bytes() { + return MDPDMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_sph_heatconduction.cpp b/lib/gpu/lal_sph_heatconduction.cpp new file mode 100644 index 00000000000..e8e366e93a6 --- /dev/null +++ b/lib/gpu/lal_sph_heatconduction.cpp @@ -0,0 +1,222 @@ +/*************************************************************************** + sph_heatconduction.cpp + ------------------- + Trung Nguyen (U Chicago) + + Class for acceleration of the sph_heatconduction pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "sph_heatconduction_cl.h" +#elif defined(USE_CUDART) +const char *sph_heatconduction=0; +#else +#include "sph_heatconduction_cubin.h" +#endif + +#include "lal_sph_heatconduction.h" +#include +namespace LAMMPS_AL { +#define SPHHeatConductionT SPHHeatConduction + +extern Device device; + +template +SPHHeatConductionT::SPHHeatConduction() : BaseSPH(), _allocated(false) { + _max_dE_size = 0; +} + +template +SPHHeatConductionT::~SPHHeatConduction() { + clear(); +} + +template +int SPHHeatConductionT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int SPHHeatConductionT::init(const int ntypes, + double **host_cutsq, double **host_cut, + double **host_alpha, double* host_mass, + const int dimension, double *host_special_lj, + const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + + int success; + int extra_fields = 4; // round up to accomodate quadruples of numtyp values + // rho, esph + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,sph_heatconduction,"k_sph_heatconduction", + onetype,extra_fields); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_alpha, + host_cut, host_cutsq); + + UCL_H_Vec dview_mass(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < ntypes; i++) + dview_mass[i] = host_mass[i]; + mass.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(mass,dview_mass,false); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + // allocate per-atom array Q + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_dE_size=static_cast(static_cast(ef_nall)*1.10); + dE.alloc(_max_dE_size,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _dimension = dimension; + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+dE.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void SPHHeatConductionT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + mass.clear(); + dE.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double SPHHeatConductionT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(SPHHeatConduction); +} + +template +void SPHHeatConductionT::update_dE(void **dE_ptr) { + *dE_ptr=dE.host.begin(); + dE.update_host(_max_dE_size,false); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int SPHHeatConductionT::loop(const int eflag, const int vflag) { + + int nall = this->atom->nall(); + + // Resize dE array if necessary + if (nall > _max_dE_size) { + _max_dE_size=static_cast(static_cast(nall)*1.10); + dE.resize(_max_dE_size); + } + + // signal that we need to transfer extra data from the host + + this->atom->extra_data_unavail(); + + numtyp4 *pextra=reinterpret_cast(&(this->atom->extra[0])); + + int n = 0; + int nstride = 1; + for (int i = 0; i < nall; i++) { + int idx = n+i*nstride; + numtyp4 v; + v.x = rho[i]; + v.y = esph[i]; + v.z = 0; + v.w = 0; + pextra[idx] = v; + } + this->atom->add_extra_data(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &mass, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &dE, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &mass, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &dE, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); + } + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Get the extra data pointers from host +// --------------------------------------------------------------------------- + +template +void SPHHeatConductionT::get_extra_data(double *host_rho, double *host_esph) { + rho = host_rho; + esph = host_esph; +} + +template class SPHHeatConduction; +} diff --git a/lib/gpu/lal_sph_heatconduction.cu b/lib/gpu/lal_sph_heatconduction.cu new file mode 100644 index 00000000000..21c936347a7 --- /dev/null +++ b/lib/gpu/lal_sph_heatconduction.cu @@ -0,0 +1,253 @@ +// ************************************************************************** +// sph_heatconduction.cu +// --------------------- +// Trung Dac Nguyen (U Chicago) +// +// Device code for acceleration of the sph/heatconduction pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : September 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( vel_tex,float4); +#else +_texture_2d( pos_tex,int4); +_texture_2d( vel_tex,int4); +#endif +#else +#define pos_tex x_ +#define vel_tex v_ +#endif + +#if (SHUFFLE_AVAIL == 0) + +#define store_dE(dEacc, ii, inum, tid, t_per_atom, offset, dE) \ + if (t_per_atom>1) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, dEacc); \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + dEacc += shfl_down(dEacc, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii +class SPHHeatConduction : public BaseSPH { + public: + SPHHeatConduction(); + ~SPHHeatConduction(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double** host_cut, double **host_alpha, double *host_mass, + const int dimension, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + void get_extra_data(double *host_rho, double *host_esph); + + /// copy desph from device to host + void update_dE(void **dE_ptr); + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = alpha, coeff.y = cut, coeff.z = cutsq + UCL_D_Vec coeff; + + /// per-type coeffs + UCL_D_Vec mass; + + /// Special LJ values + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + /// Per-atom arrays + UCL_Vector dE; + int _max_dE_size; + + int _dimension; + + /// pointer to host data + double *rho, *esph, *cv; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_sph_heatconduction_ext.cpp b/lib/gpu/lal_sph_heatconduction_ext.cpp new file mode 100644 index 00000000000..92e0e342d20 --- /dev/null +++ b/lib/gpu/lal_sph_heatconduction_ext.cpp @@ -0,0 +1,129 @@ +/*************************************************************************** + sph_heatconduction_ext.cpp + -------------------------- + Trung Dac Nguyen (U Chicago) + + Functions for LAMMPS access to sph/heatconduction acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_sph_heatconduction.h" + +using namespace std; +using namespace LAMMPS_AL; + +static SPHHeatConduction SPHHeatConductionMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int sph_heatconduction_gpu_init(const int ntypes, double **cutsq, double** host_cut, + double **host_alpha, double* host_mass, const int dimension, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + SPHHeatConductionMF.clear(); + gpu_mode=SPHHeatConductionMF.device->gpu_mode(); + double gpu_split=SPHHeatConductionMF.device->particle_split(); + int first_gpu=SPHHeatConductionMF.device->first_device(); + int last_gpu=SPHHeatConductionMF.device->last_device(); + int world_me=SPHHeatConductionMF.device->world_me(); + int gpu_rank=SPHHeatConductionMF.device->gpu_rank(); + int procs_per_gpu=SPHHeatConductionMF.device->procs_per_gpu(); + + SPHHeatConductionMF.device->init_message(screen,"sph_heatconduction",first_gpu,last_gpu); + + bool message=false; + if (SPHHeatConductionMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=SPHHeatConductionMF.init(ntypes, cutsq, host_cut, host_alpha, host_mass, + dimension, special_lj, inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen); + + SPHHeatConductionMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + SPHHeatConductionMF.estimate_gpu_overhead(); + return init_ok; +} + +void sph_heatconduction_gpu_clear() { + SPHHeatConductionMF.clear(); +} + +int ** sph_heatconduction_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *host_tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v) { + return SPHHeatConductionMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, host_tag, nspecial, special, eflag, vflag, + eatom, vatom, host_start, ilist, jnum, cpu_time, success, + host_v); +} + +void sph_heatconduction_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *host_tag, + double **host_v, const int nlocal) { + SPHHeatConductionMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, + host_tag, host_v, nlocal); +} + +void sph_heatconduction_gpu_get_extra_data(double *host_rho, double *host_esph) { + SPHHeatConductionMF.get_extra_data(host_rho, host_esph); +} + +void sph_heatconduction_gpu_update_dE(void **dE_ptr) { + SPHHeatConductionMF.update_dE(dE_ptr); +} + +double sph_heatconduction_gpu_bytes() { + return SPHHeatConductionMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_sph_lj.cpp b/lib/gpu/lal_sph_lj.cpp new file mode 100644 index 00000000000..66c2a5c3027 --- /dev/null +++ b/lib/gpu/lal_sph_lj.cpp @@ -0,0 +1,222 @@ +/*************************************************************************** + sph_lj.cpp + ------------------- + Trung Nguyen (U Chicago) + + Class for acceleration of the sph_lj pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : September 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "sph_lj_cl.h" +#elif defined(USE_CUDART) +const char *sph_lj=0; +#else +#include "sph_lj_cubin.h" +#endif + +#include "lal_sph_lj.h" +#include +namespace LAMMPS_AL { +#define SPHLJT SPHLJ + +extern Device device; + +template +SPHLJT::SPHLJ() : BaseSPH(), _allocated(false) { + _max_drhoE_size = 0; +} + +template +SPHLJT::~SPHLJ() { + clear(); +} + +template +int SPHLJT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int SPHLJT::init(const int ntypes, + double **host_cutsq, double **host_cut, + double **host_viscosity, double* host_mass, + const int dimension, double *host_special_lj, + const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + + int success; + int extra_fields = 4; // round up to accomodate quadruples of numtyp values + // rho, cv + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,sph_lj,"k_sph_lj",onetype,extra_fields); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_viscosity, + host_cut, host_cutsq); + + UCL_H_Vec dview_mass(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < ntypes; i++) + dview_mass[i] = host_mass[i]; + mass.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(mass,dview_mass,false); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + // allocate per-atom array Q + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_drhoE_size=static_cast(static_cast(ef_nall)*1.10); + drhoE.alloc(_max_drhoE_size*2,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _dimension = dimension; + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+drhoE.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void SPHLJT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + mass.clear(); + drhoE.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double SPHLJT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(SPHLJ); +} + +template +void SPHLJT::update_drhoE(void **drhoE_ptr) { + *drhoE_ptr=drhoE.host.begin(); + drhoE.update_host(_max_drhoE_size*2,false); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int SPHLJT::loop(const int eflag, const int vflag) { + + int nall = this->atom->nall(); + + // Resize drhoE array if necessary + if (nall > _max_drhoE_size) { + _max_drhoE_size=static_cast(static_cast(nall)*1.10); + drhoE.resize(_max_drhoE_size*2); + } + + // signal that we need to transfer extra data from the host + + this->atom->extra_data_unavail(); + + numtyp4 *pextra=reinterpret_cast(&(this->atom->extra[0])); + + int n = 0; + int nstride = 1; + for (int i = 0; i < nall; i++) { + int idx = n+i*nstride; + numtyp4 v; + v.x = rho[i]; + v.y = esph[i]; + v.z = cv[i]; + v.w = 0; + pextra[idx] = v; + } + this->atom->add_extra_data(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &mass, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &mass, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); + } + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Get the extra data pointers from host +// --------------------------------------------------------------------------- + +template +void SPHLJT::get_extra_data(double *host_rho, double *host_esph, double *host_cv) { + rho = host_rho; + esph = host_esph; + cv = host_cv; +} + +template class SPHLJ; +} diff --git a/lib/gpu/lal_sph_lj.cu b/lib/gpu/lal_sph_lj.cu new file mode 100644 index 00000000000..23863b5e288 --- /dev/null +++ b/lib/gpu/lal_sph_lj.cu @@ -0,0 +1,426 @@ +// ************************************************************************** +// sph_lj.cu +// ------------------- +// Trung Dac Nguyen (U Chicago) +// +// Device code for acceleration of the sph/lj pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : September 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( vel_tex,float4); +#else +_texture_2d( pos_tex,int4); +_texture_2d( vel_tex,int4); +#endif +#else +#define pos_tex x_ +#define vel_tex v_ +#endif + +#if (SHUFFLE_AVAIL == 0) + +#define store_drhoE(drhoEacc, ii, inum, tid, t_per_atom, offset, drhoE) \ + if (t_per_atom>1) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, \ + drhoEacc.x, drhoEacc.y); \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + drhoEacc.x += shfl_down(drhoEacc.x, s, t_per_atom); \ + drhoEacc.y += shfl_down(drhoEacc.y, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii (numtyp)0.0) { + pc[1] = ucl_sqrt(csq); // soundspeed + } else { + pc[1] = (numtyp)0.0; + } +} + + +__kernel void k_sph_lj(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict extra, + const __global numtyp4 *restrict coeff, + const __global numtyp *restrict mass, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp3 *restrict ans, + __global acctyp *restrict engv, + __global acctyp2 *restrict drhoE, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp4 *restrict v_, + const int dimension, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_pair(); + + acctyp3 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + acctyp2 drhoEacc; + drhoEacc.x = drhoEacc.x = (acctyp)0; + + if (ii +class SPHLJ : public BaseSPH { + public: + SPHLJ(); + ~SPHLJ(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double** host_cut, double **host_viscosity, double *host_mass, + const int dimension, + double *host_special_lj, const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, const double gpu_split, + FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + void get_extra_data(double *host_rho, double *host_esph, + double *host_cv); + + /// copy drho and desph from device to host + void update_drhoE(void **drhoE_ptr); + + // --------------------------- TYPE DATA -------------------------- + + /// coeff.x = viscosity, coeff.y = cut, coeff.z = cutsq + UCL_D_Vec coeff; + + /// per-type coeffs + UCL_D_Vec mass; + + /// Special LJ values + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + /// Per-atom arrays + UCL_Vector drhoE; + int _max_drhoE_size; + + int _dimension; + + /// pointer to host data + double *rho, *esph, *cv; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_sph_lj_ext.cpp b/lib/gpu/lal_sph_lj_ext.cpp new file mode 100644 index 00000000000..55f85c030e2 --- /dev/null +++ b/lib/gpu/lal_sph_lj_ext.cpp @@ -0,0 +1,129 @@ +/*************************************************************************** + sph_lj_ext.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Functions for LAMMPS access to sph/lj acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_sph_lj.h" + +using namespace std; +using namespace LAMMPS_AL; + +static SPHLJ SPHLJMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, + double **host_viscosity, double* host_mass, const int dimension, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + SPHLJMF.clear(); + gpu_mode=SPHLJMF.device->gpu_mode(); + double gpu_split=SPHLJMF.device->particle_split(); + int first_gpu=SPHLJMF.device->first_device(); + int last_gpu=SPHLJMF.device->last_device(); + int world_me=SPHLJMF.device->world_me(); + int gpu_rank=SPHLJMF.device->gpu_rank(); + int procs_per_gpu=SPHLJMF.device->procs_per_gpu(); + + SPHLJMF.device->init_message(screen,"sph_lj",first_gpu,last_gpu); + + bool message=false; + if (SPHLJMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=SPHLJMF.init(ntypes, cutsq, host_cut, host_viscosity, host_mass, + dimension, special_lj, inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen); + + SPHLJMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + SPHLJMF.estimate_gpu_overhead(); + return init_ok; +} + +void sph_lj_gpu_clear() { + SPHLJMF.clear(); +} + +int ** sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *host_tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v) { + return SPHLJMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, host_tag, nspecial, special, eflag, vflag, + eatom, vatom, host_start, ilist, jnum, cpu_time, success, + host_v); +} + +void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *host_tag, + double **host_v, const int nlocal) { + SPHLJMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, + host_tag, host_v, nlocal); +} + +void sph_lj_gpu_get_extra_data(double *host_rho, double *host_esph, double *host_cv) { + SPHLJMF.get_extra_data(host_rho, host_esph, host_cv); +} + +void sph_lj_gpu_update_drhoE(void **drhoE_ptr) { + SPHLJMF.update_drhoE(drhoE_ptr); +} + +double sph_lj_gpu_bytes() { + return SPHLJMF.host_memory_usage(); +} diff --git a/lib/gpu/lal_sph_taitwater.cpp b/lib/gpu/lal_sph_taitwater.cpp new file mode 100644 index 00000000000..7a584d435ec --- /dev/null +++ b/lib/gpu/lal_sph_taitwater.cpp @@ -0,0 +1,225 @@ +/*************************************************************************** + sph_taitwater.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Class for acceleration of the sph/taitwater pair style. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#if defined(USE_OPENCL) +#include "sph_taitwater_cl.h" +#elif defined(USE_CUDART) +const char *sph_taitwater=0; +#else +#include "sph_taitwater_cubin.h" +#endif + +#include "lal_sph_taitwater.h" +#include +namespace LAMMPS_AL { +#define SPHTaitwaterT SPHTaitwater + +extern Device device; + +template +SPHTaitwaterT::SPHTaitwater() : BaseSPH(), _allocated(false) { + _max_drhoE_size = 0; +} + +template +SPHTaitwaterT::~SPHTaitwater() { + clear(); +} + +template +int SPHTaitwaterT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int SPHTaitwaterT::init(const int ntypes, double **host_cutsq, + double **host_cut, double **host_viscosity, + double* host_mass, double* host_rho0, + double* host_soundspeed, double* host_B, const int dimension, + double *host_special_lj, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + + int success; + int extra_fields = 4; // round up to accomodate quadruples of numtyp values + // rho + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,sph_taitwater,"k_sph_taitwater", + onetype,extra_fields); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,coeff,host_write,host_viscosity, + host_cut, host_cutsq); + + UCL_H_Vec dview_coeff2(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); + for (int i = 0; i < ntypes; i++) { + dview_coeff2[i].x = host_mass[i]; + dview_coeff2[i].y = host_rho0[i]; + dview_coeff2[i].z = host_soundspeed[i]; + dview_coeff2[i].w = host_B[i]; + } + coeff2.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); + ucl_copy(coeff2,dview_coeff2,false); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + // allocate per-atom array Q + + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + + _max_drhoE_size=static_cast(static_cast(ef_nall)*1.10); + drhoE.alloc(_max_drhoE_size*2,*(this->ucl_device),UCL_READ_WRITE,UCL_READ_WRITE); + + _dimension = dimension; + + _allocated=true; + this->_max_bytes=coeff.row_bytes()+coeff2.row_bytes()+drhoE.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void SPHTaitwaterT::clear() { + if (!_allocated) + return; + _allocated=false; + + coeff.clear(); + coeff2.clear(); + drhoE.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double SPHTaitwaterT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(SPHTaitwater); +} + +template +void SPHTaitwaterT::update_drhoE(void **drhoE_ptr) { + *drhoE_ptr=drhoE.host.begin(); + drhoE.update_host(_max_drhoE_size*2,false); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int SPHTaitwaterT::loop(const int eflag, const int vflag) { + + int nall = this->atom->nall(); + + // Resize drhoE array if necessary + if (nall > _max_drhoE_size) { + _max_drhoE_size=static_cast(static_cast(nall)*1.10); + drhoE.resize(_max_drhoE_size*2); + } + + // signal that we need to transfer extra data from the host + + this->atom->extra_data_unavail(); + + numtyp4 *pextra=reinterpret_cast(&(this->atom->extra[0])); + + int n = 0; + int nstride = 1; + for (int i = 0; i < nall; i++) { + int idx = n+i*nstride; + numtyp4 v; + v.x = rho[i]; + v.y = 0; + v.z = 0; + v.w = 0; + pextra[idx] = v; + } + this->atom->add_extra_data(); + + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &this->atom->extra, &coeff, &coeff2, + &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &drhoE, &eflag, &vflag, + &ainum, &nbor_pitch, &this->atom->v, &_dimension, &this->_threads_per_atom); + } + + this->time_pair.stop(); + return GX; +} + +// --------------------------------------------------------------------------- +// Get the extra data pointers from host +// --------------------------------------------------------------------------- + +template +void SPHTaitwaterT::get_extra_data(double *host_rho) { + rho = host_rho; +} + +template class SPHTaitwater; +} diff --git a/lib/gpu/lal_sph_taitwater.cu b/lib/gpu/lal_sph_taitwater.cu new file mode 100644 index 00000000000..708d3ae43bc --- /dev/null +++ b/lib/gpu/lal_sph_taitwater.cu @@ -0,0 +1,377 @@ +// ************************************************************************** +// sph_taitwater.cu +// ------------------- +// Trung Dac Nguyen (U Chicago) +// +// Device code for acceleration of the sph/taitwater pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : September 2023 +// email : ndactrung@gmail.com +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +_texture( vel_tex,float4); +#else +_texture_2d( pos_tex,int4); +_texture_2d( vel_tex,int4); +#endif +#else +#define pos_tex x_ +#define vel_tex v_ +#endif + +#if (SHUFFLE_AVAIL == 0) + +#define store_drhoE(drhoEacc, ii, inum, tid, t_per_atom, offset, drhoE) \ + if (t_per_atom>1) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, \ + drhoEacc.x, drhoEacc.y); \ + } \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + drhoEacc.x += shfl_down(drhoEacc.x, s, t_per_atom); \ + drhoEacc.y += shfl_down(drhoEacc.y, s, t_per_atom); \ + } \ + } \ + if (offset==0 && ii +class SPHTaitwater : public BaseSPH { + public: + SPHTaitwater(); + ~SPHTaitwater(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double** host_cut, double **host_viscosity, double *host_mass, + double* host_rho0, double* host_soundspeed, double* host_B, + const int dimension, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + void get_extra_data(double *host_rho); + + /// copy drho and desph from device to host + void update_drhoE(void **drhoE_ptr); + + // --------------------------- TYPE DATA -------------------------- + + /// per-pair coeffs: coeff.x = viscosity, coeff.y = cut, coeff.z = cutsq + UCL_D_Vec coeff; + + /// per-type coeffs + UCL_D_Vec coeff2; + + /// Special LJ values + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + /// Per-atom arrays + UCL_Vector drhoE; + int _max_drhoE_size; + + int _dimension; + + /// pointer to host data + double *rho; + + private: + bool _allocated; + int loop(const int eflag, const int vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_sph_taitwater_ext.cpp b/lib/gpu/lal_sph_taitwater_ext.cpp new file mode 100644 index 00000000000..9d125a63958 --- /dev/null +++ b/lib/gpu/lal_sph_taitwater_ext.cpp @@ -0,0 +1,133 @@ +/*************************************************************************** + sph_taitwater_ext.cpp + ------------------- + Trung Dac Nguyen (U Chicago) + + Functions for LAMMPS access to sph taitwater acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : December 2023 + email : ndactrung@gmail.com + ***************************************************************************/ + +#include +#include +#include + +#include "lal_sph_taitwater.h" + +using namespace std; +using namespace LAMMPS_AL; + +static SPHTaitwater SPHTaitwaterMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, + double **host_viscosity, double* host_mass, + double* host_rho0, double* host_soundspeed, double* host_B, + const int dimension, double *special_lj, + const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { + SPHTaitwaterMF.clear(); + gpu_mode=SPHTaitwaterMF.device->gpu_mode(); + double gpu_split=SPHTaitwaterMF.device->particle_split(); + int first_gpu=SPHTaitwaterMF.device->first_device(); + int last_gpu=SPHTaitwaterMF.device->last_device(); + int world_me=SPHTaitwaterMF.device->world_me(); + int gpu_rank=SPHTaitwaterMF.device->gpu_rank(); + int procs_per_gpu=SPHTaitwaterMF.device->procs_per_gpu(); + + SPHTaitwaterMF.device->init_message(screen,"sph_taitwater",first_gpu,last_gpu); + + bool message=false; + if (SPHTaitwaterMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=SPHTaitwaterMF.init(ntypes, cutsq, host_cut, host_viscosity, host_mass, + host_rho0, host_soundspeed, host_B, dimension, + special_lj, inum, nall, max_nbors, maxspecial, + cell_size, gpu_split, screen); + + SPHTaitwaterMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; iserialize_init(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + SPHTaitwaterMF.estimate_gpu_overhead(); + return init_ok; +} + +void sph_taitwater_gpu_clear() { + SPHTaitwaterMF.clear(); +} + +int ** sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *host_tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v) { + return SPHTaitwaterMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, host_tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_v); +} + +void sph_taitwater_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *host_tag, + double **host_v, const int nlocal) { + SPHTaitwaterMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj, + firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, + host_tag, host_v, nlocal); +} + +void sph_taitwater_gpu_get_extra_data(double *host_rho) { + SPHTaitwaterMF.get_extra_data(host_rho); +} + +void sph_taitwater_gpu_update_drhoE(void **drhoE_ptr) { + SPHTaitwaterMF.update_drhoE(drhoE_ptr); +} + +double sph_taitwater_gpu_bytes() { + return SPHTaitwaterMF.host_memory_usage(); +} diff --git a/src/GPU/pair_coul_slater_long_gpu.cpp b/src/GPU/pair_coul_slater_long_gpu.cpp new file mode 100644 index 00000000000..4ace8bd7619 --- /dev/null +++ b/src/GPU/pair_coul_slater_long_gpu.cpp @@ -0,0 +1,254 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_coul_slater_long_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "kspace.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" + +#include + +#define EWALD_F 1.12837917 +#define EWALD_P 0.3275911 +#define A1 0.254829592 +#define A2 -0.284496736 +#define A3 1.421413741 +#define A4 -1.453152027 +#define A5 1.061405429 + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int csl_gpu_init(const int ntypes, double **scale, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald, const double lamda); +void csl_gpu_reinit(const int ntypes, double **scale); +void csl_gpu_clear(); +int **csl_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, double *boxlo, + double *prd); +void csl_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, double *boxlo, double *prd); +double csl_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairCoulSlaterLongGPU::PairCoulSlaterLongGPU(LAMMPS *lmp) : PairCoulSlaterLong(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairCoulSlaterLongGPU::~PairCoulSlaterLongGPU() +{ + csl_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairCoulSlaterLongGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = csl_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, + atom->tag, atom->nspecial, atom->special, eflag, vflag, + eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + csl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); + if (host_start < inum) { + cpu_time = platform::walltime(); + cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); + cpu_time = platform::walltime() - cpu_time; + } +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairCoulSlaterLongGPU::init_style() +{ + if (!atom->q_flag) error->all(FLERR, "Pair style coul/slater/long/gpu requires atom attribute q"); + + // Call init_one calculation make sure scale is correct + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { init_one(i, j); } + } + } + double cell_size = cut_coul + neighbor->skin; + + cut_coulsq = cut_coul * cut_coul; + + // ensure use of KSpace long-range solver, set g_ewald + + if (force->kspace == nullptr) error->all(FLERR, "Pair style requires a KSpace style"); + g_ewald = force->kspace->g_ewald; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = csl_gpu_init(atom->ntypes + 1, scale, atom->nlocal, atom->nlocal + atom->nghost, mnf, + maxspecial, cell_size, gpu_mode, screen, cut_coulsq, + force->special_coul, force->qqrd2e, g_ewald, lamda); + + GPU_EXTRA::check_flag(success, error, world); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +void PairCoulSlaterLongGPU::reinit() +{ + Pair::reinit(); + + csl_gpu_reinit(atom->ntypes + 1, scale); +} + +/* ---------------------------------------------------------------------- */ + +double PairCoulSlaterLongGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + csl_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairCoulSlaterLongGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, int *ilist, + int *numneigh, int **firstneigh) +{ + int i, j, ii, jj, jnum; + double qtmp, xtmp, ytmp, ztmp, delx, dely, delz, ecoul, fpair; + double r, r2inv, forcecoul, factor_coul; + double grij, expm2, prefactor, t, erfc; + int *jlist; + double rsq; + + ecoul = 0.0; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + double *special_coul = force->special_coul; + double qqrd2e = force->qqrd2e; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_coul = special_coul[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx * delx + dely * dely + delz * delz; + + r2inv = 1.0 / rsq; + + if (rsq < cut_coulsq) { + r2inv = 1.0/rsq; + r = sqrt(rsq); + grij = g_ewald * r; + expm2 = exp(-grij*grij); + t = 1.0 / (1.0 + EWALD_P*grij); + erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2; + double slater_term = exp(-2*r/lamda)*(1 + (2*r/lamda*(1+r/lamda))); + prefactor = qqrd2e * qtmp*q[j]/r; + forcecoul = prefactor * (erfc + EWALD_F*grij*expm2 - slater_term); + if (factor_coul < 1.0) forcecoul -= (1.0-factor_coul)*prefactor*(1-slater_term); + + fpair = forcecoul * r2inv; + + f[i][0] += delx * fpair; + f[i][1] += dely * fpair; + f[i][2] += delz * fpair; + + if (eflag) { + if (rsq < cut_coulsq) { + ecoul = prefactor*(erfc - (1 + r/lamda)*exp(-2*r/lamda)); + if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor*(1.0-(1 + r/lamda)*exp(-2*r/lamda)); + } else + ecoul = 0.0; + } + + if (evflag) ev_tally_full(i, 0.0, ecoul, fpair, delx, dely, delz); + } + } + } +} diff --git a/src/GPU/pair_coul_slater_long_gpu.h b/src/GPU/pair_coul_slater_long_gpu.h new file mode 100644 index 00000000000..4a30a71d25a --- /dev/null +++ b/src/GPU/pair_coul_slater_long_gpu.h @@ -0,0 +1,46 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(coul/slater/long/gpu,PairCoulSlaterLongGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_COUL_SLATER_LONG_GPU_H +#define LMP_PAIR_COUL_SLATER_LONG_GPU_H + +#include "pair_coul_slater_long.h" + +namespace LAMMPS_NS { + +class PairCoulSlaterLongGPU : public PairCoulSlaterLong { + public: + PairCoulSlaterLongGPU(LAMMPS *lmp); + ~PairCoulSlaterLongGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + void reinit() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/GPU/pair_edpd_gpu.cpp b/src/GPU/pair_edpd_gpu.cpp new file mode 100644 index 00000000000..5bee0cadb81 --- /dev/null +++ b/src/GPU/pair_edpd_gpu.cpp @@ -0,0 +1,195 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_edpd_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" +#include "update.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int edpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, double **host_gamma, + double **host_cut, double **host_power, double **host_kappa, + double **host_powerT, double** host_cutT, double*** host_sc, double ***host_kc, + double *host_mass, double *special_lj, const int power_flag, const int kappa_flag, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); +void edpd_gpu_clear(); +int **edpd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double **host_v, + const double dtinvsqrt, const int seed, const int timestep, double *boxlo, + double *prd); +void edpd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, double **host_v, + const double dtinvsqrt, const int seed, const int timestep, const int nlocal, + double *boxlo, double *prd); +void edpd_gpu_get_extra_data(double *host_T, double *host_cv); +void edpd_gpu_update_flux(void **flux_ptr); +double edpd_gpu_bytes(); + +#define EPSILON 1.0e-10 + +/* ---------------------------------------------------------------------- */ + +PairEDPDGPU::PairEDPDGPU(LAMMPS *lmp) : PairEDPD(lmp), gpu_mode(GPU_FORCE) +{ + flux_pinned = nullptr; + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairEDPDGPU::~PairEDPDGPU() +{ + edpd_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairEDPDGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + double dtinvsqrt = 1.0 / sqrt(update->dt); + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double *T = atom->edpd_temp; + double *cv = atom->edpd_cv; + edpd_gpu_get_extra_data(T, cv); + + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = edpd_gpu_compute_n( + neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success, atom->v, dtinvsqrt, seed, update->ntimestep, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + edpd_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->tag, + atom->v, dtinvsqrt, seed, update->ntimestep, atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + // get the heat flux from device + + double *Q = atom->edpd_flux; + edpd_gpu_update_flux(&flux_pinned); + + int nlocal = atom->nlocal; + if (acc_float) { + auto flux_ptr = (float *)flux_pinned; + for (int i = 0; i < nlocal; i++) + Q[i] = flux_ptr[i]; + + } else { + auto flux_ptr = (double *)flux_pinned; + for (int i = 0; i < nlocal; i++) + Q[i] = flux_ptr[i]; + } + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairEDPDGPU::init_style() +{ + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i, j); + mcut *= mcut; + if (mcut > maxcut) maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + edpd_gpu_init(atom->ntypes + 1, cutsq, a0, gamma, cut, power, kappa, + powerT, cutT, sc, kc, atom->mass, force->special_lj, + power_flag, kappa_flag, atom->nlocal, atom->nlocal + atom->nghost, + mnf, maxspecial, cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success, error, world); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairEDPDGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + edpd_gpu_bytes(); +} diff --git a/src/GPU/pair_edpd_gpu.h b/src/GPU/pair_edpd_gpu.h new file mode 100644 index 00000000000..75495b2ca4a --- /dev/null +++ b/src/GPU/pair_edpd_gpu.h @@ -0,0 +1,48 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(edpd/gpu,PairEDPDGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_EDPD_GPU_H +#define LMP_PAIR_EDPD_GPU_H + +#include "pair_edpd.h" + +namespace LAMMPS_NS { + +class PairEDPDGPU : public PairEDPD { + public: + PairEDPDGPU(LAMMPS *lmp); + ~PairEDPDGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void *flux_pinned; + bool acc_float; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp new file mode 100644 index 00000000000..cfde3ab632d --- /dev/null +++ b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.cpp @@ -0,0 +1,249 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_lj_cut_coul_cut_soft_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int ljcs_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, double **epsilon, double *special_lj, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, const double qqrd2e); +void ljcs_gpu_clear(); +int **ljcs_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, double *boxlo, + double *prd); +void ljcs_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, double *boxlo, double *prd); +double ljcs_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairLJCutCoulCutSoftGPU::PairLJCutCoulCutSoftGPU(LAMMPS *lmp) : + PairLJCutCoulCutSoft(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairLJCutCoulCutSoftGPU::~PairLJCutCoulCutSoftGPU() +{ + ljcs_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulCutSoftGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = ljcs_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, + atom->tag, atom->nspecial, atom->special, eflag, vflag, + eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + ljcs_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); + if (host_start < inum) { + cpu_time = platform::walltime(); + cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); + cpu_time = platform::walltime() - cpu_time; + } +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairLJCutCoulCutSoftGPU::init_style() +{ + if (!atom->q_flag) error->all(FLERR, "Pair style lj/cut/coul/cut/soft/gpu requires atom attribute q"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i, j); + cut *= cut; + if (cut > maxcut) maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + ljcs_gpu_init(atom->ntypes + 1, cutsq, lj1, lj2, lj3, lj4, offset, epsilon, force->special_lj, + atom->nlocal, atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, + screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); + GPU_EXTRA::check_flag(success, error, world); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairLJCutCoulCutSoftGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + ljcs_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulCutSoftGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, int *ilist, + int *numneigh, int **firstneigh) +{ + int i, j, ii, jj, jnum, itype, jtype; + double qtmp, xtmp, ytmp, ztmp, delx, dely, delz, evdwl, ecoul, fpair; + double forcecoul, forcelj, factor_coul, factor_lj; + double denc, denlj, r4sig6; + int *jlist; + double rsq; + + evdwl = ecoul = 0.0; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + int *type = atom->type; + double *special_coul = force->special_coul; + double *special_lj = force->special_lj; + double qqrd2e = force->qqrd2e; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + factor_coul = special_coul[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx * delx + dely * dely + delz * delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + + if (rsq < cut_coulsq[itype][jtype]) { + denc = sqrt(lj4[itype][jtype] + rsq); + forcecoul = qqrd2e * lj1[itype][jtype] * qtmp*q[j] / (denc*denc*denc); + } else forcecoul = 0.0; + + if (rsq < cut_ljsq[itype][jtype]) { + r4sig6 = rsq*rsq / lj2[itype][jtype]; + denlj = lj3[itype][jtype] + rsq*r4sig6; + forcelj = lj1[itype][jtype] * epsilon[itype][jtype] * + (48.0*r4sig6/(denlj*denlj*denlj) - 24.0*r4sig6/(denlj*denlj)); + } else forcelj = 0.0; + + fpair = factor_coul*forcecoul + factor_lj*forcelj; + + f[i][0] += delx * fpair; + f[i][1] += dely * fpair; + f[i][2] += delz * fpair; + + if (eflag) { + if (rsq < cut_coulsq[itype][jtype]) + ecoul = factor_coul * qqrd2e * lj1[itype][jtype] * qtmp*q[j] / denc; + else + ecoul = 0.0; + if (rsq < cut_ljsq[itype][jtype]) { + evdwl = lj1[itype][jtype] * 4.0 * epsilon[itype][jtype] * + (1.0/(denlj*denlj) - 1.0/denlj) - offset[itype][jtype]; + evdwl *= factor_lj; + } else + evdwl = 0.0; + } + + if (evflag) ev_tally_full(i, evdwl, ecoul, fpair, delx, dely, delz); + } + } + } +} diff --git a/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h new file mode 100644 index 00000000000..0776695ba3d --- /dev/null +++ b/src/GPU/pair_lj_cut_coul_cut_soft_gpu.h @@ -0,0 +1,45 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(lj/cut/coul/cut/soft/gpu,PairLJCutCoulCutSoftGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_LJ_CUT_COUL_CUT_SOFT_GPU_H +#define LMP_PAIR_LJ_CUT_COUL_CUT_SOFT_GPU_H + +#include "pair_lj_cut_coul_cut_soft.h" + +namespace LAMMPS_NS { + +class PairLJCutCoulCutSoftGPU : public PairLJCutCoulCutSoft { + public: + PairLJCutCoulCutSoftGPU(LAMMPS *lmp); + ~PairLJCutCoulCutSoftGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp b/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp new file mode 100644 index 00000000000..e8342b65308 --- /dev/null +++ b/src/GPU/pair_lj_cut_coul_long_soft_gpu.cpp @@ -0,0 +1,297 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_lj_cut_coul_long_soft_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "kspace.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" + +#include + +#define EWALD_F 1.12837917 +#define EWALD_P 0.3275911 +#define A1 0.254829592 +#define A2 -0.284496736 +#define A3 1.421413741 +#define A4 -1.453152027 +#define A5 1.061405429 + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int ljcls_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, double **epsilon, double *special_lj, + const int nlocal, const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, + double host_cut_coulsq, double *host_special_coul, const double qqrd2e, + const double g_ewald); +void ljcls_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, double **epsilon, + double **host_lj_cutsq); +void ljcls_gpu_clear(); +int **ljcls_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, double *boxlo, + double *prd); +void ljcls_gpu_compute(const int ago, const int inum, const int nall, double **host_x, + int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd); +double ljcls_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairLJCutCoulLongSoftGPU::PairLJCutCoulLongSoftGPU(LAMMPS *lmp) : + PairLJCutCoulLongSoft(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairLJCutCoulLongSoftGPU::~PairLJCutCoulLongSoftGPU() +{ + ljcls_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulLongSoftGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = ljcls_gpu_compute_n(neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, + atom->tag, atom->nspecial, atom->special, eflag, vflag, + eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + ljcls_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + if (host_start < inum) { + cpu_time = platform::walltime(); + cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); + cpu_time = platform::walltime() - cpu_time; + } +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairLJCutCoulLongSoftGPU::init_style() +{ + cut_respa = nullptr; + + if (!atom->q_flag) error->all(FLERR, "Pair style lj/cut/coul/long/soft/gpu requires atom attribute q"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i, j); + cut *= cut; + if (cut > maxcut) maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + cut_coulsq = cut_coul * cut_coul; + + // insure use of KSpace long-range solver, set g_ewald + + if (force->kspace == nullptr) error->all(FLERR, "Pair style requires a KSpace style"); + g_ewald = force->kspace->g_ewald; + + // setup force tables + + if (ncoultablebits) init_tables(cut_coul, cut_respa); + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + ljcls_gpu_init(atom->ntypes + 1, cutsq, lj1, lj2, lj3, lj4, offset, epsilon, force->special_lj, + atom->nlocal, atom->nlocal + atom->nghost, mnf, maxspecial, cell_size, gpu_mode, + screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); + GPU_EXTRA::check_flag(success, error, world); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulLongSoftGPU::reinit() +{ + Pair::reinit(); + + ljcls_gpu_reinit(atom->ntypes + 1, cutsq, lj1, lj2, lj3, lj4, offset, epsilon, cut_ljsq); +} + +/* ---------------------------------------------------------------------- */ + +double PairLJCutCoulLongSoftGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + ljcls_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCutCoulLongSoftGPU::cpu_compute(int start, int inum, int eflag, int /* vflag */, int *ilist, + int *numneigh, int **firstneigh) +{ + int i, j, ii, jj, jnum, itype, jtype; + double qtmp, xtmp, ytmp, ztmp, delx, dely, delz, evdwl, ecoul, fpair; + double r, r2inv, forcecoul, forcelj, factor_coul, factor_lj; + double denc, denlj, r4sig6; + double grij, expm2, prefactor, t, erfc; + int *jlist; + double rsq; + + evdwl = ecoul = 0.0; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + int *type = atom->type; + double *special_coul = force->special_coul; + double *special_lj = force->special_lj; + double qqrd2e = force->qqrd2e; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + factor_coul = special_coul[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx * delx + dely * dely + delz * delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + r2inv = 1.0 / rsq; + + if (rsq < cut_coulsq) { + r = sqrt(rsq); + grij = g_ewald * r; + expm2 = exp(-grij * grij); + t = 1.0 / (1.0 + EWALD_P * grij); + erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2; + + denc = sqrt(lj4[itype][jtype] + rsq); + prefactor = qqrd2e * lj1[itype][jtype] * qtmp*q[j] / (denc*denc*denc); + + forcecoul = prefactor * (erfc + EWALD_F * grij * expm2); + if (factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor; + } else + forcecoul = 0.0; + + if (rsq < cut_ljsq[itype][jtype]) { + r4sig6 = rsq*rsq / lj2[itype][jtype]; + denlj = lj3[itype][jtype] + rsq*r4sig6; + forcelj = lj1[itype][jtype] * epsilon[itype][jtype] * + (48.0*r4sig6/(denlj*denlj*denlj) - 24.0*r4sig6/(denlj*denlj)); + } else + forcelj = 0.0; + + fpair = (forcecoul + factor_lj * forcelj) * r2inv; + + f[i][0] += delx * fpair; + f[i][1] += dely * fpair; + f[i][2] += delz * fpair; + + if (eflag) { + if (rsq < cut_coulsq) { + prefactor = qqrd2e * lj1[itype][jtype] * qtmp*q[j] / denc; + ecoul = prefactor*erfc; + } else + ecoul = 0.0; + + if (rsq < cut_ljsq[itype][jtype]) { + evdwl = lj1[itype][jtype] * 4.0 * epsilon[itype][jtype] * + (1.0/(denlj*denlj) - 1.0/denlj) - offset[itype][jtype]; + evdwl *= factor_lj; + } else + evdwl = 0.0; + } + + if (evflag) ev_tally_full(i, evdwl, ecoul, fpair, delx, dely, delz); + } + } + } +} diff --git a/src/GPU/pair_lj_cut_coul_long_soft_gpu.h b/src/GPU/pair_lj_cut_coul_long_soft_gpu.h new file mode 100644 index 00000000000..cb6790d3332 --- /dev/null +++ b/src/GPU/pair_lj_cut_coul_long_soft_gpu.h @@ -0,0 +1,46 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS Development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(lj/cut/coul/long/soft/gpu,PairLJCutCoulLongSoftGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_SOFT_GPU_H +#define LMP_PAIR_LJ_CUT_COUL_LONG_SOFT_GPU_H + +#include "pair_lj_cut_coul_long_soft.h" + +namespace LAMMPS_NS { + +class PairLJCutCoulLongSoftGPU : public PairLJCutCoulLongSoft { + public: + PairLJCutCoulLongSoftGPU(LAMMPS *lmp); + ~PairLJCutCoulLongSoftGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + void reinit() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/GPU/pair_mdpd_gpu.cpp b/src/GPU/pair_mdpd_gpu.cpp new file mode 100644 index 00000000000..bebe1e97360 --- /dev/null +++ b/src/GPU/pair_mdpd_gpu.cpp @@ -0,0 +1,171 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_mdpd_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" +#include "update.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int mdpd_gpu_init(const int ntypes, double **cutsq, double **host_A_att, double **host_B_rep, + double **host_gamma, double **host_sigma, double **host_cut, double **host_cut_r, + double *special_lj, const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); +void mdpd_gpu_clear(); +int **mdpd_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double **host_v, + const double dtinvsqrt, const int seed, const int timestep, double *boxlo, + double *prd); +void mdpd_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, + int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, double **host_v, + const double dtinvsqrt, const int seed, const int timestep, const int nlocal, + double *boxlo, double *prd); +void mdpd_gpu_get_extra_data(double *host_rho); +double mdpd_gpu_bytes(); + +#define EPSILON 1.0e-10 + +/* ---------------------------------------------------------------------- */ + +PairMDPDGPU::PairMDPDGPU(LAMMPS *lmp) : PairMDPD(lmp), gpu_mode(GPU_FORCE) +{ + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairMDPDGPU::~PairMDPDGPU() +{ + mdpd_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairMDPDGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + double dtinvsqrt = 1.0 / sqrt(update->dt); + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double *rho = atom->rho; + mdpd_gpu_get_extra_data(rho); + + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = mdpd_gpu_compute_n( + neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success, atom->v, dtinvsqrt, seed, update->ntimestep, domain->boxlo, domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + mdpd_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, atom->tag, + atom->v, dtinvsqrt, seed, update->ntimestep, atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairMDPDGPU::init_style() +{ + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i, j); + mcut *= mcut; + if (mcut > maxcut) maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + mdpd_gpu_init(atom->ntypes + 1, cutsq, A_att, B_rep, gamma, sigma, + cut, cut_r, force->special_lj, + atom->nlocal, atom->nlocal + atom->nghost, + mnf, maxspecial, cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success, error, world); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairMDPDGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + mdpd_gpu_bytes(); +} diff --git a/src/GPU/pair_mdpd_gpu.h b/src/GPU/pair_mdpd_gpu.h new file mode 100644 index 00000000000..5f27c4014e8 --- /dev/null +++ b/src/GPU/pair_mdpd_gpu.h @@ -0,0 +1,45 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(mdpd/gpu,PairMDPDGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_MDPD_GPU_H +#define LMP_PAIR_MDPD_GPU_H + +#include "pair_mdpd.h" + +namespace LAMMPS_NS { + +class PairMDPDGPU : public PairMDPD { + public: + PairMDPDGPU(LAMMPS *lmp); + ~PairMDPDGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/GPU/pair_sph_heatconduction_gpu.cpp b/src/GPU/pair_sph_heatconduction_gpu.cpp new file mode 100644 index 00000000000..0f0aa079c8b --- /dev/null +++ b/src/GPU/pair_sph_heatconduction_gpu.cpp @@ -0,0 +1,196 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_sph_heatconduction_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" +#include "update.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int sph_heatconduction_gpu_init(const int ntypes, double **cutsq, double** host_cut, + double **host_alpha, double* host_mass, + const int dimension, double *special_lj, + const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); +void sph_heatconduction_gpu_clear(); +int **sph_heatconduction_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *host_tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v); +void sph_heatconduction_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *host_tag, + double **host_v, const int nlocal); +void sph_heatconduction_gpu_get_extra_data(double *host_rho, double *host_esph); +void sph_heatconduction_gpu_update_dE(void **dE_ptr); +double sph_heatconduction_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairSPHHeatConductionGPU::PairSPHHeatConductionGPU(LAMMPS *lmp) : + PairSPHHeatConduction(lmp), gpu_mode(GPU_FORCE) +{ + dE_pinned = nullptr; + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairSPHHeatConductionGPU::~PairSPHHeatConductionGPU() +{ + sph_heatconduction_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairSPHHeatConductionGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double *rho = atom->rho; + double *esph = atom->esph; + sph_heatconduction_gpu_get_extra_data(rho, esph); + + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = sph_heatconduction_gpu_compute_n( + neighbor->ago, inum, nall, atom->x, atom->type, + sublo, subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag, + eflag_atom, vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success, atom->v); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + sph_heatconduction_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, + eflag_atom, vflag_atom, host_start, cpu_time, success, + atom->tag, atom->v, atom->nlocal); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + // get the drho and dE from device + + double *desph = atom->desph; + sph_heatconduction_gpu_update_dE(&dE_pinned); + + int nlocal = atom->nlocal; + if (acc_float) { + auto dE_ptr = (float *)dE_pinned; + for (int i = 0; i < nlocal; i++) { + desph[i] = dE_ptr[i]; + } + + } else { + auto dE_ptr = (double *)dE_pinned; + for (int i = 0; i < nlocal; i++) { + desph[i] = dE_ptr[i]; + } + } + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairSPHHeatConductionGPU::init_style() +{ + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i, j); + mcut *= mcut; + if (mcut > maxcut) maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + sph_heatconduction_gpu_init(atom->ntypes + 1, cutsq, cut, alpha, atom->mass, + domain->dimension, force->special_lj, atom->nlocal, + atom->nlocal + atom->nghost, + mnf, maxspecial, cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success, error, world); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairSPHHeatConductionGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + sph_heatconduction_gpu_bytes(); +} diff --git a/src/GPU/pair_sph_heatconduction_gpu.h b/src/GPU/pair_sph_heatconduction_gpu.h new file mode 100644 index 00000000000..571334017db --- /dev/null +++ b/src/GPU/pair_sph_heatconduction_gpu.h @@ -0,0 +1,48 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(sph/heatconduction/gpu,PairSPHHeatConductionGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_SPH_HEATCONDUCTION_GPU_H +#define LMP_PAIR_SPH_HEATCONDUCTION_GPU_H + +#include "pair_sph_heatconduction.h" + +namespace LAMMPS_NS { + +class PairSPHHeatConductionGPU : public PairSPHHeatConduction { + public: + PairSPHHeatConductionGPU(LAMMPS *lmp); + ~PairSPHHeatConductionGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void *dE_pinned; + bool acc_float; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/GPU/pair_sph_lj_gpu.cpp b/src/GPU/pair_sph_lj_gpu.cpp new file mode 100644 index 00000000000..942a3c33bd4 --- /dev/null +++ b/src/GPU/pair_sph_lj_gpu.cpp @@ -0,0 +1,204 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_sph_lj_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" +#include "update.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int sph_lj_gpu_init(const int ntypes, double **cutsq, double** host_cut, + double **host_viscosity, double* host_mass, + const int dimension, double *special_lj, + const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); +void sph_lj_gpu_clear(); +int **sph_lj_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *host_tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v); +void sph_lj_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *host_tag, + double **host_v, const int nlocal); +void sph_lj_gpu_get_extra_data(double *host_rho, double *host_esph, + double *host_cv); +void sph_lj_gpu_update_drhoE(void **drhoE_ptr); +double sph_lj_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairSPHLJGPU::PairSPHLJGPU(LAMMPS *lmp) : PairSPHLJ(lmp), gpu_mode(GPU_FORCE) +{ + drhoE_pinned = nullptr; + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairSPHLJGPU::~PairSPHLJGPU() +{ + sph_lj_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairSPHLJGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double *rho = atom->rho; + double *esph = atom->esph; + double *cv = atom->cv; + sph_lj_gpu_get_extra_data(rho, esph, cv); + + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = sph_lj_gpu_compute_n( + neighbor->ago, inum, nall, atom->x, atom->type, + sublo, subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag, + eflag_atom, vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success, atom->v); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + sph_lj_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, + eflag_atom, vflag_atom, host_start, cpu_time, success, + atom->tag, atom->v, atom->nlocal); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + // get the drho and dE from device + + double *drho = atom->drho; + double *desph = atom->desph; + sph_lj_gpu_update_drhoE(&drhoE_pinned); + + int nlocal = atom->nlocal; + if (acc_float) { + auto drhoE_ptr = (float *)drhoE_pinned; + int idx = 0; + for (int i = 0; i < nlocal; i++) { + drho[i] = drhoE_ptr[idx]; + desph[i] = drhoE_ptr[idx+1]; + idx += 2; + } + + } else { + auto drhoE_ptr = (double *)drhoE_pinned; + int idx = 0; + for (int i = 0; i < nlocal; i++) { + drho[i] = drhoE_ptr[idx]; + desph[i] = drhoE_ptr[idx+1]; + idx += 2; + } + } + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairSPHLJGPU::init_style() +{ + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i, j); + mcut *= mcut; + if (mcut > maxcut) maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + sph_lj_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, atom->mass, + domain->dimension, force->special_lj, atom->nlocal, + atom->nlocal + atom->nghost, + mnf, maxspecial, cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success, error, world); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairSPHLJGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + sph_lj_gpu_bytes(); +} diff --git a/src/GPU/pair_sph_lj_gpu.h b/src/GPU/pair_sph_lj_gpu.h new file mode 100644 index 00000000000..9aae3c2d6ab --- /dev/null +++ b/src/GPU/pair_sph_lj_gpu.h @@ -0,0 +1,48 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(sph/lj/gpu,PairSPHLJGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_SPH_LJ_GPU_H +#define LMP_PAIR_SPH_LJ_GPU_H + +#include "pair_sph_lj.h" + +namespace LAMMPS_NS { + +class PairSPHLJGPU : public PairSPHLJ { + public: + PairSPHLJGPU(LAMMPS *lmp); + ~PairSPHLJGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void *drhoE_pinned; + bool acc_float; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/src/GPU/pair_sph_taitwater_gpu.cpp b/src/GPU/pair_sph_taitwater_gpu.cpp new file mode 100644 index 00000000000..37a1b0feb5b --- /dev/null +++ b/src/GPU/pair_sph_taitwater_gpu.cpp @@ -0,0 +1,199 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Trung Dac Nguyen (U Chicago) +------------------------------------------------------------------------- */ + +#include "pair_sph_taitwater_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "info.h" +#include "neigh_list.h" +#include "neighbor.h" +#include "suffix.h" +#include "update.h" + +#include + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int sph_taitwater_gpu_init(const int ntypes, double **cutsq, double** host_cut, + double **host_viscosity, double* host_mass, double* host_rho0, + double* host_soundspeed, double* host_B, const int dimension, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); +void sph_taitwater_gpu_clear(); +int **sph_taitwater_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, + double **host_v); +void sph_taitwater_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const int nlocal); +void sph_taitwater_gpu_get_extra_data(double *host_rho); +void sph_taitwater_gpu_update_drhoE(void **drhoE_ptr); +double sph_taitwater_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairSPHTaitwaterGPU::PairSPHTaitwaterGPU(LAMMPS *lmp) : PairSPHTaitwater(lmp), gpu_mode(GPU_FORCE) +{ + drhoE_pinned = nullptr; + respa_enable = 0; + reinitflag = 0; + cpu_time = 0.0; + suffix_flag |= Suffix::GPU; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairSPHTaitwaterGPU::~PairSPHTaitwaterGPU() +{ + sph_taitwater_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairSPHTaitwaterGPU::compute(int eflag, int vflag) +{ + ev_init(eflag, vflag); + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + + double *rho = atom->rho; + sph_taitwater_gpu_get_extra_data(rho); + + if (gpu_mode != GPU_FORCE) { + double sublo[3], subhi[3]; + if (domain->triclinic == 0) { + sublo[0] = domain->sublo[0]; + sublo[1] = domain->sublo[1]; + sublo[2] = domain->sublo[2]; + subhi[0] = domain->subhi[0]; + subhi[1] = domain->subhi[1]; + subhi[2] = domain->subhi[2]; + } else { + domain->bbox(domain->sublo_lamda, domain->subhi_lamda, sublo, subhi); + } + inum = atom->nlocal; + firstneigh = sph_taitwater_gpu_compute_n( + neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success, atom->v); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + sph_taitwater_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh, + eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success, + atom->tag, atom->v, atom->nlocal); + } + if (!success) error->one(FLERR, "Insufficient memory on accelerator"); + + // get the drho and dE from device + + double *drho = atom->drho; + double *desph = atom->desph; + sph_taitwater_gpu_update_drhoE(&drhoE_pinned); + + int nlocal = atom->nlocal; + if (acc_float) { + auto drhoE_ptr = (float *)drhoE_pinned; + int idx = 0; + for (int i = 0; i < nlocal; i++) { + drho[i] = drhoE_ptr[idx]; + desph[i] = drhoE_ptr[idx+1]; + idx += 2; + } + + } else { + auto drhoE_ptr = (double *)drhoE_pinned; + int idx = 0; + for (int i = 0; i < nlocal; i++) { + drho[i] = drhoE_ptr[idx]; + desph[i] = drhoE_ptr[idx+1]; + idx += 2; + } + } + + if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) + neighbor->build_topology(); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairSPHTaitwaterGPU::init_style() +{ + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double mcut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + mcut = init_one(i, j); + mcut *= mcut; + if (mcut > maxcut) maxcut = mcut; + cutsq[i][j] = cutsq[j][i] = mcut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial = 0; + if (atom->molecular != Atom::ATOMIC) maxspecial = atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; + int success = + sph_taitwater_gpu_init(atom->ntypes + 1, cutsq, cut, viscosity, atom->mass, + rho0, soundspeed, B, domain->dimension, force->special_lj, + atom->nlocal, atom->nlocal + atom->nghost, + mnf, maxspecial, cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success, error, world); + + acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); + + if (gpu_mode == GPU_FORCE) neighbor->add_request(this, NeighConst::REQ_FULL); +} + +/* ---------------------------------------------------------------------- */ + +double PairSPHTaitwaterGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + sph_taitwater_gpu_bytes(); +} diff --git a/src/GPU/pair_sph_taitwater_gpu.h b/src/GPU/pair_sph_taitwater_gpu.h new file mode 100644 index 00000000000..df8119a3c00 --- /dev/null +++ b/src/GPU/pair_sph_taitwater_gpu.h @@ -0,0 +1,48 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(sph/taitwater/gpu,PairSPHTaitwaterGPU); +// clang-format on +#else + +#ifndef LMP_PAIR_SPH_TAITWATER_GPU_H +#define LMP_PAIR_SPH_TAITWATER_GPU_H + +#include "pair_sph_taitwater.h" + +namespace LAMMPS_NS { + +class PairSPHTaitwaterGPU : public PairSPHTaitwater { + public: + PairSPHTaitwaterGPU(LAMMPS *lmp); + ~PairSPHTaitwaterGPU() override; + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int) override; + void init_style() override; + double memory_usage() override; + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + void *drhoE_pinned; + bool acc_float; + + private: + int gpu_mode; + double cpu_time; +}; + +} // namespace LAMMPS_NS +#endif +#endif diff --git a/unittest/force-styles/tests/mol-pair-coul_slater_long.yaml b/unittest/force-styles/tests/mol-pair-coul_slater_long.yaml index ba11503a2cd..51b04f301c4 100644 --- a/unittest/force-styles/tests/mol-pair-coul_slater_long.yaml +++ b/unittest/force-styles/tests/mol-pair-coul_slater_long.yaml @@ -1,7 +1,7 @@ --- lammps_version: 23 Jun 2022 date_generated: Thu Jul 7 09:00:39 2022 -epsilon: 2e-13 +epsilon: 1e-12 skip_tests: prerequisites: ! | atom full diff --git a/unittest/force-styles/tests/mol-pair-lj_cut_coul_long_soft.yaml b/unittest/force-styles/tests/mol-pair-lj_cut_coul_long_soft.yaml index 8eca0650920..a1e89e54c0e 100644 --- a/unittest/force-styles/tests/mol-pair-lj_cut_coul_long_soft.yaml +++ b/unittest/force-styles/tests/mol-pair-lj_cut_coul_long_soft.yaml @@ -1,7 +1,7 @@ --- lammps_version: 17 Feb 2022 date_generated: Fri Mar 18 22:17:31 2022 -epsilon: 5e-12 +epsilon: 7.5e-12 skip_tests: prerequisites: ! | atom full