From a24d84eaf50532bd6ddb3309c91171c35669c827 Mon Sep 17 00:00:00 2001 From: Jeremy L Thompson Date: Thu, 9 Jan 2025 15:56:21 -0700 Subject: [PATCH] gpu - fix AtPoints transpose shift --- ...-shared-basis-tensor-at-points-templates.h | 20 +++++++++---------- ...-shared-basis-tensor-at-points-templates.h | 20 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h index 2442d648ee..32437bf4c4 100644 --- a/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h +++ b/include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h @@ -74,7 +74,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce // Contract x direction if (p < NUM_POINTS) { for (CeedInt i = 0; i < Q_1D; i++) { - atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); + atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); } } // Pull from shared to register @@ -120,7 +120,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed // Contract x direction if (p < NUM_POINTS) { for (CeedInt i = 0; i < Q_1D; i++) { - atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); + atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); } } // Pull from shared to register @@ -186,10 +186,10 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Cuda &data, const Ce if (p < NUM_POINTS) { for (CeedInt i = 0; i < Q_1D; i++) { // Note: shifting to avoid atomic adds - const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + const CeedInt ii = (i + data.t_id_x) % Q_1D; for (CeedInt j = 0; j < Q_1D; j++) { - const CeedInt jj = (j + p) % Q_1D; + const CeedInt jj = (j + data.t_id_y) % Q_1D; atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); } @@ -261,10 +261,10 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Cuda &data, const Ceed if (p < NUM_POINTS) { for (CeedInt i = 0; i < Q_1D; i++) { // Note: shifting to avoid atomic adds - const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + const CeedInt ii = (i + data.t_id_x) % Q_1D; for (CeedInt j = 0; j < Q_1D; j++) { - const CeedInt jj = (j + p) % Q_1D; + const CeedInt jj = (j + data.t_id_y) % Q_1D; atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); } @@ -343,10 +343,10 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Cuda &data, const Ce if (p < NUM_POINTS) { for (CeedInt i = 0; i < Q_1D; i++) { // Note: shifting to avoid atomic adds - const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + const CeedInt ii = (i + data.t_id_x) % Q_1D; for (CeedInt j = 0; j < Q_1D; j++) { - const CeedInt jj = ((j + p) % Q_1D); + const CeedInt jj = (j + data.t_id_y) % Q_1D; atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); } @@ -430,10 +430,10 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Cuda &data, const Ceed if (p < NUM_POINTS) { for (CeedInt i = 0; i < Q_1D; i++) { // Note: shifting to avoid atomic adds - const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + const CeedInt ii = (i + data.t_id_x) % Q_1D; for (CeedInt j = 0; j < Q_1D; j++) { - const CeedInt jj = ((j + p) % Q_1D); + const CeedInt jj = (j + data.t_id_y) % Q_1D; atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); } diff --git a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h index 6c522ac5cd..923de63395 100644 --- a/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h +++ b/include/ceed/jit-source/hip/hip-shared-basis-tensor-at-points-templates.h @@ -74,7 +74,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const Cee // Contract x direction if (p < NUM_POINTS) { for (CeedInt i = 0; i < Q_1D; i++) { - atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); + atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); } } // Pull from shared to register @@ -120,7 +120,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedI // Contract x direction if (p < NUM_POINTS) { for (CeedInt i = 0; i < Q_1D; i++) { - atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); + atomicAdd(&data.slice[comp * Q_1D + (i + data.t_id_x) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); } } // Pull from shared to register @@ -186,10 +186,10 @@ inline __device__ void InterpTransposeAtPoints2d(SharedData_Hip &data, const Cee if (p < NUM_POINTS) { for (CeedInt i = 0; i < Q_1D; i++) { // Note: shifting to avoid atomic adds - const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + const CeedInt ii = (i + data.t_id_x) % Q_1D; for (CeedInt j = 0; j < Q_1D; j++) { - const CeedInt jj = (j + p) % Q_1D; + const CeedInt jj = (j + data.t_id_y) % Q_1D; atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); } @@ -261,10 +261,10 @@ inline __device__ void GradTransposeAtPoints2d(SharedData_Hip &data, const CeedI if (p < NUM_POINTS) { for (CeedInt i = 0; i < Q_1D; i++) { // Note: shifting to avoid atomic adds - const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + const CeedInt ii = (i + data.t_id_x) % Q_1D; for (CeedInt j = 0; j < Q_1D; j++) { - const CeedInt jj = (j + p) % Q_1D; + const CeedInt jj = (j + data.t_id_y) % Q_1D; atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); } @@ -343,10 +343,10 @@ inline __device__ void InterpTransposeAtPoints3d(SharedData_Hip &data, const Cee if (p < NUM_POINTS) { for (CeedInt i = 0; i < Q_1D; i++) { // Note: shifting to avoid atomic adds - const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + const CeedInt ii = (i + data.t_id_x) % Q_1D; for (CeedInt j = 0; j < Q_1D; j++) { - const CeedInt jj = ((j + p) % Q_1D); + const CeedInt jj = (j + data.t_id_y) % Q_1D; atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); } @@ -430,10 +430,10 @@ inline __device__ void GradTransposeAtPoints3d(SharedData_Hip &data, const CeedI if (p < NUM_POINTS) { for (CeedInt i = 0; i < Q_1D; i++) { // Note: shifting to avoid atomic adds - const CeedInt ii = (i + (p / Q_1D)) % Q_1D; + const CeedInt ii = (i + data.t_id_x) % Q_1D; for (CeedInt j = 0; j < Q_1D; j++) { - const CeedInt jj = ((j + p) % Q_1D); + const CeedInt jj = (j + data.t_id_y) % Q_1D; atomicAdd(&data.slice[jj + ii * Q_1D], chebyshev_x[jj] * buffer[ii]); }