-
Notifications
You must be signed in to change notification settings - Fork 49
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix AtPoints transpose shift #1723
base: main
Are you sure you want to change the base?
Conversation
47bab60
to
e244d91
Compare
Note: Rust failure is an unrelated issue in their nightly build that they are working on |
@@ -74,7 +74,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Cuda &data, const Ce | |||
// Contract x direction | |||
if (p < NUM_POINTS) { | |||
for (CeedInt i = 0; i < Q_1D; i++) { | |||
atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); | |||
atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + data.t_id_x) % Q_1D] * r_U[comp]); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should the first i + p
in this line also be changed?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oops, yeah. Goes to show that most of the time the two will be the same
@@ -120,7 +120,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Cuda &data, const Ceed | |||
// Contract x direction | |||
if (p < NUM_POINTS) { | |||
for (CeedInt i = 0; i < Q_1D; i++) { | |||
atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
same here
@@ -74,7 +74,7 @@ inline __device__ void InterpTransposeAtPoints1d(SharedData_Hip &data, const Cee | |||
// Contract x direction | |||
if (p < NUM_POINTS) { | |||
for (CeedInt i = 0; i < Q_1D; i++) { | |||
atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
also here
@@ -120,7 +120,7 @@ inline __device__ void GradTransposeAtPoints1d(SharedData_Hip &data, const CeedI | |||
// Contract x direction | |||
if (p < NUM_POINTS) { | |||
for (CeedInt i = 0; i < Q_1D; i++) { | |||
atomicAdd(&data.slice[comp * Q_1D + (i + p) % Q_1D], chebyshev_x[(i + p) % Q_1D] * r_U[comp]); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
here too
Minor performance fix to help prevent collisions in atomic adds for AtPoints transpose basis action on GPUs
Note that previously
i = threadIdx.x + threadIdx.y * blockDim.x
was the value ofp
, so this should be largely a more clear replacement for the same logic we had before