Skip to content

Commit

Permalink
Implement fully parallel update with atomic operations (#42)
Browse files Browse the repository at this point in the history
* Implement fully parallel update with atomic operations

* Use serial update by default on one thread

* Add tests for constructor

* Implement suggestions

* Reformat

* Fix tests
  • Loading branch information
efaulhaber authored Jun 28, 2024
1 parent 819e4b7 commit 03fb4ce
Show file tree
Hide file tree
Showing 9 changed files with 226 additions and 48 deletions.
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ authors = ["Erik Faulhaber <erik.faulhaber@uni-koeln.de>"]
version = "0.3.2-dev"

[deps]
Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[compat]
Atomix = "0.1"
LinearAlgebra = "1"
Polyester = "0.7.5"
Reexport = "1"
Expand Down
2 changes: 2 additions & 0 deletions src/PointNeighbors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ module PointNeighbors

using Reexport: @reexport

using Atomix: Atomix
using LinearAlgebra: dot
using Polyester: @batch
@reexport using StaticArrays: SVector
Expand All @@ -17,6 +18,7 @@ include("nhs_precomputed.jl")
export foreach_point_neighbor, foreach_neighbor
export TrivialNeighborhoodSearch, GridNeighborhoodSearch, PrecomputedNeighborhoodSearch
export DictionaryCellList, FullGridCellList
export ParallelUpdate, SemiParallelUpdate, SerialUpdate
export initialize!, update!, initialize_grid!, update_grid!
export PeriodicBox, copy_neighborhood_search

Expand Down
2 changes: 2 additions & 0 deletions src/cell_lists/dictionary.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ struct DictionaryCellList{NDIMS} <: AbstractCellList
end
end

supported_update_strategies(::DictionaryCellList) = (SemiParallelUpdate, SerialUpdate)

function Base.empty!(cell_list::DictionaryCellList)
Base.empty!(cell_list.hashtable)

Expand Down
17 changes: 17 additions & 0 deletions src/cell_lists/full_grid.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ struct FullGridCellList{C, LI, MC} <: AbstractCellList
end
end

function supported_update_strategies(::FullGridCellList{<:DynamicVectorOfVectors})
return (ParallelUpdate, SemiParallelUpdate, SerialUpdate)
end

supported_update_strategies(::FullGridCellList) = (SemiParallelUpdate, SerialUpdate)

function FullGridCellList(; min_corner, max_corner, search_radius = 0.0,
periodicity = false, backend = DynamicVectorOfVectors{Int32},
max_points_per_cell = 100)
Expand Down Expand Up @@ -113,6 +119,17 @@ function push_cell!(cell_list::FullGridCellList{Nothing}, cell, particle)
throw(UndefRefError("`search_radius` is not defined for this cell list"))
end

@inline function push_cell_atomic!(cell_list::FullGridCellList, cell, particle)
(; cells) = cell_list

# `push!(cell_list[cell], particle)`, but for all backends.
# The atomic version of `pushat!` uses atomics to avoid race conditions when `pushat!`
# is used in a parallel loop.
pushat_atomic!(cells, cell_index(cell_list, cell), particle)

return cell_list
end

function deleteat_cell!(cell_list::FullGridCellList, cell, i)
(; cells) = cell_list

Expand Down
181 changes: 147 additions & 34 deletions src/nhs_grid.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
GridNeighborhoodSearch{NDIMS}(; search_radius = 0.0, n_points = 0,
periodic_box = nothing,
cell_list = DictionaryCellList{NDIMS}(),
threaded_update = true)
update_strategy = nothing)
Simple grid-based neighborhood search with uniform search radius.
The domain is divided into a regular grid.
Expand Down Expand Up @@ -36,10 +36,13 @@ since not sorting makes our implementation a lot faster (although less paralleli
[`PeriodicBox`](@ref).
- `cell_list`: The cell list that maps a cell index to a list of points inside
the cell. By default, a [`DictionaryCellList`](@ref) is used.
- `threaded_update = true`: Can be used to deactivate thread parallelization in the
neighborhood search update. This can be one of the largest
sources of variations between simulations with different
thread numbers due to neighbor ordering changes.
- `update_strategy = nothing`: Strategy to parallelize `update!`. Available options are:
- `nothing`: Automatically choose the best available option.
- [`ParallelUpdate()`](@ref): This is not available for all cell list implementations,
but is the default when available.
- [`SemiParallelUpdate()`](@ref): This is available for all cell list implementations
and is the default when [`ParallelUpdate`](@ref) is not available.
- [`SerialUpdate()`](@ref)
## References
- M. Chalela, E. Sillero, L. Pereyra, M.A. Garcia, J.B. Cabral, M. Lares, M. Merchán.
Expand All @@ -51,25 +54,33 @@ since not sorting makes our implementation a lot faster (although less paralleli
In: Computer Graphics Forum 30.1 (2011), pages 99–112.
[doi: 10.1111/J.1467-8659.2010.01832.X](https://doi.org/10.1111/J.1467-8659.2010.01832.X)
"""
struct GridNeighborhoodSearch{NDIMS, ELTYPE, CL, PB, UB} <: AbstractNeighborhoodSearch
struct GridNeighborhoodSearch{NDIMS, US, CL, ELTYPE, PB, UB} <: AbstractNeighborhoodSearch
cell_list :: CL
search_radius :: ELTYPE
periodic_box :: PB
n_cells :: NTuple{NDIMS, Int} # Required to calculate periodic cell index
cell_size :: NTuple{NDIMS, ELTYPE} # Required to calculate cell index
update_buffer :: UB # Multithreaded buffer for `update!`
threaded_update :: Bool
update_buffer :: UB # Multithreaded buffer for `update!`
update_strategy :: US

function GridNeighborhoodSearch{NDIMS}(; search_radius = 0.0, n_points = 0,
periodic_box = nothing,
cell_list = DictionaryCellList{NDIMS}(),
threaded_update = true) where {NDIMS}
ELTYPE = typeof(search_radius)
update_strategy = nothing) where {NDIMS}
if isnothing(update_strategy) && Threads.nthreads == 1
# Use serial update on one thread to avoid a second loop over all particles
# when `ParallelUpdate` is picked.
update_strategy = SerialUpdate()
elseif isnothing(update_strategy)
# Automatically choose best available update option for this cell list
update_strategy = first(supported_update_strategies(cell_list))()
elseif !(typeof(update_strategy) in supported_update_strategies(cell_list))
throw(ArgumentError("$update_strategy is not a valid update strategy for " *
"this cell list. Available options are " *
"$(supported_update_strategies(cell_list))"))
end

# Create update buffer and initialize it with empty vectors
update_buffer = DynamicVectorOfVectors{index_type(cell_list)}(max_outer_length = Threads.nthreads(),
max_inner_length = n_points)
push!(update_buffer, (NTuple{NDIMS, Int}[] for _ in 1:Threads.nthreads())...)
update_buffer = create_update_buffer(update_strategy, cell_list, n_points)

if search_radius < eps() || isnothing(periodic_box)
# No periodicity
Expand All @@ -91,12 +102,65 @@ struct GridNeighborhoodSearch{NDIMS, ELTYPE, CL, PB, UB} <: AbstractNeighborhood
end
end

new{NDIMS, ELTYPE, typeof(cell_list), typeof(periodic_box),
new{NDIMS, typeof(update_strategy), typeof(cell_list),
typeof(search_radius), typeof(periodic_box),
typeof(update_buffer)}(cell_list, search_radius, periodic_box, n_cells,
cell_size, update_buffer, threaded_update)
cell_size, update_buffer, update_strategy)
end
end

"""
ParallelUpdate()
Fully parallel update by using atomic operations to avoid race conditions when adding points
into the same cell.
This is not available for all cell list implementations, but is the default when available.
See [`GridNeighborhoodSearch`](@ref) for usage information.
"""
struct ParallelUpdate end

"""
SemiParallelUpdate()
Loop over all cells in parallel to mark cells with points that now belong to a different
cell. Then, move points of affected cells serially to avoid race conditions.
This is available for all cell list implementations and is the default when
[`ParallelUpdate`](@ref) is not available.
See [`GridNeighborhoodSearch`](@ref) for usage information.
"""
struct SemiParallelUpdate end

"""
SerialUpdate()
Deactivate parallelization in the neighborhood search update.
Parallel neighborhood search update can be one of the largest sources of error variations
between simulations with different thread numbers due to neighbor ordering changes.
See [`GridNeighborhoodSearch`](@ref) for usage information.
"""
struct SerialUpdate end

# No update buffer needed for fully parallel update
@inline create_update_buffer(::ParallelUpdate, _, _) = nothing

@inline function create_update_buffer(::SemiParallelUpdate, cell_list, n_points)
# Create update buffer and initialize it with empty vectors
update_buffer = DynamicVectorOfVectors{index_type(cell_list)}(max_outer_length = Threads.nthreads(),
max_inner_length = n_points)
push!(update_buffer, (index_type(cell_list)[] for _ in 1:Threads.nthreads())...)
end

@inline function create_update_buffer(::SerialUpdate, cell_list, n_points)
# Create update buffer and initialize it with empty vectors.
# Only one thread is used here, so we only need one element in the buffer.
update_buffer = DynamicVectorOfVectors{index_type(cell_list)}(max_outer_length = 1,
max_inner_length = n_points)
push!(update_buffer, index_type(cell_list)[])
end

@inline Base.ndims(::GridNeighborhoodSearch{NDIMS}) where {NDIMS} = NDIMS

function initialize!(neighborhood_search::GridNeighborhoodSearch,
Expand Down Expand Up @@ -137,19 +201,25 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{NDIMS},
update_grid!(neighborhood_search, i -> extract_svector(y, Val(NDIMS), i))
end

# Modify the existing cell lists by moving points into their new cells
function update_grid!(neighborhood_search::GridNeighborhoodSearch, coords_fun)
(; cell_list, update_buffer, threaded_update) = neighborhood_search
# Serial and semi-parallel update
function update_grid!(neighborhood_search::Union{GridNeighborhoodSearch{<:Any,
SerialUpdate},
GridNeighborhoodSearch{<:Any,
SemiParallelUpdate}},
coords_fun::Function)
(; cell_list, update_buffer) = neighborhood_search

# Empty each thread's list
@threaded for i in eachindex(update_buffer)
emptyat!(update_buffer, i)
end

# Find all cells containing points that now belong to another cell
mark_changed_cell!(neighborhood_search, cell_list, coords_fun, Val(threaded_update))
# Find all cells containing points that now belong to another cell.
# This loop is threaded for `update_strategy == SemiParallelUpdate`.
mark_changed_cells!(neighborhood_search, coords_fun)

# Iterate over all marked cells and move the points into their new cells
# Iterate over all marked cells and move the points into their new cells.
# This is always a serial loop (hence "semi-parallel").
for j in eachindex(update_buffer)
for cell_index in update_buffer[j]
points = cell_list[cell_index]
Expand Down Expand Up @@ -182,26 +252,24 @@ end
# The type annotation is to make Julia specialize on the type of the function.
# Otherwise, unspecialized code will cause a lot of allocations and heavily impact performance.
# See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing
@inline function mark_changed_cell!(neighborhood_search, cell_list, coords_fun::T,
threaded_update::Val{true}) where {T}
@inline function mark_changed_cells!(neighborhood_search::GridNeighborhoodSearch{<:Any,
SemiParallelUpdate},
coords_fun::T) where {T}
# `each_cell_index(cell_list)` might return a `KeySet`, which has to be `collect`ed
# first to be able to be used in a threaded loop. This function takes care of that.
@threaded for cell_index in each_cell_index_threadable(cell_list)
@threaded for cell_index in each_cell_index_threadable(neighborhood_search.cell_list)
mark_changed_cell!(neighborhood_search, cell_index, coords_fun)
end
end

@inline function mark_changed_cell!(neighborhood_search, cell_list, coords_fun::T,
threaded_update::Val{false}) where {T}
for cell_index in each_cell_index(cell_list)
@inline function mark_changed_cells!(neighborhood_search::GridNeighborhoodSearch{<:Any,
SerialUpdate},
coords_fun::T) where {T}
for cell_index in each_cell_index(neighborhood_search.cell_list)
mark_changed_cell!(neighborhood_search, cell_index, coords_fun)
end
end

# Use this function barrier and unpack inside to avoid passing closures to Polyester.jl
# with `@batch` (`@threaded`).
# Otherwise, `@threaded` does not work here with Julia ARM on macOS.
# See https://github.com/JuliaSIMD/Polyester.jl/issues/88.
@inline function mark_changed_cell!(neighborhood_search, cell_index, coords_fun)
(; cell_list, update_buffer) = neighborhood_search

Expand All @@ -219,6 +287,50 @@ end
end
end

# Fully parallel update with atomic push
function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any, ParallelUpdate},
coords_fun::Function)
(; cell_list) = neighborhood_search

# Note that we need two separate loops for adding and removing points.
# `push_cell_atomic!` only guarantees thread-safety when different threads push
# simultaneously, but it does not work when `deleteat_cell!` is called at the same time.

# Add points to new cells
@threaded for cell_index in each_cell_index_threadable(cell_list)
for point in cell_list[cell_index]
cell_coords_ = cell_coords(coords_fun(point), neighborhood_search)

if !is_correct_cell(cell_list, cell_coords_, cell_index)
new_cell_coords = cell_coords(coords_fun(point), neighborhood_search)

# Add point to new cell or create cell if it does not exist
push_cell_atomic!(cell_list, new_cell_coords, point)
end
end
end

# Remove points from old cells
@threaded for cell_index in each_cell_index_threadable(cell_list)
points = cell_list[cell_index]

# WARNING!!!
# The `DynamicVectorOfVectors` requires this loop to be **in descending order**.
# `deleteat_cell!(..., i)` will change the order of points that come after `i`.
for i in reverse(eachindex(points))
point = points[i]
cell_coords_ = cell_coords(coords_fun(point), neighborhood_search)

if !is_correct_cell(cell_list, cell_coords_, cell_index)
# Remove moved point from this cell
deleteat_cell!(cell_list, cell_index, i)
end
end
end

return neighborhood_search
end

# 1D
@inline function eachneighbor(coords, neighborhood_search::GridNeighborhoodSearch{1})
cell = cell_coords(coords, neighborhood_search)
Expand Down Expand Up @@ -295,10 +407,11 @@ end

function copy_neighborhood_search(nhs::GridNeighborhoodSearch, search_radius, n_points;
eachpoint = 1:n_points)
(; periodic_box, threaded_update) = nhs
(; periodic_box) = nhs

cell_list = copy_cell_list(nhs.cell_list, search_radius, periodic_box)

return GridNeighborhoodSearch{ndims(nhs)}(; search_radius, n_points, periodic_box,
cell_list, threaded_update)
cell_list,
update_strategy = nhs.update_strategy)
end
18 changes: 8 additions & 10 deletions src/nhs_precomputed.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
@doc raw"""
PrecomputedNeighborhoodSearch{NDIMS}(; search_radius = 0.0, n_points = 0,
periodic_box = nothing, threaded_update = true)
periodic_box = nothing, update_strategy = nothing)
Neighborhood search with precomputed neighbor lists. A list of all neighbors is computed
for each point during initialization and update.
Expand All @@ -20,10 +20,9 @@ initialization and update.
with [`copy_neighborhood_search`](@ref).
- `periodic_box = nothing`: In order to use a (rectangular) periodic domain, pass a
[`PeriodicBox`](@ref).
- `threaded_update = true`: Can be used to deactivate thread parallelization in the
neighborhood search update. This can be one of the largest
sources of variations between simulations with different
thread numbers due to neighbor ordering changes.
- `update_strategy`: Strategy to parallelize `update!` of the internally used
`GridNeighborhoodSearch`. See [`GridNeighborhoodSearch`](@ref)
for available options.
"""
struct PrecomputedNeighborhoodSearch{NDIMS, NHS, NL, PB}
neighborhood_search :: NHS
Expand All @@ -32,10 +31,9 @@ struct PrecomputedNeighborhoodSearch{NDIMS, NHS, NL, PB}

function PrecomputedNeighborhoodSearch{NDIMS}(; search_radius = 0.0, n_points = 0,
periodic_box = nothing,
threaded_update = true) where {NDIMS}
update_strategy = nothing) where {NDIMS}
nhs = GridNeighborhoodSearch{NDIMS}(; search_radius, n_points,
periodic_box = periodic_box,
threaded_update = threaded_update)
periodic_box, update_strategy)

neighbor_lists = Vector{Vector{Int}}()

Expand Down Expand Up @@ -116,8 +114,8 @@ end

function copy_neighborhood_search(nhs::PrecomputedNeighborhoodSearch,
search_radius, n_points; eachpoint = 1:n_points)
threaded_update = nhs.neighborhood_search.threaded_update
update_strategy_ = nhs.neighborhood_search.update_strategy
return PrecomputedNeighborhoodSearch{ndims(nhs)}(; search_radius, n_points,
periodic_box = nhs.periodic_box,
threaded_update)
update_strategy = update_strategy_)
end
Loading

0 comments on commit 03fb4ce

Please sign in to comment.