From 3592588fe9045c03f1601b96a3dcb9e60fb1cd03 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 28 Nov 2025 16:29:28 +0100 Subject: [PATCH 1/4] Implement option to transpose neighbor lists of `PrecomputedNeighborhoodSearch` --- src/PointNeighbors.jl | 1 + src/cell_lists/cell_lists.jl | 21 +++++++++++--------- src/nhs_grid.jl | 2 ++ src/nhs_precomputed.jl | 25 ++++++++++++++++++++---- src/nhs_trivial.jl | 2 ++ src/vector_of_vectors.jl | 25 ++++++++++++++++++++---- test/nhs_precomputed.jl | 38 ++++++++++++++++++++++++++++++++++++ test/unittest.jl | 1 + 8 files changed, 98 insertions(+), 17 deletions(-) create mode 100644 test/nhs_precomputed.jl diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl index 6e7ad93e..137a7f95 100644 --- a/src/PointNeighbors.jl +++ b/src/PointNeighbors.jl @@ -23,6 +23,7 @@ include("gpu.jl") export foreach_point_neighbor, foreach_neighbor export TrivialNeighborhoodSearch, GridNeighborhoodSearch, PrecomputedNeighborhoodSearch export DictionaryCellList, FullGridCellList, SpatialHashingCellList +export DynamicVectorOfVectors export ParallelUpdate, SemiParallelUpdate, SerialIncrementalUpdate, SerialUpdate, ParallelIncrementalUpdate export requires_update diff --git a/src/cell_lists/cell_lists.jl b/src/cell_lists/cell_lists.jl index 7512329e..6031ed1a 100644 --- a/src/cell_lists/cell_lists.jl +++ b/src/cell_lists/cell_lists.jl @@ -11,16 +11,18 @@ abstract type AbstractCellList end end function construct_backend(::Type{Vector{Vector{T}}}, - max_outer_length, - max_inner_length) where {T} + max_outer_length, max_inner_length; + transpose_backend = false) where {T} + transpose_backend && error("transpose backend not supported for Vector{Vector{T}}") + return [T[] for _ in 1:max_outer_length] end function construct_backend(::Type{DynamicVectorOfVectors{T}}, - max_outer_length, - max_inner_length) where {T} - cells = DynamicVectorOfVectors{T}(max_outer_length = max_outer_length, - max_inner_length = max_inner_length) + max_outer_length, max_inner_length; + transpose_backend = false) where {T} + cells = DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length, + transpose_backend) resize!(cells, max_outer_length) return cells @@ -30,10 +32,11 @@ end # `DynamicVectorOfVectors{T}`, but a type `DynamicVectorOfVectors{T1, T2, T3, T4}`. # While `A{T} <: A{T1, T2}`, this doesn't hold for the types. # `Type{A{T}} <: Type{A{T1, T2}}` is NOT true. -function construct_backend(::Type{DynamicVectorOfVectors{T1, T2, T3, T4}}, max_outer_length, - max_inner_length) where {T1, T2, T3, T4} +function construct_backend(::Type{DynamicVectorOfVectors{T1, T2, T3, T4}}, + max_outer_length, max_inner_length; + transpose_backend = false) where {T1, T2, T3, T4} return construct_backend(DynamicVectorOfVectors{T1}, max_outer_length, - max_inner_length) + max_inner_length; transpose_backend) end function max_points_per_cell(cells::DynamicVectorOfVectors) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 7b0d5929..3f36a62e 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -30,6 +30,8 @@ since not sorting makes our implementation a lot faster (although less paralleli # Keywords - `search_radius = 0.0`: The fixed search radius. The default of `0.0` is useful together with [`copy_neighborhood_search`](@ref). + Note that the type of `search_radius` determines the type used + for the distance computations. - `n_points = 0`: Total number of points. The default of `0` is useful together with [`copy_neighborhood_search`](@ref). - `periodic_box = nothing`: In order to use a (rectangular) periodic domain, pass a diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index 7cb3b9e3..eae9aa68 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -3,6 +3,7 @@ periodic_box = nothing, update_strategy = nothing, update_neighborhood_search = GridNeighborhoodSearch{NDIMS}(), backend = DynamicVectorOfVectors{Int32}, + transpose_backend = false, max_neighbors = max_neighbors(NDIMS)) Neighborhood search with precomputed neighbor lists. A list of all neighbors is computed @@ -22,6 +23,8 @@ to strip the internal neighborhood search, which is not needed anymore. # Keywords - `search_radius = 0.0`: The fixed search radius. The default of `0.0` is useful together with [`copy_neighborhood_search`](@ref). + Note that the type of `search_radius` determines the type used + for the distance computations. - `n_points = 0`: Total number of points. The default of `0` is useful together with [`copy_neighborhood_search`](@ref). - `periodic_box = nothing`: In order to use a (rectangular) periodic domain, pass a @@ -41,9 +44,20 @@ to strip the internal neighborhood search, which is not needed anymore. - `Vector{Vector{Int32}}`: Scattered memory, but very memory-efficient. - `DynamicVectorOfVectors{Int32}`: Contiguous memory, optimizing cache-hits and GPU-compatible. -- `max_neighbors`: Maximum number of neighbors per particle. This will be used to - allocate the `DynamicVectorOfVectors`. It is not used with - other backends. The default is 64 in 2D and 324 in 3D. +- `max_neighbors`: Maximum number of neighbors per particle. This will be used to + allocate the `DynamicVectorOfVectors`. It is not used with + other backends. The default is 64 in 2D and 324 in 3D. +- `transpose_backend = false`: Whether to transpose the backend data structure storing the + neighbor lists. This is only supported for the + `DynamicVectorOfVectors` backend. + By default, the neighbors of each point are stored contiguously + in memory. This layout optimizes cache hits when looping + over all neighbors of a point on CPUs. + On GPUs, however, storing the first neighbors of all points + contiguously in memory (`transpose_backend = true`) + allows for coalesced memory accesses when all threads process + the n-th neighbor of their respective point in parallel. + This can lead to a speedup of ~3x in many cases. """ struct PrecomputedNeighborhoodSearch{NDIMS, NL, ELTYPE, PB, NHS} <: AbstractNeighborhoodSearch @@ -73,8 +87,9 @@ function PrecomputedNeighborhoodSearch{NDIMS}(; search_radius = 0.0, n_points = periodic_box, update_strategy), backend = DynamicVectorOfVectors{Int32}, + transpose_backend = false, max_neighbors = max_neighbors(NDIMS)) where {NDIMS} - neighbor_lists = construct_backend(backend, n_points, max_neighbors) + neighbor_lists = construct_backend(backend, n_points, max_neighbors; transpose_backend) PrecomputedNeighborhoodSearch{NDIMS}(neighbor_lists, search_radius, periodic_box, update_neighborhood_search) @@ -211,10 +226,12 @@ function copy_neighborhood_search(nhs::PrecomputedNeighborhoodSearch, # For `Vector{Vector}` backend use `max_neighbors(NDIMS)` as fallback. # This should never be used because this backend doesn't require a `max_neighbors`. max_neighbors_ = max_inner_length(nhs.neighbor_lists, max_neighbors(ndims(nhs))) + transpose_backend = transposed_backend(nhs.neighbor_lists) return PrecomputedNeighborhoodSearch{ndims(nhs)}(; search_radius, n_points, periodic_box = nhs.periodic_box, update_neighborhood_search, backend = typeof(nhs.neighbor_lists), + transpose_backend, max_neighbors = max_neighbors_) end diff --git a/src/nhs_trivial.jl b/src/nhs_trivial.jl index 814ac03a..c97f0809 100644 --- a/src/nhs_trivial.jl +++ b/src/nhs_trivial.jl @@ -10,6 +10,8 @@ Trivial neighborhood search that simply loops over all points. # Keywords - `search_radius = 0.0`: The fixed search radius. The default of `0.0` is useful together with [`copy_neighborhood_search`](@ref). + Note that the type of `search_radius` determines the type used + for the distance computations. - `eachpoint = 1:0`: Iterator for all point indices. Usually just `1:n_points`. The default of `1:0` is useful together with [`copy_neighborhood_search`](@ref). diff --git a/src/vector_of_vectors.jl b/src/vector_of_vectors.jl index 9e331c50..b2475ba6 100644 --- a/src/vector_of_vectors.jl +++ b/src/vector_of_vectors.jl @@ -13,8 +13,17 @@ struct DynamicVectorOfVectors{T, ARRAY2D, ARRAY1D, L} <: AbstractVector{Array{T, end end -function DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length) where {T} - backend = Array{T, 2}(undef, max_inner_length, max_outer_length) +function DynamicVectorOfVectors{T}(; max_outer_length, max_inner_length, + transpose_backend = false) where {T} + if transpose_backend + # Create a row-major array + backend_ = Array{T, 2}(undef, max_outer_length, max_inner_length) + # Wrap in a `PermutedDimsArray` to obtain the usual column-major access, + # even though the underlying memory layout is row-major. + backend = PermutedDimsArray(backend_, (2, 1)) + else + backend = Array{T, 2}(undef, max_inner_length, max_outer_length) + end length_ = Ref(zero(Int32)) lengths = zeros(Int32, max_outer_length) @@ -162,11 +171,19 @@ end return vov end -function max_inner_length(cells::DynamicVectorOfVectors, fallback) +@inline function max_inner_length(cells::DynamicVectorOfVectors, fallback) return size(cells.backend, 1) end # Fallback when backend is a `Vector{Vector{T}}`. Only used for copying the cell list. -function max_inner_length(::Any, fallback) +@inline function max_inner_length(::Any, fallback) return fallback end + +@inline function transposed_backend(::DynamicVectorOfVectors{<:Any, <:PermutedDimsArray}) + return true +end + +@inline function transposed_backend(::Any) + return false +end diff --git a/test/nhs_precomputed.jl b/test/nhs_precomputed.jl new file mode 100644 index 00000000..fe084352 --- /dev/null +++ b/test/nhs_precomputed.jl @@ -0,0 +1,38 @@ +@testset verbose=true "PrecomputedNeighborhoodSearch" begin + # Test regular vs transposed backend + nhs = PrecomputedNeighborhoodSearch{2}(transpose_backend = false, n_points = 2) + + # Add test neighbors + neighbor_lists = nhs.neighbor_lists + @test PointNeighbors.transposed_backend(neighbor_lists) == false + neighbor_lists.backend .= 0 + PointNeighbors.pushat!(neighbor_lists, 1, 101) + PointNeighbors.pushat!(neighbor_lists, 1, 102) + PointNeighbors.pushat!(neighbor_lists, 2, 201) + PointNeighbors.pushat!(neighbor_lists, 2, 202) + PointNeighbors.pushat!(neighbor_lists, 2, 203) + + # Check that neighbors are next to each other in memory + pointer_ = pointer(neighbor_lists.backend) + @test unsafe_load(pointer_, 1) == 101 + @test unsafe_load(pointer_, 2) == 102 + + # Transposed backend + nhs = PrecomputedNeighborhoodSearch{2}(transpose_backend = true, n_points = 2) + + # Add test neighbors + neighbor_lists = nhs.neighbor_lists + @test PointNeighbors.transposed_backend(neighbor_lists) == true + neighbor_lists.backend .= 0 + PointNeighbors.pushat!(neighbor_lists, 1, 101) + PointNeighbors.pushat!(neighbor_lists, 1, 102) + PointNeighbors.pushat!(neighbor_lists, 2, 201) + PointNeighbors.pushat!(neighbor_lists, 2, 202) + PointNeighbors.pushat!(neighbor_lists, 2, 203) + + # Check that first neighbors are next to each other in memory + @test neighbor_lists.backend isa PermutedDimsArray + pointer_ = pointer(neighbor_lists.backend.parent) + @test unsafe_load(pointer_, 1) == 101 + @test unsafe_load(pointer_, 2) == 201 +end diff --git a/test/unittest.jl b/test/unittest.jl index bdcedbd4..f0981614 100644 --- a/test/unittest.jl +++ b/test/unittest.jl @@ -4,6 +4,7 @@ include("vector_of_vectors.jl") include("nhs_trivial.jl") include("nhs_grid.jl") + include("nhs_precomputed.jl") include("neighborhood_search.jl") include("cell_lists/full_grid.jl") end; From 2d527094d1bfcd6de1c7de746865fae2dfbdd5d1 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 28 Nov 2025 16:33:21 +0100 Subject: [PATCH 2/4] Improve docs --- src/nhs_precomputed.jl | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index eae9aa68..7de683a8 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -53,10 +53,11 @@ to strip the internal neighborhood search, which is not needed anymore. By default, the neighbors of each point are stored contiguously in memory. This layout optimizes cache hits when looping over all neighbors of a point on CPUs. - On GPUs, however, storing the first neighbors of all points - contiguously in memory (`transpose_backend = true`) - allows for coalesced memory accesses when all threads process - the n-th neighbor of their respective point in parallel. + On GPUs, however, storing all first neighbors of all points + contiguously in memory, then all second neighbors, etc., + (`transpose_backend = true`) allows for coalesced + memory accesses when all threads process the n-th neighbor + of their respective point in parallel. This can lead to a speedup of ~3x in many cases. """ struct PrecomputedNeighborhoodSearch{NDIMS, NL, ELTYPE, PB, NHS} <: From 6c714db1ab0b0e507dfd30168a4aae57f0ed4434 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 28 Nov 2025 17:44:51 +0100 Subject: [PATCH 3/4] Improve error --- src/cell_lists/cell_lists.jl | 3 ++- src/nhs_precomputed.jl | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cell_lists/cell_lists.jl b/src/cell_lists/cell_lists.jl index 6031ed1a..4c3796e0 100644 --- a/src/cell_lists/cell_lists.jl +++ b/src/cell_lists/cell_lists.jl @@ -13,7 +13,8 @@ end function construct_backend(::Type{Vector{Vector{T}}}, max_outer_length, max_inner_length; transpose_backend = false) where {T} - transpose_backend && error("transpose backend not supported for Vector{Vector{T}}") + transpose_backend && error("transpose backend is only supported for " * + "DynamicVectorOfVectors backend") return [T[] for _ in 1:max_outer_length] end diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index 7de683a8..abec387c 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -46,7 +46,7 @@ to strip the internal neighborhood search, which is not needed anymore. and GPU-compatible. - `max_neighbors`: Maximum number of neighbors per particle. This will be used to allocate the `DynamicVectorOfVectors`. It is not used with - other backends. The default is 64 in 2D and 324 in 3D. + other backends. The default is 64 in 2D and 320 in 3D. - `transpose_backend = false`: Whether to transpose the backend data structure storing the neighbor lists. This is only supported for the `DynamicVectorOfVectors` backend. From 6fcf9a83b2f68e8a9e862bb6f279e02e54c76bb1 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 1 Dec 2025 11:36:22 +0100 Subject: [PATCH 4/4] Reformat --- src/cell_lists/cell_lists.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/cell_lists/cell_lists.jl b/src/cell_lists/cell_lists.jl index 4c3796e0..346af284 100644 --- a/src/cell_lists/cell_lists.jl +++ b/src/cell_lists/cell_lists.jl @@ -13,8 +13,9 @@ end function construct_backend(::Type{Vector{Vector{T}}}, max_outer_length, max_inner_length; transpose_backend = false) where {T} - transpose_backend && error("transpose backend is only supported for " * - "DynamicVectorOfVectors backend") + if transpose_backend + error("transpose backend is only supported for DynamicVectorOfVectors backend") + end return [T[] for _ in 1:max_outer_length] end