From 777c4e20eae22de99cdb3e2217d667852c65ab5c Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 19:26:14 +0200 Subject: [PATCH 01/19] Add unsafe version of `foreach_neighbor` and `foreach_point_neighbor` --- src/PointNeighbors.jl | 3 +- src/neighborhood_search.jl | 121 ++++++++++++++++++++++++++++++++++--- src/nhs_grid.jl | 15 +++-- src/nhs_precomputed.jl | 20 +++--- 4 files changed, 134 insertions(+), 25 deletions(-) diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl index 702c19e4..58b38687 100644 --- a/src/PointNeighbors.jl +++ b/src/PointNeighbors.jl @@ -20,7 +20,8 @@ include("nhs_grid.jl") include("nhs_precomputed.jl") include("gpu.jl") -export foreach_point_neighbor, foreach_neighbor +export foreach_point_neighbor, foreach_point_neighbor_unsafe, + foreach_neighbor, foreach_neighbor_unsafe export TrivialNeighborhoodSearch, GridNeighborhoodSearch, PrecomputedNeighborhoodSearch export DictionaryCellList, FullGridCellList, SpatialHashingCellList export DynamicVectorOfVectors diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl index 5e99b9cd..a7f23916 100644 --- a/src/neighborhood_search.jl +++ b/src/neighborhood_search.jl @@ -178,7 +178,7 @@ Note that `system_coords` and `neighbor_coords` can be identical. automatically based on the type of `system_coords`. See [`@threaded`](@ref) for a list of available backends. -See also [`initialize!`](@ref), [`update!`](@ref). +See also [`foreach_point_neighbor_unsafe`](@ref), [`initialize!`](@ref), [`update!`](@ref). """ function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborhood_search; parallelization_backend::ParallelizationBackend = default_backend(system_coords), @@ -200,6 +200,53 @@ function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborho return nothing end +""" + foreach_point_neighbor_unsafe(f, system_coords, neighbor_coords, neighborhood_search; + parallelization_backend = default_backend(system_coords), + points = axes(system_coords, 2)) + +Like [`foreach_point_neighbor`](@ref), but skips **all** bounds checks inside the +threaded loop or GPU kernel. +See [`foreach_neighbor_unsafe`](@ref) for more details on which bounds checks are skipped. + +!!! warning + The `neighborhood_search` must have been initialized or updated with `system_coords` + as first coordinate array and `neighbor_coords` as second coordinate array. + This can be skipped for certain implementations. See [`requires_update`](@ref). + +!!! warning + Use this only when `neighborhood_search` is known to be initialized correctly for + `system_coords` and `neighbor_system_coords`. +""" +function foreach_point_neighbor_unsafe(f::T, system_coords, neighbor_coords, neighborhood_search; + parallelization_backend::ParallelizationBackend = default_backend(system_coords), + points = axes(system_coords, 2)) where {T} + # Explicit bounds check before the hot loop (or GPU kernel) + @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points) + + @threaded parallelization_backend for point in points + # Now we can safely assume that `point` is inbounds + @inbounds foreach_neighbor(f, system_coords, neighbor_coords, + neighborhood_search, point) + end + + return nothing +end + +""" + foreach_neighbor(f, system_coords, neighbor_system_coords, + neighborhood_search::AbstractNeighborhoodSearch, point; + search_radius = search_radius(neighborhood_search)) + +Loop over all neighbors of `point` and execute `f(i, j, pos_diff, d)` for every +neighbor within `search_radius`, where `i` is `point`, `j` is the neighbor index, +`pos_diff` is the vector from neighbor to point, and `d` is the distance. + +This method performs bounds checks, even when called with `@inbounds`. +`@inbounds` only skips the bounds check for loading the coordinates of `point` +from `system_coords`. +See [`foreach_neighbor_unsafe`](@ref) for a version that skips all bounds checks. +""" @propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_system_coords, neighborhood_search::AbstractNeighborhoodSearch, point; @@ -207,7 +254,7 @@ end # Due to https://github.com/JuliaLang/julia/issues/30411, we cannot just remove # a `@boundscheck` by calling this function with `@inbounds` because it has a kwarg. # We have to use `@propagate_inbounds`, which will also remove boundschecks - # in the neighbor loop, which is not safe (see comment below). + # in the neighbor loop, which is not safe (that's what `foreach_neighbor_unsafe` is for). # To avoid this, we have to use a function barrier to disable the `@inbounds` again. point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point) @@ -215,26 +262,80 @@ end point, point_coords, search_radius) end -# This is the generic function that is called for `TrivialNeighborhoodSearch`. -# For `GridNeighborhoodSearch`, a specialized function is used for slightly better -# performance. `PrecomputedNeighborhoodSearch` can skip the distance check altogether. +# This is a function barrier to prevent the `@inbounds` in `foreach_neighbor` +# from propagating into the neighbor loop, which is not safe. @inline function foreach_neighbor(f, neighbor_system_coords, neighborhood_search::AbstractNeighborhoodSearch, point, point_coords, search_radius) + foreach_neighbor_inner(f, neighbor_system_coords, neighborhood_search, + point, point_coords, search_radius) +end + +""" + foreach_neighbor_unsafe(f, system_coords, neighbor_system_coords, + neighborhood_search::AbstractNeighborhoodSearch, point; + search_radius = search_radius(neighborhood_search)) + +Like [`foreach_neighbor`](@ref), but skips **all** bounds checks. + +`foreach_neighbor` performs the following bounds checks that are skipped here: +- Check if `point` is in bounds of `system_coords`. This is the only bounds check + that is skipped when calling `foreach_neighbor` with `@inbounds`, and the only one that + is safe to skip when `point` is guaranteed to be in bounds of `system_coords`. +- Check if the neighbors of `point` are in bounds of `neighbor_system_coords`. + This is not safe to skip when the neighborhood search was not initialized correctly; + for example when initialized with coordinate arrays of different sizes than the ones + passed to this function or when the internal data structures have been tampered with. + In this case, the cell lists (for [`GridNeighborhoodSearch`](@ref)) or neighbor lists + (for [`PrecomputedNeighborhoodSearch`](@ref)) might contain indices that are out of + bounds for `neighbor_system_coords`. +- With `GridNeighborhoodSearch` and [`FullGridCellList`](@ref), check if the neighboring + cells are in bounds of the grid. Again, this is not safe to skip when the neighborhood + search might not have been initialized correctly. +- With `PrecomputedNeighborhoodSearch`, check if `point` is in bounds of the neighbor lists. + Again, this is not safe to skip when the neighborhood search was not initialized correctly. + +Note that all these bounds checks are safe to skip if +- `point` is guaranteed to be in bounds of `system_coords`, +- the neighborhood search was initialized correctly with `system_coords` + and `neighbor_system_coords` and has not been tampered with since then. + +!!! warning + Use this only when `point` is known to be in bounds of `system_coords` + and when `neighborhood_search` is known to be initialized correctly for + `system_coords` and `neighbor_system_coords`. +""" +@inline function foreach_neighbor_unsafe(f, system_coords, neighbor_system_coords, + neighborhood_search::AbstractNeighborhoodSearch, + point; + search_radius = search_radius(neighborhood_search)) + point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), + point) + + @inbounds foreach_neighbor_inner(f, neighbor_system_coords, neighborhood_search, + point, point_coords, search_radius) +end + +# This is the generic function that is called for `TrivialNeighborhoodSearch`. +# For `GridNeighborhoodSearch`, a specialized function is used for slightly better +# performance. `PrecomputedNeighborhoodSearch` can skip the distance check altogether. +# Note that calling this function with `@inbounds` is not safe. +# See the comments in `foreach_neighbor_unsafe`. +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_system_coords, + neighborhood_search::AbstractNeighborhoodSearch, + point, point_coords, search_radius) (; periodic_box) = neighborhood_search for neighbor in eachneighbor(point_coords, neighborhood_search) - # Making the following `@inbounds` yields a ~2% speedup on an NVIDIA H100. - # But we don't know if `neighbor` (extracted from the cell list) is in bounds. neighbor_coords = extract_svector(neighbor_system_coords, Val(ndims(neighborhood_search)), neighbor) pos_diff = convert.(eltype(neighborhood_search), point_coords - neighbor_coords) distance2 = dot(pos_diff, pos_diff) - pos_diff, - distance2 = compute_periodic_distance(pos_diff, distance2, search_radius, - periodic_box) + (pos_diff, + distance2) = compute_periodic_distance(pos_diff, distance2, search_radius, + periodic_box) if distance2 <= search_radius^2 distance = sqrt(distance2) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 21759ab2..32953c8e 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -508,14 +508,19 @@ end # Specialized version of the function in `neighborhood_search.jl`, which is faster # than looping over `eachneighbor`. -@inline function foreach_neighbor(f, neighbor_system_coords, - neighborhood_search::GridNeighborhoodSearch, - point, point_coords, search_radius) +# Note that calling this function with `@inbounds` is not safe. +# See the comments in `foreach_neighbor_unsafe`. +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_system_coords, + neighborhood_search::GridNeighborhoodSearch, + point, point_coords, search_radius) (; cell_list, periodic_box) = neighborhood_search cell = cell_coords(point_coords, neighborhood_search) for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) neighbor_cell = Tuple(neighbor_cell_) + + # Making the following `@inbounds` is not safe because we don't know if + # `neighbor_cell` is in bounds of the grid. neighbors = points_in_cell(neighbor_cell, neighborhood_search) # Boolean to indicate if this cell has a collision (only with `SpatialHashingCellList`) @@ -525,8 +530,8 @@ end for neighbor_ in eachindex(neighbors) neighbor = @inbounds neighbors[neighbor_] - # Making the following `@inbounds` yields a ~2% speedup on an NVIDIA H100. - # But we don't know if `neighbor` (extracted from the cell list) is in bounds. + # Making the following `@inbounds` is not safe because we don't know + # if `neighbor` (extracted from the cell list) is in bounds. neighbor_coords = extract_svector(neighbor_system_coords, Val(ndims(neighborhood_search)), neighbor) diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index 187ff0c5..3495b198 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -200,22 +200,24 @@ function initialize_neighbor_lists!(neighbor_lists::DynamicVectorOfVectors, end end -@inline function foreach_neighbor(f, neighbor_system_coords, - neighborhood_search::PrecomputedNeighborhoodSearch, - point, point_coords, search_radius) +# Note that calling this function with `@inbounds` is not safe. +# See the comments in `foreach_neighbor_unsafe`. +@propagate_inbounds function foreach_neighbor_unsafe(f, neighbor_system_coords, + neighborhood_search::PrecomputedNeighborhoodSearch, + point, point_coords, search_radius) (; periodic_box, neighbor_lists) = neighborhood_search - neighbors = @inbounds neighbor_lists[point] + # Making the following `@inbounds` is not safe because the neighbor list + # might not contain `point` if the NHS was not initialized correctly. + neighbors = neighbor_lists[point] for neighbor_ in eachindex(neighbors) neighbor = @inbounds neighbors[neighbor_] - # Making this `@inbounds` is not perfectly safe because + # Making this `@inbounds` is not safe because # `neighbor` (extracted from the neighbor list) is only guaranteed to be in bounds # if the neighbor lists were constructed correctly and have not been corrupted. - # However, adding this `@inbounds` yields a ~20% speedup for TLSPH on GPUs (A4500). - neighbor_coords = @inbounds extract_svector(neighbor_system_coords, - Val(ndims(neighborhood_search)), - neighbor) + neighbor_coords = extract_svector(neighbor_system_coords, + Val(ndims(neighborhood_search)), neighbor) pos_diff = convert.(eltype(neighborhood_search), point_coords - neighbor_coords) distance2 = dot(pos_diff, pos_diff) From 753958f7e9fed304fb4aef30919631da11b92e6d Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 14 Apr 2026 10:58:10 +0200 Subject: [PATCH 02/19] Fix --- src/neighborhood_search.jl | 43 +++++++++++++++++++------------------- src/nhs_grid.jl | 18 +++++++++------- src/nhs_precomputed.jl | 19 +++++++++-------- 3 files changed, 42 insertions(+), 38 deletions(-) diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl index a7f23916..cf02774e 100644 --- a/src/neighborhood_search.jl +++ b/src/neighborhood_search.jl @@ -216,25 +216,25 @@ See [`foreach_neighbor_unsafe`](@ref) for more details on which bounds checks ar !!! warning Use this only when `neighborhood_search` is known to be initialized correctly for - `system_coords` and `neighbor_system_coords`. + `system_coords` and `neighbor_coords`. """ -function foreach_point_neighbor_unsafe(f::T, system_coords, neighbor_coords, neighborhood_search; +function foreach_point_neighbor_unsafe(f::T, system_coords, neighbor_coords, + neighborhood_search; parallelization_backend::ParallelizationBackend = default_backend(system_coords), points = axes(system_coords, 2)) where {T} # Explicit bounds check before the hot loop (or GPU kernel) @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points) @threaded parallelization_backend for point in points - # Now we can safely assume that `point` is inbounds - @inbounds foreach_neighbor(f, system_coords, neighbor_coords, - neighborhood_search, point) + foreach_neighbor_unsafe(f, system_coords, neighbor_coords, + neighborhood_search, point) end return nothing end """ - foreach_neighbor(f, system_coords, neighbor_system_coords, + foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search::AbstractNeighborhoodSearch, point; search_radius = search_radius(neighborhood_search)) @@ -247,7 +247,7 @@ This method performs bounds checks, even when called with `@inbounds`. from `system_coords`. See [`foreach_neighbor_unsafe`](@ref) for a version that skips all bounds checks. """ -@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_system_coords, +@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search::AbstractNeighborhoodSearch, point; search_radius = search_radius(neighborhood_search)) @@ -258,21 +258,21 @@ See [`foreach_neighbor_unsafe`](@ref) for a version that skips all bounds checks # To avoid this, we have to use a function barrier to disable the `@inbounds` again. point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point) - foreach_neighbor(f, neighbor_system_coords, neighborhood_search, + foreach_neighbor(f, neighbor_coords, neighborhood_search, point, point_coords, search_radius) end # This is a function barrier to prevent the `@inbounds` in `foreach_neighbor` # from propagating into the neighbor loop, which is not safe. -@inline function foreach_neighbor(f, neighbor_system_coords, +@inline function foreach_neighbor(f, neighbor_coords, neighborhood_search::AbstractNeighborhoodSearch, point, point_coords, search_radius) - foreach_neighbor_inner(f, neighbor_system_coords, neighborhood_search, + foreach_neighbor_inner(f, neighbor_coords, neighborhood_search, point, point_coords, search_radius) end """ - foreach_neighbor_unsafe(f, system_coords, neighbor_system_coords, + foreach_neighbor_unsafe(f, system_coords, neighbor_coords, neighborhood_search::AbstractNeighborhoodSearch, point; search_radius = search_radius(neighborhood_search)) @@ -282,13 +282,13 @@ Like [`foreach_neighbor`](@ref), but skips **all** bounds checks. - Check if `point` is in bounds of `system_coords`. This is the only bounds check that is skipped when calling `foreach_neighbor` with `@inbounds`, and the only one that is safe to skip when `point` is guaranteed to be in bounds of `system_coords`. -- Check if the neighbors of `point` are in bounds of `neighbor_system_coords`. +- Check if the neighbors of `point` are in bounds of `neighbor_coords`. This is not safe to skip when the neighborhood search was not initialized correctly; for example when initialized with coordinate arrays of different sizes than the ones passed to this function or when the internal data structures have been tampered with. In this case, the cell lists (for [`GridNeighborhoodSearch`](@ref)) or neighbor lists (for [`PrecomputedNeighborhoodSearch`](@ref)) might contain indices that are out of - bounds for `neighbor_system_coords`. + bounds for `neighbor_coords`. - With `GridNeighborhoodSearch` and [`FullGridCellList`](@ref), check if the neighboring cells are in bounds of the grid. Again, this is not safe to skip when the neighborhood search might not have been initialized correctly. @@ -298,21 +298,21 @@ Like [`foreach_neighbor`](@ref), but skips **all** bounds checks. Note that all these bounds checks are safe to skip if - `point` is guaranteed to be in bounds of `system_coords`, - the neighborhood search was initialized correctly with `system_coords` - and `neighbor_system_coords` and has not been tampered with since then. + and `neighbor_coords` and has not been tampered with since then. !!! warning Use this only when `point` is known to be in bounds of `system_coords` and when `neighborhood_search` is known to be initialized correctly for - `system_coords` and `neighbor_system_coords`. + `system_coords` and `neighbor_coords`. """ -@inline function foreach_neighbor_unsafe(f, system_coords, neighbor_system_coords, +@inline function foreach_neighbor_unsafe(f, system_coords, neighbor_coords, neighborhood_search::AbstractNeighborhoodSearch, point; search_radius = search_radius(neighborhood_search)) point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), point) - @inbounds foreach_neighbor_inner(f, neighbor_system_coords, neighborhood_search, + @inbounds foreach_neighbor_inner(f, neighbor_coords, neighborhood_search, point, point_coords, search_radius) end @@ -321,16 +321,17 @@ end # performance. `PrecomputedNeighborhoodSearch` can skip the distance check altogether. # Note that calling this function with `@inbounds` is not safe. # See the comments in `foreach_neighbor_unsafe`. -@propagate_inbounds function foreach_neighbor_inner(f, neighbor_system_coords, +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, neighborhood_search::AbstractNeighborhoodSearch, point, point_coords, search_radius) (; periodic_box) = neighborhood_search for neighbor in eachneighbor(point_coords, neighborhood_search) - neighbor_coords = extract_svector(neighbor_system_coords, - Val(ndims(neighborhood_search)), neighbor) + neighbor_point_coords = extract_svector(neighbor_coords, + Val(ndims(neighborhood_search)), neighbor) - pos_diff = convert.(eltype(neighborhood_search), point_coords - neighbor_coords) + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) distance2 = dot(pos_diff, pos_diff) (pos_diff, diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 32953c8e..8e3c1ff2 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -510,7 +510,7 @@ end # than looping over `eachneighbor`. # Note that calling this function with `@inbounds` is not safe. # See the comments in `foreach_neighbor_unsafe`. -@propagate_inbounds function foreach_neighbor_inner(f, neighbor_system_coords, +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, neighborhood_search::GridNeighborhoodSearch, point, point_coords, search_radius) (; cell_list, periodic_box) = neighborhood_search @@ -532,15 +532,17 @@ end # Making the following `@inbounds` is not safe because we don't know # if `neighbor` (extracted from the cell list) is in bounds. - neighbor_coords = extract_svector(neighbor_system_coords, - Val(ndims(neighborhood_search)), neighbor) + neighbor_point_coords = extract_svector(neighbor_coords, + Val(ndims(neighborhood_search)), + neighbor) - pos_diff = convert.(eltype(neighborhood_search), point_coords - neighbor_coords) + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) distance2 = dot(pos_diff, pos_diff) - pos_diff, - distance2 = compute_periodic_distance(pos_diff, distance2, - search_radius, periodic_box) + (pos_diff, + distance2) = compute_periodic_distance(pos_diff, distance2, + search_radius, periodic_box) if distance2 <= search_radius^2 distance = sqrt(distance2) @@ -548,7 +550,7 @@ end # If this cell has a collision, check if this point belongs to this cell # (only with `SpatialHashingCellList`). if cell_collision && - check_collision(neighbor_cell_, neighbor_coords, cell_list, + check_collision(neighbor_cell_, neighbor_point_coords, cell_list, neighborhood_search) continue end diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index 3495b198..c7ddceca 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -202,9 +202,9 @@ end # Note that calling this function with `@inbounds` is not safe. # See the comments in `foreach_neighbor_unsafe`. -@propagate_inbounds function foreach_neighbor_unsafe(f, neighbor_system_coords, - neighborhood_search::PrecomputedNeighborhoodSearch, - point, point_coords, search_radius) +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, + neighborhood_search::PrecomputedNeighborhoodSearch, + point, point_coords, search_radius) (; periodic_box, neighbor_lists) = neighborhood_search # Making the following `@inbounds` is not safe because the neighbor list @@ -216,15 +216,16 @@ end # Making this `@inbounds` is not safe because # `neighbor` (extracted from the neighbor list) is only guaranteed to be in bounds # if the neighbor lists were constructed correctly and have not been corrupted. - neighbor_coords = extract_svector(neighbor_system_coords, - Val(ndims(neighborhood_search)), neighbor) + neighbor_point_coords = extract_svector(neighbor_coords, + Val(ndims(neighborhood_search)), neighbor) - pos_diff = convert.(eltype(neighborhood_search), point_coords - neighbor_coords) + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) distance2 = dot(pos_diff, pos_diff) - pos_diff, - distance2 = compute_periodic_distance(pos_diff, distance2, search_radius, - periodic_box) + (pos_diff, + distance2) = compute_periodic_distance(pos_diff, distance2, search_radius, + periodic_box) distance = sqrt(distance2) From be87e63034236da9d961700be02775cf708e88ec Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 14 Apr 2026 10:58:32 +0200 Subject: [PATCH 03/19] Add tests for unsafe functions --- test/neighborhood_search.jl | 61 +++++++++++++++++++++++++++++++------ 1 file changed, 52 insertions(+), 9 deletions(-) diff --git a/test/neighborhood_search.jl b/test/neighborhood_search.jl index 65d3ed85..3820e11d 100644 --- a/test/neighborhood_search.jl +++ b/test/neighborhood_search.jl @@ -254,7 +254,7 @@ names_copied = [name * " copied" for name in names] append!(names, names_copied) - @testset "$(names[i])" for i in eachindex(names) + @testset verbose=true "$(names[i])" for i in eachindex(names) nhs = neighborhood_searches[i] # Initialize with `seed = 1` @@ -267,17 +267,60 @@ update!(nhs, coords, coords) end - neighbors = [Int[] for _ in axes(coords, 2)] + # Test the regular `foreach_point_neighbor` + @testset "`foreach_point_neighbor`" begin + neighbors = [Int[] for _ in axes(coords, 2)] + foreach_point_neighbor(coords, coords, nhs, + parallelization_backend = SerialBackend()) do point, + neighbor, + pos_diff, + distance + append!(neighbors[point], neighbor) + end + + @test sort.(neighbors) == neighbors_expected + end - foreach_point_neighbor(coords, coords, nhs, - parallelization_backend = SerialBackend()) do point, - neighbor, - pos_diff, - distance - append!(neighbors[point], neighbor) + # Test manual loop with `foreach_neighbor` + @testset "Manual Loop with `foreach_neighbor`" begin + neighbors_manual = [Int[] for _ in axes(coords, 2)] + for point in axes(coords, 2) + foreach_neighbor(coords, coords, nhs, + point) do point, neighbor, pos_diff, distance + append!(neighbors_manual[point], neighbor) + end + end + + @test sort.(neighbors_manual) == neighbors_expected end - @test sort.(neighbors) == neighbors_expected + # Repeat with foreach_point_neighbor_unsafe + @testset "`foreach_point_neighbor_unsafe`" begin + neighbors_unsafe = [Int[] for _ in axes(coords, 2)] + foreach_point_neighbor_unsafe(coords, coords, nhs, + parallelization_backend = SerialBackend()) do point, + neighbor, + pos_diff, + distance + append!(neighbors_unsafe[point], neighbor) + end + + @test sort.(neighbors_unsafe) == neighbors_expected + end + + # Repeat with manual loop with `foreach_neighbor_unsafe` + @testset "Manual Loop with `foreach_neighbor_unsafe`" begin + neighbors_manual_unsafe = [Int[] for _ in axes(coords, 2)] + for point in axes(coords, 2) + foreach_neighbor_unsafe(coords, coords, nhs, + point) do point, neighbor, + pos_diff, distance + append!(neighbors_manual_unsafe[point], neighbor) + end + end + + @test sort.(neighbors_manual_unsafe) == neighbors_expected + end end end end From 149da3367698bac13d6e8c0b575ff66bf1e905bd Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 14 Apr 2026 11:02:41 +0200 Subject: [PATCH 04/19] Include unused tests for spatial hashing --- test/cell_lists/spatial_hashing.jl | 30 +++++++++++++++++------------- test/unittest.jl | 1 + 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/test/cell_lists/spatial_hashing.jl b/test/cell_lists/spatial_hashing.jl index 28d13b12..6a8e8104 100644 --- a/test/cell_lists/spatial_hashing.jl +++ b/test/cell_lists/spatial_hashing.jl @@ -71,9 +71,11 @@ @testset "Cell Coordinates Hash Function" begin # 1D coordinates - @test coordinates_flattened([1]) == UInt128(reinterpret(UInt32, Int32(1))) - @test coordinates_flattened([-1]) == UInt128(reinterpret(UInt32, Int32(-1))) - @test coordinates_flattened([0]) == Int128(0) + @test PointNeighbors.coordinates_flattened([1]) == + UInt128(reinterpret(UInt32, Int32(1))) + @test PointNeighbors.coordinates_flattened([-1]) == + UInt128(reinterpret(UInt32, Int32(-1))) + @test PointNeighbors.coordinates_flattened([0]) == Int128(0) # 2D coordinates coord2 = [-1, 1] @@ -82,18 +84,20 @@ # The second coordinate gives 1 shifted by 32 bits, so 1 * 2^32. expected2 = UInt128(2^32 - 1 + 2^32) - @test coordinates_flattened(coord2) == expected2 + @test PointNeighbors.coordinates_flattened(coord2) == expected2 # 3D coordinates coord3 = [1, 0, -1] expected3 = UInt128(1 + 0 * 2^32 + (2^32 - 1) * Int128(2)^64) - @test coordinates_flattened(coord3) == expected3 + @test PointNeighbors.coordinates_flattened(coord3) == expected3 # Extreme Int32 bounds max_val = Int32(typemax(Int32)) min_val = Int32(typemin(Int32)) - @test coordinates_flattened((max_val,)) == UInt128(reinterpret(UInt32, max_val)) - @test coordinates_flattened((min_val,)) == UInt128(reinterpret(UInt32, min_val)) + @test PointNeighbors.coordinates_flattened((max_val,)) == + UInt128(reinterpret(UInt32, max_val)) + @test PointNeighbors.coordinates_flattened((min_val,)) == + UInt128(reinterpret(UInt32, min_val)) # 3D extreme Int32 bounds coord4 = [min_val, max_val, Int32(0)] @@ -102,19 +106,19 @@ # `typemax(Int32)` gives the unsigned value 2^31 - 1. expected4 = UInt128(2^31 + (2^31 - 1) * 2^32) - @test coordinates_flattened(coord4) == expected4 + @test PointNeighbors.coordinates_flattened(coord4) == expected4 # Passing non-Int32-coercible should error large_val = typemax(Int32) + 1 - @test_throws InexactError coordinates_flattened([large_val]) + @test_throws InexactError PointNeighbors.coordinates_flattened([large_val]) small_val = typemin(Int32) - 1 - @test_throws InexactError coordinates_flattened([small_val]) + @test_throws InexactError PointNeighbors.coordinates_flattened([small_val]) - @test_throws InexactError coordinates_flattened([Inf]) - @test_throws InexactError coordinates_flattened([NaN]) + @test_throws InexactError PointNeighbors.coordinates_flattened([Inf]) + @test_throws InexactError PointNeighbors.coordinates_flattened([NaN]) # Too many dimensions should throw assertion error - @test_throws AssertionError coordinates_flattened([1, 2, 3, 4]) + @test_throws AssertionError PointNeighbors.coordinates_flattened([1, 2, 3, 4]) end end diff --git a/test/unittest.jl b/test/unittest.jl index f0981614..262b1753 100644 --- a/test/unittest.jl +++ b/test/unittest.jl @@ -7,4 +7,5 @@ include("nhs_precomputed.jl") include("neighborhood_search.jl") include("cell_lists/full_grid.jl") + include("cell_lists/spatial_hashing.jl") end; From e3194a6d815309f21cdff4331f608190126c5a35 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 14 Apr 2026 11:11:38 +0200 Subject: [PATCH 05/19] Bump version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index d4654124..32e1fdf5 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "PointNeighbors" uuid = "1c4d5385-0a27-49de-8e2c-43b175c8985c" authors = ["Erik Faulhaber "] -version = "0.6.6-dev" +version = "0.6.6" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" From f11877cef9d03edd2b969cd9565af214654f084a Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 1 Apr 2026 12:51:29 +0200 Subject: [PATCH 06/19] Rewrite benchmarking code --- benchmarks/run_benchmarks.jl | 46 ++++++---- benchmarks/smoothed_particle_hydrodynamics.jl | 89 ++++++++----------- test/point_cloud.jl | 25 +++++- 3 files changed, 91 insertions(+), 69 deletions(-) diff --git a/benchmarks/run_benchmarks.jl b/benchmarks/run_benchmarks.jl index ffabc805..062e6bbe 100644 --- a/benchmarks/run_benchmarks.jl +++ b/benchmarks/run_benchmarks.jl @@ -50,10 +50,11 @@ run_benchmark(benchmark_count_neighbors, (10, 10), 3, ``` """ function run_benchmark(benchmark, n_points_per_dimension, iterations, neighborhood_searches; + search_radius_factor = 3.0, parallelization_backend = PolyesterBackend(), names = ["Neighborhood search $i" for i in 1:length(neighborhood_searches)]', - seed = 1, perturbation_factor_position = 1.0) + seed = 1, perturbation_factor_position = 1.0, shuffle = false) # Multiply number of points in each iteration (roughly) by this factor scaling_factor = 4 per_dimension_factor = scaling_factor^(1 / length(n_points_per_dimension)) @@ -64,24 +65,28 @@ function run_benchmark(benchmark, n_points_per_dimension, iterations, neighborho times = zeros(iterations, length(neighborhood_searches)) for iter in 1:iterations - coordinates = point_cloud(sizes[iter]; seed, perturbation_factor_position) + coordinates_ = point_cloud(sizes[iter]; seed, perturbation_factor_position, shuffle) + coordinates = convert.(typeof(search_radius_factor), coordinates_) domain_size = maximum(sizes[iter]) + 1 # Normalize domain size to 1 coordinates ./= domain_size # Make this Float32 to make sure that Float32 benchmarks use Float32 exclusively - search_radius = 4.0f0 / domain_size + search_radius = search_radius_factor / domain_size n_particles = size(coordinates, 2) neighborhood_searches_copy = copy_neighborhood_search.(neighborhood_searches, search_radius, n_particles) for i in eachindex(neighborhood_searches_copy) - neighborhood_search = neighborhood_searches_copy[i] - PointNeighbors.initialize!(neighborhood_search, coordinates, coordinates) + neighborhood_search_ = neighborhood_searches_copy[i] + neighborhood_search = PointNeighbors.Adapt.adapt(parallelization_backend, + neighborhood_search_) + coords = PointNeighbors.Adapt.adapt(parallelization_backend, coordinates) + PointNeighbors.initialize!(neighborhood_search, coords, coords) - time = benchmark(neighborhood_search, coordinates; parallelization_backend) + time = benchmark(neighborhood_search, coords; parallelization_backend) times[iter, i] = time time_string = BenchmarkTools.prettytime(time * 1e9) time_string_per_particle = BenchmarkTools.prettytime(time * 1e9 / n_particles) @@ -170,23 +175,30 @@ include("benchmarks/benchmarks.jl") run_benchmark_gpu(benchmark_n_body, (10, 10), 3) ``` """ -function run_benchmark_gpu(benchmark, n_points_per_dimension, iterations; kwargs...) +function run_benchmark_gpu(benchmark, n_points_per_dimension, iterations; + parallelization_backend=PolyesterBackend(), kwargs...) NDIMS = length(n_points_per_dimension) min_corner = 0.0f0 .* n_points_per_dimension max_corner = Float32.(n_points_per_dimension ./ maximum(n_points_per_dimension)) - neighborhood_searches = [GridNeighborhoodSearch{NDIMS}(search_radius = 0.0f0, - cell_list = FullGridCellList(; - search_radius = 0.0f0, - min_corner, - max_corner)) - PrecomputedNeighborhoodSearch{NDIMS}(search_radius = 0.0f0)] - - names = ["GridNeighborhoodSearch with FullGridCellList";; - "PrecomputedNeighborhoodSearch"] + cell_list = FullGridCellList(; search_radius = 0.0f0, min_corner, max_corner) + grid_nhs = GridNeighborhoodSearch{NDIMS}(; search_radius = 0.0f0, cell_list, + update_strategy = ParallelUpdate()) + transpose_backend = parallelization_backend isa PointNeighbors.KernelAbstractions.GPU + neighborhood_searches = [ + grid_nhs + PrecomputedNeighborhoodSearch{NDIMS}(; search_radius = 0.0f0, + update_neighborhood_search = grid_nhs, + transpose_backend) + ] + + names = [ + "GridNeighborhoodSearch with FullGridCellList";; + "PrecomputedNeighborhoodSearch" + ] run_benchmark(benchmark, n_points_per_dimension, iterations, - neighborhood_searches; names, kwargs...) + neighborhood_searches; names, parallelization_backend, kwargs...) end """ diff --git a/benchmarks/smoothed_particle_hydrodynamics.jl b/benchmarks/smoothed_particle_hydrodynamics.jl index 7286baca..bfb760ac 100644 --- a/benchmarks/smoothed_particle_hydrodynamics.jl +++ b/benchmarks/smoothed_particle_hydrodynamics.jl @@ -1,4 +1,5 @@ using PointNeighbors +using PointNeighbors.Adapt using TrixiParticles using BenchmarkTools @@ -43,47 +44,25 @@ This method is used to simulate an incompressible fluid. """ function benchmark_wcsph(neighborhood_search, coordinates; parallelization_backend = default_backend(coordinates)) - density = 1000.0 - particle_spacing = PointNeighbors.search_radius(neighborhood_search) / 3 - fluid = InitialCondition(; coordinates, density, mass = 0.1, particle_spacing) - - sound_speed = 10.0 - state_equation = StateEquationCole(; sound_speed, reference_density = density, - exponent = 1) - - viscosity = ArtificialViscosityMonaghan(alpha = 0.02, beta = 0.0) - density_diffusion = DensityDiffusionMolteniColagrossi(delta = 0.1) - - __benchmark_wcsph_inner(neighborhood_search, fluid, state_equation, - viscosity, density_diffusion, parallelization_backend) -end - -""" - benchmark_wcsph_fp32(neighborhood_search, coordinates; - parallelization_backend = default_backend(coordinates)) + # System initialization has to happen on the CPU + coordinates_cpu = PointNeighbors.Adapt.adapt(Array, coordinates) -Like [`benchmark_wcsph`](@ref), but using single precision floating point numbers. -""" -function benchmark_wcsph_fp32(neighborhood_search, coordinates_; - parallelization_backend = default_backend(coordinates_)) - coordinates = convert(Matrix{Float32}, coordinates_) - density = 1000.0f0 + search_radius = PointNeighbors.search_radius(neighborhood_search) + ELTYPE = typeof(search_radius) + density = convert(ELTYPE, 1000.0) particle_spacing = PointNeighbors.search_radius(neighborhood_search) / 3 - fluid = InitialCondition(; coordinates, density, mass = 0.1f0, particle_spacing) + fluid = InitialCondition(; coordinates = coordinates_cpu, density, + mass = convert(ELTYPE, 0.1) * particle_spacing, + particle_spacing) - sound_speed = 10.0f0 + sound_speed = convert(ELTYPE, 10.0) state_equation = StateEquationCole(; sound_speed, reference_density = density, exponent = 1) - viscosity = ArtificialViscosityMonaghan(alpha = 0.02f0, beta = 0.0f0) - density_diffusion = DensityDiffusionMolteniColagrossi(delta = 0.1f0) - - __benchmark_wcsph_inner(neighborhood_search, fluid, state_equation, - viscosity, density_diffusion, parallelization_backend) -end + viscosity = ArtificialViscosityMonaghan(alpha = convert(ELTYPE, 0.02), + beta = convert(ELTYPE, 0.0)) + density_diffusion = DensityDiffusionMolteniColagrossi(delta = convert(ELTYPE, 0.1)) -function __benchmark_wcsph_inner(neighborhood_search, initial_condition, state_equation, - viscosity, density_diffusion, parallelization_backend) # Compact support == 2 * smoothing length for these kernels smoothing_length = PointNeighbors.search_radius(neighborhood_search) / 2 if ndims(neighborhood_search) == 1 @@ -92,23 +71,21 @@ function __benchmark_wcsph_inner(neighborhood_search, initial_condition, state_e smoothing_kernel = WendlandC2Kernel{ndims(neighborhood_search)}() end - fluid_system = WeaklyCompressibleSPHSystem(initial_condition, ContinuityDensity(), + fluid_system = WeaklyCompressibleSPHSystem(fluid, ContinuityDensity(), state_equation, smoothing_kernel, smoothing_length, viscosity = viscosity, density_diffusion = density_diffusion) - system = PointNeighbors.Adapt.adapt(parallelization_backend, fluid_system) + system = Adapt.adapt(parallelization_backend, fluid_system) # Remove unnecessary data structures that are only used for initialization - neighborhood_search_ = PointNeighbors.freeze_neighborhood_search(neighborhood_search) + nhs = PointNeighbors.freeze_neighborhood_search(neighborhood_search) - nhs = PointNeighbors.Adapt.adapt(parallelization_backend, neighborhood_search_) semi = DummySemidiscretization(nhs, parallelization_backend, true) - v = PointNeighbors.Adapt.adapt(parallelization_backend, - vcat(initial_condition.velocity, - initial_condition.density')) - u = PointNeighbors.Adapt.adapt(parallelization_backend, initial_condition.coordinates) + v = Adapt.adapt(parallelization_backend, + vcat(fluid.velocity, fluid.density')) + u = Adapt.adapt(parallelization_backend, fluid.coordinates) dv = zero(v) # Initialize the system @@ -128,8 +105,15 @@ This method is used to simulate an elastic structure. """ function benchmark_tlsph(neighborhood_search, coordinates; parallelization_backend = default_backend(coordinates)) - material = (density = 1000.0, E = 1.4e6, nu = 0.4) - solid = InitialCondition(; coordinates, density = material.density, mass = 0.1) + # System initialization has to happen on the CPU + coordinates_cpu = PointNeighbors.Adapt.adapt(Array, coordinates) + + search_radius = PointNeighbors.search_radius(neighborhood_search) + ELTYPE = typeof(search_radius) + material = (density = convert(ELTYPE, 1000.0), E = convert(ELTYPE, 1.4e6), + nu = convert(ELTYPE, 0.4)) + solid = InitialCondition(; coordinates = coordinates_cpu, + density = material.density, mass = convert(ELTYPE, 0.1)) # Compact support == 2 * smoothing length for these kernels smoothing_length_ = PointNeighbors.search_radius(neighborhood_search) / 2 @@ -142,15 +126,20 @@ function benchmark_tlsph(neighborhood_search, coordinates; solid_system = TotalLagrangianSPHSystem(solid, smoothing_kernel, smoothing_length, material.E, material.nu) - semi = DummySemidiscretization(neighborhood_search, parallelization_backend, true) + system_ = Adapt.adapt(parallelization_backend, solid_system) - v = copy(solid.velocity) - u = copy(solid.coordinates) + # Remove unnecessary data structures that are only used for initialization + nhs = PointNeighbors.freeze_neighborhood_search(neighborhood_search) + system = TrixiParticles.@set system_.self_interaction_nhs = nhs + + semi = DummySemidiscretization(nhs, parallelization_backend, true) + + v = Adapt.adapt(parallelization_backend, copy(solid.velocity)) + u = Adapt.adapt(parallelization_backend, copy(solid.coordinates)) dv = zero(v) # Initialize the system - TrixiParticles.initialize!(solid_system, semi) + TrixiParticles.initialize!(system, semi) - return @belapsed TrixiParticles.interact!($dv, $v, $u, $v, $u, - $solid_system, $solid_system, $semi) + return @belapsed TrixiParticles.interact_structure_structure2!($dv, $v, $system, $semi) end diff --git a/test/point_cloud.jl b/test/point_cloud.jl index ba99b2e1..56a280de 100644 --- a/test/point_cloud.jl +++ b/test/point_cloud.jl @@ -2,7 +2,7 @@ using Random # Generate a rectangular point cloud, optionally with a perturbation in the point positions function point_cloud(n_points_per_dimension; - seed = 1, perturbation_factor_position = 1.0) + seed = 1, perturbation_factor_position = 1.0, shuffle = false) # Fixed seed to ensure reproducibility Random.seed!(seed) @@ -10,8 +10,15 @@ function point_cloud(n_points_per_dimension; coordinates = Array{Float64}(undef, n_dims, prod(n_points_per_dimension)) cartesian_indices = CartesianIndices(n_points_per_dimension) + # Extra data structures for the sorting code below + cell_coords = Vector{SVector{n_dims, Int}}(undef, size(coordinates, 2)) + cell_size = ntuple(dim -> 4.0, n_dims) + for i in axes(coordinates, 2) - coordinates[:, i] .= Tuple(cartesian_indices[i]) + point_coords = SVector(Tuple(cartesian_indices[i])) + coordinates[:, i] .= point_coords + cell_coords[i] = PointNeighbors.cell_coords(point_coords, nothing, nothing, + cell_size) .+ 1 end # A standard deviation of 0.05 in the particle coordinates @@ -21,6 +28,20 @@ function point_cloud(n_points_per_dimension; # The benchmark results are also consistent with the timer output of the simulation. perturb!(coordinates, perturbation_factor_position * 0.05) + # Sort by Z index (with `using Morton`) + # permutation = sortperm(cell_coords, by = c -> cartesian2morton(c)) + + # Sort by linear cell index + # permutation = sortperm(cell_coords) + # coordinates .= coordinates[:, permutation] + + if shuffle + # Sort randomly + permutation = shuffle(axes(coordinates, 2)) + + coordinates .= coordinates[:, permutation] + end + return coordinates end From dd890ebe44dafd19978f8168456863f40b8bb3f0 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:28:12 +0200 Subject: [PATCH 07/19] Use random density for WCSPH benchmark --- benchmarks/smoothed_particle_hydrodynamics.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/smoothed_particle_hydrodynamics.jl b/benchmarks/smoothed_particle_hydrodynamics.jl index bfb760ac..88e38508 100644 --- a/benchmarks/smoothed_particle_hydrodynamics.jl +++ b/benchmarks/smoothed_particle_hydrodynamics.jl @@ -55,6 +55,11 @@ function benchmark_wcsph(neighborhood_search, coordinates; mass = convert(ELTYPE, 0.1) * particle_spacing, particle_spacing) + # Make sure that the computed forces are not all zero + for i in eachindex(fluid.density) + fluid.density[i] += rand(eltype(fluid.density)) + end + sound_speed = convert(ELTYPE, 10.0) state_equation = StateEquationCole(; sound_speed, reference_density = density, exponent = 1) From 6ecd287092ab2d70cdac9f2d32700ff5d4b630d7 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:28:28 +0200 Subject: [PATCH 08/19] Sort point cloud for benchmarking --- test/neighborhood_search.jl | 6 +++--- test/point_cloud.jl | 24 ++++++++++++++---------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/test/neighborhood_search.jl b/test/neighborhood_search.jl index 3820e11d..ed1a80dd 100644 --- a/test/neighborhood_search.jl +++ b/test/neighborhood_search.jl @@ -142,14 +142,14 @@ "($(seed == 1 ? "`initialize!`" : "`update!`"))" @testset verbose=true "$(name(cloud_size, seed)))" for cloud_size in cloud_sizes, seed in seeds - coords = point_cloud(cloud_size, seed = seed) + search_radius = 2.5 + coords = point_cloud(cloud_size, search_radius, seed = seed) NDIMS = length(cloud_size) n_points = size(coords, 2) - search_radius = 2.5 # Use different coordinates for `initialize!` and then `update!` with the # correct coordinates to make sure that `update!` is working as well. - coords_initialize = point_cloud(cloud_size, seed = 1) + coords_initialize = point_cloud(cloud_size, search_radius, seed = 1) # Compute expected neighbor lists by brute-force looping over all points # as potential neighbors (`TrivialNeighborhoodSearch`). diff --git a/test/point_cloud.jl b/test/point_cloud.jl index 56a280de..64c41bec 100644 --- a/test/point_cloud.jl +++ b/test/point_cloud.jl @@ -1,8 +1,9 @@ using Random # Generate a rectangular point cloud, optionally with a perturbation in the point positions -function point_cloud(n_points_per_dimension; - seed = 1, perturbation_factor_position = 1.0, shuffle = false) +function point_cloud(n_points_per_dimension, search_radius; + seed = 1, perturbation_factor_position = 1.0, + sort = true, shuffle = false) # Fixed seed to ensure reproducibility Random.seed!(seed) @@ -12,7 +13,7 @@ function point_cloud(n_points_per_dimension; # Extra data structures for the sorting code below cell_coords = Vector{SVector{n_dims, Int}}(undef, size(coordinates, 2)) - cell_size = ntuple(dim -> 4.0, n_dims) + cell_size = ntuple(dim -> search_radius, n_dims) for i in axes(coordinates, 2) point_coords = SVector(Tuple(cartesian_indices[i])) @@ -28,17 +29,20 @@ function point_cloud(n_points_per_dimension; # The benchmark results are also consistent with the timer output of the simulation. perturb!(coordinates, perturbation_factor_position * 0.05) - # Sort by Z index (with `using Morton`) - # permutation = sortperm(cell_coords, by = c -> cartesian2morton(c)) - # Sort by linear cell index - # permutation = sortperm(cell_coords) - # coordinates .= coordinates[:, permutation] + if sort + if shuffle + throw(ArgumentError("cannot sort and shuffle at the same time")) + end + + # Sort by Z index (with `using Morton`) + # permutation = sortperm(cell_coords, by = c -> cartesian2morton(c)) - if shuffle + permutation = sortperm(cell_coords) + coordinates .= coordinates[:, permutation] + elseif shuffle # Sort randomly permutation = shuffle(axes(coordinates, 2)) - coordinates .= coordinates[:, permutation] end From 9bce3baab5f5c16a938a43d9cddc2b1e122a7063 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 9 Apr 2026 10:44:49 +0200 Subject: [PATCH 09/19] Pre-sort benchmark and add `run_benchmark_full_grid` --- benchmarks/run_benchmarks.jl | 49 ++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/benchmarks/run_benchmarks.jl b/benchmarks/run_benchmarks.jl index 062e6bbe..ff08ebe3 100644 --- a/benchmarks/run_benchmarks.jl +++ b/benchmarks/run_benchmarks.jl @@ -65,7 +65,8 @@ function run_benchmark(benchmark, n_points_per_dimension, iterations, neighborho times = zeros(iterations, length(neighborhood_searches)) for iter in 1:iterations - coordinates_ = point_cloud(sizes[iter]; seed, perturbation_factor_position, shuffle) + coordinates_ = point_cloud(sizes[iter], search_radius_factor; + seed, perturbation_factor_position, shuffle) coordinates = convert.(typeof(search_radius_factor), coordinates_) domain_size = maximum(sizes[iter]) + 1 @@ -189,7 +190,7 @@ function run_benchmark_gpu(benchmark, n_points_per_dimension, iterations; grid_nhs PrecomputedNeighborhoodSearch{NDIMS}(; search_radius = 0.0f0, update_neighborhood_search = grid_nhs, - transpose_backend) + transpose_backend)#, max_neighbors=128) ] names = [ @@ -201,6 +202,50 @@ function run_benchmark_gpu(benchmark, n_points_per_dimension, iterations; neighborhood_searches; names, parallelization_backend, kwargs...) end +""" + run_benchmark_full_grid(benchmark, n_points_per_dimension, iterations; kwargs...) + +Shortcut to call [`run_benchmark`](@ref) with a `GridNeighborhoodSearch` with a +`FullGridCellList`. This is the neighborhood search implementation that is used +in TrixiParticles.jl when performance is important. +Use this function to benchmark and profile TrixiParticles.jl kernels. + +# Arguments +- `benchmark`: The benchmark function. See [`benchmark_count_neighbors`](@ref), + [`benchmark_n_body`](@ref), [`benchmark_wcsph`](@ref), + [`benchmark_wcsph_fp32`](@ref) and [`benchmark_tlsph`](@ref). +- `n_points_per_dimension`: Initial resolution as tuple. The product is the initial number + of points. For example, use `(100, 100)` for a 2D benchmark or + `(10, 10, 10)` for a 3D benchmark. +- `iterations`: Number of refinement iterations + +# Keywords +See [`run_benchmark`](@ref) for a list of available keywords. + +# Examples +```julia +include("benchmarks/benchmarks.jl") + +run_benchmark_full_grid(benchmark_n_body, (10, 10), 3) +``` +""" +function run_benchmark_full_grid(benchmark, n_points_per_dimension, iterations; + parallelization_backend=PolyesterBackend(), kwargs...) + NDIMS = length(n_points_per_dimension) + + min_corner = 0.0f0 .* n_points_per_dimension + max_corner = Float32.(n_points_per_dimension ./ maximum(n_points_per_dimension)) + cell_list = FullGridCellList(; search_radius = 0.0f0, min_corner, max_corner) + grid_nhs = GridNeighborhoodSearch{NDIMS}(; search_radius = 0.0f0, cell_list, + update_strategy = ParallelUpdate()) + neighborhood_searches = [grid_nhs] + + names = ["GridNeighborhoodSearch with FullGridCellList";;] + + run_benchmark(benchmark, n_points_per_dimension, iterations, + neighborhood_searches; names, parallelization_backend, kwargs...) +end + """ plot_benchmark(n_particles_vec, times; kwargs...) From 7f814ce864e6405999b4c97531110852dedf8fea Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 21 Apr 2026 12:12:34 +0200 Subject: [PATCH 10/19] Update TLSPH benchmark --- benchmarks/smoothed_particle_hydrodynamics.jl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/benchmarks/smoothed_particle_hydrodynamics.jl b/benchmarks/smoothed_particle_hydrodynamics.jl index 88e38508..06e24301 100644 --- a/benchmarks/smoothed_particle_hydrodynamics.jl +++ b/benchmarks/smoothed_particle_hydrodynamics.jl @@ -117,8 +117,11 @@ function benchmark_tlsph(neighborhood_search, coordinates; ELTYPE = typeof(search_radius) material = (density = convert(ELTYPE, 1000.0), E = convert(ELTYPE, 1.4e6), nu = convert(ELTYPE, 0.4)) + + # The `particle_spacing` is only required for setting the type of the initial condition solid = InitialCondition(; coordinates = coordinates_cpu, - density = material.density, mass = convert(ELTYPE, 0.1)) + density = material.density, mass = convert(ELTYPE, 0.1), + particle_spacing = search_radius) # Compact support == 2 * smoothing length for these kernels smoothing_length_ = PointNeighbors.search_radius(neighborhood_search) / 2 @@ -129,8 +132,9 @@ function benchmark_tlsph(neighborhood_search, coordinates; smoothing_kernel = WendlandC2Kernel{ndims(neighborhood_search)}() end + penalty_force = PenaltyForceGanzenmueller(alpha = convert(ELTYPE, 0.1)) solid_system = TotalLagrangianSPHSystem(solid, smoothing_kernel, smoothing_length, - material.E, material.nu) + material.E, material.nu; penalty_force) system_ = Adapt.adapt(parallelization_backend, solid_system) # Remove unnecessary data structures that are only used for initialization @@ -146,5 +150,5 @@ function benchmark_tlsph(neighborhood_search, coordinates; # Initialize the system TrixiParticles.initialize!(system, semi) - return @belapsed TrixiParticles.interact_structure_structure2!($dv, $v, $system, $semi) + return @belapsed TrixiParticles.interact_structure_structure!($dv, $v, $system, $semi) end From 49bfced5323a2fdd5d79967dde7d159248fe08ee Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 4 May 2026 09:42:04 +0200 Subject: [PATCH 11/19] Fix for TP 0.5 --- benchmarks/smoothed_particle_hydrodynamics.jl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/smoothed_particle_hydrodynamics.jl b/benchmarks/smoothed_particle_hydrodynamics.jl index 06e24301..3d35f47d 100644 --- a/benchmarks/smoothed_particle_hydrodynamics.jl +++ b/benchmarks/smoothed_particle_hydrodynamics.jl @@ -76,10 +76,9 @@ function benchmark_wcsph(neighborhood_search, coordinates; smoothing_kernel = WendlandC2Kernel{ndims(neighborhood_search)}() end - fluid_system = WeaklyCompressibleSPHSystem(fluid, ContinuityDensity(), + fluid_system = WeaklyCompressibleSPHSystem(fluid; density_calculator = ContinuityDensity(), state_equation, smoothing_kernel, - smoothing_length, viscosity = viscosity, - density_diffusion = density_diffusion) + smoothing_length, viscosity, density_diffusion) system = Adapt.adapt(parallelization_backend, fluid_system) From b0e0742e4396c5ab1316d8500c91534b4a15f6c8 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 5 May 2026 16:07:51 +0200 Subject: [PATCH 12/19] Add SIMD stuff Co-authored-by: Copilot --- src/neighborhood_search.jl | 12 ++--- src/nhs_grid.jl | 99 +++++++++++++++++++++++++++++++++----- src/nhs_precomputed.jl | 35 ++++++++++++++ 3 files changed, 127 insertions(+), 19 deletions(-) diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl index cf02774e..7701713b 100644 --- a/src/neighborhood_search.jl +++ b/src/neighborhood_search.jl @@ -249,7 +249,7 @@ See [`foreach_neighbor_unsafe`](@ref) for a version that skips all bounds checks """ @propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search::AbstractNeighborhoodSearch, - point; + point, args...; search_radius = search_radius(neighborhood_search)) # Due to https://github.com/JuliaLang/julia/issues/30411, we cannot just remove # a `@boundscheck` by calling this function with `@inbounds` because it has a kwarg. @@ -259,16 +259,16 @@ See [`foreach_neighbor_unsafe`](@ref) for a version that skips all bounds checks point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point) foreach_neighbor(f, neighbor_coords, neighborhood_search, - point, point_coords, search_radius) + point, point_coords, search_radius, args...) end # This is a function barrier to prevent the `@inbounds` in `foreach_neighbor` # from propagating into the neighbor loop, which is not safe. @inline function foreach_neighbor(f, neighbor_coords, neighborhood_search::AbstractNeighborhoodSearch, - point, point_coords, search_radius) + point, point_coords, search_radius, args...) foreach_neighbor_inner(f, neighbor_coords, neighborhood_search, - point, point_coords, search_radius) + point, point_coords, search_radius, args...) end """ @@ -307,13 +307,13 @@ Note that all these bounds checks are safe to skip if """ @inline function foreach_neighbor_unsafe(f, system_coords, neighbor_coords, neighborhood_search::AbstractNeighborhoodSearch, - point; + point, args...; search_radius = search_radius(neighborhood_search)) point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), point) @inbounds foreach_neighbor_inner(f, neighbor_coords, neighborhood_search, - point, point_coords, search_radius) + point, point_coords, search_radius, args...) end # This is the generic function that is called for `TrivialNeighborhoodSearch`. diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 8e3c1ff2..94a28373 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -506,26 +506,97 @@ function check_cell_collision(neighbor_cell_::CartesianIndex, coords[hash] != PointNeighbors.coordinates_flattened(neighbor_cell) end +using LLVMLoopInfo # Specialized version of the function in `neighborhood_search.jl`, which is faster # than looping over `eachneighbor`. # Note that calling this function with `@inbounds` is not safe. # See the comments in `foreach_neighbor_unsafe`. +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, + neighborhood_search::GridNeighborhoodSearch, + point, point_coords, search_radius, data) + (; cell_list, periodic_box) = neighborhood_search + cell = cell_coords(point_coords, neighborhood_search) + + @inbounds @fastmath for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + neighbor_cell = Tuple(neighbor_cell_) + # @inbounds @fastmath for cellx in (cell[1] - 1):(cell[1] + 1), celly in (cell[2] - 1):(cell[2] + 1), cellz in (cell[3] - 1):(cell[3] + 1) + # neighbor_cell = (cellx, celly, cellz) + + # Making the following `@inbounds` is not safe because we don't know if + # `neighbor_cell` is in bounds of the grid. + neighbors = points_in_cell(neighbor_cell, neighborhood_search) + + # Boolean to indicate if this cell has a collision (only with `SpatialHashingCellList`) + # cell_collision = check_cell_collision(neighbor_cell_, + # cell_list, neighborhood_search) + + @loopinfo vectorwidth=8 predicate for neighbor_ in eachindex(neighbors) + neighbor = @inbounds neighbors[neighbor_] + + # Making the following `@inbounds` is not safe because we don't know + # if `neighbor` (extracted from the cell list) is in bounds. + neighbor_point_coords = extract_svector(neighbor_coords, + Val(ndims(neighborhood_search)), + neighbor) + + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) + distance2 = dot(pos_diff, pos_diff) + + # (pos_diff, + # distance2) = compute_periodic_distance(pos_diff, distance2, + # search_radius, periodic_box) + + distance = sqrt(distance2) + if distance2 <= search_radius^2 + # distance2 = ifelse(distance2 <= search_radius^2, distance2, zero(distance2)) + # distance = sqrt(distance2) + + # If this cell has a collision, check if this point belongs to this cell + # (only with `SpatialHashingCellList`). + # if cell_collision && + # check_collision(neighbor_cell_, neighbor_point_coords, cell_list, + # neighborhood_search) + # continue + # end + + # Inline to avoid loss of performance + # compared to not using `foreach_point_neighbor`. + data = @inline f(point, neighbor, pos_diff, distance, data) + end + end + end + + return data +end + @propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, neighborhood_search::GridNeighborhoodSearch, point, point_coords, search_radius) (; cell_list, periodic_box) = neighborhood_search cell = cell_coords(point_coords, neighborhood_search) - for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) - neighbor_cell = Tuple(neighbor_cell_) + @inbounds @fastmath for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + neighbor_cell = Tuple(neighbor_cell_) + # @inbounds @fastmath for cellx in (cell[1] - 1):(cell[1] + 1), celly in (cell[2] - 1):(cell[2] + 1), cellz in (cell[3] - 1):(cell[3] + 1) + # neighbor_cell = (cellx, celly, cellz) # Making the following `@inbounds` is not safe because we don't know if # `neighbor_cell` is in bounds of the grid. neighbors = points_in_cell(neighbor_cell, neighborhood_search) # Boolean to indicate if this cell has a collision (only with `SpatialHashingCellList`) - cell_collision = check_cell_collision(neighbor_cell_, - cell_list, neighborhood_search) + # cell_collision = check_cell_collision(neighbor_cell_, + # cell_list, neighborhood_search) + + # vectorwidth = 8 + # for block_idx in cld(length(neighbors), vectorwidth) + # block = SVector{vectorwidth}((block_idx - 1) * vectorwidth .+ (1:vectorwidth)) + # neighbors_block = @inbounds neighbors[block] + + # neighbor_coords = @inbounds extract_svector(neighbor_coords, + # Val(ndims(neighborhood_search)), + # neighbors_block) for neighbor_ in eachindex(neighbors) neighbor = @inbounds neighbors[neighbor_] @@ -540,20 +611,22 @@ end point_coords - neighbor_point_coords) distance2 = dot(pos_diff, pos_diff) - (pos_diff, - distance2) = compute_periodic_distance(pos_diff, distance2, - search_radius, periodic_box) + # (pos_diff, + # distance2) = compute_periodic_distance(pos_diff, distance2, + # search_radius, periodic_box) + distance = sqrt(distance2) if distance2 <= search_radius^2 - distance = sqrt(distance2) + # distance2 = ifelse(distance2 <= search_radius^2, distance2, zero(distance2)) + # distance = sqrt(distance2) # If this cell has a collision, check if this point belongs to this cell # (only with `SpatialHashingCellList`). - if cell_collision && - check_collision(neighbor_cell_, neighbor_point_coords, cell_list, - neighborhood_search) - continue - end + # if cell_collision && + # check_collision(neighbor_cell_, neighbor_point_coords, cell_list, + # neighborhood_search) + # continue + # end # Inline to avoid loss of performance # compared to not using `foreach_point_neighbor`. diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index c7ddceca..fa411f80 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -202,6 +202,41 @@ end # Note that calling this function with `@inbounds` is not safe. # See the comments in `foreach_neighbor_unsafe`. +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, + neighborhood_search::PrecomputedNeighborhoodSearch, + point, point_coords, search_radius, data) + (; periodic_box, neighbor_lists) = neighborhood_search + + # Making the following `@inbounds` is not safe because the neighbor list + # might not contain `point` if the NHS was not initialized correctly. + neighbors = neighbor_lists[point] + @fastmath @loopinfo vectorwidth=8 predicate for neighbor_ in eachindex(neighbors) + neighbor = @inbounds neighbors[neighbor_] + + # Making this `@inbounds` is not safe because + # `neighbor` (extracted from the neighbor list) is only guaranteed to be in bounds + # if the neighbor lists were constructed correctly and have not been corrupted. + neighbor_point_coords = extract_svector(neighbor_coords, + Val(ndims(neighborhood_search)), neighbor) + + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) + distance2 = dot(pos_diff, pos_diff) + + (pos_diff, + distance2) = compute_periodic_distance(pos_diff, distance2, search_radius, + periodic_box) + + distance = sqrt(distance2) + + # Inline to avoid loss of performance + # compared to not using `foreach_point_neighbor`. + data = @inline f(point, neighbor, pos_diff, distance, data) + end + + return data +end + @propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, neighborhood_search::PrecomputedNeighborhoodSearch, point, point_coords, search_radius) From f9c4991f279773dccd9c50d8565367e3f202f6cf Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Tue, 5 May 2026 16:14:10 +0200 Subject: [PATCH 13/19] Add missing dependency --- Project.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 32e1fdf5..cc85d8bd 100644 --- a/Project.toml +++ b/Project.toml @@ -1,13 +1,14 @@ name = "PointNeighbors" uuid = "1c4d5385-0a27-49de-8e2c-43b175c8985c" -authors = ["Erik Faulhaber "] version = "0.6.6" +authors = ["Erik Faulhaber "] [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +LLVMLoopInfo = "8b046642-f1f6-4319-8d3c-209ddc03c586" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" @@ -18,6 +19,7 @@ Adapt = "4" Atomix = "1" GPUArraysCore = "0.2" KernelAbstractions = "0.9" +LLVMLoopInfo = "1.0.0" LinearAlgebra = "1" Polyester = "0.7.5" Reexport = "1" From a4abedfc517451b5d630581aadf0767de39ab74a Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 22 May 2026 15:07:07 +0200 Subject: [PATCH 14/19] Fix sorting --- src/nhs_precomputed.jl | 3 ++- src/vector_of_vectors.jl | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index fa411f80..e6fdceeb 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -285,7 +285,8 @@ function copy_neighborhood_search(nhs::PrecomputedNeighborhoodSearch, update_neighborhood_search, backend = typeof(nhs.neighbor_lists), transpose_backend, - max_neighbors = max_neighbors_) + max_neighbors = max_neighbors_, + sort_neighbor_lists = nhs.sort_neighbor_lists) end @inline function freeze_neighborhood_search(search::PrecomputedNeighborhoodSearch) diff --git a/src/vector_of_vectors.jl b/src/vector_of_vectors.jl index c05cddbd..d00f2e9d 100644 --- a/src/vector_of_vectors.jl +++ b/src/vector_of_vectors.jl @@ -73,13 +73,13 @@ end # Outer bounds check @boundscheck checkbounds(vov, i) - lengths[i] += 1 + @inbounds lengths[i] += 1 # Inner bounds check @boundscheck check_list_bounds(vov, i) # Activate new entry in column `i` - backend[lengths[i], i] = value + @inbounds backend[lengths[i], i] = value return vov end @@ -175,6 +175,16 @@ end # Sort each inner vector @inline function sorteach!(vov::DynamicVectorOfVectors) + @threaded default_backend(vov.backend) for i in axes(vov.backend, 2) + # QuickSort is ~1.6x faster here than the default + sort!(view(vov.backend, 1:vov.lengths[i], i), alg=QuickSort) + end + + return vov +end + +# GPU version +@inline function sorteach!(vov::DynamicVectorOfVectors{<:Any, <:Any, <:AbstractGPUArray}) # TODO remove this check when Metal supports sorting if nameof(typeof(default_backend(vov.backend))) == :MetalBackend @warn "sorting neighbor lists is not supported on Metal. Skipping sort." From cb5be0c040f25cf9ac4d8ee6542a4277942ac201 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 22 May 2026 15:07:16 +0200 Subject: [PATCH 15/19] Fix update benchmark on GPUs --- test/point_cloud.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/point_cloud.jl b/test/point_cloud.jl index 64c41bec..001e53b5 100644 --- a/test/point_cloud.jl +++ b/test/point_cloud.jl @@ -50,7 +50,7 @@ function point_cloud(n_points_per_dimension, search_radius; end function perturb!(data, std_deviation) - for i in eachindex(data) + PointNeighbors.@threaded PointNeighbors.default_backend(data) for i in eachindex(data) data[i] += std_deviation * randn() end From 26c6414db02309c1e688cce0839803981168e4d6 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 22 May 2026 15:08:05 +0200 Subject: [PATCH 16/19] Implement different versions of neighbor list assembly --- src/nhs_grid.jl | 2 +- src/nhs_precomputed.jl | 122 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 120 insertions(+), 4 deletions(-) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 94a28373..db8944a9 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -547,8 +547,8 @@ using LLVMLoopInfo # distance2) = compute_periodic_distance(pos_diff, distance2, # search_radius, periodic_box) - distance = sqrt(distance2) if distance2 <= search_radius^2 + distance = sqrt(distance2) # distance2 = ifelse(distance2 <= search_radius^2, distance2, zero(distance2)) # distance = sqrt(distance2) diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index e6fdceeb..c3be8c8b 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -179,6 +179,7 @@ function initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y, end end +using SIMD function initialize_neighbor_lists!(neighbor_lists::DynamicVectorOfVectors, neighborhood_search, x, y, parallelization_backend, sort_neighbor_lists) @@ -190,11 +191,126 @@ function initialize_neighbor_lists!(neighbor_lists::DynamicVectorOfVectors, end # Fill neighbor lists - foreach_point_neighbor(x, y, neighborhood_search; - parallelization_backend) do point, neighbor, _, _ - pushat!(neighbor_lists, point, neighbor) + # foreach_point_neighbor(x, y, neighborhood_search; + # parallelization_backend) do point, neighbor, _, _ + # @inbounds pushat!(neighbor_lists, point, neighbor) + # end + # @threaded parallelization_backend for point in axes(x, 2) + # length = @inbounds neighbor_lists.lengths[point] + # length = foreach_neighbor_unsafe(x, y, neighborhood_search, point, length) do point_, neighbor, _, distance, length_ + # # @inbounds pushat!(neighbor_lists, point_, neighbor) + # if distance < search_radius(neighborhood_search) + # length_ += 1 + # @inbounds neighbor_lists.backend[length_, point] = neighbor + # end + + # return length_ + # end + # @inbounds neighbor_lists.lengths[point] = length + # end + + search_radius2 = search_radius(neighborhood_search)^2 + + # 100x100x100 points on Rucio: 40ms on the CPU, 65ms on the GPU. + @threaded parallelization_backend for point in axes(x, 2) + point_coords = @inbounds extract_svector(x, Val(ndims(neighborhood_search)), point) + cell = cell_coords(point_coords, neighborhood_search) + length = @inbounds neighbor_lists.lengths[point] + + @inbounds @fastmath for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + neighbor_cell = Tuple(neighbor_cell_) + neighbors = points_in_cell(neighbor_cell, neighborhood_search) + + for neighbor_ in eachindex(neighbors) + neighbor = @inbounds neighbors[neighbor_] + + neighbor_point_coords = extract_svector(y, Val(ndims(neighborhood_search)), + neighbor) + + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) + distance2 = dot(pos_diff, pos_diff) + + @inbounds neighbor_lists.backend[length + 1, point] = neighbor + length = length + (distance2 <= search_radius2) + end + end + + @inbounds neighbor_lists.lengths[point] = length end + # 100x100x100 points on Rucio: 83ms on the CPU, 16ms on the GPU (fastest). + # @threaded parallelization_backend for point in axes(x, 2) + # point_coords = @inbounds extract_svector(x, Val(ndims(neighborhood_search)), point) + # cell = cell_coords(point_coords, neighborhood_search) + # length = @inbounds neighbor_lists.lengths[point] + + # @inbounds @fastmath for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + # neighbor_cell = Tuple(neighbor_cell_) + # neighbors = points_in_cell(neighbor_cell, neighborhood_search) + + # for neighbor_ in eachindex(neighbors) + # neighbor = @inbounds neighbors[neighbor_] + + # neighbor_point_coords = extract_svector(y, Val(ndims(neighborhood_search)), + # neighbor) + + # pos_diff = convert.(eltype(neighborhood_search), + # point_coords - neighbor_point_coords) + # distance2 = dot(pos_diff, pos_diff) + + # if distance2 <= search_radius2 + # length = length + 1 + # @inbounds neighbor_lists.backend[length, point] = neighbor + # end + # end + # end + + # @inbounds neighbor_lists.lengths[point] = length + # end + + # 100x100x100 points on Rucio: 36ms on the CPU (fastest), not GPU-compatible. + # @threaded parallelization_backend for point in axes(x, 2) + # point_coords = @inbounds extract_svector(x, Val(ndims(neighborhood_search)), point) + # coords_a1, coords_a2, coords_a3 = point_coords + # cell = cell_coords(point_coords, neighborhood_search) + # length_ = @inbounds neighbor_lists.lengths[point] + + # @inbounds @fastmath for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + # neighbor_cell = Tuple(neighbor_cell_) + # neighbors = points_in_cell(neighbor_cell, neighborhood_search) + + # vectorwidth = 8 + # @fastmath for block_start in 1:vectorwidth:length(neighbors) + # block_start + vectorwidth - 1 > length(neighbors) && break + + # neighbors_block = @inbounds vload(Vec{8, eltype(neighbors)}, neighbors, block_start) + + # # Linear indexing into `coordinates` because Cartesian indexing doesn't work. + # point_start = (neighbors_block - 1) * 3 + 1 + # x_b = @inbounds y[point_start] + # y_b = @inbounds y[point_start + 1] + # z_b = @inbounds y[point_start + 2] + + # pos_diff_x = coords_a1 - x_b + # pos_diff_y = coords_a2 - y_b + # pos_diff_z = coords_a3 - z_b + # distance2 = pos_diff_x * pos_diff_x + pos_diff_y * pos_diff_y + pos_diff_z * pos_diff_z + # mask = distance2 <= search_radius2 + + # sum(mask) == 0 && continue + + # @inbounds for neighbor_ in 1:vectorwidth + # neighbor = neighbors_block[neighbor_] + # neighbor_lists.backend[length_ + 1, point] = neighbor + # length_ = length_ + mask[neighbor_] + # end + # end + # end + + # @inbounds neighbor_lists.lengths[point] = length_ + # end + if sort_neighbor_lists sorteach!(neighbor_lists) end From 6b5c6d88d9f0f338ec5c9817a55c55d48404de7a Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 22 May 2026 16:10:48 +0200 Subject: [PATCH 17/19] Add specialized GPU kernel for neighbor lists initialization --- src/PointNeighbors.jl | 2 +- src/nhs_precomputed.jl | 43 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl index 58b38687..722c4e8f 100644 --- a/src/PointNeighbors.jl +++ b/src/PointNeighbors.jl @@ -6,7 +6,7 @@ using Adapt: Adapt using Atomix: Atomix using Base: @propagate_inbounds using GPUArraysCore: AbstractGPUArray -using KernelAbstractions: KernelAbstractions, @kernel, @index +using KernelAbstractions: KernelAbstractions, @kernel, @index, @localmem, @groupsize, @uniform, @synchronize using LinearAlgebra: dot using Polyester: Polyester @reexport using StaticArrays: SVector diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index c3be8c8b..1909c19d 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -239,7 +239,7 @@ function initialize_neighbor_lists!(neighbor_lists::DynamicVectorOfVectors, @inbounds neighbor_lists.lengths[point] = length end - # 100x100x100 points on Rucio: 83ms on the CPU, 16ms on the GPU (fastest). + # 100x100x100 points on Rucio: 83ms on the CPU, 16ms on the GPU. # @threaded parallelization_backend for point in axes(x, 2) # point_coords = @inbounds extract_svector(x, Val(ndims(neighborhood_search)), point) # cell = cell_coords(point_coords, neighborhood_search) @@ -269,6 +269,11 @@ function initialize_neighbor_lists!(neighbor_lists::DynamicVectorOfVectors, # @inbounds neighbor_lists.lengths[point] = length # end + # 100x100x100 points on Rucio: 11ms on the GPU (fastest), not CPU-compatible. + # ndrange = size(x, 2) + # mykernel(parallelization_backend, 64)(x, y, neighborhood_search, neighbor_lists, search_radius2, ndrange = ndrange) + # KernelAbstractions.synchronize(parallelization_backend) + # 100x100x100 points on Rucio: 36ms on the CPU (fastest), not GPU-compatible. # @threaded parallelization_backend for point in axes(x, 2) # point_coords = @inbounds extract_svector(x, Val(ndims(neighborhood_search)), point) @@ -316,6 +321,42 @@ function initialize_neighbor_lists!(neighbor_lists::DynamicVectorOfVectors, end end +@kernel cpu=false function mykernel(x, y, neighborhood_search, neighbor_lists, search_radius2) + point = @index(Global) + threadidx = @index(Local) + + point_coords = @inbounds extract_svector(x, Val(ndims(neighborhood_search)), point) + cell = cell_coords(point_coords, neighborhood_search) + + length_ = @inbounds neighbor_lists.lengths[point] + local_neighbors = @localmem Int32 (64, 128) # (groupsize, max_neighbors) + + for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + neighbor_cell = Tuple(neighbor_cell_) + neighbors = points_in_cell(neighbor_cell, neighborhood_search) + + for neighbor_ in eachindex(neighbors) + neighbor = @inbounds neighbors[neighbor_] + + neighbor_point_coords = @inbounds extract_svector(y, Val(ndims(neighborhood_search)), + neighbor) + + pos_diff = point_coords - neighbor_point_coords + distance2 = dot(pos_diff, pos_diff) + + if distance2 <= search_radius2 + length_ = length_ + 1 + @inbounds local_neighbors[threadidx, length_] = neighbor + end + end + end + + @inbounds neighbor_lists.lengths[point] = length_ + for i in axes(local_neighbors, 2) + @inbounds neighbor_lists.backend[i, point] = local_neighbors[threadidx, i] + end +end + # Note that calling this function with `@inbounds` is not safe. # See the comments in `foreach_neighbor_unsafe`. @propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, From 885f08473820d9e9d0f5a3b7ad115d52933a2447 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 22 May 2026 17:34:23 +0200 Subject: [PATCH 18/19] Add runtimes for RAMSES --- src/nhs_precomputed.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index 1909c19d..b00ea19b 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -212,6 +212,7 @@ function initialize_neighbor_lists!(neighbor_lists::DynamicVectorOfVectors, search_radius2 = search_radius(neighborhood_search)^2 # 100x100x100 points on Rucio: 40ms on the CPU, 65ms on the GPU. + # 100x100x100 points on RAMSES: 36ms on the CPU @threaded parallelization_backend for point in axes(x, 2) point_coords = @inbounds extract_svector(x, Val(ndims(neighborhood_search)), point) cell = cell_coords(point_coords, neighborhood_search) @@ -240,6 +241,7 @@ function initialize_neighbor_lists!(neighbor_lists::DynamicVectorOfVectors, end # 100x100x100 points on Rucio: 83ms on the CPU, 16ms on the GPU. + # 100x100x100 points on RAMSES: 52ms on the CPU # @threaded parallelization_backend for point in axes(x, 2) # point_coords = @inbounds extract_svector(x, Val(ndims(neighborhood_search)), point) # cell = cell_coords(point_coords, neighborhood_search) @@ -275,6 +277,7 @@ function initialize_neighbor_lists!(neighbor_lists::DynamicVectorOfVectors, # KernelAbstractions.synchronize(parallelization_backend) # 100x100x100 points on Rucio: 36ms on the CPU (fastest), not GPU-compatible. + # 100x100x100 points on RAMSES: 38ms on the CPU # @threaded parallelization_backend for point in axes(x, 2) # point_coords = @inbounds extract_svector(x, Val(ndims(neighborhood_search)), point) # coords_a1, coords_a2, coords_a3 = point_coords From d9b9530cc0d0073e0c2af1e3ba4ef2c111de0996 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Fri, 22 May 2026 17:34:37 +0200 Subject: [PATCH 19/19] Add SIMD dependency --- Project.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Project.toml b/Project.toml index cc85d8bd..be97eba0 100644 --- a/Project.toml +++ b/Project.toml @@ -12,6 +12,7 @@ LLVMLoopInfo = "8b046642-f1f6-4319-8d3c-209ddc03c586" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +SIMD = "fdea26ae-647d-5447-a871-4b548cad5224" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" [compat] @@ -23,5 +24,6 @@ LLVMLoopInfo = "1.0.0" LinearAlgebra = "1" Polyester = "0.7.5" Reexport = "1" +SIMD = "3.7.2" StaticArrays = "1" julia = "1.10"