diff --git a/Project.toml b/Project.toml index d4654124..be97eba0 100644 --- a/Project.toml +++ b/Project.toml @@ -1,16 +1,18 @@ name = "PointNeighbors" uuid = "1c4d5385-0a27-49de-8e2c-43b175c8985c" +version = "0.6.6" authors = ["Erik Faulhaber "] -version = "0.6.6-dev" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +LLVMLoopInfo = "8b046642-f1f6-4319-8d3c-209ddc03c586" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" +SIMD = "fdea26ae-647d-5447-a871-4b548cad5224" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" [compat] @@ -18,8 +20,10 @@ Adapt = "4" Atomix = "1" GPUArraysCore = "0.2" KernelAbstractions = "0.9" +LLVMLoopInfo = "1.0.0" LinearAlgebra = "1" Polyester = "0.7.5" Reexport = "1" +SIMD = "3.7.2" StaticArrays = "1" julia = "1.10" diff --git a/benchmarks/run_benchmarks.jl b/benchmarks/run_benchmarks.jl index ffabc805..ff08ebe3 100644 --- a/benchmarks/run_benchmarks.jl +++ b/benchmarks/run_benchmarks.jl @@ -50,10 +50,11 @@ run_benchmark(benchmark_count_neighbors, (10, 10), 3, ``` """ function run_benchmark(benchmark, n_points_per_dimension, iterations, neighborhood_searches; + search_radius_factor = 3.0, parallelization_backend = PolyesterBackend(), names = ["Neighborhood search $i" for i in 1:length(neighborhood_searches)]', - seed = 1, perturbation_factor_position = 1.0) + seed = 1, perturbation_factor_position = 1.0, shuffle = false) # Multiply number of points in each iteration (roughly) by this factor scaling_factor = 4 per_dimension_factor = scaling_factor^(1 / length(n_points_per_dimension)) @@ -64,24 +65,29 @@ function run_benchmark(benchmark, n_points_per_dimension, iterations, neighborho times = zeros(iterations, length(neighborhood_searches)) for iter in 1:iterations - coordinates = point_cloud(sizes[iter]; seed, perturbation_factor_position) + coordinates_ = point_cloud(sizes[iter], search_radius_factor; + seed, perturbation_factor_position, shuffle) + coordinates = convert.(typeof(search_radius_factor), coordinates_) domain_size = maximum(sizes[iter]) + 1 # Normalize domain size to 1 coordinates ./= domain_size # Make this Float32 to make sure that Float32 benchmarks use Float32 exclusively - search_radius = 4.0f0 / domain_size + search_radius = search_radius_factor / domain_size n_particles = size(coordinates, 2) neighborhood_searches_copy = copy_neighborhood_search.(neighborhood_searches, search_radius, n_particles) for i in eachindex(neighborhood_searches_copy) - neighborhood_search = neighborhood_searches_copy[i] - PointNeighbors.initialize!(neighborhood_search, coordinates, coordinates) + neighborhood_search_ = neighborhood_searches_copy[i] + neighborhood_search = PointNeighbors.Adapt.adapt(parallelization_backend, + neighborhood_search_) + coords = PointNeighbors.Adapt.adapt(parallelization_backend, coordinates) + PointNeighbors.initialize!(neighborhood_search, coords, coords) - time = benchmark(neighborhood_search, coordinates; parallelization_backend) + time = benchmark(neighborhood_search, coords; parallelization_backend) times[iter, i] = time time_string = BenchmarkTools.prettytime(time * 1e9) time_string_per_particle = BenchmarkTools.prettytime(time * 1e9 / n_particles) @@ -170,23 +176,74 @@ include("benchmarks/benchmarks.jl") run_benchmark_gpu(benchmark_n_body, (10, 10), 3) ``` """ -function run_benchmark_gpu(benchmark, n_points_per_dimension, iterations; kwargs...) +function run_benchmark_gpu(benchmark, n_points_per_dimension, iterations; + parallelization_backend=PolyesterBackend(), kwargs...) NDIMS = length(n_points_per_dimension) min_corner = 0.0f0 .* n_points_per_dimension max_corner = Float32.(n_points_per_dimension ./ maximum(n_points_per_dimension)) - neighborhood_searches = [GridNeighborhoodSearch{NDIMS}(search_radius = 0.0f0, - cell_list = FullGridCellList(; - search_radius = 0.0f0, - min_corner, - max_corner)) - PrecomputedNeighborhoodSearch{NDIMS}(search_radius = 0.0f0)] - - names = ["GridNeighborhoodSearch with FullGridCellList";; - "PrecomputedNeighborhoodSearch"] + cell_list = FullGridCellList(; search_radius = 0.0f0, min_corner, max_corner) + grid_nhs = GridNeighborhoodSearch{NDIMS}(; search_radius = 0.0f0, cell_list, + update_strategy = ParallelUpdate()) + transpose_backend = parallelization_backend isa PointNeighbors.KernelAbstractions.GPU + neighborhood_searches = [ + grid_nhs + PrecomputedNeighborhoodSearch{NDIMS}(; search_radius = 0.0f0, + update_neighborhood_search = grid_nhs, + transpose_backend)#, max_neighbors=128) + ] + + names = [ + "GridNeighborhoodSearch with FullGridCellList";; + "PrecomputedNeighborhoodSearch" + ] run_benchmark(benchmark, n_points_per_dimension, iterations, - neighborhood_searches; names, kwargs...) + neighborhood_searches; names, parallelization_backend, kwargs...) +end + +""" + run_benchmark_full_grid(benchmark, n_points_per_dimension, iterations; kwargs...) + +Shortcut to call [`run_benchmark`](@ref) with a `GridNeighborhoodSearch` with a +`FullGridCellList`. This is the neighborhood search implementation that is used +in TrixiParticles.jl when performance is important. +Use this function to benchmark and profile TrixiParticles.jl kernels. + +# Arguments +- `benchmark`: The benchmark function. See [`benchmark_count_neighbors`](@ref), + [`benchmark_n_body`](@ref), [`benchmark_wcsph`](@ref), + [`benchmark_wcsph_fp32`](@ref) and [`benchmark_tlsph`](@ref). +- `n_points_per_dimension`: Initial resolution as tuple. The product is the initial number + of points. For example, use `(100, 100)` for a 2D benchmark or + `(10, 10, 10)` for a 3D benchmark. +- `iterations`: Number of refinement iterations + +# Keywords +See [`run_benchmark`](@ref) for a list of available keywords. + +# Examples +```julia +include("benchmarks/benchmarks.jl") + +run_benchmark_full_grid(benchmark_n_body, (10, 10), 3) +``` +""" +function run_benchmark_full_grid(benchmark, n_points_per_dimension, iterations; + parallelization_backend=PolyesterBackend(), kwargs...) + NDIMS = length(n_points_per_dimension) + + min_corner = 0.0f0 .* n_points_per_dimension + max_corner = Float32.(n_points_per_dimension ./ maximum(n_points_per_dimension)) + cell_list = FullGridCellList(; search_radius = 0.0f0, min_corner, max_corner) + grid_nhs = GridNeighborhoodSearch{NDIMS}(; search_radius = 0.0f0, cell_list, + update_strategy = ParallelUpdate()) + neighborhood_searches = [grid_nhs] + + names = ["GridNeighborhoodSearch with FullGridCellList";;] + + run_benchmark(benchmark, n_points_per_dimension, iterations, + neighborhood_searches; names, parallelization_backend, kwargs...) end """ diff --git a/benchmarks/smoothed_particle_hydrodynamics.jl b/benchmarks/smoothed_particle_hydrodynamics.jl index 7286baca..3d35f47d 100644 --- a/benchmarks/smoothed_particle_hydrodynamics.jl +++ b/benchmarks/smoothed_particle_hydrodynamics.jl @@ -1,4 +1,5 @@ using PointNeighbors +using PointNeighbors.Adapt using TrixiParticles using BenchmarkTools @@ -43,47 +44,30 @@ This method is used to simulate an incompressible fluid. """ function benchmark_wcsph(neighborhood_search, coordinates; parallelization_backend = default_backend(coordinates)) - density = 1000.0 - particle_spacing = PointNeighbors.search_radius(neighborhood_search) / 3 - fluid = InitialCondition(; coordinates, density, mass = 0.1, particle_spacing) - - sound_speed = 10.0 - state_equation = StateEquationCole(; sound_speed, reference_density = density, - exponent = 1) - - viscosity = ArtificialViscosityMonaghan(alpha = 0.02, beta = 0.0) - density_diffusion = DensityDiffusionMolteniColagrossi(delta = 0.1) + # System initialization has to happen on the CPU + coordinates_cpu = PointNeighbors.Adapt.adapt(Array, coordinates) - __benchmark_wcsph_inner(neighborhood_search, fluid, state_equation, - viscosity, density_diffusion, parallelization_backend) -end - -""" - benchmark_wcsph_fp32(neighborhood_search, coordinates; - parallelization_backend = default_backend(coordinates)) - -Like [`benchmark_wcsph`](@ref), but using single precision floating point numbers. -""" -function benchmark_wcsph_fp32(neighborhood_search, coordinates_; - parallelization_backend = default_backend(coordinates_)) - coordinates = convert(Matrix{Float32}, coordinates_) - density = 1000.0f0 + search_radius = PointNeighbors.search_radius(neighborhood_search) + ELTYPE = typeof(search_radius) + density = convert(ELTYPE, 1000.0) particle_spacing = PointNeighbors.search_radius(neighborhood_search) / 3 - fluid = InitialCondition(; coordinates, density, mass = 0.1f0, particle_spacing) + fluid = InitialCondition(; coordinates = coordinates_cpu, density, + mass = convert(ELTYPE, 0.1) * particle_spacing, + particle_spacing) - sound_speed = 10.0f0 + # Make sure that the computed forces are not all zero + for i in eachindex(fluid.density) + fluid.density[i] += rand(eltype(fluid.density)) + end + + sound_speed = convert(ELTYPE, 10.0) state_equation = StateEquationCole(; sound_speed, reference_density = density, exponent = 1) - viscosity = ArtificialViscosityMonaghan(alpha = 0.02f0, beta = 0.0f0) - density_diffusion = DensityDiffusionMolteniColagrossi(delta = 0.1f0) - - __benchmark_wcsph_inner(neighborhood_search, fluid, state_equation, - viscosity, density_diffusion, parallelization_backend) -end + viscosity = ArtificialViscosityMonaghan(alpha = convert(ELTYPE, 0.02), + beta = convert(ELTYPE, 0.0)) + density_diffusion = DensityDiffusionMolteniColagrossi(delta = convert(ELTYPE, 0.1)) -function __benchmark_wcsph_inner(neighborhood_search, initial_condition, state_equation, - viscosity, density_diffusion, parallelization_backend) # Compact support == 2 * smoothing length for these kernels smoothing_length = PointNeighbors.search_radius(neighborhood_search) / 2 if ndims(neighborhood_search) == 1 @@ -92,23 +76,20 @@ function __benchmark_wcsph_inner(neighborhood_search, initial_condition, state_e smoothing_kernel = WendlandC2Kernel{ndims(neighborhood_search)}() end - fluid_system = WeaklyCompressibleSPHSystem(initial_condition, ContinuityDensity(), + fluid_system = WeaklyCompressibleSPHSystem(fluid; density_calculator = ContinuityDensity(), state_equation, smoothing_kernel, - smoothing_length, viscosity = viscosity, - density_diffusion = density_diffusion) + smoothing_length, viscosity, density_diffusion) - system = PointNeighbors.Adapt.adapt(parallelization_backend, fluid_system) + system = Adapt.adapt(parallelization_backend, fluid_system) # Remove unnecessary data structures that are only used for initialization - neighborhood_search_ = PointNeighbors.freeze_neighborhood_search(neighborhood_search) + nhs = PointNeighbors.freeze_neighborhood_search(neighborhood_search) - nhs = PointNeighbors.Adapt.adapt(parallelization_backend, neighborhood_search_) semi = DummySemidiscretization(nhs, parallelization_backend, true) - v = PointNeighbors.Adapt.adapt(parallelization_backend, - vcat(initial_condition.velocity, - initial_condition.density')) - u = PointNeighbors.Adapt.adapt(parallelization_backend, initial_condition.coordinates) + v = Adapt.adapt(parallelization_backend, + vcat(fluid.velocity, fluid.density')) + u = Adapt.adapt(parallelization_backend, fluid.coordinates) dv = zero(v) # Initialize the system @@ -128,8 +109,18 @@ This method is used to simulate an elastic structure. """ function benchmark_tlsph(neighborhood_search, coordinates; parallelization_backend = default_backend(coordinates)) - material = (density = 1000.0, E = 1.4e6, nu = 0.4) - solid = InitialCondition(; coordinates, density = material.density, mass = 0.1) + # System initialization has to happen on the CPU + coordinates_cpu = PointNeighbors.Adapt.adapt(Array, coordinates) + + search_radius = PointNeighbors.search_radius(neighborhood_search) + ELTYPE = typeof(search_radius) + material = (density = convert(ELTYPE, 1000.0), E = convert(ELTYPE, 1.4e6), + nu = convert(ELTYPE, 0.4)) + + # The `particle_spacing` is only required for setting the type of the initial condition + solid = InitialCondition(; coordinates = coordinates_cpu, + density = material.density, mass = convert(ELTYPE, 0.1), + particle_spacing = search_radius) # Compact support == 2 * smoothing length for these kernels smoothing_length_ = PointNeighbors.search_radius(neighborhood_search) / 2 @@ -140,17 +131,23 @@ function benchmark_tlsph(neighborhood_search, coordinates; smoothing_kernel = WendlandC2Kernel{ndims(neighborhood_search)}() end + penalty_force = PenaltyForceGanzenmueller(alpha = convert(ELTYPE, 0.1)) solid_system = TotalLagrangianSPHSystem(solid, smoothing_kernel, smoothing_length, - material.E, material.nu) - semi = DummySemidiscretization(neighborhood_search, parallelization_backend, true) + material.E, material.nu; penalty_force) + system_ = Adapt.adapt(parallelization_backend, solid_system) - v = copy(solid.velocity) - u = copy(solid.coordinates) + # Remove unnecessary data structures that are only used for initialization + nhs = PointNeighbors.freeze_neighborhood_search(neighborhood_search) + system = TrixiParticles.@set system_.self_interaction_nhs = nhs + + semi = DummySemidiscretization(nhs, parallelization_backend, true) + + v = Adapt.adapt(parallelization_backend, copy(solid.velocity)) + u = Adapt.adapt(parallelization_backend, copy(solid.coordinates)) dv = zero(v) # Initialize the system - TrixiParticles.initialize!(solid_system, semi) + TrixiParticles.initialize!(system, semi) - return @belapsed TrixiParticles.interact!($dv, $v, $u, $v, $u, - $solid_system, $solid_system, $semi) + return @belapsed TrixiParticles.interact_structure_structure!($dv, $v, $system, $semi) end diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl index 702c19e4..722c4e8f 100644 --- a/src/PointNeighbors.jl +++ b/src/PointNeighbors.jl @@ -6,7 +6,7 @@ using Adapt: Adapt using Atomix: Atomix using Base: @propagate_inbounds using GPUArraysCore: AbstractGPUArray -using KernelAbstractions: KernelAbstractions, @kernel, @index +using KernelAbstractions: KernelAbstractions, @kernel, @index, @localmem, @groupsize, @uniform, @synchronize using LinearAlgebra: dot using Polyester: Polyester @reexport using StaticArrays: SVector @@ -20,7 +20,8 @@ include("nhs_grid.jl") include("nhs_precomputed.jl") include("gpu.jl") -export foreach_point_neighbor, foreach_neighbor +export foreach_point_neighbor, foreach_point_neighbor_unsafe, + foreach_neighbor, foreach_neighbor_unsafe export TrivialNeighborhoodSearch, GridNeighborhoodSearch, PrecomputedNeighborhoodSearch export DictionaryCellList, FullGridCellList, SpatialHashingCellList export DynamicVectorOfVectors diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl index 5e99b9cd..7701713b 100644 --- a/src/neighborhood_search.jl +++ b/src/neighborhood_search.jl @@ -178,7 +178,7 @@ Note that `system_coords` and `neighbor_coords` can be identical. automatically based on the type of `system_coords`. See [`@threaded`](@ref) for a list of available backends. -See also [`initialize!`](@ref), [`update!`](@ref). +See also [`foreach_point_neighbor_unsafe`](@ref), [`initialize!`](@ref), [`update!`](@ref). """ function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborhood_search; parallelization_backend::ParallelizationBackend = default_backend(system_coords), @@ -200,41 +200,143 @@ function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborho return nothing end -@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_system_coords, +""" + foreach_point_neighbor_unsafe(f, system_coords, neighbor_coords, neighborhood_search; + parallelization_backend = default_backend(system_coords), + points = axes(system_coords, 2)) + +Like [`foreach_point_neighbor`](@ref), but skips **all** bounds checks inside the +threaded loop or GPU kernel. +See [`foreach_neighbor_unsafe`](@ref) for more details on which bounds checks are skipped. + +!!! warning + The `neighborhood_search` must have been initialized or updated with `system_coords` + as first coordinate array and `neighbor_coords` as second coordinate array. + This can be skipped for certain implementations. See [`requires_update`](@ref). + +!!! warning + Use this only when `neighborhood_search` is known to be initialized correctly for + `system_coords` and `neighbor_coords`. +""" +function foreach_point_neighbor_unsafe(f::T, system_coords, neighbor_coords, + neighborhood_search; + parallelization_backend::ParallelizationBackend = default_backend(system_coords), + points = axes(system_coords, 2)) where {T} + # Explicit bounds check before the hot loop (or GPU kernel) + @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points) + + @threaded parallelization_backend for point in points + foreach_neighbor_unsafe(f, system_coords, neighbor_coords, + neighborhood_search, point) + end + + return nothing +end + +""" + foreach_neighbor(f, system_coords, neighbor_coords, + neighborhood_search::AbstractNeighborhoodSearch, point; + search_radius = search_radius(neighborhood_search)) + +Loop over all neighbors of `point` and execute `f(i, j, pos_diff, d)` for every +neighbor within `search_radius`, where `i` is `point`, `j` is the neighbor index, +`pos_diff` is the vector from neighbor to point, and `d` is the distance. + +This method performs bounds checks, even when called with `@inbounds`. +`@inbounds` only skips the bounds check for loading the coordinates of `point` +from `system_coords`. +See [`foreach_neighbor_unsafe`](@ref) for a version that skips all bounds checks. +""" +@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search::AbstractNeighborhoodSearch, - point; + point, args...; search_radius = search_radius(neighborhood_search)) # Due to https://github.com/JuliaLang/julia/issues/30411, we cannot just remove # a `@boundscheck` by calling this function with `@inbounds` because it has a kwarg. # We have to use `@propagate_inbounds`, which will also remove boundschecks - # in the neighbor loop, which is not safe (see comment below). + # in the neighbor loop, which is not safe (that's what `foreach_neighbor_unsafe` is for). # To avoid this, we have to use a function barrier to disable the `@inbounds` again. point_coords = extract_svector(system_coords, Val(ndims(neighborhood_search)), point) - foreach_neighbor(f, neighbor_system_coords, neighborhood_search, - point, point_coords, search_radius) + foreach_neighbor(f, neighbor_coords, neighborhood_search, + point, point_coords, search_radius, args...) +end + +# This is a function barrier to prevent the `@inbounds` in `foreach_neighbor` +# from propagating into the neighbor loop, which is not safe. +@inline function foreach_neighbor(f, neighbor_coords, + neighborhood_search::AbstractNeighborhoodSearch, + point, point_coords, search_radius, args...) + foreach_neighbor_inner(f, neighbor_coords, neighborhood_search, + point, point_coords, search_radius, args...) +end + +""" + foreach_neighbor_unsafe(f, system_coords, neighbor_coords, + neighborhood_search::AbstractNeighborhoodSearch, point; + search_radius = search_radius(neighborhood_search)) + +Like [`foreach_neighbor`](@ref), but skips **all** bounds checks. + +`foreach_neighbor` performs the following bounds checks that are skipped here: +- Check if `point` is in bounds of `system_coords`. This is the only bounds check + that is skipped when calling `foreach_neighbor` with `@inbounds`, and the only one that + is safe to skip when `point` is guaranteed to be in bounds of `system_coords`. +- Check if the neighbors of `point` are in bounds of `neighbor_coords`. + This is not safe to skip when the neighborhood search was not initialized correctly; + for example when initialized with coordinate arrays of different sizes than the ones + passed to this function or when the internal data structures have been tampered with. + In this case, the cell lists (for [`GridNeighborhoodSearch`](@ref)) or neighbor lists + (for [`PrecomputedNeighborhoodSearch`](@ref)) might contain indices that are out of + bounds for `neighbor_coords`. +- With `GridNeighborhoodSearch` and [`FullGridCellList`](@ref), check if the neighboring + cells are in bounds of the grid. Again, this is not safe to skip when the neighborhood + search might not have been initialized correctly. +- With `PrecomputedNeighborhoodSearch`, check if `point` is in bounds of the neighbor lists. + Again, this is not safe to skip when the neighborhood search was not initialized correctly. + +Note that all these bounds checks are safe to skip if +- `point` is guaranteed to be in bounds of `system_coords`, +- the neighborhood search was initialized correctly with `system_coords` + and `neighbor_coords` and has not been tampered with since then. + +!!! warning + Use this only when `point` is known to be in bounds of `system_coords` + and when `neighborhood_search` is known to be initialized correctly for + `system_coords` and `neighbor_coords`. +""" +@inline function foreach_neighbor_unsafe(f, system_coords, neighbor_coords, + neighborhood_search::AbstractNeighborhoodSearch, + point, args...; + search_radius = search_radius(neighborhood_search)) + point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)), + point) + + @inbounds foreach_neighbor_inner(f, neighbor_coords, neighborhood_search, + point, point_coords, search_radius, args...) end # This is the generic function that is called for `TrivialNeighborhoodSearch`. # For `GridNeighborhoodSearch`, a specialized function is used for slightly better # performance. `PrecomputedNeighborhoodSearch` can skip the distance check altogether. -@inline function foreach_neighbor(f, neighbor_system_coords, - neighborhood_search::AbstractNeighborhoodSearch, - point, point_coords, search_radius) +# Note that calling this function with `@inbounds` is not safe. +# See the comments in `foreach_neighbor_unsafe`. +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, + neighborhood_search::AbstractNeighborhoodSearch, + point, point_coords, search_radius) (; periodic_box) = neighborhood_search for neighbor in eachneighbor(point_coords, neighborhood_search) - # Making the following `@inbounds` yields a ~2% speedup on an NVIDIA H100. - # But we don't know if `neighbor` (extracted from the cell list) is in bounds. - neighbor_coords = extract_svector(neighbor_system_coords, - Val(ndims(neighborhood_search)), neighbor) + neighbor_point_coords = extract_svector(neighbor_coords, + Val(ndims(neighborhood_search)), neighbor) - pos_diff = convert.(eltype(neighborhood_search), point_coords - neighbor_coords) + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) distance2 = dot(pos_diff, pos_diff) - pos_diff, - distance2 = compute_periodic_distance(pos_diff, distance2, search_radius, - periodic_box) + (pos_diff, + distance2) = compute_periodic_distance(pos_diff, distance2, search_radius, + periodic_box) if distance2 <= search_radius^2 distance = sqrt(distance2) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 21759ab2..db8944a9 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -506,47 +506,127 @@ function check_cell_collision(neighbor_cell_::CartesianIndex, coords[hash] != PointNeighbors.coordinates_flattened(neighbor_cell) end +using LLVMLoopInfo # Specialized version of the function in `neighborhood_search.jl`, which is faster # than looping over `eachneighbor`. -@inline function foreach_neighbor(f, neighbor_system_coords, - neighborhood_search::GridNeighborhoodSearch, - point, point_coords, search_radius) +# Note that calling this function with `@inbounds` is not safe. +# See the comments in `foreach_neighbor_unsafe`. +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, + neighborhood_search::GridNeighborhoodSearch, + point, point_coords, search_radius, data) (; cell_list, periodic_box) = neighborhood_search cell = cell_coords(point_coords, neighborhood_search) - for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) - neighbor_cell = Tuple(neighbor_cell_) + @inbounds @fastmath for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + neighbor_cell = Tuple(neighbor_cell_) + # @inbounds @fastmath for cellx in (cell[1] - 1):(cell[1] + 1), celly in (cell[2] - 1):(cell[2] + 1), cellz in (cell[3] - 1):(cell[3] + 1) + # neighbor_cell = (cellx, celly, cellz) + + # Making the following `@inbounds` is not safe because we don't know if + # `neighbor_cell` is in bounds of the grid. neighbors = points_in_cell(neighbor_cell, neighborhood_search) # Boolean to indicate if this cell has a collision (only with `SpatialHashingCellList`) - cell_collision = check_cell_collision(neighbor_cell_, - cell_list, neighborhood_search) + # cell_collision = check_cell_collision(neighbor_cell_, + # cell_list, neighborhood_search) - for neighbor_ in eachindex(neighbors) + @loopinfo vectorwidth=8 predicate for neighbor_ in eachindex(neighbors) neighbor = @inbounds neighbors[neighbor_] - # Making the following `@inbounds` yields a ~2% speedup on an NVIDIA H100. - # But we don't know if `neighbor` (extracted from the cell list) is in bounds. - neighbor_coords = extract_svector(neighbor_system_coords, - Val(ndims(neighborhood_search)), neighbor) + # Making the following `@inbounds` is not safe because we don't know + # if `neighbor` (extracted from the cell list) is in bounds. + neighbor_point_coords = extract_svector(neighbor_coords, + Val(ndims(neighborhood_search)), + neighbor) - pos_diff = convert.(eltype(neighborhood_search), point_coords - neighbor_coords) + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) distance2 = dot(pos_diff, pos_diff) - pos_diff, - distance2 = compute_periodic_distance(pos_diff, distance2, - search_radius, periodic_box) + # (pos_diff, + # distance2) = compute_periodic_distance(pos_diff, distance2, + # search_radius, periodic_box) if distance2 <= search_radius^2 distance = sqrt(distance2) + # distance2 = ifelse(distance2 <= search_radius^2, distance2, zero(distance2)) + # distance = sqrt(distance2) # If this cell has a collision, check if this point belongs to this cell # (only with `SpatialHashingCellList`). - if cell_collision && - check_collision(neighbor_cell_, neighbor_coords, cell_list, - neighborhood_search) - continue - end + # if cell_collision && + # check_collision(neighbor_cell_, neighbor_point_coords, cell_list, + # neighborhood_search) + # continue + # end + + # Inline to avoid loss of performance + # compared to not using `foreach_point_neighbor`. + data = @inline f(point, neighbor, pos_diff, distance, data) + end + end + end + + return data +end + +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, + neighborhood_search::GridNeighborhoodSearch, + point, point_coords, search_radius) + (; cell_list, periodic_box) = neighborhood_search + cell = cell_coords(point_coords, neighborhood_search) + + @inbounds @fastmath for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + neighbor_cell = Tuple(neighbor_cell_) + # @inbounds @fastmath for cellx in (cell[1] - 1):(cell[1] + 1), celly in (cell[2] - 1):(cell[2] + 1), cellz in (cell[3] - 1):(cell[3] + 1) + # neighbor_cell = (cellx, celly, cellz) + + # Making the following `@inbounds` is not safe because we don't know if + # `neighbor_cell` is in bounds of the grid. + neighbors = points_in_cell(neighbor_cell, neighborhood_search) + + # Boolean to indicate if this cell has a collision (only with `SpatialHashingCellList`) + # cell_collision = check_cell_collision(neighbor_cell_, + # cell_list, neighborhood_search) + + # vectorwidth = 8 + # for block_idx in cld(length(neighbors), vectorwidth) + # block = SVector{vectorwidth}((block_idx - 1) * vectorwidth .+ (1:vectorwidth)) + # neighbors_block = @inbounds neighbors[block] + + # neighbor_coords = @inbounds extract_svector(neighbor_coords, + # Val(ndims(neighborhood_search)), + # neighbors_block) + + for neighbor_ in eachindex(neighbors) + neighbor = @inbounds neighbors[neighbor_] + + # Making the following `@inbounds` is not safe because we don't know + # if `neighbor` (extracted from the cell list) is in bounds. + neighbor_point_coords = extract_svector(neighbor_coords, + Val(ndims(neighborhood_search)), + neighbor) + + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) + distance2 = dot(pos_diff, pos_diff) + + # (pos_diff, + # distance2) = compute_periodic_distance(pos_diff, distance2, + # search_radius, periodic_box) + + distance = sqrt(distance2) + if distance2 <= search_radius^2 + # distance2 = ifelse(distance2 <= search_radius^2, distance2, zero(distance2)) + # distance = sqrt(distance2) + + # If this cell has a collision, check if this point belongs to this cell + # (only with `SpatialHashingCellList`). + # if cell_collision && + # check_collision(neighbor_cell_, neighbor_point_coords, cell_list, + # neighborhood_search) + # continue + # end # Inline to avoid loss of performance # compared to not using `foreach_point_neighbor`. diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index 187ff0c5..b00ea19b 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -179,6 +179,7 @@ function initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y, end end +using SIMD function initialize_neighbor_lists!(neighbor_lists::DynamicVectorOfVectors, neighborhood_search, x, y, parallelization_backend, sort_neighbor_lists) @@ -190,39 +191,236 @@ function initialize_neighbor_lists!(neighbor_lists::DynamicVectorOfVectors, end # Fill neighbor lists - foreach_point_neighbor(x, y, neighborhood_search; - parallelization_backend) do point, neighbor, _, _ - pushat!(neighbor_lists, point, neighbor) + # foreach_point_neighbor(x, y, neighborhood_search; + # parallelization_backend) do point, neighbor, _, _ + # @inbounds pushat!(neighbor_lists, point, neighbor) + # end + # @threaded parallelization_backend for point in axes(x, 2) + # length = @inbounds neighbor_lists.lengths[point] + # length = foreach_neighbor_unsafe(x, y, neighborhood_search, point, length) do point_, neighbor, _, distance, length_ + # # @inbounds pushat!(neighbor_lists, point_, neighbor) + # if distance < search_radius(neighborhood_search) + # length_ += 1 + # @inbounds neighbor_lists.backend[length_, point] = neighbor + # end + + # return length_ + # end + # @inbounds neighbor_lists.lengths[point] = length + # end + + search_radius2 = search_radius(neighborhood_search)^2 + + # 100x100x100 points on Rucio: 40ms on the CPU, 65ms on the GPU. + # 100x100x100 points on RAMSES: 36ms on the CPU + @threaded parallelization_backend for point in axes(x, 2) + point_coords = @inbounds extract_svector(x, Val(ndims(neighborhood_search)), point) + cell = cell_coords(point_coords, neighborhood_search) + length = @inbounds neighbor_lists.lengths[point] + + @inbounds @fastmath for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + neighbor_cell = Tuple(neighbor_cell_) + neighbors = points_in_cell(neighbor_cell, neighborhood_search) + + for neighbor_ in eachindex(neighbors) + neighbor = @inbounds neighbors[neighbor_] + + neighbor_point_coords = extract_svector(y, Val(ndims(neighborhood_search)), + neighbor) + + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) + distance2 = dot(pos_diff, pos_diff) + + @inbounds neighbor_lists.backend[length + 1, point] = neighbor + length = length + (distance2 <= search_radius2) + end + end + + @inbounds neighbor_lists.lengths[point] = length end + # 100x100x100 points on Rucio: 83ms on the CPU, 16ms on the GPU. + # 100x100x100 points on RAMSES: 52ms on the CPU + # @threaded parallelization_backend for point in axes(x, 2) + # point_coords = @inbounds extract_svector(x, Val(ndims(neighborhood_search)), point) + # cell = cell_coords(point_coords, neighborhood_search) + # length = @inbounds neighbor_lists.lengths[point] + + # @inbounds @fastmath for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + # neighbor_cell = Tuple(neighbor_cell_) + # neighbors = points_in_cell(neighbor_cell, neighborhood_search) + + # for neighbor_ in eachindex(neighbors) + # neighbor = @inbounds neighbors[neighbor_] + + # neighbor_point_coords = extract_svector(y, Val(ndims(neighborhood_search)), + # neighbor) + + # pos_diff = convert.(eltype(neighborhood_search), + # point_coords - neighbor_point_coords) + # distance2 = dot(pos_diff, pos_diff) + + # if distance2 <= search_radius2 + # length = length + 1 + # @inbounds neighbor_lists.backend[length, point] = neighbor + # end + # end + # end + + # @inbounds neighbor_lists.lengths[point] = length + # end + + # 100x100x100 points on Rucio: 11ms on the GPU (fastest), not CPU-compatible. + # ndrange = size(x, 2) + # mykernel(parallelization_backend, 64)(x, y, neighborhood_search, neighbor_lists, search_radius2, ndrange = ndrange) + # KernelAbstractions.synchronize(parallelization_backend) + + # 100x100x100 points on Rucio: 36ms on the CPU (fastest), not GPU-compatible. + # 100x100x100 points on RAMSES: 38ms on the CPU + # @threaded parallelization_backend for point in axes(x, 2) + # point_coords = @inbounds extract_svector(x, Val(ndims(neighborhood_search)), point) + # coords_a1, coords_a2, coords_a3 = point_coords + # cell = cell_coords(point_coords, neighborhood_search) + # length_ = @inbounds neighbor_lists.lengths[point] + + # @inbounds @fastmath for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + # neighbor_cell = Tuple(neighbor_cell_) + # neighbors = points_in_cell(neighbor_cell, neighborhood_search) + + # vectorwidth = 8 + # @fastmath for block_start in 1:vectorwidth:length(neighbors) + # block_start + vectorwidth - 1 > length(neighbors) && break + + # neighbors_block = @inbounds vload(Vec{8, eltype(neighbors)}, neighbors, block_start) + + # # Linear indexing into `coordinates` because Cartesian indexing doesn't work. + # point_start = (neighbors_block - 1) * 3 + 1 + # x_b = @inbounds y[point_start] + # y_b = @inbounds y[point_start + 1] + # z_b = @inbounds y[point_start + 2] + + # pos_diff_x = coords_a1 - x_b + # pos_diff_y = coords_a2 - y_b + # pos_diff_z = coords_a3 - z_b + # distance2 = pos_diff_x * pos_diff_x + pos_diff_y * pos_diff_y + pos_diff_z * pos_diff_z + # mask = distance2 <= search_radius2 + + # sum(mask) == 0 && continue + + # @inbounds for neighbor_ in 1:vectorwidth + # neighbor = neighbors_block[neighbor_] + # neighbor_lists.backend[length_ + 1, point] = neighbor + # length_ = length_ + mask[neighbor_] + # end + # end + # end + + # @inbounds neighbor_lists.lengths[point] = length_ + # end + if sort_neighbor_lists sorteach!(neighbor_lists) end end -@inline function foreach_neighbor(f, neighbor_system_coords, - neighborhood_search::PrecomputedNeighborhoodSearch, - point, point_coords, search_radius) +@kernel cpu=false function mykernel(x, y, neighborhood_search, neighbor_lists, search_radius2) + point = @index(Global) + threadidx = @index(Local) + + point_coords = @inbounds extract_svector(x, Val(ndims(neighborhood_search)), point) + cell = cell_coords(point_coords, neighborhood_search) + + length_ = @inbounds neighbor_lists.lengths[point] + local_neighbors = @localmem Int32 (64, 128) # (groupsize, max_neighbors) + + for neighbor_cell_ in neighboring_cells(cell, neighborhood_search) + neighbor_cell = Tuple(neighbor_cell_) + neighbors = points_in_cell(neighbor_cell, neighborhood_search) + + for neighbor_ in eachindex(neighbors) + neighbor = @inbounds neighbors[neighbor_] + + neighbor_point_coords = @inbounds extract_svector(y, Val(ndims(neighborhood_search)), + neighbor) + + pos_diff = point_coords - neighbor_point_coords + distance2 = dot(pos_diff, pos_diff) + + if distance2 <= search_radius2 + length_ = length_ + 1 + @inbounds local_neighbors[threadidx, length_] = neighbor + end + end + end + + @inbounds neighbor_lists.lengths[point] = length_ + for i in axes(local_neighbors, 2) + @inbounds neighbor_lists.backend[i, point] = local_neighbors[threadidx, i] + end +end + +# Note that calling this function with `@inbounds` is not safe. +# See the comments in `foreach_neighbor_unsafe`. +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, + neighborhood_search::PrecomputedNeighborhoodSearch, + point, point_coords, search_radius, data) + (; periodic_box, neighbor_lists) = neighborhood_search + + # Making the following `@inbounds` is not safe because the neighbor list + # might not contain `point` if the NHS was not initialized correctly. + neighbors = neighbor_lists[point] + @fastmath @loopinfo vectorwidth=8 predicate for neighbor_ in eachindex(neighbors) + neighbor = @inbounds neighbors[neighbor_] + + # Making this `@inbounds` is not safe because + # `neighbor` (extracted from the neighbor list) is only guaranteed to be in bounds + # if the neighbor lists were constructed correctly and have not been corrupted. + neighbor_point_coords = extract_svector(neighbor_coords, + Val(ndims(neighborhood_search)), neighbor) + + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) + distance2 = dot(pos_diff, pos_diff) + + (pos_diff, + distance2) = compute_periodic_distance(pos_diff, distance2, search_radius, + periodic_box) + + distance = sqrt(distance2) + + # Inline to avoid loss of performance + # compared to not using `foreach_point_neighbor`. + data = @inline f(point, neighbor, pos_diff, distance, data) + end + + return data +end + +@propagate_inbounds function foreach_neighbor_inner(f, neighbor_coords, + neighborhood_search::PrecomputedNeighborhoodSearch, + point, point_coords, search_radius) (; periodic_box, neighbor_lists) = neighborhood_search - neighbors = @inbounds neighbor_lists[point] + # Making the following `@inbounds` is not safe because the neighbor list + # might not contain `point` if the NHS was not initialized correctly. + neighbors = neighbor_lists[point] for neighbor_ in eachindex(neighbors) neighbor = @inbounds neighbors[neighbor_] - # Making this `@inbounds` is not perfectly safe because + # Making this `@inbounds` is not safe because # `neighbor` (extracted from the neighbor list) is only guaranteed to be in bounds # if the neighbor lists were constructed correctly and have not been corrupted. - # However, adding this `@inbounds` yields a ~20% speedup for TLSPH on GPUs (A4500). - neighbor_coords = @inbounds extract_svector(neighbor_system_coords, - Val(ndims(neighborhood_search)), - neighbor) + neighbor_point_coords = extract_svector(neighbor_coords, + Val(ndims(neighborhood_search)), neighbor) - pos_diff = convert.(eltype(neighborhood_search), point_coords - neighbor_coords) + pos_diff = convert.(eltype(neighborhood_search), + point_coords - neighbor_point_coords) distance2 = dot(pos_diff, pos_diff) - pos_diff, - distance2 = compute_periodic_distance(pos_diff, distance2, search_radius, - periodic_box) + (pos_diff, + distance2) = compute_periodic_distance(pos_diff, distance2, search_radius, + periodic_box) distance = sqrt(distance2) @@ -247,7 +445,8 @@ function copy_neighborhood_search(nhs::PrecomputedNeighborhoodSearch, update_neighborhood_search, backend = typeof(nhs.neighbor_lists), transpose_backend, - max_neighbors = max_neighbors_) + max_neighbors = max_neighbors_, + sort_neighbor_lists = nhs.sort_neighbor_lists) end @inline function freeze_neighborhood_search(search::PrecomputedNeighborhoodSearch) diff --git a/src/vector_of_vectors.jl b/src/vector_of_vectors.jl index c05cddbd..d00f2e9d 100644 --- a/src/vector_of_vectors.jl +++ b/src/vector_of_vectors.jl @@ -73,13 +73,13 @@ end # Outer bounds check @boundscheck checkbounds(vov, i) - lengths[i] += 1 + @inbounds lengths[i] += 1 # Inner bounds check @boundscheck check_list_bounds(vov, i) # Activate new entry in column `i` - backend[lengths[i], i] = value + @inbounds backend[lengths[i], i] = value return vov end @@ -175,6 +175,16 @@ end # Sort each inner vector @inline function sorteach!(vov::DynamicVectorOfVectors) + @threaded default_backend(vov.backend) for i in axes(vov.backend, 2) + # QuickSort is ~1.6x faster here than the default + sort!(view(vov.backend, 1:vov.lengths[i], i), alg=QuickSort) + end + + return vov +end + +# GPU version +@inline function sorteach!(vov::DynamicVectorOfVectors{<:Any, <:Any, <:AbstractGPUArray}) # TODO remove this check when Metal supports sorting if nameof(typeof(default_backend(vov.backend))) == :MetalBackend @warn "sorting neighbor lists is not supported on Metal. Skipping sort." diff --git a/test/cell_lists/spatial_hashing.jl b/test/cell_lists/spatial_hashing.jl index 28d13b12..6a8e8104 100644 --- a/test/cell_lists/spatial_hashing.jl +++ b/test/cell_lists/spatial_hashing.jl @@ -71,9 +71,11 @@ @testset "Cell Coordinates Hash Function" begin # 1D coordinates - @test coordinates_flattened([1]) == UInt128(reinterpret(UInt32, Int32(1))) - @test coordinates_flattened([-1]) == UInt128(reinterpret(UInt32, Int32(-1))) - @test coordinates_flattened([0]) == Int128(0) + @test PointNeighbors.coordinates_flattened([1]) == + UInt128(reinterpret(UInt32, Int32(1))) + @test PointNeighbors.coordinates_flattened([-1]) == + UInt128(reinterpret(UInt32, Int32(-1))) + @test PointNeighbors.coordinates_flattened([0]) == Int128(0) # 2D coordinates coord2 = [-1, 1] @@ -82,18 +84,20 @@ # The second coordinate gives 1 shifted by 32 bits, so 1 * 2^32. expected2 = UInt128(2^32 - 1 + 2^32) - @test coordinates_flattened(coord2) == expected2 + @test PointNeighbors.coordinates_flattened(coord2) == expected2 # 3D coordinates coord3 = [1, 0, -1] expected3 = UInt128(1 + 0 * 2^32 + (2^32 - 1) * Int128(2)^64) - @test coordinates_flattened(coord3) == expected3 + @test PointNeighbors.coordinates_flattened(coord3) == expected3 # Extreme Int32 bounds max_val = Int32(typemax(Int32)) min_val = Int32(typemin(Int32)) - @test coordinates_flattened((max_val,)) == UInt128(reinterpret(UInt32, max_val)) - @test coordinates_flattened((min_val,)) == UInt128(reinterpret(UInt32, min_val)) + @test PointNeighbors.coordinates_flattened((max_val,)) == + UInt128(reinterpret(UInt32, max_val)) + @test PointNeighbors.coordinates_flattened((min_val,)) == + UInt128(reinterpret(UInt32, min_val)) # 3D extreme Int32 bounds coord4 = [min_val, max_val, Int32(0)] @@ -102,19 +106,19 @@ # `typemax(Int32)` gives the unsigned value 2^31 - 1. expected4 = UInt128(2^31 + (2^31 - 1) * 2^32) - @test coordinates_flattened(coord4) == expected4 + @test PointNeighbors.coordinates_flattened(coord4) == expected4 # Passing non-Int32-coercible should error large_val = typemax(Int32) + 1 - @test_throws InexactError coordinates_flattened([large_val]) + @test_throws InexactError PointNeighbors.coordinates_flattened([large_val]) small_val = typemin(Int32) - 1 - @test_throws InexactError coordinates_flattened([small_val]) + @test_throws InexactError PointNeighbors.coordinates_flattened([small_val]) - @test_throws InexactError coordinates_flattened([Inf]) - @test_throws InexactError coordinates_flattened([NaN]) + @test_throws InexactError PointNeighbors.coordinates_flattened([Inf]) + @test_throws InexactError PointNeighbors.coordinates_flattened([NaN]) # Too many dimensions should throw assertion error - @test_throws AssertionError coordinates_flattened([1, 2, 3, 4]) + @test_throws AssertionError PointNeighbors.coordinates_flattened([1, 2, 3, 4]) end end diff --git a/test/neighborhood_search.jl b/test/neighborhood_search.jl index 65d3ed85..ed1a80dd 100644 --- a/test/neighborhood_search.jl +++ b/test/neighborhood_search.jl @@ -142,14 +142,14 @@ "($(seed == 1 ? "`initialize!`" : "`update!`"))" @testset verbose=true "$(name(cloud_size, seed)))" for cloud_size in cloud_sizes, seed in seeds - coords = point_cloud(cloud_size, seed = seed) + search_radius = 2.5 + coords = point_cloud(cloud_size, search_radius, seed = seed) NDIMS = length(cloud_size) n_points = size(coords, 2) - search_radius = 2.5 # Use different coordinates for `initialize!` and then `update!` with the # correct coordinates to make sure that `update!` is working as well. - coords_initialize = point_cloud(cloud_size, seed = 1) + coords_initialize = point_cloud(cloud_size, search_radius, seed = 1) # Compute expected neighbor lists by brute-force looping over all points # as potential neighbors (`TrivialNeighborhoodSearch`). @@ -254,7 +254,7 @@ names_copied = [name * " copied" for name in names] append!(names, names_copied) - @testset "$(names[i])" for i in eachindex(names) + @testset verbose=true "$(names[i])" for i in eachindex(names) nhs = neighborhood_searches[i] # Initialize with `seed = 1` @@ -267,17 +267,60 @@ update!(nhs, coords, coords) end - neighbors = [Int[] for _ in axes(coords, 2)] + # Test the regular `foreach_point_neighbor` + @testset "`foreach_point_neighbor`" begin + neighbors = [Int[] for _ in axes(coords, 2)] + foreach_point_neighbor(coords, coords, nhs, + parallelization_backend = SerialBackend()) do point, + neighbor, + pos_diff, + distance + append!(neighbors[point], neighbor) + end + + @test sort.(neighbors) == neighbors_expected + end - foreach_point_neighbor(coords, coords, nhs, - parallelization_backend = SerialBackend()) do point, - neighbor, - pos_diff, - distance - append!(neighbors[point], neighbor) + # Test manual loop with `foreach_neighbor` + @testset "Manual Loop with `foreach_neighbor`" begin + neighbors_manual = [Int[] for _ in axes(coords, 2)] + for point in axes(coords, 2) + foreach_neighbor(coords, coords, nhs, + point) do point, neighbor, pos_diff, distance + append!(neighbors_manual[point], neighbor) + end + end + + @test sort.(neighbors_manual) == neighbors_expected + end + + # Repeat with foreach_point_neighbor_unsafe + @testset "`foreach_point_neighbor_unsafe`" begin + neighbors_unsafe = [Int[] for _ in axes(coords, 2)] + foreach_point_neighbor_unsafe(coords, coords, nhs, + parallelization_backend = SerialBackend()) do point, + neighbor, + pos_diff, + distance + append!(neighbors_unsafe[point], neighbor) + end + + @test sort.(neighbors_unsafe) == neighbors_expected end - @test sort.(neighbors) == neighbors_expected + # Repeat with manual loop with `foreach_neighbor_unsafe` + @testset "Manual Loop with `foreach_neighbor_unsafe`" begin + neighbors_manual_unsafe = [Int[] for _ in axes(coords, 2)] + for point in axes(coords, 2) + foreach_neighbor_unsafe(coords, coords, nhs, + point) do point, neighbor, + pos_diff, distance + append!(neighbors_manual_unsafe[point], neighbor) + end + end + + @test sort.(neighbors_manual_unsafe) == neighbors_expected + end end end end diff --git a/test/point_cloud.jl b/test/point_cloud.jl index ba99b2e1..001e53b5 100644 --- a/test/point_cloud.jl +++ b/test/point_cloud.jl @@ -1,8 +1,9 @@ using Random # Generate a rectangular point cloud, optionally with a perturbation in the point positions -function point_cloud(n_points_per_dimension; - seed = 1, perturbation_factor_position = 1.0) +function point_cloud(n_points_per_dimension, search_radius; + seed = 1, perturbation_factor_position = 1.0, + sort = true, shuffle = false) # Fixed seed to ensure reproducibility Random.seed!(seed) @@ -10,8 +11,15 @@ function point_cloud(n_points_per_dimension; coordinates = Array{Float64}(undef, n_dims, prod(n_points_per_dimension)) cartesian_indices = CartesianIndices(n_points_per_dimension) + # Extra data structures for the sorting code below + cell_coords = Vector{SVector{n_dims, Int}}(undef, size(coordinates, 2)) + cell_size = ntuple(dim -> search_radius, n_dims) + for i in axes(coordinates, 2) - coordinates[:, i] .= Tuple(cartesian_indices[i]) + point_coords = SVector(Tuple(cartesian_indices[i])) + coordinates[:, i] .= point_coords + cell_coords[i] = PointNeighbors.cell_coords(point_coords, nothing, nothing, + cell_size) .+ 1 end # A standard deviation of 0.05 in the particle coordinates @@ -21,11 +29,28 @@ function point_cloud(n_points_per_dimension; # The benchmark results are also consistent with the timer output of the simulation. perturb!(coordinates, perturbation_factor_position * 0.05) + # Sort by linear cell index + if sort + if shuffle + throw(ArgumentError("cannot sort and shuffle at the same time")) + end + + # Sort by Z index (with `using Morton`) + # permutation = sortperm(cell_coords, by = c -> cartesian2morton(c)) + + permutation = sortperm(cell_coords) + coordinates .= coordinates[:, permutation] + elseif shuffle + # Sort randomly + permutation = shuffle(axes(coordinates, 2)) + coordinates .= coordinates[:, permutation] + end + return coordinates end function perturb!(data, std_deviation) - for i in eachindex(data) + PointNeighbors.@threaded PointNeighbors.default_backend(data) for i in eachindex(data) data[i] += std_deviation * randn() end diff --git a/test/unittest.jl b/test/unittest.jl index f0981614..262b1753 100644 --- a/test/unittest.jl +++ b/test/unittest.jl @@ -7,4 +7,5 @@ include("nhs_precomputed.jl") include("neighborhood_search.jl") include("cell_lists/full_grid.jl") + include("cell_lists/spatial_hashing.jl") end;