From c3ffb3ceb39c1fea27df6ae4bf90b148c7850031 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:10:36 +0200 Subject: [PATCH 01/15] Create tutorials in the docs --- docs/Project.toml | 6 ++ docs/literate/src/tut_basic_usage.jl | 114 +++++++++++++++++++++++++++ docs/literate/src/tut_gpu_usage.jl | 96 ++++++++++++++++++++++ docs/literate/src/tut_n_body.jl | 61 ++++++++++++++ docs/literate/src/tut_periodicity.jl | 58 ++++++++++++++ docs/make.jl | 23 +++++- docs/src/tutorial.md | 9 +++ src/nhs_grid.jl | 10 ++- src/nhs_precomputed.jl | 5 ++ src/nhs_trivial.jl | 5 ++ 10 files changed, 384 insertions(+), 3 deletions(-) create mode 100644 docs/literate/src/tut_basic_usage.jl create mode 100644 docs/literate/src/tut_gpu_usage.jl create mode 100644 docs/literate/src/tut_n_body.jl create mode 100644 docs/literate/src/tut_periodicity.jl create mode 100644 docs/src/tutorial.md diff --git a/docs/Project.toml b/docs/Project.toml index 1814eb33..c3734416 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,11 @@ [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" [compat] +Adapt = "4" Documenter = "1" +KernelAbstractions = "0.9" +Literate = "2" diff --git a/docs/literate/src/tut_basic_usage.jl b/docs/literate/src/tut_basic_usage.jl new file mode 100644 index 00000000..9f224cfa --- /dev/null +++ b/docs/literate/src/tut_basic_usage.jl @@ -0,0 +1,114 @@ +# # [Basic Usage: Fixed-Radius Neighbor Search](@id tut_basic_usage) + +# This tutorial shows the standard workflow for PointNeighbors.jl: +# create coordinates, configure a neighborhood search, initialize it, +# and loop over neighbors. +using PointNeighbors + +# ## Generate a regular 2D point cloud + +# We create a regular grid of points in 2D for this example, with distances of 1 +# between neighboring points. +# The coordinates need to be stored in a 2×N array, where N is the number of points. +n_points_per_dimension = (100, 100) +n_points = prod(n_points_per_dimension) +coordinates = Array{Float64}(undef, 2, n_points) +cartesian_indices = CartesianIndices(n_points_per_dimension) + +for i in axes(coordinates, 2) + coordinates[:, i] .= Tuple(cartesian_indices[i]) +end + +# ## Create and initialize the neighborhood search + +# We choose a search radius and create the neighborhood search. +# For the [`GridNeighborhoodSearch`](@ref), we need to pass `n_points` as a maximum +# number of points in the neighbor coordinates array, which is required to allocate +# the data structures for the update step. +search_radius = 3.0 +nhs = GridNeighborhoodSearch{2}(; search_radius, n_points) +nothing # hide + +# Initialize the neighborhood search with the coordinates. +# !!! warning +# This neighborhood search is now configured for the given coordinates. +# In general, it is only possible to use this neighborhood search to find neighbors +# for points contained in `coordinates` and not for arbitrary points in space. +# See below for more information. +initialize!(nhs, coordinates, coordinates) + +# ## Count neighbors for each point + +# With the neighborhood search initialized, we can now loop over neighbors. +# We use the function [`foreach_point_neighbor`](@ref) to loop over all pairs of points +# and neighbors within the search radius. +# !!! warning +# The `foreach_point_neighbor` function is multithreaded over the points by default. +# Only remove `parallelization_backend = SerialBackend()` if you are sure that your code +# is thread-safe and that there are no race conditions. +n_neighbors = zeros(Int, n_points) + +function count_neighbors!(n_neighbors, coordinates, neighbor_coordinates, nhs) + n_neighbors .= 0 + + foreach_point_neighbor(coordinates, neighbor_coordinates, nhs, + parallelization_backend = SerialBackend()) do i, j, + pos_diff, distance + n_neighbors[i] += 1 + end + + return n_neighbors +end + +count_neighbors!(n_neighbors, coordinates, coordinates, nhs) + +# Interior particles have many neighbors, boundary particles fewer. +extrema(n_neighbors) + +# ## Different point sets for points and neighbors + +# The neighborhood search is currently configured to find neighbors in `coordinates` +# for points in `coordinates`. However, it is also possible to find neighbors in a +# different set of coordinates, e.g., `neighbor_coordinates`. +neighbor_coordinates = coordinates .+ 0.5 +nothing # hide + +# In order to find neighbors in `neighbor_coordinates`, we need to re-initialize +# the neighborhood search with `neighbor_coordinates` as the second argument. +initialize!(nhs, coordinates, neighbor_coordinates) +nothing # hide + +# Now the neighborhood search is configured to find neighbors in `neighbor_coordinates` +# for points in `coordinates`. +count_neighbors!(n_neighbors, coordinates, neighbor_coordinates, nhs) +extrema(n_neighbors) + +# ## Updating the neighborhood search + +# If the coordinates of either the points or the neighbors change, the neighborhood search +# needs to be updated with [`update!`](@ref). +# Depending on the neighborhood search implementation, this can be done more efficiently +# than re-initializing the neighborhood search. +# !!! warning +# An `update!` requires that the sizes of the point sets do not change. +# +# If we don't update the neighborhood search, we will get incorrect neighbors: +neighbor_coordinates .+= 10 +count_neighbors!(n_neighbors, coordinates, neighbor_coordinates, nhs) +extrema(n_neighbors) + +# After updating the neighborhood search, we get the correct new neighbors. +update!(nhs, coordinates, neighbor_coordinates) +count_neighbors!(n_neighbors, coordinates, neighbor_coordinates, nhs) +extrema(n_neighbors) + +# If the first coordinates are updated but the second coordinates are not, we generally +# also need to update the neighborhood search. +# For some neighborhood search implementations, notably the [`GridNeighborhoodSearch`](@ref), +# this is not necessary, since this implementation can find neighbors for arbitrary points in space. +# To check whether an update is necessary, we can call [`requires_update`](@ref): +requires_update(nhs) + +# The first `false` indicates that an update is not required when the first coordinates are +# updated, and the second `true` indicates that an update is required when the second +# coordinates are updated. diff --git a/docs/literate/src/tut_gpu_usage.jl b/docs/literate/src/tut_gpu_usage.jl new file mode 100644 index 00000000..33f9a65a --- /dev/null +++ b/docs/literate/src/tut_gpu_usage.jl @@ -0,0 +1,96 @@ +# # [GPU Usage](@id tut_gpu_usage) + +# This tutorial shows how to use PointNeighbors.jl on GPUs. +# We adapt the [basic usage tutorial](@ref tut_basic_usage) to run on GPUs. +using PointNeighbors +using Adapt + +# PointNeighbors.jl provides vendor-agnostic GPU support through KernelAbstractions.jl. +# Load the appropriate package for your GPU and define the corresponding backend: +# - `using CUDA; backend = CUDABackend()` for NVIDIA GPUs with CUDA.jl +# - `using AMDGPU; backend = ROCBackend()` for AMD GPUs with AMDGPU.jl +# - `using Metal; backend = MetalBackend()` for Apple Silicon GPUs with Metal.jl +# - `using oneAPI; backend = oneAPIBackend()` for Intel GPUs with oneAPI.jl +import KernelAbstractions; backend = KernelAbstractions.CPU(); nothing # hide + +# ## Create coordinates on the CPU + +# We create the same regular grid of points in 2D as in the basic usage tutorial. +# For better GPU performance, we use `Float32` instead of `Float64`. +# Note that some GPUs (notably Apple Silicon) do not support `Float64` at all. +n_points_per_dimension = (100, 100) +n_points = prod(n_points_per_dimension) +coordinates = Array{Float32}(undef, 2, n_points) +cartesian_indices = CartesianIndices(n_points_per_dimension) + +for i in axes(coordinates, 2) + coordinates[:, i] .= Tuple(cartesian_indices[i]) +end + +# We can use Adapt.jl to move this coordinates array to the GPU. +coordinates_gpu = adapt(backend, coordinates) + +# ## Create and initialize the neighborhood search + +# After taking computing the difference between coordinates of neighboring particles, +# [`foreach_point_neighbor`](@ref) converts the result to the type of `search_radius` before +# computing the distance. The type of `search_radius` therefore determines the types of +# `pos_diff` and `distance` inside `foreach_point_neighbor`. We need to make sure to choose +# the data type that we want to use for our computations, which will usually be `Float32` +# when working on GPUs. +# +# Note that it is common in Smoothed Particle Hydrodynamics to store the coordiantes +# in `Float64` and everything else in `Float32` to avoid large errors in the distance +# computations. This is also supported by PointNeighbors.jl. Using `Float64` coordinates +# with a `Float32` search radius limits `Float64` operations by converting to `Float32` +# after loading the point coordinates and computing the difference. +# `pos_diff` and `distance` inside `foreach_point_neighbor` will all use `Float32`. +search_radius = 3.0f0 +nothing # hide + +# For GPU compatibility, we need to use a [`FullGridCellList`](@ref), which is optimized +# for maximum performance, but requires a rectangular bounding box for the domain, +# as opposed ot the default [`DictionaryCellList`](@ref) that supports potentially +# infinite domains. +min_corner = minimum(coordinates, dims = 2) +max_corner = maximum(coordinates, dims = 2) +cell_list = FullGridCellList(; search_radius, min_corner, max_corner) +nothing # hide + +# Now we can create the neighborhood search as usual. +nhs = GridNeighborhoodSearch{2}(; search_radius, cell_list, n_points) +nothing # hide + +# This neighborhood search is currently living in CPU memory, so we first need to move it +# to the GPU. We can also use Adapt.jl for this. +nhs_gpu = adapt(backend, nhs) +nothing # hide + +# From now on, everything happens on the GPU, so we need to use the GPU coordinates. +initialize!(nhs_gpu, coordinates_gpu, coordinates_gpu; parallelization_backend = backend) +nothing # hide + +# ## Count neighbors on the GPU + +# We can now use the same function as in the [basic usage tutorial](@ref tut_basic_usage). +# The parallelization backend is detected automatically from the type of the coordinates. +n_neighbors_gpu = adapt(backend, zeros(Int, n_points)) + +function count_neighbors!(n_neighbors, coordinates, nhs) + n_neighbors .= 0 + + foreach_point_neighbor(coordinates, coordinates, nhs) do i, j, pos_diff, distance + ## `foreach_point_neighbor` makes sure that `i` and `j` are in bounds + ## of the respective coordinates. + ## We are now inside a GPU kernel, so using `@inbounds` is important for performance. + @inbounds n_neighbors[i] += 1 + end + + return n_neighbors +end + +# Now we can run the neighbor loop on the GPU. +# We just need to make sure to pass the GPU coordinates, GPU neighborhood search, +# and GPU neighbor count array. +count_neighbors!(n_neighbors_gpu, coordinates_gpu, nhs_gpu) +extrema(n_neighbors_gpu) diff --git a/docs/literate/src/tut_n_body.jl b/docs/literate/src/tut_n_body.jl new file mode 100644 index 00000000..7c1d5376 --- /dev/null +++ b/docs/literate/src/tut_n_body.jl @@ -0,0 +1,61 @@ +# # [N-Body Example with Cutoff Radius](@id tut_n_body) + +# This tutorial shows how to compute one right-hand-side evaluation of a simple +# n-body model with a cutoff radius using PointNeighbors.jl. +# It builds on the basic usage tutorial and reuses the same neighbor-loop pattern. +using PointNeighbors +using Random + +# ## Generate a simple 2D particle set + +# We use uniformly distributed points in 2D. As in all of PointNeighbors.jl, +# coordinates are stored in a 2×N array. +n_particles = 5_000 +coordinates = rand(2, n_particles) + +# The cutoff radius for pair interactions is the search radius. +search_radius = 0.04 + +# Each particle gets a mass in [1e10, 2e10]. +mass = 1.0e10 .* (rand(n_particles) .+ 1) +G = Float32(6.6743e-11) +accelerations = similar(coordinates) +nothing # hide + +# ## Create and initialize the neighborhood search + +nhs = GridNeighborhoodSearch{2}(; search_radius, n_points = n_particles) +initialize!(nhs, coordinates, coordinates) + +# ## Compute one acceleration update + +# Sum all pairwise contributions inside the cutoff radius. +# Note that this is a multithreaded loop when starting Julia with multiple threads. +# It is thread-safe because the threading happens over the particles, not the neighbors, +# and each thread only updates the acceleration of its own particle. +function compute_acceleration!(accelerations, coordinates, mass, G, nhs) + accelerations .= 0.0 + + foreach_point_neighbor(coordinates, coordinates, nhs) do i, j, pos_diff, distance + ## Skip self-interactions. Note that `return` only exits the closure, + ## i.e., skips the current neighbor. + distance < eps() && return + + ## `foreach_point_neighbor` makes sure that `i` and `j` are in bounds + ## of the respective coordinates. This is especially relevant on GPUs, + ## where bounds checking is more expensive. + dv = @inbounds -G * mass[j] * pos_diff / distance^3 + + for dim in axes(accelerations, 1) + @inbounds accelerations[dim, i] += dv[dim] + end + end + + return accelerations +end + +compute_acceleration!(accelerations, coordinates, mass, G, nhs) +nothing # hide + +# We now have one acceleration vector per particle. +size(accelerations) diff --git a/docs/literate/src/tut_periodicity.jl b/docs/literate/src/tut_periodicity.jl new file mode 100644 index 00000000..5b752a74 --- /dev/null +++ b/docs/literate/src/tut_periodicity.jl @@ -0,0 +1,58 @@ +# # [Periodic Domains with `PeriodicBox`](@id tut_periodicity) + +# This tutorial extends the basic setup by adding periodic boundary conditions. +using PointNeighbors + +# ## Generate a regular 2D point cloud +n_points_per_dimension = (100, 100) +n_points = prod(n_points_per_dimension) +coordinates = Array{Float64}(undef, 2, n_points) +cartesian_indices = CartesianIndices(n_points_per_dimension) + +for i in axes(coordinates, 2) + coordinates[:, i] .= Tuple(cartesian_indices[i]) +end + +# ## Configure a periodic domain + +# PointNeighbors.jl supports rectangular, axis-aligned periodic domains through the +# [`PeriodicBox`](@ref) type. +search_radius = 3.0 +min_corner = (0.0, 0.0) +max_corner = (100.0, 100.0) +periodic_box = PeriodicBox(; min_corner, max_corner) +nothing # hide + +# This periodic box can now be passed to the neighborhood search constructor. +nhs = GridNeighborhoodSearch{2}(; search_radius, periodic_box, n_points) +initialize!(nhs, coordinates, coordinates) +nothing # hide + +# ## Count neighbors in periodic and non-periodic setups + +# This function is the same as in the [basic usage tutorial](@ref tut_basic_usage) +# and counts the number of neighbors for each point. +# Note that this is a multithreaded loop when starting Julia with multiple threads. +# It is thread-safe because the threading happens over the points, not the neighbors, +# and each thread only updates the counter of its own point. +function count_neighbors!(n_neighbors, coordinates, nhs) + n_neighbors .= 0 + + foreach_point_neighbor(coordinates, coordinates, nhs) do i, j, pos_diff, distance + n_neighbors[i] += 1 + end + + return n_neighbors +end + +n_neighbors = zeros(Int, n_points) +count_neighbors!(n_neighbors, coordinates, nhs) +extrema(n_neighbors) + +# Since we already have a bounded domain, it makes sense to use a `FullGridCellList`, +# which is focused on maximum performance, but does not support infinite domains. +cell_list = FullGridCellList(; search_radius, min_corner, max_corner) +nhs2 = GridNeighborhoodSearch{2}(; search_radius, periodic_box, cell_list, n_points) +initialize!(nhs2, coordinates, coordinates) +count_neighbors!(n_neighbors, coordinates, nhs2) +extrema(n_neighbors) diff --git a/docs/make.jl b/docs/make.jl index 46872f7d..e670d307 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,4 +1,5 @@ using Documenter +using Literate: Literate # Get PointNeighbors.jl root directory trixibase_root_dir = dirname(@__DIR__) @@ -38,6 +39,17 @@ copy_file("LICENSE.md", "[AUTHORS.md](AUTHORS.md)" => "[Authors](@ref)", "\n" => "\n> ", r"^" => "# License\n\n> ") +mkpath(joinpath(@__DIR__, "src", "tutorials")) + +Literate.markdown(joinpath("docs", "literate", "src", "tut_basic_usage.jl"), + joinpath("docs", "src", "tutorials")) +Literate.markdown(joinpath("docs", "literate", "src", "tut_n_body.jl"), + joinpath("docs", "src", "tutorials")) +Literate.markdown(joinpath("docs", "literate", "src", "tut_periodicity.jl"), + joinpath("docs", "src", "tutorials")) +Literate.markdown(joinpath("docs", "literate", "src", "tut_gpu_usage.jl"), + joinpath("docs", "src", "tutorials")) + # Make documentation makedocs(modules = [PointNeighbors], sitename = "PointNeighbors.jl", @@ -46,10 +58,19 @@ makedocs(modules = [PointNeighbors], # Disable pretty URLs during manual testing prettyurls = get(ENV, "CI", nothing) == "true", # Set canonical URL to GitHub pages URL - canonical = "https://trixi-framework.github.io/PointNeighbors.jl/stable"), + canonical = "https://trixi-framework.github.io/PointNeighbors.jl/stable", + # Set edit_link explicitly to avoid `git remote show origin` lookups. + edit_link="main"), # Explicitly specify documentation structure pages = [ "Home" => "index.md", + "Tutorials" => [ + "Overview" => "tutorial.md", + "Basic Usage" => joinpath("tutorials", "tut_basic_usage.md"), + "N-Body" => joinpath("tutorials", "tut_n_body.md"), + "Periodicity" => joinpath("tutorials", "tut_periodicity.md"), + "GPU Usage" => joinpath("tutorials", "tut_gpu_usage.md") + ], "API reference" => "reference.md", "Authors" => "authors.md", "License" => "license.md" diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md new file mode 100644 index 00000000..3bba7d1d --- /dev/null +++ b/docs/src/tutorial.md @@ -0,0 +1,9 @@ +# Tutorials + +This section provides worked examples for common PointNeighbors.jl workflows. +Each tutorial starts from a complete, minimal script and explains the main API calls. + +- [Basic Usage: Fixed-Radius Neighbor Search](@ref tut_basic_usage) +- [N-Body Example with Cutoff Radius](@ref tut_n_body) +- [Periodic Domains with `PeriodicBox`](@ref tut_periodicity) +- [GPU Usage](@ref tut_gpu_usage) diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 21759ab2..c40720b3 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -32,8 +32,9 @@ since not sorting makes our implementation a lot faster (although less paralleli with [`copy_neighborhood_search`](@ref). Note that the type of `search_radius` determines the type used for the distance computations. -- `n_points = 0`: Total number of points. The default of `0` is useful together - with [`copy_neighborhood_search`](@ref). +- `n_points = 0`: Maximum total number of points in the neighbor coordinates array. + The default of `0` is useful together with + [`copy_neighborhood_search`](@ref). - `periodic_box = nothing`: In order to use a (rectangular) periodic domain, pass a [`PeriodicBox`](@ref). - `cell_list`: The cell list that maps a cell index to a list of points inside @@ -91,6 +92,11 @@ function GridNeighborhoodSearch{NDIMS}(; search_radius = 0.0, n_points = 0, "$(supported_update_strategies(cell_list))")) end + if search_radius isa Integer + throw(ArgumentError("`search_radius` cannot be an integer type, since computed " * + "distances will be converted to this type")) + end + update_buffer = create_update_buffer(update_strategy, cell_list, n_points) if search_radius < eps() || isnothing(periodic_box) diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index 187ff0c5..884cef58 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -98,6 +98,11 @@ function PrecomputedNeighborhoodSearch{NDIMS}(; search_radius = 0.0, n_points = transpose_backend = false, max_neighbors = max_neighbors(NDIMS), sort_neighbor_lists = true) where {NDIMS} + if search_radius isa Integer + throw(ArgumentError("`search_radius` cannot be an integer type, since computed " * + "distances will be converted to this type")) + end + neighbor_lists = construct_backend(backend, n_points, max_neighbors; transpose_backend) PrecomputedNeighborhoodSearch{NDIMS}(neighbor_lists, search_radius, diff --git a/src/nhs_trivial.jl b/src/nhs_trivial.jl index c97f0809..c51d3f62 100644 --- a/src/nhs_trivial.jl +++ b/src/nhs_trivial.jl @@ -25,6 +25,11 @@ struct TrivialNeighborhoodSearch{NDIMS, ELTYPE, EP, PB} <: AbstractNeighborhoodS function TrivialNeighborhoodSearch{NDIMS}(; search_radius = 0.0, eachpoint = 1:0, periodic_box = nothing) where {NDIMS} + if search_radius isa Integer + throw(ArgumentError("`search_radius` cannot be an integer type, since computed " * + "distances will be converted to this type")) + end + new{NDIMS, typeof(search_radius), typeof(eachpoint), typeof(periodic_box)}(search_radius, eachpoint, periodic_box) end From bba9a0aeae20289d9cc901432531e943d0d34ace Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:13:02 +0200 Subject: [PATCH 02/15] Fix typo --- docs/literate/src/tut_gpu_usage.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/literate/src/tut_gpu_usage.jl b/docs/literate/src/tut_gpu_usage.jl index 33f9a65a..1843b9ae 100644 --- a/docs/literate/src/tut_gpu_usage.jl +++ b/docs/literate/src/tut_gpu_usage.jl @@ -39,7 +39,7 @@ coordinates_gpu = adapt(backend, coordinates) # the data type that we want to use for our computations, which will usually be `Float32` # when working on GPUs. # -# Note that it is common in Smoothed Particle Hydrodynamics to store the coordiantes +# Note that it is common in Smoothed Particle Hydrodynamics to store the coordinates # in `Float64` and everything else in `Float32` to avoid large errors in the distance # computations. This is also supported by PointNeighbors.jl. Using `Float64` coordinates # with a `Float32` search radius limits `Float64` operations by converting to `Float32` From d4ae1b675ab51a316349703022338a9ecc25d1be Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:13:54 +0200 Subject: [PATCH 03/15] Reformat --- docs/literate/src/tut_gpu_usage.jl | 4 +++- docs/make.jl | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/literate/src/tut_gpu_usage.jl b/docs/literate/src/tut_gpu_usage.jl index 1843b9ae..8a8c6407 100644 --- a/docs/literate/src/tut_gpu_usage.jl +++ b/docs/literate/src/tut_gpu_usage.jl @@ -11,7 +11,9 @@ using Adapt # - `using AMDGPU; backend = ROCBackend()` for AMD GPUs with AMDGPU.jl # - `using Metal; backend = MetalBackend()` for Apple Silicon GPUs with Metal.jl # - `using oneAPI; backend = oneAPIBackend()` for Intel GPUs with oneAPI.jl -import KernelAbstractions; backend = KernelAbstractions.CPU(); nothing # hide +import KernelAbstractions # hide +backend = KernelAbstractions.CPU() # hide +nothing # hide # ## Create coordinates on the CPU diff --git a/docs/make.jl b/docs/make.jl index e670d307..6aebdc23 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -60,7 +60,7 @@ makedocs(modules = [PointNeighbors], # Set canonical URL to GitHub pages URL canonical = "https://trixi-framework.github.io/PointNeighbors.jl/stable", # Set edit_link explicitly to avoid `git remote show origin` lookups. - edit_link="main"), + edit_link = "main"), # Explicitly specify documentation structure pages = [ "Home" => "index.md", From 162707e3aea9e663a2d8a555887d7418048861bb Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:15:51 +0200 Subject: [PATCH 04/15] Fix typo --- docs/literate/src/tut_gpu_usage.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/literate/src/tut_gpu_usage.jl b/docs/literate/src/tut_gpu_usage.jl index 8a8c6407..44339d65 100644 --- a/docs/literate/src/tut_gpu_usage.jl +++ b/docs/literate/src/tut_gpu_usage.jl @@ -52,7 +52,7 @@ nothing # hide # For GPU compatibility, we need to use a [`FullGridCellList`](@ref), which is optimized # for maximum performance, but requires a rectangular bounding box for the domain, -# as opposed ot the default [`DictionaryCellList`](@ref) that supports potentially +# as opposed to the default [`DictionaryCellList`](@ref) that supports potentially # infinite domains. min_corner = minimum(coordinates, dims = 2) max_corner = maximum(coordinates, dims = 2) From 0cc76a2c9bada276e0e5cee7ab341691644e8232 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:22:14 +0200 Subject: [PATCH 05/15] Update docs/make.jl Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/make.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index 6aebdc23..da2015af 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -41,14 +41,14 @@ copy_file("LICENSE.md", mkpath(joinpath(@__DIR__, "src", "tutorials")) -Literate.markdown(joinpath("docs", "literate", "src", "tut_basic_usage.jl"), - joinpath("docs", "src", "tutorials")) -Literate.markdown(joinpath("docs", "literate", "src", "tut_n_body.jl"), - joinpath("docs", "src", "tutorials")) -Literate.markdown(joinpath("docs", "literate", "src", "tut_periodicity.jl"), - joinpath("docs", "src", "tutorials")) -Literate.markdown(joinpath("docs", "literate", "src", "tut_gpu_usage.jl"), - joinpath("docs", "src", "tutorials")) +Literate.markdown(joinpath(@__DIR__, "literate", "src", "tut_basic_usage.jl"), + joinpath(@__DIR__, "src", "tutorials")) +Literate.markdown(joinpath(@__DIR__, "literate", "src", "tut_n_body.jl"), + joinpath(@__DIR__, "src", "tutorials")) +Literate.markdown(joinpath(@__DIR__, "literate", "src", "tut_periodicity.jl"), + joinpath(@__DIR__, "src", "tutorials")) +Literate.markdown(joinpath(@__DIR__, "literate", "src", "tut_gpu_usage.jl"), + joinpath(@__DIR__, "src", "tutorials")) # Make documentation makedocs(modules = [PointNeighbors], From e6717499df014eb4afa64b8a0bfa8d0c4f19ac2f Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:27:03 +0200 Subject: [PATCH 06/15] Fix tutorials and remove overview page --- docs/literate/src/tut_basic_usage.jl | 1 + docs/literate/src/tut_gpu_usage.jl | 2 ++ docs/literate/src/tut_n_body.jl | 2 ++ docs/make.jl | 1 - docs/src/tutorial.md | 9 --------- 5 files changed, 5 insertions(+), 10 deletions(-) delete mode 100644 docs/src/tutorial.md diff --git a/docs/literate/src/tut_basic_usage.jl b/docs/literate/src/tut_basic_usage.jl index 9f224cfa..92162e06 100644 --- a/docs/literate/src/tut_basic_usage.jl +++ b/docs/literate/src/tut_basic_usage.jl @@ -36,6 +36,7 @@ nothing # hide # for points contained in `coordinates` and not for arbitrary points in space. # See below for more information. initialize!(nhs, coordinates, coordinates) +nothing # hide # ## Count neighbors for each point diff --git a/docs/literate/src/tut_gpu_usage.jl b/docs/literate/src/tut_gpu_usage.jl index 44339d65..f61670cb 100644 --- a/docs/literate/src/tut_gpu_usage.jl +++ b/docs/literate/src/tut_gpu_usage.jl @@ -31,6 +31,7 @@ end # We can use Adapt.jl to move this coordinates array to the GPU. coordinates_gpu = adapt(backend, coordinates) +nothing # hide # ## Create and initialize the neighborhood search @@ -90,6 +91,7 @@ function count_neighbors!(n_neighbors, coordinates, nhs) return n_neighbors end +nothing # hide # Now we can run the neighbor loop on the GPU. # We just need to make sure to pass the GPU coordinates, GPU neighborhood search, diff --git a/docs/literate/src/tut_n_body.jl b/docs/literate/src/tut_n_body.jl index 7c1d5376..469cd187 100644 --- a/docs/literate/src/tut_n_body.jl +++ b/docs/literate/src/tut_n_body.jl @@ -15,6 +15,7 @@ coordinates = rand(2, n_particles) # The cutoff radius for pair interactions is the search radius. search_radius = 0.04 +nothing # hide # Each particle gets a mass in [1e10, 2e10]. mass = 1.0e10 .* (rand(n_particles) .+ 1) @@ -26,6 +27,7 @@ nothing # hide nhs = GridNeighborhoodSearch{2}(; search_radius, n_points = n_particles) initialize!(nhs, coordinates, coordinates) +nothing # hide # ## Compute one acceleration update diff --git a/docs/make.jl b/docs/make.jl index 6aebdc23..25374ae9 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -65,7 +65,6 @@ makedocs(modules = [PointNeighbors], pages = [ "Home" => "index.md", "Tutorials" => [ - "Overview" => "tutorial.md", "Basic Usage" => joinpath("tutorials", "tut_basic_usage.md"), "N-Body" => joinpath("tutorials", "tut_n_body.md"), "Periodicity" => joinpath("tutorials", "tut_periodicity.md"), diff --git a/docs/src/tutorial.md b/docs/src/tutorial.md deleted file mode 100644 index 3bba7d1d..00000000 --- a/docs/src/tutorial.md +++ /dev/null @@ -1,9 +0,0 @@ -# Tutorials - -This section provides worked examples for common PointNeighbors.jl workflows. -Each tutorial starts from a complete, minimal script and explains the main API calls. - -- [Basic Usage: Fixed-Radius Neighbor Search](@ref tut_basic_usage) -- [N-Body Example with Cutoff Radius](@ref tut_n_body) -- [Periodic Domains with `PeriodicBox`](@ref tut_periodicity) -- [GPU Usage](@ref tut_gpu_usage) From 6454805f05274ff4da1416d9317a05664d6f6583 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:41:45 +0200 Subject: [PATCH 07/15] Add GPU CI tests --- .buildkite/pipeline.yml | 58 +++++++++++++++++++++++++++++++++++++++++ .gitignore | 1 + test/gpu.jl | 36 +++++++++++++++++++++++++ test/runtests.jl | 4 +++ 4 files changed, 99 insertions(+) create mode 100644 .buildkite/pipeline.yml create mode 100644 test/gpu.jl diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml new file mode 100644 index 00000000..c9c58ccb --- /dev/null +++ b/.buildkite/pipeline.yml @@ -0,0 +1,58 @@ +steps: + - label: "CUDA" + plugins: + - JuliaCI/julia#v1: + version: "1.12" + agents: + queue: "juliagpu" + cuda: "*" + command: | + julia --color=yes --project=test -e 'using Pkg; Pkg.add("CUDA"); Pkg.develop(path="."); Pkg.instantiate()' + julia --color=yes --project=test -e 'include("test/runtests.jl")' + env: + TRIXIPARTICLES_TEST: cuda + timeout_in_minutes: 60 + + - label: "AMDGPU" + plugins: + - JuliaCI/julia#v1: + version: "1.12" + agents: + queue: "juliagpu" + rocm: "*" + command: | + julia --color=yes --project=test -e 'using Pkg; Pkg.add("AMDGPU"); Pkg.develop(path="."); Pkg.instantiate()' + julia --color=yes --project=test -e 'include("test/runtests.jl")' + env: + TRIXIPARTICLES_TEST: amdgpu + timeout_in_minutes: 60 + + - label: "Metal" + plugins: + - JuliaCI/julia#v1: + version: "1.12" + agents: + queue: "juliaecosystem" + os: "macos" + arch: "aarch64" + command: | + julia --color=yes --project=test -e 'using Pkg; Pkg.add("Metal"); Pkg.develop(path="."); Pkg.instantiate()' + julia --color=yes --project=test -e 'include("test/runtests.jl")' + env: + TRIXIPARTICLES_TEST: metal + timeout_in_minutes: 60 + + # Doesn't work. Fails with segfault. See https://github.com/trixi-framework/TrixiParticles.jl/issues/484. + # - label: "oneAPI" + # plugins: + # - JuliaCI/julia#v1: + # version: "1" + # agents: + # queue: "juliagpu" + # intel: "*" + # command: | + # julia --color=yes --project=test -e 'using Pkg; Pkg.add("oneAPI"); Pkg.develop(path="."); Pkg.instantiate()' + # julia --color=yes --project=test -e 'include("test/runtests.jl")' + # env: + # TRIXIPARTICLES_TEST: oneapi + # timeout_in_minutes: 60 diff --git a/.gitignore b/.gitignore index aebf3c39..df099421 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ docs/build docs/src/index.md docs/src/authors.md docs/src/license.md +docs/src/tutorials public/ coverage/ coverage_report/ diff --git a/test/gpu.jl b/test/gpu.jl new file mode 100644 index 00000000..830d863c --- /dev/null +++ b/test/gpu.jl @@ -0,0 +1,36 @@ +const POINTNEIGHBORS_TEST_ = lowercase(get(ENV, "POINTNEIGHBORS_TEST", "all")) + +if POINTNEIGHBORS_TEST_ == "cuda" + using CUDA + CUDA.versioninfo() + parallelization_backend = CUDABackend() + supports_double_precision = true + fp64_fastdiv = true +elseif POINTNEIGHBORS_TEST_ == "amdgpu" + using AMDGPU + AMDGPU.versioninfo() + parallelization_backend = ROCBackend() + supports_double_precision = true + fp64_fastdiv = false +elseif POINTNEIGHBORS_TEST_ == "metal" + using Metal + Metal.versioninfo() + parallelization_backend = MetalBackend() + supports_double_precision = false +elseif POINTNEIGHBORS_TEST_ == "oneapi" + using oneAPI + oneAPI.versioninfo() + parallelization_backend = oneAPIBackend() + # The runners are using an iGPU, which does not support double precision + supports_double_precision = false +else + error("Unknown GPU backend: $POINTNEIGHBORS_TEST_") +end + +@testset verbose=true "GPU tutorial" begin + @trixi_test_nowarn trixi_include(joinpath(@__DIR__, "..", "docs", "literate", "src", + "tut_gpu_usage.jl"), + backend = parallelization_backend) + @test n_neighbors_gpu isa PointNeighbors.AbstractGPUArray + @test extrema(n_neighbors_gpu) == (11, 29) +end diff --git a/test/runtests.jl b/test/runtests.jl index cc1f9f9b..0f9c072d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,4 +10,8 @@ const POINTNEIGHBORS_TEST = lowercase(get(ENV, "POINTNEIGHBORS_TEST", "all")) if POINTNEIGHBORS_TEST in ("all", "benchmarks") include("benchmarks.jl") end + + if POINTNEIGHBORS_TEST in ("cuda", "amdgpu", "metal", "oneapi") + include("gpu.jl") + end end; From 39981c3ee9e55a4278adf52154b6962eaf820374 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:48:51 +0200 Subject: [PATCH 08/15] Add trigger GPU tests GitHub actions workflow --- .github/workflows/TriggerGPUTests.yml | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 .github/workflows/TriggerGPUTests.yml diff --git a/.github/workflows/TriggerGPUTests.yml b/.github/workflows/TriggerGPUTests.yml new file mode 100644 index 00000000..e457f749 --- /dev/null +++ b/.github/workflows/TriggerGPUTests.yml @@ -0,0 +1,18 @@ +name: "Trigger GPU Tests" + +on: + issue_comment: + types: + - created + +jobs: + trigger-buildkite: + if: ${{ github.event.issue.pull_request && contains(github.event.comment.body, '/run-gpu-tests') }} + runs-on: ubuntu-latest + steps: + - name: Trigger Buildkite Pipeline + uses: "buildkite/trigger-pipeline-action@v2.4.1" + with: + buildkite_api_access_token: ${{ secrets.TRIGGER_BK_BUILD_TOKEN }} + pipeline: "julialang/pointneighbors" + branch: refs/pull/${{ github.event.issue.number }}/head From 5ce547cf25afd0641c0f8a97dc2a632052d38fee Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 18:14:12 +0200 Subject: [PATCH 09/15] Fix GPU tests --- .buildkite/pipeline.yml | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index c9c58ccb..20d17fe8 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -10,7 +10,7 @@ steps: julia --color=yes --project=test -e 'using Pkg; Pkg.add("CUDA"); Pkg.develop(path="."); Pkg.instantiate()' julia --color=yes --project=test -e 'include("test/runtests.jl")' env: - TRIXIPARTICLES_TEST: cuda + POINTNEIGHBORS_TEST: cuda timeout_in_minutes: 60 - label: "AMDGPU" @@ -24,7 +24,7 @@ steps: julia --color=yes --project=test -e 'using Pkg; Pkg.add("AMDGPU"); Pkg.develop(path="."); Pkg.instantiate()' julia --color=yes --project=test -e 'include("test/runtests.jl")' env: - TRIXIPARTICLES_TEST: amdgpu + POINTNEIGHBORS_TEST: amdgpu timeout_in_minutes: 60 - label: "Metal" @@ -39,20 +39,19 @@ steps: julia --color=yes --project=test -e 'using Pkg; Pkg.add("Metal"); Pkg.develop(path="."); Pkg.instantiate()' julia --color=yes --project=test -e 'include("test/runtests.jl")' env: - TRIXIPARTICLES_TEST: metal + POINTNEIGHBORS_TEST: metal timeout_in_minutes: 60 - # Doesn't work. Fails with segfault. See https://github.com/trixi-framework/TrixiParticles.jl/issues/484. - # - label: "oneAPI" - # plugins: - # - JuliaCI/julia#v1: - # version: "1" - # agents: - # queue: "juliagpu" - # intel: "*" - # command: | - # julia --color=yes --project=test -e 'using Pkg; Pkg.add("oneAPI"); Pkg.develop(path="."); Pkg.instantiate()' - # julia --color=yes --project=test -e 'include("test/runtests.jl")' - # env: - # TRIXIPARTICLES_TEST: oneapi - # timeout_in_minutes: 60 + - label: "oneAPI" + plugins: + - JuliaCI/julia#v1: + version: "1.12" + agents: + queue: "juliagpu" + intel: "*" + command: | + julia --color=yes --project=test -e 'using Pkg; Pkg.add("oneAPI"); Pkg.develop(path="."); Pkg.instantiate()' + julia --color=yes --project=test -e 'include("test/runtests.jl")' + env: + POINTNEIGHBORS_TEST: oneapi + timeout_in_minutes: 60 From 57ce69bd552c069bd1c9c98b560b398028b59af5 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 18:29:49 +0200 Subject: [PATCH 10/15] Fix tests --- test/test_util.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_util.jl b/test/test_util.jl index e305a05a..21a23563 100644 --- a/test/test_util.jl +++ b/test/test_util.jl @@ -1,7 +1,7 @@ # All `using` calls are in this file, so that one can run any test file # after running only this file. using Test: @test, @testset, @test_throws -using TrixiTest: @trixi_test_nowarn +using TrixiTest: @trixi_test_nowarn, trixi_include using PointNeighbors """ From 4aa18cc7e1f3f212b7152b1094b3dd36c7682fb6 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 18:33:15 +0200 Subject: [PATCH 11/15] Add missing dependencies --- test/Project.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/Project.toml b/test/Project.toml index f726b8a5..454f0763 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,7 @@ [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" @@ -7,7 +9,9 @@ TrixiParticles = "66699cd8-9c01-4e9d-a059-b96c86d16b3a" TrixiTest = "0a316866-cbd0-4425-8bcb-08103b2c1f26" [compat] +Adapt = "4" BenchmarkTools = "1" +KernelAbstractions = "0.9" Plots = "1" Test = "1" TrixiParticles = "0.4" From 2820607a774fb6e47200796d38f4213b43aae1ff Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 18:36:19 +0200 Subject: [PATCH 12/15] Make `ParallelUpdate` the default for `FullGridCellList` --- src/cell_lists/full_grid.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cell_lists/full_grid.jl b/src/cell_lists/full_grid.jl index 1edd165e..78c9eba3 100644 --- a/src/cell_lists/full_grid.jl +++ b/src/cell_lists/full_grid.jl @@ -37,7 +37,7 @@ end @inline Base.ndims(cell_list::FullGridCellList) = ndims(cell_list.linear_indices) function supported_update_strategies(::FullGridCellList{<:DynamicVectorOfVectors}) - return (ParallelIncrementalUpdate, ParallelUpdate, SemiParallelUpdate, + return (ParallelUpdate, ParallelIncrementalUpdate, SemiParallelUpdate, SerialIncrementalUpdate, SerialUpdate) end From 1151c74bcdb593f2c2610e7b250350d5a0efa770 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Mon, 13 Apr 2026 18:42:03 +0200 Subject: [PATCH 13/15] Fix unit tests --- test/nhs_grid.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/nhs_grid.jl b/test/nhs_grid.jl index 17ff2fa9..4b7110f1 100644 --- a/test/nhs_grid.jl +++ b/test/nhs_grid.jl @@ -34,7 +34,7 @@ @test copy.cell_list isa FullGridCellList @test copy.cell_list.cells isa PointNeighbors.DynamicVectorOfVectors - @test copy.update_strategy == ParallelIncrementalUpdate() + @test copy.update_strategy == ParallelUpdate() # Full grid cell list with `Vector{Vector}` backend nhs = GridNeighborhoodSearch{2}(cell_list = FullGridCellList(; min_corner, From a0be1e2201b70d5613d3bf5b3b6b12d132890597 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 15 Apr 2026 18:06:03 +0200 Subject: [PATCH 14/15] Add more tutorials --- docs/literate/src/tut_advanced_usage.jl | 46 +++++++++++++++ docs/literate/src/tut_gpu_usage.jl | 77 ++++++++++++++++++++++++- docs/make.jl | 5 +- src/nhs_trivial.jl | 7 +-- 4 files changed, 128 insertions(+), 7 deletions(-) create mode 100644 docs/literate/src/tut_advanced_usage.jl diff --git a/docs/literate/src/tut_advanced_usage.jl b/docs/literate/src/tut_advanced_usage.jl new file mode 100644 index 00000000..771a84ea --- /dev/null +++ b/docs/literate/src/tut_advanced_usage.jl @@ -0,0 +1,46 @@ +# # [Advanced Usage: Copying Neighborhood Searches](@id tut_advanced_usage) + +using PointNeighbors +# When working with multiple point-sets and therefore multiple neighborhood searches, +# it is often desirable to hide these details from the user-facing API. +# We would like the user to simply pass the type of neighborhood search they want to use, +# and the library should take care of creating the neighborhood searches internally. +# +# For exactly this purpose, PointNeighbors.jl provides the concept of a "template +# neighborhood search", which is a neighborhood search created without essential information +# like search radius or number of points, and the function +# [`copy_neighborhood_search`](@ref), which creates a copy of an existing neighborhood +# search or template, but with a (different) concrete configuration. +# +# For the simplest example, we can work with the [`TrivialNeighborhoodSearch`](@ref), +# which simply loops over all points, resulting in quadratic runtime for finding neighbors +# of a particle. +n_points = 1000 +search_radius = 1.0 +nhs = TrivialNeighborhoodSearch{2}(; search_radius, eachpoint = 1:n_points) +nothing # hide + +# This constructor requires knowledge of the search radius and size of the point set, +# which might differ between different point sets in the same simulation. +# Instead, we can create a template neighborhood search without this information: +template_nhs = TrivialNeighborhoodSearch{2}() +nothing # hide + +# This template can now be copied with different configurations for different point sets. +nhs1 = copy_neighborhood_search(template_nhs, search_radius, n_points) +nhs2 = copy_neighborhood_search(template_nhs, search_radius * 2, n_points * 2) +nothing # hide + +# The same concept can be applied to all neighborhood search implementations in +# PointNeighbors.jl. All templates can be copied with exactly the same function call. +template_nhs2 = GridNeighborhoodSearch{2}() +nhs3 = copy_neighborhood_search(template_nhs2, search_radius, n_points) +nothing # hide + +# Additional configuration options can be passed to the templates and will be preserved +# through the copying process. This applies for example to the periodic box or the +# update strategy of the [`GridNeighborhoodSearch`](@ref). +periodic_box = PeriodicBox(min_corner=(0.0, 0.0), max_corner=(10.0, 10.0)) +template_nhs3 = GridNeighborhoodSearch{2}(; periodic_box, update_strategy = SerialUpdate()) +nhs4 = copy_neighborhood_search(template_nhs3, search_radius, n_points) +nhs4.update_strategy, nhs4.periodic_box diff --git a/docs/literate/src/tut_gpu_usage.jl b/docs/literate/src/tut_gpu_usage.jl index f61670cb..078d1cbf 100644 --- a/docs/literate/src/tut_gpu_usage.jl +++ b/docs/literate/src/tut_gpu_usage.jl @@ -77,7 +77,8 @@ nothing # hide # We can now use the same function as in the [basic usage tutorial](@ref tut_basic_usage). # The parallelization backend is detected automatically from the type of the coordinates. -n_neighbors_gpu = adapt(backend, zeros(Int, n_points)) +n_neighbors = zeros(Int, n_points) +n_neighbors_gpu = adapt(backend, n_neighbors) function count_neighbors!(n_neighbors, coordinates, nhs) n_neighbors .= 0 @@ -98,3 +99,77 @@ nothing # hide # and GPU neighbor count array. count_neighbors!(n_neighbors_gpu, coordinates_gpu, nhs_gpu) extrema(n_neighbors_gpu) + +# ## Maximizing performance + +# In the example above, we already used the `FullGridCellList`, which is faster than the +# default `DictionaryCellList` on CPUs and required for GPU usage. +# We also used `@inbounds` inside the neighbor loop and `Float32` for all floating-point +# values to further increase performance on GPUs. + +# For many applications, we want to accumulate data over the neighbors, like the neighbor +# count in the example above. In this case, we can write a manual GPU kernel over the points +# and call [`foreach_neighbor`](@ref) inside this kernel to loop over the neighbors of each +# point. This allows us to accumulate the number of neighbors in a local variable and write +# it back to the global `n_neighbors` array only once per point, reducing the number of +# global memory accesses. +function count_neighbors2!(n_neighbors, coordinates, nhs) + n_neighbors .= 0 + + ## This is a serial loop! Make it a GPU kernel to run this on the GPU. + for point in axes(coordinates, 2) + n_neighbors_point = 0 + + @inbounds foreach_neighbor(coordinates, coordinates, nhs, + point) do point, neighbor, pos_diff, distance + n_neighbors_point += 1 + end + + @inbounds n_neighbors[point] = n_neighbors_point + end + return n_neighbors +end + +initialize!(nhs, coordinates, coordinates) +count_neighbors2!(n_neighbors, coordinates, nhs) +extrema(n_neighbors) + +# !!! note +# The function `count_neighbors2!` is using a serial loop over the points, not a GPU +# kernel. In order to run this on the GPU, refer to the documentation of +# KernelAbstractions.jl or your GPU package (e.g., CUDA.jl, AMDGPU.jl, Metal.jl) +# on how to write GPU kernels. +# +# While we already use `@inbounds` at `foreach_neighbor`, there are still bounds checks +# inside `foreach_neighbor` that are only safe to remove when we are sure that the +# neighborhood search has been initialized/updated with the same coordinates that we are +# passing to `foreach_neighbor`. If we can guarantee this, we can use +# [`foreach_neighbor_unsafe`](@ref) instead of `foreach_neighbor` to further increase +# performance by removing *all* bounds checks. +function count_neighbors3!(n_neighbors, coordinates, nhs) + n_neighbors .= 0 + + ## This is a serial loop! Make it a GPU kernel to run this on the GPU. + for point in axes(coordinates, 2) + n_neighbors_point = 0 + + ## WARNING: This is only safe if the NHS has been initialized/updated with + ## the same coordinates that we are passing to `foreach_neighbor_unsafe`. + foreach_neighbor_unsafe(coordinates, coordinates, nhs, + point) do point, neighbor, pos_diff, distance + n_neighbors_point += 1 + end + + @inbounds n_neighbors[point] = n_neighbors_point + end + return n_neighbors +end + +count_neighbors3!(n_neighbors, coordinates, nhs) +extrema(n_neighbors) + +# See the documentation of `foreach_neighbor_unsafe` for more information about when it is +# safe to use this function: +# ```@docs; canonical = false +# foreach_neighbor_unsafe +# ``` diff --git a/docs/make.jl b/docs/make.jl index fbd016d5..4e4905b9 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -49,6 +49,8 @@ Literate.markdown(joinpath(@__DIR__, "literate", "src", "tut_periodicity.jl"), joinpath(@__DIR__, "src", "tutorials")) Literate.markdown(joinpath(@__DIR__, "literate", "src", "tut_gpu_usage.jl"), joinpath(@__DIR__, "src", "tutorials")) +Literate.markdown(joinpath(@__DIR__, "literate", "src", "tut_advanced_usage.jl"), + joinpath(@__DIR__, "src", "tutorials")) # Make documentation makedocs(modules = [PointNeighbors], @@ -68,7 +70,8 @@ makedocs(modules = [PointNeighbors], "Basic Usage" => joinpath("tutorials", "tut_basic_usage.md"), "N-Body" => joinpath("tutorials", "tut_n_body.md"), "Periodicity" => joinpath("tutorials", "tut_periodicity.md"), - "GPU Usage" => joinpath("tutorials", "tut_gpu_usage.md") + "GPU Usage" => joinpath("tutorials", "tut_gpu_usage.md"), + "Advanced Usage" => joinpath("tutorials", "tut_advanced_usage.md") ], "API reference" => "reference.md", "Authors" => "authors.md", diff --git a/src/nhs_trivial.jl b/src/nhs_trivial.jl index c51d3f62..6ea80b52 100644 --- a/src/nhs_trivial.jl +++ b/src/nhs_trivial.jl @@ -54,11 +54,8 @@ end @inline eachneighbor(coords, search::TrivialNeighborhoodSearch) = search.eachpoint -# Create a copy of a neighborhood search but with a different search radius -function copy_neighborhood_search(nhs::TrivialNeighborhoodSearch, search_radius, x, y) - return nhs -end - +# Create a copy of a neighborhood search but with a different search radius and different +# number of points. function copy_neighborhood_search(nhs::TrivialNeighborhoodSearch, search_radius, n_points; eachpoint = 1:n_points) return TrivialNeighborhoodSearch{ndims(nhs)}(; search_radius, eachpoint, From 829990e17da70066ec5ef36d46fee4353b8946d3 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 15 Apr 2026 18:07:38 +0200 Subject: [PATCH 15/15] Reformat --- docs/literate/src/tut_advanced_usage.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/literate/src/tut_advanced_usage.jl b/docs/literate/src/tut_advanced_usage.jl index 771a84ea..a2b730b7 100644 --- a/docs/literate/src/tut_advanced_usage.jl +++ b/docs/literate/src/tut_advanced_usage.jl @@ -40,7 +40,7 @@ nothing # hide # Additional configuration options can be passed to the templates and will be preserved # through the copying process. This applies for example to the periodic box or the # update strategy of the [`GridNeighborhoodSearch`](@ref). -periodic_box = PeriodicBox(min_corner=(0.0, 0.0), max_corner=(10.0, 10.0)) +periodic_box = PeriodicBox(min_corner = (0.0, 0.0), max_corner = (10.0, 10.0)) template_nhs3 = GridNeighborhoodSearch{2}(; periodic_box, update_strategy = SerialUpdate()) nhs4 = copy_neighborhood_search(template_nhs3, search_radius, n_points) nhs4.update_strategy, nhs4.periodic_box