diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml new file mode 100644 index 00000000..20d17fe8 --- /dev/null +++ b/.buildkite/pipeline.yml @@ -0,0 +1,57 @@ +steps: + - label: "CUDA" + plugins: + - JuliaCI/julia#v1: + version: "1.12" + agents: + queue: "juliagpu" + cuda: "*" + command: | + julia --color=yes --project=test -e 'using Pkg; Pkg.add("CUDA"); Pkg.develop(path="."); Pkg.instantiate()' + julia --color=yes --project=test -e 'include("test/runtests.jl")' + env: + POINTNEIGHBORS_TEST: cuda + timeout_in_minutes: 60 + + - label: "AMDGPU" + plugins: + - JuliaCI/julia#v1: + version: "1.12" + agents: + queue: "juliagpu" + rocm: "*" + command: | + julia --color=yes --project=test -e 'using Pkg; Pkg.add("AMDGPU"); Pkg.develop(path="."); Pkg.instantiate()' + julia --color=yes --project=test -e 'include("test/runtests.jl")' + env: + POINTNEIGHBORS_TEST: amdgpu + timeout_in_minutes: 60 + + - label: "Metal" + plugins: + - JuliaCI/julia#v1: + version: "1.12" + agents: + queue: "juliaecosystem" + os: "macos" + arch: "aarch64" + command: | + julia --color=yes --project=test -e 'using Pkg; Pkg.add("Metal"); Pkg.develop(path="."); Pkg.instantiate()' + julia --color=yes --project=test -e 'include("test/runtests.jl")' + env: + POINTNEIGHBORS_TEST: metal + timeout_in_minutes: 60 + + - label: "oneAPI" + plugins: + - JuliaCI/julia#v1: + version: "1.12" + agents: + queue: "juliagpu" + intel: "*" + command: | + julia --color=yes --project=test -e 'using Pkg; Pkg.add("oneAPI"); Pkg.develop(path="."); Pkg.instantiate()' + julia --color=yes --project=test -e 'include("test/runtests.jl")' + env: + POINTNEIGHBORS_TEST: oneapi + timeout_in_minutes: 60 diff --git a/.github/workflows/TriggerGPUTests.yml b/.github/workflows/TriggerGPUTests.yml new file mode 100644 index 00000000..e457f749 --- /dev/null +++ b/.github/workflows/TriggerGPUTests.yml @@ -0,0 +1,18 @@ +name: "Trigger GPU Tests" + +on: + issue_comment: + types: + - created + +jobs: + trigger-buildkite: + if: ${{ github.event.issue.pull_request && contains(github.event.comment.body, '/run-gpu-tests') }} + runs-on: ubuntu-latest + steps: + - name: Trigger Buildkite Pipeline + uses: "buildkite/trigger-pipeline-action@v2.4.1" + with: + buildkite_api_access_token: ${{ secrets.TRIGGER_BK_BUILD_TOKEN }} + pipeline: "julialang/pointneighbors" + branch: refs/pull/${{ github.event.issue.number }}/head diff --git a/.gitignore b/.gitignore index aebf3c39..df099421 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ docs/build docs/src/index.md docs/src/authors.md docs/src/license.md +docs/src/tutorials public/ coverage/ coverage_report/ diff --git a/docs/Project.toml b/docs/Project.toml index 1814eb33..c3734416 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,11 @@ [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" +Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" [compat] +Adapt = "4" Documenter = "1" +KernelAbstractions = "0.9" +Literate = "2" diff --git a/docs/literate/src/tut_advanced_usage.jl b/docs/literate/src/tut_advanced_usage.jl new file mode 100644 index 00000000..a2b730b7 --- /dev/null +++ b/docs/literate/src/tut_advanced_usage.jl @@ -0,0 +1,46 @@ +# # [Advanced Usage: Copying Neighborhood Searches](@id tut_advanced_usage) + +using PointNeighbors +# When working with multiple point-sets and therefore multiple neighborhood searches, +# it is often desirable to hide these details from the user-facing API. +# We would like the user to simply pass the type of neighborhood search they want to use, +# and the library should take care of creating the neighborhood searches internally. +# +# For exactly this purpose, PointNeighbors.jl provides the concept of a "template +# neighborhood search", which is a neighborhood search created without essential information +# like search radius or number of points, and the function +# [`copy_neighborhood_search`](@ref), which creates a copy of an existing neighborhood +# search or template, but with a (different) concrete configuration. +# +# For the simplest example, we can work with the [`TrivialNeighborhoodSearch`](@ref), +# which simply loops over all points, resulting in quadratic runtime for finding neighbors +# of a particle. +n_points = 1000 +search_radius = 1.0 +nhs = TrivialNeighborhoodSearch{2}(; search_radius, eachpoint = 1:n_points) +nothing # hide + +# This constructor requires knowledge of the search radius and size of the point set, +# which might differ between different point sets in the same simulation. +# Instead, we can create a template neighborhood search without this information: +template_nhs = TrivialNeighborhoodSearch{2}() +nothing # hide + +# This template can now be copied with different configurations for different point sets. +nhs1 = copy_neighborhood_search(template_nhs, search_radius, n_points) +nhs2 = copy_neighborhood_search(template_nhs, search_radius * 2, n_points * 2) +nothing # hide + +# The same concept can be applied to all neighborhood search implementations in +# PointNeighbors.jl. All templates can be copied with exactly the same function call. +template_nhs2 = GridNeighborhoodSearch{2}() +nhs3 = copy_neighborhood_search(template_nhs2, search_radius, n_points) +nothing # hide + +# Additional configuration options can be passed to the templates and will be preserved +# through the copying process. This applies for example to the periodic box or the +# update strategy of the [`GridNeighborhoodSearch`](@ref). +periodic_box = PeriodicBox(min_corner = (0.0, 0.0), max_corner = (10.0, 10.0)) +template_nhs3 = GridNeighborhoodSearch{2}(; periodic_box, update_strategy = SerialUpdate()) +nhs4 = copy_neighborhood_search(template_nhs3, search_radius, n_points) +nhs4.update_strategy, nhs4.periodic_box diff --git a/docs/literate/src/tut_basic_usage.jl b/docs/literate/src/tut_basic_usage.jl new file mode 100644 index 00000000..92162e06 --- /dev/null +++ b/docs/literate/src/tut_basic_usage.jl @@ -0,0 +1,115 @@ +# # [Basic Usage: Fixed-Radius Neighbor Search](@id tut_basic_usage) + +# This tutorial shows the standard workflow for PointNeighbors.jl: +# create coordinates, configure a neighborhood search, initialize it, +# and loop over neighbors. +using PointNeighbors + +# ## Generate a regular 2D point cloud + +# We create a regular grid of points in 2D for this example, with distances of 1 +# between neighboring points. +# The coordinates need to be stored in a 2×N array, where N is the number of points. +n_points_per_dimension = (100, 100) +n_points = prod(n_points_per_dimension) +coordinates = Array{Float64}(undef, 2, n_points) +cartesian_indices = CartesianIndices(n_points_per_dimension) + +for i in axes(coordinates, 2) + coordinates[:, i] .= Tuple(cartesian_indices[i]) +end + +# ## Create and initialize the neighborhood search + +# We choose a search radius and create the neighborhood search. +# For the [`GridNeighborhoodSearch`](@ref), we need to pass `n_points` as a maximum +# number of points in the neighbor coordinates array, which is required to allocate +# the data structures for the update step. +search_radius = 3.0 +nhs = GridNeighborhoodSearch{2}(; search_radius, n_points) +nothing # hide + +# Initialize the neighborhood search with the coordinates. +# !!! warning +# This neighborhood search is now configured for the given coordinates. +# In general, it is only possible to use this neighborhood search to find neighbors +# for points contained in `coordinates` and not for arbitrary points in space. +# See below for more information. +initialize!(nhs, coordinates, coordinates) +nothing # hide + +# ## Count neighbors for each point + +# With the neighborhood search initialized, we can now loop over neighbors. +# We use the function [`foreach_point_neighbor`](@ref) to loop over all pairs of points +# and neighbors within the search radius. +# !!! warning +# The `foreach_point_neighbor` function is multithreaded over the points by default. +# Only remove `parallelization_backend = SerialBackend()` if you are sure that your code +# is thread-safe and that there are no race conditions. +n_neighbors = zeros(Int, n_points) + +function count_neighbors!(n_neighbors, coordinates, neighbor_coordinates, nhs) + n_neighbors .= 0 + + foreach_point_neighbor(coordinates, neighbor_coordinates, nhs, + parallelization_backend = SerialBackend()) do i, j, + pos_diff, distance + n_neighbors[i] += 1 + end + + return n_neighbors +end + +count_neighbors!(n_neighbors, coordinates, coordinates, nhs) + +# Interior particles have many neighbors, boundary particles fewer. +extrema(n_neighbors) + +# ## Different point sets for points and neighbors + +# The neighborhood search is currently configured to find neighbors in `coordinates` +# for points in `coordinates`. However, it is also possible to find neighbors in a +# different set of coordinates, e.g., `neighbor_coordinates`. +neighbor_coordinates = coordinates .+ 0.5 +nothing # hide + +# In order to find neighbors in `neighbor_coordinates`, we need to re-initialize +# the neighborhood search with `neighbor_coordinates` as the second argument. +initialize!(nhs, coordinates, neighbor_coordinates) +nothing # hide + +# Now the neighborhood search is configured to find neighbors in `neighbor_coordinates` +# for points in `coordinates`. +count_neighbors!(n_neighbors, coordinates, neighbor_coordinates, nhs) +extrema(n_neighbors) + +# ## Updating the neighborhood search + +# If the coordinates of either the points or the neighbors change, the neighborhood search +# needs to be updated with [`update!`](@ref). +# Depending on the neighborhood search implementation, this can be done more efficiently +# than re-initializing the neighborhood search. +# !!! warning +# An `update!` requires that the sizes of the point sets do not change. +# +# If we don't update the neighborhood search, we will get incorrect neighbors: +neighbor_coordinates .+= 10 +count_neighbors!(n_neighbors, coordinates, neighbor_coordinates, nhs) +extrema(n_neighbors) + +# After updating the neighborhood search, we get the correct new neighbors. +update!(nhs, coordinates, neighbor_coordinates) +count_neighbors!(n_neighbors, coordinates, neighbor_coordinates, nhs) +extrema(n_neighbors) + +# If the first coordinates are updated but the second coordinates are not, we generally +# also need to update the neighborhood search. +# For some neighborhood search implementations, notably the [`GridNeighborhoodSearch`](@ref), +# this is not necessary, since this implementation can find neighbors for arbitrary points in space. +# To check whether an update is necessary, we can call [`requires_update`](@ref): +requires_update(nhs) + +# The first `false` indicates that an update is not required when the first coordinates are +# updated, and the second `true` indicates that an update is required when the second +# coordinates are updated. diff --git a/docs/literate/src/tut_gpu_usage.jl b/docs/literate/src/tut_gpu_usage.jl new file mode 100644 index 00000000..078d1cbf --- /dev/null +++ b/docs/literate/src/tut_gpu_usage.jl @@ -0,0 +1,175 @@ +# # [GPU Usage](@id tut_gpu_usage) + +# This tutorial shows how to use PointNeighbors.jl on GPUs. +# We adapt the [basic usage tutorial](@ref tut_basic_usage) to run on GPUs. +using PointNeighbors +using Adapt + +# PointNeighbors.jl provides vendor-agnostic GPU support through KernelAbstractions.jl. +# Load the appropriate package for your GPU and define the corresponding backend: +# - `using CUDA; backend = CUDABackend()` for NVIDIA GPUs with CUDA.jl +# - `using AMDGPU; backend = ROCBackend()` for AMD GPUs with AMDGPU.jl +# - `using Metal; backend = MetalBackend()` for Apple Silicon GPUs with Metal.jl +# - `using oneAPI; backend = oneAPIBackend()` for Intel GPUs with oneAPI.jl +import KernelAbstractions # hide +backend = KernelAbstractions.CPU() # hide +nothing # hide + +# ## Create coordinates on the CPU + +# We create the same regular grid of points in 2D as in the basic usage tutorial. +# For better GPU performance, we use `Float32` instead of `Float64`. +# Note that some GPUs (notably Apple Silicon) do not support `Float64` at all. +n_points_per_dimension = (100, 100) +n_points = prod(n_points_per_dimension) +coordinates = Array{Float32}(undef, 2, n_points) +cartesian_indices = CartesianIndices(n_points_per_dimension) + +for i in axes(coordinates, 2) + coordinates[:, i] .= Tuple(cartesian_indices[i]) +end + +# We can use Adapt.jl to move this coordinates array to the GPU. +coordinates_gpu = adapt(backend, coordinates) +nothing # hide + +# ## Create and initialize the neighborhood search + +# After taking computing the difference between coordinates of neighboring particles, +# [`foreach_point_neighbor`](@ref) converts the result to the type of `search_radius` before +# computing the distance. The type of `search_radius` therefore determines the types of +# `pos_diff` and `distance` inside `foreach_point_neighbor`. We need to make sure to choose +# the data type that we want to use for our computations, which will usually be `Float32` +# when working on GPUs. +# +# Note that it is common in Smoothed Particle Hydrodynamics to store the coordinates +# in `Float64` and everything else in `Float32` to avoid large errors in the distance +# computations. This is also supported by PointNeighbors.jl. Using `Float64` coordinates +# with a `Float32` search radius limits `Float64` operations by converting to `Float32` +# after loading the point coordinates and computing the difference. +# `pos_diff` and `distance` inside `foreach_point_neighbor` will all use `Float32`. +search_radius = 3.0f0 +nothing # hide + +# For GPU compatibility, we need to use a [`FullGridCellList`](@ref), which is optimized +# for maximum performance, but requires a rectangular bounding box for the domain, +# as opposed to the default [`DictionaryCellList`](@ref) that supports potentially +# infinite domains. +min_corner = minimum(coordinates, dims = 2) +max_corner = maximum(coordinates, dims = 2) +cell_list = FullGridCellList(; search_radius, min_corner, max_corner) +nothing # hide + +# Now we can create the neighborhood search as usual. +nhs = GridNeighborhoodSearch{2}(; search_radius, cell_list, n_points) +nothing # hide + +# This neighborhood search is currently living in CPU memory, so we first need to move it +# to the GPU. We can also use Adapt.jl for this. +nhs_gpu = adapt(backend, nhs) +nothing # hide + +# From now on, everything happens on the GPU, so we need to use the GPU coordinates. +initialize!(nhs_gpu, coordinates_gpu, coordinates_gpu; parallelization_backend = backend) +nothing # hide + +# ## Count neighbors on the GPU + +# We can now use the same function as in the [basic usage tutorial](@ref tut_basic_usage). +# The parallelization backend is detected automatically from the type of the coordinates. +n_neighbors = zeros(Int, n_points) +n_neighbors_gpu = adapt(backend, n_neighbors) + +function count_neighbors!(n_neighbors, coordinates, nhs) + n_neighbors .= 0 + + foreach_point_neighbor(coordinates, coordinates, nhs) do i, j, pos_diff, distance + ## `foreach_point_neighbor` makes sure that `i` and `j` are in bounds + ## of the respective coordinates. + ## We are now inside a GPU kernel, so using `@inbounds` is important for performance. + @inbounds n_neighbors[i] += 1 + end + + return n_neighbors +end +nothing # hide + +# Now we can run the neighbor loop on the GPU. +# We just need to make sure to pass the GPU coordinates, GPU neighborhood search, +# and GPU neighbor count array. +count_neighbors!(n_neighbors_gpu, coordinates_gpu, nhs_gpu) +extrema(n_neighbors_gpu) + +# ## Maximizing performance + +# In the example above, we already used the `FullGridCellList`, which is faster than the +# default `DictionaryCellList` on CPUs and required for GPU usage. +# We also used `@inbounds` inside the neighbor loop and `Float32` for all floating-point +# values to further increase performance on GPUs. + +# For many applications, we want to accumulate data over the neighbors, like the neighbor +# count in the example above. In this case, we can write a manual GPU kernel over the points +# and call [`foreach_neighbor`](@ref) inside this kernel to loop over the neighbors of each +# point. This allows us to accumulate the number of neighbors in a local variable and write +# it back to the global `n_neighbors` array only once per point, reducing the number of +# global memory accesses. +function count_neighbors2!(n_neighbors, coordinates, nhs) + n_neighbors .= 0 + + ## This is a serial loop! Make it a GPU kernel to run this on the GPU. + for point in axes(coordinates, 2) + n_neighbors_point = 0 + + @inbounds foreach_neighbor(coordinates, coordinates, nhs, + point) do point, neighbor, pos_diff, distance + n_neighbors_point += 1 + end + + @inbounds n_neighbors[point] = n_neighbors_point + end + return n_neighbors +end + +initialize!(nhs, coordinates, coordinates) +count_neighbors2!(n_neighbors, coordinates, nhs) +extrema(n_neighbors) + +# !!! note +# The function `count_neighbors2!` is using a serial loop over the points, not a GPU +# kernel. In order to run this on the GPU, refer to the documentation of +# KernelAbstractions.jl or your GPU package (e.g., CUDA.jl, AMDGPU.jl, Metal.jl) +# on how to write GPU kernels. +# +# While we already use `@inbounds` at `foreach_neighbor`, there are still bounds checks +# inside `foreach_neighbor` that are only safe to remove when we are sure that the +# neighborhood search has been initialized/updated with the same coordinates that we are +# passing to `foreach_neighbor`. If we can guarantee this, we can use +# [`foreach_neighbor_unsafe`](@ref) instead of `foreach_neighbor` to further increase +# performance by removing *all* bounds checks. +function count_neighbors3!(n_neighbors, coordinates, nhs) + n_neighbors .= 0 + + ## This is a serial loop! Make it a GPU kernel to run this on the GPU. + for point in axes(coordinates, 2) + n_neighbors_point = 0 + + ## WARNING: This is only safe if the NHS has been initialized/updated with + ## the same coordinates that we are passing to `foreach_neighbor_unsafe`. + foreach_neighbor_unsafe(coordinates, coordinates, nhs, + point) do point, neighbor, pos_diff, distance + n_neighbors_point += 1 + end + + @inbounds n_neighbors[point] = n_neighbors_point + end + return n_neighbors +end + +count_neighbors3!(n_neighbors, coordinates, nhs) +extrema(n_neighbors) + +# See the documentation of `foreach_neighbor_unsafe` for more information about when it is +# safe to use this function: +# ```@docs; canonical = false +# foreach_neighbor_unsafe +# ``` diff --git a/docs/literate/src/tut_n_body.jl b/docs/literate/src/tut_n_body.jl new file mode 100644 index 00000000..469cd187 --- /dev/null +++ b/docs/literate/src/tut_n_body.jl @@ -0,0 +1,63 @@ +# # [N-Body Example with Cutoff Radius](@id tut_n_body) + +# This tutorial shows how to compute one right-hand-side evaluation of a simple +# n-body model with a cutoff radius using PointNeighbors.jl. +# It builds on the basic usage tutorial and reuses the same neighbor-loop pattern. +using PointNeighbors +using Random + +# ## Generate a simple 2D particle set + +# We use uniformly distributed points in 2D. As in all of PointNeighbors.jl, +# coordinates are stored in a 2×N array. +n_particles = 5_000 +coordinates = rand(2, n_particles) + +# The cutoff radius for pair interactions is the search radius. +search_radius = 0.04 +nothing # hide + +# Each particle gets a mass in [1e10, 2e10]. +mass = 1.0e10 .* (rand(n_particles) .+ 1) +G = Float32(6.6743e-11) +accelerations = similar(coordinates) +nothing # hide + +# ## Create and initialize the neighborhood search + +nhs = GridNeighborhoodSearch{2}(; search_radius, n_points = n_particles) +initialize!(nhs, coordinates, coordinates) +nothing # hide + +# ## Compute one acceleration update + +# Sum all pairwise contributions inside the cutoff radius. +# Note that this is a multithreaded loop when starting Julia with multiple threads. +# It is thread-safe because the threading happens over the particles, not the neighbors, +# and each thread only updates the acceleration of its own particle. +function compute_acceleration!(accelerations, coordinates, mass, G, nhs) + accelerations .= 0.0 + + foreach_point_neighbor(coordinates, coordinates, nhs) do i, j, pos_diff, distance + ## Skip self-interactions. Note that `return` only exits the closure, + ## i.e., skips the current neighbor. + distance < eps() && return + + ## `foreach_point_neighbor` makes sure that `i` and `j` are in bounds + ## of the respective coordinates. This is especially relevant on GPUs, + ## where bounds checking is more expensive. + dv = @inbounds -G * mass[j] * pos_diff / distance^3 + + for dim in axes(accelerations, 1) + @inbounds accelerations[dim, i] += dv[dim] + end + end + + return accelerations +end + +compute_acceleration!(accelerations, coordinates, mass, G, nhs) +nothing # hide + +# We now have one acceleration vector per particle. +size(accelerations) diff --git a/docs/literate/src/tut_periodicity.jl b/docs/literate/src/tut_periodicity.jl new file mode 100644 index 00000000..5b752a74 --- /dev/null +++ b/docs/literate/src/tut_periodicity.jl @@ -0,0 +1,58 @@ +# # [Periodic Domains with `PeriodicBox`](@id tut_periodicity) + +# This tutorial extends the basic setup by adding periodic boundary conditions. +using PointNeighbors + +# ## Generate a regular 2D point cloud +n_points_per_dimension = (100, 100) +n_points = prod(n_points_per_dimension) +coordinates = Array{Float64}(undef, 2, n_points) +cartesian_indices = CartesianIndices(n_points_per_dimension) + +for i in axes(coordinates, 2) + coordinates[:, i] .= Tuple(cartesian_indices[i]) +end + +# ## Configure a periodic domain + +# PointNeighbors.jl supports rectangular, axis-aligned periodic domains through the +# [`PeriodicBox`](@ref) type. +search_radius = 3.0 +min_corner = (0.0, 0.0) +max_corner = (100.0, 100.0) +periodic_box = PeriodicBox(; min_corner, max_corner) +nothing # hide + +# This periodic box can now be passed to the neighborhood search constructor. +nhs = GridNeighborhoodSearch{2}(; search_radius, periodic_box, n_points) +initialize!(nhs, coordinates, coordinates) +nothing # hide + +# ## Count neighbors in periodic and non-periodic setups + +# This function is the same as in the [basic usage tutorial](@ref tut_basic_usage) +# and counts the number of neighbors for each point. +# Note that this is a multithreaded loop when starting Julia with multiple threads. +# It is thread-safe because the threading happens over the points, not the neighbors, +# and each thread only updates the counter of its own point. +function count_neighbors!(n_neighbors, coordinates, nhs) + n_neighbors .= 0 + + foreach_point_neighbor(coordinates, coordinates, nhs) do i, j, pos_diff, distance + n_neighbors[i] += 1 + end + + return n_neighbors +end + +n_neighbors = zeros(Int, n_points) +count_neighbors!(n_neighbors, coordinates, nhs) +extrema(n_neighbors) + +# Since we already have a bounded domain, it makes sense to use a `FullGridCellList`, +# which is focused on maximum performance, but does not support infinite domains. +cell_list = FullGridCellList(; search_radius, min_corner, max_corner) +nhs2 = GridNeighborhoodSearch{2}(; search_radius, periodic_box, cell_list, n_points) +initialize!(nhs2, coordinates, coordinates) +count_neighbors!(n_neighbors, coordinates, nhs2) +extrema(n_neighbors) diff --git a/docs/make.jl b/docs/make.jl index 46872f7d..4e4905b9 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,4 +1,5 @@ using Documenter +using Literate: Literate # Get PointNeighbors.jl root directory trixibase_root_dir = dirname(@__DIR__) @@ -38,6 +39,19 @@ copy_file("LICENSE.md", "[AUTHORS.md](AUTHORS.md)" => "[Authors](@ref)", "\n" => "\n> ", r"^" => "# License\n\n> ") +mkpath(joinpath(@__DIR__, "src", "tutorials")) + +Literate.markdown(joinpath(@__DIR__, "literate", "src", "tut_basic_usage.jl"), + joinpath(@__DIR__, "src", "tutorials")) +Literate.markdown(joinpath(@__DIR__, "literate", "src", "tut_n_body.jl"), + joinpath(@__DIR__, "src", "tutorials")) +Literate.markdown(joinpath(@__DIR__, "literate", "src", "tut_periodicity.jl"), + joinpath(@__DIR__, "src", "tutorials")) +Literate.markdown(joinpath(@__DIR__, "literate", "src", "tut_gpu_usage.jl"), + joinpath(@__DIR__, "src", "tutorials")) +Literate.markdown(joinpath(@__DIR__, "literate", "src", "tut_advanced_usage.jl"), + joinpath(@__DIR__, "src", "tutorials")) + # Make documentation makedocs(modules = [PointNeighbors], sitename = "PointNeighbors.jl", @@ -46,10 +60,19 @@ makedocs(modules = [PointNeighbors], # Disable pretty URLs during manual testing prettyurls = get(ENV, "CI", nothing) == "true", # Set canonical URL to GitHub pages URL - canonical = "https://trixi-framework.github.io/PointNeighbors.jl/stable"), + canonical = "https://trixi-framework.github.io/PointNeighbors.jl/stable", + # Set edit_link explicitly to avoid `git remote show origin` lookups. + edit_link = "main"), # Explicitly specify documentation structure pages = [ "Home" => "index.md", + "Tutorials" => [ + "Basic Usage" => joinpath("tutorials", "tut_basic_usage.md"), + "N-Body" => joinpath("tutorials", "tut_n_body.md"), + "Periodicity" => joinpath("tutorials", "tut_periodicity.md"), + "GPU Usage" => joinpath("tutorials", "tut_gpu_usage.md"), + "Advanced Usage" => joinpath("tutorials", "tut_advanced_usage.md") + ], "API reference" => "reference.md", "Authors" => "authors.md", "License" => "license.md" diff --git a/src/cell_lists/full_grid.jl b/src/cell_lists/full_grid.jl index 1edd165e..78c9eba3 100644 --- a/src/cell_lists/full_grid.jl +++ b/src/cell_lists/full_grid.jl @@ -37,7 +37,7 @@ end @inline Base.ndims(cell_list::FullGridCellList) = ndims(cell_list.linear_indices) function supported_update_strategies(::FullGridCellList{<:DynamicVectorOfVectors}) - return (ParallelIncrementalUpdate, ParallelUpdate, SemiParallelUpdate, + return (ParallelUpdate, ParallelIncrementalUpdate, SemiParallelUpdate, SerialIncrementalUpdate, SerialUpdate) end diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl index 8e3c1ff2..255c178d 100644 --- a/src/nhs_grid.jl +++ b/src/nhs_grid.jl @@ -32,8 +32,9 @@ since not sorting makes our implementation a lot faster (although less paralleli with [`copy_neighborhood_search`](@ref). Note that the type of `search_radius` determines the type used for the distance computations. -- `n_points = 0`: Total number of points. The default of `0` is useful together - with [`copy_neighborhood_search`](@ref). +- `n_points = 0`: Maximum total number of points in the neighbor coordinates array. + The default of `0` is useful together with + [`copy_neighborhood_search`](@ref). - `periodic_box = nothing`: In order to use a (rectangular) periodic domain, pass a [`PeriodicBox`](@ref). - `cell_list`: The cell list that maps a cell index to a list of points inside @@ -91,6 +92,11 @@ function GridNeighborhoodSearch{NDIMS}(; search_radius = 0.0, n_points = 0, "$(supported_update_strategies(cell_list))")) end + if search_radius isa Integer + throw(ArgumentError("`search_radius` cannot be an integer type, since computed " * + "distances will be converted to this type")) + end + update_buffer = create_update_buffer(update_strategy, cell_list, n_points) if search_radius < eps() || isnothing(periodic_box) diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl index c7ddceca..055c3b45 100644 --- a/src/nhs_precomputed.jl +++ b/src/nhs_precomputed.jl @@ -98,6 +98,11 @@ function PrecomputedNeighborhoodSearch{NDIMS}(; search_radius = 0.0, n_points = transpose_backend = false, max_neighbors = max_neighbors(NDIMS), sort_neighbor_lists = true) where {NDIMS} + if search_radius isa Integer + throw(ArgumentError("`search_radius` cannot be an integer type, since computed " * + "distances will be converted to this type")) + end + neighbor_lists = construct_backend(backend, n_points, max_neighbors; transpose_backend) PrecomputedNeighborhoodSearch{NDIMS}(neighbor_lists, search_radius, diff --git a/src/nhs_trivial.jl b/src/nhs_trivial.jl index c97f0809..6ea80b52 100644 --- a/src/nhs_trivial.jl +++ b/src/nhs_trivial.jl @@ -25,6 +25,11 @@ struct TrivialNeighborhoodSearch{NDIMS, ELTYPE, EP, PB} <: AbstractNeighborhoodS function TrivialNeighborhoodSearch{NDIMS}(; search_radius = 0.0, eachpoint = 1:0, periodic_box = nothing) where {NDIMS} + if search_radius isa Integer + throw(ArgumentError("`search_radius` cannot be an integer type, since computed " * + "distances will be converted to this type")) + end + new{NDIMS, typeof(search_radius), typeof(eachpoint), typeof(periodic_box)}(search_radius, eachpoint, periodic_box) end @@ -49,11 +54,8 @@ end @inline eachneighbor(coords, search::TrivialNeighborhoodSearch) = search.eachpoint -# Create a copy of a neighborhood search but with a different search radius -function copy_neighborhood_search(nhs::TrivialNeighborhoodSearch, search_radius, x, y) - return nhs -end - +# Create a copy of a neighborhood search but with a different search radius and different +# number of points. function copy_neighborhood_search(nhs::TrivialNeighborhoodSearch, search_radius, n_points; eachpoint = 1:n_points) return TrivialNeighborhoodSearch{ndims(nhs)}(; search_radius, eachpoint, diff --git a/test/Project.toml b/test/Project.toml index f726b8a5..454f0763 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,7 @@ [deps] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" @@ -7,7 +9,9 @@ TrixiParticles = "66699cd8-9c01-4e9d-a059-b96c86d16b3a" TrixiTest = "0a316866-cbd0-4425-8bcb-08103b2c1f26" [compat] +Adapt = "4" BenchmarkTools = "1" +KernelAbstractions = "0.9" Plots = "1" Test = "1" TrixiParticles = "0.4" diff --git a/test/gpu.jl b/test/gpu.jl new file mode 100644 index 00000000..830d863c --- /dev/null +++ b/test/gpu.jl @@ -0,0 +1,36 @@ +const POINTNEIGHBORS_TEST_ = lowercase(get(ENV, "POINTNEIGHBORS_TEST", "all")) + +if POINTNEIGHBORS_TEST_ == "cuda" + using CUDA + CUDA.versioninfo() + parallelization_backend = CUDABackend() + supports_double_precision = true + fp64_fastdiv = true +elseif POINTNEIGHBORS_TEST_ == "amdgpu" + using AMDGPU + AMDGPU.versioninfo() + parallelization_backend = ROCBackend() + supports_double_precision = true + fp64_fastdiv = false +elseif POINTNEIGHBORS_TEST_ == "metal" + using Metal + Metal.versioninfo() + parallelization_backend = MetalBackend() + supports_double_precision = false +elseif POINTNEIGHBORS_TEST_ == "oneapi" + using oneAPI + oneAPI.versioninfo() + parallelization_backend = oneAPIBackend() + # The runners are using an iGPU, which does not support double precision + supports_double_precision = false +else + error("Unknown GPU backend: $POINTNEIGHBORS_TEST_") +end + +@testset verbose=true "GPU tutorial" begin + @trixi_test_nowarn trixi_include(joinpath(@__DIR__, "..", "docs", "literate", "src", + "tut_gpu_usage.jl"), + backend = parallelization_backend) + @test n_neighbors_gpu isa PointNeighbors.AbstractGPUArray + @test extrema(n_neighbors_gpu) == (11, 29) +end diff --git a/test/nhs_grid.jl b/test/nhs_grid.jl index 17ff2fa9..4b7110f1 100644 --- a/test/nhs_grid.jl +++ b/test/nhs_grid.jl @@ -34,7 +34,7 @@ @test copy.cell_list isa FullGridCellList @test copy.cell_list.cells isa PointNeighbors.DynamicVectorOfVectors - @test copy.update_strategy == ParallelIncrementalUpdate() + @test copy.update_strategy == ParallelUpdate() # Full grid cell list with `Vector{Vector}` backend nhs = GridNeighborhoodSearch{2}(cell_list = FullGridCellList(; min_corner, diff --git a/test/runtests.jl b/test/runtests.jl index cc1f9f9b..0f9c072d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,4 +10,8 @@ const POINTNEIGHBORS_TEST = lowercase(get(ENV, "POINTNEIGHBORS_TEST", "all")) if POINTNEIGHBORS_TEST in ("all", "benchmarks") include("benchmarks.jl") end + + if POINTNEIGHBORS_TEST in ("cuda", "amdgpu", "metal", "oneapi") + include("gpu.jl") + end end; diff --git a/test/test_util.jl b/test/test_util.jl index e305a05a..21a23563 100644 --- a/test/test_util.jl +++ b/test/test_util.jl @@ -1,7 +1,7 @@ # All `using` calls are in this file, so that one can run any test file # after running only this file. using Test: @test, @testset, @test_throws -using TrixiTest: @trixi_test_nowarn +using TrixiTest: @trixi_test_nowarn, trixi_include using PointNeighbors """