From 7ceb1a9266f1884296baa2e9bfb1d854f07d1dca Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 12 Apr 2026 17:10:16 +0000 Subject: [PATCH 01/10] Deduplicate compute_number_of_edges CUB/Thrust kernels via explicit instantiation Move edge_partition_device_view_t::compute_number_of_edges* and compute_local_degrees* method bodies from inline definitions in edge_partition_device_view.cuh to a separate _impl.cuh file, with explicit template instantiations in dedicated .cu translation units. This prevents 40+ TUs from each compiling their own copy of the CUB DeviceReduce and Thrust transform_reduce kernels for these methods. Two prims headers that required rare cuda::transform_iterator types (using lambdas) had those lambdas replaced with named functors so the iterator types can be explicitly instantiated without leaking the heavy _impl.cuh back into broadly-included headers. Binary size reduction vs baseline: libcugraph.so: 475 MB -> 459 MB (-16 MB, -3.4%) libcugraph_mg.so: 576 MB -> 520 MB (-56 MB, -9.7%) Combined: 1051 MB -> 979 MB (-72 MB, -6.9%) --- cpp/CMakeLists.txt | 4 + .../compute_number_of_edges_functors.cuh | 38 ++ .../detail/compute_number_of_edges_impl.cuh | 528 ++++++++++++++++++ .../cugraph/edge_partition_device_view.cuh | 455 +-------------- .../extract_transform_if_v_frontier_e.cuh | 8 +- ...m_reduce_dst_key_aggregated_outgoing_e.cuh | 15 +- .../compute_number_of_edges_mg_v32_e32.cu | 57 ++ .../compute_number_of_edges_mg_v64_e64.cu | 57 ++ .../compute_number_of_edges_sg_v32_e32.cu | 42 ++ .../compute_number_of_edges_sg_v64_e64.cu | 42 ++ .../traversal/od_shortest_distances_impl.cuh | 1 + 11 files changed, 792 insertions(+), 455 deletions(-) create mode 100644 cpp/include/cugraph/detail/compute_number_of_edges_functors.cuh create mode 100644 cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh create mode 100644 cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu create mode 100644 cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu create mode 100644 cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu create mode 100644 cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6836f47fa4a..9dd67f8537e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -260,6 +260,8 @@ set(CUGRAPH_SG_SOURCES src/structure/graph_sg_v32_e32.cu src/structure/graph_view_sg_v64_e64.cu src/structure/graph_view_sg_v32_e32.cu + src/structure/compute_number_of_edges_sg_v64_e64.cu + src/structure/compute_number_of_edges_sg_v32_e32.cu src/structure/decompress_to_edgelist_sg_v64_e64.cu src/structure/decompress_to_edgelist_sg_v32_e32.cu src/structure/symmetrize_graph_sg_v64_e64.cu @@ -354,6 +356,8 @@ set(CUGRAPH_MG_SOURCES src/community/detail/maximal_independent_moves_mg_v32_e32.cu src/structure/graph_view_mg_v64_e64.cu src/structure/graph_view_mg_v32_e32.cu + src/structure/compute_number_of_edges_mg_v64_e64.cu + src/structure/compute_number_of_edges_mg_v32_e32.cu src/link_prediction/jaccard_mg_v64_e64.cu src/link_prediction/jaccard_mg_v32_e32.cu src/link_prediction/sorensen_mg_v64_e64.cu diff --git a/cpp/include/cugraph/detail/compute_number_of_edges_functors.cuh b/cpp/include/cugraph/detail/compute_number_of_edges_functors.cuh new file mode 100644 index 00000000000..04bbe07117b --- /dev/null +++ b/cpp/include/cugraph/detail/compute_number_of_edges_functors.cuh @@ -0,0 +1,38 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include + +namespace cugraph { +namespace detail { + +// Converts a uint32_t bitmap offset to a vertex_t by adding range_first. +// Used in the bitmap code path of extract_transform_if_v_frontier_e. +template +struct bitmap_offset_to_vertex_op_t { + vertex_t range_first; + __device__ vertex_t operator()(uint32_t v_offset) const + { + return range_first + static_cast(v_offset); + } +}; + +// Maps a linear index to either a sparse-range vertex or a hypersparse DCS vertex. +// Used in per_v_transform_reduce_dst_key_aggregated_outgoing_e. +template +struct sparse_hypersparse_major_op_t { + vertex_t major_sparse_range_size; + vertex_t major_range_first; + vertex_t const* dcs_nzd_vertices; + __device__ vertex_t operator()(vertex_t i) const + { + if (i < major_sparse_range_size) { return major_range_first + i; } + return *(dcs_nzd_vertices + (i - major_sparse_range_size)); + } +}; + +} // namespace detail +} // namespace cugraph diff --git a/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh b/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh new file mode 100644 index 00000000000..59eb168e421 --- /dev/null +++ b/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh @@ -0,0 +1,528 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cugraph { + +// ============================================================================ +// MG specialization: out-of-line definitions +// ============================================================================ + +template +template +__host__ size_t +edge_partition_device_view_t>:: + compute_number_of_edges(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const +{ + if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; + return dcs_nzd_vertices_ ? thrust::transform_reduce( + rmm::exec_policy(stream), + major_first, + major_last, + detail::local_degree_op_t{ + this->offsets_, + major_range_first_, + *dcs_nzd_vertices_, + *major_hypersparse_first_}, + size_t{0}, + cuda::std::plus()) + : thrust::transform_reduce( + rmm::exec_policy(stream), + major_first, + major_last, + detail::local_degree_op_t{ + this->offsets_, + major_range_first_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */}, + size_t{0}, + cuda::std::plus()); +} + +template +template +__host__ void +edge_partition_device_view_t>:: + compute_number_of_edges_async(MajorIterator major_first, + MajorIterator major_last, + raft::device_span count, + rmm::cuda_stream_view stream) const +{ + if (cuda::std::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + if (dcs_nzd_vertices_) { + auto local_degree_first = cuda::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + } else { + auto local_degree_first = cuda::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + } +} + +template +__host__ rmm::device_uvector +edge_partition_device_view_t>:: + compute_local_degrees(rmm::cuda_stream_view stream) const +{ + rmm::device_uvector local_degrees(this->major_range_size(), stream); + if (dcs_nzd_vertices_) { + assert(major_hypersparse_first_); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(this->major_range_first()), + thrust::make_counting_iterator(this->major_range_last()), + local_degrees.begin(), + detail::local_degree_op_t{ + this->offsets_, + major_range_first_, + *dcs_nzd_vertices_, + major_hypersparse_first_.value_or(vertex_t{0})}); + } else { + thrust::transform( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(this->major_range_first()), + thrust::make_counting_iterator(this->major_range_last()), + local_degrees.begin(), + detail::local_degree_op_t{ + this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); + } + return local_degrees; +} + +template +template +__host__ rmm::device_uvector +edge_partition_device_view_t>:: + compute_local_degrees(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const +{ + rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); + if (dcs_nzd_vertices_) { + assert(major_hypersparse_first_); + thrust::transform(rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_op_t{ + this->offsets_, + major_range_first_, + dcs_nzd_vertices_.value(), + major_hypersparse_first_.value_or(vertex_t{0})}); + } else { + thrust::transform( + rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_op_t{ + this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); + } + return local_degrees; +} + +template +template +__host__ size_t +edge_partition_device_view_t>:: + compute_number_of_edges_with_mask(MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const +{ + if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; + return dcs_nzd_vertices_ + ? thrust::transform_reduce( + rmm::exec_policy(stream), + major_first, + major_last, + detail::local_degree_with_mask_op_t{ + this->offsets_, + major_range_first_, + *dcs_nzd_vertices_, + *major_hypersparse_first_, + mask_first}, + size_t{0}, + cuda::std::plus()) + : thrust::transform_reduce( + rmm::exec_policy(stream), + major_first, + major_last, + detail::local_degree_with_mask_op_t{ + this->offsets_, + major_range_first_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}, + size_t{0}, + cuda::std::plus()); +} + +template +template +__host__ void +edge_partition_device_view_t>:: + compute_number_of_edges_with_mask_async(MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + raft::device_span count, + rmm::cuda_stream_view stream) const +{ + if (cuda::std::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + if (dcs_nzd_vertices_) { + auto local_degree_first = cuda::make_transform_iterator( + major_first, + detail::local_degree_with_mask_op_t{ + this->offsets_, + major_range_first_, + *dcs_nzd_vertices_, + *major_hypersparse_first_, + mask_first}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + } else { + auto local_degree_first = cuda::make_transform_iterator( + major_first, + detail::local_degree_with_mask_op_t{ + this->offsets_, + major_range_first_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + } +} + +template +template +__host__ rmm::device_uvector +edge_partition_device_view_t>:: + compute_local_degrees_with_mask(MaskIterator mask_first, rmm::cuda_stream_view stream) const +{ + rmm::device_uvector local_degrees(this->major_range_size(), stream); + if (dcs_nzd_vertices_) { + assert(major_hypersparse_first_); + thrust::transform( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(this->major_range_first()), + thrust::make_counting_iterator(this->major_range_last()), + local_degrees.begin(), + detail:: + local_degree_with_mask_op_t{ + this->offsets_, + major_range_first_, + *dcs_nzd_vertices_, + major_hypersparse_first_.value_or(vertex_t{0}), + mask_first}); + } else { + thrust::transform( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(this->major_range_first()), + thrust::make_counting_iterator(this->major_range_last()), + local_degrees.begin(), + detail:: + local_degree_with_mask_op_t{ + this->offsets_, + major_range_first_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}); + } + return local_degrees; +} + +template +template +__host__ rmm::device_uvector +edge_partition_device_view_t>:: + compute_local_degrees_with_mask(MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const +{ + rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); + if (dcs_nzd_vertices_) { + assert(major_hypersparse_first_); + thrust::transform( + rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_with_mask_op_t{ + this->offsets_, + major_range_first_, + dcs_nzd_vertices_.value(), + major_hypersparse_first_.value_or(vertex_t{0}), + mask_first}); + } else { + thrust::transform( + rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail:: + local_degree_with_mask_op_t{ + this->offsets_, + major_range_first_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}); + } + return local_degrees; +} + +// ============================================================================ +// SG specialization: out-of-line definitions +// ============================================================================ + +template +template +__host__ size_t +edge_partition_device_view_t>:: + compute_number_of_edges(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const +{ + if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; + return thrust::transform_reduce( + rmm::exec_policy(stream), + major_first, + major_last, + detail::local_degree_op_t{ + this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */}, + size_t{0}, + cuda::std::plus()); +} + +template +template +__host__ void +edge_partition_device_view_t>:: + compute_number_of_edges_async(MajorIterator major_first, + MajorIterator major_last, + raft::device_span count, + rmm::cuda_stream_view stream) const +{ + if (cuda::std::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + auto local_degree_first = cuda::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); +} + +template +__host__ rmm::device_uvector +edge_partition_device_view_t>:: + compute_local_degrees(rmm::cuda_stream_view stream) const +{ + rmm::device_uvector local_degrees(this->major_range_size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(this->major_range_first()), + thrust::make_counting_iterator(this->major_range_last()), + local_degrees.begin(), + detail::local_degree_op_t{ + this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */}); + return local_degrees; +} + +template +template +__host__ rmm::device_uvector +edge_partition_device_view_t>:: + compute_local_degrees(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const +{ + rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); + thrust::transform( + rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_op_t{ + this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */}); + return local_degrees; +} + +template +template +__host__ size_t +edge_partition_device_view_t>:: + compute_number_of_edges_with_mask(MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const +{ + if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; + return thrust::transform_reduce( + rmm::exec_policy(stream), + major_first, + major_last, + detail::local_degree_with_mask_op_t{ + this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}, + size_t{0}, + cuda::std::plus()); +} + +template +template +__host__ rmm::device_uvector +edge_partition_device_view_t>:: + compute_local_degrees_with_mask(MaskIterator mask_first, rmm::cuda_stream_view stream) const +{ + rmm::device_uvector local_degrees(this->major_range_size(), stream); + thrust::transform( + rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(this->major_range_first()), + thrust::make_counting_iterator(this->major_range_last()), + local_degrees.begin(), + detail::local_degree_with_mask_op_t{ + this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}); + return local_degrees; +} + +template +template +__host__ rmm::device_uvector +edge_partition_device_view_t>:: + compute_local_degrees_with_mask(MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const +{ + rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); + thrust::transform( + rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_with_mask_op_t{ + this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}); + return local_degrees; +} + +} // namespace cugraph diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 827c0c3a09b..2517c60d346 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -207,198 +207,26 @@ class edge_partition_device_view_t __host__ size_t compute_number_of_edges(MajorIterator major_first, MajorIterator major_last, - rmm::cuda_stream_view stream) const - { - if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; - return dcs_nzd_vertices_ ? thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - true>{this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - *major_hypersparse_first_}, - size_t{0}, - cuda::std::plus()) - : thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - false>{this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}, - size_t{0}, - cuda::std::plus()); - } + rmm::cuda_stream_view stream) const; template __host__ void compute_number_of_edges_async(MajorIterator major_first, MajorIterator major_last, raft::device_span count /* size = 1 */, - rmm::cuda_stream_view stream) const - { - if (cuda::std::distance(major_first, major_last) == 0) { - RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); - } - - rmm::device_uvector d_tmp_storage(0, stream); - size_t tmp_storage_bytes{0}; - - if (dcs_nzd_vertices_) { - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_op_t{ - this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - } else { - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_op_t{ - this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - } - } + rmm::cuda_stream_view stream) const; - __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const - { - rmm::device_uvector local_degrees(this->major_range_size(), stream); - if (dcs_nzd_vertices_) { - assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - major_hypersparse_first_.value_or(vertex_t{0})}); - } else { - thrust::transform( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); - } - return local_degrees; - } + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const; template __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, MajorIterator major_last, - rmm::cuda_stream_view stream) const - { - rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); - if (dcs_nzd_vertices_) { - assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - dcs_nzd_vertices_.value(), - major_hypersparse_first_.value_or(vertex_t{0})}); - } else { - thrust::transform( - rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); - } - return local_degrees; - } + rmm::cuda_stream_view stream) const; template __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first, MajorIterator major_first, MajorIterator major_last, - rmm::cuda_stream_view stream) const - { - if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; - return dcs_nzd_vertices_ ? thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_with_mask_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - true, - MaskIterator>{this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - *major_hypersparse_first_, - mask_first}, - size_t{0}, - cuda::std::plus()) - : thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_with_mask_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - false, - MaskIterator>{this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}, - size_t{0}, - cuda::std::plus()); - } + rmm::cuda_stream_view stream) const; template __host__ void compute_number_of_edges_with_mask_async( @@ -406,146 +234,18 @@ class edge_partition_device_view_t count /* size = 1 */, - rmm::cuda_stream_view stream) const - { - if (cuda::std::distance(major_first, major_last) == 0) { - RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); - } - - rmm::device_uvector d_tmp_storage(0, stream); - size_t tmp_storage_bytes{0}; - - if (dcs_nzd_vertices_) { - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_with_mask_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - true, - MaskIterator>{this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - *major_hypersparse_first_, - mask_first}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - } else { - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_with_mask_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - false, - MaskIterator>{this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - } - } + rmm::cuda_stream_view stream) const; template __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, rmm::cuda_stream_view stream) const - { - rmm::device_uvector local_degrees(this->major_range_size(), stream); - if (dcs_nzd_vertices_) { - assert(major_hypersparse_first_); - thrust::transform( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail:: - local_degree_with_mask_op_t{ - this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - major_hypersparse_first_.value_or(vertex_t{0}), - mask_first}); - } else { - thrust::transform( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail:: - local_degree_with_mask_op_t{ - this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); - } - return local_degrees; - } + MaskIterator mask_first, rmm::cuda_stream_view stream) const; template __host__ rmm::device_uvector compute_local_degrees_with_mask( MaskIterator mask_first, MajorIterator major_first, MajorIterator major_last, - rmm::cuda_stream_view stream) const - { - rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); - if (dcs_nzd_vertices_) { - assert(major_hypersparse_first_); - thrust::transform( - rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail:: - local_degree_with_mask_op_t{ - this->offsets_, - major_range_first_, - dcs_nzd_vertices_.value(), - major_hypersparse_first_.value_or(vertex_t{0}), - mask_first}); - } else { - thrust::transform( - rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail:: - local_degree_with_mask_op_t{ - this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); - } - return local_degrees; - } + rmm::cuda_stream_view stream) const; __host__ __device__ vertex_t major_value_start_offset() const { @@ -694,162 +394,37 @@ class edge_partition_device_view_t __host__ size_t compute_number_of_edges(MajorIterator major_first, MajorIterator major_last, - rmm::cuda_stream_view stream) const - { - if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; - return thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_op_t{this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}, - size_t{0}, - cuda::std::plus()); - } + rmm::cuda_stream_view stream) const; template __host__ void compute_number_of_edges_async(MajorIterator major_first, MajorIterator major_last, raft::device_span count /* size = 1 */, - rmm::cuda_stream_view stream) const - { - if (cuda::std::distance(major_first, major_last) == 0) { - RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); - } + rmm::cuda_stream_view stream) const; - rmm::device_uvector d_tmp_storage(0, stream); - size_t tmp_storage_bytes{0}; - - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_op_t{this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - } - - __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const - { - rmm::device_uvector local_degrees(this->major_range_size(), stream); - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}); - return local_degrees; - } + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const; template __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, MajorIterator major_last, - rmm::cuda_stream_view stream) const - { - rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); - thrust::transform(rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}); - return local_degrees; - } + rmm::cuda_stream_view stream) const; template __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first, MajorIterator major_first, MajorIterator major_last, - rmm::cuda_stream_view stream) const - { - if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; - return thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_with_mask_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - false, - MaskIterator>{this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}, - size_t{0}, - cuda::std::plus()); - } + rmm::cuda_stream_view stream) const; template __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, rmm::cuda_stream_view stream) const - { - rmm::device_uvector local_degrees(this->major_range_size(), stream); - thrust::transform( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_with_mask_op_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); - return local_degrees; - } + MaskIterator mask_first, rmm::cuda_stream_view stream) const; template __host__ rmm::device_uvector compute_local_degrees_with_mask( MaskIterator mask_first, MajorIterator major_first, MajorIterator major_last, - rmm::cuda_stream_view stream) const - { - rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); - thrust::transform( - rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_with_mask_op_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); - return local_degrees; - } + rmm::cuda_stream_view stream) const; __host__ __device__ vertex_t major_value_start_offset() const { return vertex_t{0}; } diff --git a/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh b/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh index 38d90f01466..282c77a02a3 100644 --- a/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh +++ b/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh @@ -5,6 +5,7 @@ #pragma once +#include #include #include #include @@ -1429,11 +1430,8 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle, if (keys.index() == 0) { auto major_first = cuda::make_transform_iterator( std::get<0>(keys).begin(), - cuda::proclaim_return_type( - [range_first = - local_frontier_range_firsts[partition_idx]] __device__(uint32_t v_offset) { - return range_first + static_cast(v_offset); - })); + detail::bitmap_offset_to_vertex_op_t{ + local_frontier_range_firsts[partition_idx]}); if (edge_partition_e_mask) { edge_partition.compute_number_of_edges_with_mask_async( edge_partition_e_mask->value_first(), diff --git a/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh index afb918cecf3..e569466133c 100644 --- a/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh +++ b/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include #include @@ -363,16 +364,10 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( major_sparse_range_size + *(edge_partition.dcs_nzd_vertex_count()), handle.get_stream()); auto major_first = cuda::make_transform_iterator( thrust::make_counting_iterator(vertex_t{0}), - cuda::proclaim_return_type( - [major_sparse_range_size, - major_range_first = edge_partition.major_range_first(), - dcs_nzd_vertices = *(edge_partition.dcs_nzd_vertices())] __device__(vertex_t i) { - if (i < major_sparse_range_size) { // sparse - return major_range_first + i; - } else { // hypersparse - return *(dcs_nzd_vertices + (i - major_sparse_range_size)); - } - })); + detail::sparse_hypersparse_major_op_t{ + major_sparse_range_size, + edge_partition.major_range_first(), + *(edge_partition.dcs_nzd_vertices())}); degrees_with_mask = edge_partition.compute_local_degrees_with_mask((*edge_partition_e_mask).value_first(), major_first, diff --git a/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu b/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu new file mode 100644 index 00000000000..30ecd943c1f --- /dev/null +++ b/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu @@ -0,0 +1,57 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#include +#include + +namespace cugraph { + +using view_t = edge_partition_device_view_t; + +// compute_local_degrees (non-templated) +template rmm::device_uvector view_t::compute_local_degrees(rmm::cuda_stream_view) const; + +// compute_number_of_edges +template size_t view_t::compute_number_of_edges(int32_t*, int32_t*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges(int32_t const*, int32_t const*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// compute_number_of_edges_async +template void view_t::compute_number_of_edges_async(int32_t*, int32_t*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_async(int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_async>(thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; + +// compute_local_degrees (MajorIterator) +template rmm::device_uvector view_t::compute_local_degrees(int32_t*, int32_t*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees(int32_t const*, int32_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// compute_number_of_edges_with_mask +template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int32_t*, int32_t*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// compute_number_of_edges_with_mask_async (MG only) +template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t*, int32_t*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; + +// compute_local_degrees_with_mask (MaskIterator only) +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; + +// compute_local_degrees_with_mask (MaskIterator + MajorIterator) +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int32_t*, int32_t*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// bitmap iterator: cuda::transform_iterator, uint32_t const*> +using bitmap_iter_32_t = cuda::transform_iterator, uint32_t const*>; +template void view_t::compute_number_of_edges_async(bitmap_iter_32_t, bitmap_iter_32_t, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, bitmap_iter_32_t, bitmap_iter_32_t, raft::device_span, rmm::cuda_stream_view) const; + +// sparse-hypersparse iterator: cuda::transform_iterator, counting_iterator> +using sh_iter_32_t = cuda::transform_iterator, thrust::counting_iterator>; +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, sh_iter_32_t, sh_iter_32_t, rmm::cuda_stream_view) const; + +} // namespace cugraph diff --git a/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu b/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu new file mode 100644 index 00000000000..751698da6e4 --- /dev/null +++ b/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu @@ -0,0 +1,57 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#include +#include + +namespace cugraph { + +using view_t = edge_partition_device_view_t; + +// compute_local_degrees (non-templated) +template rmm::device_uvector view_t::compute_local_degrees(rmm::cuda_stream_view) const; + +// compute_number_of_edges +template size_t view_t::compute_number_of_edges(int64_t*, int64_t*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges(int64_t const*, int64_t const*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// compute_number_of_edges_async +template void view_t::compute_number_of_edges_async(int64_t*, int64_t*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_async(int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_async>(thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; + +// compute_local_degrees (MajorIterator) +template rmm::device_uvector view_t::compute_local_degrees(int64_t*, int64_t*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees(int64_t const*, int64_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// compute_number_of_edges_with_mask +template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int64_t*, int64_t*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// compute_number_of_edges_with_mask_async (MG only) +template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t*, int64_t*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; + +// compute_local_degrees_with_mask (MaskIterator only) +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; + +// compute_local_degrees_with_mask (MaskIterator + MajorIterator) +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int64_t*, int64_t*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// bitmap iterator: cuda::transform_iterator, uint32_t const*> +using bitmap_iter_64_t = cuda::transform_iterator, uint32_t const*>; +template void view_t::compute_number_of_edges_async(bitmap_iter_64_t, bitmap_iter_64_t, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, bitmap_iter_64_t, bitmap_iter_64_t, raft::device_span, rmm::cuda_stream_view) const; + +// sparse-hypersparse iterator: cuda::transform_iterator, counting_iterator> +using sh_iter_64_t = cuda::transform_iterator, thrust::counting_iterator>; +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, sh_iter_64_t, sh_iter_64_t, rmm::cuda_stream_view) const; + +} // namespace cugraph diff --git a/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu b/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu new file mode 100644 index 00000000000..04ba4eca8f3 --- /dev/null +++ b/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#include + +namespace cugraph { + +using view_t = edge_partition_device_view_t; + +// compute_local_degrees (non-templated) +template rmm::device_uvector view_t::compute_local_degrees(rmm::cuda_stream_view) const; + +// compute_number_of_edges +template size_t view_t::compute_number_of_edges(int32_t*, int32_t*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges(int32_t const*, int32_t const*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// compute_number_of_edges_async +template void view_t::compute_number_of_edges_async(int32_t*, int32_t*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_async(int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_async>(thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; + +// compute_local_degrees (MajorIterator) +template rmm::device_uvector view_t::compute_local_degrees(int32_t*, int32_t*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees(int32_t const*, int32_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// compute_number_of_edges_with_mask +template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int32_t*, int32_t*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// compute_local_degrees_with_mask (MaskIterator only) +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; + +// compute_local_degrees_with_mask (MaskIterator + MajorIterator) +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int32_t*, int32_t*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +} // namespace cugraph diff --git a/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu b/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu new file mode 100644 index 00000000000..87105dec124 --- /dev/null +++ b/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu @@ -0,0 +1,42 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#include + +namespace cugraph { + +using view_t = edge_partition_device_view_t; + +// compute_local_degrees (non-templated) +template rmm::device_uvector view_t::compute_local_degrees(rmm::cuda_stream_view) const; + +// compute_number_of_edges +template size_t view_t::compute_number_of_edges(int64_t*, int64_t*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges(int64_t const*, int64_t const*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// compute_number_of_edges_async +template void view_t::compute_number_of_edges_async(int64_t*, int64_t*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_async(int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_async>(thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; + +// compute_local_degrees (MajorIterator) +template rmm::device_uvector view_t::compute_local_degrees(int64_t*, int64_t*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees(int64_t const*, int64_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// compute_number_of_edges_with_mask +template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int64_t*, int64_t*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; +template size_t view_t::compute_number_of_edges_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +// compute_local_degrees_with_mask (MaskIterator only) +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; + +// compute_local_degrees_with_mask (MaskIterator + MajorIterator) +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int64_t*, int64_t*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; + +} // namespace cugraph diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index 49af2bee8a9..270a509dae0 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include #include From 3874803b72cf839a5bcc2cc821f4372a67661e38 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 12 Apr 2026 20:35:10 +0000 Subject: [PATCH 02/10] Unify sync/async compute_number_of_edges by delegating sync to async via inline wrappers --- .../detail/compute_number_of_edges_impl.cuh | 182 +++--------------- .../cugraph/edge_partition_device_view.cuh | 68 +++++-- .../compute_number_of_edges_mg_v32_e32.cu | 15 +- .../compute_number_of_edges_mg_v64_e64.cu | 15 +- .../compute_number_of_edges_sg_v32_e32.cu | 16 +- .../compute_number_of_edges_sg_v64_e64.cu | 16 +- 6 files changed, 88 insertions(+), 224 deletions(-) diff --git a/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh b/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh index 59eb168e421..2c4d7cf6b9d 100644 --- a/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh +++ b/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh @@ -16,7 +16,6 @@ #include #include #include -#include namespace cugraph { @@ -24,39 +23,6 @@ namespace cugraph { // MG specialization: out-of-line definitions // ============================================================================ -template -template -__host__ size_t -edge_partition_device_view_t>:: - compute_number_of_edges(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const -{ - if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; - return dcs_nzd_vertices_ ? thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - *major_hypersparse_first_}, - size_t{0}, - cuda::std::plus()) - : thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}, - size_t{0}, - cuda::std::plus()); -} - template template __host__ void @@ -112,35 +78,6 @@ edge_partition_device_view_t -__host__ rmm::device_uvector -edge_partition_device_view_t>:: - compute_local_degrees(rmm::cuda_stream_view stream) const -{ - rmm::device_uvector local_degrees(this->major_range_size(), stream); - if (dcs_nzd_vertices_) { - assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - major_hypersparse_first_.value_or(vertex_t{0})}); - } else { - thrust::transform( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); - } - return local_degrees; -} - template template __host__ rmm::device_uvector @@ -173,43 +110,6 @@ edge_partition_device_view_t -template -__host__ size_t -edge_partition_device_view_t>:: - compute_number_of_edges_with_mask(MaskIterator mask_first, - MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const -{ - if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; - return dcs_nzd_vertices_ - ? thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_with_mask_op_t{ - this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - *major_hypersparse_first_, - mask_first}, - size_t{0}, - cuda::std::plus()) - : thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_with_mask_op_t{ - this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}, - size_t{0}, - cuda::std::plus()); -} - template template __host__ void @@ -356,28 +256,6 @@ edge_partition_device_view_t -template -__host__ size_t -edge_partition_device_view_t>:: - compute_number_of_edges(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const -{ - if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; - return thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_op_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}, - size_t{0}, - cuda::std::plus()); -} - template template __host__ void @@ -389,6 +267,7 @@ edge_partition_device_view_t d_tmp_storage(0, stream); @@ -416,24 +295,6 @@ edge_partition_device_view_t -__host__ rmm::device_uvector -edge_partition_device_view_t>:: - compute_local_degrees(rmm::cuda_stream_view stream) const -{ - rmm::device_uvector local_degrees(this->major_range_size(), stream); - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}); - return local_degrees; -} - template template __host__ rmm::device_uvector @@ -458,26 +319,43 @@ edge_partition_device_view_t template -__host__ size_t +__host__ void edge_partition_device_view_t>:: - compute_number_of_edges_with_mask(MaskIterator mask_first, - MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + compute_number_of_edges_with_mask_async(MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + raft::device_span count, + rmm::cuda_stream_view stream) const { - if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; - return thrust::transform_reduce( - rmm::exec_policy(stream), + if (cuda::std::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + return; + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + auto local_degree_first = cuda::make_transform_iterator( major_first, - major_last, detail::local_degree_with_mask_op_t{ this->offsets_, std::byte{0} /* dummy */, std::byte{0} /* dummy */, std::byte{0} /* dummy */, - mask_first}, - size_t{0}, - cuda::std::plus()); + mask_first}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); } template diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 2517c60d346..8e691f4b8a3 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -22,7 +23,6 @@ #include #include #include -#include #include #include @@ -204,30 +204,28 @@ class edge_partition_device_view_t - __host__ size_t compute_number_of_edges(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const; - template __host__ void compute_number_of_edges_async(MajorIterator major_first, MajorIterator major_last, raft::device_span count /* size = 1 */, rmm::cuda_stream_view stream) const; - __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const; + template + __host__ size_t compute_number_of_edges(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async( + major_first, major_last, raft::device_span(count.data(), 1), stream); + return count.value(stream); + } template __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, MajorIterator major_last, rmm::cuda_stream_view stream) const; - template - __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first, - MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const; - template __host__ void compute_number_of_edges_with_mask_async( MaskIterator mask_first, @@ -236,6 +234,18 @@ class edge_partition_device_view_t count /* size = 1 */, rmm::cuda_stream_view stream) const; + template + __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_with_mask_async( + mask_first, major_first, major_last, raft::device_span(count.data(), 1), stream); + return count.value(stream); + } + template __host__ rmm::device_uvector compute_local_degrees_with_mask( MaskIterator mask_first, rmm::cuda_stream_view stream) const; @@ -391,29 +401,47 @@ class edge_partition_device_view_t - __host__ size_t compute_number_of_edges(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const; - template __host__ void compute_number_of_edges_async(MajorIterator major_first, MajorIterator major_last, raft::device_span count /* size = 1 */, rmm::cuda_stream_view stream) const; - __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const; + template + __host__ size_t compute_number_of_edges(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async( + major_first, major_last, raft::device_span(count.data(), 1), stream); + return count.value(stream); + } template __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, MajorIterator major_last, rmm::cuda_stream_view stream) const; + template + __host__ void compute_number_of_edges_with_mask_async( + MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream) const; + template __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first, MajorIterator major_first, MajorIterator major_last, - rmm::cuda_stream_view stream) const; + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_with_mask_async( + mask_first, major_first, major_last, raft::device_span(count.data(), 1), stream); + return count.value(stream); + } template __host__ rmm::device_uvector compute_local_degrees_with_mask( diff --git a/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu b/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu index 30ecd943c1f..e501bf78e71 100644 --- a/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu +++ b/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu @@ -9,14 +9,6 @@ namespace cugraph { using view_t = edge_partition_device_view_t; -// compute_local_degrees (non-templated) -template rmm::device_uvector view_t::compute_local_degrees(rmm::cuda_stream_view) const; - -// compute_number_of_edges -template size_t view_t::compute_number_of_edges(int32_t*, int32_t*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges(int32_t const*, int32_t const*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; - // compute_number_of_edges_async template void view_t::compute_number_of_edges_async(int32_t*, int32_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_async(int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; @@ -27,12 +19,7 @@ template rmm::device_uvector view_t::compute_local_degrees(in template rmm::device_uvector view_t::compute_local_degrees(int32_t const*, int32_t const*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; -// compute_number_of_edges_with_mask -template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int32_t*, int32_t*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; - -// compute_number_of_edges_with_mask_async (MG only) +// compute_number_of_edges_with_mask_async template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t*, int32_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; diff --git a/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu b/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu index 751698da6e4..80859cce88a 100644 --- a/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu +++ b/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu @@ -9,14 +9,6 @@ namespace cugraph { using view_t = edge_partition_device_view_t; -// compute_local_degrees (non-templated) -template rmm::device_uvector view_t::compute_local_degrees(rmm::cuda_stream_view) const; - -// compute_number_of_edges -template size_t view_t::compute_number_of_edges(int64_t*, int64_t*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges(int64_t const*, int64_t const*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; - // compute_number_of_edges_async template void view_t::compute_number_of_edges_async(int64_t*, int64_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_async(int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; @@ -27,12 +19,7 @@ template rmm::device_uvector view_t::compute_local_degrees(in template rmm::device_uvector view_t::compute_local_degrees(int64_t const*, int64_t const*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; -// compute_number_of_edges_with_mask -template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int64_t*, int64_t*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; - -// compute_number_of_edges_with_mask_async (MG only) +// compute_number_of_edges_with_mask_async template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t*, int64_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; diff --git a/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu b/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu index 04ba4eca8f3..8ea02b4c638 100644 --- a/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu +++ b/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu @@ -8,14 +8,6 @@ namespace cugraph { using view_t = edge_partition_device_view_t; -// compute_local_degrees (non-templated) -template rmm::device_uvector view_t::compute_local_degrees(rmm::cuda_stream_view) const; - -// compute_number_of_edges -template size_t view_t::compute_number_of_edges(int32_t*, int32_t*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges(int32_t const*, int32_t const*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; - // compute_number_of_edges_async template void view_t::compute_number_of_edges_async(int32_t*, int32_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_async(int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; @@ -26,10 +18,10 @@ template rmm::device_uvector view_t::compute_local_degrees(in template rmm::device_uvector view_t::compute_local_degrees(int32_t const*, int32_t const*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; -// compute_number_of_edges_with_mask -template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int32_t*, int32_t*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; +// compute_number_of_edges_with_mask_async +template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t*, int32_t*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator only) template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; diff --git a/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu b/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu index 87105dec124..3c73a46ad84 100644 --- a/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu +++ b/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu @@ -8,14 +8,6 @@ namespace cugraph { using view_t = edge_partition_device_view_t; -// compute_local_degrees (non-templated) -template rmm::device_uvector view_t::compute_local_degrees(rmm::cuda_stream_view) const; - -// compute_number_of_edges -template size_t view_t::compute_number_of_edges(int64_t*, int64_t*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges(int64_t const*, int64_t const*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; - // compute_number_of_edges_async template void view_t::compute_number_of_edges_async(int64_t*, int64_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_async(int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; @@ -26,10 +18,10 @@ template rmm::device_uvector view_t::compute_local_degrees(in template rmm::device_uvector view_t::compute_local_degrees(int64_t const*, int64_t const*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; -// compute_number_of_edges_with_mask -template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int64_t*, int64_t*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges_with_mask(uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; -template size_t view_t::compute_number_of_edges_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; +// compute_number_of_edges_with_mask_async +template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t*, int64_t*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator only) template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; From 318a71e58c3c222e3193f1b8b517237630a0de6a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 13 Apr 2026 05:35:44 +0000 Subject: [PATCH 03/10] Unify mask/no-mask variants via nullptr sentinel, merge local_degree_op_t and local_degree_with_mask_op_t into single functor --- .../detail/compute_number_of_edges_impl.cuh | 167 +----------------- .../cugraph/edge_partition_device_view.cuh | 150 ++++++++-------- .../compute_number_of_edges_mg_v32_e32.cu | 15 +- .../compute_number_of_edges_mg_v64_e64.cu | 15 +- .../compute_number_of_edges_sg_v32_e32.cu | 10 -- .../compute_number_of_edges_sg_v64_e64.cu | 10 -- 6 files changed, 84 insertions(+), 283 deletions(-) diff --git a/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh b/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh index 2c4d7cf6b9d..ad4e5e8b0aa 100644 --- a/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh +++ b/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh @@ -14,7 +14,6 @@ #include #include -#include #include namespace cugraph { @@ -23,93 +22,6 @@ namespace cugraph { // MG specialization: out-of-line definitions // ============================================================================ -template -template -__host__ void -edge_partition_device_view_t>:: - compute_number_of_edges_async(MajorIterator major_first, - MajorIterator major_last, - raft::device_span count, - rmm::cuda_stream_view stream) const -{ - if (cuda::std::distance(major_first, major_last) == 0) { - RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); - } - - rmm::device_uvector d_tmp_storage(0, stream); - size_t tmp_storage_bytes{0}; - - if (dcs_nzd_vertices_) { - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_op_t{ - this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - } else { - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_op_t{ - this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - } -} - -template -template -__host__ rmm::device_uvector -edge_partition_device_view_t>:: - compute_local_degrees(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const -{ - rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); - if (dcs_nzd_vertices_) { - assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - dcs_nzd_vertices_.value(), - major_hypersparse_first_.value_or(vertex_t{0})}); - } else { - thrust::transform( - rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); - } - return local_degrees; -} - template template __host__ void @@ -130,7 +42,7 @@ edge_partition_device_view_t{ + detail::local_degree_op_t{ this->offsets_, major_range_first_, *dcs_nzd_vertices_, @@ -152,7 +64,7 @@ edge_partition_device_view_t{ + detail::local_degree_op_t{ this->offsets_, major_range_first_, std::byte{0} /* dummy */, @@ -189,7 +101,7 @@ edge_partition_device_view_tmajor_range_last()), local_degrees.begin(), detail:: - local_degree_with_mask_op_t{ + local_degree_op_t{ this->offsets_, major_range_first_, *dcs_nzd_vertices_, @@ -202,7 +114,7 @@ edge_partition_device_view_tmajor_range_last()), local_degrees.begin(), detail:: - local_degree_with_mask_op_t{ + local_degree_op_t{ this->offsets_, major_range_first_, std::byte{0} /* dummy */, @@ -229,7 +141,7 @@ edge_partition_device_view_t{ + detail::local_degree_op_t{ this->offsets_, major_range_first_, dcs_nzd_vertices_.value(), @@ -242,7 +154,7 @@ edge_partition_device_view_t{ + local_degree_op_t{ this->offsets_, major_range_first_, std::byte{0} /* dummy */, @@ -256,67 +168,6 @@ edge_partition_device_view_t -template -__host__ void -edge_partition_device_view_t>:: - compute_number_of_edges_async(MajorIterator major_first, - MajorIterator major_last, - raft::device_span count, - rmm::cuda_stream_view stream) const -{ - if (cuda::std::distance(major_first, major_last) == 0) { - RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); - return; - } - - rmm::device_uvector d_tmp_storage(0, stream); - size_t tmp_storage_bytes{0}; - - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_op_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); -} - -template -template -__host__ rmm::device_uvector -edge_partition_device_view_t>:: - compute_local_degrees(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const -{ - rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); - thrust::transform( - rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}); - return local_degrees; -} - template template __host__ void @@ -337,7 +188,7 @@ edge_partition_device_view_t{ + detail::local_degree_op_t{ this->offsets_, std::byte{0} /* dummy */, std::byte{0} /* dummy */, @@ -370,7 +221,7 @@ edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), - detail::local_degree_with_mask_op_t{ + detail::local_degree_op_t{ this->offsets_, std::byte{0} /* dummy */, std::byte{0} /* dummy */, @@ -394,7 +245,7 @@ edge_partition_device_view_t{ + detail::local_degree_op_t{ this->offsets_, std::byte{0} /* dummy */, std::byte{0} /* dummy */, diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 8e691f4b8a3..34a3ed420f8 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -47,50 +47,13 @@ __device__ cuda::std::optional major_hypersparse_idx_from_major_nochec : cuda::std::nullopt; } -template -struct local_degree_op_t { - raft::device_span offsets{}; - std::conditional_t major_range_first{}; - - std::conditional_t, std::byte /* dummy */> - dcs_nzd_vertices{}; - std::conditional_t major_hypersparse_first{}; - - __device__ return_type_t operator()(vertex_t major) const - { - if constexpr (multi_gpu) { - vertex_t idx{}; - if constexpr (use_dcs) { - if (major < major_hypersparse_first) { - idx = major - major_range_first; - return static_cast(offsets[idx + 1] - offsets[idx]); - } else { - auto major_hypersparse_idx = - major_hypersparse_idx_from_major_nocheck_impl(dcs_nzd_vertices, major); - if (major_hypersparse_idx) { - idx = (major_hypersparse_first - major_range_first) + *major_hypersparse_idx; - return static_cast(offsets[idx + 1] - offsets[idx]); - } else { - return return_type_t{0}; - } - } - } else { - idx = major - major_range_first; - return static_cast(offsets[idx + 1] - offsets[idx]); - } - } else { - return static_cast(offsets[major + 1] - offsets[major]); - } - } -}; - template -struct local_degree_with_mask_op_t { + typename MaskIterator = uint32_t const*> +struct local_degree_op_t { raft::device_span offsets{}; std::conditional_t major_range_first{}; @@ -100,6 +63,15 @@ struct local_degree_with_mask_op_t { MaskIterator mask_first{}; + __device__ return_type_t compute_degree(edge_t offset, edge_t degree) const + { + if (mask_first) { + return static_cast(count_set_bits(mask_first, offset, degree)); + } else { + return static_cast(degree); + } + } + __device__ return_type_t operator()(vertex_t major) const { if constexpr (multi_gpu) { @@ -107,27 +79,23 @@ struct local_degree_with_mask_op_t { if constexpr (use_dcs) { if (major < major_hypersparse_first) { idx = major - major_range_first; - return static_cast( - count_set_bits(mask_first, offsets[idx], offsets[idx + 1] - offsets[idx])); + return compute_degree(offsets[idx], offsets[idx + 1] - offsets[idx]); } else { auto major_hypersparse_idx = major_hypersparse_idx_from_major_nocheck_impl(dcs_nzd_vertices, major); if (major_hypersparse_idx) { idx = (major_hypersparse_first - major_range_first) + *major_hypersparse_idx; - return static_cast( - count_set_bits(mask_first, offsets[idx], offsets[idx + 1] - offsets[idx])); + return compute_degree(offsets[idx], offsets[idx + 1] - offsets[idx]); } else { return return_type_t{0}; } } } else { idx = major - major_range_first; - return static_cast( - count_set_bits(mask_first, offsets[idx], offsets[idx + 1] - offsets[idx])); + return compute_degree(offsets[idx], offsets[idx + 1] - offsets[idx]); } } else { - return static_cast( - count_set_bits(mask_first, offsets[major], offsets[major + 1] - offsets[major])); + return compute_degree(offsets[major], offsets[major + 1] - offsets[major]); } } }; @@ -204,11 +172,23 @@ class edge_partition_device_view_t + __host__ void compute_number_of_edges_with_mask_async( + MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream) const; + template __host__ void compute_number_of_edges_async(MajorIterator major_first, MajorIterator major_last, raft::device_span count /* size = 1 */, - rmm::cuda_stream_view stream) const; + rmm::cuda_stream_view stream) const + { + compute_number_of_edges_with_mask_async( + static_cast(nullptr), major_first, major_last, count, stream); + } template __host__ size_t compute_number_of_edges(MajorIterator major_first, @@ -216,24 +196,15 @@ class edge_partition_device_view_t count(size_t{0}, stream); - compute_number_of_edges_async( - major_first, major_last, raft::device_span(count.data(), 1), stream); + compute_number_of_edges_with_mask_async( + static_cast(nullptr), + major_first, + major_last, + raft::device_span(count.data(), 1), + stream); return count.value(stream); } - template - __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const; - - template - __host__ void compute_number_of_edges_with_mask_async( - MaskIterator mask_first, - MajorIterator major_first, - MajorIterator major_last, - raft::device_span count /* size = 1 */, - rmm::cuda_stream_view stream) const; - template __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first, MajorIterator major_first, @@ -257,6 +228,15 @@ class edge_partition_device_view_t + __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + return compute_local_degrees_with_mask( + static_cast(nullptr), major_first, major_last, stream); + } + __host__ __device__ vertex_t major_value_start_offset() const { return major_value_start_offset_; @@ -401,11 +381,23 @@ class edge_partition_device_view_t + __host__ void compute_number_of_edges_with_mask_async( + MaskIterator mask_first, + MajorIterator major_first, + MajorIterator major_last, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream) const; + template __host__ void compute_number_of_edges_async(MajorIterator major_first, MajorIterator major_last, raft::device_span count /* size = 1 */, - rmm::cuda_stream_view stream) const; + rmm::cuda_stream_view stream) const + { + compute_number_of_edges_with_mask_async( + static_cast(nullptr), major_first, major_last, count, stream); + } template __host__ size_t compute_number_of_edges(MajorIterator major_first, @@ -413,24 +405,15 @@ class edge_partition_device_view_t count(size_t{0}, stream); - compute_number_of_edges_async( - major_first, major_last, raft::device_span(count.data(), 1), stream); + compute_number_of_edges_with_mask_async( + static_cast(nullptr), + major_first, + major_last, + raft::device_span(count.data(), 1), + stream); return count.value(stream); } - template - __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const; - - template - __host__ void compute_number_of_edges_with_mask_async( - MaskIterator mask_first, - MajorIterator major_first, - MajorIterator major_last, - raft::device_span count /* size = 1 */, - rmm::cuda_stream_view stream) const; - template __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first, MajorIterator major_first, @@ -454,6 +437,15 @@ class edge_partition_device_view_t + __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + return compute_local_degrees_with_mask( + static_cast(nullptr), major_first, major_last, stream); + } + __host__ __device__ vertex_t major_value_start_offset() const { return vertex_t{0}; } __host__ __device__ cuda::std::optional major_hypersparse_first() const noexcept diff --git a/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu b/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu index e501bf78e71..888d48aaba7 100644 --- a/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu +++ b/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu @@ -9,16 +9,6 @@ namespace cugraph { using view_t = edge_partition_device_view_t; -// compute_number_of_edges_async -template void view_t::compute_number_of_edges_async(int32_t*, int32_t*, raft::device_span, rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_async(int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_async>(thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; - -// compute_local_degrees (MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees(int32_t*, int32_t*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees(int32_t const*, int32_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; - // compute_number_of_edges_with_mask_async template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t*, int32_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; @@ -32,12 +22,11 @@ template rmm::device_uvector view_t::compute_local_degrees_with_mask view_t::compute_local_degrees_with_mask(uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; -// bitmap iterator: cuda::transform_iterator, uint32_t const*> +// bitmap iterator using bitmap_iter_32_t = cuda::transform_iterator, uint32_t const*>; -template void view_t::compute_number_of_edges_async(bitmap_iter_32_t, bitmap_iter_32_t, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, bitmap_iter_32_t, bitmap_iter_32_t, raft::device_span, rmm::cuda_stream_view) const; -// sparse-hypersparse iterator: cuda::transform_iterator, counting_iterator> +// sparse-hypersparse iterator using sh_iter_32_t = cuda::transform_iterator, thrust::counting_iterator>; template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, sh_iter_32_t, sh_iter_32_t, rmm::cuda_stream_view) const; diff --git a/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu b/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu index 80859cce88a..0f79846f3b0 100644 --- a/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu +++ b/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu @@ -9,16 +9,6 @@ namespace cugraph { using view_t = edge_partition_device_view_t; -// compute_number_of_edges_async -template void view_t::compute_number_of_edges_async(int64_t*, int64_t*, raft::device_span, rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_async(int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_async>(thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; - -// compute_local_degrees (MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees(int64_t*, int64_t*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees(int64_t const*, int64_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; - // compute_number_of_edges_with_mask_async template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t*, int64_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; @@ -32,12 +22,11 @@ template rmm::device_uvector view_t::compute_local_degrees_with_mask view_t::compute_local_degrees_with_mask(uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; -// bitmap iterator: cuda::transform_iterator, uint32_t const*> +// bitmap iterator using bitmap_iter_64_t = cuda::transform_iterator, uint32_t const*>; -template void view_t::compute_number_of_edges_async(bitmap_iter_64_t, bitmap_iter_64_t, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, bitmap_iter_64_t, bitmap_iter_64_t, raft::device_span, rmm::cuda_stream_view) const; -// sparse-hypersparse iterator: cuda::transform_iterator, counting_iterator> +// sparse-hypersparse iterator using sh_iter_64_t = cuda::transform_iterator, thrust::counting_iterator>; template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, sh_iter_64_t, sh_iter_64_t, rmm::cuda_stream_view) const; diff --git a/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu b/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu index 8ea02b4c638..a3c4397af3f 100644 --- a/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu +++ b/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu @@ -8,16 +8,6 @@ namespace cugraph { using view_t = edge_partition_device_view_t; -// compute_number_of_edges_async -template void view_t::compute_number_of_edges_async(int32_t*, int32_t*, raft::device_span, rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_async(int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_async>(thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; - -// compute_local_degrees (MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees(int32_t*, int32_t*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees(int32_t const*, int32_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; - // compute_number_of_edges_with_mask_async template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t*, int32_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; diff --git a/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu b/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu index 3c73a46ad84..046b25b8f12 100644 --- a/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu +++ b/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu @@ -8,16 +8,6 @@ namespace cugraph { using view_t = edge_partition_device_view_t; -// compute_number_of_edges_async -template void view_t::compute_number_of_edges_async(int64_t*, int64_t*, raft::device_span, rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_async(int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_async>(thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; - -// compute_local_degrees (MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees(int64_t*, int64_t*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees(int64_t const*, int64_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees>(thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; - // compute_number_of_edges_with_mask_async template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t*, int64_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; From 8df34cf277d421741e9b22bbf5e1a8a82c1a717a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 13 Apr 2026 12:41:40 +0000 Subject: [PATCH 04/10] Deduplicate vertex_t* / vertex_t const* instantiations via SFINAE const-cast overloads --- .../cugraph/edge_partition_device_view.cuh | 60 +++++++++++++++++++ .../compute_number_of_edges_mg_v32_e32.cu | 2 - .../compute_number_of_edges_mg_v64_e64.cu | 2 - .../compute_number_of_edges_sg_v32_e32.cu | 2 - .../compute_number_of_edges_sg_v64_e64.cu | 2 - 5 files changed, 60 insertions(+), 8 deletions(-) diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 34a3ed420f8..903a4ccb3bb 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -180,6 +180,22 @@ class edge_partition_device_view_t count /* size = 1 */, rmm::cuda_stream_view stream) const; + template , int> = 0> + __host__ void compute_number_of_edges_with_mask_async( + MaskIterator mask_first, + T* major_first, + T* major_last, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + compute_number_of_edges_with_mask_async( + mask_first, + static_cast(major_first), + static_cast(major_last), + count, + stream); + } + template __host__ void compute_number_of_edges_async(MajorIterator major_first, MajorIterator major_last, @@ -228,6 +244,20 @@ class edge_partition_device_view_t, int> = 0> + __host__ rmm::device_uvector compute_local_degrees_with_mask( + MaskIterator mask_first, + T* major_first, + T* major_last, + rmm::cuda_stream_view stream) const + { + return compute_local_degrees_with_mask( + mask_first, + static_cast(major_first), + static_cast(major_last), + stream); + } + template __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, MajorIterator major_last, @@ -389,6 +419,22 @@ class edge_partition_device_view_t count /* size = 1 */, rmm::cuda_stream_view stream) const; + template , int> = 0> + __host__ void compute_number_of_edges_with_mask_async( + MaskIterator mask_first, + T* major_first, + T* major_last, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + compute_number_of_edges_with_mask_async( + mask_first, + static_cast(major_first), + static_cast(major_last), + count, + stream); + } + template __host__ void compute_number_of_edges_async(MajorIterator major_first, MajorIterator major_last, @@ -437,6 +483,20 @@ class edge_partition_device_view_t, int> = 0> + __host__ rmm::device_uvector compute_local_degrees_with_mask( + MaskIterator mask_first, + T* major_first, + T* major_last, + rmm::cuda_stream_view stream) const + { + return compute_local_degrees_with_mask( + mask_first, + static_cast(major_first), + static_cast(major_last), + stream); + } + template __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, MajorIterator major_last, diff --git a/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu b/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu index 888d48aaba7..7611d1b2587 100644 --- a/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu +++ b/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu @@ -10,7 +10,6 @@ namespace cugraph { using view_t = edge_partition_device_view_t; // compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t*, int32_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; @@ -18,7 +17,6 @@ template void view_t::compute_number_of_edges_with_mask_async view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int32_t*, int32_t*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; diff --git a/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu b/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu index 0f79846f3b0..1ded48c315a 100644 --- a/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu +++ b/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu @@ -10,7 +10,6 @@ namespace cugraph { using view_t = edge_partition_device_view_t; // compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t*, int64_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; @@ -18,7 +17,6 @@ template void view_t::compute_number_of_edges_with_mask_async view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int64_t*, int64_t*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; diff --git a/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu b/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu index a3c4397af3f..f55b993952e 100644 --- a/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu +++ b/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu @@ -9,7 +9,6 @@ namespace cugraph { using view_t = edge_partition_device_view_t; // compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t*, int32_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; @@ -17,7 +16,6 @@ template void view_t::compute_number_of_edges_with_mask_async view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int32_t*, int32_t*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; diff --git a/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu b/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu index 046b25b8f12..1253dbf5324 100644 --- a/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu +++ b/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu @@ -9,7 +9,6 @@ namespace cugraph { using view_t = edge_partition_device_view_t; // compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t*, int64_t*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; @@ -17,7 +16,6 @@ template void view_t::compute_number_of_edges_with_mask_async view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int64_t*, int64_t*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; From 194e30921117fd27af98558d6fe29e301ebd6553 Mon Sep 17 00:00:00 2001 From: Charles Hastings Date: Mon, 13 Apr 2026 15:20:57 -0700 Subject: [PATCH 05/10] Fix clang-format issues --- .../detail/compute_number_of_edges_impl.cuh | 39 ++++----- .../cugraph/edge_partition_device_view.cuh | 86 ++++++++----------- .../extract_transform_if_v_frontier_e.cuh | 8 +- ...m_reduce_dst_key_aggregated_outgoing_e.cuh | 9 +- .../compute_number_of_edges_mg_v32_e32.cu | 50 ++++++++--- .../compute_number_of_edges_mg_v64_e64.cu | 50 ++++++++--- .../compute_number_of_edges_sg_v32_e32.cu | 29 +++++-- .../compute_number_of_edges_sg_v64_e64.cu | 29 +++++-- 8 files changed, 188 insertions(+), 112 deletions(-) diff --git a/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh b/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh index ad4e5e8b0aa..0b6ac71479e 100644 --- a/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh +++ b/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh @@ -100,26 +100,24 @@ edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), - detail:: - local_degree_op_t{ - this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - major_hypersparse_first_.value_or(vertex_t{0}), - mask_first}); + detail::local_degree_op_t{ + this->offsets_, + major_range_first_, + *dcs_nzd_vertices_, + major_hypersparse_first_.value_or(vertex_t{0}), + mask_first}); } else { thrust::transform( rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), - detail:: - local_degree_op_t{ - this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); + detail::local_degree_op_t{ + this->offsets_, + major_range_first_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}); } return local_degrees; } @@ -153,13 +151,12 @@ edge_partition_device_view_t{ - this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); + detail::local_degree_op_t{ + this->offsets_, + major_range_first_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + mask_first}); } return local_degrees; } diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 903a4ccb3bb..c9502dd3a08 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -181,19 +181,17 @@ class edge_partition_device_view_t, int> = 0> - __host__ void compute_number_of_edges_with_mask_async( - MaskIterator mask_first, - T* major_first, - T* major_last, - raft::device_span count, - rmm::cuda_stream_view stream) const + __host__ void compute_number_of_edges_with_mask_async(MaskIterator mask_first, + T* major_first, + T* major_last, + raft::device_span count, + rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async( - mask_first, - static_cast(major_first), - static_cast(major_last), - count, - stream); + compute_number_of_edges_with_mask_async(mask_first, + static_cast(major_first), + static_cast(major_last), + count, + stream); } template @@ -212,12 +210,11 @@ class edge_partition_device_view_t count(size_t{0}, stream); - compute_number_of_edges_with_mask_async( - static_cast(nullptr), - major_first, - major_last, - raft::device_span(count.data(), 1), - stream); + compute_number_of_edges_with_mask_async(static_cast(nullptr), + major_first, + major_last, + raft::device_span(count.data(), 1), + stream); return count.value(stream); } @@ -246,16 +243,10 @@ class edge_partition_device_view_t, int> = 0> __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, - T* major_first, - T* major_last, - rmm::cuda_stream_view stream) const + MaskIterator mask_first, T* major_first, T* major_last, rmm::cuda_stream_view stream) const { return compute_local_degrees_with_mask( - mask_first, - static_cast(major_first), - static_cast(major_last), - stream); + mask_first, static_cast(major_first), static_cast(major_last), stream); } template @@ -420,19 +411,17 @@ class edge_partition_device_view_t, int> = 0> - __host__ void compute_number_of_edges_with_mask_async( - MaskIterator mask_first, - T* major_first, - T* major_last, - raft::device_span count, - rmm::cuda_stream_view stream) const + __host__ void compute_number_of_edges_with_mask_async(MaskIterator mask_first, + T* major_first, + T* major_last, + raft::device_span count, + rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async( - mask_first, - static_cast(major_first), - static_cast(major_last), - count, - stream); + compute_number_of_edges_with_mask_async(mask_first, + static_cast(major_first), + static_cast(major_last), + count, + stream); } template @@ -451,12 +440,11 @@ class edge_partition_device_view_t count(size_t{0}, stream); - compute_number_of_edges_with_mask_async( - static_cast(nullptr), - major_first, - major_last, - raft::device_span(count.data(), 1), - stream); + compute_number_of_edges_with_mask_async(static_cast(nullptr), + major_first, + major_last, + raft::device_span(count.data(), 1), + stream); return count.value(stream); } @@ -485,16 +473,10 @@ class edge_partition_device_view_t, int> = 0> __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, - T* major_first, - T* major_last, - rmm::cuda_stream_view stream) const + MaskIterator mask_first, T* major_first, T* major_last, rmm::cuda_stream_view stream) const { return compute_local_degrees_with_mask( - mask_first, - static_cast(major_first), - static_cast(major_last), - stream); + mask_first, static_cast(major_first), static_cast(major_last), stream); } template diff --git a/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh b/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh index 282c77a02a3..9c1f0abde3c 100644 --- a/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh +++ b/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh @@ -1428,10 +1428,10 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle, bool computed{false}; if constexpr (try_bitmap) { if (keys.index() == 0) { - auto major_first = cuda::make_transform_iterator( - std::get<0>(keys).begin(), - detail::bitmap_offset_to_vertex_op_t{ - local_frontier_range_firsts[partition_idx]}); + auto major_first = + cuda::make_transform_iterator(std::get<0>(keys).begin(), + detail::bitmap_offset_to_vertex_op_t{ + local_frontier_range_firsts[partition_idx]}); if (edge_partition_e_mask) { edge_partition.compute_number_of_edges_with_mask_async( edge_partition_e_mask->value_first(), diff --git a/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh index e569466133c..bc07d6b4b78 100644 --- a/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh +++ b/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh @@ -4,8 +4,8 @@ */ #pragma once -#include #include +#include #include #include #include @@ -364,10 +364,9 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( major_sparse_range_size + *(edge_partition.dcs_nzd_vertex_count()), handle.get_stream()); auto major_first = cuda::make_transform_iterator( thrust::make_counting_iterator(vertex_t{0}), - detail::sparse_hypersparse_major_op_t{ - major_sparse_range_size, - edge_partition.major_range_first(), - *(edge_partition.dcs_nzd_vertices())}); + detail::sparse_hypersparse_major_op_t{major_sparse_range_size, + edge_partition.major_range_first(), + *(edge_partition.dcs_nzd_vertices())}); degrees_with_mask = edge_partition.compute_local_degrees_with_mask((*edge_partition_e_mask).value_first(), major_first, diff --git a/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu b/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu index 7611d1b2587..02d5ea5f102 100644 --- a/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu +++ b/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu @@ -2,30 +2,60 @@ * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ -#include #include +#include namespace cugraph { using view_t = edge_partition_device_view_t; // compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async( + uint32_t const*, + int32_t const*, + int32_t const*, + raft::device_span, + rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async>( + uint32_t const*, + thrust::counting_iterator, + thrust::counting_iterator, + raft::device_span, + rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator only) -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask( + uint32_t const*, rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; +template rmm::device_uvector +view_t::compute_local_degrees_with_mask( + uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector +view_t::compute_local_degrees_with_mask>( + uint32_t const*, + thrust::counting_iterator, + thrust::counting_iterator, + rmm::cuda_stream_view) const; // bitmap iterator -using bitmap_iter_32_t = cuda::transform_iterator, uint32_t const*>; -template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, bitmap_iter_32_t, bitmap_iter_32_t, raft::device_span, rmm::cuda_stream_view) const; +using bitmap_iter_32_t = + cuda::transform_iterator, uint32_t const*>; +template void view_t::compute_number_of_edges_with_mask_async( + uint32_t const*, + bitmap_iter_32_t, + bitmap_iter_32_t, + raft::device_span, + rmm::cuda_stream_view) const; // sparse-hypersparse iterator -using sh_iter_32_t = cuda::transform_iterator, thrust::counting_iterator>; -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, sh_iter_32_t, sh_iter_32_t, rmm::cuda_stream_view) const; +using sh_iter_32_t = cuda::transform_iterator, + thrust::counting_iterator>; +template rmm::device_uvector +view_t::compute_local_degrees_with_mask(uint32_t const*, + sh_iter_32_t, + sh_iter_32_t, + rmm::cuda_stream_view) const; } // namespace cugraph diff --git a/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu b/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu index 1ded48c315a..1cf141cf76f 100644 --- a/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu +++ b/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu @@ -2,30 +2,60 @@ * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ -#include #include +#include namespace cugraph { using view_t = edge_partition_device_view_t; // compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async( + uint32_t const*, + int64_t const*, + int64_t const*, + raft::device_span, + rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async>( + uint32_t const*, + thrust::counting_iterator, + thrust::counting_iterator, + raft::device_span, + rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator only) -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask( + uint32_t const*, rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; +template rmm::device_uvector +view_t::compute_local_degrees_with_mask( + uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector +view_t::compute_local_degrees_with_mask>( + uint32_t const*, + thrust::counting_iterator, + thrust::counting_iterator, + rmm::cuda_stream_view) const; // bitmap iterator -using bitmap_iter_64_t = cuda::transform_iterator, uint32_t const*>; -template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, bitmap_iter_64_t, bitmap_iter_64_t, raft::device_span, rmm::cuda_stream_view) const; +using bitmap_iter_64_t = + cuda::transform_iterator, uint32_t const*>; +template void view_t::compute_number_of_edges_with_mask_async( + uint32_t const*, + bitmap_iter_64_t, + bitmap_iter_64_t, + raft::device_span, + rmm::cuda_stream_view) const; // sparse-hypersparse iterator -using sh_iter_64_t = cuda::transform_iterator, thrust::counting_iterator>; -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, sh_iter_64_t, sh_iter_64_t, rmm::cuda_stream_view) const; +using sh_iter_64_t = cuda::transform_iterator, + thrust::counting_iterator>; +template rmm::device_uvector +view_t::compute_local_degrees_with_mask(uint32_t const*, + sh_iter_64_t, + sh_iter_64_t, + rmm::cuda_stream_view) const; } // namespace cugraph diff --git a/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu b/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu index f55b993952e..0bd062ab62f 100644 --- a/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu +++ b/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu @@ -9,14 +9,33 @@ namespace cugraph { using view_t = edge_partition_device_view_t; // compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int32_t const*, int32_t const*, raft::device_span, rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async( + uint32_t const*, + int32_t const*, + int32_t const*, + raft::device_span, + rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async>( + uint32_t const*, + thrust::counting_iterator, + thrust::counting_iterator, + raft::device_span, + rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator only) -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask( + uint32_t const*, rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; +template rmm::device_uvector +view_t::compute_local_degrees_with_mask( + uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector +view_t::compute_local_degrees_with_mask>( + uint32_t const*, + thrust::counting_iterator, + thrust::counting_iterator, + rmm::cuda_stream_view) const; } // namespace cugraph diff --git a/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu b/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu index 1253dbf5324..46164161465 100644 --- a/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu +++ b/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu @@ -9,14 +9,33 @@ namespace cugraph { using view_t = edge_partition_device_view_t; // compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async(uint32_t const*, int64_t const*, int64_t const*, raft::device_span, rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_with_mask_async>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, raft::device_span, rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async( + uint32_t const*, + int64_t const*, + int64_t const*, + raft::device_span, + rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async>( + uint32_t const*, + thrust::counting_iterator, + thrust::counting_iterator, + raft::device_span, + rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator only) -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask( + uint32_t const*, rmm::cuda_stream_view) const; // compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees_with_mask(uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees_with_mask>(uint32_t const*, thrust::counting_iterator, thrust::counting_iterator, rmm::cuda_stream_view) const; +template rmm::device_uvector +view_t::compute_local_degrees_with_mask( + uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector +view_t::compute_local_degrees_with_mask>( + uint32_t const*, + thrust::counting_iterator, + thrust::counting_iterator, + rmm::cuda_stream_view) const; } // namespace cugraph From d4e48326c12fb180c1e60e42ec6458f4827544d4 Mon Sep 17 00:00:00 2001 From: Charles Hastings Date: Fri, 24 Apr 2026 11:05:34 -0700 Subject: [PATCH 06/10] address some PR comments, rename some files, move some things to be more consistent with other naming --- cpp/CMakeLists.txt | 8 +- .../compute_number_of_edges_functors.cuh | 38 ----- .../cugraph/edge_partition_device_view.cuh | 123 +++++++++------ .../extract_transform_if_v_frontier_e.cuh | 28 ++-- .../sample_and_compute_local_nbr_indices.cuh | 4 +- .../prims/detail/transform_v_frontier_e.cuh | 9 +- ...m_reduce_dst_key_aggregated_outgoing_e.cuh | 13 +- .../transform_reduce_e_by_src_dst_key.cuh | 7 +- ...reduce_if_v_frontier_outgoing_e_by_dst.cuh | 10 +- cpp/src/lookup/lookup_src_dst_impl.cuh | 5 +- .../compute_number_of_edges_mg_v32_e32.cu | 61 -------- .../compute_number_of_edges_mg_v64_e64.cu | 61 -------- .../compute_number_of_edges_sg_v32_e32.cu | 41 ----- .../compute_number_of_edges_sg_v64_e64.cu | 41 ----- .../edge_partition_device_view_impl.cuh} | 145 +++++------------- .../edge_partition_device_view_mg_v32_e32.cu | 52 +++++++ .../edge_partition_device_view_mg_v64_e64.cu | 52 +++++++ .../edge_partition_device_view_sg_v32_e32.cu | 34 ++++ .../edge_partition_device_view_sg_v64_e64.cu | 34 ++++ cpp/src/traversal/bfs_impl.cuh | 5 +- .../traversal/od_shortest_distances_impl.cuh | 1 - 21 files changed, 348 insertions(+), 424 deletions(-) delete mode 100644 cpp/include/cugraph/detail/compute_number_of_edges_functors.cuh delete mode 100644 cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu delete mode 100644 cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu delete mode 100644 cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu delete mode 100644 cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu rename cpp/{include/cugraph/detail/compute_number_of_edges_impl.cuh => src/structure/edge_partition_device_view_impl.cuh} (61%) create mode 100644 cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu create mode 100644 cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu create mode 100644 cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu create mode 100644 cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9dd67f8537e..9c5e13eaa37 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -260,8 +260,8 @@ set(CUGRAPH_SG_SOURCES src/structure/graph_sg_v32_e32.cu src/structure/graph_view_sg_v64_e64.cu src/structure/graph_view_sg_v32_e32.cu - src/structure/compute_number_of_edges_sg_v64_e64.cu - src/structure/compute_number_of_edges_sg_v32_e32.cu + src/structure/edge_partition_device_view_sg_v64_e64.cu + src/structure/edge_partition_device_view_sg_v32_e32.cu src/structure/decompress_to_edgelist_sg_v64_e64.cu src/structure/decompress_to_edgelist_sg_v32_e32.cu src/structure/symmetrize_graph_sg_v64_e64.cu @@ -356,8 +356,8 @@ set(CUGRAPH_MG_SOURCES src/community/detail/maximal_independent_moves_mg_v32_e32.cu src/structure/graph_view_mg_v64_e64.cu src/structure/graph_view_mg_v32_e32.cu - src/structure/compute_number_of_edges_mg_v64_e64.cu - src/structure/compute_number_of_edges_mg_v32_e32.cu + src/structure/edge_partition_device_view_mg_v64_e64.cu + src/structure/edge_partition_device_view_mg_v32_e32.cu src/link_prediction/jaccard_mg_v64_e64.cu src/link_prediction/jaccard_mg_v32_e32.cu src/link_prediction/sorensen_mg_v64_e64.cu diff --git a/cpp/include/cugraph/detail/compute_number_of_edges_functors.cuh b/cpp/include/cugraph/detail/compute_number_of_edges_functors.cuh deleted file mode 100644 index 04bbe07117b..00000000000 --- a/cpp/include/cugraph/detail/compute_number_of_edges_functors.cuh +++ /dev/null @@ -1,38 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ -#pragma once - -#include - -namespace cugraph { -namespace detail { - -// Converts a uint32_t bitmap offset to a vertex_t by adding range_first. -// Used in the bitmap code path of extract_transform_if_v_frontier_e. -template -struct bitmap_offset_to_vertex_op_t { - vertex_t range_first; - __device__ vertex_t operator()(uint32_t v_offset) const - { - return range_first + static_cast(v_offset); - } -}; - -// Maps a linear index to either a sparse-range vertex or a hypersparse DCS vertex. -// Used in per_v_transform_reduce_dst_key_aggregated_outgoing_e. -template -struct sparse_hypersparse_major_op_t { - vertex_t major_sparse_range_size; - vertex_t major_range_first; - vertex_t const* dcs_nzd_vertices; - __device__ vertex_t operator()(vertex_t i) const - { - if (i < major_sparse_range_size) { return major_range_first + i; } - return *(dcs_nzd_vertices + (i - major_sparse_range_size)); - } -}; - -} // namespace detail -} // namespace cugraph diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index c9502dd3a08..9770ca4b384 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -172,22 +172,22 @@ class edge_partition_device_view_t + template __host__ void compute_number_of_edges_with_mask_async( - MaskIterator mask_first, + raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, raft::device_span count /* size = 1 */, rmm::cuda_stream_view stream) const; - template , int> = 0> - __host__ void compute_number_of_edges_with_mask_async(MaskIterator mask_first, + template , int> = 0> + __host__ void compute_number_of_edges_with_mask_async(raft::device_span edge_mask, T* major_first, T* major_last, raft::device_span count, rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async(mask_first, + compute_number_of_edges_with_mask_async(edge_mask, static_cast(major_first), static_cast(major_last), count, @@ -200,8 +200,7 @@ class edge_partition_device_view_t count /* size = 1 */, rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async( - static_cast(nullptr), major_first, major_last, count, stream); + compute_number_of_edges_with_mask_async({}, major_first, major_last, count, stream); } template @@ -210,43 +209,49 @@ class edge_partition_device_view_t count(size_t{0}, stream); - compute_number_of_edges_with_mask_async(static_cast(nullptr), - major_first, - major_last, - raft::device_span(count.data(), 1), - stream); + compute_number_of_edges_with_mask_async( + {}, major_first, major_last, raft::device_span(count.data(), 1), stream); return count.value(stream); } - template - __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first, + template + __host__ size_t compute_number_of_edges_with_mask(raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, rmm::cuda_stream_view stream) const { rmm::device_scalar count(size_t{0}, stream); compute_number_of_edges_with_mask_async( - mask_first, major_first, major_last, raft::device_span(count.data(), 1), stream); + edge_mask, major_first, major_last, raft::device_span(count.data(), 1), stream); return count.value(stream); } - template __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, rmm::cuda_stream_view stream) const; + raft::device_span edge_mask, rmm::cuda_stream_view stream) const + { + return compute_local_degrees_with_mask( + edge_mask, + thrust::make_counting_iterator(this->major_range_first()), + thrust::make_counting_iterator(this->major_range_last()), + stream); + } - template + template __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, + raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, rmm::cuda_stream_view stream) const; - template , int> = 0> + template , int> = 0> __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, T* major_first, T* major_last, rmm::cuda_stream_view stream) const + raft::device_span edge_mask, + T* major_first, + T* major_last, + rmm::cuda_stream_view stream) const { return compute_local_degrees_with_mask( - mask_first, static_cast(major_first), static_cast(major_last), stream); + edge_mask, static_cast(major_first), static_cast(major_last), stream); } template @@ -254,8 +259,7 @@ class edge_partition_device_view_t(nullptr), major_first, major_last, stream); + return compute_local_degrees_with_mask({}, major_first, major_last, stream); } __host__ __device__ vertex_t major_value_start_offset() const @@ -402,22 +406,26 @@ class edge_partition_device_view_t + template __host__ void compute_number_of_edges_with_mask_async( - MaskIterator mask_first, + raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, raft::device_span count /* size = 1 */, rmm::cuda_stream_view stream) const; - template , int> = 0> - __host__ void compute_number_of_edges_with_mask_async(MaskIterator mask_first, + // OK... the next step is to support raft::device_span instead of + // major_first/major_last, and a variation for T major_first and T major_last for the counting + // iterator/transform_iterator cases Need to think about how to consolidate MG/SG variations. also + // do this for the other compute* functions. + template , int> = 0> + __host__ void compute_number_of_edges_with_mask_async(raft::device_span edge_mask, T* major_first, T* major_last, raft::device_span count, rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async(mask_first, + compute_number_of_edges_with_mask_async(edge_mask, static_cast(major_first), static_cast(major_last), count, @@ -430,8 +438,7 @@ class edge_partition_device_view_t count /* size = 1 */, rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async( - static_cast(nullptr), major_first, major_last, count, stream); + compute_number_of_edges_with_mask_async({}, major_first, major_last, count, stream); } template @@ -440,43 +447,49 @@ class edge_partition_device_view_t count(size_t{0}, stream); - compute_number_of_edges_with_mask_async(static_cast(nullptr), - major_first, - major_last, - raft::device_span(count.data(), 1), - stream); + compute_number_of_edges_with_mask_async( + {}, major_first, major_last, raft::device_span(count.data(), 1), stream); return count.value(stream); } - template - __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first, + template + __host__ size_t compute_number_of_edges_with_mask(raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, rmm::cuda_stream_view stream) const { rmm::device_scalar count(size_t{0}, stream); compute_number_of_edges_with_mask_async( - mask_first, major_first, major_last, raft::device_span(count.data(), 1), stream); + edge_mask, major_first, major_last, raft::device_span(count.data(), 1), stream); return count.value(stream); } - template __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, rmm::cuda_stream_view stream) const; + raft::device_span edge_mask, rmm::cuda_stream_view stream) const + { + return compute_local_degrees_with_mask( + edge_mask, + thrust::make_counting_iterator(this->major_range_first()), + thrust::make_counting_iterator(this->major_range_last()), + stream); + } - template + template __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, + raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, rmm::cuda_stream_view stream) const; - template , int> = 0> + template , int> = 0> __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, T* major_first, T* major_last, rmm::cuda_stream_view stream) const + raft::device_span edge_mask, + T* major_first, + T* major_last, + rmm::cuda_stream_view stream) const { return compute_local_degrees_with_mask( - mask_first, static_cast(major_first), static_cast(major_last), stream); + edge_mask, static_cast(major_first), static_cast(major_last), stream); } template @@ -484,8 +497,7 @@ class edge_partition_device_view_t(nullptr), major_first, major_last, stream); + return compute_local_degrees_with_mask({}, major_first, major_last, stream); } __host__ __device__ vertex_t major_value_start_offset() const { return vertex_t{0}; } @@ -569,4 +581,19 @@ class edge_partition_device_view_t +struct sparse_hypersparse_major_op_t { + edge_partition_device_view_t edge_partition; + __device__ vertex_t operator()(vertex_t i) const + { + return edge_partition.major_from_major_idx_nocheck(i); + } +}; + +} // namespace detail + } // namespace cugraph diff --git a/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh b/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh index 9c1f0abde3c..df168751fa4 100644 --- a/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh +++ b/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh @@ -5,7 +5,6 @@ #pragma once -#include #include #include #include @@ -902,11 +901,11 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle, auto frontier_major_last = thrust_tuple_get_or_identity(frontier_key_last); if (edge_partition_e_mask) { - local_max_pushes = - edge_partition.compute_number_of_edges_with_mask(edge_partition_e_mask->value_first(), - frontier_major_first, - frontier_major_last, - handle.get_stream()); + auto edge_partition_e_mask_span = + raft::device_span(edge_partition_e_mask->value_first(), + static_cast(edge_partition.number_of_edges())); + local_max_pushes = edge_partition.compute_number_of_edges_with_mask( + edge_partition_e_mask_span, frontier_major_first, frontier_major_last, handle.get_stream()); } else { local_max_pushes = edge_partition.compute_number_of_edges( frontier_major_first, frontier_major_last, handle.get_stream()); @@ -1428,13 +1427,15 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle, bool computed{false}; if constexpr (try_bitmap) { if (keys.index() == 0) { - auto major_first = - cuda::make_transform_iterator(std::get<0>(keys).begin(), - detail::bitmap_offset_to_vertex_op_t{ - local_frontier_range_firsts[partition_idx]}); + auto major_first = cuda::make_transform_iterator( + std::get<0>(keys).begin(), + detail::shift_right_t{local_frontier_range_firsts[partition_idx]}); if (edge_partition_e_mask) { - edge_partition.compute_number_of_edges_with_mask_async( + auto edge_partition_e_mask_span = raft::device_span( edge_partition_e_mask->value_first(), + static_cast(edge_partition.number_of_edges())); + edge_partition.compute_number_of_edges_with_mask_async( + edge_partition_e_mask_span, major_first, major_first + std::get<0>(keys).size(), raft::device_span(counters.data() + j, size_t{1}), @@ -1462,8 +1463,11 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle, } auto major_first = thrust_tuple_get_or_identity(key_first); if (edge_partition_e_mask) { - edge_partition.compute_number_of_edges_with_mask_async( + auto edge_partition_e_mask_span = raft::device_span( edge_partition_e_mask->value_first(), + static_cast(edge_partition.number_of_edges())); + edge_partition.compute_number_of_edges_with_mask_async( + edge_partition_e_mask_span, major_first, major_first + num_keys, raft::device_span(counters.data() + j, size_t{1}), diff --git a/cpp/include/cugraph/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/include/cugraph/prims/detail/sample_and_compute_local_nbr_indices.cuh index 325576b4259..719fe09b8ab 100644 --- a/cpp/include/cugraph/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/include/cugraph/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -2414,7 +2414,9 @@ compute_aggregate_local_frontier_local_degrees(raft::handle_t const& handle, aggregate_local_frontier_major_first + local_frontier_offsets[i + 1], handle.get_stream()) : edge_partition.compute_local_degrees_with_mask( - (*edge_partition_e_mask).value_first(), + raft::device_span( + (*edge_partition_e_mask).value_first(), + static_cast(edge_partition.number_of_edges())), aggregate_local_frontier_major_first + local_frontier_offsets[i], aggregate_local_frontier_major_first + local_frontier_offsets[i + 1], handle.get_stream()); diff --git a/cpp/include/cugraph/prims/detail/transform_v_frontier_e.cuh b/cpp/include/cugraph/prims/detail/transform_v_frontier_e.cuh index d516dbac6af..f9d84de7796 100644 --- a/cpp/include/cugraph/prims/detail/transform_v_frontier_e.cuh +++ b/cpp/include/cugraph/prims/detail/transform_v_frontier_e.cuh @@ -424,10 +424,17 @@ auto transform_v_frontier_e(raft::handle_t const& handle, aggregate_local_frontier_key_first + local_frontier_offsets[i]; auto edge_partition_frontier_major_first = thrust_tuple_get_or_identity(edge_partition_frontier_key_first); + cuda::std::optional> edge_partition_mask_span{ + cuda::std::nullopt}; + if (edge_partition_e_mask) { + edge_partition_mask_span = + raft::device_span((*edge_partition_e_mask).value_first(), + static_cast(edge_partition.number_of_edges())); + } auto edge_partition_frontier_local_degrees = edge_partition_e_mask ? edge_partition.compute_local_degrees_with_mask( - (*edge_partition_e_mask).value_first(), + edge_partition_mask_span.value(), edge_partition_frontier_major_first, edge_partition_frontier_major_first + (local_frontier_offsets[i + 1] - local_frontier_offsets[i]), diff --git a/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh index bc07d6b4b78..8ff0e1652a5 100644 --- a/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh +++ b/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh @@ -4,7 +4,6 @@ */ #pragma once -#include #include #include #include @@ -354,6 +353,9 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( std::optional> offsets_with_mask{std::nullopt}; if (edge_partition_e_mask) { + auto edge_partition_mask_span = + raft::device_span((*edge_partition_e_mask).value_first(), + static_cast(edge_partition.number_of_edges())); rmm::device_uvector degrees_with_mask(0, handle.get_stream()); if (edge_partition.dcs_nzd_vertices()) { auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); @@ -364,17 +366,16 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( major_sparse_range_size + *(edge_partition.dcs_nzd_vertex_count()), handle.get_stream()); auto major_first = cuda::make_transform_iterator( thrust::make_counting_iterator(vertex_t{0}), - detail::sparse_hypersparse_major_op_t{major_sparse_range_size, - edge_partition.major_range_first(), - *(edge_partition.dcs_nzd_vertices())}); + detail::sparse_hypersparse_major_op_t{ + edge_partition}); degrees_with_mask = - edge_partition.compute_local_degrees_with_mask((*edge_partition_e_mask).value_first(), + edge_partition.compute_local_degrees_with_mask(edge_partition_mask_span, major_first, major_first + degrees_with_mask.size(), handle.get_stream()); } else { degrees_with_mask = edge_partition.compute_local_degrees_with_mask( - (*edge_partition_e_mask).value_first(), + edge_partition_mask_span, thrust::make_counting_iterator(edge_partition.major_range_first()), thrust::make_counting_iterator(edge_partition.major_range_last()), handle.get_stream()); diff --git a/cpp/include/cugraph/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/include/cugraph/prims/transform_reduce_e_by_src_dst_key.cuh index a13bf51d3a2..a8e7ac13e50 100644 --- a/cpp/include/cugraph/prims/transform_reduce_e_by_src_dst_key.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_e_by_src_dst_key.cuh @@ -556,8 +556,11 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle, rmm::device_uvector tmp_keys(0, handle.get_stream()); std::optional> edge_offsets_with_mask{std::nullopt}; if (edge_partition_e_mask) { - auto local_degrees = edge_partition.compute_local_degrees_with_mask( - (*edge_partition_e_mask).value_first(), handle.get_stream()); + auto edge_partition_mask_span = + raft::device_span((*edge_partition_e_mask).value_first(), + static_cast(edge_partition.number_of_edges())); + auto local_degrees = edge_partition.compute_local_degrees_with_mask(edge_partition_mask_span, + handle.get_stream()); edge_offsets_with_mask = rmm::device_uvector(edge_partition.major_range_size() + 1, handle.get_stream()); (*edge_offsets_with_mask).set_element_to_zero_async(0, handle.get_stream()); diff --git a/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh b/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh index c8bb00ebf1f..568bf7d19aa 100644 --- a/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh @@ -1180,8 +1180,11 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, handle.get_stream()); if (edge_partition_e_mask) { + auto edge_partition_mask_span = + raft::device_span((*edge_partition_e_mask).value_first(), + static_cast(edge_partition.number_of_edges())); ret += - edge_partition.compute_number_of_edges_with_mask((*edge_partition_e_mask).value_first(), + edge_partition.compute_number_of_edges_with_mask(edge_partition_mask_span, edge_partition_frontier_vertices.begin(), edge_partition_frontier_vertices.end(), handle.get_stream()); @@ -1193,8 +1196,11 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, } else { assert(i == 0); if (edge_partition_e_mask) { + auto edge_partition_mask_span = + raft::device_span((*edge_partition_e_mask).value_first(), + static_cast(edge_partition.number_of_edges())); ret += edge_partition.compute_number_of_edges_with_mask( - (*edge_partition_e_mask).value_first(), + edge_partition_mask_span, local_frontier_vertex_first, local_frontier_vertex_first + frontier.size(), handle.get_stream()); diff --git a/cpp/src/lookup/lookup_src_dst_impl.cuh b/cpp/src/lookup/lookup_src_dst_impl.cuh index bed45dc118e..f42c925e9be 100644 --- a/cpp/src/lookup/lookup_src_dst_impl.cuh +++ b/cpp/src/lookup/lookup_src_dst_impl.cuh @@ -588,8 +588,11 @@ EdgeTypeAndIdToSrcDstLookupContainerType build_edge_id_and_type_to_src_dst_looku auto number_of_local_edges = edge_partition.number_of_edges(); if (graph_view.has_edge_mask()) { + auto edge_partition_mask_span = + raft::device_span((*edge_partition_mask_view).value_first(), + static_cast(edge_partition.number_of_edges())); number_of_local_edges = edge_partition.compute_number_of_edges_with_mask( - (*edge_partition_mask_view).value_first(), + edge_partition_mask_span, thrust::make_counting_iterator(edge_partition.major_range_first()), thrust::make_counting_iterator(edge_partition.major_range_last()), handle.get_stream()); diff --git a/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu b/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu deleted file mode 100644 index 02d5ea5f102..00000000000 --- a/cpp/src/structure/compute_number_of_edges_mg_v32_e32.cu +++ /dev/null @@ -1,61 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ -#include -#include - -namespace cugraph { - -using view_t = edge_partition_device_view_t; - -// compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async( - uint32_t const*, - int32_t const*, - int32_t const*, - raft::device_span, - rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_with_mask_async>( - uint32_t const*, - thrust::counting_iterator, - thrust::counting_iterator, - raft::device_span, - rmm::cuda_stream_view) const; - -// compute_local_degrees_with_mask (MaskIterator only) -template rmm::device_uvector view_t::compute_local_degrees_with_mask( - uint32_t const*, rmm::cuda_stream_view) const; - -// compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector -view_t::compute_local_degrees_with_mask( - uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector -view_t::compute_local_degrees_with_mask>( - uint32_t const*, - thrust::counting_iterator, - thrust::counting_iterator, - rmm::cuda_stream_view) const; - -// bitmap iterator -using bitmap_iter_32_t = - cuda::transform_iterator, uint32_t const*>; -template void view_t::compute_number_of_edges_with_mask_async( - uint32_t const*, - bitmap_iter_32_t, - bitmap_iter_32_t, - raft::device_span, - rmm::cuda_stream_view) const; - -// sparse-hypersparse iterator -using sh_iter_32_t = cuda::transform_iterator, - thrust::counting_iterator>; -template rmm::device_uvector -view_t::compute_local_degrees_with_mask(uint32_t const*, - sh_iter_32_t, - sh_iter_32_t, - rmm::cuda_stream_view) const; - -} // namespace cugraph diff --git a/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu b/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu deleted file mode 100644 index 1cf141cf76f..00000000000 --- a/cpp/src/structure/compute_number_of_edges_mg_v64_e64.cu +++ /dev/null @@ -1,61 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ -#include -#include - -namespace cugraph { - -using view_t = edge_partition_device_view_t; - -// compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async( - uint32_t const*, - int64_t const*, - int64_t const*, - raft::device_span, - rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_with_mask_async>( - uint32_t const*, - thrust::counting_iterator, - thrust::counting_iterator, - raft::device_span, - rmm::cuda_stream_view) const; - -// compute_local_degrees_with_mask (MaskIterator only) -template rmm::device_uvector view_t::compute_local_degrees_with_mask( - uint32_t const*, rmm::cuda_stream_view) const; - -// compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector -view_t::compute_local_degrees_with_mask( - uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector -view_t::compute_local_degrees_with_mask>( - uint32_t const*, - thrust::counting_iterator, - thrust::counting_iterator, - rmm::cuda_stream_view) const; - -// bitmap iterator -using bitmap_iter_64_t = - cuda::transform_iterator, uint32_t const*>; -template void view_t::compute_number_of_edges_with_mask_async( - uint32_t const*, - bitmap_iter_64_t, - bitmap_iter_64_t, - raft::device_span, - rmm::cuda_stream_view) const; - -// sparse-hypersparse iterator -using sh_iter_64_t = cuda::transform_iterator, - thrust::counting_iterator>; -template rmm::device_uvector -view_t::compute_local_degrees_with_mask(uint32_t const*, - sh_iter_64_t, - sh_iter_64_t, - rmm::cuda_stream_view) const; - -} // namespace cugraph diff --git a/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu b/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu deleted file mode 100644 index 0bd062ab62f..00000000000 --- a/cpp/src/structure/compute_number_of_edges_sg_v32_e32.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ -#include - -namespace cugraph { - -using view_t = edge_partition_device_view_t; - -// compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async( - uint32_t const*, - int32_t const*, - int32_t const*, - raft::device_span, - rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_with_mask_async>( - uint32_t const*, - thrust::counting_iterator, - thrust::counting_iterator, - raft::device_span, - rmm::cuda_stream_view) const; - -// compute_local_degrees_with_mask (MaskIterator only) -template rmm::device_uvector view_t::compute_local_degrees_with_mask( - uint32_t const*, rmm::cuda_stream_view) const; - -// compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector -view_t::compute_local_degrees_with_mask( - uint32_t const*, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector -view_t::compute_local_degrees_with_mask>( - uint32_t const*, - thrust::counting_iterator, - thrust::counting_iterator, - rmm::cuda_stream_view) const; - -} // namespace cugraph diff --git a/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu b/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu deleted file mode 100644 index 46164161465..00000000000 --- a/cpp/src/structure/compute_number_of_edges_sg_v64_e64.cu +++ /dev/null @@ -1,41 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. - * SPDX-License-Identifier: Apache-2.0 - */ -#include - -namespace cugraph { - -using view_t = edge_partition_device_view_t; - -// compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async( - uint32_t const*, - int64_t const*, - int64_t const*, - raft::device_span, - rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_with_mask_async>( - uint32_t const*, - thrust::counting_iterator, - thrust::counting_iterator, - raft::device_span, - rmm::cuda_stream_view) const; - -// compute_local_degrees_with_mask (MaskIterator only) -template rmm::device_uvector view_t::compute_local_degrees_with_mask( - uint32_t const*, rmm::cuda_stream_view) const; - -// compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector -view_t::compute_local_degrees_with_mask( - uint32_t const*, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector -view_t::compute_local_degrees_with_mask>( - uint32_t const*, - thrust::counting_iterator, - thrust::counting_iterator, - rmm::cuda_stream_view) const; - -} // namespace cugraph diff --git a/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh b/cpp/src/structure/edge_partition_device_view_impl.cuh similarity index 61% rename from cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh rename to cpp/src/structure/edge_partition_device_view_impl.cuh index 0b6ac71479e..1da0881f3fc 100644 --- a/cpp/include/cugraph/detail/compute_number_of_edges_impl.cuh +++ b/cpp/src/structure/edge_partition_device_view_impl.cuh @@ -23,10 +23,10 @@ namespace cugraph { // ============================================================================ template -template +template __host__ void edge_partition_device_view_t>:: - compute_number_of_edges_with_mask_async(MaskIterator mask_first, + compute_number_of_edges_with_mask_async(raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, raft::device_span count, @@ -42,12 +42,12 @@ edge_partition_device_view_t{ + detail::local_degree_op_t{ this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_, - mask_first}); + edge_mask.data()}); cub::DeviceReduce::Sum(static_cast(nullptr), tmp_storage_bytes, local_degree_first, @@ -64,12 +64,12 @@ edge_partition_device_view_t{ + detail::local_degree_op_t{ this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */, - mask_first}); + edge_mask.data()}); cub::DeviceReduce::Sum(static_cast(nullptr), tmp_storage_bytes, local_degree_first, @@ -87,46 +87,10 @@ edge_partition_device_view_t -template +template __host__ rmm::device_uvector edge_partition_device_view_t>:: - compute_local_degrees_with_mask(MaskIterator mask_first, rmm::cuda_stream_view stream) const -{ - rmm::device_uvector local_degrees(this->major_range_size(), stream); - if (dcs_nzd_vertices_) { - assert(major_hypersparse_first_); - thrust::transform( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - major_hypersparse_first_.value_or(vertex_t{0}), - mask_first}); - } else { - thrust::transform( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); - } - return local_degrees; -} - -template -template -__host__ rmm::device_uvector -edge_partition_device_view_t>:: - compute_local_degrees_with_mask(MaskIterator mask_first, + compute_local_degrees_with_mask(raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, rmm::cuda_stream_view stream) const @@ -134,29 +98,27 @@ edge_partition_device_view_t local_degrees(cuda::std::distance(major_first, major_last), stream); if (dcs_nzd_vertices_) { assert(major_hypersparse_first_); - thrust::transform( - rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - dcs_nzd_vertices_.value(), - major_hypersparse_first_.value_or(vertex_t{0}), - mask_first}); + thrust::transform(rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_op_t{ + this->offsets_, + major_range_first_, + dcs_nzd_vertices_.value(), + major_hypersparse_first_.value_or(vertex_t{0}), + edge_mask.data()}); } else { - thrust::transform( - rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); + thrust::transform(rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_op_t{ + this->offsets_, + major_range_first_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + edge_mask.data()}); } return local_degrees; } @@ -166,10 +128,10 @@ edge_partition_device_view_t -template +template __host__ void edge_partition_device_view_t>:: - compute_number_of_edges_with_mask_async(MaskIterator mask_first, + compute_number_of_edges_with_mask_async(raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, raft::device_span count, @@ -185,12 +147,11 @@ edge_partition_device_view_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); + detail::local_degree_op_t{this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + edge_mask.data()}); cub::DeviceReduce::Sum(static_cast(nullptr), tmp_storage_bytes, local_degree_first, @@ -207,31 +168,10 @@ edge_partition_device_view_t -template -__host__ rmm::device_uvector -edge_partition_device_view_t>:: - compute_local_degrees_with_mask(MaskIterator mask_first, rmm::cuda_stream_view stream) const -{ - rmm::device_uvector local_degrees(this->major_range_size(), stream); - thrust::transform( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); - return local_degrees; -} - -template -template +template __host__ rmm::device_uvector edge_partition_device_view_t>:: - compute_local_degrees_with_mask(MaskIterator mask_first, + compute_local_degrees_with_mask(raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, rmm::cuda_stream_view stream) const @@ -242,12 +182,11 @@ edge_partition_device_view_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); + detail::local_degree_op_t{this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + edge_mask.data()}); return local_degrees; } diff --git a/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu b/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu new file mode 100644 index 00000000000..85308149f7e --- /dev/null +++ b/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu @@ -0,0 +1,52 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#include "edge_partition_device_view_impl.cuh" + +#include + +namespace cugraph { + +using view_t = edge_partition_device_view_t; + +// compute_number_of_edges_with_mask_async +template void view_t::compute_number_of_edges_with_mask_async( + raft::device_span, + int32_t const*, + int32_t const*, + raft::device_span, + rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async>( + raft::device_span, + thrust::counting_iterator, + thrust::counting_iterator, + raft::device_span, + rmm::cuda_stream_view) const; + +// compute_local_degrees_with_mask (MaskIterator + MajorIterator) +template rmm::device_uvector view_t::compute_local_degrees_with_mask( + raft::device_span, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask< + thrust::counting_iterator>(raft::device_span, + thrust::counting_iterator, + thrust::counting_iterator, + rmm::cuda_stream_view) const; + +// bitmap iterator +using bitmap_iter_32_t = cuda::transform_iterator, uint32_t const*>; +template void view_t::compute_number_of_edges_with_mask_async( + raft::device_span, + bitmap_iter_32_t, + bitmap_iter_32_t, + raft::device_span, + rmm::cuda_stream_view) const; + +// sparse-hypersparse iterator +using sh_iter_32_t = + cuda::transform_iterator, + thrust::counting_iterator>; +template rmm::device_uvector view_t::compute_local_degrees_with_mask( + raft::device_span, sh_iter_32_t, sh_iter_32_t, rmm::cuda_stream_view) const; + +} // namespace cugraph diff --git a/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu b/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu new file mode 100644 index 00000000000..b776239e39e --- /dev/null +++ b/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu @@ -0,0 +1,52 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#include "edge_partition_device_view_impl.cuh" + +#include + +namespace cugraph { + +using view_t = edge_partition_device_view_t; + +// compute_number_of_edges_with_mask_async +template void view_t::compute_number_of_edges_with_mask_async( + raft::device_span, + int64_t const*, + int64_t const*, + raft::device_span, + rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async>( + raft::device_span, + thrust::counting_iterator, + thrust::counting_iterator, + raft::device_span, + rmm::cuda_stream_view) const; + +// compute_local_degrees_with_mask (MaskIterator + MajorIterator) +template rmm::device_uvector view_t::compute_local_degrees_with_mask( + raft::device_span, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask< + thrust::counting_iterator>(raft::device_span, + thrust::counting_iterator, + thrust::counting_iterator, + rmm::cuda_stream_view) const; + +// bitmap iterator +using bitmap_iter_64_t = cuda::transform_iterator, uint32_t const*>; +template void view_t::compute_number_of_edges_with_mask_async( + raft::device_span, + bitmap_iter_64_t, + bitmap_iter_64_t, + raft::device_span, + rmm::cuda_stream_view) const; + +// sparse-hypersparse iterator +using sh_iter_64_t = + cuda::transform_iterator, + thrust::counting_iterator>; +template rmm::device_uvector view_t::compute_local_degrees_with_mask( + raft::device_span, sh_iter_64_t, sh_iter_64_t, rmm::cuda_stream_view) const; + +} // namespace cugraph diff --git a/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu b/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu new file mode 100644 index 00000000000..6fc734c01c8 --- /dev/null +++ b/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu @@ -0,0 +1,34 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#include "edge_partition_device_view_impl.cuh" + +namespace cugraph { + +using view_t = edge_partition_device_view_t; + +// compute_number_of_edges_with_mask_async +template void view_t::compute_number_of_edges_with_mask_async( + raft::device_span, + int32_t const*, + int32_t const*, + raft::device_span, + rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async>( + raft::device_span, + thrust::counting_iterator, + thrust::counting_iterator, + raft::device_span, + rmm::cuda_stream_view) const; + +// compute_local_degrees_with_mask (MaskIterator + MajorIterator) +template rmm::device_uvector view_t::compute_local_degrees_with_mask( + raft::device_span, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask< + thrust::counting_iterator>(raft::device_span, + thrust::counting_iterator, + thrust::counting_iterator, + rmm::cuda_stream_view) const; + +} // namespace cugraph diff --git a/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu b/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu new file mode 100644 index 00000000000..a0d02061f1f --- /dev/null +++ b/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu @@ -0,0 +1,34 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#include "edge_partition_device_view_impl.cuh" + +namespace cugraph { + +using view_t = edge_partition_device_view_t; + +// compute_number_of_edges_with_mask_async +template void view_t::compute_number_of_edges_with_mask_async( + raft::device_span, + int64_t const*, + int64_t const*, + raft::device_span, + rmm::cuda_stream_view) const; +template void view_t::compute_number_of_edges_with_mask_async>( + raft::device_span, + thrust::counting_iterator, + thrust::counting_iterator, + raft::device_span, + rmm::cuda_stream_view) const; + +// compute_local_degrees_with_mask (MaskIterator + MajorIterator) +template rmm::device_uvector view_t::compute_local_degrees_with_mask( + raft::device_span, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; +template rmm::device_uvector view_t::compute_local_degrees_with_mask< + thrust::counting_iterator>(raft::device_span, + thrust::counting_iterator, + thrust::counting_iterator, + rmm::cuda_stream_view) const; + +} // namespace cugraph diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 1d89150ec79..207bb28160b 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -325,8 +325,11 @@ void bfs(raft::handle_t const& handle, // partition_size * 0.5 & partition_size * // hypersparse_threshold_ratio * 0.5 as approximate out degrees if (edge_partition_e_mask) { + auto edge_partition_mask_span = + raft::device_span((*edge_partition_e_mask).value_first(), + static_cast(edge_partition.number_of_edges())); approx_out_degrees = edge_partition.compute_local_degrees_with_mask( - (*edge_partition_e_mask).value_first(), + edge_partition_mask_span, thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + high_and_mid_degree_segment_size, diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index 270a509dae0..49af2bee8a9 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -5,7 +5,6 @@ #pragma once #include -#include #include #include #include From 0995cc0ac751d1d89fe95ce317be724798c46152 Mon Sep 17 00:00:00 2001 From: Charles Hastings Date: Mon, 27 Apr 2026 13:30:08 -0700 Subject: [PATCH 07/10] remove old comment, add specialization of compute_number_of_edges for od_shortest_path --- cpp/include/cugraph/edge_partition_device_view.cuh | 4 ---- .../traversal/od_shortest_distances_sg_v32_e32.cu | 13 ++++++++++++- .../traversal/od_shortest_distances_sg_v64_e64.cu | 13 ++++++++++++- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 9770ca4b384..58ca8607d73 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -414,10 +414,6 @@ class edge_partition_device_view_t count /* size = 1 */, rmm::cuda_stream_view stream) const; - // OK... the next step is to support raft::device_span instead of - // major_first/major_last, and a variation for T major_first and T major_last for the counting - // iterator/transform_iterator cases Need to think about how to consolidate MG/SG variations. also - // do this for the other compute* functions. template , int> = 0> __host__ void compute_number_of_edges_with_mask_async(raft::device_span edge_mask, T* major_first, diff --git a/cpp/src/traversal/od_shortest_distances_sg_v32_e32.cu b/cpp/src/traversal/od_shortest_distances_sg_v32_e32.cu index 4e0a4d720b0..25997878583 100644 --- a/cpp/src/traversal/od_shortest_distances_sg_v32_e32.cu +++ b/cpp/src/traversal/od_shortest_distances_sg_v32_e32.cu @@ -1,7 +1,8 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ +#include "structure/edge_partition_device_view_impl.cuh" #include "traversal/od_shortest_distances_impl.cuh" namespace cugraph { @@ -26,4 +27,14 @@ template rmm::device_uvector od_shortest_distances( double cutoff, bool do_expensive_check); +using view_t = edge_partition_device_view_t; +using od_extract_iter_t = + cuda::transform_iterator, uint64_t const*>; +template void view_t::compute_number_of_edges_with_mask_async( + raft::device_span, + od_extract_iter_t, + od_extract_iter_t, + raft::device_span, + rmm::cuda_stream_view) const; + } // namespace cugraph diff --git a/cpp/src/traversal/od_shortest_distances_sg_v64_e64.cu b/cpp/src/traversal/od_shortest_distances_sg_v64_e64.cu index 37112ce5e68..a853a3328a0 100644 --- a/cpp/src/traversal/od_shortest_distances_sg_v64_e64.cu +++ b/cpp/src/traversal/od_shortest_distances_sg_v64_e64.cu @@ -1,7 +1,8 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ +#include "structure/edge_partition_device_view_impl.cuh" #include "traversal/od_shortest_distances_impl.cuh" namespace cugraph { @@ -26,4 +27,14 @@ template rmm::device_uvector od_shortest_distances( double cutoff, bool do_expensive_check); +using view_t = edge_partition_device_view_t; +using od_extract_iter_t = + cuda::transform_iterator, uint64_t const*>; +template void view_t::compute_number_of_edges_with_mask_async( + raft::device_span, + od_extract_iter_t, + od_extract_iter_t, + raft::device_span, + rmm::cuda_stream_view) const; + } // namespace cugraph From 04237f90a041ecafa9f2a91a11aeb5fafb5c69bd Mon Sep 17 00:00:00 2001 From: Charles Hastings Date: Thu, 30 Apr 2026 08:30:16 -0700 Subject: [PATCH 08/10] Move implementation into detail methods, let default compile inline --- .../cugraph/edge_partition_device_view.cuh | 568 ++++++++++++++++-- .../extract_transform_if_v_frontier_e.cuh | 29 +- .../sample_and_compute_local_nbr_indices.cuh | 2 +- .../prims/detail/transform_v_frontier_e.cuh | 6 +- ...m_reduce_dst_key_aggregated_outgoing_e.cuh | 11 +- .../transform_reduce_e_by_src_dst_key.cuh | 6 +- ...reduce_if_v_frontier_outgoing_e_by_dst.cuh | 33 +- cpp/src/lookup/lookup_src_dst_impl.cuh | 9 +- .../edge_partition_device_view_impl.cuh | 281 ++++----- .../edge_partition_device_view_mg_v32_e32.cu | 55 +- .../edge_partition_device_view_mg_v64_e64.cu | 67 ++- .../edge_partition_device_view_sg_v32_e32.cu | 33 +- .../edge_partition_device_view_sg_v64_e64.cu | 50 +- cpp/src/traversal/bfs_impl.cuh | 19 +- .../traversal/od_shortest_distances_impl.cuh | 4 +- 15 files changed, 844 insertions(+), 329 deletions(-) diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 58ca8607d73..64d8a0e7df0 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -26,6 +26,7 @@ #include #include +#include #include namespace cugraph { @@ -149,6 +150,238 @@ class edge_partition_device_view_base_t { raft::device_span indices_{nullptr}; }; +template +__host__ void compute_number_of_edges_with_mask_async_mg( + raft::device_span edge_mask, + MajorIterator major_first, + MajorIterator major_last, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + if (cuda::std::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + return; + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + if (dcs_nzd_vertices) { + assert(major_hypersparse_first); + auto local_degree_first = cuda::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + offsets, major_range_first, *dcs_nzd_vertices, *major_hypersparse_first, edge_mask.data()}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + } else { + auto local_degree_first = cuda::make_transform_iterator( + major_first, + detail::local_degree_op_t{offsets, + major_range_first, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + edge_mask.data()}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + } +} + +template +__host__ void compute_number_of_edges_with_mask_async_mg( + raft::device_span edge_mask, + raft::device_span majors, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ void compute_number_of_edges_with_mask_async_mg( + raft::device_span edge_mask, + std::tuple local_vertex_partition_range, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + raft::device_span edge_mask, + MajorIterator major_first, + MajorIterator major_last, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); + if (dcs_nzd_vertices) { + assert(major_hypersparse_first); + thrust::transform( + rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_op_t{ + offsets, major_range_first, *dcs_nzd_vertices, *major_hypersparse_first, edge_mask.data()}); + } else { + thrust::transform( + rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_op_t{offsets, + major_range_first, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + edge_mask.data()}); + } + return local_degrees; +} + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + raft::device_span edge_mask, + raft::device_span majors, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + raft::device_span edge_mask, + std::tuple local_vertex_partition_range, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ void compute_number_of_edges_with_mask_async_sg( + raft::device_span edge_mask, + MajorIterator major_first, + MajorIterator major_last, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + if (cuda::std::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + return; + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + auto local_degree_first = cuda::make_transform_iterator( + major_first, + detail::local_degree_op_t{offsets, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + edge_mask.data()}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); +} + +template +__host__ void compute_number_of_edges_with_mask_async_sg( + raft::device_span edge_mask, + raft::device_span majors, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ void compute_number_of_edges_with_mask_async_sg( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + raft::device_span edge_mask, + MajorIterator major_first, + MajorIterator major_last, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); + thrust::transform( + rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_op_t{offsets, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + edge_mask.data()}); + return local_degrees; +} + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + raft::device_span edge_mask, + raft::device_span majors, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + raft::device_span offsets, + rmm::cuda_stream_view stream); + } // namespace detail template @@ -172,45 +405,99 @@ class edge_partition_device_view_t + template + __host__ void compute_number_of_edges_with_mask_async(raft::device_span edge_mask, + raft::device_span majors, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + detail::compute_number_of_edges_with_mask_async_mg(edge_mask, + majors, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } + __host__ void compute_number_of_edges_with_mask_async( raft::device_span edge_mask, - MajorIterator major_first, - MajorIterator major_last, - raft::device_span count /* size = 1 */, - rmm::cuda_stream_view stream) const; + std::tuple vertex_partition_range, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + detail::compute_number_of_edges_with_mask_async_mg(edge_mask, + vertex_partition_range, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } - template , int> = 0> + template __host__ void compute_number_of_edges_with_mask_async(raft::device_span edge_mask, - T* major_first, - T* major_last, + MajorIterator major_first, + MajorIterator major_last, raft::device_span count, rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async(edge_mask, - static_cast(major_first), - static_cast(major_last), - count, - stream); + detail::compute_number_of_edges_with_mask_async_mg(edge_mask, + major_first, + major_last, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } + + template + __host__ void compute_number_of_edges_async(raft::device_span majors, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + compute_number_of_edges_with_mask_async({}, majors, count, stream); + } + + __host__ void compute_number_of_edges_async(std::tuple vertex_partition_range, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + compute_number_of_edges_with_mask_async({}, vertex_partition_range, count, stream); } template __host__ void compute_number_of_edges_async(MajorIterator major_first, MajorIterator major_last, - raft::device_span count /* size = 1 */, + raft::device_span count, rmm::cuda_stream_view stream) const { compute_number_of_edges_with_mask_async({}, major_first, major_last, count, stream); } - template - __host__ size_t compute_number_of_edges(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + template + __host__ size_t compute_number_of_edges_with_mask(raft::device_span edge_mask, + raft::device_span majors, + rmm::cuda_stream_view stream) const { rmm::device_scalar count(size_t{0}, stream); compute_number_of_edges_with_mask_async( - {}, major_first, major_last, raft::device_span(count.data(), 1), stream); + edge_mask, majors, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } + + __host__ size_t + compute_number_of_edges_with_mask(raft::device_span edge_mask, + std::tuple vertex_partition_range, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_with_mask_async( + edge_mask, vertex_partition_range, raft::device_span{count.data(), 1}, stream); return count.value(stream); } @@ -222,26 +509,97 @@ class edge_partition_device_view_t count(size_t{0}, stream); compute_number_of_edges_with_mask_async( - edge_mask, major_first, major_last, raft::device_span(count.data(), 1), stream); + edge_mask, major_first, major_last, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } + + template + __host__ size_t compute_number_of_edges(raft::device_span majors, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async(majors, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } + + __host__ size_t compute_number_of_edges(std::tuple vertex_partition_range, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async( + vertex_partition_range, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } + + template + __host__ size_t compute_number_of_edges(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async( + major_first, major_last, raft::device_span{count.data(), 1}, stream); return count.value(stream); } __host__ rmm::device_uvector compute_local_degrees_with_mask( raft::device_span edge_mask, rmm::cuda_stream_view stream) const { - return compute_local_degrees_with_mask( + return detail::compute_local_degrees_with_mask_mg( edge_mask, - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), + std::tuple{this->major_range_first(), this->major_range_last()}, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, stream); } + template + __host__ rmm::device_uvector compute_local_degrees_with_mask( + raft::device_span edge_mask, + raft::device_span majors, + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_mg(edge_mask, + majors, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } + + __host__ rmm::device_uvector compute_local_degrees_with_mask( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_mg(edge_mask, + vertex_partition_range, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } + template __host__ rmm::device_uvector compute_local_degrees_with_mask( raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, - rmm::cuda_stream_view stream) const; + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_mg(edge_mask, + major_first, + major_last, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } template , int> = 0> __host__ rmm::device_uvector compute_local_degrees_with_mask( @@ -254,6 +612,19 @@ class edge_partition_device_view_t(major_first), static_cast(major_last), stream); } + template + __host__ rmm::device_uvector compute_local_degrees(raft::device_span majors, + rmm::cuda_stream_view stream) const + { + return compute_local_degrees_with_mask({}, majors, stream); + } + + __host__ rmm::device_uvector compute_local_degrees( + std::tuple vertex_partition_range, rmm::cuda_stream_view stream) const + { + return compute_local_degrees_with_mask({}, vertex_partition_range, stream); + } + template __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, MajorIterator major_last, @@ -356,7 +727,7 @@ class edge_partition_device_view_t> for consistency + // FIXME: better return cuda::std::optional> for consistency // (see dcs_nzd_range_bitmap()) __host__ __device__ cuda::std::optional dcs_nzd_vertices() const { @@ -406,45 +777,80 @@ class edge_partition_device_view_t + template + __host__ void compute_number_of_edges_with_mask_async(raft::device_span edge_mask, + raft::device_span majors, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + detail::compute_number_of_edges_with_mask_async_sg( + edge_mask, majors, count, this->offsets_, stream); + } + __host__ void compute_number_of_edges_with_mask_async( raft::device_span edge_mask, - MajorIterator major_first, - MajorIterator major_last, - raft::device_span count /* size = 1 */, - rmm::cuda_stream_view stream) const; + std::tuple vertex_partition_range, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + detail::compute_number_of_edges_with_mask_async_sg( + edge_mask, vertex_partition_range, count, this->offsets_, stream); + } - template , int> = 0> + template __host__ void compute_number_of_edges_with_mask_async(raft::device_span edge_mask, - T* major_first, - T* major_last, + MajorIterator major_first, + MajorIterator major_last, raft::device_span count, rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async(edge_mask, - static_cast(major_first), - static_cast(major_last), - count, - stream); + detail::compute_number_of_edges_with_mask_async_sg( + edge_mask, major_first, major_last, count, this->offsets_, stream); + } + + template + __host__ void compute_number_of_edges_async(raft::device_span majors, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + compute_number_of_edges_with_mask_async({}, majors, count, stream); + } + + __host__ void compute_number_of_edges_async(std::tuple vertex_partition_range, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + compute_number_of_edges_with_mask_async({}, vertex_partition_range, count, stream); } template __host__ void compute_number_of_edges_async(MajorIterator major_first, MajorIterator major_last, - raft::device_span count /* size = 1 */, + raft::device_span count, rmm::cuda_stream_view stream) const { compute_number_of_edges_with_mask_async({}, major_first, major_last, count, stream); } - template - __host__ size_t compute_number_of_edges(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + template + __host__ size_t compute_number_of_edges_with_mask(raft::device_span edge_mask, + raft::device_span majors, + rmm::cuda_stream_view stream) const { rmm::device_scalar count(size_t{0}, stream); compute_number_of_edges_with_mask_async( - {}, major_first, major_last, raft::device_span(count.data(), 1), stream); + edge_mask, majors, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } + + __host__ size_t + compute_number_of_edges_with_mask(raft::device_span edge_mask, + std::tuple vertex_partition_range, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_with_mask_async( + edge_mask, vertex_partition_range, raft::device_span{count.data(), 1}, stream); return count.value(stream); } @@ -456,26 +862,77 @@ class edge_partition_device_view_t count(size_t{0}, stream); compute_number_of_edges_with_mask_async( - edge_mask, major_first, major_last, raft::device_span(count.data(), 1), stream); + edge_mask, major_first, major_last, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } + + template + __host__ size_t compute_number_of_edges(raft::device_span majors, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async(majors, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } + + __host__ size_t compute_number_of_edges(std::tuple vertex_partition_range, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async( + vertex_partition_range, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } + + template + __host__ size_t compute_number_of_edges(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async( + major_first, major_last, raft::device_span{count.data(), 1}, stream); return count.value(stream); } __host__ rmm::device_uvector compute_local_degrees_with_mask( raft::device_span edge_mask, rmm::cuda_stream_view stream) const { - return compute_local_degrees_with_mask( + return detail::compute_local_degrees_with_mask_sg( edge_mask, - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), + std::tuple{this->major_range_first(), this->major_range_last()}, + this->offsets_, stream); } + template + __host__ rmm::device_uvector compute_local_degrees_with_mask( + raft::device_span edge_mask, + raft::device_span majors, + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_sg(edge_mask, majors, this->offsets_, stream); + } + + __host__ rmm::device_uvector compute_local_degrees_with_mask( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_sg( + edge_mask, vertex_partition_range, this->offsets_, stream); + } + template __host__ rmm::device_uvector compute_local_degrees_with_mask( raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, - rmm::cuda_stream_view stream) const; + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_sg( + edge_mask, major_first, major_last, this->offsets_, stream); + } template , int> = 0> __host__ rmm::device_uvector compute_local_degrees_with_mask( @@ -488,6 +945,19 @@ class edge_partition_device_view_t(major_first), static_cast(major_last), stream); } + template + __host__ rmm::device_uvector compute_local_degrees(raft::device_span majors, + rmm::cuda_stream_view stream) const + { + return compute_local_degrees_with_mask({}, majors, stream); + } + + __host__ rmm::device_uvector compute_local_degrees( + std::tuple vertex_partition_range, rmm::cuda_stream_view stream) const + { + return compute_local_degrees_with_mask({}, vertex_partition_range, stream); + } + template __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, MajorIterator major_last, diff --git a/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh b/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh index df168751fa4..5936351a2bd 100644 --- a/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh +++ b/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh @@ -896,19 +896,18 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle, detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, partition_idx) : cuda::std::nullopt; - auto frontier_major_first = - thrust_tuple_get_or_identity(frontier_key_first); - auto frontier_major_last = - thrust_tuple_get_or_identity(frontier_key_last); + auto frontier_majors = raft::device_span{ + thrust_tuple_get_or_identity(frontier_key_first), + static_cast(cuda::std::distance(frontier_key_first, frontier_key_last))}; if (edge_partition_e_mask) { - auto edge_partition_e_mask_span = - raft::device_span(edge_partition_e_mask->value_first(), - static_cast(edge_partition.number_of_edges())); + auto edge_partition_e_mask_span = raft::device_span( + edge_partition_e_mask->value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); local_max_pushes = edge_partition.compute_number_of_edges_with_mask( - edge_partition_e_mask_span, frontier_major_first, frontier_major_last, handle.get_stream()); + edge_partition_e_mask_span, frontier_majors, handle.get_stream()); } else { - local_max_pushes = edge_partition.compute_number_of_edges( - frontier_major_first, frontier_major_last, handle.get_stream()); + local_max_pushes = + edge_partition.compute_number_of_edges(frontier_majors, handle.get_stream()); } } @@ -1433,7 +1432,7 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle, if (edge_partition_e_mask) { auto edge_partition_e_mask_span = raft::device_span( edge_partition_e_mask->value_first(), - static_cast(edge_partition.number_of_edges())); + packed_bool_size(static_cast(edge_partition.number_of_edges()))); edge_partition.compute_number_of_edges_with_mask_async( edge_partition_e_mask_span, major_first, @@ -1465,17 +1464,15 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle, if (edge_partition_e_mask) { auto edge_partition_e_mask_span = raft::device_span( edge_partition_e_mask->value_first(), - static_cast(edge_partition.number_of_edges())); + packed_bool_size(static_cast(edge_partition.number_of_edges()))); edge_partition.compute_number_of_edges_with_mask_async( edge_partition_e_mask_span, - major_first, - major_first + num_keys, + raft::device_span{major_first, num_keys}, raft::device_span(counters.data() + j, size_t{1}), loop_stream); } else { edge_partition.compute_number_of_edges_async( - major_first, - major_first + num_keys, + raft::device_span{major_first, num_keys}, raft::device_span(counters.data() + j, size_t{1}), loop_stream); } diff --git a/cpp/include/cugraph/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/include/cugraph/prims/detail/sample_and_compute_local_nbr_indices.cuh index 719fe09b8ab..8e6324e36db 100644 --- a/cpp/include/cugraph/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/include/cugraph/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -2416,7 +2416,7 @@ compute_aggregate_local_frontier_local_degrees(raft::handle_t const& handle, : edge_partition.compute_local_degrees_with_mask( raft::device_span( (*edge_partition_e_mask).value_first(), - static_cast(edge_partition.number_of_edges())), + packed_bool_size(static_cast(edge_partition.number_of_edges()))), aggregate_local_frontier_major_first + local_frontier_offsets[i], aggregate_local_frontier_major_first + local_frontier_offsets[i + 1], handle.get_stream()); diff --git a/cpp/include/cugraph/prims/detail/transform_v_frontier_e.cuh b/cpp/include/cugraph/prims/detail/transform_v_frontier_e.cuh index f9d84de7796..9e3e0b3ac6f 100644 --- a/cpp/include/cugraph/prims/detail/transform_v_frontier_e.cuh +++ b/cpp/include/cugraph/prims/detail/transform_v_frontier_e.cuh @@ -427,9 +427,9 @@ auto transform_v_frontier_e(raft::handle_t const& handle, cuda::std::optional> edge_partition_mask_span{ cuda::std::nullopt}; if (edge_partition_e_mask) { - edge_partition_mask_span = - raft::device_span((*edge_partition_e_mask).value_first(), - static_cast(edge_partition.number_of_edges())); + edge_partition_mask_span = raft::device_span( + (*edge_partition_e_mask).value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); } auto edge_partition_frontier_local_degrees = diff --git a/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh index 8ff0e1652a5..3b8f1fa15d9 100644 --- a/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh +++ b/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh @@ -44,6 +44,7 @@ #include #include +#include #include namespace cugraph { @@ -353,9 +354,9 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( std::optional> offsets_with_mask{std::nullopt}; if (edge_partition_e_mask) { - auto edge_partition_mask_span = - raft::device_span((*edge_partition_e_mask).value_first(), - static_cast(edge_partition.number_of_edges())); + auto edge_partition_mask_span = raft::device_span( + (*edge_partition_e_mask).value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); rmm::device_uvector degrees_with_mask(0, handle.get_stream()); if (edge_partition.dcs_nzd_vertices()) { auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); @@ -376,8 +377,8 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( } else { degrees_with_mask = edge_partition.compute_local_degrees_with_mask( edge_partition_mask_span, - thrust::make_counting_iterator(edge_partition.major_range_first()), - thrust::make_counting_iterator(edge_partition.major_range_last()), + std::tuple{edge_partition.major_range_first(), + edge_partition.major_range_last()}, handle.get_stream()); } offsets_with_mask = diff --git a/cpp/include/cugraph/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/include/cugraph/prims/transform_reduce_e_by_src_dst_key.cuh index a8e7ac13e50..61be8babfc5 100644 --- a/cpp/include/cugraph/prims/transform_reduce_e_by_src_dst_key.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_e_by_src_dst_key.cuh @@ -556,9 +556,9 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle, rmm::device_uvector tmp_keys(0, handle.get_stream()); std::optional> edge_offsets_with_mask{std::nullopt}; if (edge_partition_e_mask) { - auto edge_partition_mask_span = - raft::device_span((*edge_partition_e_mask).value_first(), - static_cast(edge_partition.number_of_edges())); + auto edge_partition_mask_span = raft::device_span( + (*edge_partition_e_mask).value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); auto local_degrees = edge_partition.compute_local_degrees_with_mask(edge_partition_mask_span, handle.get_stream()); edge_offsets_with_mask = diff --git a/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh b/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh index 568bf7d19aa..9ce8343ea20 100644 --- a/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh @@ -1178,36 +1178,33 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, local_frontier_sizes[i], static_cast(i), handle.get_stream()); + auto edge_partition_frontier_vertices_span = raft::device_span{ + edge_partition_frontier_vertices.data(), edge_partition_frontier_vertices.size()}; if (edge_partition_e_mask) { - auto edge_partition_mask_span = - raft::device_span((*edge_partition_e_mask).value_first(), - static_cast(edge_partition.number_of_edges())); - ret += - edge_partition.compute_number_of_edges_with_mask(edge_partition_mask_span, - edge_partition_frontier_vertices.begin(), - edge_partition_frontier_vertices.end(), - handle.get_stream()); + auto edge_partition_mask_span = raft::device_span( + (*edge_partition_e_mask).value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); + ret += edge_partition.compute_number_of_edges_with_mask( + edge_partition_mask_span, edge_partition_frontier_vertices_span, handle.get_stream()); } else { - ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(), - edge_partition_frontier_vertices.end(), + ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices_span, handle.get_stream()); } } else { assert(i == 0); if (edge_partition_e_mask) { - auto edge_partition_mask_span = - raft::device_span((*edge_partition_e_mask).value_first(), - static_cast(edge_partition.number_of_edges())); + auto edge_partition_mask_span = raft::device_span( + (*edge_partition_e_mask).value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); ret += edge_partition.compute_number_of_edges_with_mask( edge_partition_mask_span, - local_frontier_vertex_first, - local_frontier_vertex_first + frontier.size(), + raft::device_span{local_frontier_vertex_first, frontier.size()}, handle.get_stream()); } else { - ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first, - local_frontier_vertex_first + frontier.size(), - handle.get_stream()); + ret += edge_partition.compute_number_of_edges( + raft::device_span{local_frontier_vertex_first, frontier.size()}, + handle.get_stream()); } } } diff --git a/cpp/src/lookup/lookup_src_dst_impl.cuh b/cpp/src/lookup/lookup_src_dst_impl.cuh index f42c925e9be..fcccc1b1be2 100644 --- a/cpp/src/lookup/lookup_src_dst_impl.cuh +++ b/cpp/src/lookup/lookup_src_dst_impl.cuh @@ -588,13 +588,12 @@ EdgeTypeAndIdToSrcDstLookupContainerType build_edge_id_and_type_to_src_dst_looku auto number_of_local_edges = edge_partition.number_of_edges(); if (graph_view.has_edge_mask()) { - auto edge_partition_mask_span = - raft::device_span((*edge_partition_mask_view).value_first(), - static_cast(edge_partition.number_of_edges())); + auto edge_partition_mask_span = raft::device_span( + (*edge_partition_mask_view).value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); number_of_local_edges = edge_partition.compute_number_of_edges_with_mask( edge_partition_mask_span, - thrust::make_counting_iterator(edge_partition.major_range_first()), - thrust::make_counting_iterator(edge_partition.major_range_last()), + std::make_tuple(edge_partition.major_range_first(), edge_partition.major_range_last()), handle.get_stream()); } diff --git a/cpp/src/structure/edge_partition_device_view_impl.cuh b/cpp/src/structure/edge_partition_device_view_impl.cuh index 1da0881f3fc..3f081088521 100644 --- a/cpp/src/structure/edge_partition_device_view_impl.cuh +++ b/cpp/src/structure/edge_partition_device_view_impl.cuh @@ -10,184 +10,159 @@ #include #include -#include -#include -#include -#include +#include namespace cugraph { +namespace detail { // ============================================================================ // MG specialization: out-of-line definitions // ============================================================================ - -template -template -__host__ void -edge_partition_device_view_t>:: - compute_number_of_edges_with_mask_async(raft::device_span edge_mask, - MajorIterator major_first, - MajorIterator major_last, - raft::device_span count, - rmm::cuda_stream_view stream) const +template +__host__ void compute_number_of_edges_with_mask_async_mg( + raft::device_span edge_mask, + raft::device_span majors, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream) { - if (cuda::std::distance(major_first, major_last) == 0) { - RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); - } + compute_number_of_edges_with_mask_async_mg(edge_mask, + majors.begin(), + majors.end(), + count, + dcs_nzd_vertices, + major_range_first, + major_hypersparse_first, + offsets, + stream); +} - rmm::device_uvector d_tmp_storage(0, stream); - size_t tmp_storage_bytes{0}; +template +__host__ void compute_number_of_edges_with_mask_async_mg( + raft::device_span edge_mask, + std::tuple local_vertex_partition_range, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + compute_number_of_edges_with_mask_async_mg( + edge_mask, + thrust::make_counting_iterator(std::get<0>(local_vertex_partition_range)), + thrust::make_counting_iterator(std::get<1>(local_vertex_partition_range)), + count, + dcs_nzd_vertices, + major_range_first, + major_hypersparse_first, + offsets, + stream); +} - if (dcs_nzd_vertices_) { - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - *major_hypersparse_first_, - edge_mask.data()}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - } else { - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - edge_mask.data()}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - } +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + raft::device_span edge_mask, + raft::device_span majors, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + return compute_local_degrees_with_mask_mg(edge_mask, + majors.begin(), + majors.end(), + dcs_nzd_vertices, + major_range_first, + major_hypersparse_first, + offsets, + stream); } -template -template -__host__ rmm::device_uvector -edge_partition_device_view_t>:: - compute_local_degrees_with_mask(raft::device_span edge_mask, - MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + raft::device_span edge_mask, + std::tuple local_vertex_partition_range, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream) { - rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); - if (dcs_nzd_vertices_) { - assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - dcs_nzd_vertices_.value(), - major_hypersparse_first_.value_or(vertex_t{0}), - edge_mask.data()}); - } else { - thrust::transform(rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - edge_mask.data()}); - } - return local_degrees; + return compute_local_degrees_with_mask_mg( + edge_mask, + thrust::make_counting_iterator(std::get<0>(local_vertex_partition_range)), + thrust::make_counting_iterator(std::get<1>(local_vertex_partition_range)), + dcs_nzd_vertices, + major_range_first, + major_hypersparse_first, + offsets, + stream); } // ============================================================================ // SG specialization: out-of-line definitions // ============================================================================ -template -template -__host__ void -edge_partition_device_view_t>:: - compute_number_of_edges_with_mask_async(raft::device_span edge_mask, - MajorIterator major_first, - MajorIterator major_last, - raft::device_span count, - rmm::cuda_stream_view stream) const +template +__host__ void compute_number_of_edges_with_mask_async_sg( + raft::device_span edge_mask, + raft::device_span majors, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream) { - if (cuda::std::distance(major_first, major_last) == 0) { - RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); - return; - } + compute_number_of_edges_with_mask_async_sg( + edge_mask, majors.begin(), majors.end(), count, offsets, stream); +} - rmm::device_uvector d_tmp_storage(0, stream); - size_t tmp_storage_bytes{0}; +template +__host__ void compute_number_of_edges_with_mask_async_sg( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + compute_number_of_edges_with_mask_async_sg( + edge_mask, + thrust::make_counting_iterator(std::get<0>(vertex_partition_range)), + thrust::make_counting_iterator(std::get<1>(vertex_partition_range)), + count, + offsets, + stream); +} - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_op_t{this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - edge_mask.data()}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + raft::device_span edge_mask, + raft::device_span majors, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + return compute_local_degrees_with_mask_sg( + edge_mask, majors.begin(), majors.end(), offsets, stream); } -template -template -__host__ rmm::device_uvector -edge_partition_device_view_t>:: - compute_local_degrees_with_mask(raft::device_span edge_mask, - MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + raft::device_span offsets, + rmm::cuda_stream_view stream) { - rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); - thrust::transform( - rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - edge_mask.data()}); - return local_degrees; + return compute_local_degrees_with_mask_sg( + edge_mask, + thrust::make_counting_iterator(std::get<0>(vertex_partition_range)), + thrust::make_counting_iterator(std::get<1>(vertex_partition_range)), + offsets, + stream); } +} // namespace detail } // namespace cugraph diff --git a/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu b/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu index 85308149f7e..de9c5737336 100644 --- a/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu +++ b/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu @@ -4,25 +4,53 @@ */ #include "edge_partition_device_view_impl.cuh" +#include #include namespace cugraph { using view_t = edge_partition_device_view_t; -// compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async( +// detail::compute_number_of_edges_with_mask_async_mg +template __host__ void detail::compute_number_of_edges_with_mask_async_mg( raft::device_span, - int32_t const*, - int32_t const*, + raft::device_span, raft::device_span, - rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_with_mask_async>( + cuda::std::optional>, + int32_t, + cuda::std::optional, + raft::device_span, + rmm::cuda_stream_view); + +template __host__ void detail::compute_number_of_edges_with_mask_async_mg( raft::device_span, - thrust::counting_iterator, - thrust::counting_iterator, + std::tuple, raft::device_span, - rmm::cuda_stream_view) const; + cuda::std::optional>, + int32_t, + cuda::std::optional, + raft::device_span, + rmm::cuda_stream_view); + +template __host__ rmm::device_uvector +detail::compute_local_degrees_with_mask_mg( + raft::device_span, + raft::device_span, + cuda::std::optional>, + int32_t, + cuda::std::optional, + raft::device_span, + rmm::cuda_stream_view); + +template __host__ rmm::device_uvector +detail::compute_local_degrees_with_mask_mg( + raft::device_span, + std::tuple, + cuda::std::optional>, + int32_t, + cuda::std::optional, + raft::device_span, + rmm::cuda_stream_view); // compute_local_degrees_with_mask (MaskIterator + MajorIterator) template rmm::device_uvector view_t::compute_local_degrees_with_mask( @@ -33,15 +61,6 @@ template rmm::device_uvector view_t::compute_local_degrees_with_mask< thrust::counting_iterator, rmm::cuda_stream_view) const; -// bitmap iterator -using bitmap_iter_32_t = cuda::transform_iterator, uint32_t const*>; -template void view_t::compute_number_of_edges_with_mask_async( - raft::device_span, - bitmap_iter_32_t, - bitmap_iter_32_t, - raft::device_span, - rmm::cuda_stream_view) const; - // sparse-hypersparse iterator using sh_iter_32_t = cuda::transform_iterator, diff --git a/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu b/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu index b776239e39e..2839f2b79ab 100644 --- a/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu +++ b/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu @@ -4,25 +4,53 @@ */ #include "edge_partition_device_view_impl.cuh" +#include #include namespace cugraph { -using view_t = edge_partition_device_view_t; - -// compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async( - raft::device_span, - int64_t const*, - int64_t const*, - raft::device_span, - rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_with_mask_async>( - raft::device_span, - thrust::counting_iterator, - thrust::counting_iterator, - raft::device_span, - rmm::cuda_stream_view) const; +using vertex_t = int64_t; +using edge_t = int64_t; +using view_t = edge_partition_device_view_t; + +// detail::compute_number_of_edges_with_mask_async_mg +template __host__ void detail::compute_number_of_edges_with_mask_async_mg( + raft::device_span edge_mask, + raft::device_span majors, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ void detail::compute_number_of_edges_with_mask_async_mg( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector detail::compute_local_degrees_with_mask_mg( + raft::device_span edge_mask, + raft::device_span majors, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector detail::compute_local_degrees_with_mask_mg( + raft::device_span edge_mask, + std::tuple local_vertex_partition_range, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); // compute_local_degrees_with_mask (MaskIterator + MajorIterator) template rmm::device_uvector view_t::compute_local_degrees_with_mask( @@ -33,15 +61,6 @@ template rmm::device_uvector view_t::compute_local_degrees_with_mask< thrust::counting_iterator, rmm::cuda_stream_view) const; -// bitmap iterator -using bitmap_iter_64_t = cuda::transform_iterator, uint32_t const*>; -template void view_t::compute_number_of_edges_with_mask_async( - raft::device_span, - bitmap_iter_64_t, - bitmap_iter_64_t, - raft::device_span, - rmm::cuda_stream_view) const; - // sparse-hypersparse iterator using sh_iter_64_t = cuda::transform_iterator, diff --git a/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu b/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu index 6fc734c01c8..645acd290f2 100644 --- a/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu +++ b/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu @@ -4,23 +4,38 @@ */ #include "edge_partition_device_view_impl.cuh" +#include +#include + namespace cugraph { using view_t = edge_partition_device_view_t; -// compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async( +// detail::compute_number_of_edges_with_mask_async_sg +template __host__ void detail::compute_number_of_edges_with_mask_async_sg( raft::device_span, - int32_t const*, - int32_t const*, + raft::device_span, raft::device_span, - rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_with_mask_async>( + raft::device_span, + rmm::cuda_stream_view); +template __host__ void detail::compute_number_of_edges_with_mask_async_sg( raft::device_span, - thrust::counting_iterator, - thrust::counting_iterator, + std::tuple, raft::device_span, - rmm::cuda_stream_view) const; + raft::device_span, + rmm::cuda_stream_view); + +template __host__ rmm::device_uvector + detail::compute_local_degrees_with_mask_sg(raft::device_span, + raft::device_span, + raft::device_span, + rmm::cuda_stream_view); + +template __host__ rmm::device_uvector + detail::compute_local_degrees_with_mask_sg(raft::device_span, + std::tuple, + raft::device_span, + rmm::cuda_stream_view); // compute_local_degrees_with_mask (MaskIterator + MajorIterator) template rmm::device_uvector view_t::compute_local_degrees_with_mask( diff --git a/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu b/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu index a0d02061f1f..9162c317b6a 100644 --- a/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu +++ b/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu @@ -4,23 +4,43 @@ */ #include "edge_partition_device_view_impl.cuh" +#include +#include + namespace cugraph { -using view_t = edge_partition_device_view_t; - -// compute_number_of_edges_with_mask_async -template void view_t::compute_number_of_edges_with_mask_async( - raft::device_span, - int64_t const*, - int64_t const*, - raft::device_span, - rmm::cuda_stream_view) const; -template void view_t::compute_number_of_edges_with_mask_async>( - raft::device_span, - thrust::counting_iterator, - thrust::counting_iterator, - raft::device_span, - rmm::cuda_stream_view) const; +using view_t = edge_partition_device_view_t; +using vertex_t = int64_t; +using edge_t = int64_t; + +// detail::compute_number_of_edges_with_mask_async_sg +template __host__ void detail::compute_number_of_edges_with_mask_async_sg( + raft::device_span edge_mask, + raft::device_span majors, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ void detail::compute_number_of_edges_with_mask_async_sg( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector +detail::compute_local_degrees_with_mask_sg( + raft::device_span edge_mask, + raft::device_span majors, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector +detail::compute_local_degrees_with_mask_sg( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + raft::device_span offsets, + rmm::cuda_stream_view stream); // compute_local_degrees_with_mask (MaskIterator + MajorIterator) template rmm::device_uvector view_t::compute_local_degrees_with_mask( diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 207bb28160b..7516ba0e8c3 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -37,6 +37,7 @@ #include #include +#include #include namespace cugraph { @@ -325,20 +326,20 @@ void bfs(raft::handle_t const& handle, // partition_size * 0.5 & partition_size * // hypersparse_threshold_ratio * 0.5 as approximate out degrees if (edge_partition_e_mask) { - auto edge_partition_mask_span = - raft::device_span((*edge_partition_e_mask).value_first(), - static_cast(edge_partition.number_of_edges())); + auto edge_partition_mask_span = raft::device_span( + (*edge_partition_e_mask).value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); approx_out_degrees = edge_partition.compute_local_degrees_with_mask( edge_partition_mask_span, - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + - high_and_mid_degree_segment_size, + std::make_tuple(graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_first() + + static_cast(high_and_mid_degree_segment_size)), handle.get_stream()); } else { approx_out_degrees = edge_partition.compute_local_degrees( - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + - high_and_mid_degree_segment_size, + std::make_tuple(graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_first() + + static_cast(high_and_mid_degree_segment_size)), handle.get_stream()); } thrust::transform(handle.get_thrust_policy(), diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index 49af2bee8a9..0e4bdb3018b 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -348,7 +348,9 @@ kv_store_t filter_key_to_dist_ma edge_partition_device_view_t( graph_view.local_edge_partition_view(0)); auto num_edges = edge_partition.compute_number_of_edges( - near_bucket.vertex_begin(), near_bucket.vertex_end(), handle.get_stream()); + raft::device_span{near_bucket.vertex_begin(), near_bucket.size()}, + handle.get_stream()); + for (size_t i = 0; i < far_buffers.size(); ++i) { auto far_vertex_first = cuda::make_transform_iterator( far_buffers[i].begin(), From 7b4c6b191a9b0345b9617331ccf4e08634a0da7c Mon Sep 17 00:00:00 2001 From: Charles Hastings Date: Thu, 30 Apr 2026 09:33:30 -0700 Subject: [PATCH 09/10] clean up edge_partition_device_view .cu files --- .../edge_partition_device_view_mg_v32_e32.cu | 104 ++++++++---------- .../edge_partition_device_view_mg_v64_e64.cu | 28 +---- .../edge_partition_device_view_sg_v32_e32.cu | 70 ++++++------ .../edge_partition_device_view_sg_v64_e64.cu | 23 +--- 4 files changed, 88 insertions(+), 137 deletions(-) diff --git a/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu b/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu index de9c5737336..b64fb6259c5 100644 --- a/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu +++ b/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu @@ -8,64 +8,48 @@ #include namespace cugraph { - -using view_t = edge_partition_device_view_t; - -// detail::compute_number_of_edges_with_mask_async_mg -template __host__ void detail::compute_number_of_edges_with_mask_async_mg( - raft::device_span, - raft::device_span, - raft::device_span, - cuda::std::optional>, - int32_t, - cuda::std::optional, - raft::device_span, - rmm::cuda_stream_view); - -template __host__ void detail::compute_number_of_edges_with_mask_async_mg( - raft::device_span, - std::tuple, - raft::device_span, - cuda::std::optional>, - int32_t, - cuda::std::optional, - raft::device_span, - rmm::cuda_stream_view); - -template __host__ rmm::device_uvector -detail::compute_local_degrees_with_mask_mg( - raft::device_span, - raft::device_span, - cuda::std::optional>, - int32_t, - cuda::std::optional, - raft::device_span, - rmm::cuda_stream_view); - -template __host__ rmm::device_uvector -detail::compute_local_degrees_with_mask_mg( - raft::device_span, - std::tuple, - cuda::std::optional>, - int32_t, - cuda::std::optional, - raft::device_span, - rmm::cuda_stream_view); - -// compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees_with_mask( - raft::device_span, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees_with_mask< - thrust::counting_iterator>(raft::device_span, - thrust::counting_iterator, - thrust::counting_iterator, - rmm::cuda_stream_view) const; - -// sparse-hypersparse iterator -using sh_iter_32_t = - cuda::transform_iterator, - thrust::counting_iterator>; -template rmm::device_uvector view_t::compute_local_degrees_with_mask( - raft::device_span, sh_iter_32_t, sh_iter_32_t, rmm::cuda_stream_view) const; - +namespace detail { + +using vertex_t = int32_t; +using edge_t = int32_t; + +template __host__ void compute_number_of_edges_with_mask_async_mg( + raft::device_span edge_mask, + raft::device_span majors, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ void compute_number_of_edges_with_mask_async_mg( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + raft::device_span edge_mask, + raft::device_span majors, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + raft::device_span edge_mask, + std::tuple local_vertex_partition_range, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +} // namespace detail } // namespace cugraph diff --git a/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu b/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu index 2839f2b79ab..324fe6c5416 100644 --- a/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu +++ b/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu @@ -8,13 +8,12 @@ #include namespace cugraph { +namespace detail { using vertex_t = int64_t; using edge_t = int64_t; -using view_t = edge_partition_device_view_t; -// detail::compute_number_of_edges_with_mask_async_mg -template __host__ void detail::compute_number_of_edges_with_mask_async_mg( +template __host__ void compute_number_of_edges_with_mask_async_mg( raft::device_span edge_mask, raft::device_span majors, raft::device_span count, @@ -24,7 +23,7 @@ template __host__ void detail::compute_number_of_edges_with_mask_async_mg( raft::device_span offsets, rmm::cuda_stream_view stream); -template __host__ void detail::compute_number_of_edges_with_mask_async_mg( +template __host__ void compute_number_of_edges_with_mask_async_mg( raft::device_span edge_mask, std::tuple vertex_partition_range, raft::device_span count, @@ -34,7 +33,7 @@ template __host__ void detail::compute_number_of_edges_with_mask_async_mg( raft::device_span offsets, rmm::cuda_stream_view stream); -template __host__ rmm::device_uvector detail::compute_local_degrees_with_mask_mg( +template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( raft::device_span edge_mask, raft::device_span majors, cuda::std::optional> dcs_nzd_vertices, @@ -43,7 +42,7 @@ template __host__ rmm::device_uvector detail::compute_local_degrees_with raft::device_span offsets, rmm::cuda_stream_view stream); -template __host__ rmm::device_uvector detail::compute_local_degrees_with_mask_mg( +template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( raft::device_span edge_mask, std::tuple local_vertex_partition_range, cuda::std::optional> dcs_nzd_vertices, @@ -52,20 +51,5 @@ template __host__ rmm::device_uvector detail::compute_local_degrees_with raft::device_span offsets, rmm::cuda_stream_view stream); -// compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees_with_mask( - raft::device_span, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees_with_mask< - thrust::counting_iterator>(raft::device_span, - thrust::counting_iterator, - thrust::counting_iterator, - rmm::cuda_stream_view) const; - -// sparse-hypersparse iterator -using sh_iter_64_t = - cuda::transform_iterator, - thrust::counting_iterator>; -template rmm::device_uvector view_t::compute_local_degrees_with_mask( - raft::device_span, sh_iter_64_t, sh_iter_64_t, rmm::cuda_stream_view) const; - +} // namespace detail } // namespace cugraph diff --git a/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu b/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu index 645acd290f2..2a980737cb6 100644 --- a/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu +++ b/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu @@ -8,42 +8,36 @@ #include namespace cugraph { - -using view_t = edge_partition_device_view_t; - -// detail::compute_number_of_edges_with_mask_async_sg -template __host__ void detail::compute_number_of_edges_with_mask_async_sg( - raft::device_span, - raft::device_span, - raft::device_span, - raft::device_span, - rmm::cuda_stream_view); -template __host__ void detail::compute_number_of_edges_with_mask_async_sg( - raft::device_span, - std::tuple, - raft::device_span, - raft::device_span, - rmm::cuda_stream_view); - -template __host__ rmm::device_uvector - detail::compute_local_degrees_with_mask_sg(raft::device_span, - raft::device_span, - raft::device_span, - rmm::cuda_stream_view); - -template __host__ rmm::device_uvector - detail::compute_local_degrees_with_mask_sg(raft::device_span, - std::tuple, - raft::device_span, - rmm::cuda_stream_view); - -// compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees_with_mask( - raft::device_span, int32_t const*, int32_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees_with_mask< - thrust::counting_iterator>(raft::device_span, - thrust::counting_iterator, - thrust::counting_iterator, - rmm::cuda_stream_view) const; - +namespace detail { + +using vertex_t = int32_t; +using edge_t = int32_t; + +template __host__ void compute_number_of_edges_with_mask_async_sg( + raft::device_span edge_mask, + raft::device_span majors, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ void compute_number_of_edges_with_mask_async_sg( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + raft::device_span edge_mask, + raft::device_span majors, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +} // namespace detail } // namespace cugraph diff --git a/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu b/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu index 9162c317b6a..c8fd07909de 100644 --- a/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu +++ b/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu @@ -8,47 +8,36 @@ #include namespace cugraph { +namespace detail { -using view_t = edge_partition_device_view_t; using vertex_t = int64_t; using edge_t = int64_t; -// detail::compute_number_of_edges_with_mask_async_sg -template __host__ void detail::compute_number_of_edges_with_mask_async_sg( +template __host__ void compute_number_of_edges_with_mask_async_sg( raft::device_span edge_mask, raft::device_span majors, raft::device_span count, raft::device_span offsets, rmm::cuda_stream_view stream); -template __host__ void detail::compute_number_of_edges_with_mask_async_sg( +template __host__ void compute_number_of_edges_with_mask_async_sg( raft::device_span edge_mask, std::tuple vertex_partition_range, raft::device_span count, raft::device_span offsets, rmm::cuda_stream_view stream); -template __host__ rmm::device_uvector -detail::compute_local_degrees_with_mask_sg( +template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( raft::device_span edge_mask, raft::device_span majors, raft::device_span offsets, rmm::cuda_stream_view stream); -template __host__ rmm::device_uvector -detail::compute_local_degrees_with_mask_sg( +template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( raft::device_span edge_mask, std::tuple vertex_partition_range, raft::device_span offsets, rmm::cuda_stream_view stream); -// compute_local_degrees_with_mask (MaskIterator + MajorIterator) -template rmm::device_uvector view_t::compute_local_degrees_with_mask( - raft::device_span, int64_t const*, int64_t const*, rmm::cuda_stream_view) const; -template rmm::device_uvector view_t::compute_local_degrees_with_mask< - thrust::counting_iterator>(raft::device_span, - thrust::counting_iterator, - thrust::counting_iterator, - rmm::cuda_stream_view) const; - +} // namespace detail } // namespace cugraph From b57d4c61611f5f9573b309872d5010a203cbeafb Mon Sep 17 00:00:00 2001 From: Charles Hastings Date: Fri, 1 May 2026 09:57:00 -0700 Subject: [PATCH 10/10] more PR comments - use cuda::std::optional instead of {}, clean up some things --- .../cugraph/edge_partition_device_view.cuh | 327 +++++++++++------- .../extract_transform_if_v_frontier_e.cuh | 46 ++- ...reduce_if_v_frontier_outgoing_e_by_dst.cuh | 4 +- .../edge_partition_device_view_impl.cuh | 16 +- .../edge_partition_device_view_mg_v32_e32.cu | 8 +- .../edge_partition_device_view_mg_v64_e64.cu | 8 +- .../edge_partition_device_view_sg_v32_e32.cu | 8 +- .../edge_partition_device_view_sg_v64_e64.cu | 8 +- .../od_shortest_distances_sg_v32_e32.cu | 13 +- .../od_shortest_distances_sg_v64_e64.cu | 13 +- 10 files changed, 258 insertions(+), 193 deletions(-) diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 64d8a0e7df0..7a57c789d96 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -62,12 +62,12 @@ struct local_degree_op_t { dcs_nzd_vertices{}; std::conditional_t major_hypersparse_first{}; - MaskIterator mask_first{}; + cuda::std::optional mask_first{}; __device__ return_type_t compute_degree(edge_t offset, edge_t degree) const { - if (mask_first) { - return static_cast(count_set_bits(mask_first, offset, degree)); + if (mask_first.has_value() && *mask_first) { + return static_cast(count_set_bits(*mask_first, offset, degree)); } else { return static_cast(degree); } @@ -152,7 +152,7 @@ class edge_partition_device_view_base_t { template __host__ void compute_number_of_edges_with_mask_async_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, MajorIterator major_first, MajorIterator major_last, raft::device_span count, @@ -175,7 +175,7 @@ __host__ void compute_number_of_edges_with_mask_async_mg( auto local_degree_first = cuda::make_transform_iterator( major_first, detail::local_degree_op_t{ - offsets, major_range_first, *dcs_nzd_vertices, *major_hypersparse_first, edge_mask.data()}); + offsets, major_range_first, *dcs_nzd_vertices, *major_hypersparse_first, edge_mask}); cub::DeviceReduce::Sum(static_cast(nullptr), tmp_storage_bytes, local_degree_first, @@ -192,11 +192,8 @@ __host__ void compute_number_of_edges_with_mask_async_mg( } else { auto local_degree_first = cuda::make_transform_iterator( major_first, - detail::local_degree_op_t{offsets, - major_range_first, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - edge_mask.data()}); + detail::local_degree_op_t{ + offsets, major_range_first, std::byte{0} /* dummy */, std::byte{0} /* dummy */, edge_mask}); cub::DeviceReduce::Sum(static_cast(nullptr), tmp_storage_bytes, local_degree_first, @@ -215,7 +212,7 @@ __host__ void compute_number_of_edges_with_mask_async_mg( template __host__ void compute_number_of_edges_with_mask_async_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, raft::device_span count, cuda::std::optional> dcs_nzd_vertices, @@ -226,7 +223,7 @@ __host__ void compute_number_of_edges_with_mask_async_mg( template __host__ void compute_number_of_edges_with_mask_async_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple local_vertex_partition_range, raft::device_span count, cuda::std::optional> dcs_nzd_vertices, @@ -237,7 +234,7 @@ __host__ void compute_number_of_edges_with_mask_async_mg( template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, MajorIterator major_first, MajorIterator major_last, cuda::std::optional> dcs_nzd_vertices, @@ -255,25 +252,22 @@ __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( major_last, local_degrees.begin(), detail::local_degree_op_t{ - offsets, major_range_first, *dcs_nzd_vertices, *major_hypersparse_first, edge_mask.data()}); + offsets, major_range_first, *dcs_nzd_vertices, *major_hypersparse_first, edge_mask}); } else { thrust::transform( rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), - detail::local_degree_op_t{offsets, - major_range_first, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - edge_mask.data()}); + detail::local_degree_op_t{ + offsets, major_range_first, std::byte{0} /* dummy */, std::byte{0} /* dummy */, edge_mask}); } return local_degrees; } template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, cuda::std::optional> dcs_nzd_vertices, vertex_t major_range_first, @@ -283,7 +277,7 @@ __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple local_vertex_partition_range, cuda::std::optional> dcs_nzd_vertices, vertex_t major_range_first, @@ -293,7 +287,7 @@ __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( template __host__ void compute_number_of_edges_with_mask_async_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, MajorIterator major_first, MajorIterator major_last, raft::device_span count, @@ -314,7 +308,7 @@ __host__ void compute_number_of_edges_with_mask_async_sg( std::byte{0} /* dummy */, std::byte{0} /* dummy */, std::byte{0} /* dummy */, - edge_mask.data()}); + edge_mask}); cub::DeviceReduce::Sum(static_cast(nullptr), tmp_storage_bytes, local_degree_first, @@ -332,7 +326,7 @@ __host__ void compute_number_of_edges_with_mask_async_sg( template __host__ void compute_number_of_edges_with_mask_async_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, raft::device_span count, raft::device_span offsets, @@ -340,7 +334,7 @@ __host__ void compute_number_of_edges_with_mask_async_sg( template __host__ void compute_number_of_edges_with_mask_async_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple vertex_partition_range, raft::device_span count, raft::device_span offsets, @@ -348,7 +342,7 @@ __host__ void compute_number_of_edges_with_mask_async_sg( template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, MajorIterator major_first, MajorIterator major_last, raft::device_span offsets, @@ -364,20 +358,20 @@ __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( std::byte{0} /* dummy */, std::byte{0} /* dummy */, std::byte{0} /* dummy */, - edge_mask.data()}); + edge_mask}); return local_degrees; } template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, raft::device_span offsets, rmm::cuda_stream_view stream); template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple vertex_partition_range, raft::device_span offsets, rmm::cuda_stream_view stream); @@ -411,14 +405,15 @@ class edge_partition_device_view_t count, rmm::cuda_stream_view stream) const { - detail::compute_number_of_edges_with_mask_async_mg(edge_mask, - majors, - count, - dcs_nzd_vertices_, - major_range_first_, - major_hypersparse_first_, - this->offsets_, - stream); + detail::compute_number_of_edges_with_mask_async_mg( + cuda::std::optional{edge_mask.data()}, + majors, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } __host__ void compute_number_of_edges_with_mask_async( @@ -427,14 +422,15 @@ class edge_partition_device_view_t count, rmm::cuda_stream_view stream) const { - detail::compute_number_of_edges_with_mask_async_mg(edge_mask, - vertex_partition_range, - count, - dcs_nzd_vertices_, - major_range_first_, - major_hypersparse_first_, - this->offsets_, - stream); + detail::compute_number_of_edges_with_mask_async_mg( + cuda::std::optional{edge_mask.data()}, + vertex_partition_range, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } template @@ -444,15 +440,16 @@ class edge_partition_device_view_t count, rmm::cuda_stream_view stream) const { - detail::compute_number_of_edges_with_mask_async_mg(edge_mask, - major_first, - major_last, - count, - dcs_nzd_vertices_, - major_range_first_, - major_hypersparse_first_, - this->offsets_, - stream); + detail::compute_number_of_edges_with_mask_async_mg( + cuda::std::optional{edge_mask.data()}, + major_first, + major_last, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } template @@ -460,14 +457,28 @@ class edge_partition_device_view_t count, rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async({}, majors, count, stream); + detail::compute_number_of_edges_with_mask_async_mg(cuda::std::nullopt, + majors, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } __host__ void compute_number_of_edges_async(std::tuple vertex_partition_range, raft::device_span count, rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async({}, vertex_partition_range, count, stream); + detail::compute_number_of_edges_with_mask_async_mg(cuda::std::nullopt, + vertex_partition_range, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } template @@ -476,7 +487,15 @@ class edge_partition_device_view_t count, rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async({}, major_first, major_last, count, stream); + detail::compute_number_of_edges_with_mask_async_mg(cuda::std::nullopt, + major_first, + major_last, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } template @@ -485,8 +504,15 @@ class edge_partition_device_view_t count(size_t{0}, stream); - compute_number_of_edges_with_mask_async( - edge_mask, majors, raft::device_span{count.data(), 1}, stream); + detail::compute_number_of_edges_with_mask_async_mg( + cuda::std::optional{edge_mask.data()}, + majors, + raft::device_span{count.data(), 1}, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); return count.value(stream); } @@ -496,8 +522,15 @@ class edge_partition_device_view_t count(size_t{0}, stream); - compute_number_of_edges_with_mask_async( - edge_mask, vertex_partition_range, raft::device_span{count.data(), 1}, stream); + detail::compute_number_of_edges_with_mask_async_mg( + cuda::std::optional{edge_mask.data()}, + vertex_partition_range, + raft::device_span{count.data(), 1}, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); return count.value(stream); } @@ -508,8 +541,16 @@ class edge_partition_device_view_t count(size_t{0}, stream); - compute_number_of_edges_with_mask_async( - edge_mask, major_first, major_last, raft::device_span{count.data(), 1}, stream); + detail::compute_number_of_edges_with_mask_async_mg( + cuda::std::optional{edge_mask.data()}, + major_first, + major_last, + raft::device_span{count.data(), 1}, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); return count.value(stream); } @@ -546,7 +587,7 @@ class edge_partition_device_view_t edge_mask, rmm::cuda_stream_view stream) const { return detail::compute_local_degrees_with_mask_mg( - edge_mask, + cuda::std::optional{edge_mask.data()}, std::tuple{this->major_range_first(), this->major_range_last()}, dcs_nzd_vertices_, major_range_first_, @@ -561,13 +602,14 @@ class edge_partition_device_view_t majors, rmm::cuda_stream_view stream) const { - return detail::compute_local_degrees_with_mask_mg(edge_mask, - majors, - dcs_nzd_vertices_, - major_range_first_, - major_hypersparse_first_, - this->offsets_, - stream); + return detail::compute_local_degrees_with_mask_mg( + cuda::std::optional{edge_mask.data()}, + majors, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } __host__ rmm::device_uvector compute_local_degrees_with_mask( @@ -575,13 +617,14 @@ class edge_partition_device_view_t vertex_partition_range, rmm::cuda_stream_view stream) const { - return detail::compute_local_degrees_with_mask_mg(edge_mask, - vertex_partition_range, - dcs_nzd_vertices_, - major_range_first_, - major_hypersparse_first_, - this->offsets_, - stream); + return detail::compute_local_degrees_with_mask_mg( + cuda::std::optional{edge_mask.data()}, + vertex_partition_range, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } template @@ -591,38 +634,40 @@ class edge_partition_device_view_toffsets_, - stream); - } - - template , int> = 0> - __host__ rmm::device_uvector compute_local_degrees_with_mask( - raft::device_span edge_mask, - T* major_first, - T* major_last, - rmm::cuda_stream_view stream) const - { - return compute_local_degrees_with_mask( - edge_mask, static_cast(major_first), static_cast(major_last), stream); + return detail::compute_local_degrees_with_mask_mg( + cuda::std::optional{edge_mask.data()}, + major_first, + major_last, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } template __host__ rmm::device_uvector compute_local_degrees(raft::device_span majors, rmm::cuda_stream_view stream) const { - return compute_local_degrees_with_mask({}, majors, stream); + return detail::compute_local_degrees_with_mask_mg(cuda::std::nullopt, + majors, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } __host__ rmm::device_uvector compute_local_degrees( std::tuple vertex_partition_range, rmm::cuda_stream_view stream) const { - return compute_local_degrees_with_mask({}, vertex_partition_range, stream); + return detail::compute_local_degrees_with_mask_mg(cuda::std::nullopt, + vertex_partition_range, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } template @@ -630,7 +675,14 @@ class edge_partition_device_view_toffsets_, + stream); } __host__ __device__ vertex_t major_value_start_offset() const @@ -727,12 +779,10 @@ class edge_partition_device_view_t> for consistency - // (see dcs_nzd_range_bitmap()) - __host__ __device__ cuda::std::optional dcs_nzd_vertices() const + __host__ __device__ cuda::std::optional> dcs_nzd_vertices() + const { - return dcs_nzd_vertices_ ? cuda::std::optional{(*dcs_nzd_vertices_).data()} - : cuda::std::nullopt; + return dcs_nzd_vertices_; } __host__ __device__ cuda::std::optional dcs_nzd_vertex_count() const @@ -784,7 +834,11 @@ class edge_partition_device_view_toffsets_, stream); + cuda::std::optional{edge_mask.data()}, + majors, + count, + this->offsets_, + stream); } __host__ void compute_number_of_edges_with_mask_async( @@ -794,7 +848,11 @@ class edge_partition_device_view_toffsets_, stream); + cuda::std::optional{edge_mask.data()}, + vertex_partition_range, + count, + this->offsets_, + stream); } template @@ -805,7 +863,12 @@ class edge_partition_device_view_t( - edge_mask, major_first, major_last, count, this->offsets_, stream); + cuda::std::optional{edge_mask.data()}, + major_first, + major_last, + count, + this->offsets_, + stream); } template @@ -813,14 +876,16 @@ class edge_partition_device_view_t count, rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async({}, majors, count, stream); + detail::compute_number_of_edges_with_mask_async_sg( + cuda::std::nullopt, majors, count, this->offsets_, stream); } __host__ void compute_number_of_edges_async(std::tuple vertex_partition_range, raft::device_span count, rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async({}, vertex_partition_range, count, stream); + detail::compute_number_of_edges_with_mask_async_sg( + cuda::std::nullopt, vertex_partition_range, count, this->offsets_, stream); } template @@ -829,7 +894,8 @@ class edge_partition_device_view_t count, rmm::cuda_stream_view stream) const { - compute_number_of_edges_with_mask_async({}, major_first, major_last, count, stream); + detail::compute_number_of_edges_with_mask_async_sg( + cuda::std::nullopt, major_first, major_last, count, this->offsets_, stream); } template @@ -899,7 +965,7 @@ class edge_partition_device_view_t edge_mask, rmm::cuda_stream_view stream) const { return detail::compute_local_degrees_with_mask_sg( - edge_mask, + cuda::std::optional{edge_mask.data()}, std::tuple{this->major_range_first(), this->major_range_last()}, this->offsets_, stream); @@ -911,7 +977,8 @@ class edge_partition_device_view_t majors, rmm::cuda_stream_view stream) const { - return detail::compute_local_degrees_with_mask_sg(edge_mask, majors, this->offsets_, stream); + return detail::compute_local_degrees_with_mask_sg( + cuda::std::optional{edge_mask.data()}, majors, this->offsets_, stream); } __host__ rmm::device_uvector compute_local_degrees_with_mask( @@ -920,7 +987,10 @@ class edge_partition_device_view_toffsets_, stream); + cuda::std::optional{edge_mask.data()}, + vertex_partition_range, + this->offsets_, + stream); } template @@ -931,31 +1001,26 @@ class edge_partition_device_view_t( - edge_mask, major_first, major_last, this->offsets_, stream); - } - - template , int> = 0> - __host__ rmm::device_uvector compute_local_degrees_with_mask( - raft::device_span edge_mask, - T* major_first, - T* major_last, - rmm::cuda_stream_view stream) const - { - return compute_local_degrees_with_mask( - edge_mask, static_cast(major_first), static_cast(major_last), stream); + cuda::std::optional{edge_mask.data()}, + major_first, + major_last, + this->offsets_, + stream); } template __host__ rmm::device_uvector compute_local_degrees(raft::device_span majors, rmm::cuda_stream_view stream) const { - return compute_local_degrees_with_mask({}, majors, stream); + return detail::compute_local_degrees_with_mask_sg( + cuda::std::nullopt, majors, this->offsets_, stream); } __host__ rmm::device_uvector compute_local_degrees( std::tuple vertex_partition_range, rmm::cuda_stream_view stream) const { - return compute_local_degrees_with_mask({}, vertex_partition_range, stream); + return detail::compute_local_degrees_with_mask_sg( + cuda::std::nullopt, vertex_partition_range, this->offsets_, stream); } template @@ -963,7 +1028,8 @@ class edge_partition_device_view_t( + cuda::std::nullopt, major_first, major_last, this->offsets_, stream); } __host__ __device__ vertex_t major_value_start_offset() const { return vertex_t{0}; } @@ -1033,7 +1099,8 @@ class edge_partition_device_view_t dcs_nzd_vertices() const + __host__ __device__ cuda::std::optional> dcs_nzd_vertices() + const { return cuda::std::nullopt; } diff --git a/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh b/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh index 5936351a2bd..5d4b3314545 100644 --- a/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh +++ b/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh @@ -896,18 +896,38 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle, detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, partition_idx) : cuda::std::nullopt; - auto frontier_majors = raft::device_span{ - thrust_tuple_get_or_identity(frontier_key_first), - static_cast(cuda::std::distance(frontier_key_first, frontier_key_last))}; - if (edge_partition_e_mask) { - auto edge_partition_e_mask_span = raft::device_span( - edge_partition_e_mask->value_first(), - packed_bool_size(static_cast(edge_partition.number_of_edges()))); - local_max_pushes = edge_partition.compute_number_of_edges_with_mask( - edge_partition_e_mask_span, frontier_majors, handle.get_stream()); + auto frontier_majors_first = + thrust_tuple_get_or_identity(frontier_key_first); + size_t frontier_majors_size = + static_cast(cuda::std::distance(frontier_key_first, frontier_key_last)); + + if constexpr (std::is_pointer_v>) { + auto frontier_majors = + raft::device_span(frontier_majors_first, frontier_majors_size); + if (edge_partition_e_mask) { + auto edge_partition_e_mask_span = raft::device_span( + edge_partition_e_mask->value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); + local_max_pushes = edge_partition.compute_number_of_edges_with_mask( + edge_partition_e_mask_span, frontier_majors, handle.get_stream()); + } else { + local_max_pushes = + edge_partition.compute_number_of_edges(frontier_majors, handle.get_stream()); + } } else { - local_max_pushes = - edge_partition.compute_number_of_edges(frontier_majors, handle.get_stream()); + if (edge_partition_e_mask) { + auto edge_partition_e_mask_span = raft::device_span( + edge_partition_e_mask->value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); + local_max_pushes = edge_partition.compute_number_of_edges_with_mask( + edge_partition_e_mask_span, + frontier_majors_first, + frontier_majors_first + frontier_majors_size, + handle.get_stream()); + } else { + local_max_pushes = edge_partition.compute_number_of_edges( + frontier_majors_first, frontier_majors_first + frontier_majors_size, handle.get_stream()); + } } } @@ -1467,12 +1487,12 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle, packed_bool_size(static_cast(edge_partition.number_of_edges()))); edge_partition.compute_number_of_edges_with_mask_async( edge_partition_e_mask_span, - raft::device_span{major_first, num_keys}, + raft::device_span(major_first, num_keys), raft::device_span(counters.data() + j, size_t{1}), loop_stream); } else { edge_partition.compute_number_of_edges_async( - raft::device_span{major_first, num_keys}, + raft::device_span(major_first, num_keys), raft::device_span(counters.data() + j, size_t{1}), loop_stream); } diff --git a/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh b/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh index 9ce8343ea20..26783b24cde 100644 --- a/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh @@ -1199,11 +1199,11 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, packed_bool_size(static_cast(edge_partition.number_of_edges()))); ret += edge_partition.compute_number_of_edges_with_mask( edge_partition_mask_span, - raft::device_span{local_frontier_vertex_first, frontier.size()}, + raft::device_span(local_frontier_vertex_first, frontier.size()), handle.get_stream()); } else { ret += edge_partition.compute_number_of_edges( - raft::device_span{local_frontier_vertex_first, frontier.size()}, + raft::device_span(local_frontier_vertex_first, frontier.size()), handle.get_stream()); } } diff --git a/cpp/src/structure/edge_partition_device_view_impl.cuh b/cpp/src/structure/edge_partition_device_view_impl.cuh index 3f081088521..9669d47a025 100644 --- a/cpp/src/structure/edge_partition_device_view_impl.cuh +++ b/cpp/src/structure/edge_partition_device_view_impl.cuh @@ -21,7 +21,7 @@ namespace detail { // ============================================================================ template __host__ void compute_number_of_edges_with_mask_async_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, raft::device_span count, cuda::std::optional> dcs_nzd_vertices, @@ -43,7 +43,7 @@ __host__ void compute_number_of_edges_with_mask_async_mg( template __host__ void compute_number_of_edges_with_mask_async_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple local_vertex_partition_range, raft::device_span count, cuda::std::optional> dcs_nzd_vertices, @@ -66,7 +66,7 @@ __host__ void compute_number_of_edges_with_mask_async_mg( template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, cuda::std::optional> dcs_nzd_vertices, vertex_t major_range_first, @@ -86,7 +86,7 @@ __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple local_vertex_partition_range, cuda::std::optional> dcs_nzd_vertices, vertex_t major_range_first, @@ -111,7 +111,7 @@ __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( template __host__ void compute_number_of_edges_with_mask_async_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, raft::device_span count, raft::device_span offsets, @@ -123,7 +123,7 @@ __host__ void compute_number_of_edges_with_mask_async_sg( template __host__ void compute_number_of_edges_with_mask_async_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple vertex_partition_range, raft::device_span count, raft::device_span offsets, @@ -140,7 +140,7 @@ __host__ void compute_number_of_edges_with_mask_async_sg( template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, raft::device_span offsets, rmm::cuda_stream_view stream) @@ -151,7 +151,7 @@ __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple vertex_partition_range, raft::device_span offsets, rmm::cuda_stream_view stream) diff --git a/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu b/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu index b64fb6259c5..2e02ed7165b 100644 --- a/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu +++ b/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu @@ -14,7 +14,7 @@ using vertex_t = int32_t; using edge_t = int32_t; template __host__ void compute_number_of_edges_with_mask_async_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, raft::device_span count, cuda::std::optional> dcs_nzd_vertices, @@ -24,7 +24,7 @@ template __host__ void compute_number_of_edges_with_mask_async_mg( rmm::cuda_stream_view stream); template __host__ void compute_number_of_edges_with_mask_async_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple vertex_partition_range, raft::device_span count, cuda::std::optional> dcs_nzd_vertices, @@ -34,7 +34,7 @@ template __host__ void compute_number_of_edges_with_mask_async_mg( rmm::cuda_stream_view stream); template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, cuda::std::optional> dcs_nzd_vertices, vertex_t major_range_first, @@ -43,7 +43,7 @@ template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg rmm::cuda_stream_view stream); template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple local_vertex_partition_range, cuda::std::optional> dcs_nzd_vertices, vertex_t major_range_first, diff --git a/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu b/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu index 324fe6c5416..3a8e5493b8f 100644 --- a/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu +++ b/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu @@ -14,7 +14,7 @@ using vertex_t = int64_t; using edge_t = int64_t; template __host__ void compute_number_of_edges_with_mask_async_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, raft::device_span count, cuda::std::optional> dcs_nzd_vertices, @@ -24,7 +24,7 @@ template __host__ void compute_number_of_edges_with_mask_async_mg( rmm::cuda_stream_view stream); template __host__ void compute_number_of_edges_with_mask_async_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple vertex_partition_range, raft::device_span count, cuda::std::optional> dcs_nzd_vertices, @@ -34,7 +34,7 @@ template __host__ void compute_number_of_edges_with_mask_async_mg( rmm::cuda_stream_view stream); template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, cuda::std::optional> dcs_nzd_vertices, vertex_t major_range_first, @@ -43,7 +43,7 @@ template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg rmm::cuda_stream_view stream); template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple local_vertex_partition_range, cuda::std::optional> dcs_nzd_vertices, vertex_t major_range_first, diff --git a/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu b/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu index 2a980737cb6..4bc538ba615 100644 --- a/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu +++ b/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu @@ -14,27 +14,27 @@ using vertex_t = int32_t; using edge_t = int32_t; template __host__ void compute_number_of_edges_with_mask_async_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, raft::device_span count, raft::device_span offsets, rmm::cuda_stream_view stream); template __host__ void compute_number_of_edges_with_mask_async_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple vertex_partition_range, raft::device_span count, raft::device_span offsets, rmm::cuda_stream_view stream); template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, raft::device_span offsets, rmm::cuda_stream_view stream); template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple vertex_partition_range, raft::device_span offsets, rmm::cuda_stream_view stream); diff --git a/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu b/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu index c8fd07909de..a5f279d9a1b 100644 --- a/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu +++ b/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu @@ -14,27 +14,27 @@ using vertex_t = int64_t; using edge_t = int64_t; template __host__ void compute_number_of_edges_with_mask_async_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, raft::device_span count, raft::device_span offsets, rmm::cuda_stream_view stream); template __host__ void compute_number_of_edges_with_mask_async_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple vertex_partition_range, raft::device_span count, raft::device_span offsets, rmm::cuda_stream_view stream); template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, raft::device_span majors, raft::device_span offsets, rmm::cuda_stream_view stream); template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( - raft::device_span edge_mask, + cuda::std::optional edge_mask, std::tuple vertex_partition_range, raft::device_span offsets, rmm::cuda_stream_view stream); diff --git a/cpp/src/traversal/od_shortest_distances_sg_v32_e32.cu b/cpp/src/traversal/od_shortest_distances_sg_v32_e32.cu index 25997878583..4e0a4d720b0 100644 --- a/cpp/src/traversal/od_shortest_distances_sg_v32_e32.cu +++ b/cpp/src/traversal/od_shortest_distances_sg_v32_e32.cu @@ -1,8 +1,7 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ -#include "structure/edge_partition_device_view_impl.cuh" #include "traversal/od_shortest_distances_impl.cuh" namespace cugraph { @@ -27,14 +26,4 @@ template rmm::device_uvector od_shortest_distances( double cutoff, bool do_expensive_check); -using view_t = edge_partition_device_view_t; -using od_extract_iter_t = - cuda::transform_iterator, uint64_t const*>; -template void view_t::compute_number_of_edges_with_mask_async( - raft::device_span, - od_extract_iter_t, - od_extract_iter_t, - raft::device_span, - rmm::cuda_stream_view) const; - } // namespace cugraph diff --git a/cpp/src/traversal/od_shortest_distances_sg_v64_e64.cu b/cpp/src/traversal/od_shortest_distances_sg_v64_e64.cu index a853a3328a0..37112ce5e68 100644 --- a/cpp/src/traversal/od_shortest_distances_sg_v64_e64.cu +++ b/cpp/src/traversal/od_shortest_distances_sg_v64_e64.cu @@ -1,8 +1,7 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ -#include "structure/edge_partition_device_view_impl.cuh" #include "traversal/od_shortest_distances_impl.cuh" namespace cugraph { @@ -27,14 +26,4 @@ template rmm::device_uvector od_shortest_distances( double cutoff, bool do_expensive_check); -using view_t = edge_partition_device_view_t; -using od_extract_iter_t = - cuda::transform_iterator, uint64_t const*>; -template void view_t::compute_number_of_edges_with_mask_async( - raft::device_span, - od_extract_iter_t, - od_extract_iter_t, - raft::device_span, - rmm::cuda_stream_view) const; - } // namespace cugraph