diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1f3707fd21..418a8a8697 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -262,6 +262,8 @@ set(CUGRAPH_SG_SOURCES src/structure/graph_sg_v32_e32.cu src/structure/graph_view_sg_v64_e64.cu src/structure/graph_view_sg_v32_e32.cu + src/structure/edge_partition_device_view_sg_v64_e64.cu + src/structure/edge_partition_device_view_sg_v32_e32.cu src/structure/decompress_to_edgelist_sg_v64_e64.cu src/structure/decompress_to_edgelist_sg_v32_e32.cu src/structure/symmetrize_graph_sg_v64_e64.cu @@ -360,6 +362,8 @@ set(CUGRAPH_MG_SOURCES src/community/detail/maximal_independent_moves_mg_v32_e32.cu src/structure/graph_view_mg_v64_e64.cu src/structure/graph_view_mg_v32_e32.cu + src/structure/edge_partition_device_view_mg_v64_e64.cu + src/structure/edge_partition_device_view_mg_v32_e32.cu src/link_prediction/jaccard_mg_v64_e64.cu src/link_prediction/jaccard_mg_v32_e32.cu src/link_prediction/sorensen_mg_v64_e64.cu diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 827c0c3a09..7a57c789d9 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -22,10 +23,10 @@ #include #include #include -#include #include #include +#include #include namespace cugraph { @@ -47,50 +48,13 @@ __device__ cuda::std::optional major_hypersparse_idx_from_major_nochec : cuda::std::nullopt; } -template -struct local_degree_op_t { - raft::device_span offsets{}; - std::conditional_t major_range_first{}; - - std::conditional_t, std::byte /* dummy */> - dcs_nzd_vertices{}; - std::conditional_t major_hypersparse_first{}; - - __device__ return_type_t operator()(vertex_t major) const - { - if constexpr (multi_gpu) { - vertex_t idx{}; - if constexpr (use_dcs) { - if (major < major_hypersparse_first) { - idx = major - major_range_first; - return static_cast(offsets[idx + 1] - offsets[idx]); - } else { - auto major_hypersparse_idx = - major_hypersparse_idx_from_major_nocheck_impl(dcs_nzd_vertices, major); - if (major_hypersparse_idx) { - idx = (major_hypersparse_first - major_range_first) + *major_hypersparse_idx; - return static_cast(offsets[idx + 1] - offsets[idx]); - } else { - return return_type_t{0}; - } - } - } else { - idx = major - major_range_first; - return static_cast(offsets[idx + 1] - offsets[idx]); - } - } else { - return static_cast(offsets[major + 1] - offsets[major]); - } - } -}; - template -struct local_degree_with_mask_op_t { + typename MaskIterator = uint32_t const*> +struct local_degree_op_t { raft::device_span offsets{}; std::conditional_t major_range_first{}; @@ -98,7 +62,16 @@ struct local_degree_with_mask_op_t { dcs_nzd_vertices{}; std::conditional_t major_hypersparse_first{}; - MaskIterator mask_first{}; + cuda::std::optional mask_first{}; + + __device__ return_type_t compute_degree(edge_t offset, edge_t degree) const + { + if (mask_first.has_value() && *mask_first) { + return static_cast(count_set_bits(*mask_first, offset, degree)); + } else { + return static_cast(degree); + } + } __device__ return_type_t operator()(vertex_t major) const { @@ -107,27 +80,23 @@ struct local_degree_with_mask_op_t { if constexpr (use_dcs) { if (major < major_hypersparse_first) { idx = major - major_range_first; - return static_cast( - count_set_bits(mask_first, offsets[idx], offsets[idx + 1] - offsets[idx])); + return compute_degree(offsets[idx], offsets[idx + 1] - offsets[idx]); } else { auto major_hypersparse_idx = major_hypersparse_idx_from_major_nocheck_impl(dcs_nzd_vertices, major); if (major_hypersparse_idx) { idx = (major_hypersparse_first - major_range_first) + *major_hypersparse_idx; - return static_cast( - count_set_bits(mask_first, offsets[idx], offsets[idx + 1] - offsets[idx])); + return compute_degree(offsets[idx], offsets[idx + 1] - offsets[idx]); } else { return return_type_t{0}; } } } else { idx = major - major_range_first; - return static_cast( - count_set_bits(mask_first, offsets[idx], offsets[idx + 1] - offsets[idx])); + return compute_degree(offsets[idx], offsets[idx + 1] - offsets[idx]); } } else { - return static_cast( - count_set_bits(mask_first, offsets[major], offsets[major + 1] - offsets[major])); + return compute_degree(offsets[major], offsets[major + 1] - offsets[major]); } } }; @@ -181,6 +150,232 @@ class edge_partition_device_view_base_t { raft::device_span indices_{nullptr}; }; +template +__host__ void compute_number_of_edges_with_mask_async_mg( + cuda::std::optional edge_mask, + MajorIterator major_first, + MajorIterator major_last, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + if (cuda::std::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + return; + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + if (dcs_nzd_vertices) { + assert(major_hypersparse_first); + auto local_degree_first = cuda::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + offsets, major_range_first, *dcs_nzd_vertices, *major_hypersparse_first, edge_mask}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + } else { + auto local_degree_first = cuda::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + offsets, major_range_first, std::byte{0} /* dummy */, std::byte{0} /* dummy */, edge_mask}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + } +} + +template +__host__ void compute_number_of_edges_with_mask_async_mg( + cuda::std::optional edge_mask, + raft::device_span majors, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ void compute_number_of_edges_with_mask_async_mg( + cuda::std::optional edge_mask, + std::tuple local_vertex_partition_range, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + cuda::std::optional edge_mask, + MajorIterator major_first, + MajorIterator major_last, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); + if (dcs_nzd_vertices) { + assert(major_hypersparse_first); + thrust::transform( + rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_op_t{ + offsets, major_range_first, *dcs_nzd_vertices, *major_hypersparse_first, edge_mask}); + } else { + thrust::transform( + rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_op_t{ + offsets, major_range_first, std::byte{0} /* dummy */, std::byte{0} /* dummy */, edge_mask}); + } + return local_degrees; +} + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + cuda::std::optional edge_mask, + raft::device_span majors, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + cuda::std::optional edge_mask, + std::tuple local_vertex_partition_range, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ void compute_number_of_edges_with_mask_async_sg( + cuda::std::optional edge_mask, + MajorIterator major_first, + MajorIterator major_last, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + if (cuda::std::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + return; + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + auto local_degree_first = cuda::make_transform_iterator( + major_first, + detail::local_degree_op_t{offsets, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + edge_mask}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + cuda::std::distance(major_first, major_last), + stream); +} + +template +__host__ void compute_number_of_edges_with_mask_async_sg( + cuda::std::optional edge_mask, + raft::device_span majors, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ void compute_number_of_edges_with_mask_async_sg( + cuda::std::optional edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + cuda::std::optional edge_mask, + MajorIterator major_first, + MajorIterator major_last, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); + thrust::transform( + rmm::exec_policy_nosync(stream), + major_first, + major_last, + local_degrees.begin(), + detail::local_degree_op_t{offsets, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + edge_mask}); + return local_degrees; +} + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + cuda::std::optional edge_mask, + raft::device_span majors, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + cuda::std::optional edge_mask, + std::tuple vertex_partition_range, + raft::device_span offsets, + rmm::cuda_stream_view stream); + } // namespace detail template @@ -204,347 +399,290 @@ class edge_partition_device_view_t + __host__ void compute_number_of_edges_with_mask_async(raft::device_span edge_mask, + raft::device_span majors, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + detail::compute_number_of_edges_with_mask_async_mg( + cuda::std::optional{edge_mask.data()}, + majors, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } + + __host__ void compute_number_of_edges_with_mask_async( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + detail::compute_number_of_edges_with_mask_async_mg( + cuda::std::optional{edge_mask.data()}, + vertex_partition_range, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } + template - __host__ size_t compute_number_of_edges(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + __host__ void compute_number_of_edges_with_mask_async(raft::device_span edge_mask, + MajorIterator major_first, + MajorIterator major_last, + raft::device_span count, + rmm::cuda_stream_view stream) const { - if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; - return dcs_nzd_vertices_ ? thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - true>{this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - *major_hypersparse_first_}, - size_t{0}, - cuda::std::plus()) - : thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - false>{this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}, - size_t{0}, - cuda::std::plus()); + detail::compute_number_of_edges_with_mask_async_mg( + cuda::std::optional{edge_mask.data()}, + major_first, + major_last, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } + + template + __host__ void compute_number_of_edges_async(raft::device_span majors, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + detail::compute_number_of_edges_with_mask_async_mg(cuda::std::nullopt, + majors, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } + + __host__ void compute_number_of_edges_async(std::tuple vertex_partition_range, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + detail::compute_number_of_edges_with_mask_async_mg(cuda::std::nullopt, + vertex_partition_range, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } template __host__ void compute_number_of_edges_async(MajorIterator major_first, MajorIterator major_last, - raft::device_span count /* size = 1 */, + raft::device_span count, rmm::cuda_stream_view stream) const { - if (cuda::std::distance(major_first, major_last) == 0) { - RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); - } - - rmm::device_uvector d_tmp_storage(0, stream); - size_t tmp_storage_bytes{0}; - - if (dcs_nzd_vertices_) { - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_op_t{ - this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - } else { - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_op_t{ - this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - } + detail::compute_number_of_edges_with_mask_async_mg(cuda::std::nullopt, + major_first, + major_last, + count, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } - __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const + template + __host__ size_t compute_number_of_edges_with_mask(raft::device_span edge_mask, + raft::device_span majors, + rmm::cuda_stream_view stream) const { - rmm::device_uvector local_degrees(this->major_range_size(), stream); - if (dcs_nzd_vertices_) { - assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - major_hypersparse_first_.value_or(vertex_t{0})}); - } else { - thrust::transform( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); - } - return local_degrees; + rmm::device_scalar count(size_t{0}, stream); + detail::compute_number_of_edges_with_mask_async_mg( + cuda::std::optional{edge_mask.data()}, + majors, + raft::device_span{count.data(), 1}, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + return count.value(stream); } - template - __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + __host__ size_t + compute_number_of_edges_with_mask(raft::device_span edge_mask, + std::tuple vertex_partition_range, + rmm::cuda_stream_view stream) const { - rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); - if (dcs_nzd_vertices_) { - assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - major_range_first_, - dcs_nzd_vertices_.value(), - major_hypersparse_first_.value_or(vertex_t{0})}); - } else { - thrust::transform( - rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); - } - return local_degrees; + rmm::device_scalar count(size_t{0}, stream); + detail::compute_number_of_edges_with_mask_async_mg( + cuda::std::optional{edge_mask.data()}, + vertex_partition_range, + raft::device_span{count.data(), 1}, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + return count.value(stream); } - template - __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first, + template + __host__ size_t compute_number_of_edges_with_mask(raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, rmm::cuda_stream_view stream) const { - if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; - return dcs_nzd_vertices_ ? thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_with_mask_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - true, - MaskIterator>{this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - *major_hypersparse_first_, - mask_first}, - size_t{0}, - cuda::std::plus()) - : thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_with_mask_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - false, - MaskIterator>{this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}, - size_t{0}, - cuda::std::plus()); - } - - template - __host__ void compute_number_of_edges_with_mask_async( - MaskIterator mask_first, - MajorIterator major_first, - MajorIterator major_last, - raft::device_span count /* size = 1 */, - rmm::cuda_stream_view stream) const + rmm::device_scalar count(size_t{0}, stream); + detail::compute_number_of_edges_with_mask_async_mg( + cuda::std::optional{edge_mask.data()}, + major_first, + major_last, + raft::device_span{count.data(), 1}, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + return count.value(stream); + } + + template + __host__ size_t compute_number_of_edges(raft::device_span majors, + rmm::cuda_stream_view stream) const { - if (cuda::std::distance(major_first, major_last) == 0) { - RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); - } + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async(majors, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } - rmm::device_uvector d_tmp_storage(0, stream); - size_t tmp_storage_bytes{0}; + __host__ size_t compute_number_of_edges(std::tuple vertex_partition_range, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async( + vertex_partition_range, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } - if (dcs_nzd_vertices_) { - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_with_mask_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - true, - MaskIterator>{this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - *major_hypersparse_first_, - mask_first}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - } else { - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_with_mask_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - false, - MaskIterator>{this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - } + template + __host__ size_t compute_number_of_edges(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async( + major_first, major_last, raft::device_span{count.data(), 1}, stream); + return count.value(stream); } - template __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, rmm::cuda_stream_view stream) const + raft::device_span edge_mask, rmm::cuda_stream_view stream) const { - rmm::device_uvector local_degrees(this->major_range_size(), stream); - if (dcs_nzd_vertices_) { - assert(major_hypersparse_first_); - thrust::transform( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail:: - local_degree_with_mask_op_t{ - this->offsets_, - major_range_first_, - *dcs_nzd_vertices_, - major_hypersparse_first_.value_or(vertex_t{0}), - mask_first}); - } else { - thrust::transform( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail:: - local_degree_with_mask_op_t{ - this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); - } - return local_degrees; + return detail::compute_local_degrees_with_mask_mg( + cuda::std::optional{edge_mask.data()}, + std::tuple{this->major_range_first(), this->major_range_last()}, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } - template + template __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, + raft::device_span edge_mask, + raft::device_span majors, + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_mg( + cuda::std::optional{edge_mask.data()}, + majors, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } + + __host__ rmm::device_uvector compute_local_degrees_with_mask( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_mg( + cuda::std::optional{edge_mask.data()}, + vertex_partition_range, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } + + template + __host__ rmm::device_uvector compute_local_degrees_with_mask( + raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, rmm::cuda_stream_view stream) const { - rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); - if (dcs_nzd_vertices_) { - assert(major_hypersparse_first_); - thrust::transform( - rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail:: - local_degree_with_mask_op_t{ - this->offsets_, - major_range_first_, - dcs_nzd_vertices_.value(), - major_hypersparse_first_.value_or(vertex_t{0}), - mask_first}); - } else { - thrust::transform( - rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail:: - local_degree_with_mask_op_t{ - this->offsets_, - major_range_first_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); - } - return local_degrees; + return detail::compute_local_degrees_with_mask_mg( + cuda::std::optional{edge_mask.data()}, + major_first, + major_last, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } + + template + __host__ rmm::device_uvector compute_local_degrees(raft::device_span majors, + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_mg(cuda::std::nullopt, + majors, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } + + __host__ rmm::device_uvector compute_local_degrees( + std::tuple vertex_partition_range, rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_mg(cuda::std::nullopt, + vertex_partition_range, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); + } + + template + __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_mg(cuda::std::nullopt, + major_first, + major_last, + dcs_nzd_vertices_, + major_range_first_, + major_hypersparse_first_, + this->offsets_, + stream); } __host__ __device__ vertex_t major_value_start_offset() const @@ -641,12 +779,10 @@ class edge_partition_device_view_t> for consistency - // (see dcs_nzd_range_bitmap()) - __host__ __device__ cuda::std::optional dcs_nzd_vertices() const + __host__ __device__ cuda::std::optional> dcs_nzd_vertices() + const { - return dcs_nzd_vertices_ ? cuda::std::optional{(*dcs_nzd_vertices_).data()} - : cuda::std::nullopt; + return dcs_nzd_vertices_; } __host__ __device__ cuda::std::optional dcs_nzd_vertex_count() const @@ -691,164 +827,209 @@ class edge_partition_device_view_t + __host__ void compute_number_of_edges_with_mask_async(raft::device_span edge_mask, + raft::device_span majors, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + detail::compute_number_of_edges_with_mask_async_sg( + cuda::std::optional{edge_mask.data()}, + majors, + count, + this->offsets_, + stream); + } + + __host__ void compute_number_of_edges_with_mask_async( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + detail::compute_number_of_edges_with_mask_async_sg( + cuda::std::optional{edge_mask.data()}, + vertex_partition_range, + count, + this->offsets_, + stream); + } + template - __host__ size_t compute_number_of_edges(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const + __host__ void compute_number_of_edges_with_mask_async(raft::device_span edge_mask, + MajorIterator major_first, + MajorIterator major_last, + raft::device_span count, + rmm::cuda_stream_view stream) const { - if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; - return thrust::transform_reduce( - rmm::exec_policy(stream), + detail::compute_number_of_edges_with_mask_async_sg( + cuda::std::optional{edge_mask.data()}, major_first, major_last, - detail::local_degree_op_t{this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}, - size_t{0}, - cuda::std::plus()); + count, + this->offsets_, + stream); + } + + template + __host__ void compute_number_of_edges_async(raft::device_span majors, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + detail::compute_number_of_edges_with_mask_async_sg( + cuda::std::nullopt, majors, count, this->offsets_, stream); + } + + __host__ void compute_number_of_edges_async(std::tuple vertex_partition_range, + raft::device_span count, + rmm::cuda_stream_view stream) const + { + detail::compute_number_of_edges_with_mask_async_sg( + cuda::std::nullopt, vertex_partition_range, count, this->offsets_, stream); } template __host__ void compute_number_of_edges_async(MajorIterator major_first, MajorIterator major_last, - raft::device_span count /* size = 1 */, + raft::device_span count, rmm::cuda_stream_view stream) const { - if (cuda::std::distance(major_first, major_last) == 0) { - RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); - } - - rmm::device_uvector d_tmp_storage(0, stream); - size_t tmp_storage_bytes{0}; + detail::compute_number_of_edges_with_mask_async_sg( + cuda::std::nullopt, major_first, major_last, count, this->offsets_, stream); + } - auto local_degree_first = cuda::make_transform_iterator( - major_first, - detail::local_degree_op_t{this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}); - cub::DeviceReduce::Sum(static_cast(nullptr), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); - d_tmp_storage.resize(tmp_storage_bytes, stream); - cub::DeviceReduce::Sum(d_tmp_storage.data(), - tmp_storage_bytes, - local_degree_first, - count.data(), - cuda::std::distance(major_first, major_last), - stream); + template + __host__ size_t compute_number_of_edges_with_mask(raft::device_span edge_mask, + raft::device_span majors, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_with_mask_async( + edge_mask, majors, raft::device_span{count.data(), 1}, stream); + return count.value(stream); } - __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const + __host__ size_t + compute_number_of_edges_with_mask(raft::device_span edge_mask, + std::tuple vertex_partition_range, + rmm::cuda_stream_view stream) const { - rmm::device_uvector local_degrees(this->major_range_size(), stream); - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}); - return local_degrees; + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_with_mask_async( + edge_mask, vertex_partition_range, raft::device_span{count.data(), 1}, stream); + return count.value(stream); } template - __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, - MajorIterator major_last, - rmm::cuda_stream_view stream) const - { - rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); - thrust::transform(rmm::exec_policy_nosync(stream), - major_first, - major_last, - local_degrees.begin(), - detail::local_degree_op_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */}); - return local_degrees; - } - - template - __host__ size_t compute_number_of_edges_with_mask(MaskIterator mask_first, + __host__ size_t compute_number_of_edges_with_mask(raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, rmm::cuda_stream_view stream) const { - if (cuda::std::distance(major_first, major_last) == 0) return size_t{0}; - return thrust::transform_reduce( - rmm::exec_policy(stream), - major_first, - major_last, - detail::local_degree_with_mask_op_t< - vertex_t, - edge_t, - size_t /* no limit on majors.size(), so edge_t can overflow */, - multi_gpu, - false, - MaskIterator>{this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}, - size_t{0}, - cuda::std::plus()); - } - - template + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_with_mask_async( + edge_mask, major_first, major_last, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } + + template + __host__ size_t compute_number_of_edges(raft::device_span majors, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async(majors, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } + + __host__ size_t compute_number_of_edges(std::tuple vertex_partition_range, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async( + vertex_partition_range, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } + + template + __host__ size_t compute_number_of_edges(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + rmm::device_scalar count(size_t{0}, stream); + compute_number_of_edges_async( + major_first, major_last, raft::device_span{count.data(), 1}, stream); + return count.value(stream); + } + __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, rmm::cuda_stream_view stream) const + raft::device_span edge_mask, rmm::cuda_stream_view stream) const { - rmm::device_uvector local_degrees(this->major_range_size(), stream); - thrust::transform( - rmm::exec_policy_nosync(stream), - thrust::make_counting_iterator(this->major_range_first()), - thrust::make_counting_iterator(this->major_range_last()), - local_degrees.begin(), - detail::local_degree_with_mask_op_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); - return local_degrees; + return detail::compute_local_degrees_with_mask_sg( + cuda::std::optional{edge_mask.data()}, + std::tuple{this->major_range_first(), this->major_range_last()}, + this->offsets_, + stream); } - template + template __host__ rmm::device_uvector compute_local_degrees_with_mask( - MaskIterator mask_first, + raft::device_span edge_mask, + raft::device_span majors, + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_sg( + cuda::std::optional{edge_mask.data()}, majors, this->offsets_, stream); + } + + __host__ rmm::device_uvector compute_local_degrees_with_mask( + raft::device_span edge_mask, + std::tuple vertex_partition_range, + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_sg( + cuda::std::optional{edge_mask.data()}, + vertex_partition_range, + this->offsets_, + stream); + } + + template + __host__ rmm::device_uvector compute_local_degrees_with_mask( + raft::device_span edge_mask, MajorIterator major_first, MajorIterator major_last, rmm::cuda_stream_view stream) const { - rmm::device_uvector local_degrees(cuda::std::distance(major_first, major_last), stream); - thrust::transform( - rmm::exec_policy_nosync(stream), + return detail::compute_local_degrees_with_mask_sg( + cuda::std::optional{edge_mask.data()}, major_first, major_last, - local_degrees.begin(), - detail::local_degree_with_mask_op_t{ - this->offsets_, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - std::byte{0} /* dummy */, - mask_first}); - return local_degrees; + this->offsets_, + stream); + } + + template + __host__ rmm::device_uvector compute_local_degrees(raft::device_span majors, + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_sg( + cuda::std::nullopt, majors, this->offsets_, stream); + } + + __host__ rmm::device_uvector compute_local_degrees( + std::tuple vertex_partition_range, rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_sg( + cuda::std::nullopt, vertex_partition_range, this->offsets_, stream); + } + + template + __host__ rmm::device_uvector compute_local_degrees(MajorIterator major_first, + MajorIterator major_last, + rmm::cuda_stream_view stream) const + { + return detail::compute_local_degrees_with_mask_sg( + cuda::std::nullopt, major_first, major_last, this->offsets_, stream); } __host__ __device__ vertex_t major_value_start_offset() const { return vertex_t{0}; } @@ -918,7 +1099,8 @@ class edge_partition_device_view_t dcs_nzd_vertices() const + __host__ __device__ cuda::std::optional> dcs_nzd_vertices() + const { return cuda::std::nullopt; } @@ -932,4 +1114,19 @@ class edge_partition_device_view_t +struct sparse_hypersparse_major_op_t { + edge_partition_device_view_t edge_partition; + __device__ vertex_t operator()(vertex_t i) const + { + return edge_partition.major_from_major_idx_nocheck(i); + } +}; + +} // namespace detail + } // namespace cugraph diff --git a/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh b/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh index 38d90f0146..5d4b331454 100644 --- a/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh +++ b/cpp/include/cugraph/prims/detail/extract_transform_if_v_frontier_e.cuh @@ -896,19 +896,38 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle, detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, partition_idx) : cuda::std::nullopt; - auto frontier_major_first = + auto frontier_majors_first = thrust_tuple_get_or_identity(frontier_key_first); - auto frontier_major_last = - thrust_tuple_get_or_identity(frontier_key_last); - if (edge_partition_e_mask) { - local_max_pushes = - edge_partition.compute_number_of_edges_with_mask(edge_partition_e_mask->value_first(), - frontier_major_first, - frontier_major_last, - handle.get_stream()); + size_t frontier_majors_size = + static_cast(cuda::std::distance(frontier_key_first, frontier_key_last)); + + if constexpr (std::is_pointer_v>) { + auto frontier_majors = + raft::device_span(frontier_majors_first, frontier_majors_size); + if (edge_partition_e_mask) { + auto edge_partition_e_mask_span = raft::device_span( + edge_partition_e_mask->value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); + local_max_pushes = edge_partition.compute_number_of_edges_with_mask( + edge_partition_e_mask_span, frontier_majors, handle.get_stream()); + } else { + local_max_pushes = + edge_partition.compute_number_of_edges(frontier_majors, handle.get_stream()); + } } else { - local_max_pushes = edge_partition.compute_number_of_edges( - frontier_major_first, frontier_major_last, handle.get_stream()); + if (edge_partition_e_mask) { + auto edge_partition_e_mask_span = raft::device_span( + edge_partition_e_mask->value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); + local_max_pushes = edge_partition.compute_number_of_edges_with_mask( + edge_partition_e_mask_span, + frontier_majors_first, + frontier_majors_first + frontier_majors_size, + handle.get_stream()); + } else { + local_max_pushes = edge_partition.compute_number_of_edges( + frontier_majors_first, frontier_majors_first + frontier_majors_size, handle.get_stream()); + } } } @@ -1429,14 +1448,13 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle, if (keys.index() == 0) { auto major_first = cuda::make_transform_iterator( std::get<0>(keys).begin(), - cuda::proclaim_return_type( - [range_first = - local_frontier_range_firsts[partition_idx]] __device__(uint32_t v_offset) { - return range_first + static_cast(v_offset); - })); + detail::shift_right_t{local_frontier_range_firsts[partition_idx]}); if (edge_partition_e_mask) { - edge_partition.compute_number_of_edges_with_mask_async( + auto edge_partition_e_mask_span = raft::device_span( edge_partition_e_mask->value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); + edge_partition.compute_number_of_edges_with_mask_async( + edge_partition_e_mask_span, major_first, major_first + std::get<0>(keys).size(), raft::device_span(counters.data() + j, size_t{1}), @@ -1464,16 +1482,17 @@ extract_transform_if_v_frontier_e(raft::handle_t const& handle, } auto major_first = thrust_tuple_get_or_identity(key_first); if (edge_partition_e_mask) { - edge_partition.compute_number_of_edges_with_mask_async( + auto edge_partition_e_mask_span = raft::device_span( edge_partition_e_mask->value_first(), - major_first, - major_first + num_keys, + packed_bool_size(static_cast(edge_partition.number_of_edges()))); + edge_partition.compute_number_of_edges_with_mask_async( + edge_partition_e_mask_span, + raft::device_span(major_first, num_keys), raft::device_span(counters.data() + j, size_t{1}), loop_stream); } else { edge_partition.compute_number_of_edges_async( - major_first, - major_first + num_keys, + raft::device_span(major_first, num_keys), raft::device_span(counters.data() + j, size_t{1}), loop_stream); } diff --git a/cpp/include/cugraph/prims/detail/sample_and_compute_local_nbr_indices.cuh b/cpp/include/cugraph/prims/detail/sample_and_compute_local_nbr_indices.cuh index 325576b425..8e6324e36d 100644 --- a/cpp/include/cugraph/prims/detail/sample_and_compute_local_nbr_indices.cuh +++ b/cpp/include/cugraph/prims/detail/sample_and_compute_local_nbr_indices.cuh @@ -2414,7 +2414,9 @@ compute_aggregate_local_frontier_local_degrees(raft::handle_t const& handle, aggregate_local_frontier_major_first + local_frontier_offsets[i + 1], handle.get_stream()) : edge_partition.compute_local_degrees_with_mask( - (*edge_partition_e_mask).value_first(), + raft::device_span( + (*edge_partition_e_mask).value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))), aggregate_local_frontier_major_first + local_frontier_offsets[i], aggregate_local_frontier_major_first + local_frontier_offsets[i + 1], handle.get_stream()); diff --git a/cpp/include/cugraph/prims/detail/transform_v_frontier_e.cuh b/cpp/include/cugraph/prims/detail/transform_v_frontier_e.cuh index d516dbac6a..9e3e0b3ac6 100644 --- a/cpp/include/cugraph/prims/detail/transform_v_frontier_e.cuh +++ b/cpp/include/cugraph/prims/detail/transform_v_frontier_e.cuh @@ -424,10 +424,17 @@ auto transform_v_frontier_e(raft::handle_t const& handle, aggregate_local_frontier_key_first + local_frontier_offsets[i]; auto edge_partition_frontier_major_first = thrust_tuple_get_or_identity(edge_partition_frontier_key_first); + cuda::std::optional> edge_partition_mask_span{ + cuda::std::nullopt}; + if (edge_partition_e_mask) { + edge_partition_mask_span = raft::device_span( + (*edge_partition_e_mask).value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); + } auto edge_partition_frontier_local_degrees = edge_partition_e_mask ? edge_partition.compute_local_degrees_with_mask( - (*edge_partition_e_mask).value_first(), + edge_partition_mask_span.value(), edge_partition_frontier_major_first, edge_partition_frontier_major_first + (local_frontier_offsets[i + 1] - local_frontier_offsets[i]), diff --git a/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh index afb918cecf..3b8f1fa15d 100644 --- a/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh +++ b/cpp/include/cugraph/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh @@ -44,6 +44,7 @@ #include #include +#include #include namespace cugraph { @@ -353,6 +354,9 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( std::optional> offsets_with_mask{std::nullopt}; if (edge_partition_e_mask) { + auto edge_partition_mask_span = raft::device_span( + (*edge_partition_e_mask).value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); rmm::device_uvector degrees_with_mask(0, handle.get_stream()); if (edge_partition.dcs_nzd_vertices()) { auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); @@ -363,26 +367,18 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( major_sparse_range_size + *(edge_partition.dcs_nzd_vertex_count()), handle.get_stream()); auto major_first = cuda::make_transform_iterator( thrust::make_counting_iterator(vertex_t{0}), - cuda::proclaim_return_type( - [major_sparse_range_size, - major_range_first = edge_partition.major_range_first(), - dcs_nzd_vertices = *(edge_partition.dcs_nzd_vertices())] __device__(vertex_t i) { - if (i < major_sparse_range_size) { // sparse - return major_range_first + i; - } else { // hypersparse - return *(dcs_nzd_vertices + (i - major_sparse_range_size)); - } - })); + detail::sparse_hypersparse_major_op_t{ + edge_partition}); degrees_with_mask = - edge_partition.compute_local_degrees_with_mask((*edge_partition_e_mask).value_first(), + edge_partition.compute_local_degrees_with_mask(edge_partition_mask_span, major_first, major_first + degrees_with_mask.size(), handle.get_stream()); } else { degrees_with_mask = edge_partition.compute_local_degrees_with_mask( - (*edge_partition_e_mask).value_first(), - thrust::make_counting_iterator(edge_partition.major_range_first()), - thrust::make_counting_iterator(edge_partition.major_range_last()), + edge_partition_mask_span, + std::tuple{edge_partition.major_range_first(), + edge_partition.major_range_last()}, handle.get_stream()); } offsets_with_mask = diff --git a/cpp/include/cugraph/prims/transform_reduce_e_by_src_dst_key.cuh b/cpp/include/cugraph/prims/transform_reduce_e_by_src_dst_key.cuh index a13bf51d3a..61be8babfc 100644 --- a/cpp/include/cugraph/prims/transform_reduce_e_by_src_dst_key.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_e_by_src_dst_key.cuh @@ -556,8 +556,11 @@ transform_reduce_e_by_src_dst_key(raft::handle_t const& handle, rmm::device_uvector tmp_keys(0, handle.get_stream()); std::optional> edge_offsets_with_mask{std::nullopt}; if (edge_partition_e_mask) { - auto local_degrees = edge_partition.compute_local_degrees_with_mask( - (*edge_partition_e_mask).value_first(), handle.get_stream()); + auto edge_partition_mask_span = raft::device_span( + (*edge_partition_e_mask).value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); + auto local_degrees = edge_partition.compute_local_degrees_with_mask(edge_partition_mask_span, + handle.get_stream()); edge_offsets_with_mask = rmm::device_uvector(edge_partition.major_range_size() + 1, handle.get_stream()); (*edge_offsets_with_mask).set_element_to_zero_async(0, handle.get_stream()); diff --git a/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh b/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh index c8bb00ebf1..26783b24cd 100644 --- a/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_if_v_frontier_outgoing_e_by_dst.cuh @@ -1178,30 +1178,33 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, local_frontier_sizes[i], static_cast(i), handle.get_stream()); + auto edge_partition_frontier_vertices_span = raft::device_span{ + edge_partition_frontier_vertices.data(), edge_partition_frontier_vertices.size()}; if (edge_partition_e_mask) { - ret += - edge_partition.compute_number_of_edges_with_mask((*edge_partition_e_mask).value_first(), - edge_partition_frontier_vertices.begin(), - edge_partition_frontier_vertices.end(), - handle.get_stream()); + auto edge_partition_mask_span = raft::device_span( + (*edge_partition_e_mask).value_first(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); + ret += edge_partition.compute_number_of_edges_with_mask( + edge_partition_mask_span, edge_partition_frontier_vertices_span, handle.get_stream()); } else { - ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices.begin(), - edge_partition_frontier_vertices.end(), + ret += edge_partition.compute_number_of_edges(edge_partition_frontier_vertices_span, handle.get_stream()); } } else { assert(i == 0); if (edge_partition_e_mask) { - ret += edge_partition.compute_number_of_edges_with_mask( + auto edge_partition_mask_span = raft::device_span( (*edge_partition_e_mask).value_first(), - local_frontier_vertex_first, - local_frontier_vertex_first + frontier.size(), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); + ret += edge_partition.compute_number_of_edges_with_mask( + edge_partition_mask_span, + raft::device_span(local_frontier_vertex_first, frontier.size()), handle.get_stream()); } else { - ret += edge_partition.compute_number_of_edges(local_frontier_vertex_first, - local_frontier_vertex_first + frontier.size(), - handle.get_stream()); + ret += edge_partition.compute_number_of_edges( + raft::device_span(local_frontier_vertex_first, frontier.size()), + handle.get_stream()); } } } diff --git a/cpp/src/lookup/lookup_src_dst_impl.cuh b/cpp/src/lookup/lookup_src_dst_impl.cuh index bed45dc118..fcccc1b1be 100644 --- a/cpp/src/lookup/lookup_src_dst_impl.cuh +++ b/cpp/src/lookup/lookup_src_dst_impl.cuh @@ -588,10 +588,12 @@ EdgeTypeAndIdToSrcDstLookupContainerType build_edge_id_and_type_to_src_dst_looku auto number_of_local_edges = edge_partition.number_of_edges(); if (graph_view.has_edge_mask()) { - number_of_local_edges = edge_partition.compute_number_of_edges_with_mask( + auto edge_partition_mask_span = raft::device_span( (*edge_partition_mask_view).value_first(), - thrust::make_counting_iterator(edge_partition.major_range_first()), - thrust::make_counting_iterator(edge_partition.major_range_last()), + packed_bool_size(static_cast(edge_partition.number_of_edges()))); + number_of_local_edges = edge_partition.compute_number_of_edges_with_mask( + edge_partition_mask_span, + std::make_tuple(edge_partition.major_range_first(), edge_partition.major_range_last()), handle.get_stream()); } diff --git a/cpp/src/structure/edge_partition_device_view_impl.cuh b/cpp/src/structure/edge_partition_device_view_impl.cuh new file mode 100644 index 0000000000..9669d47a02 --- /dev/null +++ b/cpp/src/structure/edge_partition_device_view_impl.cuh @@ -0,0 +1,168 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#pragma once + +#include + +#include + +#include +#include + +#include + +namespace cugraph { + +namespace detail { +// ============================================================================ +// MG specialization: out-of-line definitions +// ============================================================================ +template +__host__ void compute_number_of_edges_with_mask_async_mg( + cuda::std::optional edge_mask, + raft::device_span majors, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + compute_number_of_edges_with_mask_async_mg(edge_mask, + majors.begin(), + majors.end(), + count, + dcs_nzd_vertices, + major_range_first, + major_hypersparse_first, + offsets, + stream); +} + +template +__host__ void compute_number_of_edges_with_mask_async_mg( + cuda::std::optional edge_mask, + std::tuple local_vertex_partition_range, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + compute_number_of_edges_with_mask_async_mg( + edge_mask, + thrust::make_counting_iterator(std::get<0>(local_vertex_partition_range)), + thrust::make_counting_iterator(std::get<1>(local_vertex_partition_range)), + count, + dcs_nzd_vertices, + major_range_first, + major_hypersparse_first, + offsets, + stream); +} + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + cuda::std::optional edge_mask, + raft::device_span majors, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + return compute_local_degrees_with_mask_mg(edge_mask, + majors.begin(), + majors.end(), + dcs_nzd_vertices, + major_range_first, + major_hypersparse_first, + offsets, + stream); +} + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + cuda::std::optional edge_mask, + std::tuple local_vertex_partition_range, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + return compute_local_degrees_with_mask_mg( + edge_mask, + thrust::make_counting_iterator(std::get<0>(local_vertex_partition_range)), + thrust::make_counting_iterator(std::get<1>(local_vertex_partition_range)), + dcs_nzd_vertices, + major_range_first, + major_hypersparse_first, + offsets, + stream); +} + +// ============================================================================ +// SG specialization: out-of-line definitions +// ============================================================================ + +template +__host__ void compute_number_of_edges_with_mask_async_sg( + cuda::std::optional edge_mask, + raft::device_span majors, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + compute_number_of_edges_with_mask_async_sg( + edge_mask, majors.begin(), majors.end(), count, offsets, stream); +} + +template +__host__ void compute_number_of_edges_with_mask_async_sg( + cuda::std::optional edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + compute_number_of_edges_with_mask_async_sg( + edge_mask, + thrust::make_counting_iterator(std::get<0>(vertex_partition_range)), + thrust::make_counting_iterator(std::get<1>(vertex_partition_range)), + count, + offsets, + stream); +} + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + cuda::std::optional edge_mask, + raft::device_span majors, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + return compute_local_degrees_with_mask_sg( + edge_mask, majors.begin(), majors.end(), offsets, stream); +} + +template +__host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + cuda::std::optional edge_mask, + std::tuple vertex_partition_range, + raft::device_span offsets, + rmm::cuda_stream_view stream) +{ + return compute_local_degrees_with_mask_sg( + edge_mask, + thrust::make_counting_iterator(std::get<0>(vertex_partition_range)), + thrust::make_counting_iterator(std::get<1>(vertex_partition_range)), + offsets, + stream); +} + +} // namespace detail +} // namespace cugraph diff --git a/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu b/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu new file mode 100644 index 0000000000..2e02ed7165 --- /dev/null +++ b/cpp/src/structure/edge_partition_device_view_mg_v32_e32.cu @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#include "edge_partition_device_view_impl.cuh" + +#include +#include + +namespace cugraph { +namespace detail { + +using vertex_t = int32_t; +using edge_t = int32_t; + +template __host__ void compute_number_of_edges_with_mask_async_mg( + cuda::std::optional edge_mask, + raft::device_span majors, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ void compute_number_of_edges_with_mask_async_mg( + cuda::std::optional edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + cuda::std::optional edge_mask, + raft::device_span majors, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + cuda::std::optional edge_mask, + std::tuple local_vertex_partition_range, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +} // namespace detail +} // namespace cugraph diff --git a/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu b/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu new file mode 100644 index 0000000000..3a8e5493b8 --- /dev/null +++ b/cpp/src/structure/edge_partition_device_view_mg_v64_e64.cu @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#include "edge_partition_device_view_impl.cuh" + +#include +#include + +namespace cugraph { +namespace detail { + +using vertex_t = int64_t; +using edge_t = int64_t; + +template __host__ void compute_number_of_edges_with_mask_async_mg( + cuda::std::optional edge_mask, + raft::device_span majors, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ void compute_number_of_edges_with_mask_async_mg( + cuda::std::optional edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + cuda::std::optional edge_mask, + raft::device_span majors, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector compute_local_degrees_with_mask_mg( + cuda::std::optional edge_mask, + std::tuple local_vertex_partition_range, + cuda::std::optional> dcs_nzd_vertices, + vertex_t major_range_first, + cuda::std::optional major_hypersparse_first, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +} // namespace detail +} // namespace cugraph diff --git a/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu b/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu new file mode 100644 index 0000000000..4bc538ba61 --- /dev/null +++ b/cpp/src/structure/edge_partition_device_view_sg_v32_e32.cu @@ -0,0 +1,43 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#include "edge_partition_device_view_impl.cuh" + +#include +#include + +namespace cugraph { +namespace detail { + +using vertex_t = int32_t; +using edge_t = int32_t; + +template __host__ void compute_number_of_edges_with_mask_async_sg( + cuda::std::optional edge_mask, + raft::device_span majors, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ void compute_number_of_edges_with_mask_async_sg( + cuda::std::optional edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + cuda::std::optional edge_mask, + raft::device_span majors, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + cuda::std::optional edge_mask, + std::tuple vertex_partition_range, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +} // namespace detail +} // namespace cugraph diff --git a/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu b/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu new file mode 100644 index 0000000000..a5f279d9a1 --- /dev/null +++ b/cpp/src/structure/edge_partition_device_view_sg_v64_e64.cu @@ -0,0 +1,43 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ +#include "edge_partition_device_view_impl.cuh" + +#include +#include + +namespace cugraph { +namespace detail { + +using vertex_t = int64_t; +using edge_t = int64_t; + +template __host__ void compute_number_of_edges_with_mask_async_sg( + cuda::std::optional edge_mask, + raft::device_span majors, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ void compute_number_of_edges_with_mask_async_sg( + cuda::std::optional edge_mask, + std::tuple vertex_partition_range, + raft::device_span count, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + cuda::std::optional edge_mask, + raft::device_span majors, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +template __host__ rmm::device_uvector compute_local_degrees_with_mask_sg( + cuda::std::optional edge_mask, + std::tuple vertex_partition_range, + raft::device_span offsets, + rmm::cuda_stream_view stream); + +} // namespace detail +} // namespace cugraph diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 1d89150ec7..7516ba0e8c 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -37,6 +37,7 @@ #include #include +#include #include namespace cugraph { @@ -325,17 +326,20 @@ void bfs(raft::handle_t const& handle, // partition_size * 0.5 & partition_size * // hypersparse_threshold_ratio * 0.5 as approximate out degrees if (edge_partition_e_mask) { - approx_out_degrees = edge_partition.compute_local_degrees_with_mask( + auto edge_partition_mask_span = raft::device_span( (*edge_partition_e_mask).value_first(), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + - high_and_mid_degree_segment_size, + packed_bool_size(static_cast(edge_partition.number_of_edges()))); + approx_out_degrees = edge_partition.compute_local_degrees_with_mask( + edge_partition_mask_span, + std::make_tuple(graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_first() + + static_cast(high_and_mid_degree_segment_size)), handle.get_stream()); } else { approx_out_degrees = edge_partition.compute_local_degrees( - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + - high_and_mid_degree_segment_size, + std::make_tuple(graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_first() + + static_cast(high_and_mid_degree_segment_size)), handle.get_stream()); } thrust::transform(handle.get_thrust_policy(), diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index 49af2bee8a..0e4bdb3018 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -348,7 +348,9 @@ kv_store_t filter_key_to_dist_ma edge_partition_device_view_t( graph_view.local_edge_partition_view(0)); auto num_edges = edge_partition.compute_number_of_edges( - near_bucket.vertex_begin(), near_bucket.vertex_end(), handle.get_stream()); + raft::device_span{near_bucket.vertex_begin(), near_bucket.size()}, + handle.get_stream()); + for (size_t i = 0; i < far_buffers.size(); ++i) { auto far_vertex_first = cuda::make_transform_iterator( far_buffers[i].begin(),