From ef1c01b882098fc842cb58b0032c3e46486f248f Mon Sep 17 00:00:00 2001 From: David Bayer Date: Thu, 30 Apr 2026 11:23:52 +0200 Subject: [PATCH 1/3] [cudax] Change the group mapping application logic --- .../include/cuda/experimental/__group/fwd.cuh | 2 +- .../cuda/experimental/__group/group.cuh | 31 +- .../experimental/__group/mapping/group_as.cuh | 243 ++++------------ .../experimental/__group/mapping/group_by.cuh | 273 ++++-------------- .../__group/mapping/mapping_result.cuh | 145 ++++++++++ cudax/test/common/group_testing.cuh | 48 +++ cudax/test/group/mapping/group_as.cu | 36 ++- cudax/test/group/mapping/group_by.cu | 36 ++- .../synchronizer/barrier_synchronizer.cu | 3 +- .../group/synchronizer/lane_synchronizer.cu | 3 +- 10 files changed, 377 insertions(+), 443 deletions(-) create mode 100644 cudax/include/cuda/experimental/__group/mapping/mapping_result.cuh diff --git a/cudax/include/cuda/experimental/__group/fwd.cuh b/cudax/include/cuda/experimental/__group/fwd.cuh index e13f9b86115..541dfc56d60 100644 --- a/cudax/include/cuda/experimental/__group/fwd.cuh +++ b/cudax/include/cuda/experimental/__group/fwd.cuh @@ -69,7 +69,7 @@ class group; // mappings -template <::cuda::std::size_t _Np = ::cuda::std::dynamic_extent, bool _IsExhaustive = true> +template <::cuda::std::size_t _Count = ::cuda::std::dynamic_extent, bool _IsExhaustive = true> class group_by; template diff --git a/cudax/include/cuda/experimental/__group/group.cuh b/cudax/include/cuda/experimental/__group/group.cuh index b755a3c92c5..30a16ab5d69 100644 --- a/cudax/include/cuda/experimental/__group/group.cuh +++ b/cudax/include/cuda/experimental/__group/group.cuh @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -57,8 +58,25 @@ class group // todo(dabayer): static_assert that _Unit is (under) typename _ParentGroup::unit_type + [[nodiscard]] _CCCL_DEVICE_API static auto __get_parent_mapping_result(const _ParentGroup& __parent) noexcept + { + using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; + using _MappingResult = + ::cuda::experimental::__mapping_result<1, + ::cuda::experimental::__static_count_query_group<_Unit, _ParentGroup>(), + _ParentMappingResult::is_always_exhaustive(), + _ParentMappingResult::is_always_contiguous()>; + return _MappingResult{ + 1, + 0, + ::cuda::experimental::__count_query_group(__parent), + ::cuda::experimental::__rank_query_group(__parent)}; + } + using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; - using _MappingResult = __group_mapping_result_t<_Mapping, _Unit, _ParentGroup>; + using _MappingResult = decltype(::cuda::std::declval().map( + ::cuda::std::declval(), + __get_parent_mapping_result(::cuda::std::declval()))); using _SynchronizerInstance = __group_synchronizer_instance_t<_Synchronizer, _Unit, _ParentGroup, _Mapping, _MappingResult>; static_assert(__group_mapping_result<_MappingResult>); @@ -72,16 +90,7 @@ class group [[nodiscard]] _CCCL_DEVICE_API static _MappingResult __do_mapping(const _Mapping& __mapping, const _ParentGroup& __parent) noexcept { - // Do not invoke the mapping for threads that are not part of the parent group. - if constexpr (!_ParentMappingResult::is_always_exhaustive()) - { - if (!__parent.__mapping_result().is_valid()) - { - return _MappingResult::invalid(); - } - } - - const auto __mapping_result = __mapping.map(_Unit{}, __parent); + const auto __mapping_result = __mapping.map(__parent, __get_parent_mapping_result(__parent)); if (__mapping_result.is_valid()) { _CCCL_ASSERT(__mapping_result.group_rank() < __mapping_result.group_count(), "invalid group rank"); diff --git a/cudax/include/cuda/experimental/__group/mapping/group_as.cuh b/cudax/include/cuda/experimental/__group/mapping/group_as.cuh index 07656a3d045..f3a3d5617cd 100644 --- a/cudax/include/cuda/experimental/__group/mapping/group_as.cuh +++ b/cudax/include/cuda/experimental/__group/mapping/group_as.cuh @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -54,83 +55,6 @@ class group_as<__group_as_static_tag<_Counts...>, _IsExhaustive> static constexpr auto __counts_sum = (0 + ... + _Counts); public: - template - struct __mapping_result - { - unsigned __group_rank_; - unsigned __count_; - unsigned __rank_; - - [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result invalid() noexcept - { - return {__invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank}; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept - { - return sizeof...(_Counts); - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept - { - return static_cast(sizeof...(_Counts)); - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __group_rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept - { - return ::cuda::std::dynamic_extent; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting count of thread that is not part of the group is UB"); - } - return __count_; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); - } - return __rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept - { - if constexpr (is_always_exhaustive()) - { - return true; - } - else - { - return __rank_ != __invalid_count_or_rank; - } - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept - { - return _ParentIsAlwaysExhaustive && _IsExhaustive; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept - { - return _ParentIsAlwaysContiguous; - } - }; - _CCCL_HIDE_FROM_ABI explicit group_as() = default; _CCCL_TEMPLATE(bool _IsExhaustive2 = _IsExhaustive) @@ -170,56 +94,68 @@ public: return static_cast(static_count(__i)); } - template - [[nodiscard]] _CCCL_DEVICE_API auto map(const _Unit& __unit, const _ParentGroup& __parent) const noexcept + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup&, const _PrevMappingResult& __prev_mapping_result) const noexcept { - using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; + constexpr auto __static_prev_ngroups = _PrevMappingResult::static_group_count(); + constexpr auto __static_prev_nunits = _PrevMappingResult::static_count(); + constexpr auto __static_curr_ngroups = sizeof...(_Counts); + constexpr auto __static_ngroups = + (__static_prev_ngroups != ::cuda::std::dynamic_extent) + ? (__static_prev_ngroups * __static_curr_ngroups) + : ::cuda::std::dynamic_extent; + using _MappingResult = - __mapping_result<_ParentMappingResult::is_always_exhaustive(), _ParentMappingResult::is_always_contiguous()>; + __mapping_result<__static_ngroups, + ::cuda::std::dynamic_extent, + _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, + _PrevMappingResult::is_always_contiguous()>; - constexpr auto __static_nunits = ::cuda::experimental::__static_count_query_group<_Unit, _ParentGroup>(); - const auto __nunits = _Unit::template count_as(__parent); - const auto __unit_rank = _Unit::template rank_as(__parent); - const auto __ngroups = static_cast(sizeof...(_Counts)); + const auto __prev_nunits = __prev_mapping_result.count(); + const auto __prev_unit_rank = __prev_mapping_result.rank(); + constexpr auto __curr_ngroups = static_cast(sizeof...(_Counts)); + const auto __ngroups = __prev_mapping_result.group_count() * __curr_ngroups; if constexpr (_IsExhaustive) { - if constexpr (__static_nunits != ::cuda::std::dynamic_extent) + if constexpr (__static_prev_nunits != ::cuda::std::dynamic_extent) { - static_assert(__static_nunits == __counts_sum, "group_as mapping _IsExhaustive precondition violation"); + static_assert(__static_prev_nunits == __counts_sum, "group_as mapping _IsExhaustive precondition violation"); } else { - _CCCL_ASSERT(__nunits == static_cast(__counts_sum), + _CCCL_ASSERT(__prev_nunits == static_cast(__counts_sum), "group_as mapping _IsExhaustive precondition violation"); } } else { - if constexpr (__static_nunits != ::cuda::std::dynamic_extent) + if constexpr (__static_prev_nunits != ::cuda::std::dynamic_extent) { - static_assert(__static_nunits >= __counts_sum, "group_as mapping requires more units than are available"); + static_assert(__static_prev_nunits >= __counts_sum, "group_as mapping requires more units than are available"); } else { - _CCCL_ASSERT(__nunits >= static_cast(__counts_sum), + _CCCL_ASSERT(__prev_nunits >= static_cast(__counts_sum), "group_as mapping requires more units than are available"); } - if (__unit_rank >= static_cast(__counts_sum)) + if (__prev_unit_rank >= static_cast(__counts_sum)) { - return _MappingResult::invalid(); + return _MappingResult::invalid_with_group_count(__ngroups); } } unsigned __sum = 0; _CCCL_PRAGMA_UNROLL_FULL() - for (unsigned __i = 0; __i < __ngroups; ++__i) + for (unsigned __i = 0; __i < __curr_ngroups; ++__i) { const auto __i_count = count(__i); - if (__unit_rank < __sum + __i_count) + if (__prev_unit_rank < __sum + __i_count) { - return _MappingResult{__i, __i_count, __unit_rank - __sum}; + return _MappingResult{ + __ngroups, __prev_mapping_result.group_rank() * __curr_ngroups + __i, __i_count, __prev_unit_rank - __sum}; } __sum += __i_count; } @@ -238,83 +174,6 @@ class group_as<__group_as_dynamic_tag<_GroupCount>, _IsExhaustive> unsigned __counts_[_GroupCount]; public: - template - struct __mapping_result - { - unsigned __group_rank_; - unsigned __count_; - unsigned __rank_; - - [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result invalid() noexcept - { - return {__invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank}; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept - { - return _GroupCount; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept - { - return static_cast(_GroupCount); - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __group_rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept - { - return ::cuda::std::dynamic_extent; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __count_; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); - } - return __rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept - { - if constexpr (is_always_exhaustive()) - { - return true; - } - else - { - return __rank_ != __invalid_count_or_rank; - } - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept - { - return _ParentIsAlwaysExhaustive && _IsExhaustive; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept - { - return _ParentIsAlwaysContiguous; - } - }; - _CCCL_TEMPLATE(bool _IsExhaustive2 = _IsExhaustive) _CCCL_REQUIRES(_IsExhaustive2) _CCCL_DEVICE_API explicit constexpr group_as(::cuda::std::span __counts) noexcept @@ -368,35 +227,49 @@ public: return __counts_[__i]; } - template - [[nodiscard]] _CCCL_DEVICE_API auto map(const _Unit& __unit, const _ParentGroup& __parent) const noexcept + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup&, const _PrevMappingResult& __prev_mapping_result) const noexcept { - using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; + constexpr auto __static_prev_ngroups = _PrevMappingResult::static_group_count(); + constexpr auto __static_prev_nunits = _PrevMappingResult::static_count(); + constexpr auto __static_curr_ngroups = _GroupCount; + constexpr auto __static_ngroups = + (__static_prev_ngroups != ::cuda::std::dynamic_extent) + ? (__static_prev_ngroups * __static_curr_ngroups) + : ::cuda::std::dynamic_extent; + using _MappingResult = - __mapping_result<_ParentMappingResult::is_always_exhaustive(), _ParentMappingResult::is_always_contiguous()>; + __mapping_result<__static_ngroups, + ::cuda::std::dynamic_extent, + _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, + _PrevMappingResult::is_always_contiguous()>; - const auto __nunits = __unit.template count_as(__parent); - const auto __unit_rank = __unit.template rank_as(__parent); + const auto __prev_nunits = __prev_mapping_result.count(); + const auto __prev_unit_rank = __prev_mapping_result.rank(); + constexpr auto __curr_ngroups = static_cast(_GroupCount); + const auto __ngroups = __prev_mapping_result.group_count() * __curr_ngroups; // If the mapping is exhaustive, check the preconditions, otherwise remove the last partial group. if constexpr (_IsExhaustive) { - _CCCL_ASSERT(::cuda::std::accumulate(__counts_, __counts_ + _GroupCount, 0u) == __nunits, + _CCCL_ASSERT(::cuda::std::accumulate(__counts_, __counts_ + __curr_ngroups, 0u) == __prev_nunits, "group_as mapping _IsExhaustive precondition violation"); } - else if (__unit_rank >= ::cuda::std::accumulate(__counts_, __counts_ + _GroupCount, 0u)) + else if (__prev_unit_rank >= ::cuda::std::accumulate(__counts_, __counts_ + __curr_ngroups, 0u)) { - return _MappingResult::invalid(); + return _MappingResult::invalid_with_group_count(__ngroups); } unsigned __sum = 0; _CCCL_PRAGMA_UNROLL_FULL() - for (unsigned __i = 0; __i < _GroupCount; ++__i) + for (unsigned __i = 0; __i < __curr_ngroups; ++__i) { const auto __i_count = count(__i); - if (__unit_rank < __sum + __i_count) + if (__prev_unit_rank < __sum + __i_count) { - return _MappingResult{__i, __i_count, __unit_rank - __sum}; + return _MappingResult{ + __ngroups, __prev_mapping_result.group_rank() * __curr_ngroups + __i, __i_count, __prev_unit_rank - __sum}; } __sum += __i_count; } diff --git a/cudax/include/cuda/experimental/__group/mapping/group_by.cuh b/cudax/include/cuda/experimental/__group/mapping/group_by.cuh index 1739f9bb705..b13cda226f6 100644 --- a/cudax/include/cuda/experimental/__group/mapping/group_by.cuh +++ b/cudax/include/cuda/experimental/__group/mapping/group_by.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -57,86 +58,6 @@ class group_by static_assert(::cuda::std::in_range(_Count), "_Count must be within uint32_t range"); public: - template <::cuda::std::size_t _NGroups, bool _ParentIsAlwaysExhaustive, bool _ParentIsAlwaysContiguous> - struct __mapping_result - { - unsigned __group_count_; - unsigned __group_rank_; - unsigned __rank_; - - [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result - invalid(unsigned __group_count = __invalid_count_or_rank) noexcept - { - return {__group_count, __invalid_count_or_rank, __invalid_count_or_rank}; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept - { - return _NGroups; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept - { - _CCCL_ASSERT(__group_count_ != __invalid_count_or_rank, - "querying group_count() by a thread that was not part of the parent group is not allowed"); - return __group_count_; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __group_rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept - { - return _Count; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting count of thread that is not part of the group is UB"); - } - return static_cast(_Count); - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); - } - return __rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept - { - if constexpr (is_always_exhaustive()) - { - return true; - } - else - { - return __rank_ != __invalid_count_or_rank; - } - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept - { - return _ParentIsAlwaysExhaustive && _IsExhaustive; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept - { - return _ParentIsAlwaysContiguous; - } - }; - _CCCL_HIDE_FROM_ABI explicit group_by() = default; _CCCL_TEMPLATE(bool _IsExhaustive2 = _IsExhaustive) @@ -158,47 +79,62 @@ public: return static_cast(_Count); } - template - [[nodiscard]] _CCCL_DEVICE_API auto map(const _Unit& __unit, const _ParentGroup& __parent) const noexcept + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup& __parent, const _PrevMappingResult& __prev_mapping_result) const noexcept { - constexpr auto __static_nunits = ::cuda::experimental::__static_count_query_group<_Unit, _ParentGroup>(); + constexpr auto __static_prev_ngroups = _PrevMappingResult::static_group_count(); + constexpr auto __static_prev_nunits = _PrevMappingResult::static_count(); + constexpr auto __static_curr_ngroups = + (__static_prev_nunits != ::cuda::std::dynamic_extent) + ? __static_prev_nunits / _Count + : ::cuda::std::dynamic_extent; constexpr auto __static_ngroups = - (__static_nunits != ::cuda::std::dynamic_extent) ? __static_nunits / _Count : ::cuda::std::dynamic_extent; + (__static_prev_ngroups != ::cuda::std::dynamic_extent && __static_curr_ngroups != ::cuda::std::dynamic_extent) + ? (__static_prev_ngroups * __static_curr_ngroups) + : ::cuda::std::dynamic_extent; - using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; using _MappingResult = __mapping_result<__static_ngroups, - _ParentMappingResult::is_always_exhaustive(), - _ParentMappingResult::is_always_contiguous()>; + _Count, + _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, + _PrevMappingResult::is_always_contiguous()>; - const auto __nunits = _Unit::template count_as(__parent); - const auto __unit_rank = _Unit::template rank_as(__parent); + if (!__prev_mapping_result.is_valid()) + { + return _MappingResult::invalid(); + } - _MappingResult __ret{}; - __ret.__group_count_ = __nunits / count(); - __ret.__group_rank_ = __unit_rank / count(); - __ret.__rank_ = __unit_rank % count(); + const auto __prev_nunits = __prev_mapping_result.count(); + const auto __prev_unit_rank = __prev_mapping_result.rank(); + const auto __curr_ngroups = __prev_nunits / count(); + const auto __curr_group_rank = __prev_unit_rank / count(); + const auto __ngroups = __prev_mapping_result.group_count() * __curr_ngroups; // If the mapping is exhaustive, check the preconditions, otherwise return invalid mapping for the remainder. if constexpr (_IsExhaustive) { - if constexpr (__static_nunits != ::cuda::std::dynamic_extent) + if constexpr (__static_prev_nunits != ::cuda::std::dynamic_extent) { - static_assert(__static_nunits % _Count == 0, "group_by mapping _IsExhaustive precondition violation"); + static_assert(__static_prev_nunits % _Count == 0, "group_by mapping _IsExhaustive precondition violation"); } else { - _CCCL_ASSERT(__nunits % count() == 0, "group_by mapping _IsExhaustive precondition violation"); + _CCCL_ASSERT(__prev_nunits % count() == 0, "group_by mapping _IsExhaustive precondition violation"); } } - else if (__nunits % count() != 0) + else if (__prev_nunits % count() != 0) { - if (__ret.__group_rank_ >= __ret.__group_count_) + if (__curr_group_rank >= __curr_ngroups) { - return _MappingResult::invalid(__ret.__group_count_); + return _MappingResult::invalid_with_group_count(__ngroups); } } - return __ret; + return _MappingResult{ + __ngroups, + __prev_mapping_result.group_rank() * __curr_ngroups + __curr_group_rank, + count(), + __prev_unit_rank % count()}; } }; @@ -208,87 +144,6 @@ class group_by<::cuda::std::dynamic_extent, _IsExhaustive> unsigned __count_; public: - template - struct __mapping_result - { - unsigned __group_count_; - unsigned __group_rank_; - unsigned __count_; - unsigned __rank_; - - [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result - invalid(unsigned __group_count = __invalid_count_or_rank) noexcept - { - return {__group_count, __invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank}; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept - { - return ::cuda::std::dynamic_extent; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept - { - _CCCL_ASSERT(__group_count_ != __invalid_count_or_rank, - "querying group_count() by a thread that was not part of the parent group is not allowed"); - return __group_count_; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __group_rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept - { - return ::cuda::std::dynamic_extent; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __count_; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); - } - return __rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept - { - if constexpr (is_always_exhaustive()) - { - return true; - } - else - { - return __rank_ != __invalid_count_or_rank; - } - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept - { - return _ParentIsAlwaysExhaustive && _IsExhaustive; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept - { - return _ParentIsAlwaysContiguous; - } - }; - _CCCL_DEVICE_API explicit constexpr group_by(unsigned __count) noexcept : __count_{__count} { @@ -318,56 +173,50 @@ public: return __count_; } - template - [[nodiscard]] _CCCL_DEVICE_API auto map(const _Unit& __unit, const _ParentGroup& __parent) const noexcept + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup& __parent, const _PrevMappingResult& __prev_mapping_result) const noexcept { - using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; using _MappingResult = - __mapping_result<_ParentMappingResult::is_always_exhaustive(), _ParentMappingResult::is_always_contiguous()>; + __mapping_result<::cuda::std::dynamic_extent, + ::cuda::std::dynamic_extent, + _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, + _PrevMappingResult::is_always_contiguous()>; - const auto __nunits = __unit.template count_as(__parent); - const auto __unit_rank = __unit.template rank_as(__parent); + if (!__prev_mapping_result.is_valid()) + { + return _MappingResult::invalid(); + } - _MappingResult __ret{}; - __ret.__group_count_ = __nunits / __count_; - __ret.__group_rank_ = __unit_rank / __count_; - __ret.__count_ = __count_; - __ret.__rank_ = __unit_rank % __count_; + const auto __prev_nunits = __prev_mapping_result.count(); + const auto __prev_unit_rank = __prev_mapping_result.rank(); + const auto __curr_ngroups = __prev_nunits / __count_; + const auto __curr_group_rank = __prev_unit_rank / __count_; + const auto __ngroups = __prev_mapping_result.group_count() * __curr_ngroups; // If the mapping is exhaustive, check the preconditions, otherwise remove the last partial group. if constexpr (_IsExhaustive) { - _CCCL_ASSERT(__nunits % __count_ == 0, "group_by mapping _IsExhaustive precondition violation"); + _CCCL_ASSERT(__prev_nunits % __count_ == 0, "group_by mapping _IsExhaustive precondition violation"); } - else if (__nunits % __count_ != 0) + else if (__prev_nunits % __count_ != 0) { - if (__ret.__group_rank_ >= __ret.__group_count_) + if (__curr_group_rank >= __curr_ngroups) { - return _MappingResult::invalid(__ret.__group_count_); + return _MappingResult::invalid_with_group_count(__ngroups); } } - return __ret; + return _MappingResult{ + __ngroups, + __prev_mapping_result.group_rank() * __curr_ngroups + __curr_group_rank, + __count_, + __prev_unit_rank % __count_}; } }; _CCCL_DEVICE group_by(unsigned) -> group_by<::cuda::std::dynamic_extent>; _CCCL_DEVICE group_by(unsigned, const non_exhaustive_t&) -> group_by<::cuda::std::dynamic_extent, false>; - -template -_CCCL_DEVICE_API void __check_mapping_result(const _MappingResult& __mapping_result) noexcept -{ - // Don't check the mapping result if it's not valid. We can skip this check if mapping result is always exhaustive. - if constexpr (!_MappingResult::is_always_exhaustive()) - { - if (!__mapping_result.is_valid()) - { - return; - } - } - _CCCL_ASSERT(__mapping_result.group_rank() < __mapping_result.group_count(), "invalid group rank"); - _CCCL_ASSERT(__mapping_result.rank() < __mapping_result.count(), "invalid rank"); -} } // namespace cuda::experimental #endif // !_CCCL_DOXYGEN_INVOKED diff --git a/cudax/include/cuda/experimental/__group/mapping/mapping_result.cuh b/cudax/include/cuda/experimental/__group/mapping/mapping_result.cuh new file mode 100644 index 00000000000..303cd99a9e5 --- /dev/null +++ b/cudax/include/cuda/experimental/__group/mapping/mapping_result.cuh @@ -0,0 +1,145 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___GROUP_MAPPING_MAPPING_RESULT_CUH +#define _CUDA_EXPERIMENTAL___GROUP_MAPPING_MAPPING_RESULT_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +#include + +#if !defined(_CCCL_DOXYGEN_INVOKED) + +// todo(dabayer): do we want to always use uint32_t for all counts/ranks? + +namespace cuda::experimental +{ +template <::cuda::std::size_t _StaticGroupCount, ::cuda::std::size_t _StaticCount, bool _IsExhaustive, bool _IsContiguous> +struct __mapping_result +{ + unsigned __group_count_; + unsigned __group_rank_; + unsigned __count_; + unsigned __rank_; + + [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result invalid() noexcept + { + return {__invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank}; + } + + [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result + invalid_with_group_count(unsigned __group_count) noexcept + { + return {__group_count, __invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank}; + } + + [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept + { + return _StaticGroupCount; + } + + [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept + { + if constexpr (_StaticGroupCount != ::cuda::std::dynamic_extent) + { + return static_cast(_StaticGroupCount); + } + else + { + if constexpr (!_IsExhaustive) + { + _CCCL_ASSERT(__group_count_ != __invalid_count_or_rank, + "getting group count by a unit that was not part of the parent group is not allowed"); + } + return __group_count_; + } + } + + [[nodiscard]] _CCCL_DEVICE_API unsigned group_rank() const noexcept + { + if constexpr (!_IsExhaustive) + { + _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); + } + return __group_rank_; + } + + [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept + { + return _StaticCount; + } + + [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept + { + if constexpr (_StaticCount != ::cuda::std::dynamic_extent) + { + return static_cast(_StaticCount); + } + else + { + if constexpr (!_IsExhaustive) + { + _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); + } + return __count_; + } + } + + [[nodiscard]] _CCCL_DEVICE_API unsigned rank() const noexcept + { + if constexpr (!_IsExhaustive) + { + _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); + } + return __rank_; + } + + [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept + { + if constexpr (_IsExhaustive) + { + return true; + } + else + { + return __rank_ != __invalid_count_or_rank; + } + } + + [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept + { + return _IsExhaustive; + } + + [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept + { + return _IsContiguous; + } +}; +} // namespace cuda::experimental + +#endif // !_CCCL_DOXYGEN_INVOKED + +#include + +#endif // _CUDA_EXPERIMENTAL___GROUP_MAPPING_MAPPING_RESULT_CUH diff --git a/cudax/test/common/group_testing.cuh b/cudax/test/common/group_testing.cuh index b194aa8e74b..362396e2ba9 100644 --- a/cudax/test/common/group_testing.cuh +++ b/cudax/test/common/group_testing.cuh @@ -44,6 +44,54 @@ __device__ auto& get_barriers(const Level& level) noexcept return reinterpret_cast(global_barriers_storage); } } + +struct ThreadsInWarpMappingResult +{ + __device__ static constexpr ::cuda::std::size_t static_group_count() + { + return 1; + } + + __device__ unsigned group_count() const + { + return 1; + } + + __device__ unsigned group_rank() const + { + return 0; + } + + __device__ static constexpr ::cuda::std::size_t static_count() + { + return 32; + } + + __device__ unsigned count() const + { + return 32; + } + + __device__ unsigned rank() const + { + return cuda::gpu_thread.rank_as(cuda::warp); + } + + __device__ bool is_valid() const + { + return true; + } + + __device__ static constexpr bool is_always_exhaustive() noexcept + { + return true; + } + + __device__ static constexpr bool is_always_contiguous() noexcept + { + return true; + } +}; } // namespace #endif // COMMON_GROUP_CUH diff --git a/cudax/test/group/mapping/group_as.cu b/cudax/test/group/mapping/group_as.cu index 9d262147f83..fed2f8ee35c 100644 --- a/cudax/test/group/mapping/group_as.cu +++ b/cudax/test/group/mapping/group_as.cu @@ -101,13 +101,14 @@ __device__ void test_group_as(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); const auto rank_in_warp = cuda::gpu_thread.rank(parent_group); @@ -197,13 +198,14 @@ __device__ void test_group_as(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{ns}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); const auto rank_in_warp = cuda::gpu_thread.rank_as(parent_group); @@ -315,13 +317,14 @@ __device__ void test_group_as_non_exhaustive(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); const auto rank_in_warp = cuda::gpu_thread.rank(parent_group); @@ -416,13 +419,14 @@ __device__ void test_group_as_non_exhaustive(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{ns, cudax::non_exhaustive}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); const auto rank_in_warp = cuda::gpu_thread.rank(parent_group); diff --git a/cudax/test/group/mapping/group_by.cu b/cudax/test/group/mapping/group_by.cu index a603a0d42d6..16dd8f09519 100644 --- a/cudax/test/group/mapping/group_by.cu +++ b/cudax/test/group/mapping/group_by.cu @@ -70,13 +70,14 @@ __device__ void test_group_by(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); static_assert(Result::static_group_count() == 32 / N); @@ -139,13 +140,14 @@ __device__ void test_group_by(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{N}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); static_assert(Result::static_group_count() == cuda::std::dynamic_extent); @@ -211,13 +213,14 @@ __device__ void test_group_by_non_exhaustive(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{cudax::non_exhaustive}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); static_assert(Result::static_group_count() == 32 / N); @@ -285,13 +288,14 @@ __device__ void test_group_by_non_exhaustive(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{N, cudax::non_exhaustive}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); static_assert(Result::static_group_count() == cuda::std::dynamic_extent); diff --git a/cudax/test/group/synchronizer/barrier_synchronizer.cu b/cudax/test/group/synchronizer/barrier_synchronizer.cu index 48c4bf868f8..95ed330fea1 100644 --- a/cudax/test/group/synchronizer/barrier_synchronizer.cu +++ b/cudax/test/group/synchronizer/barrier_synchronizer.cu @@ -92,11 +92,12 @@ __device__ void test_barrier_synchronizer(const Level& level, Config config) using Barrier = cuda::std::remove_all_extents_t>; const auto parent_group = cudax::make_this_group(level, config); + const ThreadsInWarpMappingResult prev_mapping_result; const cudax::group_by mapping{4}; const cudax::barrier_synchronizer synchronizer{barriers}; - const auto mapping_result = mapping.map(cuda::gpu_thread, parent_group); + const auto mapping_result = mapping.map(parent_group, prev_mapping_result); const auto synchronizer_instance = synchronizer.make_instance(cuda::gpu_thread, parent_group, mapping, mapping_result); diff --git a/cudax/test/group/synchronizer/lane_synchronizer.cu b/cudax/test/group/synchronizer/lane_synchronizer.cu index 242e83f776e..cedd12933b0 100644 --- a/cudax/test/group/synchronizer/lane_synchronizer.cu +++ b/cudax/test/group/synchronizer/lane_synchronizer.cu @@ -33,11 +33,12 @@ __device__ void test_lane_synchronizer(const Level& level, Config config) // Test make_instance(...). { const auto parent_group = cudax::make_this_group(level, config); + const ThreadsInWarpMappingResult prev_mapping_result; const cudax::group_by mapping{2}; const Synchronizer synchronizer{}; - const auto mapping_result = mapping.map(cuda::gpu_thread, parent_group); + const auto mapping_result = mapping.map(parent_group, prev_mapping_result); const auto synchronizer_instance = synchronizer.make_instance(cuda::gpu_thread, parent_group, mapping, mapping_result); From b5d3df762746390fa31c6dbbaf8db370cf5b6d42 Mon Sep 17 00:00:00 2001 From: David Bayer Date: Thu, 30 Apr 2026 11:50:31 +0200 Subject: [PATCH 2/3] try to fix nvcc on windows --- cudax/include/cuda/experimental/__group/group.cuh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cudax/include/cuda/experimental/__group/group.cuh b/cudax/include/cuda/experimental/__group/group.cuh index 30a16ab5d69..2e82c242e1b 100644 --- a/cudax/include/cuda/experimental/__group/group.cuh +++ b/cudax/include/cuda/experimental/__group/group.cuh @@ -58,7 +58,8 @@ class group // todo(dabayer): static_assert that _Unit is (under) typename _ParentGroup::unit_type - [[nodiscard]] _CCCL_DEVICE_API static auto __get_parent_mapping_result(const _ParentGroup& __parent) noexcept + [[nodiscard]] _CCCL_DEVICE_API static constexpr auto + __get_initial_mapping_result(const _ParentGroup& __parent) noexcept { using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; using _MappingResult = @@ -76,7 +77,7 @@ class group using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; using _MappingResult = decltype(::cuda::std::declval().map( ::cuda::std::declval(), - __get_parent_mapping_result(::cuda::std::declval()))); + __get_initial_mapping_result(::cuda::std::declval()))); using _SynchronizerInstance = __group_synchronizer_instance_t<_Synchronizer, _Unit, _ParentGroup, _Mapping, _MappingResult>; static_assert(__group_mapping_result<_MappingResult>); @@ -90,7 +91,7 @@ class group [[nodiscard]] _CCCL_DEVICE_API static _MappingResult __do_mapping(const _Mapping& __mapping, const _ParentGroup& __parent) noexcept { - const auto __mapping_result = __mapping.map(__parent, __get_parent_mapping_result(__parent)); + const auto __mapping_result = __mapping.map(__parent, __get_initial_mapping_result(__parent)); if (__mapping_result.is_valid()) { _CCCL_ASSERT(__mapping_result.group_rank() < __mapping_result.group_count(), "invalid group rank"); From a190751d6322fc507be2016d52014e6b61c6b1ee Mon Sep 17 00:00:00 2001 From: David Bayer Date: Thu, 30 Apr 2026 11:59:11 +0200 Subject: [PATCH 3/3] fix --- .../cuda/experimental/__group/mapping/group_as.cuh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cudax/include/cuda/experimental/__group/mapping/group_as.cuh b/cudax/include/cuda/experimental/__group/mapping/group_as.cuh index f3a3d5617cd..9907f07098d 100644 --- a/cudax/include/cuda/experimental/__group/mapping/group_as.cuh +++ b/cudax/include/cuda/experimental/__group/mapping/group_as.cuh @@ -112,6 +112,11 @@ public: _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, _PrevMappingResult::is_always_contiguous()>; + if (!__prev_mapping_result.is_valid()) + { + return _MappingResult::invalid(); + } + const auto __prev_nunits = __prev_mapping_result.count(); const auto __prev_unit_rank = __prev_mapping_result.rank(); constexpr auto __curr_ngroups = static_cast(sizeof...(_Counts)); @@ -245,6 +250,11 @@ public: _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, _PrevMappingResult::is_always_contiguous()>; + if (!__prev_mapping_result.is_valid()) + { + return _MappingResult::invalid(); + } + const auto __prev_nunits = __prev_mapping_result.count(); const auto __prev_unit_rank = __prev_mapping_result.rank(); constexpr auto __curr_ngroups = static_cast(_GroupCount);