diff --git a/cudax/include/cuda/experimental/__group/fwd.cuh b/cudax/include/cuda/experimental/__group/fwd.cuh index e13f9b86115..fff355653be 100644 --- a/cudax/include/cuda/experimental/__group/fwd.cuh +++ b/cudax/include/cuda/experimental/__group/fwd.cuh @@ -69,7 +69,7 @@ class group; // mappings -template <::cuda::std::size_t _Np = ::cuda::std::dynamic_extent, bool _IsExhaustive = true> +template <::cuda::std::size_t _Count = ::cuda::std::dynamic_extent, bool _IsExhaustive = true> class group_by; template @@ -97,6 +97,13 @@ inline constexpr bool __is_this_group_v> = true; template inline constexpr bool __is_this_group_v> = true; +template +inline constexpr bool __is_group_mapping_v = false; +template <::cuda::std::size_t _Count, bool _IsExhaustive> +inline constexpr bool __is_group_mapping_v> = true; +template +inline constexpr bool __is_group_mapping_v> = true; + // tags struct non_exhaustive_t; diff --git a/cudax/include/cuda/experimental/__group/group.cuh b/cudax/include/cuda/experimental/__group/group.cuh index b755a3c92c5..2e82c242e1b 100644 --- a/cudax/include/cuda/experimental/__group/group.cuh +++ b/cudax/include/cuda/experimental/__group/group.cuh @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -57,8 +58,26 @@ class group // todo(dabayer): static_assert that _Unit is (under) typename _ParentGroup::unit_type + [[nodiscard]] _CCCL_DEVICE_API static constexpr auto + __get_initial_mapping_result(const _ParentGroup& __parent) noexcept + { + using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; + using _MappingResult = + ::cuda::experimental::__mapping_result<1, + ::cuda::experimental::__static_count_query_group<_Unit, _ParentGroup>(), + _ParentMappingResult::is_always_exhaustive(), + _ParentMappingResult::is_always_contiguous()>; + return _MappingResult{ + 1, + 0, + ::cuda::experimental::__count_query_group(__parent), + ::cuda::experimental::__rank_query_group(__parent)}; + } + using _ParentMappingResult = 
typename _ParentGroup::__mapping_result_type; - using _MappingResult = __group_mapping_result_t<_Mapping, _Unit, _ParentGroup>; + using _MappingResult = decltype(::cuda::std::declval().map( + ::cuda::std::declval(), + __get_initial_mapping_result(::cuda::std::declval()))); using _SynchronizerInstance = __group_synchronizer_instance_t<_Synchronizer, _Unit, _ParentGroup, _Mapping, _MappingResult>; static_assert(__group_mapping_result<_MappingResult>); @@ -72,16 +91,7 @@ class group [[nodiscard]] _CCCL_DEVICE_API static _MappingResult __do_mapping(const _Mapping& __mapping, const _ParentGroup& __parent) noexcept { - // Do not invoke the mapping for threads that are not part of the parent group. - if constexpr (!_ParentMappingResult::is_always_exhaustive()) - { - if (!__parent.__mapping_result().is_valid()) - { - return _MappingResult::invalid(); - } - } - - const auto __mapping_result = __mapping.map(_Unit{}, __parent); + const auto __mapping_result = __mapping.map(__parent, __get_initial_mapping_result(__parent)); if (__mapping_result.is_valid()) { _CCCL_ASSERT(__mapping_result.group_rank() < __mapping_result.group_count(), "invalid group rank"); diff --git a/cudax/include/cuda/experimental/__group/mapping/composite_mapping.cuh b/cudax/include/cuda/experimental/__group/mapping/composite_mapping.cuh new file mode 100644 index 00000000000..951877a550c --- /dev/null +++ b/cudax/include/cuda/experimental/__group/mapping/composite_mapping.cuh @@ -0,0 +1,140 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___GROUP_MAPPING_COMPOSITE_MAPPING_CUH +#define _CUDA_EXPERIMENTAL___GROUP_MAPPING_COMPOSITE_MAPPING_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include +#include +#include + +#include + +#if !defined(_CCCL_DOXYGEN_INVOKED) + +// todo(dabayer): do we want to always use uint32_t for all counts/ranks? + +namespace cuda::experimental +{ +template +class composite_mapping +{ + ::cuda::std::tuple<_Mappings...> __mappings_; + + template <::cuda::std::size_t _Ip = 0, class _ParentGroup, class _PrevMappingResult> + [[nodiscard]] _CCCL_DEVICE_API auto + __map_impl(const _ParentGroup& __parent, const _PrevMappingResult& __prev_mapping_result) const noexcept + { + const auto __result = ::cuda::std::get<_Ip>(__mappings_).map(__parent, __prev_mapping_result); + if constexpr (_Ip + 1 < sizeof...(_Mappings)) + { + return __map_impl<_Ip + 1>(__parent, __result); + } + else + { + return __result; + } + } + +public: + _CCCL_DEVICE_API constexpr composite_mapping(const _Mappings&... __mappings) noexcept( + ::cuda::std::__fold_and_v<::cuda::std::is_nothrow_copy_constructible_v<_Mappings>...>) + : __mappings_{__mappings...} + {} + + [[nodiscard]] _CCCL_DEVICE_API constexpr const ::cuda::std::tuple<_Mappings...>& get() const noexcept + { + return __mappings_; + } + + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup& __parent, const _PrevMappingResult& __prev_mapping_result) const noexcept + { + return __map_impl(__parent, __prev_mapping_result); + } +}; + +template +_CCCL_DEVICE composite_mapping(const _Mappings&...) 
-> composite_mapping<_Mappings...>; + +_CCCL_TEMPLATE(class _Lhs, class _Rhs) +_CCCL_REQUIRES(__is_group_mapping_v<_Lhs> _CCCL_AND __is_group_mapping_v<_Rhs>) +[[nodiscard]] _CCCL_DEVICE_API constexpr composite_mapping<_Lhs, _Rhs> +operator|(const _Lhs& __lhs, const _Rhs& __rhs) noexcept( + ::cuda::std::is_nothrow_constructible_v, const _Lhs&, const _Rhs&>) +{ + return {__lhs, __rhs}; +} + +_CCCL_TEMPLATE(class... _LhsMappings, class _Rhs) +_CCCL_REQUIRES(__is_group_mapping_v<_Rhs>) +[[nodiscard]] _CCCL_DEVICE_API constexpr composite_mapping<_LhsMappings..., _Rhs> +operator|(const composite_mapping<_LhsMappings...>& __lhs, const _Rhs& __rhs) noexcept( + ::cuda::std::is_nothrow_constructible_v, const _LhsMappings&..., const _Rhs&>) +{ + return ::cuda::std::apply( + [&](const auto&... __lhs_mappings) { + return composite_mapping{__lhs_mappings..., __rhs}; + }, + __lhs.get()); +} + +_CCCL_TEMPLATE(class _Lhs, class... _RhsMappings) +_CCCL_REQUIRES(__is_group_mapping_v<_Lhs>) +[[nodiscard]] _CCCL_DEVICE_API constexpr composite_mapping<_Lhs, _RhsMappings...> +operator|(const _Lhs& __lhs, const composite_mapping<_RhsMappings...>& __rhs) noexcept( + ::cuda::std::is_nothrow_constructible_v, const _Lhs&, const _RhsMappings&...>) +{ + return ::cuda::std::apply( + [&](const auto&... __rhs_mappings) { + return composite_mapping{__lhs, __rhs_mappings...}; + }, + __rhs.get()); +} + +template +[[nodiscard]] _CCCL_DEVICE_API constexpr composite_mapping<_LhsMappings..., _RhsMappings...> +operator|(const composite_mapping<_LhsMappings...>& __lhs, const composite_mapping<_RhsMappings...>& __rhs) noexcept( + ::cuda::std::is_nothrow_constructible_v, + const _LhsMappings&..., + const _RhsMappings&...>) +{ + return ::cuda::std::apply( + [&](const auto&... __lhs_mappings) { + return ::cuda::std::apply( + [&](const auto&... 
__rhs_mappings) { + return composite_mapping{__lhs_mappings..., __rhs_mappings...}; + }, + __rhs.get()); + }, + __lhs.get()); +} +} // namespace cuda::experimental + +#endif // !_CCCL_DOXYGEN_INVOKED + +#include + +#endif // _CUDA_EXPERIMENTAL___GROUP_MAPPING_COMPOSITE_MAPPING_CUH diff --git a/cudax/include/cuda/experimental/__group/mapping/group_as.cuh b/cudax/include/cuda/experimental/__group/mapping/group_as.cuh index 07656a3d045..9907f07098d 100644 --- a/cudax/include/cuda/experimental/__group/mapping/group_as.cuh +++ b/cudax/include/cuda/experimental/__group/mapping/group_as.cuh @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -54,83 +55,6 @@ class group_as<__group_as_static_tag<_Counts...>, _IsExhaustive> static constexpr auto __counts_sum = (0 + ... + _Counts); public: - template - struct __mapping_result - { - unsigned __group_rank_; - unsigned __count_; - unsigned __rank_; - - [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result invalid() noexcept - { - return {__invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank}; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept - { - return sizeof...(_Counts); - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept - { - return static_cast(sizeof...(_Counts)); - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __group_rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept - { - return ::cuda::std::dynamic_extent; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting count of thread that is not part of the group is UB"); - } - return __count_; - } - - [[nodiscard]] 
_CCCL_DEVICE_API unsigned rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); - } - return __rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept - { - if constexpr (is_always_exhaustive()) - { - return true; - } - else - { - return __rank_ != __invalid_count_or_rank; - } - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept - { - return _ParentIsAlwaysExhaustive && _IsExhaustive; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept - { - return _ParentIsAlwaysContiguous; - } - }; - _CCCL_HIDE_FROM_ABI explicit group_as() = default; _CCCL_TEMPLATE(bool _IsExhaustive2 = _IsExhaustive) @@ -170,56 +94,73 @@ public: return static_cast(static_count(__i)); } - template - [[nodiscard]] _CCCL_DEVICE_API auto map(const _Unit& __unit, const _ParentGroup& __parent) const noexcept + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup&, const _PrevMappingResult& __prev_mapping_result) const noexcept { - using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; + constexpr auto __static_prev_ngroups = _PrevMappingResult::static_group_count(); + constexpr auto __static_prev_nunits = _PrevMappingResult::static_count(); + constexpr auto __static_curr_ngroups = sizeof...(_Counts); + constexpr auto __static_ngroups = + (__static_prev_ngroups != ::cuda::std::dynamic_extent) + ? 
(__static_prev_ngroups * __static_curr_ngroups) + : ::cuda::std::dynamic_extent; + using _MappingResult = - __mapping_result<_ParentMappingResult::is_always_exhaustive(), _ParentMappingResult::is_always_contiguous()>; + __mapping_result<__static_ngroups, + ::cuda::std::dynamic_extent, + _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, + _PrevMappingResult::is_always_contiguous()>; + + if (!__prev_mapping_result.is_valid()) + { + return _MappingResult::invalid(); + } - constexpr auto __static_nunits = ::cuda::experimental::__static_count_query_group<_Unit, _ParentGroup>(); - const auto __nunits = _Unit::template count_as(__parent); - const auto __unit_rank = _Unit::template rank_as(__parent); - const auto __ngroups = static_cast(sizeof...(_Counts)); + const auto __prev_nunits = __prev_mapping_result.count(); + const auto __prev_unit_rank = __prev_mapping_result.rank(); + constexpr auto __curr_ngroups = static_cast(sizeof...(_Counts)); + const auto __ngroups = __prev_mapping_result.group_count() * __curr_ngroups; if constexpr (_IsExhaustive) { - if constexpr (__static_nunits != ::cuda::std::dynamic_extent) + if constexpr (__static_prev_nunits != ::cuda::std::dynamic_extent) { - static_assert(__static_nunits == __counts_sum, "group_as mapping _IsExhaustive precondition violation"); + static_assert(__static_prev_nunits == __counts_sum, "group_as mapping _IsExhaustive precondition violation"); } else { - _CCCL_ASSERT(__nunits == static_cast(__counts_sum), + _CCCL_ASSERT(__prev_nunits == static_cast(__counts_sum), "group_as mapping _IsExhaustive precondition violation"); } } else { - if constexpr (__static_nunits != ::cuda::std::dynamic_extent) + if constexpr (__static_prev_nunits != ::cuda::std::dynamic_extent) { - static_assert(__static_nunits >= __counts_sum, "group_as mapping requires more units than are available"); + static_assert(__static_prev_nunits >= __counts_sum, "group_as mapping requires more units than are available"); } else { - 
_CCCL_ASSERT(__nunits >= static_cast(__counts_sum), + _CCCL_ASSERT(__prev_nunits >= static_cast(__counts_sum), "group_as mapping requires more units than are available"); } - if (__unit_rank >= static_cast(__counts_sum)) + if (__prev_unit_rank >= static_cast(__counts_sum)) { - return _MappingResult::invalid(); + return _MappingResult::invalid_with_group_count(__ngroups); } } unsigned __sum = 0; _CCCL_PRAGMA_UNROLL_FULL() - for (unsigned __i = 0; __i < __ngroups; ++__i) + for (unsigned __i = 0; __i < __curr_ngroups; ++__i) { const auto __i_count = count(__i); - if (__unit_rank < __sum + __i_count) + if (__prev_unit_rank < __sum + __i_count) { - return _MappingResult{__i, __i_count, __unit_rank - __sum}; + return _MappingResult{ + __ngroups, __prev_mapping_result.group_rank() * __curr_ngroups + __i, __i_count, __prev_unit_rank - __sum}; } __sum += __i_count; } @@ -238,83 +179,6 @@ class group_as<__group_as_dynamic_tag<_GroupCount>, _IsExhaustive> unsigned __counts_[_GroupCount]; public: - template - struct __mapping_result - { - unsigned __group_rank_; - unsigned __count_; - unsigned __rank_; - - [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result invalid() noexcept - { - return {__invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank}; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept - { - return _GroupCount; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept - { - return static_cast(_GroupCount); - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __group_rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept - { - return ::cuda::std::dynamic_extent; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept - { 
- if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __count_; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); - } - return __rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept - { - if constexpr (is_always_exhaustive()) - { - return true; - } - else - { - return __rank_ != __invalid_count_or_rank; - } - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept - { - return _ParentIsAlwaysExhaustive && _IsExhaustive; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept - { - return _ParentIsAlwaysContiguous; - } - }; - _CCCL_TEMPLATE(bool _IsExhaustive2 = _IsExhaustive) _CCCL_REQUIRES(_IsExhaustive2) _CCCL_DEVICE_API explicit constexpr group_as(::cuda::std::span __counts) noexcept @@ -368,35 +232,54 @@ public: return __counts_[__i]; } - template - [[nodiscard]] _CCCL_DEVICE_API auto map(const _Unit& __unit, const _ParentGroup& __parent) const noexcept + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup&, const _PrevMappingResult& __prev_mapping_result) const noexcept { - using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; + constexpr auto __static_prev_ngroups = _PrevMappingResult::static_group_count(); + constexpr auto __static_prev_nunits = _PrevMappingResult::static_count(); + constexpr auto __static_curr_ngroups = _GroupCount; + constexpr auto __static_ngroups = + (__static_prev_ngroups != ::cuda::std::dynamic_extent) + ? 
(__static_prev_ngroups * __static_curr_ngroups) + : ::cuda::std::dynamic_extent; + using _MappingResult = - __mapping_result<_ParentMappingResult::is_always_exhaustive(), _ParentMappingResult::is_always_contiguous()>; + __mapping_result<__static_ngroups, + ::cuda::std::dynamic_extent, + _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, + _PrevMappingResult::is_always_contiguous()>; + + if (!__prev_mapping_result.is_valid()) + { + return _MappingResult::invalid(); + } - const auto __nunits = __unit.template count_as(__parent); - const auto __unit_rank = __unit.template rank_as(__parent); + const auto __prev_nunits = __prev_mapping_result.count(); + const auto __prev_unit_rank = __prev_mapping_result.rank(); + constexpr auto __curr_ngroups = static_cast(_GroupCount); + const auto __ngroups = __prev_mapping_result.group_count() * __curr_ngroups; // If the mapping is exhaustive, check the preconditions, otherwise remove the last partial group. if constexpr (_IsExhaustive) { - _CCCL_ASSERT(::cuda::std::accumulate(__counts_, __counts_ + _GroupCount, 0u) == __nunits, + _CCCL_ASSERT(::cuda::std::accumulate(__counts_, __counts_ + __curr_ngroups, 0u) == __prev_nunits, "group_as mapping _IsExhaustive precondition violation"); } - else if (__unit_rank >= ::cuda::std::accumulate(__counts_, __counts_ + _GroupCount, 0u)) + else if (__prev_unit_rank >= ::cuda::std::accumulate(__counts_, __counts_ + __curr_ngroups, 0u)) { - return _MappingResult::invalid(); + return _MappingResult::invalid_with_group_count(__ngroups); } unsigned __sum = 0; _CCCL_PRAGMA_UNROLL_FULL() - for (unsigned __i = 0; __i < _GroupCount; ++__i) + for (unsigned __i = 0; __i < __curr_ngroups; ++__i) { const auto __i_count = count(__i); - if (__unit_rank < __sum + __i_count) + if (__prev_unit_rank < __sum + __i_count) { - return _MappingResult{__i, __i_count, __unit_rank - __sum}; + return _MappingResult{ + __ngroups, __prev_mapping_result.group_rank() * __curr_ngroups + __i, __i_count, 
__prev_unit_rank - __sum}; } __sum += __i_count; } diff --git a/cudax/include/cuda/experimental/__group/mapping/group_by.cuh b/cudax/include/cuda/experimental/__group/mapping/group_by.cuh index 1739f9bb705..b13cda226f6 100644 --- a/cudax/include/cuda/experimental/__group/mapping/group_by.cuh +++ b/cudax/include/cuda/experimental/__group/mapping/group_by.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -57,86 +58,6 @@ class group_by static_assert(::cuda::std::in_range(_Count), "_Count must be within uint32_t range"); public: - template <::cuda::std::size_t _NGroups, bool _ParentIsAlwaysExhaustive, bool _ParentIsAlwaysContiguous> - struct __mapping_result - { - unsigned __group_count_; - unsigned __group_rank_; - unsigned __rank_; - - [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result - invalid(unsigned __group_count = __invalid_count_or_rank) noexcept - { - return {__group_count, __invalid_count_or_rank, __invalid_count_or_rank}; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept - { - return _NGroups; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept - { - _CCCL_ASSERT(__group_count_ != __invalid_count_or_rank, - "querying group_count() by a thread that was not part of the parent group is not allowed"); - return __group_count_; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __group_rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept - { - return _Count; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting count of thread that is not part of the group is UB"); - } - return static_cast(_Count); - } - - 
[[nodiscard]] _CCCL_DEVICE_API unsigned rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); - } - return __rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept - { - if constexpr (is_always_exhaustive()) - { - return true; - } - else - { - return __rank_ != __invalid_count_or_rank; - } - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept - { - return _ParentIsAlwaysExhaustive && _IsExhaustive; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept - { - return _ParentIsAlwaysContiguous; - } - }; - _CCCL_HIDE_FROM_ABI explicit group_by() = default; _CCCL_TEMPLATE(bool _IsExhaustive2 = _IsExhaustive) @@ -158,47 +79,62 @@ public: return static_cast(_Count); } - template - [[nodiscard]] _CCCL_DEVICE_API auto map(const _Unit& __unit, const _ParentGroup& __parent) const noexcept + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup& __parent, const _PrevMappingResult& __prev_mapping_result) const noexcept { - constexpr auto __static_nunits = ::cuda::experimental::__static_count_query_group<_Unit, _ParentGroup>(); + constexpr auto __static_prev_ngroups = _PrevMappingResult::static_group_count(); + constexpr auto __static_prev_nunits = _PrevMappingResult::static_count(); + constexpr auto __static_curr_ngroups = + (__static_prev_nunits != ::cuda::std::dynamic_extent) + ? __static_prev_nunits / _Count + : ::cuda::std::dynamic_extent; constexpr auto __static_ngroups = - (__static_nunits != ::cuda::std::dynamic_extent) ? __static_nunits / _Count : ::cuda::std::dynamic_extent; + (__static_prev_ngroups != ::cuda::std::dynamic_extent && __static_curr_ngroups != ::cuda::std::dynamic_extent) + ? 
(__static_prev_ngroups * __static_curr_ngroups) + : ::cuda::std::dynamic_extent; - using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; using _MappingResult = __mapping_result<__static_ngroups, - _ParentMappingResult::is_always_exhaustive(), - _ParentMappingResult::is_always_contiguous()>; + _Count, + _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, + _PrevMappingResult::is_always_contiguous()>; - const auto __nunits = _Unit::template count_as(__parent); - const auto __unit_rank = _Unit::template rank_as(__parent); + if (!__prev_mapping_result.is_valid()) + { + return _MappingResult::invalid(); + } - _MappingResult __ret{}; - __ret.__group_count_ = __nunits / count(); - __ret.__group_rank_ = __unit_rank / count(); - __ret.__rank_ = __unit_rank % count(); + const auto __prev_nunits = __prev_mapping_result.count(); + const auto __prev_unit_rank = __prev_mapping_result.rank(); + const auto __curr_ngroups = __prev_nunits / count(); + const auto __curr_group_rank = __prev_unit_rank / count(); + const auto __ngroups = __prev_mapping_result.group_count() * __curr_ngroups; // If the mapping is exhaustive, check the preconditions, otherwise return invalid mapping for the remainder. 
if constexpr (_IsExhaustive) { - if constexpr (__static_nunits != ::cuda::std::dynamic_extent) + if constexpr (__static_prev_nunits != ::cuda::std::dynamic_extent) { - static_assert(__static_nunits % _Count == 0, "group_by mapping _IsExhaustive precondition violation"); + static_assert(__static_prev_nunits % _Count == 0, "group_by mapping _IsExhaustive precondition violation"); } else { - _CCCL_ASSERT(__nunits % count() == 0, "group_by mapping _IsExhaustive precondition violation"); + _CCCL_ASSERT(__prev_nunits % count() == 0, "group_by mapping _IsExhaustive precondition violation"); } } - else if (__nunits % count() != 0) + else if (__prev_nunits % count() != 0) { - if (__ret.__group_rank_ >= __ret.__group_count_) + if (__curr_group_rank >= __curr_ngroups) { - return _MappingResult::invalid(__ret.__group_count_); + return _MappingResult::invalid_with_group_count(__ngroups); } } - return __ret; + return _MappingResult{ + __ngroups, + __prev_mapping_result.group_rank() * __curr_ngroups + __curr_group_rank, + count(), + __prev_unit_rank % count()}; } }; @@ -208,87 +144,6 @@ class group_by<::cuda::std::dynamic_extent, _IsExhaustive> unsigned __count_; public: - template - struct __mapping_result - { - unsigned __group_count_; - unsigned __group_rank_; - unsigned __count_; - unsigned __rank_; - - [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result - invalid(unsigned __group_count = __invalid_count_or_rank) noexcept - { - return {__group_count, __invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank}; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept - { - return ::cuda::std::dynamic_extent; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept - { - _CCCL_ASSERT(__group_count_ != __invalid_count_or_rank, - "querying group_count() by a thread that was not part of the parent group is not allowed"); - return __group_count_; - } - - [[nodiscard]] _CCCL_DEVICE_API 
unsigned group_rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __group_rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept - { - return ::cuda::std::dynamic_extent; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __count_; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); - } - return __rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept - { - if constexpr (is_always_exhaustive()) - { - return true; - } - else - { - return __rank_ != __invalid_count_or_rank; - } - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept - { - return _ParentIsAlwaysExhaustive && _IsExhaustive; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept - { - return _ParentIsAlwaysContiguous; - } - }; - _CCCL_DEVICE_API explicit constexpr group_by(unsigned __count) noexcept : __count_{__count} { @@ -318,56 +173,50 @@ public: return __count_; } - template - [[nodiscard]] _CCCL_DEVICE_API auto map(const _Unit& __unit, const _ParentGroup& __parent) const noexcept + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup& __parent, const _PrevMappingResult& __prev_mapping_result) const noexcept { - using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; using _MappingResult = - __mapping_result<_ParentMappingResult::is_always_exhaustive(), _ParentMappingResult::is_always_contiguous()>; + __mapping_result<::cuda::std::dynamic_extent, + 
::cuda::std::dynamic_extent, + _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, + _PrevMappingResult::is_always_contiguous()>; - const auto __nunits = __unit.template count_as(__parent); - const auto __unit_rank = __unit.template rank_as(__parent); + if (!__prev_mapping_result.is_valid()) + { + return _MappingResult::invalid(); + } - _MappingResult __ret{}; - __ret.__group_count_ = __nunits / __count_; - __ret.__group_rank_ = __unit_rank / __count_; - __ret.__count_ = __count_; - __ret.__rank_ = __unit_rank % __count_; + const auto __prev_nunits = __prev_mapping_result.count(); + const auto __prev_unit_rank = __prev_mapping_result.rank(); + const auto __curr_ngroups = __prev_nunits / __count_; + const auto __curr_group_rank = __prev_unit_rank / __count_; + const auto __ngroups = __prev_mapping_result.group_count() * __curr_ngroups; // If the mapping is exhaustive, check the preconditions, otherwise remove the last partial group. if constexpr (_IsExhaustive) { - _CCCL_ASSERT(__nunits % __count_ == 0, "group_by mapping _IsExhaustive precondition violation"); + _CCCL_ASSERT(__prev_nunits % __count_ == 0, "group_by mapping _IsExhaustive precondition violation"); } - else if (__nunits % __count_ != 0) + else if (__prev_nunits % __count_ != 0) { - if (__ret.__group_rank_ >= __ret.__group_count_) + if (__curr_group_rank >= __curr_ngroups) { - return _MappingResult::invalid(__ret.__group_count_); + return _MappingResult::invalid_with_group_count(__ngroups); } } - return __ret; + return _MappingResult{ + __ngroups, + __prev_mapping_result.group_rank() * __curr_ngroups + __curr_group_rank, + __count_, + __prev_unit_rank % __count_}; } }; _CCCL_DEVICE group_by(unsigned) -> group_by<::cuda::std::dynamic_extent>; _CCCL_DEVICE group_by(unsigned, const non_exhaustive_t&) -> group_by<::cuda::std::dynamic_extent, false>; - -template -_CCCL_DEVICE_API void __check_mapping_result(const _MappingResult& __mapping_result) noexcept -{ - // Don't check the mapping result if 
it's not valid. We can skip this check if mapping result is always exhaustive. - if constexpr (!_MappingResult::is_always_exhaustive()) - { - if (!__mapping_result.is_valid()) - { - return; - } - } - _CCCL_ASSERT(__mapping_result.group_rank() < __mapping_result.group_count(), "invalid group rank"); - _CCCL_ASSERT(__mapping_result.rank() < __mapping_result.count(), "invalid rank"); -} } // namespace cuda::experimental #endif // !_CCCL_DOXYGEN_INVOKED diff --git a/cudax/include/cuda/experimental/__group/mapping/mapping_result.cuh b/cudax/include/cuda/experimental/__group/mapping/mapping_result.cuh new file mode 100644 index 00000000000..303cd99a9e5 --- /dev/null +++ b/cudax/include/cuda/experimental/__group/mapping/mapping_result.cuh @@ -0,0 +1,145 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___GROUP_MAPPING_MAPPING_RESULT_CUH +#define _CUDA_EXPERIMENTAL___GROUP_MAPPING_MAPPING_RESULT_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +#include + +#if !defined(_CCCL_DOXYGEN_INVOKED) + +// todo(dabayer): do we want to always use uint32_t for all counts/ranks? 
+
+namespace cuda::experimental
+{
+template <::cuda::std::size_t _StaticGroupCount, ::cuda::std::size_t _StaticCount, bool _IsExhaustive, bool _IsContiguous>
+struct __mapping_result
+{
+  unsigned __group_count_;
+  unsigned __group_rank_;
+  unsigned __count_;
+  unsigned __rank_;
+
+  [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result invalid() noexcept
+  {
+    return {__invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank};
+  }
+
+  [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result
+  invalid_with_group_count(unsigned __group_count) noexcept
+  {
+    return {__group_count, __invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank};
+  }
+
+  [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept
+  {
+    return _StaticGroupCount;
+  }
+
+  [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept
+  {
+    if constexpr (_StaticGroupCount != ::cuda::std::dynamic_extent)
+    {
+      return static_cast<unsigned>(_StaticGroupCount);
+    }
+    else
+    {
+      if constexpr (!_IsExhaustive)
+      {
+        _CCCL_ASSERT(__group_count_ != __invalid_count_or_rank,
+                     "getting group count by a unit that was not part of the parent group is not allowed");
+      }
+      return __group_count_;
+    }
+  }
+
+  [[nodiscard]] _CCCL_DEVICE_API unsigned group_rank() const noexcept
+  {
+    if constexpr (!_IsExhaustive)
+    {
+      _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB");
+    }
+    return __group_rank_;
+  }
+
+  [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept
+  {
+    return _StaticCount;
+  }
+
+  [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept
+  {
+    if constexpr (_StaticCount != ::cuda::std::dynamic_extent)
+    {
+      return static_cast<unsigned>(_StaticCount);
+    }
+    else
+    {
+      if constexpr (!_IsExhaustive)
+      {
+        _CCCL_ASSERT(is_valid(), "getting count of thread that is not part of the group is UB");
+      }
+      return __count_;
+    }
+ } + + [[nodiscard]] _CCCL_DEVICE_API unsigned rank() const noexcept + { + if constexpr (!_IsExhaustive) + { + _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); + } + return __rank_; + } + + [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept + { + if constexpr (_IsExhaustive) + { + return true; + } + else + { + return __rank_ != __invalid_count_or_rank; + } + } + + [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept + { + return _IsExhaustive; + } + + [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept + { + return _IsContiguous; + } +}; +} // namespace cuda::experimental + +#endif // !_CCCL_DOXYGEN_INVOKED + +#include + +#endif // _CUDA_EXPERIMENTAL___GROUP_MAPPING_MAPPING_RESULT_CUH diff --git a/cudax/include/cuda/experimental/group.cuh b/cudax/include/cuda/experimental/group.cuh index 7f740aeba20..e1d8b0ce9a2 100644 --- a/cudax/include/cuda/experimental/group.cuh +++ b/cudax/include/cuda/experimental/group.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index f8703ac5f34..d6d61160060 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -129,6 +129,11 @@ cudax_add_catch2_test(test_target algorithm algorithm/copy.cu ) +cudax_add_catch2_test(test_target group.mapping.composite_mapping + group/mapping/composite_mapping.cu +) +target_compile_definitions(${test_target} PUBLIC _CUDAX_GROUP) + cudax_add_catch2_test(test_target group.mapping.group_as group/mapping/group_as.cu ) diff --git a/cudax/test/common/group_testing.cuh b/cudax/test/common/group_testing.cuh index b194aa8e74b..362396e2ba9 100644 --- a/cudax/test/common/group_testing.cuh +++ b/cudax/test/common/group_testing.cuh @@ -44,6 +44,54 @@ __device__ auto& get_barriers(const Level& level) noexcept return reinterpret_cast(global_barriers_storage); } } + +struct 
ThreadsInWarpMappingResult +{ + __device__ static constexpr ::cuda::std::size_t static_group_count() + { + return 1; + } + + __device__ unsigned group_count() const + { + return 1; + } + + __device__ unsigned group_rank() const + { + return 0; + } + + __device__ static constexpr ::cuda::std::size_t static_count() + { + return 32; + } + + __device__ unsigned count() const + { + return 32; + } + + __device__ unsigned rank() const + { + return cuda::gpu_thread.rank_as(cuda::warp); + } + + __device__ bool is_valid() const + { + return true; + } + + __device__ static constexpr bool is_always_exhaustive() noexcept + { + return true; + } + + __device__ static constexpr bool is_always_contiguous() noexcept + { + return true; + } +}; } // namespace #endif // COMMON_GROUP_CUH diff --git a/cudax/test/group/mapping/composite_mapping.cu b/cudax/test/group/mapping/composite_mapping.cu new file mode 100644 index 00000000000..504ca7e2fac --- /dev/null +++ b/cudax/test/group/mapping/composite_mapping.cu @@ -0,0 +1,191 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "group_testing.cuh" + +namespace +{ +template +__device__ void test_composite_mapping(const Mapping1& mapping1, const Mapping2& mapping2, Config config) +{ + using Mapping = cudax::composite_mapping; + + // Test construction from 2 mappings. 
+ { + cudax::composite_mapping mapping{mapping1, mapping2}; + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_nothrow_constructible_v + == (cuda::std::is_nothrow_copy_constructible_v + && cuda::std::is_nothrow_copy_constructible_v) ); + } + + // Test get(). + { + const cudax::composite_mapping mapping{mapping1, mapping2}; + static_assert(cuda::std::is_same_v&>); + static_assert(noexcept(mapping.get())); + + const auto& mapping1_ref = cuda::std::get<0>(mapping.get()); + CUDAX_CHECK(mapping1_ref.count() == 4); + + const auto& mapping2_ref = cuda::std::get<1>(mapping.get()); + CUDAX_CHECK(mapping2_ref.count(0) == 1); + CUDAX_CHECK(mapping2_ref.count(1) == 3); + } + + // Test map(...). + { + const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; + const cudax::composite_mapping mapping{mapping1, mapping2}; + + static_assert(cudax::__group_mapping_result); + + auto result = mapping.map(parent_group, prev_mapping_result); + using Result = decltype(result); + + const auto rank_in_warp = cuda::gpu_thread.rank_as(parent_group); + + if constexpr (Mapping1::static_count() != cuda::std::dynamic_extent + && Mapping2::static_group_count() != cuda::std::dynamic_extent) + { + static_assert(Result::static_group_count() == 16); + } + else + { + static_assert(Result::static_group_count() == cuda::std::dynamic_extent); + } + CUDAX_CHECK(result.group_count() == 16); + CUDAX_CHECK(result.group_rank() == (rank_in_warp / 4 * 2 + (rank_in_warp % 4 > 0))); + + static_assert(Result::static_count() == cuda::std::dynamic_extent); + CUDAX_CHECK(result.count() == ((rank_in_warp % 4 > 0) ? 3 : 1)); + CUDAX_CHECK(result.rank() == ((rank_in_warp % 4 > 0) ? (rank_in_warp % 4 - 1) : 0)); + + CUDAX_CHECK(result.is_valid()); + static_assert(Result::is_always_exhaustive()); + static_assert(Result::is_always_contiguous()); + } + + // Test operator|. 
+ { + auto mapping = mapping1 | mapping2; + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(mapping1 | mapping2)); + + const auto& mapping1_ref = cuda::std::get<0>(mapping.get()); + CUDAX_CHECK(mapping1_ref.count() == 4); + + const auto& mapping2_ref = cuda::std::get<1>(mapping.get()); + CUDAX_CHECK(mapping2_ref.count(0) == 1); + CUDAX_CHECK(mapping2_ref.count(1) == 3); + } + { + auto mapping = cudax::composite_mapping{mapping1} | mapping2; + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cudax::composite_mapping{mapping1} | mapping2)); + + const auto& mapping1_ref = cuda::std::get<0>(mapping.get()); + CUDAX_CHECK(mapping1_ref.count() == 4); + + const auto& mapping2_ref = cuda::std::get<1>(mapping.get()); + CUDAX_CHECK(mapping2_ref.count(0) == 1); + CUDAX_CHECK(mapping2_ref.count(1) == 3); + } + { + auto mapping = mapping1 | cudax::composite_mapping{mapping2}; + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(mapping1 | cudax::composite_mapping{mapping2})); + + const auto& mapping1_ref = cuda::std::get<0>(mapping.get()); + CUDAX_CHECK(mapping1_ref.count() == 4); + + const auto& mapping2_ref = cuda::std::get<1>(mapping.get()); + CUDAX_CHECK(mapping2_ref.count(0) == 1); + CUDAX_CHECK(mapping2_ref.count(1) == 3); + } + { + auto mapping = cudax::composite_mapping{mapping1} | cudax::composite_mapping{mapping2}; + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cudax::composite_mapping{mapping1} | cudax::composite_mapping{mapping2})); + + const auto& mapping1_ref = cuda::std::get<0>(mapping.get()); + CUDAX_CHECK(mapping1_ref.count() == 4); + + const auto& mapping2_ref = cuda::std::get<1>(mapping.get()); + CUDAX_CHECK(mapping2_ref.count(0) == 1); + CUDAX_CHECK(mapping2_ref.count(1) == 3); + } +} + +struct TestKernel +{ + template + __device__ void operator()(const Config& config) + { + { + const cudax::group_by<4> mapping1{}; + const cudax::group_as mapping2{cuda::std::integer_sequence{}}; + 
test_composite_mapping(mapping1, mapping2, config); + } + { + const cudax::group_by mapping1{4}; + const cudax::group_as mapping2{cuda::std::integer_sequence{}}; + test_composite_mapping(mapping1, mapping2, config); + } + { + const cudax::group_by<4> mapping1{}; + constexpr unsigned counts2[]{1, 3}; + const cudax::group_as mapping2{counts2}; + test_composite_mapping(mapping1, mapping2, config); + } + { + const cudax::group_by mapping1{4}; + constexpr unsigned counts2[]{1, 3}; + const cudax::group_as mapping2{counts2}; + test_composite_mapping(mapping1, mapping2, config); + } + } +}; +} // namespace + +C2H_TEST("Composite mapping", "[group]") +{ + const auto device = cuda::devices[0]; + + const cuda::stream stream{device}; + + { + const auto config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<8, 4>()); + cuda::launch(stream, config, TestKernel{}); + } + { + const auto config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims(dim3{8, 4})); + cuda::launch(stream, config, TestKernel{}); + } + + stream.sync(); +} diff --git a/cudax/test/group/mapping/group_as.cu b/cudax/test/group/mapping/group_as.cu index 9d262147f83..fed2f8ee35c 100644 --- a/cudax/test/group/mapping/group_as.cu +++ b/cudax/test/group/mapping/group_as.cu @@ -101,13 +101,14 @@ __device__ void test_group_as(Config config) // Test map(...). 
{ const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); const auto rank_in_warp = cuda::gpu_thread.rank(parent_group); @@ -197,13 +198,14 @@ __device__ void test_group_as(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{ns}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); const auto rank_in_warp = cuda::gpu_thread.rank_as(parent_group); @@ -315,13 +317,14 @@ __device__ void test_group_as_non_exhaustive(Config config) // Test map(...). 
{ const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); const auto rank_in_warp = cuda::gpu_thread.rank(parent_group); @@ -416,13 +419,14 @@ __device__ void test_group_as_non_exhaustive(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{ns, cudax::non_exhaustive}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); const auto rank_in_warp = cuda::gpu_thread.rank(parent_group); diff --git a/cudax/test/group/mapping/group_by.cu b/cudax/test/group/mapping/group_by.cu index a603a0d42d6..16dd8f09519 100644 --- a/cudax/test/group/mapping/group_by.cu +++ b/cudax/test/group/mapping/group_by.cu @@ -70,13 +70,14 @@ __device__ void test_group_by(Config config) // Test map(...). 
{ const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); static_assert(Result::static_group_count() == 32 / N); @@ -139,13 +140,14 @@ __device__ void test_group_by(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{N}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); static_assert(Result::static_group_count() == cuda::std::dynamic_extent); @@ -211,13 +213,14 @@ __device__ void test_group_by_non_exhaustive(Config config) // Test map(...). 
{ const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{cudax::non_exhaustive}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); static_assert(Result::static_group_count() == 32 / N); @@ -285,13 +288,14 @@ __device__ void test_group_by_non_exhaustive(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{N, cudax::non_exhaustive}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); static_assert(Result::static_group_count() == cuda::std::dynamic_extent); diff --git a/cudax/test/group/synchronizer/barrier_synchronizer.cu b/cudax/test/group/synchronizer/barrier_synchronizer.cu index 48c4bf868f8..95ed330fea1 100644 --- a/cudax/test/group/synchronizer/barrier_synchronizer.cu +++ b/cudax/test/group/synchronizer/barrier_synchronizer.cu @@ -92,11 +92,12 @@ __device__ void test_barrier_synchronizer(const Level& level, Config config) using Barrier = cuda::std::remove_all_extents_t>; const 
auto parent_group = cudax::make_this_group(level, config); + const ThreadsInWarpMappingResult prev_mapping_result; const cudax::group_by mapping{4}; const cudax::barrier_synchronizer synchronizer{barriers}; - const auto mapping_result = mapping.map(cuda::gpu_thread, parent_group); + const auto mapping_result = mapping.map(parent_group, prev_mapping_result); const auto synchronizer_instance = synchronizer.make_instance(cuda::gpu_thread, parent_group, mapping, mapping_result); diff --git a/cudax/test/group/synchronizer/lane_synchronizer.cu b/cudax/test/group/synchronizer/lane_synchronizer.cu index 242e83f776e..cedd12933b0 100644 --- a/cudax/test/group/synchronizer/lane_synchronizer.cu +++ b/cudax/test/group/synchronizer/lane_synchronizer.cu @@ -33,11 +33,12 @@ __device__ void test_lane_synchronizer(const Level& level, Config config) // Test make_instance(...). { const auto parent_group = cudax::make_this_group(level, config); + const ThreadsInWarpMappingResult prev_mapping_result; const cudax::group_by mapping{2}; const Synchronizer synchronizer{}; - const auto mapping_result = mapping.map(cuda::gpu_thread, parent_group); + const auto mapping_result = mapping.map(parent_group, prev_mapping_result); const auto synchronizer_instance = synchronizer.make_instance(cuda::gpu_thread, parent_group, mapping, mapping_result);