diff --git a/cudax/include/cuda/experimental/__group/fwd.cuh b/cudax/include/cuda/experimental/__group/fwd.cuh index e13f9b86115..fff355653be 100644 --- a/cudax/include/cuda/experimental/__group/fwd.cuh +++ b/cudax/include/cuda/experimental/__group/fwd.cuh @@ -69,7 +69,7 @@ class group; // mappings -template <::cuda::std::size_t _Np = ::cuda::std::dynamic_extent, bool _IsExhaustive = true> +template <::cuda::std::size_t _Count = ::cuda::std::dynamic_extent, bool _IsExhaustive = true> class group_by; template @@ -97,6 +97,13 @@ inline constexpr bool __is_this_group_v> = true; template inline constexpr bool __is_this_group_v> = true; +template +inline constexpr bool __is_group_mapping_v = false; +template <::cuda::std::size_t _Count, bool _IsExhaustive> +inline constexpr bool __is_group_mapping_v> = true; +template +inline constexpr bool __is_group_mapping_v> = true; + // tags struct non_exhaustive_t; diff --git a/cudax/include/cuda/experimental/__group/group.cuh b/cudax/include/cuda/experimental/__group/group.cuh index b755a3c92c5..2e82c242e1b 100644 --- a/cudax/include/cuda/experimental/__group/group.cuh +++ b/cudax/include/cuda/experimental/__group/group.cuh @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -57,8 +58,26 @@ class group // todo(dabayer): static_assert that _Unit is (under) typename _ParentGroup::unit_type + [[nodiscard]] _CCCL_DEVICE_API static constexpr auto + __get_initial_mapping_result(const _ParentGroup& __parent) noexcept + { + using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; + using _MappingResult = + ::cuda::experimental::__mapping_result<1, + ::cuda::experimental::__static_count_query_group<_Unit, _ParentGroup>(), + _ParentMappingResult::is_always_exhaustive(), + _ParentMappingResult::is_always_contiguous()>; + return _MappingResult{ + 1, + 0, + ::cuda::experimental::__count_query_group(__parent), + ::cuda::experimental::__rank_query_group(__parent)}; + } + using _ParentMappingResult = 
typename _ParentGroup::__mapping_result_type; - using _MappingResult = __group_mapping_result_t<_Mapping, _Unit, _ParentGroup>; + using _MappingResult = decltype(::cuda::std::declval().map( + ::cuda::std::declval(), + __get_initial_mapping_result(::cuda::std::declval()))); using _SynchronizerInstance = __group_synchronizer_instance_t<_Synchronizer, _Unit, _ParentGroup, _Mapping, _MappingResult>; static_assert(__group_mapping_result<_MappingResult>); @@ -72,16 +91,7 @@ class group [[nodiscard]] _CCCL_DEVICE_API static _MappingResult __do_mapping(const _Mapping& __mapping, const _ParentGroup& __parent) noexcept { - // Do not invoke the mapping for threads that are not part of the parent group. - if constexpr (!_ParentMappingResult::is_always_exhaustive()) - { - if (!__parent.__mapping_result().is_valid()) - { - return _MappingResult::invalid(); - } - } - - const auto __mapping_result = __mapping.map(_Unit{}, __parent); + const auto __mapping_result = __mapping.map(__parent, __get_initial_mapping_result(__parent)); if (__mapping_result.is_valid()) { _CCCL_ASSERT(__mapping_result.group_rank() < __mapping_result.group_count(), "invalid group rank"); diff --git a/cudax/include/cuda/experimental/__group/mapping/composite_mapping.cuh b/cudax/include/cuda/experimental/__group/mapping/composite_mapping.cuh new file mode 100644 index 00000000000..951877a550c --- /dev/null +++ b/cudax/include/cuda/experimental/__group/mapping/composite_mapping.cuh @@ -0,0 +1,140 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___GROUP_MAPPING_COMPOSITE_MAPPING_CUH +#define _CUDA_EXPERIMENTAL___GROUP_MAPPING_COMPOSITE_MAPPING_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include +#include +#include + +#include + +#if !defined(_CCCL_DOXYGEN_INVOKED) + +// todo(dabayer): do we want to always use uint32_t for all counts/ranks? + +namespace cuda::experimental +{ +template +class composite_mapping +{ + ::cuda::std::tuple<_Mappings...> __mappings_; + + template <::cuda::std::size_t _Ip = 0, class _ParentGroup, class _PrevMappingResult> + [[nodiscard]] _CCCL_DEVICE_API auto + __map_impl(const _ParentGroup& __parent, const _PrevMappingResult& __prev_mapping_result) const noexcept + { + const auto __result = ::cuda::std::get<_Ip>(__mappings_).map(__parent, __prev_mapping_result); + if constexpr (_Ip + 1 < sizeof...(_Mappings)) + { + return __map_impl<_Ip + 1>(__parent, __result); + } + else + { + return __result; + } + } + +public: + _CCCL_DEVICE_API constexpr composite_mapping(const _Mappings&... __mappings) noexcept( + ::cuda::std::__fold_and_v<::cuda::std::is_nothrow_copy_constructible_v<_Mappings>...>) + : __mappings_{__mappings...} + {} + + [[nodiscard]] _CCCL_DEVICE_API constexpr const ::cuda::std::tuple<_Mappings...>& get() const noexcept + { + return __mappings_; + } + + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup& __parent, const _PrevMappingResult& __prev_mapping_result) const noexcept + { + return __map_impl(__parent, __prev_mapping_result); + } +}; + +template +_CCCL_DEVICE composite_mapping(const _Mappings&...) 
-> composite_mapping<_Mappings...>; + +_CCCL_TEMPLATE(class _Lhs, class _Rhs) +_CCCL_REQUIRES(__is_group_mapping_v<_Lhs> _CCCL_AND __is_group_mapping_v<_Rhs>) +[[nodiscard]] _CCCL_DEVICE_API constexpr composite_mapping<_Lhs, _Rhs> +operator|(const _Lhs& __lhs, const _Rhs& __rhs) noexcept( + ::cuda::std::is_nothrow_constructible_v, const _Lhs&, const _Rhs&>) +{ + return {__lhs, __rhs}; +} + +_CCCL_TEMPLATE(class... _LhsMappings, class _Rhs) +_CCCL_REQUIRES(__is_group_mapping_v<_Rhs>) +[[nodiscard]] _CCCL_DEVICE_API constexpr composite_mapping<_LhsMappings..., _Rhs> +operator|(const composite_mapping<_LhsMappings...>& __lhs, const _Rhs& __rhs) noexcept( + ::cuda::std::is_nothrow_constructible_v, const _LhsMappings&..., const _Rhs&>) +{ + return ::cuda::std::apply( + [&](const auto&... __lhs_mappings) { + return composite_mapping{__lhs_mappings..., __rhs}; + }, + __lhs.get()); +} + +_CCCL_TEMPLATE(class _Lhs, class... _RhsMappings) +_CCCL_REQUIRES(__is_group_mapping_v<_Lhs>) +[[nodiscard]] _CCCL_DEVICE_API constexpr composite_mapping<_Lhs, _RhsMappings...> +operator|(const _Lhs& __lhs, const composite_mapping<_RhsMappings...>& __rhs) noexcept( + ::cuda::std::is_nothrow_constructible_v, const _Lhs&, const _RhsMappings&...>) +{ + return ::cuda::std::apply( + [&](const auto&... __rhs_mappings) { + return composite_mapping{__lhs, __rhs_mappings...}; + }, + __rhs.get()); +} + +template +[[nodiscard]] _CCCL_DEVICE_API constexpr composite_mapping<_LhsMappings..., _RhsMappings...> +operator|(const composite_mapping<_LhsMappings...>& __lhs, const composite_mapping<_RhsMappings...>& __rhs) noexcept( + ::cuda::std::is_nothrow_constructible_v, + const _LhsMappings&..., + const _RhsMappings&...>) +{ + return ::cuda::std::apply( + [&](const auto&... __lhs_mappings) { + return ::cuda::std::apply( + [&](const auto&... 
__rhs_mappings) { + return composite_mapping{__lhs_mappings..., __rhs_mappings...}; + }, + __rhs.get()); + }, + __lhs.get()); +} +} // namespace cuda::experimental + +#endif // !_CCCL_DOXYGEN_INVOKED + +#include + +#endif // _CUDA_EXPERIMENTAL___GROUP_MAPPING_COMPOSITE_MAPPING_CUH diff --git a/cudax/include/cuda/experimental/__group/mapping/group_as.cuh b/cudax/include/cuda/experimental/__group/mapping/group_as.cuh index 07656a3d045..9907f07098d 100644 --- a/cudax/include/cuda/experimental/__group/mapping/group_as.cuh +++ b/cudax/include/cuda/experimental/__group/mapping/group_as.cuh @@ -31,6 +31,7 @@ #include #include +#include #include #include @@ -54,83 +55,6 @@ class group_as<__group_as_static_tag<_Counts...>, _IsExhaustive> static constexpr auto __counts_sum = (0 + ... + _Counts); public: - template - struct __mapping_result - { - unsigned __group_rank_; - unsigned __count_; - unsigned __rank_; - - [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result invalid() noexcept - { - return {__invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank}; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept - { - return sizeof...(_Counts); - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept - { - return static_cast(sizeof...(_Counts)); - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __group_rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept - { - return ::cuda::std::dynamic_extent; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting count of thread that is not part of the group is UB"); - } - return __count_; - } - - [[nodiscard]] 
_CCCL_DEVICE_API unsigned rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); - } - return __rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept - { - if constexpr (is_always_exhaustive()) - { - return true; - } - else - { - return __rank_ != __invalid_count_or_rank; - } - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept - { - return _ParentIsAlwaysExhaustive && _IsExhaustive; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept - { - return _ParentIsAlwaysContiguous; - } - }; - _CCCL_HIDE_FROM_ABI explicit group_as() = default; _CCCL_TEMPLATE(bool _IsExhaustive2 = _IsExhaustive) @@ -170,56 +94,73 @@ public: return static_cast(static_count(__i)); } - template - [[nodiscard]] _CCCL_DEVICE_API auto map(const _Unit& __unit, const _ParentGroup& __parent) const noexcept + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup&, const _PrevMappingResult& __prev_mapping_result) const noexcept { - using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; + constexpr auto __static_prev_ngroups = _PrevMappingResult::static_group_count(); + constexpr auto __static_prev_nunits = _PrevMappingResult::static_count(); + constexpr auto __static_curr_ngroups = sizeof...(_Counts); + constexpr auto __static_ngroups = + (__static_prev_ngroups != ::cuda::std::dynamic_extent) + ? 
(__static_prev_ngroups * __static_curr_ngroups) + : ::cuda::std::dynamic_extent; + using _MappingResult = - __mapping_result<_ParentMappingResult::is_always_exhaustive(), _ParentMappingResult::is_always_contiguous()>; + __mapping_result<__static_ngroups, + ::cuda::std::dynamic_extent, + _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, + _PrevMappingResult::is_always_contiguous()>; + + if (!__prev_mapping_result.is_valid()) + { + return _MappingResult::invalid(); + } - constexpr auto __static_nunits = ::cuda::experimental::__static_count_query_group<_Unit, _ParentGroup>(); - const auto __nunits = _Unit::template count_as(__parent); - const auto __unit_rank = _Unit::template rank_as(__parent); - const auto __ngroups = static_cast(sizeof...(_Counts)); + const auto __prev_nunits = __prev_mapping_result.count(); + const auto __prev_unit_rank = __prev_mapping_result.rank(); + constexpr auto __curr_ngroups = static_cast(sizeof...(_Counts)); + const auto __ngroups = __prev_mapping_result.group_count() * __curr_ngroups; if constexpr (_IsExhaustive) { - if constexpr (__static_nunits != ::cuda::std::dynamic_extent) + if constexpr (__static_prev_nunits != ::cuda::std::dynamic_extent) { - static_assert(__static_nunits == __counts_sum, "group_as mapping _IsExhaustive precondition violation"); + static_assert(__static_prev_nunits == __counts_sum, "group_as mapping _IsExhaustive precondition violation"); } else { - _CCCL_ASSERT(__nunits == static_cast(__counts_sum), + _CCCL_ASSERT(__prev_nunits == static_cast(__counts_sum), "group_as mapping _IsExhaustive precondition violation"); } } else { - if constexpr (__static_nunits != ::cuda::std::dynamic_extent) + if constexpr (__static_prev_nunits != ::cuda::std::dynamic_extent) { - static_assert(__static_nunits >= __counts_sum, "group_as mapping requires more units than are available"); + static_assert(__static_prev_nunits >= __counts_sum, "group_as mapping requires more units than are available"); } else { - 
_CCCL_ASSERT(__nunits >= static_cast(__counts_sum), + _CCCL_ASSERT(__prev_nunits >= static_cast(__counts_sum), "group_as mapping requires more units than are available"); } - if (__unit_rank >= static_cast(__counts_sum)) + if (__prev_unit_rank >= static_cast(__counts_sum)) { - return _MappingResult::invalid(); + return _MappingResult::invalid_with_group_count(__ngroups); } } unsigned __sum = 0; _CCCL_PRAGMA_UNROLL_FULL() - for (unsigned __i = 0; __i < __ngroups; ++__i) + for (unsigned __i = 0; __i < __curr_ngroups; ++__i) { const auto __i_count = count(__i); - if (__unit_rank < __sum + __i_count) + if (__prev_unit_rank < __sum + __i_count) { - return _MappingResult{__i, __i_count, __unit_rank - __sum}; + return _MappingResult{ + __ngroups, __prev_mapping_result.group_rank() * __curr_ngroups + __i, __i_count, __prev_unit_rank - __sum}; } __sum += __i_count; } @@ -238,83 +179,6 @@ class group_as<__group_as_dynamic_tag<_GroupCount>, _IsExhaustive> unsigned __counts_[_GroupCount]; public: - template - struct __mapping_result - { - unsigned __group_rank_; - unsigned __count_; - unsigned __rank_; - - [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result invalid() noexcept - { - return {__invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank}; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept - { - return _GroupCount; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept - { - return static_cast(_GroupCount); - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __group_rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept - { - return ::cuda::std::dynamic_extent; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept - { 
- if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __count_; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); - } - return __rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept - { - if constexpr (is_always_exhaustive()) - { - return true; - } - else - { - return __rank_ != __invalid_count_or_rank; - } - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept - { - return _ParentIsAlwaysExhaustive && _IsExhaustive; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept - { - return _ParentIsAlwaysContiguous; - } - }; - _CCCL_TEMPLATE(bool _IsExhaustive2 = _IsExhaustive) _CCCL_REQUIRES(_IsExhaustive2) _CCCL_DEVICE_API explicit constexpr group_as(::cuda::std::span __counts) noexcept @@ -368,35 +232,54 @@ public: return __counts_[__i]; } - template - [[nodiscard]] _CCCL_DEVICE_API auto map(const _Unit& __unit, const _ParentGroup& __parent) const noexcept + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup&, const _PrevMappingResult& __prev_mapping_result) const noexcept { - using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; + constexpr auto __static_prev_ngroups = _PrevMappingResult::static_group_count(); + constexpr auto __static_prev_nunits = _PrevMappingResult::static_count(); + constexpr auto __static_curr_ngroups = _GroupCount; + constexpr auto __static_ngroups = + (__static_prev_ngroups != ::cuda::std::dynamic_extent) + ? 
(__static_prev_ngroups * __static_curr_ngroups) + : ::cuda::std::dynamic_extent; + using _MappingResult = - __mapping_result<_ParentMappingResult::is_always_exhaustive(), _ParentMappingResult::is_always_contiguous()>; + __mapping_result<__static_ngroups, + ::cuda::std::dynamic_extent, + _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, + _PrevMappingResult::is_always_contiguous()>; + + if (!__prev_mapping_result.is_valid()) + { + return _MappingResult::invalid(); + } - const auto __nunits = __unit.template count_as(__parent); - const auto __unit_rank = __unit.template rank_as(__parent); + const auto __prev_nunits = __prev_mapping_result.count(); + const auto __prev_unit_rank = __prev_mapping_result.rank(); + constexpr auto __curr_ngroups = static_cast(_GroupCount); + const auto __ngroups = __prev_mapping_result.group_count() * __curr_ngroups; // If the mapping is exhaustive, check the preconditions, otherwise remove the last partial group. if constexpr (_IsExhaustive) { - _CCCL_ASSERT(::cuda::std::accumulate(__counts_, __counts_ + _GroupCount, 0u) == __nunits, + _CCCL_ASSERT(::cuda::std::accumulate(__counts_, __counts_ + __curr_ngroups, 0u) == __prev_nunits, "group_as mapping _IsExhaustive precondition violation"); } - else if (__unit_rank >= ::cuda::std::accumulate(__counts_, __counts_ + _GroupCount, 0u)) + else if (__prev_unit_rank >= ::cuda::std::accumulate(__counts_, __counts_ + __curr_ngroups, 0u)) { - return _MappingResult::invalid(); + return _MappingResult::invalid_with_group_count(__ngroups); } unsigned __sum = 0; _CCCL_PRAGMA_UNROLL_FULL() - for (unsigned __i = 0; __i < _GroupCount; ++__i) + for (unsigned __i = 0; __i < __curr_ngroups; ++__i) { const auto __i_count = count(__i); - if (__unit_rank < __sum + __i_count) + if (__prev_unit_rank < __sum + __i_count) { - return _MappingResult{__i, __i_count, __unit_rank - __sum}; + return _MappingResult{ + __ngroups, __prev_mapping_result.group_rank() * __curr_ngroups + __i, __i_count, 
__prev_unit_rank - __sum}; } __sum += __i_count; } diff --git a/cudax/include/cuda/experimental/__group/mapping/group_by.cuh b/cudax/include/cuda/experimental/__group/mapping/group_by.cuh index 1739f9bb705..b13cda226f6 100644 --- a/cudax/include/cuda/experimental/__group/mapping/group_by.cuh +++ b/cudax/include/cuda/experimental/__group/mapping/group_by.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -57,86 +58,6 @@ class group_by static_assert(::cuda::std::in_range(_Count), "_Count must be within uint32_t range"); public: - template <::cuda::std::size_t _NGroups, bool _ParentIsAlwaysExhaustive, bool _ParentIsAlwaysContiguous> - struct __mapping_result - { - unsigned __group_count_; - unsigned __group_rank_; - unsigned __rank_; - - [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result - invalid(unsigned __group_count = __invalid_count_or_rank) noexcept - { - return {__group_count, __invalid_count_or_rank, __invalid_count_or_rank}; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept - { - return _NGroups; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept - { - _CCCL_ASSERT(__group_count_ != __invalid_count_or_rank, - "querying group_count() by a thread that was not part of the parent group is not allowed"); - return __group_count_; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __group_rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept - { - return _Count; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting count of thread that is not part of the group is UB"); - } - return static_cast(_Count); - } - - 
[[nodiscard]] _CCCL_DEVICE_API unsigned rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); - } - return __rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept - { - if constexpr (is_always_exhaustive()) - { - return true; - } - else - { - return __rank_ != __invalid_count_or_rank; - } - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept - { - return _ParentIsAlwaysExhaustive && _IsExhaustive; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept - { - return _ParentIsAlwaysContiguous; - } - }; - _CCCL_HIDE_FROM_ABI explicit group_by() = default; _CCCL_TEMPLATE(bool _IsExhaustive2 = _IsExhaustive) @@ -158,47 +79,62 @@ public: return static_cast(_Count); } - template - [[nodiscard]] _CCCL_DEVICE_API auto map(const _Unit& __unit, const _ParentGroup& __parent) const noexcept + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup& __parent, const _PrevMappingResult& __prev_mapping_result) const noexcept { - constexpr auto __static_nunits = ::cuda::experimental::__static_count_query_group<_Unit, _ParentGroup>(); + constexpr auto __static_prev_ngroups = _PrevMappingResult::static_group_count(); + constexpr auto __static_prev_nunits = _PrevMappingResult::static_count(); + constexpr auto __static_curr_ngroups = + (__static_prev_nunits != ::cuda::std::dynamic_extent) + ? __static_prev_nunits / _Count + : ::cuda::std::dynamic_extent; constexpr auto __static_ngroups = - (__static_nunits != ::cuda::std::dynamic_extent) ? __static_nunits / _Count : ::cuda::std::dynamic_extent; + (__static_prev_ngroups != ::cuda::std::dynamic_extent && __static_curr_ngroups != ::cuda::std::dynamic_extent) + ? 
(__static_prev_ngroups * __static_curr_ngroups) + : ::cuda::std::dynamic_extent; - using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; using _MappingResult = __mapping_result<__static_ngroups, - _ParentMappingResult::is_always_exhaustive(), - _ParentMappingResult::is_always_contiguous()>; + _Count, + _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, + _PrevMappingResult::is_always_contiguous()>; - const auto __nunits = _Unit::template count_as(__parent); - const auto __unit_rank = _Unit::template rank_as(__parent); + if (!__prev_mapping_result.is_valid()) + { + return _MappingResult::invalid(); + } - _MappingResult __ret{}; - __ret.__group_count_ = __nunits / count(); - __ret.__group_rank_ = __unit_rank / count(); - __ret.__rank_ = __unit_rank % count(); + const auto __prev_nunits = __prev_mapping_result.count(); + const auto __prev_unit_rank = __prev_mapping_result.rank(); + const auto __curr_ngroups = __prev_nunits / count(); + const auto __curr_group_rank = __prev_unit_rank / count(); + const auto __ngroups = __prev_mapping_result.group_count() * __curr_ngroups; // If the mapping is exhaustive, check the preconditions, otherwise return invalid mapping for the remainder. 
if constexpr (_IsExhaustive) { - if constexpr (__static_nunits != ::cuda::std::dynamic_extent) + if constexpr (__static_prev_nunits != ::cuda::std::dynamic_extent) { - static_assert(__static_nunits % _Count == 0, "group_by mapping _IsExhaustive precondition violation"); + static_assert(__static_prev_nunits % _Count == 0, "group_by mapping _IsExhaustive precondition violation"); } else { - _CCCL_ASSERT(__nunits % count() == 0, "group_by mapping _IsExhaustive precondition violation"); + _CCCL_ASSERT(__prev_nunits % count() == 0, "group_by mapping _IsExhaustive precondition violation"); } } - else if (__nunits % count() != 0) + else if (__prev_nunits % count() != 0) { - if (__ret.__group_rank_ >= __ret.__group_count_) + if (__curr_group_rank >= __curr_ngroups) { - return _MappingResult::invalid(__ret.__group_count_); + return _MappingResult::invalid_with_group_count(__ngroups); } } - return __ret; + return _MappingResult{ + __ngroups, + __prev_mapping_result.group_rank() * __curr_ngroups + __curr_group_rank, + count(), + __prev_unit_rank % count()}; } }; @@ -208,87 +144,6 @@ class group_by<::cuda::std::dynamic_extent, _IsExhaustive> unsigned __count_; public: - template - struct __mapping_result - { - unsigned __group_count_; - unsigned __group_rank_; - unsigned __count_; - unsigned __rank_; - - [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result - invalid(unsigned __group_count = __invalid_count_or_rank) noexcept - { - return {__group_count, __invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank}; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept - { - return ::cuda::std::dynamic_extent; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept - { - _CCCL_ASSERT(__group_count_ != __invalid_count_or_rank, - "querying group_count() by a thread that was not part of the parent group is not allowed"); - return __group_count_; - } - - [[nodiscard]] _CCCL_DEVICE_API 
unsigned group_rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __group_rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept - { - return ::cuda::std::dynamic_extent; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB"); - } - return __count_; - } - - [[nodiscard]] _CCCL_DEVICE_API unsigned rank() const noexcept - { - if constexpr (!is_always_exhaustive()) - { - _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); - } - return __rank_; - } - - [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept - { - if constexpr (is_always_exhaustive()) - { - return true; - } - else - { - return __rank_ != __invalid_count_or_rank; - } - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept - { - return _ParentIsAlwaysExhaustive && _IsExhaustive; - } - - [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept - { - return _ParentIsAlwaysContiguous; - } - }; - _CCCL_DEVICE_API explicit constexpr group_by(unsigned __count) noexcept : __count_{__count} { @@ -318,56 +173,50 @@ public: return __count_; } - template - [[nodiscard]] _CCCL_DEVICE_API auto map(const _Unit& __unit, const _ParentGroup& __parent) const noexcept + template + [[nodiscard]] _CCCL_DEVICE_API auto + map(const _ParentGroup& __parent, const _PrevMappingResult& __prev_mapping_result) const noexcept { - using _ParentMappingResult = typename _ParentGroup::__mapping_result_type; using _MappingResult = - __mapping_result<_ParentMappingResult::is_always_exhaustive(), _ParentMappingResult::is_always_contiguous()>; + __mapping_result<::cuda::std::dynamic_extent, + 
::cuda::std::dynamic_extent, + _PrevMappingResult::is_always_exhaustive() && _IsExhaustive, + _PrevMappingResult::is_always_contiguous()>; - const auto __nunits = __unit.template count_as(__parent); - const auto __unit_rank = __unit.template rank_as(__parent); + if (!__prev_mapping_result.is_valid()) + { + return _MappingResult::invalid(); + } - _MappingResult __ret{}; - __ret.__group_count_ = __nunits / __count_; - __ret.__group_rank_ = __unit_rank / __count_; - __ret.__count_ = __count_; - __ret.__rank_ = __unit_rank % __count_; + const auto __prev_nunits = __prev_mapping_result.count(); + const auto __prev_unit_rank = __prev_mapping_result.rank(); + const auto __curr_ngroups = __prev_nunits / __count_; + const auto __curr_group_rank = __prev_unit_rank / __count_; + const auto __ngroups = __prev_mapping_result.group_count() * __curr_ngroups; // If the mapping is exhaustive, check the preconditions, otherwise remove the last partial group. if constexpr (_IsExhaustive) { - _CCCL_ASSERT(__nunits % __count_ == 0, "group_by mapping _IsExhaustive precondition violation"); + _CCCL_ASSERT(__prev_nunits % __count_ == 0, "group_by mapping _IsExhaustive precondition violation"); } - else if (__nunits % __count_ != 0) + else if (__prev_nunits % __count_ != 0) { - if (__ret.__group_rank_ >= __ret.__group_count_) + if (__curr_group_rank >= __curr_ngroups) { - return _MappingResult::invalid(__ret.__group_count_); + return _MappingResult::invalid_with_group_count(__ngroups); } } - return __ret; + return _MappingResult{ + __ngroups, + __prev_mapping_result.group_rank() * __curr_ngroups + __curr_group_rank, + __count_, + __prev_unit_rank % __count_}; } }; _CCCL_DEVICE group_by(unsigned) -> group_by<::cuda::std::dynamic_extent>; _CCCL_DEVICE group_by(unsigned, const non_exhaustive_t&) -> group_by<::cuda::std::dynamic_extent, false>; - -template -_CCCL_DEVICE_API void __check_mapping_result(const _MappingResult& __mapping_result) noexcept -{ - // Don't check the mapping result if 
it's not valid. We can skip this check if mapping result is always exhaustive. - if constexpr (!_MappingResult::is_always_exhaustive()) - { - if (!__mapping_result.is_valid()) - { - return; - } - } - _CCCL_ASSERT(__mapping_result.group_rank() < __mapping_result.group_count(), "invalid group rank"); - _CCCL_ASSERT(__mapping_result.rank() < __mapping_result.count(), "invalid rank"); -} } // namespace cuda::experimental #endif // !_CCCL_DOXYGEN_INVOKED diff --git a/cudax/include/cuda/experimental/__group/mapping/mapping_result.cuh b/cudax/include/cuda/experimental/__group/mapping/mapping_result.cuh new file mode 100644 index 00000000000..303cd99a9e5 --- /dev/null +++ b/cudax/include/cuda/experimental/__group/mapping/mapping_result.cuh @@ -0,0 +1,145 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_EXPERIMENTAL___GROUP_MAPPING_MAPPING_RESULT_CUH +#define _CUDA_EXPERIMENTAL___GROUP_MAPPING_MAPPING_RESULT_CUH + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +#include + +#if !defined(_CCCL_DOXYGEN_INVOKED) + +// todo(dabayer): do we want to always use uint32_t for all counts/ranks? 
+
+namespace cuda::experimental
+{
+template <::cuda::std::size_t _StaticGroupCount, ::cuda::std::size_t _StaticCount, bool _IsExhaustive, bool _IsContiguous>
+struct __mapping_result
+{
+  unsigned __group_count_;
+  unsigned __group_rank_;
+  unsigned __count_;
+  unsigned __rank_;
+
+  [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result invalid() noexcept
+  {
+    return {__invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank};
+  }
+
+  [[nodiscard]] _CCCL_DEVICE_API static constexpr __mapping_result
+  invalid_with_group_count(unsigned __group_count) noexcept
+  {
+    return {__group_count, __invalid_count_or_rank, __invalid_count_or_rank, __invalid_count_or_rank};
+  }
+
+  [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_group_count() noexcept
+  {
+    return _StaticGroupCount;
+  }
+
+  [[nodiscard]] _CCCL_DEVICE_API unsigned group_count() const noexcept
+  {
+    if constexpr (_StaticGroupCount != ::cuda::std::dynamic_extent)
+    {
+      return static_cast<unsigned>(_StaticGroupCount);
+    }
+    else
+    {
+      if constexpr (!_IsExhaustive)
+      {
+        _CCCL_ASSERT(__group_count_ != __invalid_count_or_rank,
+                     "getting group count by a unit that was not part of the parent group is not allowed");
+      }
+      return __group_count_;
+    }
+  }
+
+  [[nodiscard]] _CCCL_DEVICE_API unsigned group_rank() const noexcept
+  {
+    if constexpr (!_IsExhaustive)
+    {
+      _CCCL_ASSERT(is_valid(), "getting group rank of thread that is not part of the group is UB");
+    }
+    return __group_rank_;
+  }
+
+  [[nodiscard]] _CCCL_DEVICE_API static constexpr ::cuda::std::size_t static_count() noexcept
+  {
+    return _StaticCount;
+  }
+
+  [[nodiscard]] _CCCL_DEVICE_API unsigned count() const noexcept
+  {
+    if constexpr (_StaticCount != ::cuda::std::dynamic_extent)
+    {
+      return static_cast<unsigned>(_StaticCount);
+    }
+    else
+    {
+      if constexpr (!_IsExhaustive)
+      {
+        _CCCL_ASSERT(is_valid(), "getting count of thread that is not part of the group is UB");
+      }
+      return __count_;
+    }
+ } + + [[nodiscard]] _CCCL_DEVICE_API unsigned rank() const noexcept + { + if constexpr (!_IsExhaustive) + { + _CCCL_ASSERT(is_valid(), "getting rank of thread that is not part of the group is UB"); + } + return __rank_; + } + + [[nodiscard]] _CCCL_DEVICE_API bool is_valid() const noexcept + { + if constexpr (_IsExhaustive) + { + return true; + } + else + { + return __rank_ != __invalid_count_or_rank; + } + } + + [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_exhaustive() noexcept + { + return _IsExhaustive; + } + + [[nodiscard]] _CCCL_DEVICE_API static constexpr bool is_always_contiguous() noexcept + { + return _IsContiguous; + } +}; +} // namespace cuda::experimental + +#endif // !_CCCL_DOXYGEN_INVOKED + +#include + +#endif // _CUDA_EXPERIMENTAL___GROUP_MAPPING_MAPPING_RESULT_CUH diff --git a/cudax/include/cuda/experimental/group.cuh b/cudax/include/cuda/experimental/group.cuh index 7f740aeba20..e1d8b0ce9a2 100644 --- a/cudax/include/cuda/experimental/group.cuh +++ b/cudax/include/cuda/experimental/group.cuh @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/cudax/test/CMakeLists.txt b/cudax/test/CMakeLists.txt index f8703ac5f34..d6d61160060 100644 --- a/cudax/test/CMakeLists.txt +++ b/cudax/test/CMakeLists.txt @@ -129,6 +129,11 @@ cudax_add_catch2_test(test_target algorithm algorithm/copy.cu ) +cudax_add_catch2_test(test_target group.mapping.composite_mapping + group/mapping/composite_mapping.cu +) +target_compile_definitions(${test_target} PUBLIC _CUDAX_GROUP) + cudax_add_catch2_test(test_target group.mapping.group_as group/mapping/group_as.cu ) diff --git a/cudax/test/common/group_testing.cuh b/cudax/test/common/group_testing.cuh index b194aa8e74b..362396e2ba9 100644 --- a/cudax/test/common/group_testing.cuh +++ b/cudax/test/common/group_testing.cuh @@ -44,6 +44,54 @@ __device__ auto& get_barriers(const Level& level) noexcept return reinterpret_cast(global_barriers_storage); } } + +struct 
ThreadsInWarpMappingResult +{ + __device__ static constexpr ::cuda::std::size_t static_group_count() + { + return 1; + } + + __device__ unsigned group_count() const + { + return 1; + } + + __device__ unsigned group_rank() const + { + return 0; + } + + __device__ static constexpr ::cuda::std::size_t static_count() + { + return 32; + } + + __device__ unsigned count() const + { + return 32; + } + + __device__ unsigned rank() const + { + return cuda::gpu_thread.rank_as(cuda::warp); + } + + __device__ bool is_valid() const + { + return true; + } + + __device__ static constexpr bool is_always_exhaustive() noexcept + { + return true; + } + + __device__ static constexpr bool is_always_contiguous() noexcept + { + return true; + } +}; } // namespace #endif // COMMON_GROUP_CUH diff --git a/cudax/test/group/mapping/composite_mapping.cu b/cudax/test/group/mapping/composite_mapping.cu new file mode 100644 index 00000000000..504ca7e2fac --- /dev/null +++ b/cudax/test/group/mapping/composite_mapping.cu @@ -0,0 +1,191 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDA Experimental in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "group_testing.cuh" + +namespace +{ +template +__device__ void test_composite_mapping(const Mapping1& mapping1, const Mapping2& mapping2, Config config) +{ + using Mapping = cudax::composite_mapping; + + // Test construction from 2 mappings. 
+ { + cudax::composite_mapping mapping{mapping1, mapping2}; + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_nothrow_constructible_v + == (cuda::std::is_nothrow_copy_constructible_v + && cuda::std::is_nothrow_copy_constructible_v) ); + } + + // Test get(). + { + const cudax::composite_mapping mapping{mapping1, mapping2}; + static_assert(cuda::std::is_same_v&>); + static_assert(noexcept(mapping.get())); + + const auto& mapping1_ref = cuda::std::get<0>(mapping.get()); + CUDAX_CHECK(mapping1_ref.count() == 4); + + const auto& mapping2_ref = cuda::std::get<1>(mapping.get()); + CUDAX_CHECK(mapping2_ref.count(0) == 1); + CUDAX_CHECK(mapping2_ref.count(1) == 3); + } + + // Test map(...). + { + const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; + const cudax::composite_mapping mapping{mapping1, mapping2}; + + static_assert(cudax::__group_mapping_result); + + auto result = mapping.map(parent_group, prev_mapping_result); + using Result = decltype(result); + + const auto rank_in_warp = cuda::gpu_thread.rank_as(parent_group); + + if constexpr (Mapping1::static_count() != cuda::std::dynamic_extent + && Mapping2::static_group_count() != cuda::std::dynamic_extent) + { + static_assert(Result::static_group_count() == 16); + } + else + { + static_assert(Result::static_group_count() == cuda::std::dynamic_extent); + } + CUDAX_CHECK(result.group_count() == 16); + CUDAX_CHECK(result.group_rank() == (rank_in_warp / 4 * 2 + (rank_in_warp % 4 > 0))); + + static_assert(Result::static_count() == cuda::std::dynamic_extent); + CUDAX_CHECK(result.count() == ((rank_in_warp % 4 > 0) ? 3 : 1)); + CUDAX_CHECK(result.rank() == ((rank_in_warp % 4 > 0) ? (rank_in_warp % 4 - 1) : 0)); + + CUDAX_CHECK(result.is_valid()); + static_assert(Result::is_always_exhaustive()); + static_assert(Result::is_always_contiguous()); + } + + // Test operator|. 
+ { + auto mapping = mapping1 | mapping2; + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(mapping1 | mapping2)); + + const auto& mapping1_ref = cuda::std::get<0>(mapping.get()); + CUDAX_CHECK(mapping1_ref.count() == 4); + + const auto& mapping2_ref = cuda::std::get<1>(mapping.get()); + CUDAX_CHECK(mapping2_ref.count(0) == 1); + CUDAX_CHECK(mapping2_ref.count(1) == 3); + } + { + auto mapping = cudax::composite_mapping{mapping1} | mapping2; + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cudax::composite_mapping{mapping1} | mapping2)); + + const auto& mapping1_ref = cuda::std::get<0>(mapping.get()); + CUDAX_CHECK(mapping1_ref.count() == 4); + + const auto& mapping2_ref = cuda::std::get<1>(mapping.get()); + CUDAX_CHECK(mapping2_ref.count(0) == 1); + CUDAX_CHECK(mapping2_ref.count(1) == 3); + } + { + auto mapping = mapping1 | cudax::composite_mapping{mapping2}; + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(mapping1 | cudax::composite_mapping{mapping2})); + + const auto& mapping1_ref = cuda::std::get<0>(mapping.get()); + CUDAX_CHECK(mapping1_ref.count() == 4); + + const auto& mapping2_ref = cuda::std::get<1>(mapping.get()); + CUDAX_CHECK(mapping2_ref.count(0) == 1); + CUDAX_CHECK(mapping2_ref.count(1) == 3); + } + { + auto mapping = cudax::composite_mapping{mapping1} | cudax::composite_mapping{mapping2}; + + static_assert(cuda::std::is_same_v); + static_assert(noexcept(cudax::composite_mapping{mapping1} | cudax::composite_mapping{mapping2})); + + const auto& mapping1_ref = cuda::std::get<0>(mapping.get()); + CUDAX_CHECK(mapping1_ref.count() == 4); + + const auto& mapping2_ref = cuda::std::get<1>(mapping.get()); + CUDAX_CHECK(mapping2_ref.count(0) == 1); + CUDAX_CHECK(mapping2_ref.count(1) == 3); + } +} + +struct TestKernel +{ + template + __device__ void operator()(const Config& config) + { + { + const cudax::group_by<4> mapping1{}; + const cudax::group_as mapping2{cuda::std::integer_sequence{}}; + 
test_composite_mapping(mapping1, mapping2, config); + } + { + const cudax::group_by mapping1{4}; + const cudax::group_as mapping2{cuda::std::integer_sequence{}}; + test_composite_mapping(mapping1, mapping2, config); + } + { + const cudax::group_by<4> mapping1{}; + constexpr unsigned counts2[]{1, 3}; + const cudax::group_as mapping2{counts2}; + test_composite_mapping(mapping1, mapping2, config); + } + { + const cudax::group_by mapping1{4}; + constexpr unsigned counts2[]{1, 3}; + const cudax::group_as mapping2{counts2}; + test_composite_mapping(mapping1, mapping2, config); + } + } +}; +} // namespace + +C2H_TEST("Composite mapping", "[group]") +{ + const auto device = cuda::devices[0]; + + const cuda::stream stream{device}; + + { + const auto config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<8, 4>()); + cuda::launch(stream, config, TestKernel{}); + } + { + const auto config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims(dim3{8, 4})); + cuda::launch(stream, config, TestKernel{}); + } + + stream.sync(); +} diff --git a/cudax/test/group/mapping/group_as.cu b/cudax/test/group/mapping/group_as.cu index 9d262147f83..fed2f8ee35c 100644 --- a/cudax/test/group/mapping/group_as.cu +++ b/cudax/test/group/mapping/group_as.cu @@ -101,13 +101,14 @@ __device__ void test_group_as(Config config) // Test map(...). 
{ const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); const auto rank_in_warp = cuda::gpu_thread.rank(parent_group); @@ -197,13 +198,14 @@ __device__ void test_group_as(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{ns}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); const auto rank_in_warp = cuda::gpu_thread.rank_as(parent_group); @@ -315,13 +317,14 @@ __device__ void test_group_as_non_exhaustive(Config config) // Test map(...). 
{ const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); const auto rank_in_warp = cuda::gpu_thread.rank(parent_group); @@ -416,13 +419,14 @@ __device__ void test_group_as_non_exhaustive(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{ns, cudax::non_exhaustive}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); const auto rank_in_warp = cuda::gpu_thread.rank(parent_group); diff --git a/cudax/test/group/mapping/group_by.cu b/cudax/test/group/mapping/group_by.cu index a603a0d42d6..16dd8f09519 100644 --- a/cudax/test/group/mapping/group_by.cu +++ b/cudax/test/group/mapping/group_by.cu @@ -70,13 +70,14 @@ __device__ void test_group_by(Config config) // Test map(...). 
{ const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); static_assert(Result::static_group_count() == 32 / N); @@ -139,13 +140,14 @@ __device__ void test_group_by(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{N}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); static_assert(Result::static_group_count() == cuda::std::dynamic_extent); @@ -211,13 +213,14 @@ __device__ void test_group_by_non_exhaustive(Config config) // Test map(...). 
{ const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{cudax::non_exhaustive}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); static_assert(Result::static_group_count() == 32 / N); @@ -285,13 +288,14 @@ __device__ void test_group_by_non_exhaustive(Config config) // Test map(...). { const cudax::this_warp parent_group{config}; + const ThreadsInWarpMappingResult prev_mapping_result; - static_assert( - cudax::__group_mapping_result().map(cuda::gpu_thread, parent_group))>); - static_assert(noexcept(cuda::std::declval().map(cuda::gpu_thread, parent_group))); + static_assert(cudax::__group_mapping_result().map( + parent_group, prev_mapping_result))>); + static_assert(noexcept(cuda::std::declval().map(parent_group, prev_mapping_result))); const Mapping mapping{N, cudax::non_exhaustive}; - auto result = mapping.map(cuda::gpu_thread, parent_group); + auto result = mapping.map(parent_group, prev_mapping_result); using Result = decltype(result); static_assert(Result::static_group_count() == cuda::std::dynamic_extent); diff --git a/cudax/test/group/synchronizer/barrier_synchronizer.cu b/cudax/test/group/synchronizer/barrier_synchronizer.cu index 48c4bf868f8..95ed330fea1 100644 --- a/cudax/test/group/synchronizer/barrier_synchronizer.cu +++ b/cudax/test/group/synchronizer/barrier_synchronizer.cu @@ -92,11 +92,12 @@ __device__ void test_barrier_synchronizer(const Level& level, Config config) using Barrier = cuda::std::remove_all_extents_t>; const 
auto parent_group = cudax::make_this_group(level, config); + const ThreadsInWarpMappingResult prev_mapping_result; const cudax::group_by mapping{4}; const cudax::barrier_synchronizer synchronizer{barriers}; - const auto mapping_result = mapping.map(cuda::gpu_thread, parent_group); + const auto mapping_result = mapping.map(parent_group, prev_mapping_result); const auto synchronizer_instance = synchronizer.make_instance(cuda::gpu_thread, parent_group, mapping, mapping_result); diff --git a/cudax/test/group/synchronizer/lane_synchronizer.cu b/cudax/test/group/synchronizer/lane_synchronizer.cu index 242e83f776e..cedd12933b0 100644 --- a/cudax/test/group/synchronizer/lane_synchronizer.cu +++ b/cudax/test/group/synchronizer/lane_synchronizer.cu @@ -33,11 +33,12 @@ __device__ void test_lane_synchronizer(const Level& level, Config config) // Test make_instance(...). { const auto parent_group = cudax::make_this_group(level, config); + const ThreadsInWarpMappingResult prev_mapping_result; const cudax::group_by mapping{2}; const Synchronizer synchronizer{}; - const auto mapping_result = mapping.map(cuda::gpu_thread, parent_group); + const auto mapping_result = mapping.map(parent_group, prev_mapping_result); const auto synchronizer_instance = synchronizer.make_instance(cuda::gpu_thread, parent_group, mapping, mapping_result);