From ea3f956de0ffb790d9a8865d5072902e79cf2f27 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 1 Apr 2026 12:40:55 -0700 Subject: [PATCH 01/47] is_trivially_copyable_relaxed --- .../is_trivially_copyable_relaxed.h | 77 +++++++++++ libcudacxx/include/cuda/type_traits | 1 + .../is_trivially_copyable_relaxed.pass.cpp | 128 ++++++++++++++++++ 3 files changed, 206 insertions(+) create mode 100644 libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h create mode 100644 libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp diff --git a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h new file mode 100644 index 00000000000..b47a6b47407 --- /dev/null +++ b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h @@ -0,0 +1,77 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDA__TYPE_TRAITS_IS_TRIVIALLY_COPYABLE_RELAXED_H +#define __CUDA__TYPE_TRAITS_IS_TRIVIALLY_COPYABLE_RELAXED_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +template > +constexpr bool is_trivially_copyable_relaxed_impl_v = + ::cuda::std::__is_extended_floating_point_v<_Up> || ::cuda::is_extended_fp_vector_type_v<_Up> + || ::cuda::std::is_trivially_copyable_v<_Up>; + +template +struct is_trivially_copyable_relaxed : ::cuda::std::bool_constant<::cuda::is_trivially_copyable_relaxed_impl_v<_Tp>> +{}; + +template +struct is_trivially_copyable_relaxed<_Tp[]> : is_trivially_copyable_relaxed<_Tp> +{}; + +template +struct is_trivially_copyable_relaxed<_Tp[_Size]> : is_trivially_copyable_relaxed<_Tp> +{}; + +template +struct is_trivially_copyable_relaxed<::cuda::std::array<_Tp, _Size>> : is_trivially_copyable_relaxed<_Tp> +{}; + +// cuda::std::pair +template +struct is_trivially_copyable_relaxed<::cuda::std::pair<_T1, _T2>> + : ::cuda::std::bool_constant && is_trivially_copyable_relaxed_impl_v<_T2>> +{}; + +// cuda::std::tuple +template +struct is_trivially_copyable_relaxed<::cuda::std::tuple<_Ts...>> + : ::cuda::std::bool_constant<(is_trivially_copyable_relaxed_impl_v<_Ts> && ...)> +{}; + +template +constexpr bool is_trivially_copyable_relaxed_v = is_trivially_copyable_relaxed<_Tp>::value; + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // __CUDA__TYPE_TRAITS_IS_TRIVIALLY_COPYABLE_RELAXED_H diff --git a/libcudacxx/include/cuda/type_traits b/libcudacxx/include/cuda/type_traits index 446359b6700..dd23942ae28 100644 --- a/libcudacxx/include/cuda/type_traits +++ b/libcudacxx/include/cuda/type_traits @@ -22,6 +22,7 @@ #endif // no system header #include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp new file mode 100644 index 00000000000..db92fd55937 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp @@ -0,0 +1,128 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ void test_is_trivially_copyable_relaxed() +{ + static_assert(cuda::is_trivially_copyable_relaxed::value); + static_assert(cuda::is_trivially_copyable_relaxed::value); + static_assert(cuda::is_trivially_copyable_relaxed::value); + static_assert(cuda::is_trivially_copyable_relaxed::value); + static_assert(cuda::is_trivially_copyable_relaxed_v); + static_assert(cuda::is_trivially_copyable_relaxed_v); + static_assert(cuda::is_trivially_copyable_relaxed_v); + static_assert(cuda::is_trivially_copyable_relaxed_v); +} + +template +__host__ __device__ void test_is_not_trivially_copyable_relaxed() +{ + static_assert(!cuda::is_trivially_copyable_relaxed::value); + static_assert(!cuda::is_trivially_copyable_relaxed::value); + static_assert(!cuda::is_trivially_copyable_relaxed::value); + static_assert(!cuda::is_trivially_copyable_relaxed::value); + static_assert(!cuda::is_trivially_copyable_relaxed_v); + static_assert(!cuda::is_trivially_copyable_relaxed_v); + static_assert(!cuda::is_trivially_copyable_relaxed_v); + static_assert(!cuda::is_trivially_copyable_relaxed_v); +} + +struct TrivialPod +{ + int x; + float y; +}; + +class NonTriviallyCopyable +{ +public: + __host__ __device__ NonTriviallyCopyable(const NonTriviallyCopyable&) {} +}; + +__host__ __device__ void test() +{ + // standard trivially copyable types + test_is_trivially_copyable_relaxed(); + test_is_trivially_copyable_relaxed(); + test_is_trivially_copyable_relaxed(); + test_is_trivially_copyable_relaxed(); + + // C-style arrays of trivially copyable types + static_assert(cuda::is_trivially_copyable_relaxed_v); + static_assert(cuda::is_trivially_copyable_relaxed_v); + static_assert(cuda::is_trivially_copyable_relaxed_v); + + // cuda::std::array, pair, tuple of trivially copyable types + test_is_trivially_copyable_relaxed>(); + test_is_trivially_copyable_relaxed>(); + test_is_trivially_copyable_relaxed>(); + test_is_trivially_copyable_relaxed>(); + + // extended floating point scalar types +#if _CCCL_HAS_NVFP16() + test_is_trivially_copyable_relaxed<__half>(); +#endif // _CCCL_HAS_NVFP16() +#if _CCCL_HAS_NVBF16() + test_is_trivially_copyable_relaxed<__nv_bfloat16>(); +#endif // _CCCL_HAS_NVBF16() +#if _CCCL_HAS_NVFP8_E4M3() + test_is_trivially_copyable_relaxed<__nv_fp8_e4m3>(); +#endif // _CCCL_HAS_NVFP8_E4M3() + + // extended floating point vector types +#if _CCCL_HAS_NVFP16() + test_is_trivially_copyable_relaxed<__half2>(); +#endif // _CCCL_HAS_NVFP16() +#if _CCCL_HAS_NVBF16() + test_is_trivially_copyable_relaxed<__nv_bfloat162>(); +#endif // _CCCL_HAS_NVBF16() +#if _CCCL_HAS_NVFP8() + test_is_trivially_copyable_relaxed<__nv_fp8x2_e4m3>(); +#endif // _CCCL_HAS_NVFP8() + + // compositions of extended floating point types +#if _CCCL_HAS_NVFP16() + static_assert(cuda::is_trivially_copyable_relaxed_v<__half[4]>); + static_assert(cuda::is_trivially_copyable_relaxed_v); + test_is_trivially_copyable_relaxed>(); + test_is_trivially_copyable_relaxed>(); + test_is_trivially_copyable_relaxed>(); +#endif // _CCCL_HAS_NVFP16() +#if _CCCL_HAS_NVBF16() + test_is_trivially_copyable_relaxed>(); + test_is_trivially_copyable_relaxed>(); +#endif // _CCCL_HAS_NVBF16() + + // nested compositions +#if _CCCL_HAS_NVFP16() + test_is_trivially_copyable_relaxed, 2>>(); + test_is_trivially_copyable_relaxed, int>>(); + test_is_trivially_copyable_relaxed, double>>(); +#endif // _CCCL_HAS_NVFP16() + +#if _CCCL_HAS_NVFP16() && _CCCL_HAS_NVBF16() + test_is_trivially_copyable_relaxed>(); +#endif // _CCCL_HAS_NVFP16() && _CCCL_HAS_NVBF16() + + // non-trivially copyable types + test_is_not_trivially_copyable_relaxed(); +} + +int main(int, char**) +{ + test(); + return 0; +} From 9dd60dbebfd471824313e651d6d4ec3c44bd44cf Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 1 Apr 2026 12:55:50 -0700 Subject: [PATCH 02/47] add documentation --- docs/libcudacxx/extended_api/type_traits.rst | 6 ++ .../is_trivially_copyable_relaxed.rst | 62 +++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst diff --git a/docs/libcudacxx/extended_api/type_traits.rst b/docs/libcudacxx/extended_api/type_traits.rst index 8588b77c375..a11d840668a 100644 --- a/docs/libcudacxx/extended_api/type_traits.rst +++ b/docs/libcudacxx/extended_api/type_traits.rst @@ -8,6 +8,7 @@ Type traits :maxdepth: 1 type_traits/is_floating_point + type_traits/is_trivially_copyable_relaxed type_traits/vector_types .. list-table:: @@ -24,6 +25,11 @@ Type traits - CCCL 3.0.0 - CUDA 13.0 + * - :ref:`cuda::is_trivially_copyable_relaxed ` + - Relaxed trivially copyable check including extended floating-point types + - CCCL 3.4.0 + - CUDA 13.4 + * - :ref:`Vector Type Traits ` - Type traits for CUDA vector types - CCCL 3.3.0 diff --git a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst new file mode 100644 index 00000000000..b86a60877c0 --- /dev/null +++ b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst @@ -0,0 +1,62 @@ +.. _libcudacxx-extended-api-type_traits-is_trivially_copyable_relaxed: + +``cuda::is_trivially_copyable_relaxed`` +======================================= + +Defined in the ```` header. + +.. code:: cuda + + namespace cuda { + + template + struct is_trivially_copyable_relaxed; + + template + constexpr bool is_trivially_copyable_relaxed_v = is_trivially_copyable_relaxed::value; + + } // namespace cuda + +``cuda::is_trivially_copyable_relaxed`` is a type trait that extends ``cuda::std::is_trivially_copyable`` to also recognize CUDA extended floating-point scalar and vector types as trivially copyable. + +A type ``T`` satisfies ``cuda::is_trivially_copyable_relaxed`` if any of the following holds: + +- ``T`` is trivially copyable. +- ``T`` is an extended floating-point scalar type (e.g. ``__half``, ``__nv_bfloat16``, ``__nv_fp8_e4m3``). +- ``T`` is an extended floating-point vector type (e.g. ``__half2``, ``__nv_bfloat162``, ``__nv_fp8x2_e4m3``). + +The trait also propagates through composite types: + +- C-style arrays: ``T[N]`` and ``T[]`` are relaxed trivially copyable when ``T`` is. +- ``cuda::std::array``: relaxed trivially copyable when ``T`` is. +- ``cuda::std::pair``: relaxed trivially copyable when both ``T1`` and ``T2`` are. +- ``cuda::std::tuple``: relaxed trivially copyable when all ``Ts...`` are. + +``const``, ``volatile``, and ``const volatile`` qualifications are handled transparently. + +Example +------- + +.. code:: cuda + + #include + #include + #include + #include + + #include + + // Standard trivially copyable types + static_assert(cuda::is_trivially_copyable_relaxed_v); + static_assert(cuda::is_trivially_copyable_relaxed_v); + + // Extended floating-point types + static_assert(cuda::is_trivially_copyable_relaxed_v<__half>); + static_assert(cuda::is_trivially_copyable_relaxed_v<__nv_bfloat16>); + static_assert(cuda::is_trivially_copyable_relaxed_v<__half2>); + + // Composite types containing extended floating-point types + static_assert(cuda::is_trivially_copyable_relaxed_v<__half[4]>); + static_assert(cuda::is_trivially_copyable_relaxed_v>); + static_assert(cuda::is_trivially_copyable_relaxed_v>); + static_assert(cuda::is_trivially_copyable_relaxed_v>); From 179a81b9c620509d479d3630e9af216cdc440f91 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 1 Apr 2026 13:02:35 -0700 Subject: [PATCH 03/47] describe custom specialization --- .../is_trivially_copyable_relaxed.rst | 32 +++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst index b86a60877c0..1e749991c31 100644 --- a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst +++ b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst @@ -34,8 +34,36 @@ The trait also propagates through composite types: ``const``, ``volatile``, and ``const volatile`` qualifications are handled transparently. -Example -------- +Custom Specialization +--------------------- + +Users may specialize ``cuda::is_trivially_copyable_relaxed`` for their own types whose memory representation is safe to copy +with ``memcpy`` but that the compiler does not consider trivially copyable. +A common case is a type that wraps extended floating-point fields and provides user-defined copy operations +solely to add ``__host__ __device__`` annotations: + +.. code:: cuda + + struct HalfWrapper { + __half value; + }; + + struct NonTriviallyCopyable { + __host__ __device__ NonTriviallyCopyable(const NonTriviallyCopyable&) {} + }; + + // Specializing the trait + template <> + struct cuda::is_trivially_copyable_relaxed : cuda::std::true_type {}; + + template <> + struct cuda::is_trivially_copyable_relaxed : cuda::std::true_type {}; + + static_assert(cuda::is_trivially_copyable_relaxed_v); + static_assert(cuda::is_trivially_copyable_relaxed_v); + +Examples +-------- .. code:: cuda From 2c33c2eef2c425a806f93edf0e0658a1f46e96ac Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 1 Apr 2026 13:15:06 -0700 Subject: [PATCH 04/47] move to internal function --- .../cuda/__type_traits/is_trivially_copyable_relaxed.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h index b47a6b47407..24b24818976 100644 --- a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h +++ b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h @@ -35,12 +35,12 @@ _CCCL_BEGIN_NAMESPACE_CUDA template > -constexpr bool is_trivially_copyable_relaxed_impl_v = +constexpr bool __is_trivially_copyable_relaxed_impl_v = ::cuda::std::__is_extended_floating_point_v<_Up> || ::cuda::is_extended_fp_vector_type_v<_Up> || ::cuda::std::is_trivially_copyable_v<_Up>; template -struct is_trivially_copyable_relaxed : ::cuda::std::bool_constant<::cuda::is_trivially_copyable_relaxed_impl_v<_Tp>> +struct is_trivially_copyable_relaxed : ::cuda::std::bool_constant<::cuda::__is_trivially_copyable_relaxed_impl_v<_Tp>> {}; template @@ -58,13 +58,13 @@ struct is_trivially_copyable_relaxed<::cuda::std::array<_Tp, _Size>> : is_trivia // cuda::std::pair template struct is_trivially_copyable_relaxed<::cuda::std::pair<_T1, _T2>> - : ::cuda::std::bool_constant && is_trivially_copyable_relaxed_impl_v<_T2>> + : ::cuda::std::bool_constant<__is_trivially_copyable_relaxed_impl_v<_T1> && __is_trivially_copyable_relaxed_impl_v<_T2>> {}; // cuda::std::tuple template struct is_trivially_copyable_relaxed<::cuda::std::tuple<_Ts...>> - : ::cuda::std::bool_constant<(is_trivially_copyable_relaxed_impl_v<_Ts> && ...)> + : ::cuda::std::bool_constant<(__is_trivially_copyable_relaxed_impl_v<_Ts> && ...)> {}; template From fbade3407cd0047c87fa4ed716a76ce17a26fd7c Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 1 Apr 2026 15:24:26 -0700 Subject: [PATCH 05/47] address padding --- .../is_trivially_copyable_relaxed.rst | 24 +- .../is_trivially_copyable_relaxed.h | 37 +-- ...is_trivially_copyable_relaxed.mem.pass.cpp | 265 ++++++++++++++++++ .../is_trivially_copyable_relaxed.pass.cpp | 26 +- 4 files changed, 310 insertions(+), 42 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp diff --git a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst index 1e749991c31..6bfc38ff94f 100644 --- a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst +++ b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst @@ -10,14 +10,14 @@ Defined in the ```` header. namespace cuda { template - struct is_trivially_copyable_relaxed; + constexpr bool is_trivially_copyable_relaxed_v = /* see below */; template - constexpr bool is_trivially_copyable_relaxed_v = is_trivially_copyable_relaxed::value; + using is_trivially_copyable_relaxed = cuda::std::bool_constant>; } // namespace cuda -``cuda::is_trivially_copyable_relaxed`` is a type trait that extends ``cuda::std::is_trivially_copyable`` to also recognize CUDA extended floating-point scalar and vector types as trivially copyable. +``cuda::is_trivially_copyable_relaxed_v`` is a variable template that extends ``cuda::std::is_trivially_copyable`` to also recognize CUDA extended floating-point scalar and vector types as trivially copyable. A type ``T`` satisfies ``cuda::is_trivially_copyable_relaxed`` if any of the following holds: @@ -29,16 +29,20 @@ The trait also propagates through composite types: - C-style arrays: ``T[N]`` and ``T[]`` are relaxed trivially copyable when ``T`` is. - ``cuda::std::array``: relaxed trivially copyable when ``T`` is. -- ``cuda::std::pair``: relaxed trivially copyable when both ``T1`` and ``T2`` are. -- ``cuda::std::tuple``: relaxed trivially copyable when all ``Ts...`` are. +- ``cuda::std::pair``: relaxed trivially copyable when both ``T1`` and ``T2`` are and the object has no padding. +- ``cuda::std::tuple``: relaxed trivially copyable when all ``Ts...`` are and the object has no padding. ``const``, ``volatile``, and ``const volatile`` qualifications are handled transparently. Custom Specialization --------------------- -Users may specialize ``cuda::is_trivially_copyable_relaxed`` for their own types whose memory representation is safe to copy -with ``memcpy`` but that the compiler does not consider trivially copyable. +Users may specialize ``cuda::is_trivially_copyable_relaxed_v`` for their own types whose memory representation is safe to copy with ``memcpy`` but that the compiler does not consider trivially copyable. + +... warning:: + + Users are responsible for ensuring that the type is actually trivially copyable when specializing this variable template. Otherwise, the behavior is undefined. + A common case is a type that wraps extended floating-point fields and provides user-defined copy operations solely to add ``__host__ __device__`` annotations: @@ -52,12 +56,12 @@ solely to add ``__host__ __device__`` annotations: __host__ __device__ NonTriviallyCopyable(const NonTriviallyCopyable&) {} }; - // Specializing the trait + // Specializing the variable template template <> - struct cuda::is_trivially_copyable_relaxed : cuda::std::true_type {}; + constexpr bool cuda::is_trivially_copyable_relaxed_v = true; template <> - struct cuda::is_trivially_copyable_relaxed : cuda::std::true_type {}; + constexpr bool cuda::is_trivially_copyable_relaxed_v = true; static_assert(cuda::is_trivially_copyable_relaxed_v); static_assert(cuda::is_trivially_copyable_relaxed_v); diff --git a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h index 24b24818976..79e8cb5e2d8 100644 --- a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h +++ b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h @@ -34,41 +34,34 @@ _CCCL_BEGIN_NAMESPACE_CUDA -template > -constexpr bool __is_trivially_copyable_relaxed_impl_v = - ::cuda::std::__is_extended_floating_point_v<_Up> || ::cuda::is_extended_fp_vector_type_v<_Up> - || ::cuda::std::is_trivially_copyable_v<_Up>; - +//! Users are allowed to specialize this variable template for their own types template -struct is_trivially_copyable_relaxed : ::cuda::std::bool_constant<::cuda::__is_trivially_copyable_relaxed_impl_v<_Tp>> -{}; +constexpr bool is_trivially_copyable_relaxed_v = + ::cuda::std::__is_extended_floating_point_v<::cuda::std::remove_cv_t<_Tp>> + || ::cuda::is_extended_fp_vector_type_v<::cuda::std::remove_cv_t<_Tp>> || ::cuda::std::is_trivially_copyable_v<_Tp>; template -struct is_trivially_copyable_relaxed<_Tp[]> : is_trivially_copyable_relaxed<_Tp> -{}; +constexpr bool is_trivially_copyable_relaxed_v<_Tp[]> = is_trivially_copyable_relaxed_v<_Tp>; template -struct is_trivially_copyable_relaxed<_Tp[_Size]> : is_trivially_copyable_relaxed<_Tp> -{}; +constexpr bool is_trivially_copyable_relaxed_v<_Tp[_Size]> = is_trivially_copyable_relaxed_v<_Tp>; template -struct is_trivially_copyable_relaxed<::cuda::std::array<_Tp, _Size>> : is_trivially_copyable_relaxed<_Tp> -{}; +constexpr bool is_trivially_copyable_relaxed_v<::cuda::std::array<_Tp, _Size>> = is_trivially_copyable_relaxed_v<_Tp>; -// cuda::std::pair template -struct is_trivially_copyable_relaxed<::cuda::std::pair<_T1, _T2>> - : ::cuda::std::bool_constant<__is_trivially_copyable_relaxed_impl_v<_T1> && __is_trivially_copyable_relaxed_impl_v<_T2>> -{}; +constexpr bool is_trivially_copyable_relaxed_v<::cuda::std::pair<_T1, _T2>> = + (sizeof(::cuda::std::pair<_T1, _T2>) == sizeof(_T1) + sizeof(_T2)) + && is_trivially_copyable_relaxed_v<_T1> && is_trivially_copyable_relaxed_v<_T2>; -// cuda::std::tuple template -struct is_trivially_copyable_relaxed<::cuda::std::tuple<_Ts...>> - : ::cuda::std::bool_constant<(__is_trivially_copyable_relaxed_impl_v<_Ts> && ...)> -{}; +constexpr bool is_trivially_copyable_relaxed_v<::cuda::std::tuple<_Ts...>> = + (sizeof...(_Ts) == 0 || sizeof(::cuda::std::tuple<_Ts...>) == (sizeof(_Ts) + ... + 0)) + && (is_trivially_copyable_relaxed_v<_Ts> && ...); +// defined as alias so users cannot specialize it (they should specialize the variable template instead) template -constexpr bool is_trivially_copyable_relaxed_v = is_trivially_copyable_relaxed<_Tp>::value; +using is_trivially_copyable_relaxed = ::cuda::std::bool_constant>; _CCCL_END_NAMESPACE_CUDA diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp new file mode 100644 index 00000000000..538849373d6 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp @@ -0,0 +1,265 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" + +// memcpy is not used to avoid compiler optimizations +__host__ __device__ void test_memcpy(void* dst, const void* src, cuda::std::size_t bytes) noexcept +{ + unsigned char* d = static_cast(dst); + const unsigned char* s = static_cast(src); + for (; bytes > 0; --bytes) + { + *d++ = *s++; + } +} + +__host__ __device__ int test_memcmp(const void* lhs, const void* rhs, cuda::std::size_t bytes) noexcept +{ + const unsigned char* clhs = static_cast(lhs); + const unsigned char* crhs = static_cast(rhs); + for (; bytes > 0; --bytes) + { + if (*clhs++ != *crhs++) + { + return clhs[-1] < crhs[-1] ? -1 : 1; + } + } + return 0; +} + +template +__host__ __device__ void test_memcpy_roundtrip(T from) +{ + static_assert(cuda::is_trivially_copyable_relaxed_v); + struct Buffer + { + char data[sizeof(T)]; + }; + Buffer buffer; + test_memcpy(&buffer, &from, sizeof(T)); + + Buffer copy; + test_memcpy(©, &buffer, sizeof(T)); + assert(test_memcmp(&buffer, ©, sizeof(T)) == 0); +} + +#define REPEAT_1(base_type, index) base_type(input[index][0]) +#define REPEAT_2(base_type, index) REPEAT_1(base_type, index), base_type(input[index][1]) +#define REPEAT_3(base_type, index) REPEAT_2(base_type, index), base_type(input[index][2]) +#define REPEAT_4(base_type, index) REPEAT_3(base_type, index), base_type(input[index][3]) + +#define TEST_CUDA_VECTOR_TYPE(base_type, size) \ + { \ + for (base_type##size i : \ + {base_type##size{REPEAT_##size(base_type, 0)}, \ + base_type##size{REPEAT_##size(base_type, 1)}, \ + base_type##size{REPEAT_##size(base_type, 2)}, \ + base_type##size{REPEAT_##size(base_type, 3)}, \ + base_type##size{REPEAT_##size(base_type, 4)}, \ + base_type##size{REPEAT_##size(base_type, 5)}, \ + base_type##size{REPEAT_##size(base_type, 6)}}) \ + { \ + test_memcpy_roundtrip(i); \ + } \ + } + +#define TEST_CUDA_VECTOR_TYPES(base_type) \ + TEST_CUDA_VECTOR_TYPE(base_type, 1) \ + TEST_CUDA_VECTOR_TYPE(base_type, 2) \ + TEST_CUDA_VECTOR_TYPE(base_type, 3) \ + TEST_CUDA_VECTOR_TYPE(base_type, 4) + +__host__ __device__ bool tests() +{ + // standard scalar types + test_memcpy_roundtrip(42); + test_memcpy_roundtrip(0.0f); + test_memcpy_roundtrip(3.14159); + test_memcpy_roundtrip(static_cast(7)); + test_memcpy_roundtrip(static_cast('A')); + + // cuda::std::pair + using pair = cuda::std::pair; + for (pair i : + {pair{0.0f, 1}, + pair{1.0f, 2}, + pair{-1.0f, 3}, + pair{10.0f, 4}, + pair{-10.0f, 5}, + pair{2.71828f, 6}, + pair{3.14159f, 7}}) + { + test_memcpy_roundtrip(i); + } + + // cuda::std::tuple + using tuple = cuda::std::tuple; + for (tuple i : + {tuple{1, 0.0f}, + tuple{2, 1.0f}, + tuple{3, -1.0f}, + tuple{4, 10.0f}, + tuple{5, -10.0f}, + tuple{6, 2.71828f}, + tuple{7, 3.14159f}}) + { + test_memcpy_roundtrip(i); + } + + // cuda::std::array + using array = cuda::std::array; + for (array i : + {array{0.0f, 1.0f}, + array{1.0f, 2.0f}, + array{-1.0f, 3.0f}, + array{10.0f, 4.0f}, + array{-10.0f, 5.0f}, + array{2.71828f, 6.0f}, + array{3.14159f, 7.0f}}) + { + test_memcpy_roundtrip(i); + } + + // CUDA vector types + constexpr double input[7][4] = { + {0.0, 1.0, -7.0, -0.0}, + {1.0, 2.0, -7.0, -1.0}, + {-1.0, 3.0, -7.0, 1.0}, + {10.0, 4.0, -7.0, -10.0}, + {-10.0, 5.0, -7.0, 10.0}, + {2.71828, 6.0, -7.0, -2.71828}, + {3.14159, 7.0, -7.0, -3.14159}}; + + TEST_CUDA_VECTOR_TYPES(char) + TEST_CUDA_VECTOR_TYPES(short) + TEST_CUDA_VECTOR_TYPES(int) + TEST_CUDA_VECTOR_TYPES(float) + +#if !_CCCL_CUDA_COMPILER(CLANG) + using uchar = unsigned char; + using ushort = unsigned short; + using uint = unsigned int; + using ulong = unsigned long; + TEST_CUDA_VECTOR_TYPES(uchar) + TEST_CUDA_VECTOR_TYPES(ushort) + TEST_CUDA_VECTOR_TYPES(uint) + TEST_CUDA_VECTOR_TYPE(ulong, 1) + TEST_CUDA_VECTOR_TYPE(ulong, 2) + TEST_CUDA_VECTOR_TYPE(ulong, 3) +#endif // !_CCCL_CUDA_COMPILER(CLANG) + + TEST_CUDA_VECTOR_TYPE(long, 1) + TEST_CUDA_VECTOR_TYPE(long, 2) + TEST_CUDA_VECTOR_TYPE(long, 3) + + using longlong = long long; + using ulonglong = unsigned long long; + TEST_CUDA_VECTOR_TYPE(longlong, 1) + TEST_CUDA_VECTOR_TYPE(longlong, 2) + TEST_CUDA_VECTOR_TYPE(longlong, 3) + TEST_CUDA_VECTOR_TYPE(ulonglong, 1) + TEST_CUDA_VECTOR_TYPE(ulonglong, 2) + TEST_CUDA_VECTOR_TYPE(ulonglong, 3) + TEST_CUDA_VECTOR_TYPE(double, 1) + TEST_CUDA_VECTOR_TYPE(double, 2) + TEST_CUDA_VECTOR_TYPE(double, 3) + + using dim = unsigned int; + TEST_CUDA_VECTOR_TYPE(dim, 3) + + // extended floating-point scalar types +#if _CCCL_HAS_NVFP16() + for (__half i : + {__float2half(0.0f), + __float2half(1.0f), + __float2half(-1.0f), + __float2half(10.0f), + __float2half(-10.0f), + __float2half(2.71828f), + __float2half(3.14159f)}) + { + test_memcpy_roundtrip(i); + } +#endif // _CCCL_HAS_NVFP16() + +#if _CCCL_HAS_NVBF16() + for (__nv_bfloat16 i : + {__float2bfloat16(0.0f), + __float2bfloat16(1.0f), + __float2bfloat16(-1.0f), + __float2bfloat16(10.0f), + __float2bfloat16(-10.0f), + __float2bfloat16(2.71828f), + __float2bfloat16(3.14159f)}) + { + test_memcpy_roundtrip(i); + } +#endif // _CCCL_HAS_NVBF16() + + // extended floating-point vector types +#if _CCCL_HAS_NVFP16() + for (__half2 i : + {__half2{__float2half(0.0f), __float2half(1.0f)}, + __half2{__float2half(-1.0f), __float2half(2.0f)}, + __half2{__float2half(10.0f), __float2half(-10.0f)}, + __half2{__float2half(2.71828f), __float2half(3.14159f)}}) + { + test_memcpy_roundtrip(i); + } +#endif // _CCCL_HAS_NVFP16() + +#if _CCCL_HAS_NVBF16() + for (__nv_bfloat162 i : + {__nv_bfloat162{__float2bfloat16(0.0f), __float2bfloat16(1.0f)}, + __nv_bfloat162{__float2bfloat16(-1.0f), __float2bfloat16(2.0f)}, + __nv_bfloat162{__float2bfloat16(10.0f), __float2bfloat16(-10.0f)}, + __nv_bfloat162{__float2bfloat16(2.71828f), __float2bfloat16(3.14159f)}}) + { + test_memcpy_roundtrip(i); + } +#endif // _CCCL_HAS_NVBF16() + + // padding-free compositions of extended floating-point types +#if _CCCL_HAS_NVFP16() + test_memcpy_roundtrip( + cuda::std::array<__half, 4>{__float2half(1.0f), __float2half(2.0f), __float2half(3.0f), __float2half(4.0f)}); + test_memcpy_roundtrip(cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)}); + test_memcpy_roundtrip(cuda::std::tuple<__half, __half>{__float2half(1.0f), __float2half(2.0f)}); +#endif // _CCCL_HAS_NVFP16() +#if _CCCL_HAS_NVBF16() + test_memcpy_roundtrip(cuda::std::array<__nv_bfloat16, 2>{__float2bfloat16(1.0f), __float2bfloat16(2.0f)}); + test_memcpy_roundtrip(cuda::std::pair<__nv_bfloat16, __nv_bfloat16>{__float2bfloat16(1.0f), __float2bfloat16(2.0f)}); +#endif // _CCCL_HAS_NVBF16() + + // nested padding-free compositions +#if _CCCL_HAS_NVFP16() + test_memcpy_roundtrip(cuda::std::array, 2>{ + cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)}, + cuda::std::pair<__half, __half>{__float2half(3.0f), __float2half(4.0f)}}); +#endif // _CCCL_HAS_NVFP16() +#if _CCCL_HAS_NVFP16() && _CCCL_HAS_NVBF16() + test_memcpy_roundtrip(cuda::std::tuple<__half, __nv_bfloat16>{__float2half(1.0f), __float2bfloat16(2.0f)}); +#endif // _CCCL_HAS_NVFP16() && _CCCL_HAS_NVBF16() + + return true; +} + +int main(int, char**) +{ + tests(); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp index db92fd55937..ad61d9ddcf2 100644 --- a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp @@ -68,7 +68,7 @@ __host__ __device__ void test() // cuda::std::array, pair, tuple of trivially copyable types test_is_trivially_copyable_relaxed>(); test_is_trivially_copyable_relaxed>(); - test_is_trivially_copyable_relaxed>(); + test_is_trivially_copyable_relaxed>(); test_is_trivially_copyable_relaxed>(); // extended floating point scalar types @@ -93,28 +93,34 @@ __host__ __device__ void test() test_is_trivially_copyable_relaxed<__nv_fp8x2_e4m3>(); #endif // _CCCL_HAS_NVFP8() - // compositions of extended floating point types + // padding-free compositions of extended floating point types #if _CCCL_HAS_NVFP16() static_assert(cuda::is_trivially_copyable_relaxed_v<__half[4]>); static_assert(cuda::is_trivially_copyable_relaxed_v); test_is_trivially_copyable_relaxed>(); - test_is_trivially_copyable_relaxed>(); - test_is_trivially_copyable_relaxed>(); + test_is_trivially_copyable_relaxed>(); + test_is_trivially_copyable_relaxed>(); #endif // _CCCL_HAS_NVFP16() #if _CCCL_HAS_NVBF16() test_is_trivially_copyable_relaxed>(); - test_is_trivially_copyable_relaxed>(); + test_is_trivially_copyable_relaxed>(); #endif // _CCCL_HAS_NVBF16() - // nested compositions + // compositions with padding are not trivially copyable relaxed #if _CCCL_HAS_NVFP16() - test_is_trivially_copyable_relaxed, 2>>(); - test_is_trivially_copyable_relaxed, int>>(); - test_is_trivially_copyable_relaxed, double>>(); + static_assert(!cuda::is_trivially_copyable_relaxed_v>); + static_assert(!cuda::is_trivially_copyable_relaxed_v>); #endif // _CCCL_HAS_NVFP16() +#if _CCCL_HAS_NVBF16() + static_assert(!cuda::is_trivially_copyable_relaxed_v>); +#endif // _CCCL_HAS_NVBF16() + // nested padding-free compositions +#if _CCCL_HAS_NVFP16() + test_is_trivially_copyable_relaxed, 2>>(); +#endif // _CCCL_HAS_NVFP16() #if _CCCL_HAS_NVFP16() && _CCCL_HAS_NVBF16() - test_is_trivially_copyable_relaxed>(); + test_is_trivially_copyable_relaxed>(); #endif // _CCCL_HAS_NVFP16() && _CCCL_HAS_NVBF16() // non-trivially copyable types From deb622ce233ee3778e0c6c8423849cd259f44762 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 1 Apr 2026 15:33:46 -0700 Subject: [PATCH 06/47] fix clang --- .../cuda/__type_traits/is_trivially_copyable_relaxed.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h index 79e8cb5e2d8..3741ba78658 100644 --- a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h +++ b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h @@ -37,8 +37,12 @@ _CCCL_BEGIN_NAMESPACE_CUDA //! Users are allowed to specialize this variable template for their own types template constexpr bool is_trivially_copyable_relaxed_v = - ::cuda::std::__is_extended_floating_point_v<::cuda::std::remove_cv_t<_Tp>> - || ::cuda::is_extended_fp_vector_type_v<::cuda::std::remove_cv_t<_Tp>> || ::cuda::std::is_trivially_copyable_v<_Tp>; + ::cuda::std::is_trivially_copyable_v<_Tp> + || ::cuda::std::__is_extended_floating_point_v<::cuda::std::remove_cv_t<_Tp>> +#if _CCCL_HAS_CTK() + || ::cuda::is_extended_fp_vector_type_v<::cuda::std::remove_cv_t<_Tp>> +#endif // _CCCL_HAS_CTK() + ; template constexpr bool is_trivially_copyable_relaxed_v<_Tp[]> = is_trivially_copyable_relaxed_v<_Tp>; From a553bb64f128fcf11d7bacd83fbd1feb5c938447 Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 1 Apr 2026 15:59:33 -0700 Subject: [PATCH 07/47] do not handle volatile --- .../is_trivially_copyable_relaxed.rst | 11 +++++---- .../is_trivially_copyable_relaxed.h | 8 +++---- ...is_trivially_copyable_relaxed.mem.pass.cpp | 23 ++++++++++++++----- .../is_trivially_copyable_relaxed.pass.cpp | 8 ------- 4 files changed, 28 insertions(+), 22 deletions(-) diff --git a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst index 6bfc38ff94f..60d1783e186 100644 --- a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst +++ b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst @@ -32,7 +32,7 @@ The trait also propagates through composite types: - ``cuda::std::pair``: relaxed trivially copyable when both ``T1`` and ``T2`` are and the object has no padding. - ``cuda::std::tuple``: relaxed trivially copyable when all ``Ts...`` are and the object has no padding. -``const``, ``volatile``, and ``const volatile`` qualifications are handled transparently. +``const`` qualification is handled transparently, while ``volatile`` is compile-time dependent. Custom Specialization --------------------- @@ -87,8 +87,11 @@ Examples static_assert(cuda::is_trivially_copyable_relaxed_v<__nv_bfloat16>); static_assert(cuda::is_trivially_copyable_relaxed_v<__half2>); - // Composite types containing extended floating-point types + // Padding-free composite types containing extended floating-point types static_assert(cuda::is_trivially_copyable_relaxed_v<__half[4]>); static_assert(cuda::is_trivially_copyable_relaxed_v>); - static_assert(cuda::is_trivially_copyable_relaxed_v>); - static_assert(cuda::is_trivially_copyable_relaxed_v>); + static_assert(cuda::is_trivially_copyable_relaxed_v>); + static_assert(cuda::is_trivially_copyable_relaxed_v>); + + // Composites with padding are not trivially copyable relaxed + static_assert(!cuda::is_trivially_copyable_relaxed_v>); diff --git a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h index 3741ba78658..2ee22d77e8e 100644 --- a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h +++ b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include @@ -37,10 +37,10 @@ _CCCL_BEGIN_NAMESPACE_CUDA //! Users are allowed to specialize this variable template for their own types template constexpr bool is_trivially_copyable_relaxed_v = - ::cuda::std::is_trivially_copyable_v<_Tp> - || ::cuda::std::__is_extended_floating_point_v<::cuda::std::remove_cv_t<_Tp>> + ::cuda::std::is_trivially_copyable_v<::cuda::std::remove_const_t<_Tp>> + || ::cuda::std::__is_extended_floating_point_v<::cuda::std::remove_const_t<_Tp>> #if _CCCL_HAS_CTK() - || ::cuda::is_extended_fp_vector_type_v<::cuda::std::remove_cv_t<_Tp>> + || ::cuda::is_extended_fp_vector_type_v<::cuda::std::remove_const_t<_Tp>> #endif // _CCCL_HAS_CTK() ; diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp index 538849373d6..c0515212644 100644 --- a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp @@ -57,10 +57,12 @@ __host__ __device__ void test_memcpy_roundtrip(T from) assert(test_memcmp(&buffer, ©, sizeof(T)) == 0); } -#define REPEAT_1(base_type, index) base_type(input[index][0]) -#define REPEAT_2(base_type, index) REPEAT_1(base_type, index), base_type(input[index][1]) -#define REPEAT_3(base_type, index) REPEAT_2(base_type, index), base_type(input[index][2]) -#define REPEAT_4(base_type, index) REPEAT_3(base_type, index), base_type(input[index][3]) +#define CAST(base_type, val) static_cast(val) + +#define REPEAT_1(base_type, index) CAST(base_type, input[index][0]) +#define REPEAT_2(base_type, index) REPEAT_1(base_type, index), CAST(base_type, input[index][1]) +#define REPEAT_3(base_type, index) REPEAT_2(base_type, index), CAST(base_type, input[index][2]) +#define REPEAT_4(base_type, index) REPEAT_3(base_type, index), CAST(base_type, input[index][3]) #define TEST_CUDA_VECTOR_TYPE(base_type, size) \ { \ @@ -178,8 +180,17 @@ __host__ __device__ bool tests() TEST_CUDA_VECTOR_TYPE(double, 2) TEST_CUDA_VECTOR_TYPE(double, 3) - using dim = unsigned int; - TEST_CUDA_VECTOR_TYPE(dim, 3) + for (dim3 i : + {dim3{0u, 1u, 2u}, + dim3{1u, 2u, 3u}, + dim3{10u, 20u, 30u}, + dim3{100u, 200u, 300u}, + dim3{255u, 128u, 64u}, + dim3{1024u, 512u, 256u}, + dim3{4096u, 2048u, 1024u}}) + { + test_memcpy_roundtrip(i); + } // extended floating-point scalar types #if _CCCL_HAS_NVFP16() diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp index ad61d9ddcf2..67c814c0bb4 100644 --- a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp @@ -19,12 +19,8 @@ __host__ __device__ void test_is_trivially_copyable_relaxed() { static_assert(cuda::is_trivially_copyable_relaxed::value); static_assert(cuda::is_trivially_copyable_relaxed::value); - static_assert(cuda::is_trivially_copyable_relaxed::value); - static_assert(cuda::is_trivially_copyable_relaxed::value); static_assert(cuda::is_trivially_copyable_relaxed_v); static_assert(cuda::is_trivially_copyable_relaxed_v); - static_assert(cuda::is_trivially_copyable_relaxed_v); - static_assert(cuda::is_trivially_copyable_relaxed_v); } template @@ -32,12 +28,8 @@ __host__ __device__ void test_is_not_trivially_copyable_relaxed() { static_assert(!cuda::is_trivially_copyable_relaxed::value); static_assert(!cuda::is_trivially_copyable_relaxed::value); - static_assert(!cuda::is_trivially_copyable_relaxed::value); - static_assert(!cuda::is_trivially_copyable_relaxed::value); static_assert(!cuda::is_trivially_copyable_relaxed_v); static_assert(!cuda::is_trivially_copyable_relaxed_v); - static_assert(!cuda::is_trivially_copyable_relaxed_v); - static_assert(!cuda::is_trivially_copyable_relaxed_v); } struct TrivialPod From de42a45466ca797bd7edf95aa8e5451061541dda Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 1 Apr 2026 16:19:40 -0700 Subject: [PATCH 08/47] unused-local-typedef --- .../cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp index c0515212644..a3abc0e4967 100644 --- a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp @@ -168,6 +168,8 @@ __host__ __device__ bool tests() TEST_CUDA_VECTOR_TYPE(long, 2) TEST_CUDA_VECTOR_TYPE(long, 3) + _CCCL_DIAG_PUSH + _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef") using longlong = long long; using ulonglong = unsigned long long; TEST_CUDA_VECTOR_TYPE(longlong, 1) From 4e7873d077514580045eb6cd7e2db707528ac7eb Mon Sep 17 00:00:00 2001 From: fbusato Date: Wed, 1 Apr 2026 16:34:33 -0700 Subject: [PATCH 09/47] fix clang pragma --- .../type_traits/is_trivially_copyable_relaxed.mem.pass.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp index a3abc0e4967..6fb5576aed2 100644 --- a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp @@ -16,6 +16,9 @@ #include "test_macros.h" +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef") + // memcpy is not used to avoid compiler optimizations __host__ __device__ void test_memcpy(void* dst, const void* src, cuda::std::size_t bytes) noexcept { @@ -168,8 +171,6 @@ __host__ __device__ bool tests() TEST_CUDA_VECTOR_TYPE(long, 2) TEST_CUDA_VECTOR_TYPE(long, 3) - _CCCL_DIAG_PUSH - _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef") using longlong = long long; using ulonglong = unsigned long long; TEST_CUDA_VECTOR_TYPE(longlong, 1) From 6e5021f1269e45e08ec517bd1bc3fff9fb1da211 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 2 Apr 2026 10:36:40 -0700 Subject: [PATCH 10/47] simplify conditions --- .../is_trivially_copyable_relaxed.rst | 21 ++- .../is_trivially_copyable_relaxed.h | 6 +- ...is_trivially_copyable_relaxed.mem.pass.cpp | 143 ++++++++++-------- .../is_trivially_copyable_relaxed.pass.cpp | 10 +- 4 files changed, 108 insertions(+), 72 deletions(-) diff --git a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst index 60d1783e186..f6337f5297e 100644 --- a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst +++ b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst @@ -32,16 +32,29 @@ The trait also propagates through composite types: - ``cuda::std::pair``: relaxed trivially copyable when both ``T1`` and ``T2`` are and the object has no padding. - ``cuda::std::tuple``: relaxed trivially copyable when all ``Ts...`` are and the object has no padding. -``const`` qualification is handled transparently, while ``volatile`` is compile-time dependent. +``const`` qualification is handled transparently, while ``volatile`` is compiler dependent. + +.. note:: + + The type trait cannot determine if a structure (``struct`` or ``class``) contains extended floating-point types, and thus recognize the type as *trivially copyable*. The user must manually specialize the type trait for such types. Custom Specialization --------------------- -Users may specialize ``cuda::is_trivially_copyable_relaxed_v`` for their own types whose memory representation is safe to copy with ``memcpy`` but that the compiler does not consider trivially copyable. +Users may specialize ``cuda::is_trivially_copyable_relaxed_v`` for types whose semantics allow copying with ``memcpy``, but which the compiler does not consider to be trivially copyable. + +A `trivially copyable `__ class is a class that + +- has at least one eligible copy constructor, move constructor, copy assignment operator, or move assignment operator, +- each eligible copy constructor is trivial +- each eligible move constructor is trivial +- each eligible copy assignment operator is trivial +- each eligible move assignment operator is trivial, and +- has a non-deleted trivial destructor. -... warning:: +.. warning:: - Users are responsible for ensuring that the type is actually trivially copyable when specializing this variable template. Otherwise, the behavior is undefined. + The user is responsible for ensuring that the type is actually trivially copyable when specializing this variable template. Otherwise, the behavior is undefined. A common case is a type that wraps extended floating-point fields and provides user-defined copy operations solely to add ``__host__ __device__`` annotations: diff --git a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h index 2ee22d77e8e..7fbb685390a 100644 --- a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h +++ b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h @@ -55,13 +55,11 @@ constexpr bool is_trivially_copyable_relaxed_v<::cuda::std::array<_Tp, _Size>> = template constexpr bool is_trivially_copyable_relaxed_v<::cuda::std::pair<_T1, _T2>> = - (sizeof(::cuda::std::pair<_T1, _T2>) == sizeof(_T1) + sizeof(_T2)) - && is_trivially_copyable_relaxed_v<_T1> && is_trivially_copyable_relaxed_v<_T2>; + is_trivially_copyable_relaxed_v<_T1> && is_trivially_copyable_relaxed_v<_T2>; template constexpr bool is_trivially_copyable_relaxed_v<::cuda::std::tuple<_Ts...>> = - (sizeof...(_Ts) == 0 || sizeof(::cuda::std::tuple<_Ts...>) == (sizeof(_Ts) + ... + 0)) - && (is_trivially_copyable_relaxed_v<_Ts> && ...); + (is_trivially_copyable_relaxed_v<_Ts> && ...); // defined as alias so users cannot specialize it (they should specialize the variable template instead) template diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp index 6fb5576aed2..af0ebddcc1f 100644 --- a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp @@ -19,45 +19,66 @@ _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef") -// memcpy is not used to avoid compiler optimizations -__host__ __device__ void test_memcpy(void* dst, const void* src, cuda::std::size_t bytes) noexcept +// operator== for CUDA vector types and dim3 (not provided by the toolkit) +template , int> = 0> +__host__ __device__ bool operator==(T a, T b) { - unsigned char* d = static_cast(dst); - const unsigned char* s = static_cast(src); - for (; bytes > 0; --bytes) + if constexpr (sizeof(T) == sizeof(decltype(T::x))) { - *d++ = *s++; + return a.x == b.x; + } + else if constexpr (sizeof(T) == 2 * sizeof(decltype(T::x))) + { + return a.x == b.x && a.y == b.y; + } + else if constexpr (sizeof(T) == 3 * sizeof(decltype(T::x))) + { + return a.x == b.x && a.y == b.y && a.z == b.z; + } + else + { + return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; } } -__host__ __device__ int test_memcmp(const void* lhs, const void* rhs, cuda::std::size_t bytes) noexcept +// In CUDA 12.x, __half/__nv_bfloat16 operators are __device__ only +#if _CCCL_HAS_NVFP16() +# if defined(__CUDA_NO_HALF_OPERATORS__) +__host__ __device__ bool operator==(__half a, __half b) { - const unsigned char* clhs = static_cast(lhs); - const unsigned char* crhs = static_cast(rhs); - for (; bytes > 0; --bytes) - { - if (*clhs++ != *crhs++) - { - return clhs[-1] < crhs[-1] ? -1 : 1; - } - } - return 0; + return __half2float(a) == __half2float(b); +} +# endif +# if defined(__CUDA_NO_HALF2_OPERATORS__) +__host__ __device__ bool operator==(__half2 a, __half2 b) +{ + return __half2float(a.x) == __half2float(b.x) && __half2float(a.y) == __half2float(b.y); +} +# endif +#endif // _CCCL_HAS_NVFP16() + +#if _CCCL_HAS_NVBF16() +# if defined(__CUDA_NO_BFLOAT16_OPERATORS__) +__host__ __device__ bool operator==(__nv_bfloat16 a, __nv_bfloat16 b) +{ + return __bfloat162float(a) == __bfloat162float(b); +} +# endif +# if defined(__CUDA_NO_BFLOAT162_OPERATORS__) +__host__ __device__ bool operator==(__nv_bfloat162 a, __nv_bfloat162 b) +{ + return __bfloat162float(a.x) == __bfloat162float(b.x) && __bfloat162float(a.y) == __bfloat162float(b.y); } +# endif +#endif // _CCCL_HAS_NVBF16() template __host__ __device__ void test_memcpy_roundtrip(T from) { static_assert(cuda::is_trivially_copyable_relaxed_v); - struct Buffer - { - char data[sizeof(T)]; - }; - Buffer buffer; - test_memcpy(&buffer, &from, sizeof(T)); - - Buffer copy; - test_memcpy(©, &buffer, sizeof(T)); - assert(test_memcmp(&buffer, ©, sizeof(T)) == 0); + T to; + ::memcpy(static_cast(&to), static_cast(&from), sizeof(T)); + assert(from == to); } #define CAST(base_type, val) static_cast(val) @@ -195,7 +216,17 @@ __host__ __device__ bool tests() test_memcpy_roundtrip(i); } - // extended floating-point scalar types + return true; +} + +// Extended floating-point types: in CUDA 12.x, __half/__nv_bfloat16 operator== is __device__ only. +// The function is __device__ on CUDA 12.x and __host__ __device__ on CUDA 13.x. +#if _CCCL_CTK_AT_LEAST(13, 0) +__host__ __device__ void tests_nvfp() +#else +__device__ void tests_nvfp() +#endif +{ #if _CCCL_HAS_NVFP16() for (__half i : {__float2half(0.0f), @@ -208,6 +239,24 @@ __host__ __device__ bool tests() { test_memcpy_roundtrip(i); } + + for (__half2 i : + {__half2{__float2half(0.0f), __float2half(1.0f)}, + __half2{__float2half(-1.0f), __float2half(2.0f)}, + __half2{__float2half(10.0f), __float2half(-10.0f)}, + __half2{__float2half(2.71828f), __float2half(3.14159f)}}) + { + test_memcpy_roundtrip(i); + } + + test_memcpy_roundtrip( + cuda::std::array<__half, 4>{__float2half(1.0f), __float2half(2.0f), __float2half(3.0f), __float2half(4.0f)}); + test_memcpy_roundtrip(cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)}); + test_memcpy_roundtrip(cuda::std::tuple<__half, __half>{__float2half(1.0f), __float2half(2.0f)}); + + test_memcpy_roundtrip(cuda::std::array, 2>{ + cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)}, + cuda::std::pair<__half, __half>{__float2half(3.0f), __float2half(4.0f)}}); #endif // _CCCL_HAS_NVFP16() #if _CCCL_HAS_NVBF16() @@ -222,21 +271,7 @@ __host__ __device__ bool tests() { test_memcpy_roundtrip(i); } -#endif // _CCCL_HAS_NVBF16() - - // extended floating-point vector types -#if _CCCL_HAS_NVFP16() - for (__half2 i : - {__half2{__float2half(0.0f), __float2half(1.0f)}, - __half2{__float2half(-1.0f), __float2half(2.0f)}, - __half2{__float2half(10.0f), __float2half(-10.0f)}, - __half2{__float2half(2.71828f), __float2half(3.14159f)}}) - { - test_memcpy_roundtrip(i); - } -#endif // _CCCL_HAS_NVFP16() -#if _CCCL_HAS_NVBF16() for (__nv_bfloat162 i : {__nv_bfloat162{__float2bfloat16(0.0f), __float2bfloat16(1.0f)}, __nv_bfloat162{__float2bfloat16(-1.0f), __float2bfloat16(2.0f)}, @@ -245,35 +280,25 @@ __host__ __device__ bool tests() { test_memcpy_roundtrip(i); } -#endif // _CCCL_HAS_NVBF16() - // padding-free compositions of extended floating-point types -#if _CCCL_HAS_NVFP16() - test_memcpy_roundtrip( - cuda::std::array<__half, 4>{__float2half(1.0f), __float2half(2.0f), __float2half(3.0f), __float2half(4.0f)}); - test_memcpy_roundtrip(cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)}); - test_memcpy_roundtrip(cuda::std::tuple<__half, __half>{__float2half(1.0f), __float2half(2.0f)}); -#endif // _CCCL_HAS_NVFP16() -#if _CCCL_HAS_NVBF16() test_memcpy_roundtrip(cuda::std::array<__nv_bfloat16, 2>{__float2bfloat16(1.0f), __float2bfloat16(2.0f)}); test_memcpy_roundtrip(cuda::std::pair<__nv_bfloat16, __nv_bfloat16>{__float2bfloat16(1.0f), __float2bfloat16(2.0f)}); #endif // _CCCL_HAS_NVBF16() - // nested padding-free compositions -#if _CCCL_HAS_NVFP16() - test_memcpy_roundtrip(cuda::std::array, 2>{ - cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)}, - cuda::std::pair<__half, __half>{__float2half(3.0f), __float2half(4.0f)}}); -#endif // _CCCL_HAS_NVFP16() #if _CCCL_HAS_NVFP16() && _CCCL_HAS_NVBF16() test_memcpy_roundtrip(cuda::std::tuple<__half, __nv_bfloat16>{__float2half(1.0f), __float2bfloat16(2.0f)}); #endif // _CCCL_HAS_NVFP16() && _CCCL_HAS_NVBF16() - - return true; } int main(int, char**) { tests(); +#if _CCCL_CTK_AT_LEAST(13, 0) + tests_nvfp(); +#else + NV_IF_TARGET(NV_IS_DEVICE, (tests_nvfp();)); +#endif return 0; } + +_CCCL_DIAG_POP diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp index 67c814c0bb4..fad1a3a78ed 100644 --- a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp @@ -98,16 +98,16 @@ __host__ __device__ void test() test_is_trivially_copyable_relaxed>(); #endif // _CCCL_HAS_NVBF16() - // compositions with padding are not trivially copyable relaxed + // compositions with padding #if _CCCL_HAS_NVFP16() - static_assert(!cuda::is_trivially_copyable_relaxed_v>); - static_assert(!cuda::is_trivially_copyable_relaxed_v>); + test_is_trivially_copyable_relaxed>(); + test_is_trivially_copyable_relaxed>(); #endif // _CCCL_HAS_NVFP16() #if _CCCL_HAS_NVBF16() - static_assert(!cuda::is_trivially_copyable_relaxed_v>); + test_is_trivially_copyable_relaxed>(); #endif // _CCCL_HAS_NVBF16() - // nested padding-free compositions + // nested compositions #if _CCCL_HAS_NVFP16() test_is_trivially_copyable_relaxed, 2>>(); #endif // _CCCL_HAS_NVFP16() From 8ff54f41f8015ddbc2c1998f7df6e34bfc2ed89a Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 2 Apr 2026 11:01:29 -0700 Subject: [PATCH 11/47] improve documentation --- docs/libcudacxx/extended_api/type_traits.rst | 10 +++++----- .../type_traits/is_trivially_copyable_relaxed.rst | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/libcudacxx/extended_api/type_traits.rst b/docs/libcudacxx/extended_api/type_traits.rst index a11d840668a..0f0b087b8fc 100644 --- a/docs/libcudacxx/extended_api/type_traits.rst +++ b/docs/libcudacxx/extended_api/type_traits.rst @@ -25,12 +25,12 @@ Type traits - CCCL 3.0.0 - CUDA 13.0 - * - :ref:`cuda::is_trivially_copyable_relaxed ` - - Relaxed trivially copyable check including extended floating-point types - - CCCL 3.4.0 - - CUDA 13.4 - * - :ref:`Vector Type Traits ` - Type traits for CUDA vector types - CCCL 3.3.0 - CUDA 13.3 + + * - :ref:`cuda::is_trivially_copyable_relaxed ` + - Relaxed trivially copyable check including extended floating-point types + - CCCL 3.4.0 + - CUDA 13.4 diff --git a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst index f6337f5297e..2ee873e7374 100644 --- a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst +++ b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst @@ -36,7 +36,7 @@ The trait also propagates through composite types: .. note:: - The type trait cannot determine if a structure (``struct`` or ``class``) contains extended floating-point types, and thus recognize the type as *trivially copyable*. The user must manually specialize the type trait for such types. + The type trait cannot determine if a structure (``struct`` or ``class``) contains extended floating-point types, and thus it recognizes the type as *trivially copyable*. The user must manually specialize the type trait for such types. Custom Specialization --------------------- From c4c1504f8e0d468478318574db4ec24b2136f092 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 2 Apr 2026 14:52:08 -0700 Subject: [PATCH 12/47] fix operator== --- ...is_trivially_copyable_relaxed.mem.pass.cpp | 25 ++++++++++++------- .../is_trivially_copyable_relaxed.pass.cpp | 3 +++ 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp index af0ebddcc1f..a5618b519c0 100644 --- a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.mem.pass.cpp @@ -7,6 +7,9 @@ // //===----------------------------------------------------------------------===// +#include +#include + #include #include #include @@ -41,20 +44,21 @@ __host__ __device__ bool operator==(T a, T b) } } -// In CUDA 12.x, __half/__nv_bfloat16 operators are __device__ only +// Extended FP operator== is __device__ only on CTK < 12.3, __host__ __device__ from CTK >= 12.3. +// Provide __host__ __device__ fallbacks only when operators are explicitly disabled by the user. #if _CCCL_HAS_NVFP16() # if defined(__CUDA_NO_HALF_OPERATORS__) __host__ __device__ bool operator==(__half a, __half b) { return __half2float(a) == __half2float(b); } -# endif +# endif // defined(__CUDA_NO_HALF_OPERATORS__) # if defined(__CUDA_NO_HALF2_OPERATORS__) __host__ __device__ bool operator==(__half2 a, __half2 b) { return __half2float(a.x) == __half2float(b.x) && __half2float(a.y) == __half2float(b.y); } -# endif +# endif // defined(__CUDA_NO_HALF2_OPERATORS__) #endif // _CCCL_HAS_NVFP16() #if _CCCL_HAS_NVBF16() @@ -63,13 +67,13 @@ __host__ __device__ bool operator==(__nv_bfloat16 a, __nv_bfloat16 b) { return __bfloat162float(a) == __bfloat162float(b); } -# endif +# endif // defined(__CUDA_NO_BFLOAT16_OPERATORS__) # if defined(__CUDA_NO_BFLOAT162_OPERATORS__) __host__ __device__ bool operator==(__nv_bfloat162 a, __nv_bfloat162 b) { return __bfloat162float(a.x) == __bfloat162float(b.x) && __bfloat162float(a.y) == __bfloat162float(b.y); } -# endif +# endif // defined(__CUDA_NO_BFLOAT162_OPERATORS__) #endif // _CCCL_HAS_NVBF16() template @@ -78,7 +82,11 @@ __host__ __device__ void test_memcpy_roundtrip(T from) static_assert(cuda::is_trivially_copyable_relaxed_v); T to; ::memcpy(static_cast(&to), static_cast(&from), sizeof(T)); +#if _CCCL_CTK_AT_LEAST(12, 3) assert(from == to); +#else + NV_IF_TARGET(NV_IS_DEVICE, (assert(from == to);)); +#endif } #define CAST(base_type, val) static_cast(val) @@ -219,9 +227,8 @@ __host__ __device__ bool tests() return true; } -// Extended floating-point types: in CUDA 12.x, __half/__nv_bfloat16 operator== is __device__ only. -// The function is __device__ on CUDA 12.x and __host__ __device__ on CUDA 13.x. -#if _CCCL_CTK_AT_LEAST(13, 0) +// Extended floating-point types: operator== is __device__ only on CTK < 12.3. +#if _CCCL_CTK_AT_LEAST(12, 3) __host__ __device__ void tests_nvfp() #else __device__ void tests_nvfp() @@ -293,7 +300,7 @@ __device__ void tests_nvfp() int main(int, char**) { tests(); -#if _CCCL_CTK_AT_LEAST(13, 0) +#if _CCCL_CTK_AT_LEAST(12, 3) tests_nvfp(); #else NV_IF_TARGET(NV_IS_DEVICE, (tests_nvfp();)); diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp index fad1a3a78ed..3c100ec19d6 100644 --- a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.pass.cpp @@ -7,6 +7,9 @@ // //===----------------------------------------------------------------------===// +#include +#include + #include #include #include From e603a9637616c2f4414423fffef553aba4ef0f31 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Mon, 6 Apr 2026 16:46:51 -0700 Subject: [PATCH 13/47] Update docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst Co-authored-by: Giannis Gonidelis --- .../extended_api/type_traits/is_trivially_copyable_relaxed.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst index 2ee873e7374..5a96d45529e 100644 --- a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst +++ b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst @@ -46,7 +46,7 @@ Users may specialize ``cuda::is_trivially_copyable_relaxed_v`` for types whose s A `trivially copyable `__ class is a class that - has at least one eligible copy constructor, move constructor, copy assignment operator, or move assignment operator, -- each eligible copy constructor is trivial +- each of its eligible copy constructors is trivial - each eligible move constructor is trivial - each eligible copy assignment operator is trivial - each eligible move assignment operator is trivial, and From cde3d1ed8f1fda2f4701d3412f4d057863cb1859 Mon Sep 17 00:00:00 2001 From: Federico Busato <50413820+fbusato@users.noreply.github.com> Date: Mon, 6 Apr 2026 16:47:13 -0700 Subject: [PATCH 14/47] Update docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst Co-authored-by: Giannis Gonidelis --- .../extended_api/type_traits/is_trivially_copyable_relaxed.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst index 5a96d45529e..330d7c6162e 100644 --- a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst +++ b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable_relaxed.rst @@ -28,7 +28,7 @@ A type ``T`` satisfies ``cuda::is_trivially_copyable_relaxed`` if any of the fol The trait also propagates through composite types: - C-style arrays: ``T[N]`` and ``T[]`` are relaxed trivially copyable when ``T`` is. -- ``cuda::std::array``: relaxed trivially copyable when ``T`` is. +- ``cuda::std::array``: relaxed trivially copyable when ``T`` is also trivially copyable. - ``cuda::std::pair``: relaxed trivially copyable when both ``T1`` and ``T2`` are and the object has no padding. - ``cuda::std::tuple``: relaxed trivially copyable when all ``Ts...`` are and the object has no padding. From c20fb893b9977ebba0694c14a9dd278c9de594da Mon Sep 17 00:00:00 2001 From: fbusato Date: Tue, 7 Apr 2026 14:31:41 -0700 Subject: [PATCH 15/47] add recursive struct check --- .../is_trivially_copyable_relaxed.h | 15 +- .../std/__type_traits/aggregate_members.h | 163 ++++++++++++++++++ ...s_trivially_copyable_relaxed.aggr.pass.cpp | 114 ++++++++++++ ...is_trivially_copyable_relaxed.mem.pass.cpp | 10 +- 4 files changed, 300 insertions(+), 2 deletions(-) create mode 100644 libcudacxx/include/cuda/std/__type_traits/aggregate_members.h create mode 100644 libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable_relaxed.aggr.pass.cpp diff --git a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h index 7fbb685390a..1d002c13319 100644 --- a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h +++ b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable_relaxed.h @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include #include #include @@ -34,6 +36,9 @@ _CCCL_BEGIN_NAMESPACE_CUDA +template +constexpr bool __is_aggregate_trivially_copyable_v = false; + //! Users are allowed to specialize this variable template for their own types template constexpr bool is_trivially_copyable_relaxed_v = @@ -42,7 +47,7 @@ constexpr bool is_trivially_copyable_relaxed_v = #if _CCCL_HAS_CTK() || ::cuda::is_extended_fp_vector_type_v<::cuda::std::remove_const_t<_Tp>> #endif // _CCCL_HAS_CTK() - ; + || __is_aggregate_trivially_copyable_v<::cuda::std::remove_const_t<_Tp>>; template constexpr bool is_trivially_copyable_relaxed_v<_Tp[]> = is_trivially_copyable_relaxed_v<_Tp>; @@ -61,6 +66,14 @@ template constexpr bool is_trivially_copyable_relaxed_v<::cuda::std::tuple<_Ts...>> = (is_trivially_copyable_relaxed_v<_Ts> && ...); +// if all the previous conditions fail, check if the type is an aggregate and all its members are trivially copyable +template +using __is_trivially_copyable_relaxed = ::cuda::std::bool_constant>; + +template +constexpr bool __is_aggregate_trivially_copyable_v<_Tp, ::cuda::std::enable_if_t<::cuda::std::is_aggregate_v<_Tp>>> = + ::cuda::std::__aggregate_all_of<__is_trivially_copyable_relaxed, _Tp>::value; + // defined as alias so users cannot specialize it (they should specialize the variable template instead) template using is_trivially_copyable_relaxed = ::cuda::std::bool_constant>; diff --git a/libcudacxx/include/cuda/std/__type_traits/aggregate_members.h b/libcudacxx/include/cuda/std/__type_traits/aggregate_members.h new file mode 100644 index 00000000000..c12253f09e4 --- /dev/null +++ b/libcudacxx/include/cuda/std/__type_traits/aggregate_members.h @@ -0,0 +1,163 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___TYPE_TRAITS_AGGREGATE_MEMBERS_H +#define _CUDA_STD___TYPE_TRAITS_AGGREGATE_MEMBERS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include + +#include + +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wmissing-field-initializers") + +_CCCL_BEGIN_NAMESPACE_CUDA_STD + +#if defined(_CCCL_BUILTIN_STRUCTURED_BINDING_SIZE) + +// Returns the number of aggregate members, or `-1` if the type is not an aggregate. +template = 0, int> = 0> +constexpr int __aggregate_arity_v = _CCCL_BUILTIN_STRUCTURED_BINDING_SIZE(_Tp); + +#else // ^^^ _CCCL_BUILTIN_STRUCTURED_BINDING_SIZE ^^^ / !_CCCL_BUILTIN_STRUCTURED_BINDING_SIZE vvv + +// provide a generic way to initialize an aggregate member +struct __any_aggregate_member +{ + template + _CCCL_API constexpr operator _Tp&&() const; +}; + +template > +struct __aggregate_arity_impl +{ + template + _CCCL_API auto operator()(_Args... __args) -> decltype(_Self{}(__args..., __any_aggregate_member{})); + + template + _CCCL_API auto operator()(_Args...) const -> char (*)[sizeof...(_Args) + 1]; // return the number of members + 1 +}; + +// T is not an aggregate, return 1 +template +struct __aggregate_arity_impl<_Tp, false> +{ + _CCCL_API auto operator()() const -> char*; +}; + +// Returns the number of aggregate members, or `-1` if the type is not an aggregate. +template +constexpr int __aggregate_arity_v = int{sizeof(*__aggregate_arity_impl<_Tp>{}())} - 2; + +#endif // ^^^ !_CCCL_BUILTIN_STRUCTURED_BINDING_SIZE ^^^ + +// Apply a Predicate to every aggregate member + +// provide a generic way to initialize an aggregate member but only if the Predicate is true +template