diff --git a/c/parallel/src/transform.cu b/c/parallel/src/transform.cu index 5e9ffc062ef..3f173c6162b 100644 --- a/c/parallel/src/transform.cu +++ b/c/parallel/src/transform.cu @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -327,7 +328,7 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {9}, "Ho build_ptr->cache = new transform::cache(); // avoid new and delete which requires the allocated and freed types to match - static_assert(std::is_trivially_copyable_v); + static_assert(::cuda::is_trivially_copyable_v); build_ptr->runtime_policy = std::malloc(sizeof(policy_sel)); std::memcpy(build_ptr->runtime_policy, &policy_sel, sizeof(policy_sel)); @@ -528,7 +529,7 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {12}, "H build_ptr->cache = new transform::cache(); // avoid new and delete which requires the allocated and freed types to match - static_assert(std::is_trivially_copyable_v); + static_assert(::cuda::is_trivially_copyable_v); build_ptr->runtime_policy = std::malloc(sizeof(policy_sel)); std::memcpy(build_ptr->runtime_policy, &policy_sel, sizeof(policy_sel)); diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh index 59cbb8b2c23..a8a968bfe51 100644 --- a/cub/cub/agent/single_pass_scan_operators.cuh +++ b/cub/cub/agent/single_pass_scan_operators.cuh @@ -29,9 +29,9 @@ #include #include +#include #include #include -#include #include @@ -473,7 +473,7 @@ using default_no_delay_t = default_no_delay_constructor_t::delay_t; template using default_delay_constructor_t = // TODO(bgruber): remove the check for is_primitive in CCCL 4.0 - ::cuda::std::conditional_t::value || ::cuda::std::is_trivially_copyable_v, + ::cuda::std::conditional_t::value || ::cuda::is_trivially_copyable_v, fixed_delay_constructor_t<350, 450>, default_no_delay_constructor_t>; @@ -483,7 +483,7 @@ using default_delay_t = typename default_delay_constructor_t::delay_t; template using default_reduce_by_key_delay_constructor_t = // TODO(bgruber): remove the check for is_primitive in CCCL 4.0 - ::cuda::std::conditional_t<(is_primitive::value || ::cuda::std::is_trivially_copyable_v) + ::cuda::std::conditional_t<(is_primitive::value || ::cuda::is_trivially_copyable_v) && (sizeof(ValueT) + sizeof(KeyT) < largest_atomic_message_size), reduce_by_key_delay_constructor_t<350, 450>, default_delay_constructor_t>>; @@ -582,7 +582,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t tile_state_init( template in CCCL 4.0 bool SingleWord = detail::is_primitive::value - || (::cuda::std::is_trivially_copyable_v + || (::cuda::is_trivially_copyable_v && sizeof(T) < detail::largest_atomic_message_size // TODO(bgruber): a power of two size is not strictly necessary, but the implementation // cannot handle it currently. For example, we could support status word + int3. @@ -957,7 +957,7 @@ struct ScanTileState template in CCCL 4.0 - bool SingleWord = (detail::is_primitive::value || ::cuda::std::is_trivially_copyable_v) + bool SingleWord = (detail::is_primitive::value || ::cuda::is_trivially_copyable_v) && (sizeof(ValueT) + sizeof(KeyT) < detail::largest_atomic_message_size)> struct ReduceByKeyScanTileState; diff --git a/cub/cub/detail/uninitialized_copy.cuh b/cub/cub/detail/uninitialized_copy.cuh index 5c58b862ba9..253aea8da7b 100644 --- a/cub/cub/detail/uninitialized_copy.cuh +++ b/cub/cub/detail/uninitialized_copy.cuh @@ -13,9 +13,9 @@ # pragma system_header #endif // no system header +#include #include #include -#include #include CUB_NAMESPACE_BEGIN @@ -30,14 +30,14 @@ _CCCL_HOST_DEVICE void uninitialized_copy_single(T* ptr, U&& val) new (ptr) T(::cuda::std::forward(val)); } #else // ^^^ _CCCL_CUDA_COMPILER(NVHPC) ^^^ / vvv !_CCCL_CUDA_COMPILER(NVHPC) vvv -template , int> = 0> +template , int> = 0> _CCCL_HOST_DEVICE void uninitialized_copy_single(T* ptr, U&& val) { // gevtushenko: placement new should work here as well, but the code generated for copy assignment is sometimes better *ptr = ::cuda::std::forward(val); } -template , int> = 0> +template , int> = 0> _CCCL_HOST_DEVICE void uninitialized_copy_single(T* ptr, U&& val) { new (ptr) T(::cuda::std::forward(val)); diff --git a/cub/cub/detail/warpspeed/look_ahead.cuh b/cub/cub/detail/warpspeed/look_ahead.cuh index feffea126d5..0079e8cb197 100644 --- a/cub/cub/detail/warpspeed/look_ahead.cuh +++ b/cub/cub/detail/warpspeed/look_ahead.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -75,7 +76,7 @@ storeTileAggregate(tile_state_t* ptrTileStates, scan_state scanState, Ac _CCCL_ASSERT(index >= 0 && index < gridDim.x, "Reading out of bounds tile state"); if constexpr (sizeof(tile_state_t) <= cub::detail::warpspeed::max_native_atomic_size() - && ::cuda::std::is_trivially_copyable_v>) + && ::cuda::is_trivially_copyable_v>) { static_assert(::cuda::is_power_of_two(sizeof(tile_state_t))); tile_state_t tmp{scanState, sum}; @@ -103,7 +104,7 @@ _CCCL_DEVICE_API tile_state_t loadTileAggregate(tile_state_t* pt tile_state_t res; if constexpr (sizeof(tile_state_t) <= cub::detail::warpspeed::max_native_atomic_size() - && ::cuda::std::is_trivially_copyable_v>) + && ::cuda::is_trivially_copyable_v>) { static_assert(::cuda::is_power_of_two(sizeof(tile_state_t))); # if _CCCL_HAS_NV_ATOMIC_BUILTINS() diff --git a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh index b5abe5aae4b..38f4a5d1ac8 100644 --- a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh @@ -18,6 +18,7 @@ #include #include +#include #include CUB_NAMESPACE_BEGIN @@ -80,7 +81,7 @@ struct Transforms static_assert(::cuda::std::is_convertible_v, "The common type of `LevelT` and `SampleT` must be " "convertible to `int`."); - static_assert(::cuda::std::is_trivially_copyable_v, + static_assert(::cuda::is_trivially_copyable_v, "The common type of `LevelT` and `SampleT` must be " "trivially copyable."); diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh index 023b5b18ff0..9d2047506cd 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh @@ -24,9 +24,9 @@ #include #include +#include #include #include -#include #include CUB_NAMESPACE_BEGIN @@ -1657,7 +1657,7 @@ struct policy_selector_from_types int{sizeof(AccumT)}, classify_type, is_primitive_v, - ::cuda::std::is_trivially_copyable_v, + ::cuda::is_trivially_copyable_v, is_primitive_v, basic_binary_op_t::value}(cc); } diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh index 429555865a9..046f893d432 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -525,7 +526,7 @@ struct policy_selector_from_types int{sizeof(KeyT)}, classify_type, is_primitive_v, - ::cuda::std::is_trivially_copyable_v, + ::cuda::is_trivially_copyable_v, is_primitive_v}; return selector(cc); } diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh index f8534346552..4c176e41182 100644 --- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -612,7 +613,7 @@ struct policy_selector_from_types classify_type, is_primitive_v, is_primitive_v, - ::cuda::std::is_trivially_copyable_v}; + ::cuda::is_trivially_copyable_v}; return selector(cc); } }; diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh index d67edd7569a..8dd8aa9fb6e 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -1451,8 +1452,8 @@ struct policy_selector_from_types classify_op, THRUST_NS_QUALIFIER::is_contiguous_iterator_v, THRUST_NS_QUALIFIER::is_contiguous_iterator_v, - ::cuda::std::is_trivially_copyable_v, - ::cuda::std::is_trivially_copyable_v, + ::cuda::is_trivially_copyable_v, + ::cuda::is_trivially_copyable_v, ::cuda::std::is_default_constructible_v, accum_is_primitive_or_trivially_copy_constructible, benchmark_match}; diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh index 3d93fa6c028..9ee7dcf206b 100644 --- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh @@ -26,9 +26,9 @@ #include #include +#include #include #include -#include CUB_NAMESPACE_BEGIN @@ -1951,7 +1951,7 @@ struct policy_selector_from_types static_cast(sizeof(ValueT)), static_cast(sizeof(AccumT)), is_primitive::value, - ::cuda::std::is_trivially_copyable_v, + ::cuda::is_trivially_copyable_v, classify_type, classify_type, classify_type, diff --git a/cudax/include/cuda/experimental/__container/graph_buffer.cuh b/cudax/include/cuda/experimental/__container/graph_buffer.cuh index 5eec2c6feb0..ad65bff4975 100644 --- a/cudax/include/cuda/experimental/__container/graph_buffer.cuh +++ b/cudax/include/cuda/experimental/__container/graph_buffer.cuh @@ -27,8 +27,8 @@ # include # include # include +# include # include -# include # include # include # include @@ -68,7 +68,7 @@ namespace cuda::experimental template class graph_buffer { - static_assert(::cuda::std::is_trivially_copyable_v<_Tp>, "graph_buffer requires T to be trivially copyable."); + static_assert(::cuda::is_trivially_copyable_v<_Tp>, "graph_buffer requires T to be trivially copyable."); public: using value_type = _Tp; diff --git a/cudax/include/cuda/experimental/__copy/mdspan_d2d.cuh b/cudax/include/cuda/experimental/__copy/mdspan_d2d.cuh index 79101ac8722..595c466c980 100644 --- a/cudax/include/cuda/experimental/__copy/mdspan_d2d.cuh +++ b/cudax/include/cuda/experimental/__copy/mdspan_d2d.cuh @@ -33,6 +33,7 @@ # include # include # include +# include # include # include # include @@ -43,7 +44,6 @@ # include # include # include -# include # include # include @@ -125,7 +125,7 @@ _CCCL_HOST_API void copy(::cuda::device_mdspan<_TpIn, _ExtentsIn, _LayoutPolicyI && ::cuda::std::is_convertible_v<_AccessorPolicyOut, __default_accessor_out>; constexpr bool __are_byte_copyable = ::cuda::std::is_same_v<::cuda::std::remove_cv_t<_TpIn>, ::cuda::std::remove_cv_t<_TpOut>> - && ::cuda::std::is_trivially_copyable_v<_TpIn> // + && ::cuda::is_trivially_copyable_v<_TpIn> // && __have_default_accessors; if (__tensor_size == 1 && __are_byte_copyable) diff --git a/cudax/include/cuda/experimental/__copy_bytes/mdspan_d2h_h2d.cuh b/cudax/include/cuda/experimental/__copy_bytes/mdspan_d2h_h2d.cuh index 5424a73c865..0b1897760a9 100644 --- a/cudax/include/cuda/experimental/__copy_bytes/mdspan_d2h_h2d.cuh +++ b/cudax/include/cuda/experimental/__copy_bytes/mdspan_d2h_h2d.cuh @@ -27,6 +27,7 @@ # include # include # include +# include # include # include # include @@ -37,7 +38,6 @@ # include # include # include -# include # include # include @@ -74,7 +74,7 @@ _CCCL_HOST_API void __copy_bytes_impl( namespace cudax = ::cuda::experimental; static_assert(::cuda::std::is_same_v<::cuda::std::remove_cv_t<_TpIn>, ::cuda::std::remove_cv_t<_TpOut>>, "cudax::copy_bytes: TpIn and TpOut must be the same type"); - static_assert(::cuda::std::is_trivially_copyable_v<_TpIn>, "TpIn must be trivially copyable"); + static_assert(::cuda::is_trivially_copyable_v<_TpIn>, "TpIn must be trivially copyable"); static_assert(!::cuda::std::is_const_v<_TpOut>, "TpOut must not be const"); static_assert(::cuda::__is_cuda_mdspan_layout_v<_LayoutPolicyIn>, "cudax::copy_bytes: LayoutPolicyIn must be a predefined layout policy"); diff --git a/cudax/include/cuda/experimental/__graph/copy_bytes.cuh b/cudax/include/cuda/experimental/__graph/copy_bytes.cuh index c4fd1fc2ac1..007c35963af 100644 --- a/cudax/include/cuda/experimental/__graph/copy_bytes.cuh +++ b/cudax/include/cuda/experimental/__graph/copy_bytes.cuh @@ -26,11 +26,11 @@ # include # include # include +# include # include # include # include # include -# include # include # include @@ -47,7 +47,7 @@ _CCCL_HOST_API graph_node_ref __copy_bytes_graph_impl(path_builder& __pb, ::cuda::std::span<_SrcTy> __src, ::cuda::std::span<_DstTy> __dst) { static_assert(!::cuda::std::is_const_v<_DstTy>, "Copy destination can't be const"); - static_assert(::cuda::std::is_trivially_copyable_v<_SrcTy> && ::cuda::std::is_trivially_copyable_v<_DstTy>, + static_assert(::cuda::is_trivially_copyable_v<_SrcTy> && ::cuda::is_trivially_copyable_v<_DstTy>, "Copy source and destination element types must be trivially copyable"); if (__src.size_bytes() > __dst.size_bytes()) diff --git a/cudax/include/cuda/experimental/__graph/fill_bytes.cuh b/cudax/include/cuda/experimental/__graph/fill_bytes.cuh index 398baf492fb..ded7ce5cea7 100644 --- a/cudax/include/cuda/experimental/__graph/fill_bytes.cuh +++ b/cudax/include/cuda/experimental/__graph/fill_bytes.cuh @@ -26,11 +26,11 @@ # include # include # include +# include # include # include # include # include -# include # include # include @@ -48,8 +48,7 @@ _CCCL_HOST_API graph_node_ref __fill_bytes_graph_impl(path_builder& __pb, ::cuda::std::span<_DstTy, _DstSize> __dst, ::cuda::std::uint8_t __value) { static_assert(!::cuda::std::is_const_v<_DstTy>, "Fill destination can't be const"); - static_assert(::cuda::std::is_trivially_copyable_v<_DstTy>, - "Fill destination element type must be trivially copyable"); + static_assert(::cuda::is_trivially_copyable_v<_DstTy>, "Fill destination element type must be trivially copyable"); auto __deps = __pb.get_dependencies(); ::CUgraphNodeParams __params{}; diff --git a/cudax/include/cuda/experimental/__kernel/kernel_ref.cuh b/cudax/include/cuda/experimental/__kernel/kernel_ref.cuh index 16890ba6f46..19340e03dd9 100644 --- a/cudax/include/cuda/experimental/__kernel/kernel_ref.cuh +++ b/cudax/include/cuda/experimental/__kernel/kernel_ref.cuh @@ -25,8 +25,8 @@ #include #include #include +#include #include -#include #include #include @@ -51,7 +51,7 @@ class kernel_ref template class kernel_ref { - static_assert((true && ... && ::cuda::std::is_trivially_copyable_v<_Args>), + static_assert((true && ... && ::cuda::is_trivially_copyable_v<_Args>), "All kernel_ref argument types must be trivially copyable."); public: diff --git a/docs/libcudacxx/extended_api/type_traits.rst b/docs/libcudacxx/extended_api/type_traits.rst index f11c8e9f998..9f55314a9e7 100644 --- a/docs/libcudacxx/extended_api/type_traits.rst +++ b/docs/libcudacxx/extended_api/type_traits.rst @@ -8,6 +8,7 @@ Type traits :maxdepth: 1 type_traits/is_floating_point + type_traits/is_trivially_copyable type_traits/is_bitwise_comparable type_traits/vector_types @@ -30,6 +31,11 @@ Type traits - CCCL 3.3.0 - CUDA 13.3 + * - :ref:`cuda::is_trivially_copyable ` + - Relaxed trivially copyable check including extended floating-point types + - CCCL 3.4.0 + - CUDA 13.4 + * - :ref:`cuda::is_bitwise_comparable ` - User-specializable bitwise comparability check - CCCL 3.4.0 diff --git a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable.rst b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable.rst new file mode 100644 index 00000000000..e646f942ce1 --- /dev/null +++ b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable.rst @@ -0,0 +1,75 @@ +.. _libcudacxx-extended-api-type_traits-is_trivially_copyable: + +``cuda::is_trivially_copyable`` +======================================= + +Defined in the ```` header. + +.. code:: cuda + + namespace cuda { + + template + constexpr bool is_trivially_copyable_v = /* see below */; + + template + using is_trivially_copyable = cuda::std::bool_constant>; + + } // namespace cuda + +``cuda::is_trivially_copyable_v`` trait evaluates if a type can be copied by copying its underlying bytes. +It extends ``cuda::std::is_trivially_copyable`` to also recognize CUDA extended floating-point vector types as trivially copyable. + +``cuda::is_trivially_copyable_v`` relies on ``cuda::std::is_trivially_copyable`` but adds support for CUDA-specific types. + +The trait is true when ``T`` is any of the following: + +- A type for which ``cuda::std::is_trivially_copyable_v`` is true. +- An extended floating-point vector type, for example ``__half2``, ``__nv_bfloat162``. + +The trait also propagates through composite types: + +- C-style arrays: ``T[N]`` and ``T[]`` are trivially copyable when ``T`` is. +- ``cuda::std::array``: trivially copyable when ``T`` is. +- ``cuda::std::pair``: trivially copyable when both ``T1`` and ``T2`` are. +- ``cuda::std::tuple``: trivially copyable when all ``Ts...`` are. +- ``cuda::std::complex``: trivially copyable when ``T`` is. +- ``cuda::complex``: trivially copyable when ``T`` is. +- `Aggregates `__: trivially copyable when all their members are. + + - On MSVC, recursive data-member inspection is not supported beyond the first level. + +``const`` qualification is handled transparently, while ``volatile`` is compiler dependent. + +Examples +-------- + +.. code:: cuda + + #include + #include + #include + #include + + #include + + // Standard trivially copyable types + static_assert(cuda::is_trivially_copyable_v); + static_assert(cuda::is_trivially_copyable_v); + + // Extended floating-point types + static_assert(cuda::is_trivially_copyable_v<__half>); + static_assert(cuda::is_trivially_copyable_v<__nv_bfloat16>); + static_assert(cuda::is_trivially_copyable_v<__half2>); + static_assert(cuda::is_trivially_copyable_v>); + static_assert(cuda::is_trivially_copyable_v>); + + // Composite types containing extended floating-point types + static_assert(cuda::is_trivially_copyable_v<__half[4]>); + static_assert(cuda::is_trivially_copyable_v>); + static_assert(cuda::is_trivially_copyable_v>); + static_assert(cuda::is_trivially_copyable_v>); + static_assert(cuda::is_trivially_copyable_v>); + + +`See it on Godbolt 🔗 `__ diff --git a/docs/libcudacxx/standard_api/numerics_library/bit.rst b/docs/libcudacxx/standard_api/numerics_library/bit.rst index 9904db7352e..44079a4d81a 100644 --- a/docs/libcudacxx/standard_api/numerics_library/bit.rst +++ b/docs/libcudacxx/standard_api/numerics_library/bit.rst @@ -3,6 +3,16 @@ ```` ================== +``cuda::std::bit_cast`` +----------------------- + +``cuda::std::bit_cast`` extended the standard ``std::bit_cast`` to also recognize CUDA extended floating-point scalar and vector types as trivially copyable. + +**Limitations** + +- The function can be used in ``constexpr`` contexts only when the source and destination types are trivially copyable. +- The function cannot be used in ``constexpr`` contexts with MSVC <= 19.25 and GCC <= 10. + CUDA Performance Considerations ------------------------------- diff --git a/libcudacxx/include/cuda/__algorithm/copy.h b/libcudacxx/include/cuda/__algorithm/copy.h index 982f1b83f7d..217b3375654 100644 --- a/libcudacxx/include/cuda/__algorithm/copy.h +++ b/libcudacxx/include/cuda/__algorithm/copy.h @@ -26,6 +26,7 @@ # include # include # include +# include # include # include # include @@ -72,7 +73,7 @@ _CCCL_HOST_API void __copy_bytes_impl( [[maybe_unused]] copy_configuration __config) { static_assert(!::cuda::std::is_const_v<_DstTy>, "Copy destination can't be const"); - static_assert(::cuda::std::is_trivially_copyable_v<_SrcTy> && ::cuda::std::is_trivially_copyable_v<_DstTy>); + static_assert(::cuda::is_trivially_copyable_v<_SrcTy> && ::cuda::is_trivially_copyable_v<_DstTy>); if (__src.size_bytes() > __dst.size_bytes()) { diff --git a/libcudacxx/include/cuda/__algorithm/fill.h b/libcudacxx/include/cuda/__algorithm/fill.h index 32e0391a987..8a832ee7739 100644 --- a/libcudacxx/include/cuda/__algorithm/fill.h +++ b/libcudacxx/include/cuda/__algorithm/fill.h @@ -26,6 +26,7 @@ # include # include # include +# include # include # include # include @@ -41,7 +42,7 @@ _CCCL_HOST_API void __fill_bytes_impl(stream_ref __stream, ::cuda::std::span<_DstTy, _DstSize> __dst, ::cuda::std::uint8_t __value) { static_assert(!::cuda::std::is_const_v<_DstTy>, "Fill destination can't be const"); - static_assert(::cuda::std::is_trivially_copyable_v<_DstTy>); + static_assert(::cuda::is_trivially_copyable_v<_DstTy>); // TODO do a host callback if not device accessible? ::cuda::__driver::__memsetAsync(__dst.data(), __value, __dst.size_bytes(), __stream.get()); diff --git a/libcudacxx/include/cuda/__container/buffer.h b/libcudacxx/include/cuda/__container/buffer.h index 6c4cbe4ebb2..58fcfc2469d 100644 --- a/libcudacxx/include/cuda/__container/buffer.h +++ b/libcudacxx/include/cuda/__container/buffer.h @@ -38,6 +38,7 @@ # include # include # include +# include # include # include # include @@ -50,7 +51,6 @@ # include # include # include -# include # include # include # include @@ -115,7 +115,7 @@ class buffer friend class buffer; // For now we require trivially copyable type to simplify the implementation - static_assert(::cuda::std::is_trivially_copyable_v<_Tp>, "cuda::buffer requires T to be trivially copyable."); + static_assert(::cuda::is_trivially_copyable_v<_Tp>, "cuda::buffer requires T to be trivially copyable."); // At least one of the properties must signal an execution space static_assert(::cuda::mr::__contains_execution_space_property<_Properties...>, diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h index 70f9f71551b..7b8fea552d7 100644 --- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h @@ -31,9 +31,9 @@ #include #include #include +#include #include #include -#include #include #include @@ -58,7 +58,7 @@ template & __barrier) { - static_assert(::cuda::std::is_trivially_copyable_v<_Tp>, "memcpy_async requires a trivially copyable type"); + static_assert(::cuda::is_trivially_copyable_v<_Tp>, "memcpy_async requires a trivially copyable type"); // 1. Determine which completion mechanisms can be used with the current // barrier. A local shared memory barrier, i.e., block-scope barrier in local diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h index 4bc7c0240a8..163771d3a36 100644 --- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h +++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h @@ -33,9 +33,9 @@ # include # include # include +# include # include # include -# include # include # include @@ -55,7 +55,7 @@ _CCCL_DEVICE_API async_contract_fulfillment memcpy_async_tx( // memcpy_async when compiling with GCC 4.8. // FIXME: remove the #if once GCC 4.8 is no longer supported. # if !_CCCL_COMPILER(GCC) || _CCCL_COMPILER(GCC, >, 4, 8) - static_assert(::cuda::std::is_trivially_copyable_v<_Tp>, "memcpy_async_tx requires a trivially copyable type"); + static_assert(::cuda::is_trivially_copyable_v<_Tp>, "memcpy_async_tx requires a trivially copyable type"); # endif static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned."); static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)"); diff --git a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable.h b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable.h new file mode 100644 index 00000000000..fecc6714b97 --- /dev/null +++ b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable.h @@ -0,0 +1,110 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef __CUDA__TYPE_TRAITS_IS_TRIVIALLY_COPYABLE_H +#define __CUDA__TYPE_TRAITS_IS_TRIVIALLY_COPYABLE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA + +template +inline constexpr bool __is_aggregate_trivially_copyable_v = false; + +template +inline constexpr bool __is_trivially_copyable_v = + ::cuda::std::is_trivially_copyable_v<_Tp> || __is_aggregate_trivially_copyable_v<_Tp>; + +#if _CCCL_HAS_NVFP16() + +template <> +inline constexpr bool __is_trivially_copyable_v<::__half2> = true; + +#endif // _CCCL_HAS_NVFP16() + +#if _CCCL_HAS_NVBF16() + +template <> +inline constexpr bool __is_trivially_copyable_v<::__nv_bfloat162> = true; + +#endif // _CCCL_HAS_NVBF16() + +template +inline constexpr bool __is_trivially_copyable_v<_Tp[]> = __is_trivially_copyable_v<_Tp>; + +template +inline constexpr bool __is_trivially_copyable_v<_Tp[_Size]> = __is_trivially_copyable_v<_Tp>; + +template +inline constexpr bool __is_trivially_copyable_v<::cuda::std::array<_Tp, _Size>> = __is_trivially_copyable_v<_Tp>; + +template +inline constexpr bool __is_trivially_copyable_v<::cuda::std::pair<_T1, _T2>> = + __is_trivially_copyable_v<_T1> && __is_trivially_copyable_v<_T2>; + +template +inline constexpr bool __is_trivially_copyable_v<::cuda::std::tuple<_Ts...>> = (__is_trivially_copyable_v<_Ts> && ...); + +template +inline constexpr bool __is_trivially_copyable_v> = true; + +template +inline constexpr bool __is_trivially_copyable_v<::cuda::std::complex<_Tp>> = true; + +// if all the previous conditions fail, check if the type is an aggregate and all its members are trivially copyable +template +using __is_trivially_copyable_callable = ::cuda::std::bool_constant<__is_trivially_copyable_v<_Tp>>; + +template +inline constexpr bool __is_aggregate_trivially_copyable_v< + _Tp, + ::cuda::std::enable_if_t<::cuda::std::is_aggregate_v<_Tp> && !::cuda::std::is_trivially_copyable_v<_Tp>>> = + ::cuda::std::__aggregate_all_of_v<__is_trivially_copyable_callable, _Tp>; + +//---------------------------------------------------------------------------------------------------------------------- +// public traits + +template +inline constexpr bool is_trivially_copyable_v = __is_trivially_copyable_v<_Tp>; + +template +inline constexpr bool is_trivially_copyable_v = is_trivially_copyable_v<_Tp>; + +// defined as alias so users cannot specialize it (they should specialize the variable template instead) +template +using is_trivially_copyable = ::cuda::std::bool_constant>; + +_CCCL_END_NAMESPACE_CUDA + +#include + +#endif // __CUDA__TYPE_TRAITS_IS_TRIVIALLY_COPYABLE_H diff --git a/libcudacxx/include/cuda/std/__bit/bit_cast.h b/libcudacxx/include/cuda/std/__bit/bit_cast.h index e7886ad4976..98844e3038f 100644 --- a/libcudacxx/include/cuda/std/__bit/bit_cast.h +++ b/libcudacxx/include/cuda/std/__bit/bit_cast.h @@ -4,7 +4,7 @@ // under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +// SPDX-FileCopyrightText: Copyright (c) 2024-26 NVIDIA CORPORATION & AFFILIATES. // //===----------------------------------------------------------------------===// @@ -21,10 +21,9 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include +#include +#include +#include #include #include @@ -42,37 +41,45 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD #else // ^^^ _CCCL_BUILTIN_BIT_CAST ^^^ / vvv !_CCCL_BUILTIN_BIT_CAST vvv # define _CCCL_CONSTEXPR_BIT_CAST # define _CCCL_HAS_CONSTEXPR_BIT_CAST() 0 -# if _CCCL_COMPILER(GCC, >=, 8) -// GCC starting with GCC8 warns about our extended floating point types having protected data members +#endif // !_CCCL_BUILTIN_BIT_CAST + +#if _CCCL_COMPILER(GCC, >=, 8) _CCCL_DIAG_PUSH _CCCL_DIAG_SUPPRESS_GCC("-Wclass-memaccess") -# endif // _CCCL_COMPILER(GCC, >=, 8) -#endif // !_CCCL_BUILTIN_BIT_CAST +#endif // _CCCL_COMPILER(GCC, >=, 8) -template = 0, - enable_if_t || __is_extended_floating_point_v<_To>, int> = 0, - enable_if_t || __is_extended_floating_point_v<_From>, int> = 0> -[[nodiscard]] _CCCL_API inline _CCCL_CONSTEXPR_BIT_CAST _To bit_cast(const _From& __from) noexcept +template +[[nodiscard]] _CCCL_API inline _To __bit_cast_memcpy(const _From& __from) noexcept { -#if defined(_CCCL_BUILTIN_BIT_CAST) - return _CCCL_BUILTIN_BIT_CAST(_To, __from); -#else // ^^^ _CCCL_BUILTIN_BIT_CAST ^^^ / vvv !_CCCL_BUILTIN_BIT_CAST vvv - static_assert(is_trivially_default_constructible_v<_To>, - "The compiler does not support __builtin_bit_cast, so bit_cast additionally requires the destination " - "type to be trivially constructible"); +#if !_CCCL_COMPILER(GCC, <=, 7) + static_assert(::cuda::std::default_initializable<_To>, + "bit_cast memcpy fallback requires the destination type to be default initializable"); +#endif // !_CCCL_COMPILER(GCC, <=, 7) _To __temp; ::cuda::std::memcpy(&__temp, &__from, sizeof(_To)); return __temp; -#endif // !_CCCL_BUILTIN_BIT_CAST } -#if !defined(_CCCL_BUILTIN_BIT_CAST) -# if _CCCL_COMPILER(GCC, >=, 8) +#if _CCCL_COMPILER(GCC, >=, 8) _CCCL_DIAG_POP -# endif // _CCCL_COMPILER(GCC, >=, 8) -#endif // !_CCCL_BUILTIN_BIT_CAST +#endif // _CCCL_COMPILER(GCC, >=, 8) + +_CCCL_TEMPLATE(class _To, class _From) +_CCCL_REQUIRES((sizeof(_To) == sizeof(_From)) _CCCL_AND(::cuda::is_trivially_copyable_v<_To>) + _CCCL_AND(::cuda::is_trivially_copyable_v<_From>)) +[[nodiscard]] _CCCL_API inline _CCCL_CONSTEXPR_BIT_CAST _To bit_cast(const _From& __from) noexcept +{ +#if defined(_CCCL_BUILTIN_BIT_CAST) + if constexpr (::cuda::std::is_trivially_copyable_v<_To> && ::cuda::std::is_trivially_copyable_v<_From>) + { + return _CCCL_BUILTIN_BIT_CAST(_To, __from); + } + else +#endif // _CCCL_BUILTIN_BIT_CAST + { + return ::cuda::std::__bit_cast_memcpy<_To>(__from); + } +} _CCCL_END_NAMESPACE_CUDA_STD diff --git a/libcudacxx/include/cuda/std/string_view b/libcudacxx/include/cuda/std/string_view index a320302bc07..6c518f1ccb7 100644 --- a/libcudacxx/include/cuda/std/string_view +++ b/libcudacxx/include/cuda/std/string_view @@ -51,8 +51,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/libcudacxx/include/cuda/type_traits b/libcudacxx/include/cuda/type_traits index 4f47d003628..21e0c0b1174 100644 --- a/libcudacxx/include/cuda/type_traits +++ b/libcudacxx/include/cuda/type_traits @@ -23,6 +23,7 @@ #include #include +#include #include #include #include diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.basic_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.basic_types.pass.cpp new file mode 100644 index 00000000000..696a3a07714 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.basic_types.pass.cpp @@ -0,0 +1,57 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include + +#include "test_macros.h" + +template +TEST_FUNC void test_is_trivially_copyable() +{ + static_assert(cuda::is_trivially_copyable::value); + static_assert(cuda::is_trivially_copyable::value); + static_assert(cuda::is_trivially_copyable_v); + static_assert(cuda::is_trivially_copyable_v); +} + +__host__ __device__ void test_single_types() +{ + // standard trivially copyable types + test_is_trivially_copyable(); + test_is_trivially_copyable(); + test_is_trivially_copyable(); + +#if _CCCL_HAS_CTK() + test_is_trivially_copyable(); + test_is_trivially_copyable(); +#endif // _CCCL_HAS_CTK() + + // extended floating point scalar types +#if _CCCL_HAS_NVFP16() + test_is_trivially_copyable<__half>(); + test_is_trivially_copyable<__half2>(); +#endif // _CCCL_HAS_NVFP16() +#if _CCCL_HAS_NVBF16() + test_is_trivially_copyable<__nv_bfloat16>(); + test_is_trivially_copyable<__nv_bfloat162>(); +#endif // _CCCL_HAS_NVBF16() +#if _CCCL_HAS_NVFP8_E4M3() + test_is_trivially_copyable<__nv_fp8_e4m3>(); + test_is_trivially_copyable<__nv_fp8x2_e4m3>(); +#endif // _CCCL_HAS_NVFP8_E4M3() +} + +int main(int, char**) +{ + test_single_types(); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.composite_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.composite_types.pass.cpp new file mode 100644 index 00000000000..e46542b4ea0 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.composite_types.pass.cpp @@ -0,0 +1,95 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" + +template +__host__ __device__ void test_is_trivially_copyable() +{ + static_assert(cuda::is_trivially_copyable::value); + static_assert(cuda::is_trivially_copyable::value); + static_assert(cuda::is_trivially_copyable_v); + static_assert(cuda::is_trivially_copyable_v); +} + +template +struct TriviallyCopyableWrapper +{ + T x; +}; + +struct TrivialPod +{ + int x; + float y; +}; + +class NonTriviallyCopyable +{ +public: + __host__ __device__ NonTriviallyCopyable(const NonTriviallyCopyable&) {} // NOLINT +}; + +template +__host__ __device__ void test_is_trivially_copyable_compositions() +{ + test_is_trivially_copyable(); + test_is_trivially_copyable>(); + test_is_trivially_copyable>(); + test_is_trivially_copyable>(); + test_is_trivially_copyable>(); + test_is_trivially_copyable>(); + test_is_trivially_copyable>(); +} + +__host__ __device__ void test_composite_types() +{ + test_is_trivially_copyable(); + + test_is_trivially_copyable(); + test_is_trivially_copyable(); + + // cuda::std::array, pair, tuple, complex, and aggregate wrappers of trivially copyable types + test_is_trivially_copyable_compositions(); + test_is_trivially_copyable_compositions(); + test_is_trivially_copyable>(); + + // non-trivially copyable types + static_assert(!cuda::is_trivially_copyable_v); +} + +__host__ __device__ void test_extended_fp_types() +{ +#if _CCCL_HAS_NVFP16() + test_is_trivially_copyable_compositions<__half>(); + test_is_trivially_copyable_compositions<__half2>(); +#endif // _CCCL_HAS_NVFP16() + +#if _CCCL_HAS_NVBF16() + test_is_trivially_copyable_compositions<__nv_bfloat16>(); + test_is_trivially_copyable_compositions<__nv_bfloat162>(); +#endif // _CCCL_HAS_NVBF16() +} + +int main(int, char**) +{ + test_composite_types(); + test_extended_fp_types(); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.mem.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.mem.pass.cpp new file mode 100644 index 00000000000..e935f083fc5 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.mem.pass.cpp @@ -0,0 +1,345 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" + +_CCCL_DIAG_PUSH +_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef") + +// operator== for CUDA vector types and dim3 (not provided by the toolkit) +template , int> = 0> +__host__ __device__ bool operator==(T a, T b) +{ + if constexpr (sizeof(T) == sizeof(decltype(T::x))) + { + return a.x == b.x; + } + else if constexpr (sizeof(T) == 2 * sizeof(decltype(T::x))) + { + return a.x == b.x && a.y == b.y; + } + else if constexpr (sizeof(T) == 3 * sizeof(decltype(T::x))) + { + return a.x == b.x && a.y == b.y && a.z == b.z; + } + else + { + return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; + } +} + +template +__host__ __device__ bool operator==(cuda::complex a, cuda::complex b) +{ + return a.real() == b.real() && a.imag() == b.imag(); +} + +template +struct large_custom_t +{ + unsigned char data[Size]; +}; + +template +__host__ __device__ bool operator==(const large_custom_t& a, const large_custom_t& b) +{ + for (int i = 0; i < Size; ++i) + { + if (a.data[i] != b.data[i]) + { + return false; + } + } + return true; +} + +template +__host__ __device__ void test_memcpy_roundtrip(T from) +{ + static_assert(cuda::is_trivially_copyable_v); + T to; + ::memcpy(static_cast(&to), static_cast(&from), sizeof(T)); + assert(from == to); +} + +#define CAST(base_type, val) static_cast(val) + +#define REPEAT_1(base_type, index) CAST(base_type, input[index][0]) +#define REPEAT_2(base_type, index) REPEAT_1(base_type, index), CAST(base_type, input[index][1]) +#define REPEAT_3(base_type, index) REPEAT_2(base_type, index), CAST(base_type, input[index][2]) +#define REPEAT_4(base_type, index) REPEAT_3(base_type, index), CAST(base_type, input[index][3]) + +#define TEST_CUDA_VECTOR_TYPE(base_type, size) \ + { \ + for (base_type##size i : \ + {base_type##size{REPEAT_##size(base_type, 0)}, \ + base_type##size{REPEAT_##size(base_type, 1)}, \ + base_type##size{REPEAT_##size(base_type, 2)}, \ + base_type##size{REPEAT_##size(base_type, 3)}, \ + base_type##size{REPEAT_##size(base_type, 4)}, \ + base_type##size{REPEAT_##size(base_type, 5)}, \ + base_type##size{REPEAT_##size(base_type, 6)}}) \ + { \ + test_memcpy_roundtrip(i); \ + } \ + } + +#define TEST_CUDA_VECTOR_TYPES(base_type) \ + TEST_CUDA_VECTOR_TYPE(base_type, 1) \ + TEST_CUDA_VECTOR_TYPE(base_type, 2) \ + TEST_CUDA_VECTOR_TYPE(base_type, 3) \ + TEST_CUDA_VECTOR_TYPE(base_type, 4) + +__host__ __device__ bool tests() +{ + // standard scalar types + test_memcpy_roundtrip(42); + test_memcpy_roundtrip(0.0f); + test_memcpy_roundtrip(3.14159); + test_memcpy_roundtrip(static_cast(7)); + test_memcpy_roundtrip(static_cast('A')); + + // cuda::std::pair + using pair = cuda::std::pair; + for (pair i : + {pair{0.0f, 1}, + pair{1.0f, 2}, + pair{-1.0f, 3}, + pair{10.0f, 4}, + pair{-10.0f, 5}, + pair{2.71828f, 6}, + pair{3.14159f, 7}}) + { + test_memcpy_roundtrip(i); + } + + // cuda::std::tuple + using tuple = cuda::std::tuple; + for (tuple i : + {tuple{1, 0.0f}, + tuple{2, 1.0f}, + tuple{3, -1.0f}, + tuple{4, 10.0f}, + tuple{5, -10.0f}, + tuple{6, 2.71828f}, + tuple{7, 3.14159f}}) + { + test_memcpy_roundtrip(i); + } + + // cuda::std::array + using array = cuda::std::array; + for (array i : + {array{0.0f, 1.0f}, + array{1.0f, 2.0f}, + array{-1.0f, 3.0f}, + array{10.0f, 4.0f}, + array{-10.0f, 5.0f}, + array{2.71828f, 6.0f}, + array{3.14159f, 7.0f}}) + { + test_memcpy_roundtrip(i); + } + + // cuda::std::complex + using std_complex_f = cuda::std::complex; + for (std_complex_f i : + {std_complex_f{0.0f, 1.0f}, + std_complex_f{1.0f, -1.0f}, + std_complex_f{-1.0f, 0.0f}, + std_complex_f{10.0f, -10.0f}, + std_complex_f{-10.0f, 10.0f}, + std_complex_f{2.71828f, 3.14159f}, + std_complex_f{3.14159f, 2.71828f}}) + { + test_memcpy_roundtrip(i); + } + + // cuda::complex + using cuda_complex_f = cuda::complex; + for (cuda_complex_f i : + {cuda_complex_f{0.0f, 1.0f}, + cuda_complex_f{1.0f, -1.0f}, + cuda_complex_f{-1.0f, 0.0f}, + cuda_complex_f{10.0f, -10.0f}, + cuda_complex_f{-10.0f, 10.0f}, + cuda_complex_f{2.71828f, 3.14159f}, + cuda_complex_f{3.14159f, 2.71828f}}) + { + test_memcpy_roundtrip(i); + } + + // CUDA vector types + constexpr double input[7][4] = { + {0.0, 1.0, -7.0, -0.0}, + {1.0, 2.0, -7.0, -1.0}, + {-1.0, 3.0, -7.0, 1.0}, + {10.0, 4.0, -7.0, -10.0}, + {-10.0, 5.0, -7.0, 10.0}, + {2.71828, 6.0, -7.0, -2.71828}, + {3.14159, 7.0, -7.0, -3.14159}}; + + TEST_CUDA_VECTOR_TYPES(char) + TEST_CUDA_VECTOR_TYPES(short) + TEST_CUDA_VECTOR_TYPES(int) + TEST_CUDA_VECTOR_TYPES(float) + +#if !_CCCL_CUDA_COMPILER(CLANG) + using uchar = unsigned char; + using ushort = unsigned short; + using uint = unsigned int; + using ulong = unsigned long; + TEST_CUDA_VECTOR_TYPES(uchar) + TEST_CUDA_VECTOR_TYPES(ushort) + TEST_CUDA_VECTOR_TYPES(uint) + TEST_CUDA_VECTOR_TYPE(ulong, 1) + TEST_CUDA_VECTOR_TYPE(ulong, 2) + TEST_CUDA_VECTOR_TYPE(ulong, 3) +#endif // !_CCCL_CUDA_COMPILER(CLANG) + + TEST_CUDA_VECTOR_TYPE(long, 1) + TEST_CUDA_VECTOR_TYPE(long, 2) + TEST_CUDA_VECTOR_TYPE(long, 3) + + using longlong = long long; + using ulonglong = unsigned long long; + TEST_CUDA_VECTOR_TYPE(longlong, 1) + TEST_CUDA_VECTOR_TYPE(longlong, 2) + TEST_CUDA_VECTOR_TYPE(longlong, 3) + TEST_CUDA_VECTOR_TYPE(ulonglong, 1) + TEST_CUDA_VECTOR_TYPE(ulonglong, 2) + TEST_CUDA_VECTOR_TYPE(ulonglong, 3) + TEST_CUDA_VECTOR_TYPE(double, 1) + TEST_CUDA_VECTOR_TYPE(double, 2) + TEST_CUDA_VECTOR_TYPE(double, 3) + + test_memcpy_roundtrip(large_custom_t<128>{}); + test_memcpy_roundtrip(large_custom_t<512>{}); + + for (dim3 i : + {dim3{0u, 1u, 2u}, + dim3{1u, 2u, 3u}, + dim3{10u, 20u, 30u}, + dim3{100u, 200u, 300u}, + dim3{255u, 128u, 64u}, + dim3{1024u, 512u, 256u}, + dim3{4096u, 2048u, 1024u}}) + { + test_memcpy_roundtrip(i); + } + + return true; +} + +__host__ __device__ bool tests_nvfp() +{ +#if _LIBCUDACXX_HAS_NVFP16() + for (__half i : + {__float2half(0.0f), + __float2half(1.0f), + __float2half(-1.0f), + __float2half(10.0f), + __float2half(-10.0f), + __float2half(2.71828f), + __float2half(3.14159f)}) + { + test_memcpy_roundtrip(i); + } + + for (__half2 i : + {__half2{__float2half(0.0f), __float2half(1.0f)}, + __half2{__float2half(-1.0f), __float2half(2.0f)}, + __half2{__float2half(10.0f), __float2half(-10.0f)}, + __half2{__float2half(2.71828f), __float2half(3.14159f)}}) + { + test_memcpy_roundtrip(i); + } + + test_memcpy_roundtrip( + cuda::std::array<__half, 4>{__float2half(1.0f), __float2half(2.0f), __float2half(3.0f), __float2half(4.0f)}); + test_memcpy_roundtrip(cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)}); + test_memcpy_roundtrip(cuda::std::tuple<__half, __half>{__float2half(1.0f), __float2half(2.0f)}); + + test_memcpy_roundtrip(cuda::std::array, 2>{ + cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)}, + cuda::std::pair<__half, __half>{__float2half(3.0f), __float2half(4.0f)}}); + + using complex_half = cuda::std::complex<__half>; + for (complex_half i : + {complex_half{__float2half(0.0f), __float2half(1.0f)}, + complex_half{__float2half(1.0f), __float2half(-1.0f)}, + complex_half{__float2half(-1.0f), __float2half(0.0f)}, + complex_half{__float2half(10.0f), __float2half(-10.0f)}}) + { + test_memcpy_roundtrip(i); + } +#endif // _LIBCUDACXX_HAS_NVFP16() + +#if _LIBCUDACXX_HAS_NVBF16() + for (__nv_bfloat16 i : + {__float2bfloat16(0.0f), + __float2bfloat16(1.0f), + __float2bfloat16(-1.0f), + __float2bfloat16(10.0f), + __float2bfloat16(-10.0f), + __float2bfloat16(2.71828f), + __float2bfloat16(3.14159f)}) + { + test_memcpy_roundtrip(i); + } + + for (__nv_bfloat162 i : + {__nv_bfloat162{__float2bfloat16(0.0f), __float2bfloat16(1.0f)}, + __nv_bfloat162{__float2bfloat16(-1.0f), __float2bfloat16(2.0f)}, + __nv_bfloat162{__float2bfloat16(10.0f), __float2bfloat16(-10.0f)}, + __nv_bfloat162{__float2bfloat16(2.71828f), __float2bfloat16(3.14159f)}}) + { + test_memcpy_roundtrip(i); + } + + test_memcpy_roundtrip(cuda::std::array<__nv_bfloat16, 2>{__float2bfloat16(1.0f), __float2bfloat16(2.0f)}); + test_memcpy_roundtrip(cuda::std::pair<__nv_bfloat16, __nv_bfloat16>{__float2bfloat16(1.0f), __float2bfloat16(2.0f)}); + + using complex_bfloat = cuda::std::complex<__nv_bfloat16>; + for (complex_bfloat i : + {complex_bfloat{__float2bfloat16(0.0f), __float2bfloat16(1.0f)}, + complex_bfloat{__float2bfloat16(1.0f), __float2bfloat16(-1.0f)}, + complex_bfloat{__float2bfloat16(-1.0f), __float2bfloat16(0.0f)}, + complex_bfloat{__float2bfloat16(10.0f), __float2bfloat16(-10.0f)}}) + { + test_memcpy_roundtrip(i); + } +#endif // _LIBCUDACXX_HAS_NVBF16() + +#if _LIBCUDACXX_HAS_NVFP16() && _LIBCUDACXX_HAS_NVBF16() + test_memcpy_roundtrip(cuda::std::tuple<__half, __nv_bfloat16>{__float2half(1.0f), __float2bfloat16(2.0f)}); +#endif // _LIBCUDACXX_HAS_NVFP16() && _LIBCUDACXX_HAS_NVBF16() + return true; +} + +int main(int, char**) +{ + assert(tests()); + assert(tests_nvfp()); + return 0; +} + +_CCCL_DIAG_POP diff --git a/libcudacxx/test/libcudacxx/std/numerics/bit/bit.cast/bit_cast.trivially_copyable.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/bit/bit.cast/bit_cast.trivially_copyable.pass.cpp new file mode 100644 index 00000000000..c4cda64cd24 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bit.cast/bit_cast.trivially_copyable.pass.cpp @@ -0,0 +1,218 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +// XFAIL: enable-tile +// nvbug6077498: ICE when validating tile MLIR + +// +// +// template +// constexpr To bit_cast(const From& from) noexcept; + +#include +#include +#include +#include + +#include "bit_cast_test_helpers.h" + +struct TrivialPod +{ + int x; + float y; + __host__ __device__ friend bool operator==(TrivialPod a, TrivialPod b) + { + return a.x == b.x && a.y == b.y; + } +}; + +TEST_FUNC bool tests() +{ + // User-defined trivially copyable type + for (const TrivialPod& i : {TrivialPod{0, 0.0f}, TrivialPod{1, 1.0f}, TrivialPod{-1, 3.5f}, TrivialPod{42, 2.5f}}) + { + test_roundtrip_through_nested_T(i); + test_roundtrip_through_buffer(i); + } + + // cuda::std::array + for (const cuda::std::array& i : + {cuda::std::array{0, 1, 2, 3}, + cuda::std::array{-1, -2, -3, -4}, + cuda::std::array{100, 200, 300, 400}}) + { + test_roundtrip_through_nested_T(i); + test_roundtrip_through_buffer(i); + } + + // cuda::std::pair + for (const cuda::std::pair& i : + {cuda::std::pair{0, 0.0f}, + cuda::std::pair{1, 1.0f}, + cuda::std::pair{-1, 3.5f}}) + { + test_roundtrip_through_nested_T(i); + test_roundtrip_through_buffer(i); + } + + // cuda::std::tuple + for (const cuda::std::tuple& i : + {cuda::std::tuple{0, 0.0f}, + cuda::std::tuple{1, 1.0f}, + cuda::std::tuple{-1, 3.5f}}) + { + test_roundtrip_through_nested_T(i); + test_roundtrip_through_buffer(i); + } + + // cuda::std::tuple<> (empty, sizeof == 1 with no data bytes) + test_roundtrip_through_buffer(cuda::std::tuple<>{}); + + // cuda::std::complex + for (const cuda::std::complex& i : + {cuda::std::complex{0.0f, 1.0f}, + cuda::std::complex{1.0f, -1.0f}, + cuda::std::complex{-1.0f, 0.0f}, + cuda::std::complex{10.0f, -10.0f}, + cuda::std::complex{2.5f, 3.5f}}) + { + test_roundtrip_through_nested_T(i); + test_roundtrip_through_buffer(i); + } + +#if _LIBCUDACXX_HAS_NVFP16() + // cuda::std::complex<__half> + for (const cuda::std::complex<__half>& i : + {cuda::std::complex<__half>{__float2half(0.0f), __float2half(1.0f)}, + cuda::std::complex<__half>{__float2half(1.0f), __float2half(-1.0f)}, + cuda::std::complex<__half>{__float2half(-1.0f), __float2half(0.0f)}, + cuda::std::complex<__half>{__float2half(10.0f), __float2half(-10.0f)}}) + { + test_roundtrip_through_nested_T(i); + test_roundtrip_through_buffer(i); + } +#endif // _LIBCUDACXX_HAS_NVFP16() + +#if _LIBCUDACXX_HAS_NVBF16() + // cuda::std::complex<__nv_bfloat16> + for (const cuda::std::complex<__nv_bfloat16>& i : + {cuda::std::complex<__nv_bfloat16>{__float2bfloat16(0.0f), __float2bfloat16(1.0f)}, + cuda::std::complex<__nv_bfloat16>{__float2bfloat16(1.0f), __float2bfloat16(-1.0f)}, + cuda::std::complex<__nv_bfloat16>{__float2bfloat16(-1.0f), __float2bfloat16(0.0f)}, + cuda::std::complex<__nv_bfloat16>{__float2bfloat16(10.0f), __float2bfloat16(-10.0f)}}) + { + test_roundtrip_through_nested_T(i); + test_roundtrip_through_buffer(i); + } +#endif // _LIBCUDACXX_HAS_NVBF16() + + // Extended floating point vector types +#if _LIBCUDACXX_HAS_NVFP16() + for (const __half2& i : + {__half2{__float2half(0.0f), __float2half(1.0f)}, + __half2{__float2half(-1.0f), __float2half(2.0f)}, + __half2{__float2half(10.0f), __float2half(-10.0f)}, + __half2{__float2half(2.5f), __float2half(3.5f)}}) + { + test_roundtrip_through_nested_T(i); + test_roundtrip_through_buffer(i); + } +#endif // _LIBCUDACXX_HAS_NVFP16() + +#if _LIBCUDACXX_HAS_NVBF16() + for (const __nv_bfloat162& i : + {__nv_bfloat162{__float2bfloat16(0.0f), __float2bfloat16(1.0f)}, + __nv_bfloat162{__float2bfloat16(-1.0f), __float2bfloat16(2.0f)}, + __nv_bfloat162{__float2bfloat16(10.0f), __float2bfloat16(-10.0f)}, + __nv_bfloat162{__float2bfloat16(2.5f), __float2bfloat16(3.5f)}}) + { + test_roundtrip_through_nested_T(i); + test_roundtrip_through_buffer(i); + } +#endif // _LIBCUDACXX_HAS_NVBF16() + + // Padding-free compositions of extended floating point scalar types +#if _LIBCUDACXX_HAS_NVFP16() + { + const auto arr = + cuda::std::array<__half, 4>{__float2half(1.0f), __float2half(2.0f), __float2half(3.0f), __float2half(4.0f)}; + test_roundtrip_through_nested_T(arr); + test_roundtrip_through_buffer(arr); + } + { + const auto p = cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)}; + test_roundtrip_through_nested_T(p); + test_roundtrip_through_buffer(p); + } + { + const auto t = cuda::std::tuple<__half, __half>{__float2half(1.0f), __float2half(2.0f)}; + test_roundtrip_through_nested_T(t); + test_roundtrip_through_buffer(t); + } + { + const auto nested = cuda::std::array, 2>{ + cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)}, + cuda::std::pair<__half, __half>{__float2half(3.0f), __float2half(4.0f)}}; + test_roundtrip_through_nested_T(nested); + test_roundtrip_through_buffer(nested); + } +#endif // _LIBCUDACXX_HAS_NVFP16() + +#if _LIBCUDACXX_HAS_NVBF16() + { + const auto arr = cuda::std::array<__nv_bfloat16, 2>{__float2bfloat16(1.0f), __float2bfloat16(2.0f)}; + test_roundtrip_through_nested_T(arr); + test_roundtrip_through_buffer(arr); + } + { + const auto p = cuda::std::pair<__nv_bfloat16, __nv_bfloat16>{__float2bfloat16(1.0f), __float2bfloat16(2.0f)}; + test_roundtrip_through_nested_T(p); + test_roundtrip_through_buffer(p); + } +#endif // _LIBCUDACXX_HAS_NVBF16() + +#if _LIBCUDACXX_HAS_NVFP16() && _LIBCUDACXX_HAS_NVBF16() + { + const auto t = cuda::std::tuple<__half, __nv_bfloat16>{__float2half(1.0f), __float2bfloat16(2.0f)}; + test_roundtrip_through_nested_T(t); + test_roundtrip_through_buffer(t); + } +#endif // _LIBCUDACXX_HAS_NVFP16() && _LIBCUDACXX_HAS_NVBF16() + + // Padded compositions +#if _LIBCUDACXX_HAS_NVFP16() + { + const auto p1 = cuda::std::pair<__half, int>{__float2half(1.0f), 42}; + test_roundtrip_through_nested_T(p1); + test_roundtrip_through_buffer(p1); + } + { + const auto p2 = cuda::std::pair<__half, float>{__float2half(1.0f), 3.5f}; + test_roundtrip_through_nested_T(p2); + test_roundtrip_through_buffer(p2); + } +#endif // _LIBCUDACXX_HAS_NVFP16() + +#if _LIBCUDACXX_HAS_NVBF16() + { + const auto p = cuda::std::pair<__nv_bfloat16, int>{__float2bfloat16(1.0f), 42}; + test_roundtrip_through_nested_T(p); + test_roundtrip_through_buffer(p); + } +#endif // _LIBCUDACXX_HAS_NVBF16() + + return true; +} + +int main(int, char**) +{ + tests(); + return 0; +} diff --git a/libcudacxx/test/libcudacxx/std/numerics/bit/bit.cast/bit_cast_test_helpers.h b/libcudacxx/test/libcudacxx/std/numerics/bit/bit.cast/bit_cast_test_helpers.h new file mode 100644 index 00000000000..bacac847cf9 --- /dev/null +++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bit.cast/bit_cast_test_helpers.h @@ -0,0 +1,85 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef LIBCUDACXX_TEST_STD_NUMERICS_BIT_BIT_CAST_TEST_HELPERS_H +#define LIBCUDACXX_TEST_STD_NUMERICS_BIT_BIT_CAST_TEST_HELPERS_H + +#include +#include +#include + +#include "test_macros.h" + +// cuda::std::bit_cast does not preserve padding bits, so if T has padding bits, +// the results might not memcmp cleanly. +template +TEST_FUNC void test_roundtrip_through_buffer(T from) +{ + struct Buffer + { + char buffer[sizeof(T)]; + }; + Buffer middle = cuda::std::bit_cast(from); + T to = cuda::std::bit_cast(middle); + Buffer middle2 = cuda::std::bit_cast(to); + + assert((from == to) == (from == from)); // because NaN + + if constexpr (HasUniqueObjectRepresentations) + { + assert(cuda::std::memcmp(&from, &middle, sizeof(T)) == 0); + assert(cuda::std::memcmp(&to, &middle, sizeof(T)) == 0); + assert(cuda::std::memcmp(&middle, &middle2, sizeof(T)) == 0); + } +} + +template +TEST_FUNC void test_roundtrip_through_nested_T(T from) +{ + struct Nested + { + T x; + }; + static_assert(sizeof(Nested) == sizeof(T)); + + Nested middle = cuda::std::bit_cast(from); + T to = cuda::std::bit_cast(middle); + Nested middle2 = cuda::std::bit_cast(to); + + assert((from == to) == (from == from)); // because NaN + + if constexpr (HasUniqueObjectRepresentations) + { + assert(cuda::std::memcmp(&from, &middle, sizeof(T)) == 0); + assert(cuda::std::memcmp(&to, &middle, sizeof(T)) == 0); + assert(cuda::std::memcmp(&middle, &middle2, sizeof(T)) == 0); + } +} + +template +TEST_FUNC void test_roundtrip_through(T from) +{ + static_assert(sizeof(Intermediate) == sizeof(T)); + + Intermediate middle = cuda::std::bit_cast(from); + T to = cuda::std::bit_cast(middle); + Intermediate middle2 = cuda::std::bit_cast(to); + + assert((from == to) == (from == from)); // because NaN + + if constexpr (HasUniqueObjectRepresentations) + { + assert(cuda::std::memcmp(&from, &middle, sizeof(T)) == 0); + assert(cuda::std::memcmp(&to, &middle, sizeof(T)) == 0); + assert(cuda::std::memcmp(&middle, &middle2, sizeof(T)) == 0); + } +} + +#endif // LIBCUDACXX_TEST_STD_NUMERICS_BIT_BIT_CAST_TEST_HELPERS_H diff --git a/thrust/thrust/detail/allocator/value_initialize_range.h b/thrust/thrust/detail/allocator/value_initialize_range.h index 491a96dc29d..9000ef604ef 100644 --- a/thrust/thrust/detail/allocator/value_initialize_range.h +++ b/thrust/thrust/detail/allocator/value_initialize_range.h @@ -20,6 +20,7 @@ #include #include +#include THRUST_NAMESPACE_BEGIN namespace detail diff --git a/thrust/thrust/type_traits/is_trivially_relocatable.h b/thrust/thrust/type_traits/is_trivially_relocatable.h index e7ec4cfe6d2..c011ff55764 100644 --- a/thrust/thrust/type_traits/is_trivially_relocatable.h +++ b/thrust/thrust/type_traits/is_trivially_relocatable.h @@ -24,11 +24,11 @@ #include #include +#include #include #include #include #include -#include THRUST_NAMESPACE_BEGIN @@ -197,7 +197,7 @@ namespace detail // https://wg21.link/P1144R0#wording-inheritance template struct is_trivially_relocatable_impl - : integral_constant || proclaim_trivially_relocatable::value> + : integral_constant || proclaim_trivially_relocatable::value> {}; template