diff --git a/c/parallel/src/transform.cu b/c/parallel/src/transform.cu
index 5e9ffc062ef..3f173c6162b 100644
--- a/c/parallel/src/transform.cu
+++ b/c/parallel/src/transform.cu
@@ -16,6 +16,7 @@
 #include <cub/util_temporary_storage.cuh>
 #include <cub/util_type.cuh>
 
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/cstdint>
 #include <cuda/std/memory>
 
@@ -327,7 +328,7 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {9}, "Ho
   build_ptr->cache                      = new transform::cache();
 
   // avoid new and delete which requires the allocated and freed types to match
-  static_assert(std::is_trivially_copyable_v<decltype(policy_sel)>);
+  static_assert(::cuda::is_trivially_copyable_v<decltype(policy_sel)>);
   build_ptr->runtime_policy = std::malloc(sizeof(policy_sel));
   std::memcpy(build_ptr->runtime_policy, &policy_sel, sizeof(policy_sel));
 
@@ -528,7 +529,7 @@ static_assert(device_transform_policy()(detail::current_tuning_cc()) == {12}, "H
   build_ptr->cache                      = new transform::cache();
 
   // avoid new and delete which requires the allocated and freed types to match
-  static_assert(std::is_trivially_copyable_v<decltype(policy_sel)>);
+  static_assert(::cuda::is_trivially_copyable_v<decltype(policy_sel)>);
   build_ptr->runtime_policy = std::malloc(sizeof(policy_sel));
   std::memcpy(build_ptr->runtime_policy, &policy_sel, sizeof(policy_sel));
 
diff --git a/cub/cub/agent/single_pass_scan_operators.cuh b/cub/cub/agent/single_pass_scan_operators.cuh
index 59cbb8b2c23..a8a968bfe51 100644
--- a/cub/cub/agent/single_pass_scan_operators.cuh
+++ b/cub/cub/agent/single_pass_scan_operators.cuh
@@ -29,9 +29,9 @@
 #include <cub/util_temporary_storage.cuh>
 #include <cub/warp/warp_reduce.cuh>
 
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__type_traits/conditional.h>
 #include <cuda/std/__type_traits/enable_if.h>
-#include <cuda/std/__type_traits/is_trivially_copyable.h>
 
 #include <nv/target>
 
@@ -473,7 +473,7 @@ using default_no_delay_t             = default_no_delay_constructor_t::delay_t;
 template <class T>
 using default_delay_constructor_t =
   // TODO(bgruber): remove the check for is_primitive<ValueT> in CCCL 4.0
-  ::cuda::std::conditional_t<is_primitive<T>::value || ::cuda::std::is_trivially_copyable_v<T>,
+  ::cuda::std::conditional_t<is_primitive<T>::value || ::cuda::is_trivially_copyable_v<T>,
                              fixed_delay_constructor_t<350, 450>,
                              default_no_delay_constructor_t>;
 
@@ -483,7 +483,7 @@ using default_delay_t = typename default_delay_constructor_t<T>::delay_t;
 template <class KeyT, class ValueT>
 using default_reduce_by_key_delay_constructor_t =
   // TODO(bgruber): remove the check for is_primitive<ValueT> in CCCL 4.0
-  ::cuda::std::conditional_t<(is_primitive<ValueT>::value || ::cuda::std::is_trivially_copyable_v<ValueT>)
+  ::cuda::std::conditional_t<(is_primitive<ValueT>::value || ::cuda::is_trivially_copyable_v<ValueT>)
                                && (sizeof(ValueT) + sizeof(KeyT) < largest_atomic_message_size),
                              reduce_by_key_delay_constructor_t<350, 450>,
                              default_delay_constructor_t<KeyValuePair<KeyT, ValueT>>>;
@@ -582,7 +582,7 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE cudaError_t tile_state_init(
 template <typename T,
           // TODO(bgruber): remove the check for is_primitive<T> in CCCL 4.0
           bool SingleWord = detail::is_primitive<T>::value
-                         || (::cuda::std::is_trivially_copyable_v<T>
+                         || (::cuda::is_trivially_copyable_v<T>
                              && sizeof(T) < detail::largest_atomic_message_size
                              // TODO(bgruber): a power of two size is not strictly necessary, but the implementation
                              // cannot handle it currently. For example, we could support status word + int3.
@@ -957,7 +957,7 @@ struct ScanTileState<T, false>
 template <typename ValueT,
           typename KeyT,
           // TODO(bgruber): remove the check for is_primitive<ValueT> in CCCL 4.0
-          bool SingleWord = (detail::is_primitive<ValueT>::value || ::cuda::std::is_trivially_copyable_v<ValueT>)
+          bool SingleWord = (detail::is_primitive<ValueT>::value || ::cuda::is_trivially_copyable_v<ValueT>)
                          && (sizeof(ValueT) + sizeof(KeyT) < detail::largest_atomic_message_size)>
 struct ReduceByKeyScanTileState;
 
diff --git a/cub/cub/detail/uninitialized_copy.cuh b/cub/cub/detail/uninitialized_copy.cuh
index 5c58b862ba9..253aea8da7b 100644
--- a/cub/cub/detail/uninitialized_copy.cuh
+++ b/cub/cub/detail/uninitialized_copy.cuh
@@ -13,9 +13,9 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__new/device_new.h>
 #include <cuda/std/__type_traits/enable_if.h>
-#include <cuda/std/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__utility/forward.h>
 
 CUB_NAMESPACE_BEGIN
@@ -30,14 +30,14 @@ _CCCL_HOST_DEVICE void uninitialized_copy_single(T* ptr, U&& val)
   new (ptr) T(::cuda::std::forward<U>(val));
 }
 #else // ^^^ _CCCL_CUDA_COMPILER(NVHPC) ^^^ / vvv !_CCCL_CUDA_COMPILER(NVHPC) vvv
-template <typename T, typename U, ::cuda::std::enable_if_t<::cuda::std::is_trivially_copyable_v<T>, int> = 0>
+template <typename T, typename U, ::cuda::std::enable_if_t<::cuda::is_trivially_copyable_v<T>, int> = 0>
 _CCCL_HOST_DEVICE void uninitialized_copy_single(T* ptr, U&& val)
 {
   // gevtushenko: placement new should work here as well, but the code generated for copy assignment is sometimes better
   *ptr = ::cuda::std::forward<U>(val);
 }
 
-template <typename T, typename U, ::cuda::std::enable_if_t<!::cuda::std::is_trivially_copyable_v<T>, int> = 0>
+template <typename T, typename U, ::cuda::std::enable_if_t<!::cuda::is_trivially_copyable_v<T>, int> = 0>
 _CCCL_HOST_DEVICE void uninitialized_copy_single(T* ptr, U&& val)
 {
   new (ptr) T(::cuda::std::forward<U>(val));
diff --git a/cub/cub/detail/warpspeed/look_ahead.cuh b/cub/cub/detail/warpspeed/look_ahead.cuh
index feffea126d5..0079e8cb197 100644
--- a/cub/cub/detail/warpspeed/look_ahead.cuh
+++ b/cub/cub/detail/warpspeed/look_ahead.cuh
@@ -23,6 +23,7 @@
 #include <cuda/__functional/operator_properties.h>
 #include <cuda/__memory/is_aligned.h>
 #include <cuda/__ptx/instructions/get_sreg.h>
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__bit/popcount.h>
 #include <cuda/std/__type_traits/underlying_type.h>
 
@@ -75,7 +76,7 @@ storeTileAggregate(tile_state_t<AccumT>* ptrTileStates, scan_state scanState, Ac
   _CCCL_ASSERT(index >= 0 && index < gridDim.x, "Reading out of bounds tile state");
 
   if constexpr (sizeof(tile_state_t<AccumT>) <= cub::detail::warpspeed::max_native_atomic_size()
-                && ::cuda::std::is_trivially_copyable_v<tile_state_t<AccumT>>)
+                && ::cuda::is_trivially_copyable_v<tile_state_t<AccumT>>)
   {
     static_assert(::cuda::is_power_of_two(sizeof(tile_state_t<AccumT>)));
     tile_state_t<AccumT> tmp{scanState, sum};
@@ -103,7 +104,7 @@ _CCCL_DEVICE_API tile_state_t<AccumT> loadTileAggregate(tile_state_t<AccumT>* pt
 
   tile_state_t<AccumT> res;
   if constexpr (sizeof(tile_state_t<AccumT>) <= cub::detail::warpspeed::max_native_atomic_size()
-                && ::cuda::std::is_trivially_copyable_v<tile_state_t<AccumT>>)
+                && ::cuda::is_trivially_copyable_v<tile_state_t<AccumT>>)
   {
     static_assert(::cuda::is_power_of_two(sizeof(tile_state_t<AccumT>)));
 #  if _CCCL_HAS_NV_ATOMIC_BUILTINS()
diff --git a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
index b5abe5aae4b..38f4a5d1ac8 100644
--- a/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_histogram.cuh
@@ -18,6 +18,7 @@
 #include <cub/grid/grid_queue.cuh>
 #include <cub/util_arch.cuh>
 
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__numeric/reduce.h>
 
 CUB_NAMESPACE_BEGIN
@@ -80,7 +81,7 @@ struct Transforms
     static_assert(::cuda::std::is_convertible_v<CommonT, int>,
                   "The common type of `LevelT` and `SampleT` must be "
                   "convertible to `int`.");
-    static_assert(::cuda::std::is_trivially_copyable_v<CommonT>,
+    static_assert(::cuda::is_trivially_copyable_v<CommonT>,
                   "The common type of `LevelT` and `SampleT` must be "
                   "trivially copyable.");
 
diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
index 023b5b18ff0..9d2047506cd 100644
--- a/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_reduce_by_key.cuh
@@ -24,9 +24,9 @@
 
 #include <cuda/__cmath/ceil_div.h>
 #include <cuda/__device/compute_capability.h>
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__algorithm/clamp.h>
 #include <cuda/std/__host_stdlib/ostream>
-#include <cuda/std/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/concepts>
 
 CUB_NAMESPACE_BEGIN
@@ -1657,7 +1657,7 @@ struct policy_selector_from_types
       int{sizeof(AccumT)},
       classify_type<AccumT>,
       is_primitive_v<KeyT>,
-      ::cuda::std::is_trivially_copyable_v<KeyT>,
+      ::cuda::is_trivially_copyable_v<KeyT>,
       is_primitive_v<AccumT>,
       basic_binary_op_t<ReductionOpT>::value}(cc);
   }
diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
index 429555865a9..046f893d432 100644
--- a/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_rle_encode.cuh
@@ -25,6 +25,7 @@
 
 #include <cuda/__cmath/ceil_div.h>
 #include <cuda/__device/compute_capability.h>
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__algorithm/clamp.h>
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__host_stdlib/ostream>
@@ -525,7 +526,7 @@ struct policy_selector_from_types
       int{sizeof(KeyT)},
       classify_type<KeyT>,
       is_primitive_v<LengthT>,
-      ::cuda::std::is_trivially_copyable_v<LengthT>,
+      ::cuda::is_trivially_copyable_v<LengthT>,
       is_primitive_v<KeyT>};
     return selector(cc);
   }
diff --git a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh
index f8534346552..4c176e41182 100644
--- a/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_rle_non_trivial_runs.cuh
@@ -24,6 +24,7 @@
 #include <cub/util_device.cuh>
 
 #include <cuda/__device/compute_capability.h>
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__algorithm/clamp.h>
 #include <cuda/std/__host_stdlib/ostream>
 #include <cuda/std/concepts>
@@ -612,7 +613,7 @@ struct policy_selector_from_types
       classify_type<KeyT>,
       is_primitive_v<LengthT>,
       is_primitive_v<KeyT>,
-      ::cuda::std::is_trivially_copyable_v<KeyT>};
+      ::cuda::is_trivially_copyable_v<KeyT>};
     return selector(cc);
   }
 };
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
index d67edd7569a..8dd8aa9fb6e 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
@@ -31,6 +31,7 @@
 #include <thrust/type_traits/is_contiguous_iterator.h>
 
 #include <cuda/__device/compute_capability.h>
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__functional/invoke.h>
 #include <cuda/std/__functional/operations.h>
@@ -1451,8 +1452,8 @@ struct policy_selector_from_types
       classify_op<ScanOpT>,
       THRUST_NS_QUALIFIER::is_contiguous_iterator_v<InputIteratorT>,
       THRUST_NS_QUALIFIER::is_contiguous_iterator_v<OutputIteratorT>,
-      ::cuda::std::is_trivially_copyable_v<InputValueT>,
-      ::cuda::std::is_trivially_copyable_v<OutputValueT>,
+      ::cuda::is_trivially_copyable_v<InputValueT>,
+      ::cuda::is_trivially_copyable_v<OutputValueT>,
       ::cuda::std::is_default_constructible_v<OutputValueT>,
       accum_is_primitive_or_trivially_copy_constructible,
       benchmark_match};
diff --git a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh
index 3d93fa6c028..9ee7dcf206b 100644
--- a/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh
+++ b/cub/cub/device/dispatch/tuning/tuning_scan_by_key.cuh
@@ -26,9 +26,9 @@
 #include <cub/util_type.cuh>
 
 #include <cuda/__device/compute_capability.h>
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__host_stdlib/ostream>
-#include <cuda/std/__type_traits/is_trivially_copyable.h>
 
 CUB_NAMESPACE_BEGIN
 
@@ -1951,7 +1951,7 @@ struct policy_selector_from_types
       static_cast<int>(sizeof(ValueT)),
       static_cast<int>(sizeof(AccumT)),
       is_primitive<ValueT>::value,
-      ::cuda::std::is_trivially_copyable_v<ValueT>,
+      ::cuda::is_trivially_copyable_v<ValueT>,
       classify_type<KeyT>,
       classify_type<ValueT>,
       classify_type<AccumT>,
diff --git a/cudax/include/cuda/experimental/__container/graph_buffer.cuh b/cudax/include/cuda/experimental/__container/graph_buffer.cuh
index 5eec2c6feb0..ad65bff4975 100644
--- a/cudax/include/cuda/experimental/__container/graph_buffer.cuh
+++ b/cudax/include/cuda/experimental/__container/graph_buffer.cuh
@@ -27,8 +27,8 @@
 #  include <cuda/__runtime/api_wrapper.h>
 #  include <cuda/__stream/invalid_stream.h>
 #  include <cuda/__stream/stream_ref.h>
+#  include <cuda/__type_traits/is_trivially_copyable.h>
 #  include <cuda/__utility/no_init.h>
-#  include <cuda/std/__type_traits/is_trivially_copyable.h>
 #  include <cuda/std/__utility/exchange.h>
 #  include <cuda/std/__utility/move.h>
 #  include <cuda/std/cstddef>
@@ -68,7 +68,7 @@ namespace cuda::experimental
 template <class _Tp>
 class graph_buffer
 {
-  static_assert(::cuda::std::is_trivially_copyable_v<_Tp>, "graph_buffer requires T to be trivially copyable.");
+  static_assert(::cuda::is_trivially_copyable_v<_Tp>, "graph_buffer requires T to be trivially copyable.");
 
 public:
   using value_type      = _Tp;
diff --git a/cudax/include/cuda/experimental/__copy/mdspan_d2d.cuh b/cudax/include/cuda/experimental/__copy/mdspan_d2d.cuh
index 79101ac8722..595c466c980 100644
--- a/cudax/include/cuda/experimental/__copy/mdspan_d2d.cuh
+++ b/cudax/include/cuda/experimental/__copy/mdspan_d2d.cuh
@@ -33,6 +33,7 @@
 #  include <cuda/__mdspan/host_device_mdspan.h>
 #  include <cuda/__mdspan/traits.h>
 #  include <cuda/__stream/stream_ref.h>
+#  include <cuda/__type_traits/is_trivially_copyable.h>
 #  include <cuda/std/__algorithm/max.h>
 #  include <cuda/std/__functional/identity.h>
 #  include <cuda/std/__host_stdlib/stdexcept>
@@ -43,7 +44,6 @@
 #  include <cuda/std/__type_traits/is_const.h>
 #  include <cuda/std/__type_traits/is_convertible.h>
 #  include <cuda/std/__type_traits/is_same.h>
-#  include <cuda/std/__type_traits/is_trivially_copyable.h>
 #  include <cuda/std/__type_traits/remove_cvref.h>
 
 #  include <cuda/experimental/__copy/copy_contiguous.cuh>
@@ -125,7 +125,7 @@ _CCCL_HOST_API void copy(::cuda::device_mdspan<_TpIn, _ExtentsIn, _LayoutPolicyI
     && ::cuda::std::is_convertible_v<_AccessorPolicyOut, __default_accessor_out>;
   constexpr bool __are_byte_copyable =
     ::cuda::std::is_same_v<::cuda::std::remove_cv_t<_TpIn>, ::cuda::std::remove_cv_t<_TpOut>>
-    && ::cuda::std::is_trivially_copyable_v<_TpIn> //
+    && ::cuda::is_trivially_copyable_v<_TpIn> //
     && __have_default_accessors;
 
   if (__tensor_size == 1 && __are_byte_copyable)
diff --git a/cudax/include/cuda/experimental/__copy_bytes/mdspan_d2h_h2d.cuh b/cudax/include/cuda/experimental/__copy_bytes/mdspan_d2h_h2d.cuh
index 5424a73c865..0b1897760a9 100644
--- a/cudax/include/cuda/experimental/__copy_bytes/mdspan_d2h_h2d.cuh
+++ b/cudax/include/cuda/experimental/__copy_bytes/mdspan_d2h_h2d.cuh
@@ -27,6 +27,7 @@
 #  include <cuda/__mdspan/host_device_mdspan.h>
 #  include <cuda/__mdspan/traits.h>
 #  include <cuda/__stream/stream_ref.h>
+#  include <cuda/__type_traits/is_trivially_copyable.h>
 #  include <cuda/std/__algorithm/max.h>
 #  include <cuda/std/__cstddef/types.h>
 #  include <cuda/std/__host_stdlib/stdexcept>
@@ -37,7 +38,6 @@
 #  include <cuda/std/__type_traits/is_const.h>
 #  include <cuda/std/__type_traits/is_convertible.h>
 #  include <cuda/std/__type_traits/is_same.h>
-#  include <cuda/std/__type_traits/is_trivially_copyable.h>
 #  include <cuda/std/__type_traits/remove_cv.h>
 
 #  include <cuda/experimental/__copy_bytes/memcpy_batch_tiles.cuh>
@@ -74,7 +74,7 @@ _CCCL_HOST_API void __copy_bytes_impl(
   namespace cudax = ::cuda::experimental;
   static_assert(::cuda::std::is_same_v<::cuda::std::remove_cv_t<_TpIn>, ::cuda::std::remove_cv_t<_TpOut>>,
                 "cudax::copy_bytes: TpIn and TpOut must be the same type");
-  static_assert(::cuda::std::is_trivially_copyable_v<_TpIn>, "TpIn must be trivially copyable");
+  static_assert(::cuda::is_trivially_copyable_v<_TpIn>, "TpIn must be trivially copyable");
   static_assert(!::cuda::std::is_const_v<_TpOut>, "TpOut must not be const");
   static_assert(::cuda::__is_cuda_mdspan_layout_v<_LayoutPolicyIn>,
                 "cudax::copy_bytes: LayoutPolicyIn must be a predefined layout policy");
diff --git a/cudax/include/cuda/experimental/__graph/copy_bytes.cuh b/cudax/include/cuda/experimental/__graph/copy_bytes.cuh
index c4fd1fc2ac1..007c35963af 100644
--- a/cudax/include/cuda/experimental/__graph/copy_bytes.cuh
+++ b/cudax/include/cuda/experimental/__graph/copy_bytes.cuh
@@ -26,11 +26,11 @@
 #  include <cuda/__algorithm/common.h>
 #  include <cuda/__stream/launch_transform.h>
 #  include <cuda/__stream/stream_ref.h>
+#  include <cuda/__type_traits/is_trivially_copyable.h>
 #  include <cuda/std/__concepts/concept_macros.h>
 #  include <cuda/std/__exception/exception_macros.h>
 #  include <cuda/std/__host_stdlib/stdexcept>
 #  include <cuda/std/__type_traits/is_const.h>
-#  include <cuda/std/__type_traits/is_trivially_copyable.h>
 #  include <cuda/std/cstddef>
 #  include <cuda/std/span>
 
@@ -47,7 +47,7 @@ _CCCL_HOST_API graph_node_ref
 __copy_bytes_graph_impl(path_builder& __pb, ::cuda::std::span<_SrcTy> __src, ::cuda::std::span<_DstTy> __dst)
 {
   static_assert(!::cuda::std::is_const_v<_DstTy>, "Copy destination can't be const");
-  static_assert(::cuda::std::is_trivially_copyable_v<_SrcTy> && ::cuda::std::is_trivially_copyable_v<_DstTy>,
+  static_assert(::cuda::is_trivially_copyable_v<_SrcTy> && ::cuda::is_trivially_copyable_v<_DstTy>,
                 "Copy source and destination element types must be trivially copyable");
 
   if (__src.size_bytes() > __dst.size_bytes())
diff --git a/cudax/include/cuda/experimental/__graph/fill_bytes.cuh b/cudax/include/cuda/experimental/__graph/fill_bytes.cuh
index 398baf492fb..ded7ce5cea7 100644
--- a/cudax/include/cuda/experimental/__graph/fill_bytes.cuh
+++ b/cudax/include/cuda/experimental/__graph/fill_bytes.cuh
@@ -26,11 +26,11 @@
 #  include <cuda/__algorithm/common.h>
 #  include <cuda/__stream/launch_transform.h>
 #  include <cuda/__stream/stream_ref.h>
+#  include <cuda/__type_traits/is_trivially_copyable.h>
 #  include <cuda/std/__concepts/concept_macros.h>
 #  include <cuda/std/__exception/exception_macros.h>
 #  include <cuda/std/__host_stdlib/stdexcept>
 #  include <cuda/std/__type_traits/is_const.h>
-#  include <cuda/std/__type_traits/is_trivially_copyable.h>
 #  include <cuda/std/cstdint>
 #  include <cuda/std/span>
 
@@ -48,8 +48,7 @@ _CCCL_HOST_API graph_node_ref
 __fill_bytes_graph_impl(path_builder& __pb, ::cuda::std::span<_DstTy, _DstSize> __dst, ::cuda::std::uint8_t __value)
 {
   static_assert(!::cuda::std::is_const_v<_DstTy>, "Fill destination can't be const");
-  static_assert(::cuda::std::is_trivially_copyable_v<_DstTy>,
-                "Fill destination element type must be trivially copyable");
+  static_assert(::cuda::is_trivially_copyable_v<_DstTy>, "Fill destination element type must be trivially copyable");
 
   auto __deps = __pb.get_dependencies();
   ::CUgraphNodeParams __params{};
diff --git a/cudax/include/cuda/experimental/__kernel/kernel_ref.cuh b/cudax/include/cuda/experimental/__kernel/kernel_ref.cuh
index 16890ba6f46..19340e03dd9 100644
--- a/cudax/include/cuda/experimental/__kernel/kernel_ref.cuh
+++ b/cudax/include/cuda/experimental/__kernel/kernel_ref.cuh
@@ -25,8 +25,8 @@
 #include <cuda/__driver/driver_api.h>
 #include <cuda/__memory/address_space.h>
 #include <cuda/__runtime/api_wrapper.h>
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__type_traits/always_false.h>
-#include <cuda/std/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__utility/forward.h>
 #include <cuda/std/string_view>
 
@@ -51,7 +51,7 @@ class kernel_ref
 template <class... _Args>
 class kernel_ref<void(_Args...)>
 {
-  static_assert((true && ... && ::cuda::std::is_trivially_copyable_v<_Args>),
+  static_assert((true && ... && ::cuda::is_trivially_copyable_v<_Args>),
                 "All kernel_ref argument types must be trivially copyable.");
 
 public:
diff --git a/docs/libcudacxx/extended_api/type_traits.rst b/docs/libcudacxx/extended_api/type_traits.rst
index f11c8e9f998..9f55314a9e7 100644
--- a/docs/libcudacxx/extended_api/type_traits.rst
+++ b/docs/libcudacxx/extended_api/type_traits.rst
@@ -8,6 +8,7 @@ Type traits
    :maxdepth: 1
 
    type_traits/is_floating_point
+   type_traits/is_trivially_copyable
    type_traits/is_bitwise_comparable
    type_traits/vector_types
 
@@ -30,6 +31,11 @@ Type traits
      - CCCL 3.3.0
      - CUDA 13.3
 
+   * - :ref:`cuda::is_trivially_copyable <libcudacxx-extended-api-type_traits-is_trivially_copyable>`
+     - Relaxed trivially copyable check including extended floating-point types
+     - CCCL 3.4.0
+     - CUDA 13.4
+
    * - :ref:`cuda::is_bitwise_comparable <libcudacxx-extended-api-type_traits-is_bitwise_comparable>`
      - User-specializable bitwise comparability check
      - CCCL 3.4.0
diff --git a/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable.rst b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable.rst
new file mode 100644
index 00000000000..e646f942ce1
--- /dev/null
+++ b/docs/libcudacxx/extended_api/type_traits/is_trivially_copyable.rst
@@ -0,0 +1,75 @@
+.. _libcudacxx-extended-api-type_traits-is_trivially_copyable:
+
+``cuda::is_trivially_copyable``
+=======================================
+
+Defined in the ``<cuda/type_traits>`` header.
+
+.. code:: cuda
+
+   namespace cuda {
+
+   template <typename T>
+   constexpr bool is_trivially_copyable_v = /* see below */;
+
+   template <typename T>
+   using is_trivially_copyable = cuda::std::bool_constant<is_trivially_copyable_v<T>>;
+
+   } // namespace cuda
+
+``cuda::is_trivially_copyable_v`` trait evaluates if a type can be copied by copying its underlying bytes.
+It extends ``cuda::std::is_trivially_copyable`` to also recognize CUDA extended floating-point vector types as trivially copyable.
+
+``cuda::is_trivially_copyable_v<T>`` relies on ``cuda::std::is_trivially_copyable`` but adds support for CUDA-specific types.
+
+The trait is true when ``T`` is any of the following:
+
+- A type for which ``cuda::std::is_trivially_copyable_v<T>`` is true.
+- An extended floating-point vector type, for example ``__half2``, ``__nv_bfloat162``.
+
+The trait also propagates through composite types:
+
+- C-style arrays: ``T[N]`` and ``T[]`` are trivially copyable when ``T`` is.
+- ``cuda::std::array<T, N>``: trivially copyable when ``T`` is.
+- ``cuda::std::pair<T1, T2>``: trivially copyable when both ``T1`` and ``T2`` are.
+- ``cuda::std::tuple<Ts...>``: trivially copyable when all ``Ts...`` are.
+- ``cuda::std::complex<T>``: trivially copyable when ``T`` is.
+- ``cuda::complex<T>``: trivially copyable when ``T`` is.
+- `Aggregates <https://en.cppreference.com/cpp/language/aggregate_initialization>`__: trivially copyable when all their members are.
+
+  - On MSVC, recursive data-member inspection is not supported beyond the first level.
+
+``const`` qualification is handled transparently, while ``volatile`` is compiler dependent.
+
+Examples
+--------
+
+.. code:: cuda
+
+   #include <cuda/type_traits>
+   #include <cuda/std/array>
+   #include <cuda/std/tuple>
+   #include <cuda/std/utility>
+
+   #include <cuda_fp16.h>
+
+   // Standard trivially copyable types
+   static_assert(cuda::is_trivially_copyable_v<int>);
+   static_assert(cuda::is_trivially_copyable_v<float>);
+
+   // Extended floating-point types
+   static_assert(cuda::is_trivially_copyable_v<__half>);
+   static_assert(cuda::is_trivially_copyable_v<__nv_bfloat16>);
+   static_assert(cuda::is_trivially_copyable_v<__half2>);
+   static_assert(cuda::is_trivially_copyable_v<cuda::std::complex<__half>>);
+   static_assert(cuda::is_trivially_copyable_v<cuda::complex<__half>>);
+
+   // Composite types containing extended floating-point types
+   static_assert(cuda::is_trivially_copyable_v<__half[4]>);
+   static_assert(cuda::is_trivially_copyable_v<cuda::std::array<__half2, 4>>);
+   static_assert(cuda::is_trivially_copyable_v<cuda::std::pair<__half2, __half>>);
+   static_assert(cuda::is_trivially_copyable_v<cuda::std::tuple<__half, __half2>>);
+   static_assert(cuda::is_trivially_copyable_v<cuda::std::pair<__half2, int>>);
+
+
+`See it on Godbolt 🔗 <https://godbolt.org/z/PqccjfEv6>`__
diff --git a/docs/libcudacxx/standard_api/numerics_library/bit.rst b/docs/libcudacxx/standard_api/numerics_library/bit.rst
index 9904db7352e..44079a4d81a 100644
--- a/docs/libcudacxx/standard_api/numerics_library/bit.rst
+++ b/docs/libcudacxx/standard_api/numerics_library/bit.rst
@@ -3,6 +3,16 @@
 ``<cuda/std/bit>``
 ==================
 
+``cuda::std::bit_cast``
+-----------------------
+
+``cuda::std::bit_cast`` extended the standard ``std::bit_cast`` to also recognize CUDA extended floating-point scalar and vector types as trivially copyable.
+
+**Limitations**
+
+- The function can be used in ``constexpr`` contexts only when the source and destination types are trivially copyable.
+- The function cannot be used in ``constexpr`` contexts with MSVC <= 19.25 and GCC <= 10.
+
 CUDA Performance Considerations
 -------------------------------
 
diff --git a/libcudacxx/include/cuda/__algorithm/copy.h b/libcudacxx/include/cuda/__algorithm/copy.h
index 982f1b83f7d..217b3375654 100644
--- a/libcudacxx/include/cuda/__algorithm/copy.h
+++ b/libcudacxx/include/cuda/__algorithm/copy.h
@@ -26,6 +26,7 @@
 #  include <cuda/__algorithm/common.h>
 #  include <cuda/__stream/launch_transform.h>
 #  include <cuda/__stream/stream_ref.h>
+#  include <cuda/__type_traits/is_trivially_copyable.h>
 #  include <cuda/std/__concepts/concept_macros.h>
 #  include <cuda/std/__exception/exception_macros.h>
 #  include <cuda/std/__host_stdlib/stdexcept>
@@ -72,7 +73,7 @@ _CCCL_HOST_API void __copy_bytes_impl(
   [[maybe_unused]] copy_configuration __config)
 {
   static_assert(!::cuda::std::is_const_v<_DstTy>, "Copy destination can't be const");
-  static_assert(::cuda::std::is_trivially_copyable_v<_SrcTy> && ::cuda::std::is_trivially_copyable_v<_DstTy>);
+  static_assert(::cuda::is_trivially_copyable_v<_SrcTy> && ::cuda::is_trivially_copyable_v<_DstTy>);
 
   if (__src.size_bytes() > __dst.size_bytes())
   {
diff --git a/libcudacxx/include/cuda/__algorithm/fill.h b/libcudacxx/include/cuda/__algorithm/fill.h
index 32e0391a987..8a832ee7739 100644
--- a/libcudacxx/include/cuda/__algorithm/fill.h
+++ b/libcudacxx/include/cuda/__algorithm/fill.h
@@ -26,6 +26,7 @@
 #  include <cuda/__algorithm/common.h>
 #  include <cuda/__stream/launch_transform.h>
 #  include <cuda/__stream/stream_ref.h>
+#  include <cuda/__type_traits/is_trivially_copyable.h>
 #  include <cuda/std/__concepts/concept_macros.h>
 #  include <cuda/std/__exception/exception_macros.h>
 #  include <cuda/std/__host_stdlib/stdexcept>
@@ -41,7 +42,7 @@ _CCCL_HOST_API void
 __fill_bytes_impl(stream_ref __stream, ::cuda::std::span<_DstTy, _DstSize> __dst, ::cuda::std::uint8_t __value)
 {
   static_assert(!::cuda::std::is_const_v<_DstTy>, "Fill destination can't be const");
-  static_assert(::cuda::std::is_trivially_copyable_v<_DstTy>);
+  static_assert(::cuda::is_trivially_copyable_v<_DstTy>);
 
   // TODO do a host callback if not device accessible?
   ::cuda::__driver::__memsetAsync(__dst.data(), __value, __dst.size_bytes(), __stream.get());
diff --git a/libcudacxx/include/cuda/__container/buffer.h b/libcudacxx/include/cuda/__container/buffer.h
index 6c4cbe4ebb2..58fcfc2469d 100644
--- a/libcudacxx/include/cuda/__container/buffer.h
+++ b/libcudacxx/include/cuda/__container/buffer.h
@@ -38,6 +38,7 @@
 #  include <cuda/__memory_resource/synchronous_resource_adapter.h>
 #  include <cuda/__runtime/ensure_current_context.h>
 #  include <cuda/__stream/get_stream.h>
+#  include <cuda/__type_traits/is_trivially_copyable.h>
 #  include <cuda/std/__exception/cuda_error.h>
 #  include <cuda/std/__exception/exception_macros.h>
 #  include <cuda/std/__execution/env.h>
@@ -50,7 +51,6 @@
 #  include <cuda/std/__ranges/size.h>
 #  include <cuda/std/__ranges/unwrap_end.h>
 #  include <cuda/std/__type_traits/decay.h>
-#  include <cuda/std/__type_traits/is_trivially_copyable.h>
 #  include <cuda/std/__utility/forward.h>
 #  include <cuda/std/__utility/move.h>
 #  include <cuda/std/cstdint>
@@ -115,7 +115,7 @@ class buffer
   friend class buffer;
 
   // For now we require trivially copyable type to simplify the implementation
-  static_assert(::cuda::std::is_trivially_copyable_v<_Tp>, "cuda::buffer requires T to be trivially copyable.");
+  static_assert(::cuda::is_trivially_copyable_v<_Tp>, "cuda::buffer requires T to be trivially copyable.");
 
   // At least one of the properties must signal an execution space
   static_assert(::cuda::mr::__contains_execution_space_property<_Properties...>,
diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h
index 70f9f71551b..7b8fea552d7 100644
--- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h
+++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_barrier.h
@@ -31,9 +31,9 @@
 #include <cuda/__memcpy_async/memcpy_completion.h>
 #include <cuda/__memcpy_async/try_get_barrier_handle.h>
 #include <cuda/__memory/aligned_size.h>
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__algorithm/max.h>
 #include <cuda/std/__atomic/scopes.h>
-#include <cuda/std/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/cstddef>
 #include <cuda/std/cstdint>
 
@@ -58,7 +58,7 @@ template <typename _Group, class _Tp, typename _Size, thread_scope _Sco, typenam
 _CCCL_API inline async_contract_fulfillment __memcpy_async_barrier(
   _Group const& __group, _Tp* __destination, _Tp const* __source, _Size __size, barrier<_Sco, _CompF>& __barrier)
 {
-  static_assert(::cuda::std::is_trivially_copyable_v<_Tp>, "memcpy_async requires a trivially copyable type");
+  static_assert(::cuda::is_trivially_copyable_v<_Tp>, "memcpy_async requires a trivially copyable type");
 
   // 1. Determine which completion mechanisms can be used with the current
   // barrier. A local shared memory barrier, i.e., block-scope barrier in local
diff --git a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h
index 4bc7c0240a8..163771d3a36 100644
--- a/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h
+++ b/libcudacxx/include/cuda/__memcpy_async/memcpy_async_tx.h
@@ -33,9 +33,9 @@
 #    include <cuda/__ptx/instructions/cp_async_bulk.h>
 #    include <cuda/__ptx/ptx_dot_variants.h>
 #    include <cuda/__ptx/ptx_helper_functions.h>
+#    include <cuda/__type_traits/is_trivially_copyable.h>
 #    include <cuda/std/__atomic/scopes.h>
 #    include <cuda/std/__type_traits/conditional.h>
-#    include <cuda/std/__type_traits/is_trivially_copyable.h>
 #    include <cuda/std/cstdint>
 
 #    include <cuda/std/__cccl/prologue.h>
@@ -55,7 +55,7 @@ _CCCL_DEVICE_API async_contract_fulfillment memcpy_async_tx(
   // memcpy_async when compiling with GCC 4.8.
   // FIXME: remove the #if once GCC 4.8 is no longer supported.
 #    if !_CCCL_COMPILER(GCC) || _CCCL_COMPILER(GCC, >, 4, 8)
-  static_assert(::cuda::std::is_trivially_copyable_v<_Tp>, "memcpy_async_tx requires a trivially copyable type");
+  static_assert(::cuda::is_trivially_copyable_v<_Tp>, "memcpy_async_tx requires a trivially copyable type");
 #    endif
   static_assert(16 <= _Alignment, "mempcy_async_tx expects arguments to be at least 16 byte aligned.");
   static_assert(_Alignment >= alignof(_Tp), "alignment must be at least the alignof(T)");
diff --git a/libcudacxx/include/cuda/__type_traits/is_trivially_copyable.h b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable.h
new file mode 100644
index 00000000000..fecc6714b97
--- /dev/null
+++ b/libcudacxx/include/cuda/__type_traits/is_trivially_copyable.h
@@ -0,0 +1,110 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDA__TYPE_TRAITS_IS_TRIVIALLY_COPYABLE_H
+#define __CUDA__TYPE_TRAITS_IS_TRIVIALLY_COPYABLE_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__fwd/complex.h>
+#include <cuda/std/__cstddef/types.h>
+#include <cuda/std/__fwd/array.h>
+#include <cuda/std/__fwd/complex.h>
+#include <cuda/std/__fwd/pair.h>
+#include <cuda/std/__fwd/tuple.h>
+#include <cuda/std/__type_traits/aggregate_members_all_of.h>
+#include <cuda/std/__type_traits/enable_if.h>
+#include <cuda/std/__type_traits/integral_constant.h>
+#include <cuda/std/__type_traits/is_aggregate.h>
+#include <cuda/std/__type_traits/is_trivially_copyable.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA
+
+template <typename _Tp, typename = void>
+inline constexpr bool __is_aggregate_trivially_copyable_v = false;
+
+template <typename _Tp>
+inline constexpr bool __is_trivially_copyable_v =
+  ::cuda::std::is_trivially_copyable_v<_Tp> || __is_aggregate_trivially_copyable_v<_Tp>;
+
+#if _CCCL_HAS_NVFP16()
+
+template <>
+inline constexpr bool __is_trivially_copyable_v<::__half2> = true;
+
+#endif // _CCCL_HAS_NVFP16()
+
+#if _CCCL_HAS_NVBF16()
+
+template <>
+inline constexpr bool __is_trivially_copyable_v<::__nv_bfloat162> = true;
+
+#endif // _CCCL_HAS_NVBF16()
+
+template <typename _Tp>
+inline constexpr bool __is_trivially_copyable_v<_Tp[]> = __is_trivially_copyable_v<_Tp>;
+
+template <typename _Tp, ::cuda::std::size_t _Size>
+inline constexpr bool __is_trivially_copyable_v<_Tp[_Size]> = __is_trivially_copyable_v<_Tp>;
+
+template <typename _Tp, ::cuda::std::size_t _Size>
+inline constexpr bool __is_trivially_copyable_v<::cuda::std::array<_Tp, _Size>> = __is_trivially_copyable_v<_Tp>;
+
+template <typename _T1, typename _T2>
+inline constexpr bool __is_trivially_copyable_v<::cuda::std::pair<_T1, _T2>> =
+  __is_trivially_copyable_v<_T1> && __is_trivially_copyable_v<_T2>;
+
+template <typename... _Ts>
+inline constexpr bool __is_trivially_copyable_v<::cuda::std::tuple<_Ts...>> = (__is_trivially_copyable_v<_Ts> && ...);
+
+template <typename _Tp>
+inline constexpr bool __is_trivially_copyable_v<complex<_Tp>> = true;
+
+template <typename _Tp>
+inline constexpr bool __is_trivially_copyable_v<::cuda::std::complex<_Tp>> = true;
+
+// if all the previous conditions fail, check if the type is an aggregate and all its members are trivially copyable
+template <typename _Tp>
+using __is_trivially_copyable_callable = ::cuda::std::bool_constant<__is_trivially_copyable_v<_Tp>>;
+
+template <typename _Tp>
+inline constexpr bool __is_aggregate_trivially_copyable_v<
+  _Tp,
+  ::cuda::std::enable_if_t<::cuda::std::is_aggregate_v<_Tp> && !::cuda::std::is_trivially_copyable_v<_Tp>>> =
+  ::cuda::std::__aggregate_all_of_v<__is_trivially_copyable_callable, _Tp>;
+
+//----------------------------------------------------------------------------------------------------------------------
+// public traits
+
+template <typename _Tp>
+inline constexpr bool is_trivially_copyable_v = __is_trivially_copyable_v<_Tp>;
+
+template <typename _Tp>
+inline constexpr bool is_trivially_copyable_v<const _Tp> = is_trivially_copyable_v<_Tp>;
+
+// defined as alias so users cannot specialize it (they should specialize the variable template instead)
+template <typename _Tp>
+using is_trivially_copyable = ::cuda::std::bool_constant<is_trivially_copyable_v<_Tp>>;
+
+_CCCL_END_NAMESPACE_CUDA
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // __CUDA__TYPE_TRAITS_IS_TRIVIALLY_COPYABLE_H
diff --git a/libcudacxx/include/cuda/std/__bit/bit_cast.h b/libcudacxx/include/cuda/std/__bit/bit_cast.h
index e7886ad4976..98844e3038f 100644
--- a/libcudacxx/include/cuda/std/__bit/bit_cast.h
+++ b/libcudacxx/include/cuda/std/__bit/bit_cast.h
@@ -4,7 +4,7 @@
 // under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+// SPDX-FileCopyrightText: Copyright (c) 2024-26 NVIDIA CORPORATION & AFFILIATES.
 //
 //===----------------------------------------------------------------------===//
 
@@ -21,10 +21,9 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/std/__type_traits/enable_if.h>
-#include <cuda/std/__type_traits/is_extended_floating_point.h>
-#include <cuda/std/__type_traits/is_trivially_copyable.h>
-#include <cuda/std/__type_traits/is_trivially_default_constructible.h>
+#include <cuda/__type_traits/is_trivially_copyable.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__concepts/constructible.h>
 #include <cuda/std/cstring>
 
 #include <cuda/std/__cccl/prologue.h>
@@ -42,37 +41,45 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD
 #else // ^^^ _CCCL_BUILTIN_BIT_CAST ^^^ / vvv !_CCCL_BUILTIN_BIT_CAST vvv
 #  define _CCCL_CONSTEXPR_BIT_CAST
 #  define _CCCL_HAS_CONSTEXPR_BIT_CAST() 0
-#  if _CCCL_COMPILER(GCC, >=, 8)
-// GCC starting with GCC8 warns about our extended floating point types having protected data members
+#endif // !_CCCL_BUILTIN_BIT_CAST
+
+#if _CCCL_COMPILER(GCC, >=, 8)
 _CCCL_DIAG_PUSH
 _CCCL_DIAG_SUPPRESS_GCC("-Wclass-memaccess")
-#  endif // _CCCL_COMPILER(GCC, >=, 8)
-#endif // !_CCCL_BUILTIN_BIT_CAST
+#endif // _CCCL_COMPILER(GCC, >=, 8)
 
-template <class _To,
-          class _From,
-          enable_if_t<(sizeof(_To) == sizeof(_From)), int>                                          = 0,
-          enable_if_t<is_trivially_copyable_v<_To> || __is_extended_floating_point_v<_To>, int>     = 0,
-          enable_if_t<is_trivially_copyable_v<_From> || __is_extended_floating_point_v<_From>, int> = 0>
-[[nodiscard]] _CCCL_API inline _CCCL_CONSTEXPR_BIT_CAST _To bit_cast(const _From& __from) noexcept
+template <class _To, class _From>
+[[nodiscard]] _CCCL_API inline _To __bit_cast_memcpy(const _From& __from) noexcept
 {
-#if defined(_CCCL_BUILTIN_BIT_CAST)
-  return _CCCL_BUILTIN_BIT_CAST(_To, __from);
-#else // ^^^ _CCCL_BUILTIN_BIT_CAST ^^^ / vvv !_CCCL_BUILTIN_BIT_CAST vvv
-  static_assert(is_trivially_default_constructible_v<_To>,
-                "The compiler does not support __builtin_bit_cast, so bit_cast additionally requires the destination "
-                "type to be trivially constructible");
+#if !_CCCL_COMPILER(GCC, <=, 7)
+  static_assert(::cuda::std::default_initializable<_To>,
+                "bit_cast memcpy fallback requires the destination type to be default initializable");
+#endif // !_CCCL_COMPILER(GCC, <=, 7)
   _To __temp;
   ::cuda::std::memcpy(&__temp, &__from, sizeof(_To));
   return __temp;
-#endif // !_CCCL_BUILTIN_BIT_CAST
 }
 
-#if !defined(_CCCL_BUILTIN_BIT_CAST)
-#  if _CCCL_COMPILER(GCC, >=, 8)
+#if _CCCL_COMPILER(GCC, >=, 8)
 _CCCL_DIAG_POP
-#  endif // _CCCL_COMPILER(GCC, >=, 8)
-#endif // !_CCCL_BUILTIN_BIT_CAST
+#endif // _CCCL_COMPILER(GCC, >=, 8)
+
+_CCCL_TEMPLATE(class _To, class _From)
+_CCCL_REQUIRES((sizeof(_To) == sizeof(_From)) _CCCL_AND(::cuda::is_trivially_copyable_v<_To>)
+                 _CCCL_AND(::cuda::is_trivially_copyable_v<_From>))
+[[nodiscard]] _CCCL_API inline _CCCL_CONSTEXPR_BIT_CAST _To bit_cast(const _From& __from) noexcept
+{
+#if defined(_CCCL_BUILTIN_BIT_CAST)
+  if constexpr (::cuda::std::is_trivially_copyable_v<_To> && ::cuda::std::is_trivially_copyable_v<_From>)
+  {
+    return _CCCL_BUILTIN_BIT_CAST(_To, __from);
+  }
+  else
+#endif // _CCCL_BUILTIN_BIT_CAST
+  {
+    return ::cuda::std::__bit_cast_memcpy<_To>(__from);
+  }
+}
 
 _CCCL_END_NAMESPACE_CUDA_STD
 
diff --git a/libcudacxx/include/cuda/std/string_view b/libcudacxx/include/cuda/std/string_view
index a320302bc07..6c518f1ccb7 100644
--- a/libcudacxx/include/cuda/std/string_view
+++ b/libcudacxx/include/cuda/std/string_view
@@ -51,8 +51,8 @@
 #include <cuda/std/__type_traits/is_convertible.h>
 #include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/__type_traits/is_standard_layout.h>
-#include <cuda/std/__type_traits/is_trivially_constructible.h>
 #include <cuda/std/__type_traits/is_trivially_copyable.h>
+#include <cuda/std/__type_traits/is_trivially_default_constructible.h>
 #include <cuda/std/__type_traits/remove_cvref.h>
 #include <cuda/std/__type_traits/type_identity.h>
 #include <cuda/std/__utility/ctad_support.h>
diff --git a/libcudacxx/include/cuda/type_traits b/libcudacxx/include/cuda/type_traits
index 4f47d003628..21e0c0b1174 100644
--- a/libcudacxx/include/cuda/type_traits
+++ b/libcudacxx/include/cuda/type_traits
@@ -23,6 +23,7 @@
 
 #include <cuda/__type_traits/is_bitwise_comparable.h>
 #include <cuda/__type_traits/is_floating_point.h>
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/__type_traits/is_vector_type.h>
 #include <cuda/__type_traits/vector_type.h>
 #include <cuda/std/type_traits>
diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.basic_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.basic_types.pass.cpp
new file mode 100644
index 00000000000..696a3a07714
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.basic_types.pass.cpp
@@ -0,0 +1,57 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#include <cuda/type_traits>
+
+#include "test_macros.h"
+
+template <class T>
+TEST_FUNC void test_is_trivially_copyable()
+{
+  static_assert(cuda::is_trivially_copyable<T>::value);
+  static_assert(cuda::is_trivially_copyable<const T>::value);
+  static_assert(cuda::is_trivially_copyable_v<T>);
+  static_assert(cuda::is_trivially_copyable_v<const T>);
+}
+
+__host__ __device__ void test_single_types()
+{
+  // standard trivially copyable types
+  test_is_trivially_copyable<int>();
+  test_is_trivially_copyable<float>();
+  test_is_trivially_copyable<double>();
+
+#if _CCCL_HAS_CTK()
+  test_is_trivially_copyable<int4>();
+  test_is_trivially_copyable<double2>();
+#endif // _CCCL_HAS_CTK()
+
+  // extended floating point scalar types
+#if _CCCL_HAS_NVFP16()
+  test_is_trivially_copyable<__half>();
+  test_is_trivially_copyable<__half2>();
+#endif // _CCCL_HAS_NVFP16()
+#if _CCCL_HAS_NVBF16()
+  test_is_trivially_copyable<__nv_bfloat16>();
+  test_is_trivially_copyable<__nv_bfloat162>();
+#endif // _CCCL_HAS_NVBF16()
+#if _CCCL_HAS_NVFP8_E4M3()
+  test_is_trivially_copyable<__nv_fp8_e4m3>();
+  test_is_trivially_copyable<__nv_fp8x2_e4m3>();
+#endif // _CCCL_HAS_NVFP8_E4M3()
+}
+
+int main(int, char**)
+{
+  test_single_types();
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.composite_types.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.composite_types.pass.cpp
new file mode 100644
index 00000000000..e46542b4ea0
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.composite_types.pass.cpp
@@ -0,0 +1,95 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#include <cuda/__complex_>
+#include <cuda/std/array>
+#include <cuda/std/complex>
+#include <cuda/std/tuple>
+#include <cuda/std/utility>
+#include <cuda/type_traits>
+
+#include "test_macros.h"
+
+template <class T>
+__host__ __device__ void test_is_trivially_copyable()
+{
+  static_assert(cuda::is_trivially_copyable<T>::value);
+  static_assert(cuda::is_trivially_copyable<const T>::value);
+  static_assert(cuda::is_trivially_copyable_v<T>);
+  static_assert(cuda::is_trivially_copyable_v<const T>);
+}
+
+template <class T>
+struct TriviallyCopyableWrapper
+{
+  T x;
+};
+
+struct TrivialPod
+{
+  int x;
+  float y;
+};
+
+class NonTriviallyCopyable
+{
+public:
+  __host__ __device__ NonTriviallyCopyable(const NonTriviallyCopyable&) {} // NOLINT
+};
+
+template <class T>
+__host__ __device__ void test_is_trivially_copyable_compositions()
+{
+  test_is_trivially_copyable<T[4]>();
+  test_is_trivially_copyable<cuda::std::array<T, 4>>();
+  test_is_trivially_copyable<cuda::std::pair<T, T>>();
+  test_is_trivially_copyable<cuda::std::tuple<T, T>>();
+  test_is_trivially_copyable<cuda::std::complex<T>>();
+  test_is_trivially_copyable<cuda::complex<T>>();
+  test_is_trivially_copyable<TriviallyCopyableWrapper<T>>();
+}
+
+__host__ __device__ void test_composite_types()
+{
+  test_is_trivially_copyable<int[4]>();
+
+  test_is_trivially_copyable<TrivialPod>();
+  test_is_trivially_copyable<TrivialPod[2]>();
+
+  // cuda::std::array, pair, tuple, complex, and aggregate wrappers of trivially copyable types
+  test_is_trivially_copyable_compositions<int>();
+  test_is_trivially_copyable_compositions<float>();
+  test_is_trivially_copyable<cuda::std::tuple<>>();
+
+  // non-trivially copyable types
+  static_assert(!cuda::is_trivially_copyable_v<NonTriviallyCopyable>);
+}
+
+__host__ __device__ void test_extended_fp_types()
+{
+#if _CCCL_HAS_NVFP16()
+  test_is_trivially_copyable_compositions<__half>();
+  test_is_trivially_copyable_compositions<__half2>();
+#endif // _CCCL_HAS_NVFP16()
+
+#if _CCCL_HAS_NVBF16()
+  test_is_trivially_copyable_compositions<__nv_bfloat16>();
+  test_is_trivially_copyable_compositions<__nv_bfloat162>();
+#endif // _CCCL_HAS_NVBF16()
+}
+
+int main(int, char**)
+{
+  test_composite_types();
+  test_extended_fp_types();
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.mem.pass.cpp b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.mem.pass.cpp
new file mode 100644
index 00000000000..e935f083fc5
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/type_traits/is_trivially_copyable.mem.pass.cpp
@@ -0,0 +1,345 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+#include <cuda/__complex_>
+#include <cuda/std/array>
+#include <cuda/std/cassert>
+#include <cuda/std/complex>
+#include <cuda/std/cstring>
+#include <cuda/std/tuple>
+#include <cuda/std/utility>
+#include <cuda/type_traits>
+
+#include "test_macros.h"
+
+_CCCL_DIAG_PUSH
+_CCCL_DIAG_SUPPRESS_CLANG("-Wunused-local-typedef")
+
+// operator== for CUDA vector types and dim3 (not provided by the toolkit)
+template <class T, cuda::std::enable_if_t<cuda::is_vector_type_v<T>, int> = 0>
+__host__ __device__ bool operator==(T a, T b)
+{
+  if constexpr (sizeof(T) == sizeof(decltype(T::x)))
+  {
+    return a.x == b.x;
+  }
+  else if constexpr (sizeof(T) == 2 * sizeof(decltype(T::x)))
+  {
+    return a.x == b.x && a.y == b.y;
+  }
+  else if constexpr (sizeof(T) == 3 * sizeof(decltype(T::x)))
+  {
+    return a.x == b.x && a.y == b.y && a.z == b.z;
+  }
+  else
+  {
+    return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w;
+  }
+}
+
+template <class T>
+__host__ __device__ bool operator==(cuda::complex<T> a, cuda::complex<T> b)
+{
+  return a.real() == b.real() && a.imag() == b.imag();
+}
+
+template <int Size>
+struct large_custom_t
+{
+  unsigned char data[Size];
+};
+
+template <int Size>
+__host__ __device__ bool operator==(const large_custom_t<Size>& a, const large_custom_t<Size>& b)
+{
+  for (int i = 0; i < Size; ++i)
+  {
+    if (a.data[i] != b.data[i])
+    {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename T>
+__host__ __device__ void test_memcpy_roundtrip(T from)
+{
+  static_assert(cuda::is_trivially_copyable_v<T>);
+  T to;
+  ::memcpy(static_cast<void*>(&to), static_cast<const void*>(&from), sizeof(T));
+  assert(from == to);
+}
+
+#define CAST(base_type, val) static_cast<decltype(base_type##1 ::x)>(val)
+
+#define REPEAT_1(base_type, index) CAST(base_type, input[index][0])
+#define REPEAT_2(base_type, index) REPEAT_1(base_type, index), CAST(base_type, input[index][1])
+#define REPEAT_3(base_type, index) REPEAT_2(base_type, index), CAST(base_type, input[index][2])
+#define REPEAT_4(base_type, index) REPEAT_3(base_type, index), CAST(base_type, input[index][3])
+
+#define TEST_CUDA_VECTOR_TYPE(base_type, size)           \
+  {                                                      \
+    for (base_type##size i :                             \
+         {base_type##size{REPEAT_##size(base_type, 0)},  \
+          base_type##size{REPEAT_##size(base_type, 1)},  \
+          base_type##size{REPEAT_##size(base_type, 2)},  \
+          base_type##size{REPEAT_##size(base_type, 3)},  \
+          base_type##size{REPEAT_##size(base_type, 4)},  \
+          base_type##size{REPEAT_##size(base_type, 5)},  \
+          base_type##size{REPEAT_##size(base_type, 6)}}) \
+    {                                                    \
+      test_memcpy_roundtrip(i);                          \
+    }                                                    \
+  }
+
+#define TEST_CUDA_VECTOR_TYPES(base_type) \
+  TEST_CUDA_VECTOR_TYPE(base_type, 1)     \
+  TEST_CUDA_VECTOR_TYPE(base_type, 2)     \
+  TEST_CUDA_VECTOR_TYPE(base_type, 3)     \
+  TEST_CUDA_VECTOR_TYPE(base_type, 4)
+
+__host__ __device__ bool tests()
+{
+  // standard scalar types
+  test_memcpy_roundtrip(42);
+  test_memcpy_roundtrip(0.0f);
+  test_memcpy_roundtrip(3.14159);
+  test_memcpy_roundtrip(static_cast<short>(7));
+  test_memcpy_roundtrip(static_cast<char>('A'));
+
+  // cuda::std::pair
+  using pair = cuda::std::pair<float, int>;
+  for (pair i :
+       {pair{0.0f, 1},
+        pair{1.0f, 2},
+        pair{-1.0f, 3},
+        pair{10.0f, 4},
+        pair{-10.0f, 5},
+        pair{2.71828f, 6},
+        pair{3.14159f, 7}})
+  {
+    test_memcpy_roundtrip(i);
+  }
+
+  // cuda::std::tuple
+  using tuple = cuda::std::tuple<int, float>;
+  for (tuple i :
+       {tuple{1, 0.0f},
+        tuple{2, 1.0f},
+        tuple{3, -1.0f},
+        tuple{4, 10.0f},
+        tuple{5, -10.0f},
+        tuple{6, 2.71828f},
+        tuple{7, 3.14159f}})
+  {
+    test_memcpy_roundtrip(i);
+  }
+
+  // cuda::std::array
+  using array = cuda::std::array<float, 2>;
+  for (array i :
+       {array{0.0f, 1.0f},
+        array{1.0f, 2.0f},
+        array{-1.0f, 3.0f},
+        array{10.0f, 4.0f},
+        array{-10.0f, 5.0f},
+        array{2.71828f, 6.0f},
+        array{3.14159f, 7.0f}})
+  {
+    test_memcpy_roundtrip(i);
+  }
+
+  // cuda::std::complex
+  using std_complex_f = cuda::std::complex<float>;
+  for (std_complex_f i :
+       {std_complex_f{0.0f, 1.0f},
+        std_complex_f{1.0f, -1.0f},
+        std_complex_f{-1.0f, 0.0f},
+        std_complex_f{10.0f, -10.0f},
+        std_complex_f{-10.0f, 10.0f},
+        std_complex_f{2.71828f, 3.14159f},
+        std_complex_f{3.14159f, 2.71828f}})
+  {
+    test_memcpy_roundtrip(i);
+  }
+
+  // cuda::complex
+  using cuda_complex_f = cuda::complex<float>;
+  for (cuda_complex_f i :
+       {cuda_complex_f{0.0f, 1.0f},
+        cuda_complex_f{1.0f, -1.0f},
+        cuda_complex_f{-1.0f, 0.0f},
+        cuda_complex_f{10.0f, -10.0f},
+        cuda_complex_f{-10.0f, 10.0f},
+        cuda_complex_f{2.71828f, 3.14159f},
+        cuda_complex_f{3.14159f, 2.71828f}})
+  {
+    test_memcpy_roundtrip(i);
+  }
+
+  // CUDA vector types
+  constexpr double input[7][4] = {
+    {0.0, 1.0, -7.0, -0.0},
+    {1.0, 2.0, -7.0, -1.0},
+    {-1.0, 3.0, -7.0, 1.0},
+    {10.0, 4.0, -7.0, -10.0},
+    {-10.0, 5.0, -7.0, 10.0},
+    {2.71828, 6.0, -7.0, -2.71828},
+    {3.14159, 7.0, -7.0, -3.14159}};
+
+  TEST_CUDA_VECTOR_TYPES(char)
+  TEST_CUDA_VECTOR_TYPES(short)
+  TEST_CUDA_VECTOR_TYPES(int)
+  TEST_CUDA_VECTOR_TYPES(float)
+
+#if !_CCCL_CUDA_COMPILER(CLANG)
+  using uchar  = unsigned char;
+  using ushort = unsigned short;
+  using uint   = unsigned int;
+  using ulong  = unsigned long;
+  TEST_CUDA_VECTOR_TYPES(uchar)
+  TEST_CUDA_VECTOR_TYPES(ushort)
+  TEST_CUDA_VECTOR_TYPES(uint)
+  TEST_CUDA_VECTOR_TYPE(ulong, 1)
+  TEST_CUDA_VECTOR_TYPE(ulong, 2)
+  TEST_CUDA_VECTOR_TYPE(ulong, 3)
+#endif // !_CCCL_CUDA_COMPILER(CLANG)
+
+  TEST_CUDA_VECTOR_TYPE(long, 1)
+  TEST_CUDA_VECTOR_TYPE(long, 2)
+  TEST_CUDA_VECTOR_TYPE(long, 3)
+
+  using longlong  = long long;
+  using ulonglong = unsigned long long;
+  TEST_CUDA_VECTOR_TYPE(longlong, 1)
+  TEST_CUDA_VECTOR_TYPE(longlong, 2)
+  TEST_CUDA_VECTOR_TYPE(longlong, 3)
+  TEST_CUDA_VECTOR_TYPE(ulonglong, 1)
+  TEST_CUDA_VECTOR_TYPE(ulonglong, 2)
+  TEST_CUDA_VECTOR_TYPE(ulonglong, 3)
+  TEST_CUDA_VECTOR_TYPE(double, 1)
+  TEST_CUDA_VECTOR_TYPE(double, 2)
+  TEST_CUDA_VECTOR_TYPE(double, 3)
+
+  test_memcpy_roundtrip(large_custom_t<128>{});
+  test_memcpy_roundtrip(large_custom_t<512>{});
+
+  for (dim3 i :
+       {dim3{0u, 1u, 2u},
+        dim3{1u, 2u, 3u},
+        dim3{10u, 20u, 30u},
+        dim3{100u, 200u, 300u},
+        dim3{255u, 128u, 64u},
+        dim3{1024u, 512u, 256u},
+        dim3{4096u, 2048u, 1024u}})
+  {
+    test_memcpy_roundtrip(i);
+  }
+
+  return true;
+}
+
+__host__ __device__ bool tests_nvfp()
+{
+#if _LIBCUDACXX_HAS_NVFP16()
+  for (__half i :
+       {__float2half(0.0f),
+        __float2half(1.0f),
+        __float2half(-1.0f),
+        __float2half(10.0f),
+        __float2half(-10.0f),
+        __float2half(2.71828f),
+        __float2half(3.14159f)})
+  {
+    test_memcpy_roundtrip(i);
+  }
+
+  for (__half2 i :
+       {__half2{__float2half(0.0f), __float2half(1.0f)},
+        __half2{__float2half(-1.0f), __float2half(2.0f)},
+        __half2{__float2half(10.0f), __float2half(-10.0f)},
+        __half2{__float2half(2.71828f), __float2half(3.14159f)}})
+  {
+    test_memcpy_roundtrip(i);
+  }
+
+  test_memcpy_roundtrip(
+    cuda::std::array<__half, 4>{__float2half(1.0f), __float2half(2.0f), __float2half(3.0f), __float2half(4.0f)});
+  test_memcpy_roundtrip(cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)});
+  test_memcpy_roundtrip(cuda::std::tuple<__half, __half>{__float2half(1.0f), __float2half(2.0f)});
+
+  test_memcpy_roundtrip(cuda::std::array<cuda::std::pair<__half, __half>, 2>{
+    cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)},
+    cuda::std::pair<__half, __half>{__float2half(3.0f), __float2half(4.0f)}});
+
+  using complex_half = cuda::std::complex<__half>;
+  for (complex_half i :
+       {complex_half{__float2half(0.0f), __float2half(1.0f)},
+        complex_half{__float2half(1.0f), __float2half(-1.0f)},
+        complex_half{__float2half(-1.0f), __float2half(0.0f)},
+        complex_half{__float2half(10.0f), __float2half(-10.0f)}})
+  {
+    test_memcpy_roundtrip(i);
+  }
+#endif // _LIBCUDACXX_HAS_NVFP16()
+
+#if _LIBCUDACXX_HAS_NVBF16()
+  for (__nv_bfloat16 i :
+       {__float2bfloat16(0.0f),
+        __float2bfloat16(1.0f),
+        __float2bfloat16(-1.0f),
+        __float2bfloat16(10.0f),
+        __float2bfloat16(-10.0f),
+        __float2bfloat16(2.71828f),
+        __float2bfloat16(3.14159f)})
+  {
+    test_memcpy_roundtrip(i);
+  }
+
+  for (__nv_bfloat162 i :
+       {__nv_bfloat162{__float2bfloat16(0.0f), __float2bfloat16(1.0f)},
+        __nv_bfloat162{__float2bfloat16(-1.0f), __float2bfloat16(2.0f)},
+        __nv_bfloat162{__float2bfloat16(10.0f), __float2bfloat16(-10.0f)},
+        __nv_bfloat162{__float2bfloat16(2.71828f), __float2bfloat16(3.14159f)}})
+  {
+    test_memcpy_roundtrip(i);
+  }
+
+  test_memcpy_roundtrip(cuda::std::array<__nv_bfloat16, 2>{__float2bfloat16(1.0f), __float2bfloat16(2.0f)});
+  test_memcpy_roundtrip(cuda::std::pair<__nv_bfloat16, __nv_bfloat16>{__float2bfloat16(1.0f), __float2bfloat16(2.0f)});
+
+  using complex_bfloat = cuda::std::complex<__nv_bfloat16>;
+  for (complex_bfloat i :
+       {complex_bfloat{__float2bfloat16(0.0f), __float2bfloat16(1.0f)},
+        complex_bfloat{__float2bfloat16(1.0f), __float2bfloat16(-1.0f)},
+        complex_bfloat{__float2bfloat16(-1.0f), __float2bfloat16(0.0f)},
+        complex_bfloat{__float2bfloat16(10.0f), __float2bfloat16(-10.0f)}})
+  {
+    test_memcpy_roundtrip(i);
+  }
+#endif // _LIBCUDACXX_HAS_NVBF16()
+
+#if _LIBCUDACXX_HAS_NVFP16() && _LIBCUDACXX_HAS_NVBF16()
+  test_memcpy_roundtrip(cuda::std::tuple<__half, __nv_bfloat16>{__float2half(1.0f), __float2bfloat16(2.0f)});
+#endif // _LIBCUDACXX_HAS_NVFP16() && _LIBCUDACXX_HAS_NVBF16()
+  return true;
+}
+
+int main(int, char**)
+{
+  assert(tests());
+  assert(tests_nvfp());
+  return 0;
+}
+
+_CCCL_DIAG_POP
diff --git a/libcudacxx/test/libcudacxx/std/numerics/bit/bit.cast/bit_cast.trivially_copyable.pass.cpp b/libcudacxx/test/libcudacxx/std/numerics/bit/bit.cast/bit_cast.trivially_copyable.pass.cpp
new file mode 100644
index 00000000000..c4cda64cd24
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bit.cast/bit_cast.trivially_copyable.pass.cpp
@@ -0,0 +1,218 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+// XFAIL: enable-tile
+// nvbug6077498: ICE when validating tile MLIR
+
+// <cuda/std/bit>
+//
+// template<class To, class From>
+//   constexpr To bit_cast(const From& from) noexcept;
+
+#include <cuda/std/array>
+#include <cuda/std/complex>
+#include <cuda/std/tuple>
+#include <cuda/std/utility>
+
+#include "bit_cast_test_helpers.h"
+
+struct TrivialPod
+{
+  int x;
+  float y;
+  __host__ __device__ friend bool operator==(TrivialPod a, TrivialPod b)
+  {
+    return a.x == b.x && a.y == b.y;
+  }
+};
+
+TEST_FUNC bool tests()
+{
+  // User-defined trivially copyable type
+  for (const TrivialPod& i : {TrivialPod{0, 0.0f}, TrivialPod{1, 1.0f}, TrivialPod{-1, 3.5f}, TrivialPod{42, 2.5f}})
+  {
+    test_roundtrip_through_nested_T(i);
+    test_roundtrip_through_buffer(i);
+  }
+
+  // cuda::std::array
+  for (const cuda::std::array<int, 4>& i :
+       {cuda::std::array<int, 4>{0, 1, 2, 3},
+        cuda::std::array<int, 4>{-1, -2, -3, -4},
+        cuda::std::array<int, 4>{100, 200, 300, 400}})
+  {
+    test_roundtrip_through_nested_T(i);
+    test_roundtrip_through_buffer(i);
+  }
+
+  // cuda::std::pair
+  for (const cuda::std::pair<int, float>& i :
+       {cuda::std::pair<int, float>{0, 0.0f},
+        cuda::std::pair<int, float>{1, 1.0f},
+        cuda::std::pair<int, float>{-1, 3.5f}})
+  {
+    test_roundtrip_through_nested_T(i);
+    test_roundtrip_through_buffer(i);
+  }
+
+  // cuda::std::tuple
+  for (const cuda::std::tuple<int, float>& i :
+       {cuda::std::tuple<int, float>{0, 0.0f},
+        cuda::std::tuple<int, float>{1, 1.0f},
+        cuda::std::tuple<int, float>{-1, 3.5f}})
+  {
+    test_roundtrip_through_nested_T(i);
+    test_roundtrip_through_buffer(i);
+  }
+
+  // cuda::std::tuple<> (empty, sizeof == 1 with no data bytes)
+  test_roundtrip_through_buffer<false>(cuda::std::tuple<>{});
+
+  // cuda::std::complex<float>
+  for (const cuda::std::complex<float>& i :
+       {cuda::std::complex<float>{0.0f, 1.0f},
+        cuda::std::complex<float>{1.0f, -1.0f},
+        cuda::std::complex<float>{-1.0f, 0.0f},
+        cuda::std::complex<float>{10.0f, -10.0f},
+        cuda::std::complex<float>{2.5f, 3.5f}})
+  {
+    test_roundtrip_through_nested_T(i);
+    test_roundtrip_through_buffer(i);
+  }
+
+#if _LIBCUDACXX_HAS_NVFP16()
+  // cuda::std::complex<__half>
+  for (const cuda::std::complex<__half>& i :
+       {cuda::std::complex<__half>{__float2half(0.0f), __float2half(1.0f)},
+        cuda::std::complex<__half>{__float2half(1.0f), __float2half(-1.0f)},
+        cuda::std::complex<__half>{__float2half(-1.0f), __float2half(0.0f)},
+        cuda::std::complex<__half>{__float2half(10.0f), __float2half(-10.0f)}})
+  {
+    test_roundtrip_through_nested_T(i);
+    test_roundtrip_through_buffer(i);
+  }
+#endif // _LIBCUDACXX_HAS_NVFP16()
+
+#if _LIBCUDACXX_HAS_NVBF16()
+  // cuda::std::complex<__nv_bfloat16>
+  for (const cuda::std::complex<__nv_bfloat16>& i :
+       {cuda::std::complex<__nv_bfloat16>{__float2bfloat16(0.0f), __float2bfloat16(1.0f)},
+        cuda::std::complex<__nv_bfloat16>{__float2bfloat16(1.0f), __float2bfloat16(-1.0f)},
+        cuda::std::complex<__nv_bfloat16>{__float2bfloat16(-1.0f), __float2bfloat16(0.0f)},
+        cuda::std::complex<__nv_bfloat16>{__float2bfloat16(10.0f), __float2bfloat16(-10.0f)}})
+  {
+    test_roundtrip_through_nested_T(i);
+    test_roundtrip_through_buffer(i);
+  }
+#endif // _LIBCUDACXX_HAS_NVBF16()
+
+  // Extended floating point vector types
+#if _LIBCUDACXX_HAS_NVFP16()
+  for (const __half2& i :
+       {__half2{__float2half(0.0f), __float2half(1.0f)},
+        __half2{__float2half(-1.0f), __float2half(2.0f)},
+        __half2{__float2half(10.0f), __float2half(-10.0f)},
+        __half2{__float2half(2.5f), __float2half(3.5f)}})
+  {
+    test_roundtrip_through_nested_T(i);
+    test_roundtrip_through_buffer(i);
+  }
+#endif // _LIBCUDACXX_HAS_NVFP16()
+
+#if _LIBCUDACXX_HAS_NVBF16()
+  for (const __nv_bfloat162& i :
+       {__nv_bfloat162{__float2bfloat16(0.0f), __float2bfloat16(1.0f)},
+        __nv_bfloat162{__float2bfloat16(-1.0f), __float2bfloat16(2.0f)},
+        __nv_bfloat162{__float2bfloat16(10.0f), __float2bfloat16(-10.0f)},
+        __nv_bfloat162{__float2bfloat16(2.5f), __float2bfloat16(3.5f)}})
+  {
+    test_roundtrip_through_nested_T(i);
+    test_roundtrip_through_buffer(i);
+  }
+#endif // _LIBCUDACXX_HAS_NVBF16()
+
+  // Padding-free compositions of extended floating point scalar types
+#if _LIBCUDACXX_HAS_NVFP16()
+  {
+    const auto arr =
+      cuda::std::array<__half, 4>{__float2half(1.0f), __float2half(2.0f), __float2half(3.0f), __float2half(4.0f)};
+    test_roundtrip_through_nested_T(arr);
+    test_roundtrip_through_buffer(arr);
+  }
+  {
+    const auto p = cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)};
+    test_roundtrip_through_nested_T(p);
+    test_roundtrip_through_buffer(p);
+  }
+  {
+    const auto t = cuda::std::tuple<__half, __half>{__float2half(1.0f), __float2half(2.0f)};
+    test_roundtrip_through_nested_T(t);
+    test_roundtrip_through_buffer(t);
+  }
+  {
+    const auto nested = cuda::std::array<cuda::std::pair<__half, __half>, 2>{
+      cuda::std::pair<__half, __half>{__float2half(1.0f), __float2half(2.0f)},
+      cuda::std::pair<__half, __half>{__float2half(3.0f), __float2half(4.0f)}};
+    test_roundtrip_through_nested_T(nested);
+    test_roundtrip_through_buffer(nested);
+  }
+#endif // _LIBCUDACXX_HAS_NVFP16()
+
+#if _LIBCUDACXX_HAS_NVBF16()
+  {
+    const auto arr = cuda::std::array<__nv_bfloat16, 2>{__float2bfloat16(1.0f), __float2bfloat16(2.0f)};
+    test_roundtrip_through_nested_T(arr);
+    test_roundtrip_through_buffer(arr);
+  }
+  {
+    const auto p = cuda::std::pair<__nv_bfloat16, __nv_bfloat16>{__float2bfloat16(1.0f), __float2bfloat16(2.0f)};
+    test_roundtrip_through_nested_T(p);
+    test_roundtrip_through_buffer(p);
+  }
+#endif // _LIBCUDACXX_HAS_NVBF16()
+
+#if _LIBCUDACXX_HAS_NVFP16() && _LIBCUDACXX_HAS_NVBF16()
+  {
+    const auto t = cuda::std::tuple<__half, __nv_bfloat16>{__float2half(1.0f), __float2bfloat16(2.0f)};
+    test_roundtrip_through_nested_T(t);
+    test_roundtrip_through_buffer(t);
+  }
+#endif // _LIBCUDACXX_HAS_NVFP16() && _LIBCUDACXX_HAS_NVBF16()
+
+  // Padded compositions
+#if _LIBCUDACXX_HAS_NVFP16()
+  {
+    const auto p1 = cuda::std::pair<__half, int>{__float2half(1.0f), 42};
+    test_roundtrip_through_nested_T<false>(p1);
+    test_roundtrip_through_buffer<false>(p1);
+  }
+  {
+    const auto p2 = cuda::std::pair<__half, float>{__float2half(1.0f), 3.5f};
+    test_roundtrip_through_nested_T<false>(p2);
+    test_roundtrip_through_buffer<false>(p2);
+  }
+#endif // _LIBCUDACXX_HAS_NVFP16()
+
+#if _LIBCUDACXX_HAS_NVBF16()
+  {
+    const auto p = cuda::std::pair<__nv_bfloat16, int>{__float2bfloat16(1.0f), 42};
+    test_roundtrip_through_nested_T<false>(p);
+    test_roundtrip_through_buffer<false>(p);
+  }
+#endif // _LIBCUDACXX_HAS_NVBF16()
+
+  return true;
+}
+
+int main(int, char**)
+{
+  tests();
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/std/numerics/bit/bit.cast/bit_cast_test_helpers.h b/libcudacxx/test/libcudacxx/std/numerics/bit/bit.cast/bit_cast_test_helpers.h
new file mode 100644
index 00000000000..bacac847cf9
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/std/numerics/bit/bit.cast/bit_cast_test_helpers.h
@@ -0,0 +1,85 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBCUDACXX_TEST_STD_NUMERICS_BIT_BIT_CAST_TEST_HELPERS_H
+#define LIBCUDACXX_TEST_STD_NUMERICS_BIT_BIT_CAST_TEST_HELPERS_H
+
+#include <cuda/std/bit>
+#include <cuda/std/cassert>
+#include <cuda/std/cstring>
+
+#include "test_macros.h"
+
+// cuda::std::bit_cast does not preserve padding bits, so if T has padding bits,
+// the results might not memcmp cleanly.
+template <bool HasUniqueObjectRepresentations = true, typename T>
+TEST_FUNC void test_roundtrip_through_buffer(T from)
+{
+  struct Buffer
+  {
+    char buffer[sizeof(T)];
+  };
+  Buffer middle  = cuda::std::bit_cast<Buffer>(from);
+  T to           = cuda::std::bit_cast<T>(middle);
+  Buffer middle2 = cuda::std::bit_cast<Buffer>(to);
+
+  assert((from == to) == (from == from)); // because NaN
+
+  if constexpr (HasUniqueObjectRepresentations)
+  {
+    assert(cuda::std::memcmp(&from, &middle, sizeof(T)) == 0);
+    assert(cuda::std::memcmp(&to, &middle, sizeof(T)) == 0);
+    assert(cuda::std::memcmp(&middle, &middle2, sizeof(T)) == 0);
+  }
+}
+
+template <bool HasUniqueObjectRepresentations = true, typename T>
+TEST_FUNC void test_roundtrip_through_nested_T(T from)
+{
+  struct Nested
+  {
+    T x;
+  };
+  static_assert(sizeof(Nested) == sizeof(T));
+
+  Nested middle  = cuda::std::bit_cast<Nested>(from);
+  T to           = cuda::std::bit_cast<T>(middle);
+  Nested middle2 = cuda::std::bit_cast<Nested>(to);
+
+  assert((from == to) == (from == from)); // because NaN
+
+  if constexpr (HasUniqueObjectRepresentations)
+  {
+    assert(cuda::std::memcmp(&from, &middle, sizeof(T)) == 0);
+    assert(cuda::std::memcmp(&to, &middle, sizeof(T)) == 0);
+    assert(cuda::std::memcmp(&middle, &middle2, sizeof(T)) == 0);
+  }
+}
+
+template <typename Intermediate, bool HasUniqueObjectRepresentations = true, typename T>
+TEST_FUNC void test_roundtrip_through(T from)
+{
+  static_assert(sizeof(Intermediate) == sizeof(T));
+
+  Intermediate middle  = cuda::std::bit_cast<Intermediate>(from);
+  T to                 = cuda::std::bit_cast<T>(middle);
+  Intermediate middle2 = cuda::std::bit_cast<Intermediate>(to);
+
+  assert((from == to) == (from == from)); // because NaN
+
+  if constexpr (HasUniqueObjectRepresentations)
+  {
+    assert(cuda::std::memcmp(&from, &middle, sizeof(T)) == 0);
+    assert(cuda::std::memcmp(&to, &middle, sizeof(T)) == 0);
+    assert(cuda::std::memcmp(&middle, &middle2, sizeof(T)) == 0);
+  }
+}
+
+#endif // LIBCUDACXX_TEST_STD_NUMERICS_BIT_BIT_CAST_TEST_HELPERS_H
diff --git a/thrust/thrust/detail/allocator/value_initialize_range.h b/thrust/thrust/detail/allocator/value_initialize_range.h
index 491a96dc29d..9000ef604ef 100644
--- a/thrust/thrust/detail/allocator/value_initialize_range.h
+++ b/thrust/thrust/detail/allocator/value_initialize_range.h
@@ -20,6 +20,7 @@
 
 #include <cuda/std/__memory/allocator_traits.h>
 #include <cuda/std/__memory/pointer_traits.h>
+#include <cuda/std/__type_traits/is_trivially_default_constructible.h>
 
 THRUST_NAMESPACE_BEGIN
 namespace detail
diff --git a/thrust/thrust/type_traits/is_trivially_relocatable.h b/thrust/thrust/type_traits/is_trivially_relocatable.h
index e7ec4cfe6d2..c011ff55764 100644
--- a/thrust/thrust/type_traits/is_trivially_relocatable.h
+++ b/thrust/thrust/type_traits/is_trivially_relocatable.h
@@ -24,11 +24,11 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/type_traits/is_contiguous_iterator.h>
 
+#include <cuda/__type_traits/is_trivially_copyable.h>
 #include <cuda/std/__fwd/pair.h>
 #include <cuda/std/__fwd/tuple.h>
 #include <cuda/std/__type_traits/conjunction.h>
 #include <cuda/std/__type_traits/is_same.h>
-#include <cuda/std/__type_traits/is_trivially_copyable.h>
 
 THRUST_NAMESPACE_BEGIN
 
@@ -197,7 +197,7 @@ namespace detail
 // https://wg21.link/P1144R0#wording-inheritance
 template <typename T>
 struct is_trivially_relocatable_impl
-    : integral_constant<bool, ::cuda::std::is_trivially_copyable_v<T> || proclaim_trivially_relocatable<T>::value>
+    : integral_constant<bool, ::cuda::is_trivially_copyable_v<T> || proclaim_trivially_relocatable<T>::value>
 {};
 
 template <typename T, ::cuda::std::size_t N>