diff --git a/libcudacxx/include/cuda/std/__fwd/simd.h b/libcudacxx/include/cuda/std/__fwd/simd.h
index 282d5288914..9619f0c8998 100644
--- a/libcudacxx/include/cuda/std/__fwd/simd.h
+++ b/libcudacxx/include/cuda/std/__fwd/simd.h
@@ -46,7 +46,7 @@ using mask = basic_mask<sizeof(_Tp), __deduce_abi_t<_Tp, _Np>>;
 template <typename _Tp, typename _Abi>
 struct __simd_storage;
 
-template <typename _Tp, typename _Abi>
+template <typename _Tp, typename _Abi, typename = void>
 struct __simd_operations;
 
 template <size_t _Bytes, typename _Abi>
diff --git a/libcudacxx/include/cuda/std/__simd/basic_vec.h b/libcudacxx/include/cuda/std/__simd/basic_vec.h
index 8bda465ed17..f5f1459ecf3 100644
--- a/libcudacxx/include/cuda/std/__simd/basic_vec.h
+++ b/libcudacxx/include/cuda/std/__simd/basic_vec.h
@@ -31,6 +31,7 @@
 #include <cuda/std/__simd/concepts.h>
 #include <cuda/std/__simd/flag.h>
 #include <cuda/std/__simd/iterator.h>
+#include <cuda/std/__simd/specializations/fixed_size_float_vec.h>
 #include <cuda/std/__simd/specializations/fixed_size_vec.h>
 #include <cuda/std/__simd/utility.h>
 #include <cuda/std/__type_traits/enable_if.h>
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h
new file mode 100644
index 00000000000..b5378fb21e1
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h
@@ -0,0 +1,128 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_FLOAT_VEC_H
+#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_FLOAT_VEC_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__simd/specializations/fixed_size_vec.h>
+#include <cuda/std/__simd/specializations/fp32x2_intrinsics.h>
+#include <cuda/std/__type_traits/enable_if.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+// Simd operations for fixed_size ABI with float elements and F32x2 fast paths.
+template <__simd_size_type _Np>
+struct __simd_operations<float, __fixed_size<_Np>, enable_if_t<__is_fixed_size_float_v<float, _Np>>>
+    : __fixed_size_operations<float, _Np>
+{
+  using __base       = __fixed_size_operations<float, _Np>;
+  using _SimdStorage = __simd_storage<float, __fixed_size<_Np>>;
+
+  _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept
+  {
+#if _CCCL_HAS_SIMD_F32X2()
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+    {
+      // clang-format off
+      NV_IF_TARGET(NV_IS_EXACTLY_SM_100,
+                   (constexpr _SimdStorage __one = __base::__broadcast(1.0f);
+                    __s                          = ::cuda::std::simd::__plus_f32x2(__s, __one);
+                    return;))
+      // clang-format on
+    }
+#endif // _CCCL_HAS_SIMD_F32X2()
+    __base::__increment(__s);
+  }
+
+  _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept
+  {
+#if _CCCL_HAS_SIMD_F32X2()
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+    {
+      // clang-format off
+      NV_IF_TARGET(NV_IS_EXACTLY_SM_100,
+                   (constexpr _SimdStorage __one = __base::__broadcast(1.0f);
+                    __s                          = ::cuda::std::simd::__minus_f32x2(__s, __one);
+                    return;))
+      // clang-format on
+    }
+#endif // _CCCL_HAS_SIMD_F32X2()
+    __base::__decrement(__s);
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept
+  {
+#if _CCCL_HAS_SIMD_F32X2()
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+    {
+      // clang-format off
+      NV_IF_TARGET(NV_IS_EXACTLY_SM_100,
+                   (constexpr _SimdStorage __zero = __base::__broadcast(0.0f);
+                    return ::cuda::std::simd::__minus_f32x2(__zero, __s);))
+      // clang-format on
+    }
+#endif // _CCCL_HAS_SIMD_F32X2()
+    return __base::__unary_minus(__s);
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+#if _CCCL_HAS_SIMD_F32X2()
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+    {
+      NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__plus_f32x2(__lhs, __rhs);))
+    }
+#endif // _CCCL_HAS_SIMD_F32X2()
+    return __base::__plus(__lhs, __rhs);
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __minus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+#if _CCCL_HAS_SIMD_F32X2()
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+    {
+      NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__minus_f32x2(__lhs, __rhs);))
+    }
+#endif // _CCCL_HAS_SIMD_F32X2()
+    return __base::__minus(__lhs, __rhs);
+  }
+
+  [[nodiscard]] _CCCL_API static constexpr _SimdStorage
+  __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept
+  {
+#if _CCCL_HAS_SIMD_F32X2()
+    _CCCL_IF_NOT_CONSTEVAL_DEFAULT
+    {
+      NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__multiplies_f32x2(__lhs, __rhs);))
+    }
+#endif // _CCCL_HAS_SIMD_F32X2()
+    return __base::__multiplies(__lhs, __rhs);
+  }
+};
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_FLOAT_VEC_H
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h
index a60aa034bbc..ca07cee583b 100644
--- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h
+++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h
@@ -24,6 +24,7 @@
 #include <cuda/__utility/in_range.h>
 #include <cuda/std/__cstddef/types.h>
 #include <cuda/std/__fwd/simd.h>
+#include <cuda/std/__simd/specializations/fixed_size_storage.h>
 #include <cuda/std/__type_traits/integral_constant.h>
 #include <cuda/std/__utility/integer_sequence.h>
 
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h
new file mode 100644
index 00000000000..846142cb62f
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h
@@ -0,0 +1,64 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_STORAGE_H
+#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_STORAGE_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__utility/in_range.h>
+#include <cuda/std/__fwd/simd.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+template <__simd_size_type _Np>
+struct __fixed_size
+{
+  static_assert(_Np > 0, "_Np must be greater than 0");
+
+  static constexpr __simd_size_type __simd_size = _Np;
+};
+
+// Element-per-slot simd storage for fixed_size ABI
+template <typename _Tp, __simd_size_type _Np>
+struct __simd_storage<_Tp, __fixed_size<_Np>>
+{
+  using value_type = _Tp;
+
+  _Tp __data[_Np]{};
+
+  [[nodiscard]] _CCCL_API constexpr _Tp __get(const __simd_size_type __idx) const noexcept
+  {
+    _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds");
+    return __data[__idx];
+  }
+
+  _CCCL_API constexpr void __set(const __simd_size_type __idx, const _Tp __v) noexcept
+  {
+    _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds");
+    __data[__idx] = __v;
+  }
+};
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_STORAGE_H
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h
index 0dc5576d935..703d1617eb7 100644
--- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h
+++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h
@@ -21,51 +21,24 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/__utility/in_range.h>
 #include <cuda/std/__fwd/simd.h>
+#include <cuda/std/__simd/specializations/fixed_size_mask.h>
+#include <cuda/std/__simd/specializations/fixed_size_storage.h>
+#include <cuda/std/__type_traits/enable_if.h>
 #include <cuda/std/__type_traits/integral_constant.h>
+#include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/__utility/integer_sequence.h>
 
 #include <cuda/std/__cccl/prologue.h>
 
 _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
 
-template <__simd_size_type _Np>
-struct __fixed_size
-{
-  static_assert(_Np > 0, "_Np must be greater than 0");
-
-  static constexpr __simd_size_type __simd_size = _Np;
-};
-
-// Element-per-slot simd storage for fixed_size ABI
+// Simd operations for fixed_size ABI
 template <typename _Tp, __simd_size_type _Np>
-struct __simd_storage<_Tp, __fixed_size<_Np>>
-{
-  using value_type = _Tp;
-
-  _Tp __data[_Np]{};
-
-  _CCCL_HIDE_FROM_ABI constexpr __simd_storage()                                 = default;
-  _CCCL_HIDE_FROM_ABI constexpr __simd_storage(const __simd_storage&)            = default;
-  _CCCL_HIDE_FROM_ABI constexpr __simd_storage& operator=(const __simd_storage&) = default;
+inline constexpr bool __is_fixed_size_float_v = is_same_v<_Tp, float> && _Np >= 2;
 
-  [[nodiscard]] _CCCL_API constexpr _Tp __get(const __simd_size_type __idx) const noexcept
-  {
-    _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds");
-    return __data[__idx];
-  }
-
-  _CCCL_API constexpr void __set(const __simd_size_type __idx, const _Tp __v) noexcept
-  {
-    _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds");
-    __data[__idx] = __v;
-  }
-};
-
-// Simd operations for fixed_size ABI
 template <typename _Tp, __simd_size_type _Np>
-struct __simd_operations<_Tp, __fixed_size<_Np>>
+struct __fixed_size_operations
 {
   using _SimdStorage = __simd_storage<_Tp, __fixed_size<_Np>>;
   using _MaskStorage = __mask_storage<sizeof(_Tp), __fixed_size<_Np>>;
@@ -90,7 +63,7 @@ struct __simd_operations<_Tp, __fixed_size<_Np>>
     ((__result.__data[_Is] = __g(integral_constant<__simd_size_type, _Is>())), ...);
     return __result;
 #else // ^^^ C++20 ^^^ / vvv C++17 vvv
-    return _SimdStorage{{ __g(integral_constant<__simd_size_type, _Is>())... }};
+    return _SimdStorage{{__g(integral_constant<__simd_size_type, _Is>())...}};
 #endif // _CCCL_STD_VER < 2020
   }
 
@@ -354,6 +327,13 @@ struct __simd_operations<_Tp, __fixed_size<_Np>>
     return __result;
   }
 };
+
+// Default path (no optimizations)
+template <typename _Tp, __simd_size_type _Np>
+struct __simd_operations<_Tp, __fixed_size<_Np>, enable_if_t<!__is_fixed_size_float_v<_Tp, _Np>>>
+    : __fixed_size_operations<_Tp, _Np>
+{};
+
 _CCCL_END_NAMESPACE_CUDA_STD_SIMD
 
 #include <cuda/std/__cccl/epilogue.h>
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h b/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h
new file mode 100644
index 00000000000..ae80855c662
--- /dev/null
+++ b/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h
@@ -0,0 +1,254 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FP32X2_INTRINSICS_H
+#define _CUDA_STD___SIMD_SPECIALIZATIONS_FP32X2_INTRINSICS_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#define _CCCL_HAS_SIMD_F32X2_INTRINSICS() (_CCCL_CUDACC_AT_LEAST(12, 8) && _CCCL_HAS_CTK())
+#define _CCCL_HAS_SIMD_F32X2_PTX()        (__cccl_ptx_isa >= 860ULL)
+
+#define _CCCL_HAS_SIMD_F32X2() (_CCCL_HAS_SIMD_F32X2_INTRINSICS() || _CCCL_HAS_SIMD_F32X2_PTX())
+
+#if _CCCL_HAS_SIMD_F32X2()
+
+#  include <cuda/std/__simd/specializations/fixed_size_storage.h>
+
+#  include <nv/target>
+
+#  include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD
+
+_CCCL_DEVICE_API inline void __add_f32x2(
+  const float __lhs1,
+  const float __lhs2,
+  const float __rhs1,
+  const float __rhs2,
+  float& __result1,
+  float& __result2) noexcept
+{
+#  if _CCCL_HAS_SIMD_F32X2_INTRINSICS()
+  // clang-format off
+  NV_IF_TARGET(NV_IS_EXACTLY_SM_100,
+               (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2});
+                __result1           = __result.x;
+                __result2           = __result.y;))
+  // clang-format on
+#  elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6
+  asm("{.reg .b64 __lhs, __rhs, __result;"
+      "mov.b64 __lhs, {%2, %3};"
+      "mov.b64 __rhs, {%4, %5};"
+      "add.f32x2 __result, __lhs, __rhs;"
+      "mov.b64 {%0, %1}, __result;}"
+      : "=f"(__result1), "=f"(__result2)
+      : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2));
+#  endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS()
+}
+
+_CCCL_DEVICE_API inline void __mul_f32x2(
+  const float __lhs1,
+  const float __lhs2,
+  const float __rhs1,
+  const float __rhs2,
+  float& __result1,
+  float& __result2) noexcept
+{
+#  if _CCCL_HAS_SIMD_F32X2_INTRINSICS()
+  // clang-format off
+  NV_IF_TARGET(NV_IS_EXACTLY_SM_100,
+               (const auto __result = ::__fmul2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2});
+                __result1           = __result.x;
+                __result2           = __result.y;))
+  // clang-format on
+#  elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6
+  asm("{.reg .b64 __lhs, __rhs, __result;"
+      "mov.b64 __lhs, {%2, %3};"
+      "mov.b64 __rhs, {%4, %5};"
+      "mul.f32x2 __result, __lhs, __rhs;"
+      "mov.b64 {%0, %1}, __result;}"
+      : "=f"(__result1), "=f"(__result2)
+      : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2));
+#  endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS()
+}
+
+_CCCL_DEVICE_API inline void __sub_f32x2(
+  const float __lhs1,
+  const float __lhs2,
+  const float __rhs1,
+  const float __rhs2,
+  float& __result1,
+  float& __result2) noexcept
+{
+#  if _CCCL_HAS_SIMD_F32X2_INTRINSICS()
+  // clang-format off
+  NV_IF_TARGET(NV_IS_EXACTLY_SM_100,
+               (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{-__rhs1, -__rhs2});
+                __result1           = __result.x;
+                __result2           = __result.y;))
+  // clang-format on
+#  elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6
+  // clang-format off
+  NV_IF_TARGET(NV_IS_EXACTLY_SM_100,
+               (asm("{.reg .b64 __lhs, __rhs, __result;"
+                    "mov.b64 __lhs, {%2, %3};"
+                    "mov.b64 __rhs, {%4, %5};"
+                    "sub.f32x2 __result, __lhs, __rhs;"
+                    "mov.b64 {%0, %1}, __result;}"
+                    : "=f"(__result1), "=f"(__result2)
+                    : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2));))
+  // clang-format on
+#  endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS()
+}
+
+_CCCL_DEVICE_API inline void __fma_f32x2(
+  const float __lhs1,
+  const float __lhs2,
+  const float __rhs1,
+  const float __rhs2,
+  const float __add1,
+  const float __add2,
+  float& __result1,
+  float& __result2) noexcept
+{
+#  if _CCCL_HAS_SIMD_F32X2_INTRINSICS()
+  // clang-format off
+  NV_IF_TARGET(NV_IS_EXACTLY_SM_100,
+               (const auto __result =
+                  ::__ffma2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}, ::float2{__add1, __add2});
+                __result1 = __result.x;
+                __result2 = __result.y;))
+  // clang-format on
+#  elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6
+  asm("{.reg .b64 __lhs, __rhs, __add, __result;"
+      "mov.b64 __lhs, {%2, %3};"
+      "mov.b64 __rhs, {%4, %5};"
+      "mov.b64 __add, {%6, %7};"
+      "fma.rn.f32x2 __result, __lhs, __rhs, __add;"
+      "mov.b64 {%0, %1}, __result;}"
+      : "=f"(__result1), "=f"(__result2)
+      : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2), "f"(__add1), "f"(__add2));
+#  endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS()
+}
+
+template <__simd_size_type _Np>
+using __simd_storage_f32 = __simd_storage<float, __fixed_size<_Np>>;
+
+template <__simd_size_type _Np>
+[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np>
+__plus_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept
+{
+  __simd_storage_f32<_Np> __result;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2)
+  {
+    ::cuda::std::simd::__add_f32x2(
+      __lhs.__data[__i],
+      __lhs.__data[__i + 1],
+      __rhs.__data[__i],
+      __rhs.__data[__i + 1],
+      __result.__data[__i],
+      __result.__data[__i + 1]);
+  }
+  if (_Np % 2 != 0)
+  {
+    __result.__data[_Np - 1] = __lhs.__data[_Np - 1] + __rhs.__data[_Np - 1];
+  }
+  return __result;
+}
+
+template <__simd_size_type _Np>
+[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np>
+__minus_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept
+{
+  __simd_storage_f32<_Np> __result;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2)
+  {
+    ::cuda::std::simd::__sub_f32x2(
+      __lhs.__data[__i],
+      __lhs.__data[__i + 1],
+      __rhs.__data[__i],
+      __rhs.__data[__i + 1],
+      __result.__data[__i],
+      __result.__data[__i + 1]);
+  }
+  if (_Np % 2 != 0)
+  {
+    __result.__data[_Np - 1] = __lhs.__data[_Np - 1] - __rhs.__data[_Np - 1];
+  }
+  return __result;
+}
+
+template <__simd_size_type _Np>
+[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np>
+__multiplies_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept
+{
+  __simd_storage_f32<_Np> __result;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2)
+  {
+    ::cuda::std::simd::__mul_f32x2(
+      __lhs.__data[__i],
+      __lhs.__data[__i + 1],
+      __rhs.__data[__i],
+      __rhs.__data[__i + 1],
+      __result.__data[__i],
+      __result.__data[__i + 1]);
+  }
+  if (_Np % 2 != 0)
+  {
+    __result.__data[_Np - 1] = __lhs.__data[_Np - 1] * __rhs.__data[_Np - 1];
+  }
+  return __result;
+}
+
+template <__simd_size_type _Np>
+[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np>
+__fma_f32x2(const __simd_storage_f32<_Np>& __lhs,
+            const __simd_storage_f32<_Np>& __rhs,
+            const __simd_storage_f32<_Np>& __add) noexcept
+{
+  __simd_storage_f32<_Np> __result;
+  _CCCL_PRAGMA_UNROLL_FULL()
+  for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2)
+  {
+    ::cuda::std::simd::__fma_f32x2(
+      __lhs.__data[__i],
+      __lhs.__data[__i + 1],
+      __rhs.__data[__i],
+      __rhs.__data[__i + 1],
+      __add.__data[__i],
+      __add.__data[__i + 1],
+      __result.__data[__i],
+      __result.__data[__i + 1]);
+  }
+  if (_Np % 2 != 0)
+  {
+    __result.__data[_Np - 1] = __lhs.__data[_Np - 1] * __rhs.__data[_Np - 1] + __add.__data[_Np - 1];
+  }
+  return __result;
+}
+
+_CCCL_END_NAMESPACE_CUDA_STD_SIMD
+
+#  include <cuda/std/__cccl/epilogue.h>
+
+#endif // _CCCL_HAS_SIMD_F32X2()
+#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FP32X2_INTRINSICS_H
diff --git a/libcudacxx/test/CMakeLists.txt b/libcudacxx/test/CMakeLists.txt
index 039c917fa7d..20fb09610ed 100644
--- a/libcudacxx/test/CMakeLists.txt
+++ b/libcudacxx/test/CMakeLists.txt
@@ -98,3 +98,4 @@ endif()
 
 add_subdirectory(nvtarget)
 add_subdirectory(atomic_codegen)
+add_subdirectory(simd_codegen)
diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash
index d1c5f16b949..326fe0dd186 100755
--- a/libcudacxx/test/atomic_codegen/dump_and_check.bash
+++ b/libcudacxx/test/atomic_codegen/dump_and_check.bash
@@ -1,9 +1,11 @@
 #!/bin/bash
 set -euo pipefail
 
-## Usage: dump_and_check test.a test.cu PREFIX
-input_archive="$1"
-input_testfile="$2"
-input_prefix="$3"
+## Usage: dump_and_check test.a test.cu PREFIXES [cuobjdump-mode]
+input_archive="${1}"
+input_testfile="${2}"
+input_prefix="${3}"
+dump_mode="${4:---dump-ptx}"
+filecheck="${FILECHECK:-FileCheck}"
 
-cuobjdump --dump-ptx "$input_archive" | FileCheck --match-full-lines --check-prefix "$input_prefix" "$input_testfile"
+cuobjdump "${dump_mode}" "${input_archive}" | "${filecheck}" --match-full-lines --check-prefixes="${input_prefix}" "${input_testfile}"
diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt
new file mode 100644
index 00000000000..7a82933ba97
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/CMakeLists.txt
@@ -0,0 +1,103 @@
+##===----------------------------------------------------------------------===##
+##
+## Part of libcu++ in the CUDA C++ Core Libraries,
+## under the Apache License v2.0 with LLVM Exceptions.
+## See https://llvm.org/LICENSE.txt for license information.
+## SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+## SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+##
+##===----------------------------------------------------------------------===##
+
+add_custom_target(libcudacxx.test.simd.sass)
+
+find_program(
+  filecheck
+  NAMES FileCheck FileCheck-21 FileCheck-20 FileCheck-19 FileCheck-18
+)
+
+if (filecheck)
+  message("-- ${filecheck} found... building simd codegen tests")
+else()
+  return()
+endif()
+
+find_program(cuobjdump "cuobjdump" REQUIRED)
+find_program(bash "bash" REQUIRED)
+
+set(libcudacxx_simd_codegen_tests)
+if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+  file(GLOB libcudacxx_simd_codegen_tests "*.cu")
+endif()
+
+set(simd_codegen_cuda_archs 80 90)
+if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
+  list(APPEND simd_codegen_cuda_archs 100 120)
+endif()
+
+function(simd_codegen_add_tests test_path)
+  cmake_path(GET test_path FILENAME test_file)
+  cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name)
+
+  file(READ "${test_path}" test_contents)
+  string(
+    REGEX MATCH
+    "SM[0-9][0-9]*"
+    has_arch_specific_prefix
+    "${test_contents}"
+  )
+
+  # search for SMXX prefixes in the test contents
+  set(test_archs)
+  if (has_arch_specific_prefix)
+    foreach (arch IN LISTS simd_codegen_cuda_archs)
+      string(FIND "${test_contents}" "SM${arch}" arch_specific_prefix)
+      if (NOT arch_specific_prefix EQUAL -1)
+        list(APPEND test_archs "${arch}")
+      endif()
+    endforeach()
+  else()
+    set(test_archs ${simd_codegen_cuda_archs})
+  endif()
+
+  # Run tests for each architecture specified in the test file
+  foreach (arch IN LISTS test_archs)
+    set(target_name "simd_codegen_sm${arch}_${test_name}")
+    set(check_prefixes "SMXX")
+    if (has_arch_specific_prefix)
+      string(APPEND check_prefixes ",SM${arch}")
+    endif()
+
+    add_library(${target_name} STATIC "${test_path}")
+
+    set_target_properties(
+      ${target_name}
+      PROPERTIES CUDA_ARCHITECTURES "${arch}"
+    )
+
+    target_compile_options(${target_name} PRIVATE "-Wno-comment")
+
+    target_include_directories(
+      ${target_name}
+      PRIVATE "${libcudacxx_SOURCE_DIR}/include"
+    )
+    add_dependencies(libcudacxx.test.simd.sass ${target_name})
+
+    add_custom_command(
+      TARGET libcudacxx.test.simd.sass
+      POST_BUILD
+      # gersemi: off
+      COMMAND
+        ${CMAKE_COMMAND} -E env "FILECHECK=${filecheck}"
+          "${CMAKE_CURRENT_SOURCE_DIR}/../atomic_codegen/dump_and_check.bash"
+          $<TARGET_FILE:${target_name}>
+          "${test_path}"
+          "${check_prefixes}"
+          --dump-sass
+      # gersemi: on
+    )
+  endforeach()
+endfunction()
+
+foreach (test_path IN LISTS libcudacxx_simd_codegen_tests)
+  simd_codegen_add_tests("${test_path}")
+endforeach()
diff --git a/libcudacxx/test/simd_codegen/decrement_f32x2.cu b/libcudacxx/test/simd_codegen/decrement_f32x2.cu
new file mode 100644
index 00000000000..41ec986c9c8
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/decrement_f32x2.cu
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+#include <cuda/std/array>
+
+namespace simd = cuda::std::simd;
+
+using Vec_f32_4 = simd::basic_vec<float, simd::fixed_size<4>>;
+
+extern "C" __global__ void test_operator_decrement_f32_4(const float* in, float* out)
+{
+  const cuda::std::array<float, 4> values{in[0], in[1], in[2], in[3]};
+
+  Vec_f32_4 vec(values);
+  --vec;
+
+  out[0] = vec[0];
+  out[1] = vec[1];
+  out[2] = vec[2];
+  out[3] = vec[3];
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_decrement_f32_4
+; SM100: {{.*FADD2.*}}
+; SM100: {{.*FADD2.*}}
+
+*/
diff --git a/libcudacxx/test/simd_codegen/fma_bf16.cu b/libcudacxx/test/simd_codegen/fma_bf16.cu
new file mode 100644
index 00000000000..f608f209e09
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/fma_bf16.cu
@@ -0,0 +1,54 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+#include <cuda/std/array>
+
+#if _CCCL_HAS_NVBF16()
+
+#  include <cuda_bf16.h>
+
+namespace simd = cuda::std::simd;
+
+using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>;
+
+extern "C" __global__ void
+test_fma_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, const __nv_bfloat16* add, __nv_bfloat16* out)
+{
+  const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
+  const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
+  const cuda::std::array<__nv_bfloat16, 4> add_values{add[0], add[1], add[2], add[3]};
+
+  const Vec_bf16_4 lhs_vec(lhs_values);
+  const Vec_bf16_4 rhs_vec(rhs_values);
+  const Vec_bf16_4 add_vec(add_values);
+  const Vec_bf16_4 result = lhs_vec * rhs_vec + add_vec;
+
+  out[0] = result[0];
+  out[1] = result[1];
+  out[2] = result[2];
+  out[3] = result[3];
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : test_fma_bf16_4
+; SM80: {{.*HFMA2.*BF16.*}}
+; SM80: {{.*HFMA2.*BF16.*}}
+; SM90: {{.*HFMA2.*BF16.*}}
+; SM90: {{.*HFMA2.*BF16.*}}
+; SM100: {{.*HFMA2.*BF16.*}}
+; SM100: {{.*HFMA2.*BF16.*}}
+; SM120: {{.*HFMA2.*BF16.*}}
+; SM120: {{.*HFMA2.*BF16.*}}
+
+*/
+
+#endif // _CCCL_HAS_NVBF16()
diff --git a/libcudacxx/test/simd_codegen/fma_f16.cu b/libcudacxx/test/simd_codegen/fma_f16.cu
new file mode 100644
index 00000000000..a0dad310ac9
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/fma_f16.cu
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+#include <cuda/std/array>
+
+#if _CCCL_HAS_NVFP16()
+
+#  include <cuda_fp16.h>
+
+namespace simd = cuda::std::simd;
+
+using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>;
+
+extern "C" __global__ void test_fma_f16_4(const __half* lhs, const __half* rhs, const __half* add, __half* out)
+{
+  const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
+  const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
+  const cuda::std::array<__half, 4> add_values{add[0], add[1], add[2], add[3]};
+
+  const Vec_f16_4 lhs_vec(lhs_values);
+  const Vec_f16_4 rhs_vec(rhs_values);
+  const Vec_f16_4 add_vec(add_values);
+  const Vec_f16_4 result = lhs_vec * rhs_vec + add_vec;
+
+  out[0] = result[0];
+  out[1] = result[1];
+  out[2] = result[2];
+  out[3] = result[3];
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : test_fma_f16_4
+; SM80: {{.*HFMA2.*}}
+; SM80: {{.*HFMA2.*}}
+; SM90: {{.*HFMA2.*}}
+; SM90: {{.*HFMA2.*}}
+; SM100: {{.*HFMA2.*}}
+; SM100: {{.*HFMA2.*}}
+; SM120: {{.*HFMA2.*}}
+; SM120: {{.*HFMA2.*}}
+
+*/
+
+#endif // _CCCL_HAS_NVFP16()
diff --git a/libcudacxx/test/simd_codegen/increment_f32x2.cu b/libcudacxx/test/simd_codegen/increment_f32x2.cu
new file mode 100644
index 00000000000..9c696f11e40
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/increment_f32x2.cu
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+#include <cuda/std/array>
+
+namespace simd = cuda::std::simd;
+
+using Vec_f32_4 = simd::basic_vec<float, simd::fixed_size<4>>;
+
+extern "C" __global__ void test_operator_increment_f32_4(const float* in, float* out)
+{
+  const cuda::std::array<float, 4> values{in[0], in[1], in[2], in[3]};
+
+  Vec_f32_4 vec(values);
+  ++vec;
+
+  out[0] = vec[0];
+  out[1] = vec[1];
+  out[2] = vec[2];
+  out[3] = vec[3];
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_increment_f32_4
+; SM100: {{.*FADD2.*}}
+; SM100: {{.*FADD2.*}}
+
+*/
diff --git a/libcudacxx/test/simd_codegen/less_bf16.cu b/libcudacxx/test/simd_codegen/less_bf16.cu
new file mode 100644
index 00000000000..fff63e06d1c
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/less_bf16.cu
@@ -0,0 +1,53 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+#include <cuda/std/array>
+
+#if _CCCL_HAS_NVBF16()
+
+#  include <cuda_bf16.h>
+
+namespace simd = cuda::std::simd;
+
+using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>;
+
+extern "C" __global__ void test_less_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, bool* out)
+{
+  const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
+  const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
+
+  const Vec_bf16_4 lhs_vec(lhs_values);
+  const Vec_bf16_4 rhs_vec(rhs_values);
+  const auto result = lhs_vec < rhs_vec;
+
+  out[0] = result[0];
+  out[1] = result[1];
+  out[2] = result[2];
+  out[3] = result[3];
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : test_less_bf16_4
+; SM80: {{.*FSETP\.LT.*}}
+; SM80: {{.*FSETP\.LT.*}}
+; SM80: {{.*FSETP\.LT.*}}
+; SM80: {{.*FSETP\.LT.*}}
+; SM90: {{.*HSETP2.*BF16.*}}
+; SM90: {{.*HSETP2.*BF16.*}}
+; SM100: {{.*HSETP2.*BF16.*}}
+; SM100: {{.*HSETP2.*BF16.*}}
+; SM120: {{.*HSETP2.*BF16.*}}
+; SM120: {{.*HSETP2.*BF16.*}}
+
+*/
+
+#endif // _CCCL_HAS_NVBF16()
diff --git a/libcudacxx/test/simd_codegen/less_f16.cu b/libcudacxx/test/simd_codegen/less_f16.cu
new file mode 100644
index 00000000000..beef111b99e
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/less_f16.cu
@@ -0,0 +1,51 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+#include <cuda/std/array>
+
+#if _CCCL_HAS_NVFP16()
+
+#  include <cuda_fp16.h>
+
+namespace simd = cuda::std::simd;
+
+using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>;
+
+extern "C" __global__ void test_less_f16_4(const __half* lhs, const __half* rhs, bool* out)
+{
+  const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
+  const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
+
+  const Vec_f16_4 lhs_vec(lhs_values);
+  const Vec_f16_4 rhs_vec(rhs_values);
+  const auto result = lhs_vec < rhs_vec;
+
+  out[0] = result[0];
+  out[1] = result[1];
+  out[2] = result[2];
+  out[3] = result[3];
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : test_less_f16_4
+; SM80: {{.*HSETP2.*}}
+; SM80: {{.*HSETP2.*}}
+; SM90: {{.*HSETP2.*}}
+; SM90: {{.*HSETP2.*}}
+; SM100: {{.*HSETP2.*}}
+; SM100: {{.*HSETP2.*}}
+; SM120: {{.*HSETP2.*}}
+; SM120: {{.*HSETP2.*}}
+
+*/
+
+#endif // _CCCL_HAS_NVFP16()
diff --git a/libcudacxx/test/simd_codegen/minus_f32x2.cu b/libcudacxx/test/simd_codegen/minus_f32x2.cu
new file mode 100644
index 00000000000..2c00e62b9ee
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/minus_f32x2.cu
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+#include <cuda/std/array>
+
+namespace simd = cuda::std::simd;
+
+using Vec_f32_4 = simd::basic_vec<float, simd::fixed_size<4>>;
+
+extern "C" __global__ void test_operator_minus_f32_4(const float* lhs, const float* rhs, float* out)
+{
+  const cuda::std::array<float, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
+  const cuda::std::array<float, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
+
+  const Vec_f32_4 lhs_vec(lhs_values);
+  const Vec_f32_4 rhs_vec(rhs_values);
+  const Vec_f32_4 result = lhs_vec - rhs_vec;
+
+  out[0] = result[0];
+  out[1] = result[1];
+  out[2] = result[2];
+  out[3] = result[3];
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_minus_f32_4
+; SM100: {{.*FADD2.*}}
+; SM100: {{.*FADD2.*}}
+
+*/
diff --git a/libcudacxx/test/simd_codegen/multiplies_bf16.cu b/libcudacxx/test/simd_codegen/multiplies_bf16.cu
new file mode 100644
index 00000000000..3036c4fd526
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/multiplies_bf16.cu
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+#include <cuda/std/array>
+
+#if _CCCL_HAS_NVBF16()
+
+#  include <cuda_bf16.h>
+
+namespace simd = cuda::std::simd;
+
+using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>;
+
+extern "C" __global__ void
+test_operator_multiplies_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, __nv_bfloat16* out)
+{
+  const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
+  const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
+
+  const Vec_bf16_4 lhs_vec(lhs_values);
+  const Vec_bf16_4 rhs_vec(rhs_values);
+  const Vec_bf16_4 result = lhs_vec * rhs_vec;
+
+  out[0] = result[0];
+  out[1] = result[1];
+  out[2] = result[2];
+  out[3] = result[3];
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_multiplies_bf16_4
+; SM80: {{.*HFMA2.*BF16.*}}
+; SM80: {{.*HFMA2.*BF16.*}}
+; SM90: {{.*HFMA2.*BF16.*}}
+; SM90: {{.*HMUL2.*BF16.*}}
+; SM100: {{.*HFMA2.*BF16.*}}
+; SM100: {{.*HMUL2.*BF16.*}}
+; SM120: {{.*HFMA2.*BF16.*}}
+; SM120: {{.*HMUL2.*BF16.*}}
+
+*/
+
+#endif // _CCCL_HAS_NVBF16()
diff --git a/libcudacxx/test/simd_codegen/multiplies_f16.cu b/libcudacxx/test/simd_codegen/multiplies_f16.cu
new file mode 100644
index 00000000000..bf721009051
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/multiplies_f16.cu
@@ -0,0 +1,51 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+#include <cuda/std/array>
+
+#if _CCCL_HAS_NVFP16()
+
+#  include <cuda_fp16.h>
+
+namespace simd = cuda::std::simd;
+
+using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>;
+
+extern "C" __global__ void test_operator_multiplies_f16_4(const __half* lhs, const __half* rhs, __half* out)
+{
+  const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
+  const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
+
+  const Vec_f16_4 lhs_vec(lhs_values);
+  const Vec_f16_4 rhs_vec(rhs_values);
+  const Vec_f16_4 result = lhs_vec * rhs_vec;
+
+  out[0] = result[0];
+  out[1] = result[1];
+  out[2] = result[2];
+  out[3] = result[3];
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_multiplies_f16_4
+; SM80: {{.*(HMUL2|HFMA2).*}}
+; SM80: {{.*(HMUL2|HFMA2).*}}
+; SM90: {{.*(HMUL2|HFMA2).*}}
+; SM90: {{.*(HMUL2|HFMA2).*}}
+; SM100: {{.*(HMUL2|HFMA2).*}}
+; SM100: {{.*(HMUL2|HFMA2).*}}
+; SM120: {{.*(HMUL2|HFMA2).*}}
+; SM120: {{.*(HMUL2|HFMA2).*}}
+
+*/
+
+#endif // _CCCL_HAS_NVFP16()
diff --git a/libcudacxx/test/simd_codegen/plus_bf16.cu b/libcudacxx/test/simd_codegen/plus_bf16.cu
new file mode 100644
index 00000000000..3d192fdad83
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/plus_bf16.cu
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+#include <cuda/std/array>
+
+#if _CCCL_HAS_NVBF16()
+
+#  include <cuda_bf16.h>
+
+namespace simd = cuda::std::simd;
+
+using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>;
+
+extern "C" __global__ void
+test_operator_plus_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, __nv_bfloat16* out)
+{
+  const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
+  const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
+
+  const Vec_bf16_4 lhs_vec(lhs_values);
+  const Vec_bf16_4 rhs_vec(rhs_values);
+  const Vec_bf16_4 result = lhs_vec + rhs_vec;
+
+  out[0] = result[0];
+  out[1] = result[1];
+  out[2] = result[2];
+  out[3] = result[3];
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_bf16_4
+; SM80: {{.*HFMA2.*BF16.*}}
+; SM80: {{.*HFMA2.*BF16.*}}
+; SM90: {{.*HFMA2.*BF16.*}}
+; SM90: {{.*HADD2.*BF16.*}}
+; SM100: {{.*HFMA2.*BF16.*}}
+; SM100: {{.*HADD2.*BF16.*}}
+; SM120: {{.*HFMA2.*BF16.*}}
+; SM120: {{.*HADD2.*BF16.*}}
+
+*/
+
+#endif // _CCCL_HAS_NVBF16()
diff --git a/libcudacxx/test/simd_codegen/plus_f16.cu b/libcudacxx/test/simd_codegen/plus_f16.cu
new file mode 100644
index 00000000000..4a044a19d1c
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/plus_f16.cu
@@ -0,0 +1,45 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+#include <cuda/std/array>
+
+#if _CCCL_HAS_NVFP16()
+
+#  include <cuda_fp16.h>
+
+namespace simd = cuda::std::simd;
+
+using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>;
+
+extern "C" __global__ void test_operator_plus_f16_4(const __half* lhs, const __half* rhs, __half* out)
+{
+  const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
+  const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
+
+  const Vec_f16_4 lhs_vec(lhs_values);
+  const Vec_f16_4 rhs_vec(rhs_values);
+  const Vec_f16_4 result = lhs_vec + rhs_vec;
+
+  out[0] = result[0];
+  out[1] = result[1];
+  out[2] = result[2];
+  out[3] = result[3];
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_f16_4
+; SMXX: {{.*(HADD2|HFMA2).*}}
+; SMXX: {{.*(HADD2|HFMA2).*}}
+
+*/
+
+#endif // _CCCL_HAS_NVFP16()
diff --git a/libcudacxx/test/simd_codegen/plus_f32x2.cu b/libcudacxx/test/simd_codegen/plus_f32x2.cu
new file mode 100644
index 00000000000..b0caf15341b
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/plus_f32x2.cu
@@ -0,0 +1,39 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+#include <cuda/std/array>
+
+namespace simd = cuda::std::simd;
+
+using Vec_f32_4 = simd::basic_vec<float, simd::fixed_size<4>>;
+
+extern "C" __global__ void test_operator_plus_f32_4(const float* lhs, const float* rhs, float* out)
+{
+  const cuda::std::array<float, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]};
+  const cuda::std::array<float, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]};
+
+  const Vec_f32_4 lhs_vec(lhs_values);
+  const Vec_f32_4 rhs_vec(rhs_values);
+  const Vec_f32_4 result = lhs_vec + rhs_vec;
+
+  out[0] = result[0];
+  out[1] = result[1];
+  out[2] = result[2];
+  out[3] = result[3];
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_f32_4
+; SM100: {{.*FADD2.*}}
+; SM100: {{.*FADD2.*}}
+
+*/
diff --git a/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu b/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu
new file mode 100644
index 00000000000..02d9c14f18b
--- /dev/null
+++ b/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++ in the CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/std/__simd_> // IWYU pragma: keep
+#include <cuda/std/array>
+
+namespace simd = cuda::std::simd;
+
+using Vec_f32_4 = simd::basic_vec<float, simd::fixed_size<4>>;
+
+extern "C" __global__ void test_operator_unary_minus_f32_4(const float* in, float* out)
+{
+  const cuda::std::array<float, 4> values{in[0], in[1], in[2], in[3]};
+
+  const Vec_f32_4 vec(values);
+  const Vec_f32_4 result = -vec;
+
+  out[0] = result[0];
+  out[1] = result[1];
+  out[2] = result[2];
+  out[3] = result[3];
+}
+
+/*
+
+; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_unary_minus_f32_4
+; SM100: {{.*FADD2.*}}
+; SM100: {{.*FADD2.*}}
+
+*/