From 691608bde0ce3a18395c41297ec9386a8f36ac79 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 30 Apr 2026 16:40:00 -0700 Subject: [PATCH 1/7] half/bfloat plus/multiply/fma --- .../__simd/specializations/fixed_size_vec.h | 32 +++++++-- libcudacxx/test/CMakeLists.txt | 1 + .../test/atomic_codegen/dump_and_check.bash | 6 +- libcudacxx/test/simd_codegen/CMakeLists.txt | 65 +++++++++++++++++++ libcudacxx/test/simd_codegen/fma_bf16.cu | 44 +++++++++++++ libcudacxx/test/simd_codegen/fma_f16.cu | 43 ++++++++++++ .../test/simd_codegen/multiplies_bf16.cu | 42 ++++++++++++ .../test/simd_codegen/multiplies_f16.cu | 41 ++++++++++++ libcudacxx/test/simd_codegen/plus_bf16.cu | 42 ++++++++++++ libcudacxx/test/simd_codegen/plus_f16.cu | 35 ++++++++++ 10 files changed, 344 insertions(+), 7 deletions(-) create mode 100644 libcudacxx/test/simd_codegen/CMakeLists.txt create mode 100644 libcudacxx/test/simd_codegen/fma_bf16.cu create mode 100644 libcudacxx/test/simd_codegen/fma_f16.cu create mode 100644 libcudacxx/test/simd_codegen/multiplies_bf16.cu create mode 100644 libcudacxx/test/simd_codegen/multiplies_f16.cu create mode 100644 libcudacxx/test/simd_codegen/plus_bf16.cu create mode 100644 libcudacxx/test/simd_codegen/plus_f16.cu diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h index 0dc5576d935..c4ede831227 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h @@ -21,7 +21,10 @@ # pragma system_header #endif // no system header +#include #include +#include +#include #include #include #include @@ -161,13 +164,32 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _SimdStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (__simd_size_type __i = 0; __i < 
_Np; ++__i) +#if 0 // _CCCL_HAS_NVFP16() + if constexpr (is_same_v<_Tp, ::__half> && _Np >= 2) { - __result.__data[__i] = (__lhs.__data[__i] + __rhs.__data[__i]); + constexpr auto __half_size = ::cuda::ceil_div(_Np, __simd_size_type{2}); + using __half2_storage = __simd_storage<::__half2, __fixed_size<__half_size>>; + using __half2_operations = __simd_operations<::__half2, __fixed_size<__half_size>>; + __half2_storage __lhs_half2; + __half2_storage __rhs_half2; + ::cuda::std::memcpy((void*) &__lhs_half2, (void*) &__lhs, sizeof(__lhs)); + ::cuda::std::memcpy((void*) &__rhs_half2, (void*) &__rhs, sizeof(__rhs)); + const auto __result_half2 = __half2_operations::__plus(__lhs_half2, __rhs_half2); + _SimdStorage __result; + ::cuda::std::memcpy((void*) &__result, (void*) &__result_half2, sizeof(__result)); + return __result; + } + else +#endif // _CCCL_HAS_NVFP16() + { + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] + __rhs.__data[__i]); + } + return __result; } - return __result; } [[nodiscard]] _CCCL_API static constexpr _SimdStorage diff --git a/libcudacxx/test/CMakeLists.txt b/libcudacxx/test/CMakeLists.txt index 039c917fa7d..20fb09610ed 100644 --- a/libcudacxx/test/CMakeLists.txt +++ b/libcudacxx/test/CMakeLists.txt @@ -98,3 +98,4 @@ endif() add_subdirectory(nvtarget) add_subdirectory(atomic_codegen) +add_subdirectory(simd_codegen) diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash index c66bb433637..0e43ed98cc3 100755 --- a/libcudacxx/test/atomic_codegen/dump_and_check.bash +++ b/libcudacxx/test/atomic_codegen/dump_and_check.bash @@ -1,9 +1,11 @@ #!/bin/bash set -e -## Usage: dump_and_check test.a test.cu PREFIX +## Usage: dump_and_check test.a test.cu PREFIXES [cuobjdump-mode] input_archive=$1 input_testfile=$2 input_prefix=$3 +dump_mode=${4:---dump-ptx} +filecheck=${FILECHECK:-FileCheck} 
-cuobjdump --dump-ptx $input_archive | FileCheck --match-full-lines --check-prefix $input_prefix $input_testfile +cuobjdump "${dump_mode}" "${input_archive}" | "${filecheck}" --match-full-lines --check-prefixes "${input_prefix}" "${input_testfile}" diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt new file mode 100644 index 00000000000..4c25e72b8d6 --- /dev/null +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -0,0 +1,65 @@ +add_custom_target(libcudacxx.test.simd.sass) + +find_program(filecheck NAMES FileCheck FileCheck-21 FileCheck-20 FileCheck-19 FileCheck-18) + +if (filecheck) + message("-- ${filecheck} found... building simd codegen tests") +else() + return() +endif() + +find_program(cuobjdump "cuobjdump" REQUIRED) +find_program(bash "bash" REQUIRED) + +set(libcudacxx_simd_codegen_tests) +if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") + file(GLOB libcudacxx_simd_codegen_tests "*.cu") +endif() + +set(simd_codegen_cuda_archs 80 90) +if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9) + list(APPEND simd_codegen_cuda_archs 100 120) +endif() + +function(simd_codegen_add_tests arch) + foreach (test_path IN LISTS libcudacxx_simd_codegen_tests) + cmake_path(GET test_path FILENAME test_file) + cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) + + set(target_name "simd_codegen_sm${arch}_${test_name}") + set(check_prefixes "SMXX,SM${arch}") + + add_library(${target_name} STATIC "${test_path}") + + set_target_properties( + ${target_name} + PROPERTIES CUDA_ARCHITECTURES "${arch}" + ) + + target_compile_options(${target_name} PRIVATE "-Wno-comment") + + target_include_directories( + ${target_name} + PRIVATE "${libcudacxx_SOURCE_DIR}/include" + ) + add_dependencies(libcudacxx.test.simd.sass ${target_name}) + + add_custom_command( + TARGET libcudacxx.test.simd.sass + POST_BUILD + # gersemi: off + COMMAND + ${CMAKE_COMMAND} -E env "FILECHECK=${filecheck}" + 
"${CMAKE_CURRENT_SOURCE_DIR}/../atomic_codegen/dump_and_check.bash" + $ + "${test_path}" + "${check_prefixes}" + --dump-sass + # gersemi: on + ) + endforeach() +endfunction() + +foreach (arch IN LISTS simd_codegen_cuda_archs) + simd_codegen_add_tests(${arch}) +endforeach() diff --git a/libcudacxx/test/simd_codegen/fma_bf16.cu b/libcudacxx/test/simd_codegen/fma_bf16.cu new file mode 100644 index 00000000000..1b02e8ba22a --- /dev/null +++ b/libcudacxx/test/simd_codegen/fma_bf16.cu @@ -0,0 +1,44 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVBF16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +extern "C" __global__ void test_fma_bf16_4( + const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, const __nv_bfloat16* add, __nv_bfloat16* out) +{ + const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> add_values{add[0], add[1], add[2], add[3]}; + + const Vec_bf16_4 lhs_vec(lhs_values); + const Vec_bf16_4 rhs_vec(rhs_values); + const Vec_bf16_4 add_vec(add_values); + const Vec_bf16_4 result = lhs_vec * rhs_vec + add_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_fma_bf16_4 +; SM80: {{.*HFMA2.*BF16.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM100: {{.*HFMA2.*BF16.*}} +; SM100: {{.*HFMA2.*BF16.*}} +; SM120: {{.*HFMA2.*BF16.*}} +; SM120: {{.*HFMA2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/fma_f16.cu b/libcudacxx/test/simd_codegen/fma_f16.cu new file mode 100644 index 00000000000..62af327f9c3 --- /dev/null +++ b/libcudacxx/test/simd_codegen/fma_f16.cu @@ -0,0 +1,43 @@ +#include // IWYU pragma: keep +#include + +#if 
_CCCL_HAS_NVFP16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; + +extern "C" __global__ void test_fma_f16_4(const __half* lhs, const __half* rhs, const __half* add, __half* out) +{ + const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + const cuda::std::array<__half, 4> add_values{add[0], add[1], add[2], add[3]}; + + const Vec_f16_4 lhs_vec(lhs_values); + const Vec_f16_4 rhs_vec(rhs_values); + const Vec_f16_4 add_vec(add_values); + const Vec_f16_4 result = lhs_vec * rhs_vec + add_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_fma_f16_4 +; SM80: {{.*HFMA2.*}} +; SM80: {{.*HFMA2.*}} +; SM90: {{.*HFMA2.*}} +; SM90: {{.*HFMA2.*}} +; SM100: {{.*HFMA2.*}} +; SM100: {{.*HFMA2.*}} +; SM120: {{.*HFMA2.*}} +; SM120: {{.*HFMA2.*}} + +*/ + +#endif // _CCCL_HAS_NVFP16() diff --git a/libcudacxx/test/simd_codegen/multiplies_bf16.cu b/libcudacxx/test/simd_codegen/multiplies_bf16.cu new file mode 100644 index 00000000000..37d0a3d9533 --- /dev/null +++ b/libcudacxx/test/simd_codegen/multiplies_bf16.cu @@ -0,0 +1,42 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVBF16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +extern "C" __global__ void +test_operator_multiplies_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, __nv_bfloat16* out) +{ + const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_bf16_4 lhs_vec(lhs_values); + const Vec_bf16_4 rhs_vec(rhs_values); + const Vec_bf16_4 result = lhs_vec * rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; 
+ out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_multiplies_bf16_4 +; SM80: {{.*HFMA2.*BF16.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HMUL2.*BF16.*}} +; SM100: {{.*HFMA2.*BF16.*}} +; SM100: {{.*HMUL2.*BF16.*}} +; SM120: {{.*HFMA2.*BF16.*}} +; SM120: {{.*HMUL2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/multiplies_f16.cu b/libcudacxx/test/simd_codegen/multiplies_f16.cu new file mode 100644 index 00000000000..06f40655f22 --- /dev/null +++ b/libcudacxx/test/simd_codegen/multiplies_f16.cu @@ -0,0 +1,41 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVFP16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; + +extern "C" __global__ void test_operator_multiplies_f16_4(const __half* lhs, const __half* rhs, __half* out) +{ + const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f16_4 lhs_vec(lhs_values); + const Vec_f16_4 rhs_vec(rhs_values); + const Vec_f16_4 result = lhs_vec * rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_multiplies_f16_4 +; SM80: {{.*(HMUL2|HFMA2).*}} +; SM80: {{.*(HMUL2|HFMA2).*}} +; SM90: {{.*(HMUL2|HFMA2).*}} +; SM90: {{.*(HMUL2|HFMA2).*}} +; SM100: {{.*(HMUL2|HFMA2).*}} +; SM100: {{.*(HMUL2|HFMA2).*}} +; SM120: {{.*(HMUL2|HFMA2).*}} +; SM120: {{.*(HMUL2|HFMA2).*}} + +*/ + +#endif // _CCCL_HAS_NVFP16() diff --git a/libcudacxx/test/simd_codegen/plus_bf16.cu b/libcudacxx/test/simd_codegen/plus_bf16.cu new file mode 100644 index 00000000000..dcffca128b0 --- /dev/null +++ b/libcudacxx/test/simd_codegen/plus_bf16.cu @@ -0,0 +1,42 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVBF16() + +#include + +namespace 
simd = cuda::std::simd; + +using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +extern "C" __global__ void +test_operator_plus_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, __nv_bfloat16* out) +{ + const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_bf16_4 lhs_vec(lhs_values); + const Vec_bf16_4 rhs_vec(rhs_values); + const Vec_bf16_4 result = lhs_vec + rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_bf16_4 +; SM80: {{.*HFMA2.*BF16.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HADD2.*BF16.*}} +; SM100: {{.*HFMA2.*BF16.*}} +; SM100: {{.*HADD2.*BF16.*}} +; SM120: {{.*HFMA2.*BF16.*}} +; SM120: {{.*HADD2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/plus_f16.cu b/libcudacxx/test/simd_codegen/plus_f16.cu new file mode 100644 index 00000000000..03bb80bbf5c --- /dev/null +++ b/libcudacxx/test/simd_codegen/plus_f16.cu @@ -0,0 +1,35 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVFP16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; + +extern "C" __global__ void test_operator_plus_f16_4(const __half* lhs, const __half* rhs, __half* out) +{ + const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f16_4 lhs_vec(lhs_values); + const Vec_f16_4 rhs_vec(rhs_values); + const Vec_f16_4 result = lhs_vec + rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_f16_4 +; SMXX: {{.*(HADD2|HFMA2).*}} +; SMXX: 
{{.*(HADD2|HFMA2).*}} + +*/ + +#endif // _CCCL_HAS_NVFP16() From fc02d50f7179c9455d3c4d7501ffca4fdd9112ea Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 30 Apr 2026 17:02:15 -0700 Subject: [PATCH 2/7] check comparison --- libcudacxx/test/simd_codegen/less_bf16.cu | 44 +++++++++++++++++++++++ libcudacxx/test/simd_codegen/less_f16.cu | 41 +++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 libcudacxx/test/simd_codegen/less_bf16.cu create mode 100644 libcudacxx/test/simd_codegen/less_f16.cu diff --git a/libcudacxx/test/simd_codegen/less_bf16.cu b/libcudacxx/test/simd_codegen/less_bf16.cu new file mode 100644 index 00000000000..bb61ebdc672 --- /dev/null +++ b/libcudacxx/test/simd_codegen/less_bf16.cu @@ -0,0 +1,44 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVBF16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +extern "C" __global__ void +test_less_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, bool* out) +{ + const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_bf16_4 lhs_vec(lhs_values); + const Vec_bf16_4 rhs_vec(rhs_values); + const auto result = lhs_vec < rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_less_bf16_4 +; SM80: {{.*FSETP\.LT.*}} +; SM80: {{.*FSETP\.LT.*}} +; SM80: {{.*FSETP\.LT.*}} +; SM80: {{.*FSETP\.LT.*}} +; SM90: {{.*HSETP2.*BF16.*}} +; SM90: {{.*HSETP2.*BF16.*}} +; SM100: {{.*HSETP2.*BF16.*}} +; SM100: {{.*HSETP2.*BF16.*}} +; SM120: {{.*HSETP2.*BF16.*}} +; SM120: {{.*HSETP2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/less_f16.cu b/libcudacxx/test/simd_codegen/less_f16.cu new file mode 100644 index 00000000000..408d90e46d3 --- 
/dev/null +++ b/libcudacxx/test/simd_codegen/less_f16.cu @@ -0,0 +1,41 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVFP16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; + +extern "C" __global__ void test_less_f16_4(const __half* lhs, const __half* rhs, bool* out) +{ + const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f16_4 lhs_vec(lhs_values); + const Vec_f16_4 rhs_vec(rhs_values); + const auto result = lhs_vec < rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_less_f16_4 +; SM80: {{.*HSETP2.*}} +; SM80: {{.*HSETP2.*}} +; SM90: {{.*HSETP2.*}} +; SM90: {{.*HSETP2.*}} +; SM100: {{.*HSETP2.*}} +; SM100: {{.*HSETP2.*}} +; SM120: {{.*HSETP2.*}} +; SM120: {{.*HSETP2.*}} + +*/ + +#endif // _CCCL_HAS_NVFP16() From 4ef18dbc9ef5785c04b8285b28e734541c3161f5 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 30 Apr 2026 18:17:13 -0700 Subject: [PATCH 3/7] test f32x2 --- .../__simd/specializations/fixed_size_vec.h | 14 ++++ .../std/__simd/specializations/intrinsic.h | 80 +++++++++++++++++++ libcudacxx/test/simd_codegen/CMakeLists.txt | 9 ++- libcudacxx/test/simd_codegen/fma_bf16.cu | 6 +- libcudacxx/test/simd_codegen/fma_f16.cu | 2 +- libcudacxx/test/simd_codegen/less_bf16.cu | 5 +- libcudacxx/test/simd_codegen/less_f16.cu | 2 +- .../test/simd_codegen/multiplies_bf16.cu | 2 +- .../test/simd_codegen/multiplies_f16.cu | 2 +- libcudacxx/test/simd_codegen/plus_bf16.cu | 2 +- libcudacxx/test/simd_codegen/plus_f16.cu | 2 +- libcudacxx/test/simd_codegen/plus_f32.cu | 33 ++++++++ 12 files changed, 146 insertions(+), 13 deletions(-) create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h create mode 100644 libcudacxx/test/simd_codegen/plus_f32.cu 
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h index c4ede831227..c87325688e6 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -164,6 +165,19 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (is_same_v<_Tp, float> && _Np >= 2) + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (_SimdStorage __result; // + ::cuda::std::simd::__plus_f32x2(__lhs.__data, __rhs.__data, __result.__data); + return __result;)) + } + } +#endif // _CCCL_HAS_SIMD_F32X2() + #if 0 // _CCCL_HAS_NVFP16() if constexpr (is_same_v<_Tp, ::__half> && _Np >= 2) { diff --git a/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h b/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h new file mode 100644 index 00000000000..3571b01248d --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_INTRINSIC_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_INTRINSIC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || __cccl_ptx_isa >= 860ULL +# define _CCCL_HAS_SIMD_F32X2() 1 +#else +# define _CCCL_HAS_SIMD_F32X2() 0 +#endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || __cccl_ptx_isa >= 860ULL + +#if _CCCL_HAS_SIMD_F32X2() + +# include + +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +_CCCL_API inline void __add_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + float& __result1, + float& __result2) noexcept +{ +# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) + const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); + __result1 = __result.x; + __result2 = __result.y; +# elif __cccl_ptx_isa >= 860ULL + asm("add.f32x2 {%0, %1}, {%2, %3}, {%4, %5};" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); +# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +} + +template +_CCCL_API constexpr void +__plus_f32x2(const float (&__lhs)[_Np], const float (&__rhs)[_Np], float (&__result)[_Np]) noexcept +{ + _CCCL_PRAGMA_UNROLL_FULL() + for (size_t __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__add_f32x2( + __lhs[__i], __lhs[__i + 1], __rhs[__i], __rhs[__i + 1], __result[__i], __result[__i + 1]); + } + if (_Np % 2 != 0) + { + __result[_Np - 1] = __lhs[_Np - 1] + __rhs[_Np - 1]; + } +} + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +# include + +#endif // _CCCL_HAS_SIMD_F32X2() +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_INTRINSIC_H diff --git 
a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt index 4c25e72b8d6..6e4250695d4 100644 --- a/libcudacxx/test/simd_codegen/CMakeLists.txt +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -1,6 +1,9 @@ add_custom_target(libcudacxx.test.simd.sass) -find_program(filecheck NAMES FileCheck FileCheck-21 FileCheck-20 FileCheck-19 FileCheck-18) +find_program( + filecheck + NAMES FileCheck FileCheck-21 FileCheck-20 FileCheck-19 FileCheck-18 +) if (filecheck) message("-- ${filecheck} found... building simd codegen tests") @@ -26,6 +29,10 @@ function(simd_codegen_add_tests arch) cmake_path(GET test_path FILENAME test_file) cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) + if (test_name STREQUAL "plus_f32" AND NOT arch EQUAL 100) + continue() + endif() + set(target_name "simd_codegen_sm${arch}_${test_name}") set(check_prefixes "SMXX,SM${arch}") diff --git a/libcudacxx/test/simd_codegen/fma_bf16.cu b/libcudacxx/test/simd_codegen/fma_bf16.cu index 1b02e8ba22a..e9f9182bd6e 100644 --- a/libcudacxx/test/simd_codegen/fma_bf16.cu +++ b/libcudacxx/test/simd_codegen/fma_bf16.cu @@ -3,14 +3,14 @@ #if _CCCL_HAS_NVBF16() -#include +# include namespace simd = cuda::std::simd; using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; -extern "C" __global__ void test_fma_bf16_4( - const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, const __nv_bfloat16* add, __nv_bfloat16* out) +extern "C" __global__ void +test_fma_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, const __nv_bfloat16* add, __nv_bfloat16* out) { const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; diff --git a/libcudacxx/test/simd_codegen/fma_f16.cu b/libcudacxx/test/simd_codegen/fma_f16.cu index 62af327f9c3..901d341fe46 100644 --- a/libcudacxx/test/simd_codegen/fma_f16.cu +++ b/libcudacxx/test/simd_codegen/fma_f16.cu 
@@ -3,7 +3,7 @@ #if _CCCL_HAS_NVFP16() -#include +# include namespace simd = cuda::std::simd; diff --git a/libcudacxx/test/simd_codegen/less_bf16.cu b/libcudacxx/test/simd_codegen/less_bf16.cu index bb61ebdc672..3fc6287a32c 100644 --- a/libcudacxx/test/simd_codegen/less_bf16.cu +++ b/libcudacxx/test/simd_codegen/less_bf16.cu @@ -3,14 +3,13 @@ #if _CCCL_HAS_NVBF16() -#include +# include namespace simd = cuda::std::simd; using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; -extern "C" __global__ void -test_less_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, bool* out) +extern "C" __global__ void test_less_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, bool* out) { const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; diff --git a/libcudacxx/test/simd_codegen/less_f16.cu b/libcudacxx/test/simd_codegen/less_f16.cu index 408d90e46d3..97e5070f5a3 100644 --- a/libcudacxx/test/simd_codegen/less_f16.cu +++ b/libcudacxx/test/simd_codegen/less_f16.cu @@ -3,7 +3,7 @@ #if _CCCL_HAS_NVFP16() -#include +# include namespace simd = cuda::std::simd; diff --git a/libcudacxx/test/simd_codegen/multiplies_bf16.cu b/libcudacxx/test/simd_codegen/multiplies_bf16.cu index 37d0a3d9533..c66bc326b60 100644 --- a/libcudacxx/test/simd_codegen/multiplies_bf16.cu +++ b/libcudacxx/test/simd_codegen/multiplies_bf16.cu @@ -3,7 +3,7 @@ #if _CCCL_HAS_NVBF16() -#include +# include namespace simd = cuda::std::simd; diff --git a/libcudacxx/test/simd_codegen/multiplies_f16.cu b/libcudacxx/test/simd_codegen/multiplies_f16.cu index 06f40655f22..aa0c32fbe35 100644 --- a/libcudacxx/test/simd_codegen/multiplies_f16.cu +++ b/libcudacxx/test/simd_codegen/multiplies_f16.cu @@ -3,7 +3,7 @@ #if _CCCL_HAS_NVFP16() -#include +# include namespace simd = cuda::std::simd; diff --git a/libcudacxx/test/simd_codegen/plus_bf16.cu 
b/libcudacxx/test/simd_codegen/plus_bf16.cu index dcffca128b0..4492a4edca1 100644 --- a/libcudacxx/test/simd_codegen/plus_bf16.cu +++ b/libcudacxx/test/simd_codegen/plus_bf16.cu @@ -3,7 +3,7 @@ #if _CCCL_HAS_NVBF16() -#include +# include namespace simd = cuda::std::simd; diff --git a/libcudacxx/test/simd_codegen/plus_f16.cu b/libcudacxx/test/simd_codegen/plus_f16.cu index 03bb80bbf5c..5a56242da89 100644 --- a/libcudacxx/test/simd_codegen/plus_f16.cu +++ b/libcudacxx/test/simd_codegen/plus_f16.cu @@ -3,7 +3,7 @@ #if _CCCL_HAS_NVFP16() -#include +# include namespace simd = cuda::std::simd; diff --git a/libcudacxx/test/simd_codegen/plus_f32.cu b/libcudacxx/test/simd_codegen/plus_f32.cu new file mode 100644 index 00000000000..b8b11732c55 --- /dev/null +++ b/libcudacxx/test/simd_codegen/plus_f32.cu @@ -0,0 +1,33 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_SIMD_F32X2() + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_plus_f32_4(const float* lhs, const float* rhs, float* out) +{ + const cuda::std::array lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f32_4 lhs_vec(lhs_values); + const Vec_f32_4 rhs_vec(rhs_values); + const Vec_f32_4 result = lhs_vec + rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ + +#endif // _CCCL_HAS_SIMD_F32X2() From 7bf2ed96a07acd5f25b98fd4ee5b1c72ee12a4f5 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 30 Apr 2026 18:25:02 -0700 Subject: [PATCH 4/7] fix _CCCL_DEVICE_API --- .../cuda/std/__simd/specializations/intrinsic.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h 
b/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h index 3571b01248d..016f43b55eb 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h @@ -37,7 +37,7 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD -_CCCL_API inline void __add_f32x2( +_CCCL_DEVICE_API inline void __add_f32x2( const float __lhs1, const float __lhs2, const float __rhs1, @@ -46,9 +46,12 @@ _CCCL_API inline void __add_f32x2( float& __result2) noexcept { # if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) - const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); - __result1 = __result.x; - __result2 = __result.y; + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on # elif __cccl_ptx_isa >= 860ULL asm("add.f32x2 {%0, %1}, {%2, %3}, {%4, %5};" : "=f"(__result1), "=f"(__result2) @@ -57,7 +60,7 @@ _CCCL_API inline void __add_f32x2( } template -_CCCL_API constexpr void +_CCCL_DEVICE_API constexpr void __plus_f32x2(const float (&__lhs)[_Np], const float (&__rhs)[_Np], float (&__result)[_Np]) noexcept { _CCCL_PRAGMA_UNROLL_FULL() From 3dd3298e83a6c34709bdfa0b4b97ffd82054b764 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 1 May 2026 11:32:05 -0700 Subject: [PATCH 5/7] extend f32x2 optimization to sub/increment/decrement/unary minus --- .../__simd/specializations/fixed_size_mask.h | 1 + .../specializations/fixed_size_storage.h | 68 +++++ .../__simd/specializations/fixed_size_vec.h | 122 ++++----- .../specializations/fp32x2_intrinsics.h | 255 ++++++++++++++++++ .../std/__simd/specializations/intrinsic.h | 83 ------ libcudacxx/test/simd_codegen/CMakeLists.txt | 2 +- .../test/simd_codegen/decrement_f32x2.cu | 31 +++ .../test/simd_codegen/increment_f32x2.cu | 31 +++ libcudacxx/test/simd_codegen/minus_f32x2.cu | 33 +++ 
.../{plus_f32.cu => plus_f32x2.cu} | 0 .../test/simd_codegen/unary_minus_f32x2.cu | 31 +++ 11 files changed, 513 insertions(+), 144 deletions(-) create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h delete mode 100644 libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h create mode 100644 libcudacxx/test/simd_codegen/decrement_f32x2.cu create mode 100644 libcudacxx/test/simd_codegen/increment_f32x2.cu create mode 100644 libcudacxx/test/simd_codegen/minus_f32x2.cu rename libcudacxx/test/simd_codegen/{plus_f32.cu => plus_f32x2.cu} (100%) create mode 100644 libcudacxx/test/simd_codegen/unary_minus_f32x2.cu diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h index a60aa034bbc..ca07cee583b 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h new file mode 100644 index 00000000000..fc88e3ff7e2 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_STORAGE_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_STORAGE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +template <__simd_size_type _Np> +struct __fixed_size +{ + static_assert(_Np > 0, "_Np must be greater than 0"); + + static constexpr __simd_size_type __simd_size = _Np; +}; + +// Element-per-slot simd storage for fixed_size ABI +template +struct __simd_storage<_Tp, __fixed_size<_Np>> +{ + using value_type = _Tp; + + _Tp __data[_Np]{}; + + _CCCL_HIDE_FROM_ABI constexpr __simd_storage() = default; + _CCCL_HIDE_FROM_ABI constexpr __simd_storage(const __simd_storage&) = default; + _CCCL_HIDE_FROM_ABI constexpr __simd_storage& operator=(const __simd_storage&) = default; + + [[nodiscard]] _CCCL_API constexpr _Tp __get(const __simd_size_type __idx) const noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + return __data[__idx]; + } + + _CCCL_API constexpr void __set(const __simd_size_type __idx, const _Tp __v) noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + __data[__idx] = __v; + } +}; + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_STORAGE_H diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h index c87325688e6..a35d08baf48 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h +++ 
b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h @@ -21,12 +21,9 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include #include -#include +#include +#include #include #include @@ -34,39 +31,6 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD -template <__simd_size_type _Np> -struct __fixed_size -{ - static_assert(_Np > 0, "_Np must be greater than 0"); - - static constexpr __simd_size_type __simd_size = _Np; -}; - -// Element-per-slot simd storage for fixed_size ABI -template -struct __simd_storage<_Tp, __fixed_size<_Np>> -{ - using value_type = _Tp; - - _Tp __data[_Np]{}; - - _CCCL_HIDE_FROM_ABI constexpr __simd_storage() = default; - _CCCL_HIDE_FROM_ABI constexpr __simd_storage(const __simd_storage&) = default; - _CCCL_HIDE_FROM_ABI constexpr __simd_storage& operator=(const __simd_storage&) = default; - - [[nodiscard]] _CCCL_API constexpr _Tp __get(const __simd_size_type __idx) const noexcept - { - _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); - return __data[__idx]; - } - - _CCCL_API constexpr void __set(const __simd_size_type __idx, const _Tp __v) noexcept - { - _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); - __data[__idx] = __v; - } -}; - // Simd operations for fixed_size ABI template struct __simd_operations<_Tp, __fixed_size<_Np>> @@ -108,6 +72,20 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (is_same_v<_Tp, float> && _Np >= 2) + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __one = __broadcast(1.0f); + __s = ::cuda::std::simd::__plus_f32x2(__s, __one); + return;)) + // clang-format on + } + } +#endif // _CCCL_HAS_SIMD_F32X2() _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) { @@ -117,6 +95,20 @@ 
struct __simd_operations<_Tp, __fixed_size<_Np>> _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (is_same_v<_Tp, float> && _Np >= 2) + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __one = __broadcast(1.0f); + __s = ::cuda::std::simd::__minus_f32x2(__s, __one); + return;)) + // clang-format on + } + } +#endif // _CCCL_HAS_SIMD_F32X2() _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) { @@ -150,6 +142,19 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to unsigned type [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (is_same_v<_Tp, float> && _Np >= 2) + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __zero = __broadcast(0.0f); + return ::cuda::std::simd::__minus_f32x2(__zero, __s);)) + // clang-format on + } + } +#endif // _CCCL_HAS_SIMD_F32X2() _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) @@ -170,31 +175,10 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> { if constexpr (is_same_v<_Tp, float> && _Np >= 2) { - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, - (_SimdStorage __result; // - ::cuda::std::simd::__plus_f32x2(__lhs.__data, __rhs.__data, __result.__data); - return __result;)) + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__plus_f32x2(__lhs, __rhs);)) } } #endif // _CCCL_HAS_SIMD_F32X2() - -#if 0 // _CCCL_HAS_NVFP16() - if constexpr (is_same_v<_Tp, ::__half> && _Np >= 2) - { - constexpr auto __half_size = ::cuda::ceil_div(_Np, __simd_size_type{2}); - using __half2_storage = __simd_storage<::__half2, __fixed_size<__half_size>>; - using __half2_operations = __simd_operations<::__half2, 
__fixed_size<__half_size>>; - __half2_storage __lhs_half2; - __half2_storage __rhs_half2; - ::cuda::std::memcpy((void*) &__lhs_half2, (void*) &__lhs, sizeof(__lhs)); - ::cuda::std::memcpy((void*) &__rhs_half2, (void*) &__rhs, sizeof(__rhs)); - const auto __result_half2 = __half2_operations::__plus(__lhs_half2, __rhs_half2); - _SimdStorage __result; - ::cuda::std::memcpy((void*) &__result, (void*) &__result_half2, sizeof(__result)); - return __result; - } - else -#endif // _CCCL_HAS_NVFP16() { _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() @@ -209,6 +193,15 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __minus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (is_same_v<_Tp, float> && _Np >= 2) + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__minus_f32x2(__lhs, __rhs);)) + } + } +#endif // _CCCL_HAS_SIMD_F32X2() _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) @@ -221,6 +214,15 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (is_same_v<_Tp, float> && _Np >= 2) + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__multiplies_f32x2(__lhs, __rhs);)) + } + } +#endif // _CCCL_HAS_SIMD_F32X2() _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h b/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h new file mode 100644 index 00000000000..8fdd7773c1e --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h @@ -0,0 +1,255 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FP32X2_INTRINSICS_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_FP32X2_INTRINSICS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || (__cccl_ptx_isa >= 860ULL) +# define _CCCL_HAS_SIMD_F32X2() 1 +#else +# define _CCCL_HAS_SIMD_F32X2() 0 +#endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || (__cccl_ptx_isa >= 860ULL) + +#if _CCCL_HAS_SIMD_F32X2() + +# include + +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +_CCCL_DEVICE_API inline void __add_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + float& __result1, + float& __result2) noexcept +{ +# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on +# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 + asm("{.reg .b64 __lhs, __rhs, __result;" + "mov.b64 __lhs, {%2, %3};" + "mov.b64 __rhs, {%4, %5};" + "add.f32x2 __result, __lhs, __rhs;" + "mov.b64 {%0, %1}, __result;}" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); +# endif // 
_CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +} + +_CCCL_DEVICE_API inline void __mul_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + float& __result1, + float& __result2) noexcept +{ +# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = ::__fmul2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on +# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 + asm("{.reg .b64 __lhs, __rhs, __result;" + "mov.b64 __lhs, {%2, %3};" + "mov.b64 __rhs, {%4, %5};" + "mul.f32x2 __result, __lhs, __rhs;" + "mov.b64 {%0, %1}, __result;}" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); +# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +} + +_CCCL_DEVICE_API inline void __sub_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + float& __result1, + float& __result2) noexcept +{ +# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{-__rhs1, -__rhs2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on +# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (asm("{.reg .b64 __lhs, __rhs, __result;" + "mov.b64 __lhs, {%2, %3};" + "mov.b64 __rhs, {%4, %5};" + "sub.f32x2 __result, __lhs, __rhs;" + "mov.b64 {%0, %1}, __result;}" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2));)) + // clang-format on +# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +} + +_CCCL_DEVICE_API inline void __fma_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + const float __add1, + const float __add2, + float& __result1, + float& __result2) noexcept +{ +# 
if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = + ::__ffma2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}, ::float2{__add1, __add2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on +# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 + asm("{.reg .b64 __lhs, __rhs, __add, __result;" + "mov.b64 __lhs, {%2, %3};" + "mov.b64 __rhs, {%4, %5};" + "mov.b64 __add, {%6, %7};" + "fma.rn.f32x2 __result, __lhs, __rhs, __add;" + "mov.b64 {%0, %1}, __result;}" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2), "f"(__add1), "f"(__add2)); +# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +} + +template <__simd_size_type _Np> +using __simd_storage_f32 = __simd_storage>; + +template <__simd_size_type _Np> +[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np> +__plus_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept +{ + __simd_storage_f32<_Np> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__add_f32x2( + __lhs.__data[__i], + __lhs.__data[__i + 1], + __rhs.__data[__i], + __rhs.__data[__i + 1], + __result.__data[__i], + __result.__data[__i + 1]); + } + if (_Np % 2 != 0) + { + __result.__data[_Np - 1] = __lhs.__data[_Np - 1] + __rhs.__data[_Np - 1]; + } + return __result; +} + +template <__simd_size_type _Np> +[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np> +__minus_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept +{ + __simd_storage_f32<_Np> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__sub_f32x2( + __lhs.__data[__i], + __lhs.__data[__i + 1], + __rhs.__data[__i], + __rhs.__data[__i + 1], + __result.__data[__i], + __result.__data[__i + 1]); + } + if (_Np % 2 != 0) + 
{ + __result.__data[_Np - 1] = __lhs.__data[_Np - 1] - __rhs.__data[_Np - 1]; + } + return __result; +} + +template <__simd_size_type _Np> +[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np> +__multiplies_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept +{ + __simd_storage_f32<_Np> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__mul_f32x2( + __lhs.__data[__i], + __lhs.__data[__i + 1], + __rhs.__data[__i], + __rhs.__data[__i + 1], + __result.__data[__i], + __result.__data[__i + 1]); + } + if (_Np % 2 != 0) + { + __result.__data[_Np - 1] = __lhs.__data[_Np - 1] * __rhs.__data[_Np - 1]; + } + return __result; +} + +template <__simd_size_type _Np> +[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np> +__fma_f32x2(const __simd_storage_f32<_Np>& __lhs, + const __simd_storage_f32<_Np>& __rhs, + const __simd_storage_f32<_Np>& __add) noexcept +{ + __simd_storage_f32<_Np> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__fma_f32x2( + __lhs.__data[__i], + __lhs.__data[__i + 1], + __rhs.__data[__i], + __rhs.__data[__i + 1], + __add.__data[__i], + __add.__data[__i + 1], + __result.__data[__i], + __result.__data[__i + 1]); + } + if (_Np % 2 != 0) + { + __result.__data[_Np - 1] = __lhs.__data[_Np - 1] * __rhs.__data[_Np - 1] + __add.__data[_Np - 1]; + } + return __result; +} + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +# include + +#endif // _CCCL_HAS_SIMD_F32X2() +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FP32X2_INTRINSICS_H diff --git a/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h b/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h deleted file mode 100644 index 016f43b55eb..00000000000 --- a/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h +++ /dev/null @@ -1,83 +0,0 @@ 
-//===----------------------------------------------------------------------===// -// -// Part of libcu++ in the CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_INTRINSIC_H -#define _CUDA_STD___SIMD_SPECIALIZATIONS_INTRINSIC_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || __cccl_ptx_isa >= 860ULL -# define _CCCL_HAS_SIMD_F32X2() 1 -#else -# define _CCCL_HAS_SIMD_F32X2() 0 -#endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || __cccl_ptx_isa >= 860ULL - -#if _CCCL_HAS_SIMD_F32X2() - -# include - -# include - -# include - -_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD - -_CCCL_DEVICE_API inline void __add_f32x2( - const float __lhs1, - const float __lhs2, - const float __rhs1, - const float __rhs2, - float& __result1, - float& __result2) noexcept -{ -# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) - // clang-format off - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, - (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); - __result1 = __result.x; - __result2 = __result.y;)) - // clang-format on -# elif __cccl_ptx_isa >= 860ULL - asm("add.f32x2 {%0, %1}, {%2, %3}, {%4, %5};" - : "=f"(__result1), "=f"(__result2) - : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); -# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) -} - -template -_CCCL_DEVICE_API constexpr void -__plus_f32x2(const float (&__lhs)[_Np], const float (&__rhs)[_Np], float 
(&__result)[_Np]) noexcept -{ - _CCCL_PRAGMA_UNROLL_FULL() - for (size_t __i = 0; __i < (_Np / 2) * 2; __i += 2) - { - ::cuda::std::simd::__add_f32x2( - __lhs[__i], __lhs[__i + 1], __rhs[__i], __rhs[__i + 1], __result[__i], __result[__i + 1]); - } - if (_Np % 2 != 0) - { - __result[_Np - 1] = __lhs[_Np - 1] + __rhs[_Np - 1]; - } -} - -_CCCL_END_NAMESPACE_CUDA_STD_SIMD - -# include - -#endif // _CCCL_HAS_SIMD_F32X2() -#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_INTRINSIC_H diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt index 6e4250695d4..1ef17d67e24 100644 --- a/libcudacxx/test/simd_codegen/CMakeLists.txt +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -29,7 +29,7 @@ function(simd_codegen_add_tests arch) cmake_path(GET test_path FILENAME test_file) cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) - if (test_name STREQUAL "plus_f32" AND NOT arch EQUAL 100) + if (test_name MATCHES "_f32x2$" AND NOT arch EQUAL 100) continue() endif() diff --git a/libcudacxx/test/simd_codegen/decrement_f32x2.cu b/libcudacxx/test/simd_codegen/decrement_f32x2.cu new file mode 100644 index 00000000000..14272c13c0c --- /dev/null +++ b/libcudacxx/test/simd_codegen/decrement_f32x2.cu @@ -0,0 +1,31 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_SIMD_F32X2() + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_decrement_f32_4(const float* in, float* out) +{ + const cuda::std::array values{in[0], in[1], in[2], in[3]}; + + Vec_f32_4 vec(values); + --vec; + + out[0] = vec[0]; + out[1] = vec[1]; + out[2] = vec[2]; + out[3] = vec[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_decrement_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ + +#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/increment_f32x2.cu b/libcudacxx/test/simd_codegen/increment_f32x2.cu new file mode 100644 
index 00000000000..58ffc9e6225 --- /dev/null +++ b/libcudacxx/test/simd_codegen/increment_f32x2.cu @@ -0,0 +1,31 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_SIMD_F32X2() + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_increment_f32_4(const float* in, float* out) +{ + const cuda::std::array values{in[0], in[1], in[2], in[3]}; + + Vec_f32_4 vec(values); + ++vec; + + out[0] = vec[0]; + out[1] = vec[1]; + out[2] = vec[2]; + out[3] = vec[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_increment_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ + +#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/minus_f32x2.cu b/libcudacxx/test/simd_codegen/minus_f32x2.cu new file mode 100644 index 00000000000..8393de408aa --- /dev/null +++ b/libcudacxx/test/simd_codegen/minus_f32x2.cu @@ -0,0 +1,33 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_SIMD_F32X2() + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_minus_f32_4(const float* lhs, const float* rhs, float* out) +{ + const cuda::std::array lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f32_4 lhs_vec(lhs_values); + const Vec_f32_4 rhs_vec(rhs_values); + const Vec_f32_4 result = lhs_vec - rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_minus_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ + +#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/plus_f32.cu b/libcudacxx/test/simd_codegen/plus_f32x2.cu similarity index 100% rename from libcudacxx/test/simd_codegen/plus_f32.cu rename to libcudacxx/test/simd_codegen/plus_f32x2.cu diff --git 
a/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu b/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu new file mode 100644 index 00000000000..d70e7b26b83 --- /dev/null +++ b/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu @@ -0,0 +1,31 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_SIMD_F32X2() + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_unary_minus_f32_4(const float* in, float* out) +{ + const cuda::std::array values{in[0], in[1], in[2], in[3]}; + + const Vec_f32_4 vec(values); + const Vec_f32_4 result = -vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_unary_minus_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ + +#endif // _CCCL_HAS_SIMD_F32X2() From 6a29aa2c60bf4e53cf05d545a0871f5298033241 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 1 May 2026 14:55:29 -0700 Subject: [PATCH 6/7] address comments --- .../specializations/fixed_size_storage.h | 4 --- .../__simd/specializations/fixed_size_vec.h | 4 +-- .../specializations/fp32x2_intrinsics.h | 33 +++++++++---------- .../test/atomic_codegen/dump_and_check.bash | 10 +++--- libcudacxx/test/simd_codegen/CMakeLists.txt | 12 ++++++- .../test/simd_codegen/decrement_f32x2.cu | 14 +++++--- libcudacxx/test/simd_codegen/fma_bf16.cu | 10 ++++++ libcudacxx/test/simd_codegen/fma_f16.cu | 10 ++++++ .../test/simd_codegen/increment_f32x2.cu | 14 +++++--- libcudacxx/test/simd_codegen/less_bf16.cu | 10 ++++++ libcudacxx/test/simd_codegen/less_f16.cu | 10 ++++++ libcudacxx/test/simd_codegen/minus_f32x2.cu | 14 +++++--- .../test/simd_codegen/multiplies_bf16.cu | 10 ++++++ .../test/simd_codegen/multiplies_f16.cu | 10 ++++++ libcudacxx/test/simd_codegen/plus_bf16.cu | 10 ++++++ libcudacxx/test/simd_codegen/plus_f16.cu | 10 ++++++ libcudacxx/test/simd_codegen/plus_f32x2.cu | 14 +++++--- 
.../test/simd_codegen/unary_minus_f32x2.cu | 14 +++++--- 18 files changed, 164 insertions(+), 49 deletions(-) diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h index fc88e3ff7e2..846142cb62f 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h @@ -44,10 +44,6 @@ struct __simd_storage<_Tp, __fixed_size<_Np>> _Tp __data[_Np]{}; - _CCCL_HIDE_FROM_ABI constexpr __simd_storage() = default; - _CCCL_HIDE_FROM_ABI constexpr __simd_storage(const __simd_storage&) = default; - _CCCL_HIDE_FROM_ABI constexpr __simd_storage& operator=(const __simd_storage&) = default; - [[nodiscard]] _CCCL_API constexpr _Tp __get(const __simd_size_type __idx) const noexcept { _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h index a35d08baf48..16bf2fbd459 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h @@ -80,7 +80,7 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (constexpr _SimdStorage __one = __broadcast(1.0f); - __s = ::cuda::std::simd::__plus_f32x2(__s, __one); + __s = ::cuda::std::simd::__plus_f32x2(__s, __one); return;)) // clang-format on } @@ -103,7 +103,7 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (constexpr _SimdStorage __one = __broadcast(1.0f); - __s = ::cuda::std::simd::__minus_f32x2(__s, __one); + __s = ::cuda::std::simd::__minus_f32x2(__s, __one); return;)) // clang-format on } diff --git 
a/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h b/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h index 8fdd7773c1e..ae80855c662 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h @@ -21,11 +21,10 @@ # pragma system_header #endif // no system header -#if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || (__cccl_ptx_isa >= 860ULL) -# define _CCCL_HAS_SIMD_F32X2() 1 -#else -# define _CCCL_HAS_SIMD_F32X2() 0 -#endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || (__cccl_ptx_isa >= 860ULL) +#define _CCCL_HAS_SIMD_F32X2_INTRINSICS() (_CCCL_CUDACC_AT_LEAST(12, 8) && _CCCL_HAS_CTK()) +#define _CCCL_HAS_SIMD_F32X2_PTX() (__cccl_ptx_isa >= 860ULL) + +#define _CCCL_HAS_SIMD_F32X2() (_CCCL_HAS_SIMD_F32X2_INTRINSICS() || _CCCL_HAS_SIMD_F32X2_PTX()) #if _CCCL_HAS_SIMD_F32X2() @@ -45,14 +44,14 @@ _CCCL_DEVICE_API inline void __add_f32x2( float& __result1, float& __result2) noexcept { -# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# if _CCCL_HAS_SIMD_F32X2_INTRINSICS() // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); __result1 = __result.x; __result2 = __result.y;)) // clang-format on -# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 +# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6 asm("{.reg .b64 __lhs, __rhs, __result;" "mov.b64 __lhs, {%2, %3};" "mov.b64 __rhs, {%4, %5};" @@ -60,7 +59,7 @@ _CCCL_DEVICE_API inline void __add_f32x2( "mov.b64 {%0, %1}, __result;}" : "=f"(__result1), "=f"(__result2) : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); -# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS() } _CCCL_DEVICE_API inline void __mul_f32x2( @@ -71,14 +70,14 @@ _CCCL_DEVICE_API inline void __mul_f32x2( float& __result1, float& __result2) noexcept { -# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# if 
_CCCL_HAS_SIMD_F32X2_INTRINSICS() // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (const auto __result = ::__fmul2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); __result1 = __result.x; __result2 = __result.y;)) // clang-format on -# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 +# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6 asm("{.reg .b64 __lhs, __rhs, __result;" "mov.b64 __lhs, {%2, %3};" "mov.b64 __rhs, {%4, %5};" @@ -86,7 +85,7 @@ _CCCL_DEVICE_API inline void __mul_f32x2( "mov.b64 {%0, %1}, __result;}" : "=f"(__result1), "=f"(__result2) : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); -# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS() } _CCCL_DEVICE_API inline void __sub_f32x2( @@ -97,14 +96,14 @@ _CCCL_DEVICE_API inline void __sub_f32x2( float& __result1, float& __result2) noexcept { -# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# if _CCCL_HAS_SIMD_F32X2_INTRINSICS() // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{-__rhs1, -__rhs2}); __result1 = __result.x; __result2 = __result.y;)) // clang-format on -# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 +# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6 // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (asm("{.reg .b64 __lhs, __rhs, __result;" @@ -115,7 +114,7 @@ _CCCL_DEVICE_API inline void __sub_f32x2( : "=f"(__result1), "=f"(__result2) : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2));)) // clang-format on -# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS() } _CCCL_DEVICE_API inline void __fma_f32x2( @@ -128,7 +127,7 @@ _CCCL_DEVICE_API inline void __fma_f32x2( float& __result1, float& __result2) noexcept { -# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# if _CCCL_HAS_SIMD_F32X2_INTRINSICS() // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (const auto __result = @@ -136,7 +135,7 @@ _CCCL_DEVICE_API inline void 
__fma_f32x2( __result1 = __result.x; __result2 = __result.y;)) // clang-format on -# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 +# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6 asm("{.reg .b64 __lhs, __rhs, __add, __result;" "mov.b64 __lhs, {%2, %3};" "mov.b64 __rhs, {%4, %5};" @@ -145,7 +144,7 @@ _CCCL_DEVICE_API inline void __fma_f32x2( "mov.b64 {%0, %1}, __result;}" : "=f"(__result1), "=f"(__result2) : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2), "f"(__add1), "f"(__add2)); -# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS() } template <__simd_size_type _Np> diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash index 51e9978dc57..525c188700e 100755 --- a/libcudacxx/test/atomic_codegen/dump_and_check.bash +++ b/libcudacxx/test/atomic_codegen/dump_and_check.bash @@ -2,10 +2,10 @@ set -euo pipefail ## Usage: dump_and_check test.a test.cu PREFIXES [cuobjdump-mode] -input_archive=$1 -input_testfile=$2 -input_prefix=$3 -dump_mode=${4:---dump-ptx} -filecheck=${FILECHECK:-FileCheck} +input_archive="${1}" +input_testfile="${2}" +input_prefix="${3}" +dump_mode="${4:---dump-ptx}" +filecheck="${FILECHECK:-FileCheck}" cuobjdump "${dump_mode}" "${input_archive}" | "${filecheck}" --match-full-lines --check-prefixes "${input_prefix}" "${input_testfile}" diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt index 1ef17d67e24..ab882c76ef0 100644 --- a/libcudacxx/test/simd_codegen/CMakeLists.txt +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -1,3 +1,13 @@ +##===----------------------------------------------------------------------===## +## +## Part of libcu++ in the CUDA C++ Core Libraries, +## under the Apache License v2.0 with LLVM Exceptions. +## See https://llvm.org/LICENSE.txt for license information. 
+## SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +## SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +## +##===----------------------------------------------------------------------===## + add_custom_target(libcudacxx.test.simd.sass) find_program( @@ -20,7 +30,7 @@ if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") endif() set(simd_codegen_cuda_archs 80 90) -if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9) +if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) list(APPEND simd_codegen_cuda_archs 100 120) endif() diff --git a/libcudacxx/test/simd_codegen/decrement_f32x2.cu b/libcudacxx/test/simd_codegen/decrement_f32x2.cu index 14272c13c0c..41ec986c9c8 100644 --- a/libcudacxx/test/simd_codegen/decrement_f32x2.cu +++ b/libcudacxx/test/simd_codegen/decrement_f32x2.cu @@ -1,8 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include -#if _CCCL_HAS_SIMD_F32X2() - namespace simd = cuda::std::simd; using Vec_f32_4 = simd::basic_vec>; @@ -27,5 +35,3 @@ extern "C" __global__ void test_operator_decrement_f32_4(const float* in, float* ; SM100: {{.*FADD2.*}} */ - -#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/fma_bf16.cu b/libcudacxx/test/simd_codegen/fma_bf16.cu index e9f9182bd6e..f608f209e09 100644 --- a/libcudacxx/test/simd_codegen/fma_bf16.cu +++ b/libcudacxx/test/simd_codegen/fma_bf16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/fma_f16.cu b/libcudacxx/test/simd_codegen/fma_f16.cu index 901d341fe46..a0dad310ac9 100644 --- a/libcudacxx/test/simd_codegen/fma_f16.cu +++ b/libcudacxx/test/simd_codegen/fma_f16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/increment_f32x2.cu b/libcudacxx/test/simd_codegen/increment_f32x2.cu index 58ffc9e6225..9c696f11e40 100644 --- a/libcudacxx/test/simd_codegen/increment_f32x2.cu +++ b/libcudacxx/test/simd_codegen/increment_f32x2.cu @@ -1,8 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include -#if _CCCL_HAS_SIMD_F32X2() - namespace simd = cuda::std::simd; using Vec_f32_4 = simd::basic_vec>; @@ -27,5 +35,3 @@ extern "C" __global__ void test_operator_increment_f32_4(const float* in, float* ; SM100: {{.*FADD2.*}} */ - -#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/less_bf16.cu b/libcudacxx/test/simd_codegen/less_bf16.cu index 3fc6287a32c..fff63e06d1c 100644 --- a/libcudacxx/test/simd_codegen/less_bf16.cu +++ b/libcudacxx/test/simd_codegen/less_bf16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/less_f16.cu b/libcudacxx/test/simd_codegen/less_f16.cu index 97e5070f5a3..beef111b99e 100644 --- a/libcudacxx/test/simd_codegen/less_f16.cu +++ b/libcudacxx/test/simd_codegen/less_f16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/minus_f32x2.cu b/libcudacxx/test/simd_codegen/minus_f32x2.cu index 8393de408aa..2c00e62b9ee 100644 --- a/libcudacxx/test/simd_codegen/minus_f32x2.cu +++ b/libcudacxx/test/simd_codegen/minus_f32x2.cu @@ -1,8 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include -#if _CCCL_HAS_SIMD_F32X2() - namespace simd = cuda::std::simd; using Vec_f32_4 = simd::basic_vec>; @@ -29,5 +37,3 @@ extern "C" __global__ void test_operator_minus_f32_4(const float* lhs, const flo ; SM100: {{.*FADD2.*}} */ - -#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/multiplies_bf16.cu b/libcudacxx/test/simd_codegen/multiplies_bf16.cu index c66bc326b60..3036c4fd526 100644 --- a/libcudacxx/test/simd_codegen/multiplies_bf16.cu +++ b/libcudacxx/test/simd_codegen/multiplies_bf16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/multiplies_f16.cu b/libcudacxx/test/simd_codegen/multiplies_f16.cu index aa0c32fbe35..bf721009051 100644 --- a/libcudacxx/test/simd_codegen/multiplies_f16.cu +++ b/libcudacxx/test/simd_codegen/multiplies_f16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/plus_bf16.cu b/libcudacxx/test/simd_codegen/plus_bf16.cu index 4492a4edca1..3d192fdad83 100644 --- a/libcudacxx/test/simd_codegen/plus_bf16.cu +++ b/libcudacxx/test/simd_codegen/plus_bf16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/plus_f16.cu b/libcudacxx/test/simd_codegen/plus_f16.cu index 5a56242da89..4a044a19d1c 100644 --- a/libcudacxx/test/simd_codegen/plus_f16.cu +++ b/libcudacxx/test/simd_codegen/plus_f16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/plus_f32x2.cu b/libcudacxx/test/simd_codegen/plus_f32x2.cu index b8b11732c55..b0caf15341b 100644 --- a/libcudacxx/test/simd_codegen/plus_f32x2.cu +++ b/libcudacxx/test/simd_codegen/plus_f32x2.cu @@ -1,8 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include -#if _CCCL_HAS_SIMD_F32X2() - namespace simd = cuda::std::simd; using Vec_f32_4 = simd::basic_vec>; @@ -29,5 +37,3 @@ extern "C" __global__ void test_operator_plus_f32_4(const float* lhs, const floa ; SM100: {{.*FADD2.*}} */ - -#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu b/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu index d70e7b26b83..02d9c14f18b 100644 --- a/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu +++ b/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu @@ -1,8 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include -#if _CCCL_HAS_SIMD_F32X2() - namespace simd = cuda::std::simd; using Vec_f32_4 = simd::basic_vec>; @@ -27,5 +35,3 @@ extern "C" __global__ void test_operator_unary_minus_f32_4(const float* in, floa ; SM100: {{.*FADD2.*}} */ - -#endif // _CCCL_HAS_SIMD_F32X2() From 58e44c59e9bbbf30ccace8971516db94c11bd678 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 1 May 2026 17:05:46 -0700 Subject: [PATCH 7/7] improve organization --- libcudacxx/include/cuda/std/__fwd/simd.h | 2 +- .../include/cuda/std/__simd/basic_vec.h | 1 + .../specializations/fixed_size_float_vec.h | 128 ++++++++++++++++++ .../__simd/specializations/fixed_size_vec.h | 98 +++----------- .../test/atomic_codegen/dump_and_check.bash | 2 +- libcudacxx/test/simd_codegen/CMakeLists.txt | 41 ++++-- 6 files changed, 182 insertions(+), 90 deletions(-) create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h diff --git a/libcudacxx/include/cuda/std/__fwd/simd.h b/libcudacxx/include/cuda/std/__fwd/simd.h index 282d5288914..9619f0c8998 100644 --- a/libcudacxx/include/cuda/std/__fwd/simd.h +++ b/libcudacxx/include/cuda/std/__fwd/simd.h @@ -46,7 +46,7 @@ using mask = basic_mask>; template struct __simd_storage; -template +template struct __simd_operations; template diff --git a/libcudacxx/include/cuda/std/__simd/basic_vec.h b/libcudacxx/include/cuda/std/__simd/basic_vec.h index 8bda465ed17..f5f1459ecf3 100644 --- a/libcudacxx/include/cuda/std/__simd/basic_vec.h +++ b/libcudacxx/include/cuda/std/__simd/basic_vec.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h new file mode 100644 index 00000000000..b5378fb21e1 --- /dev/null +++ 
b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h @@ -0,0 +1,128 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_FLOAT_VEC_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_FLOAT_VEC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +// Simd operations for fixed_size ABI with float elements and F32x2 fast paths. 
+template <__simd_size_type _Np> +struct __simd_operations, enable_if_t<__is_fixed_size_float_v>> + : __fixed_size_operations +{ + using __base = __fixed_size_operations; + using _SimdStorage = __simd_storage>; + + _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __one = __base::__broadcast(1.0f); + __s = ::cuda::std::simd::__plus_f32x2(__s, __one); + return;)) + // clang-format on + } +#endif // _CCCL_HAS_SIMD_F32X2() + __base::__increment(__s); + } + + _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __one = __base::__broadcast(1.0f); + __s = ::cuda::std::simd::__minus_f32x2(__s, __one); + return;)) + // clang-format on + } +#endif // _CCCL_HAS_SIMD_F32X2() + __base::__decrement(__s); + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __zero = __base::__broadcast(0.0f); + return ::cuda::std::simd::__minus_f32x2(__zero, __s);)) + // clang-format on + } +#endif // _CCCL_HAS_SIMD_F32X2() + return __base::__unary_minus(__s); + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__plus_f32x2(__lhs, __rhs);)) + } +#endif // _CCCL_HAS_SIMD_F32X2() + return __base::__plus(__lhs, __rhs); + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __minus(const _SimdStorage& __lhs, const _SimdStorage& 
__rhs) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__minus_f32x2(__lhs, __rhs);)) + } +#endif // _CCCL_HAS_SIMD_F32X2() + return __base::__minus(__lhs, __rhs); + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__multiplies_f32x2(__lhs, __rhs);)) + } +#endif // _CCCL_HAS_SIMD_F32X2() + return __base::__multiplies(__lhs, __rhs); + } +}; + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_FLOAT_VEC_H diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h index 16bf2fbd459..703d1617eb7 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h @@ -22,9 +22,11 @@ #endif // no system header #include +#include #include -#include +#include #include +#include #include #include @@ -33,7 +35,10 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD // Simd operations for fixed_size ABI template -struct __simd_operations<_Tp, __fixed_size<_Np>> +inline constexpr bool __is_fixed_size_float_v = is_same_v<_Tp, float> && _Np >= 2; + +template +struct __fixed_size_operations { using _SimdStorage = __simd_storage<_Tp, __fixed_size<_Np>>; using _MaskStorage = __mask_storage>; @@ -58,7 +63,7 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> ((__result.__data[_Is] = __g(integral_constant<__simd_size_type, _Is>())), ...); return __result; #else // ^^^ C++20 ^^^ / vvv C++17 vvv - return _SimdStorage{{ __g(integral_constant<__simd_size_type, _Is>())... 
}}; + return _SimdStorage{{__g(integral_constant<__simd_size_type, _Is>())...}}; #endif // _CCCL_STD_VER < 2020 } @@ -72,20 +77,6 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept { -#if _CCCL_HAS_SIMD_F32X2() - _CCCL_IF_NOT_CONSTEVAL_DEFAULT - { - if constexpr (is_same_v<_Tp, float> && _Np >= 2) - { - // clang-format off - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, - (constexpr _SimdStorage __one = __broadcast(1.0f); - __s = ::cuda::std::simd::__plus_f32x2(__s, __one); - return;)) - // clang-format on - } - } -#endif // _CCCL_HAS_SIMD_F32X2() _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) { @@ -95,20 +86,6 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept { -#if _CCCL_HAS_SIMD_F32X2() - _CCCL_IF_NOT_CONSTEVAL_DEFAULT - { - if constexpr (is_same_v<_Tp, float> && _Np >= 2) - { - // clang-format off - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, - (constexpr _SimdStorage __one = __broadcast(1.0f); - __s = ::cuda::std::simd::__minus_f32x2(__s, __one); - return;)) - // clang-format on - } - } -#endif // _CCCL_HAS_SIMD_F32X2() _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) { @@ -142,19 +119,6 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to unsigned type [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept { -#if _CCCL_HAS_SIMD_F32X2() - _CCCL_IF_NOT_CONSTEVAL_DEFAULT - { - if constexpr (is_same_v<_Tp, float> && _Np >= 2) - { - // clang-format off - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, - (constexpr _SimdStorage __zero = __broadcast(0.0f); - return ::cuda::std::simd::__minus_f32x2(__zero, __s);)) - // clang-format on - } - } -#endif // _CCCL_HAS_SIMD_F32X2() _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) @@ -170,38 +134,18 @@ 
struct __simd_operations<_Tp, __fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { -#if _CCCL_HAS_SIMD_F32X2() - _CCCL_IF_NOT_CONSTEVAL_DEFAULT - { - if constexpr (is_same_v<_Tp, float> && _Np >= 2) - { - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__plus_f32x2(__lhs, __rhs);)) - } - } -#endif // _CCCL_HAS_SIMD_F32X2() + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) { - _SimdStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (__simd_size_type __i = 0; __i < _Np; ++__i) - { - __result.__data[__i] = (__lhs.__data[__i] + __rhs.__data[__i]); - } - return __result; + __result.__data[__i] = (__lhs.__data[__i] + __rhs.__data[__i]); } + return __result; } [[nodiscard]] _CCCL_API static constexpr _SimdStorage __minus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { -#if _CCCL_HAS_SIMD_F32X2() - _CCCL_IF_NOT_CONSTEVAL_DEFAULT - { - if constexpr (is_same_v<_Tp, float> && _Np >= 2) - { - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__minus_f32x2(__lhs, __rhs);)) - } - } -#endif // _CCCL_HAS_SIMD_F32X2() _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) @@ -214,15 +158,6 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { -#if _CCCL_HAS_SIMD_F32X2() - _CCCL_IF_NOT_CONSTEVAL_DEFAULT - { - if constexpr (is_same_v<_Tp, float> && _Np >= 2) - { - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__multiplies_f32x2(__lhs, __rhs);)) - } - } -#endif // _CCCL_HAS_SIMD_F32X2() _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) @@ -392,6 +327,13 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> return __result; } }; + +// Default path (no 
optimizations) +template +struct __simd_operations<_Tp, __fixed_size<_Np>, enable_if_t>> + : __fixed_size_operations<_Tp, _Np> +{}; + _CCCL_END_NAMESPACE_CUDA_STD_SIMD #include diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash index 525c188700e..326fe0dd186 100755 --- a/libcudacxx/test/atomic_codegen/dump_and_check.bash +++ b/libcudacxx/test/atomic_codegen/dump_and_check.bash @@ -8,4 +8,4 @@ input_prefix="${3}" dump_mode="${4:---dump-ptx}" filecheck="${FILECHECK:-FileCheck}" -cuobjdump "${dump_mode}" "${input_archive}" | "${filecheck}" --match-full-lines --check-prefixes "${input_prefix}" "${input_testfile}" +cuobjdump "${dump_mode}" "${input_archive}" | "${filecheck}" --match-full-lines --check-prefixes="${input_prefix}" "${input_testfile}" diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt index ab882c76ef0..7a82933ba97 100644 --- a/libcudacxx/test/simd_codegen/CMakeLists.txt +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -34,17 +34,38 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) list(APPEND simd_codegen_cuda_archs 100 120) endif() -function(simd_codegen_add_tests arch) - foreach (test_path IN LISTS libcudacxx_simd_codegen_tests) - cmake_path(GET test_path FILENAME test_file) - cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) +function(simd_codegen_add_tests test_path) + cmake_path(GET test_path FILENAME test_file) + cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) - if (test_name MATCHES "_f32x2$" AND NOT arch EQUAL 100) - continue() - endif() + file(READ "${test_path}" test_contents) + string( + REGEX MATCH + "SM[0-9][0-9]*" + has_arch_specific_prefix + "${test_contents}" + ) + + # search for SMXX prefixes in the test contents + set(test_archs) + if (has_arch_specific_prefix) + foreach (arch IN LISTS simd_codegen_cuda_archs) + string(FIND "${test_contents}" "SM${arch}" 
arch_specific_prefix) + if (NOT arch_specific_prefix EQUAL -1) + list(APPEND test_archs "${arch}") + endif() + endforeach() + else() + set(test_archs ${simd_codegen_cuda_archs}) + endif() + # Run tests for each architecture specified in the test file + foreach (arch IN LISTS test_archs) set(target_name "simd_codegen_sm${arch}_${test_name}") - set(check_prefixes "SMXX,SM${arch}") + set(check_prefixes "SMXX") + if (has_arch_specific_prefix) + string(APPEND check_prefixes ",SM${arch}") + endif() add_library(${target_name} STATIC "${test_path}") @@ -77,6 +98,6 @@ function(simd_codegen_add_tests arch) endforeach() endfunction() -foreach (arch IN LISTS simd_codegen_cuda_archs) - simd_codegen_add_tests(${arch}) +foreach (test_path IN LISTS libcudacxx_simd_codegen_tests) + simd_codegen_add_tests("${test_path}") endforeach()