From 691608bde0ce3a18395c41297ec9386a8f36ac79 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 30 Apr 2026 16:40:00 -0700 Subject: [PATCH 1/7] half/bfloat plus/multiply/fma --- .../__simd/specializations/fixed_size_vec.h | 32 +++++++-- libcudacxx/test/CMakeLists.txt | 1 + .../test/atomic_codegen/dump_and_check.bash | 6 +- libcudacxx/test/simd_codegen/CMakeLists.txt | 65 +++++++++++++++++++ libcudacxx/test/simd_codegen/fma_bf16.cu | 44 +++++++++++++ libcudacxx/test/simd_codegen/fma_f16.cu | 43 ++++++++++++ .../test/simd_codegen/multiplies_bf16.cu | 42 ++++++++++++ .../test/simd_codegen/multiplies_f16.cu | 41 ++++++++++++ libcudacxx/test/simd_codegen/plus_bf16.cu | 42 ++++++++++++ libcudacxx/test/simd_codegen/plus_f16.cu | 35 ++++++++++ 10 files changed, 344 insertions(+), 7 deletions(-) create mode 100644 libcudacxx/test/simd_codegen/CMakeLists.txt create mode 100644 libcudacxx/test/simd_codegen/fma_bf16.cu create mode 100644 libcudacxx/test/simd_codegen/fma_f16.cu create mode 100644 libcudacxx/test/simd_codegen/multiplies_bf16.cu create mode 100644 libcudacxx/test/simd_codegen/multiplies_f16.cu create mode 100644 libcudacxx/test/simd_codegen/plus_bf16.cu create mode 100644 libcudacxx/test/simd_codegen/plus_f16.cu diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h index 0dc5576d935..c4ede831227 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h @@ -21,7 +21,10 @@ # pragma system_header #endif // no system header +#include #include +#include +#include #include #include #include @@ -161,13 +164,32 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { - _SimdStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (__simd_size_type __i = 0; __i < 
_Np; ++__i) +#if 0 // _CCCL_HAS_NVFP16() + if constexpr (is_same_v<_Tp, ::__half> && _Np >= 2) { - __result.__data[__i] = (__lhs.__data[__i] + __rhs.__data[__i]); + constexpr auto __half_size = ::cuda::ceil_div(_Np, __simd_size_type{2}); + using __half2_storage = __simd_storage<::__half2, __fixed_size<__half_size>>; + using __half2_operations = __simd_operations<::__half2, __fixed_size<__half_size>>; + __half2_storage __lhs_half2; + __half2_storage __rhs_half2; + ::cuda::std::memcpy((void*) &__lhs_half2, (void*) &__lhs, sizeof(__lhs)); + ::cuda::std::memcpy((void*) &__rhs_half2, (void*) &__rhs, sizeof(__rhs)); + const auto __result_half2 = __half2_operations::__plus(__lhs_half2, __rhs_half2); + _SimdStorage __result; + ::cuda::std::memcpy((void*) &__result, (void*) &__result_half2, sizeof(__result)); + return __result; + } + else +#endif // _CCCL_HAS_NVFP16() + { + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) + { + __result.__data[__i] = (__lhs.__data[__i] + __rhs.__data[__i]); + } + return __result; } - return __result; } [[nodiscard]] _CCCL_API static constexpr _SimdStorage diff --git a/libcudacxx/test/CMakeLists.txt b/libcudacxx/test/CMakeLists.txt index 039c917fa7d..20fb09610ed 100644 --- a/libcudacxx/test/CMakeLists.txt +++ b/libcudacxx/test/CMakeLists.txt @@ -98,3 +98,4 @@ endif() add_subdirectory(nvtarget) add_subdirectory(atomic_codegen) +add_subdirectory(simd_codegen) diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash index c66bb433637..0e43ed98cc3 100755 --- a/libcudacxx/test/atomic_codegen/dump_and_check.bash +++ b/libcudacxx/test/atomic_codegen/dump_and_check.bash @@ -1,9 +1,11 @@ #!/bin/bash set -e -## Usage: dump_and_check test.a test.cu PREFIX +## Usage: dump_and_check test.a test.cu PREFIXES [cuobjdump-mode] input_archive=$1 input_testfile=$2 input_prefix=$3 +dump_mode=${4:---dump-ptx} +filecheck=${FILECHECK:-FileCheck} 
-cuobjdump --dump-ptx $input_archive | FileCheck --match-full-lines --check-prefix $input_prefix $input_testfile +cuobjdump "${dump_mode}" "${input_archive}" | "${filecheck}" --match-full-lines --check-prefixes "${input_prefix}" "${input_testfile}" diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt new file mode 100644 index 00000000000..4c25e72b8d6 --- /dev/null +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -0,0 +1,65 @@ +add_custom_target(libcudacxx.test.simd.sass) + +find_program(filecheck NAMES FileCheck FileCheck-21 FileCheck-20 FileCheck-19 FileCheck-18) + +if (filecheck) + message("-- ${filecheck} found... building simd codegen tests") +else() + return() +endif() + +find_program(cuobjdump "cuobjdump" REQUIRED) +find_program(bash "bash" REQUIRED) + +set(libcudacxx_simd_codegen_tests) +if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") + file(GLOB libcudacxx_simd_codegen_tests "*.cu") +endif() + +set(simd_codegen_cuda_archs 80 90) +if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9) + list(APPEND simd_codegen_cuda_archs 100 120) +endif() + +function(simd_codegen_add_tests arch) + foreach (test_path IN LISTS libcudacxx_simd_codegen_tests) + cmake_path(GET test_path FILENAME test_file) + cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) + + set(target_name "simd_codegen_sm${arch}_${test_name}") + set(check_prefixes "SMXX,SM${arch}") + + add_library(${target_name} STATIC "${test_path}") + + set_target_properties( + ${target_name} + PROPERTIES CUDA_ARCHITECTURES "${arch}" + ) + + target_compile_options(${target_name} PRIVATE "-Wno-comment") + + target_include_directories( + ${target_name} + PRIVATE "${libcudacxx_SOURCE_DIR}/include" + ) + add_dependencies(libcudacxx.test.simd.sass ${target_name}) + + add_custom_command( + TARGET libcudacxx.test.simd.sass + POST_BUILD + # gersemi: off + COMMAND + ${CMAKE_COMMAND} -E env "FILECHECK=${filecheck}" + 
"${CMAKE_CURRENT_SOURCE_DIR}/../atomic_codegen/dump_and_check.bash" + $ + "${test_path}" + "${check_prefixes}" + --dump-sass + # gersemi: on + ) + endforeach() +endfunction() + +foreach (arch IN LISTS simd_codegen_cuda_archs) + simd_codegen_add_tests(${arch}) +endforeach() diff --git a/libcudacxx/test/simd_codegen/fma_bf16.cu b/libcudacxx/test/simd_codegen/fma_bf16.cu new file mode 100644 index 00000000000..1b02e8ba22a --- /dev/null +++ b/libcudacxx/test/simd_codegen/fma_bf16.cu @@ -0,0 +1,44 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVBF16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +extern "C" __global__ void test_fma_bf16_4( + const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, const __nv_bfloat16* add, __nv_bfloat16* out) +{ + const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> add_values{add[0], add[1], add[2], add[3]}; + + const Vec_bf16_4 lhs_vec(lhs_values); + const Vec_bf16_4 rhs_vec(rhs_values); + const Vec_bf16_4 add_vec(add_values); + const Vec_bf16_4 result = lhs_vec * rhs_vec + add_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_fma_bf16_4 +; SM80: {{.*HFMA2.*BF16.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM100: {{.*HFMA2.*BF16.*}} +; SM100: {{.*HFMA2.*BF16.*}} +; SM120: {{.*HFMA2.*BF16.*}} +; SM120: {{.*HFMA2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/fma_f16.cu b/libcudacxx/test/simd_codegen/fma_f16.cu new file mode 100644 index 00000000000..62af327f9c3 --- /dev/null +++ b/libcudacxx/test/simd_codegen/fma_f16.cu @@ -0,0 +1,43 @@ +#include // IWYU pragma: keep +#include + +#if 
_CCCL_HAS_NVFP16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; + +extern "C" __global__ void test_fma_f16_4(const __half* lhs, const __half* rhs, const __half* add, __half* out) +{ + const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + const cuda::std::array<__half, 4> add_values{add[0], add[1], add[2], add[3]}; + + const Vec_f16_4 lhs_vec(lhs_values); + const Vec_f16_4 rhs_vec(rhs_values); + const Vec_f16_4 add_vec(add_values); + const Vec_f16_4 result = lhs_vec * rhs_vec + add_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_fma_f16_4 +; SM80: {{.*HFMA2.*}} +; SM80: {{.*HFMA2.*}} +; SM90: {{.*HFMA2.*}} +; SM90: {{.*HFMA2.*}} +; SM100: {{.*HFMA2.*}} +; SM100: {{.*HFMA2.*}} +; SM120: {{.*HFMA2.*}} +; SM120: {{.*HFMA2.*}} + +*/ + +#endif // _CCCL_HAS_NVFP16() diff --git a/libcudacxx/test/simd_codegen/multiplies_bf16.cu b/libcudacxx/test/simd_codegen/multiplies_bf16.cu new file mode 100644 index 00000000000..37d0a3d9533 --- /dev/null +++ b/libcudacxx/test/simd_codegen/multiplies_bf16.cu @@ -0,0 +1,42 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVBF16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +extern "C" __global__ void +test_operator_multiplies_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, __nv_bfloat16* out) +{ + const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_bf16_4 lhs_vec(lhs_values); + const Vec_bf16_4 rhs_vec(rhs_values); + const Vec_bf16_4 result = lhs_vec * rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; 
+ out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_multiplies_bf16_4 +; SM80: {{.*HFMA2.*BF16.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HMUL2.*BF16.*}} +; SM100: {{.*HFMA2.*BF16.*}} +; SM100: {{.*HMUL2.*BF16.*}} +; SM120: {{.*HFMA2.*BF16.*}} +; SM120: {{.*HMUL2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/multiplies_f16.cu b/libcudacxx/test/simd_codegen/multiplies_f16.cu new file mode 100644 index 00000000000..06f40655f22 --- /dev/null +++ b/libcudacxx/test/simd_codegen/multiplies_f16.cu @@ -0,0 +1,41 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVFP16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; + +extern "C" __global__ void test_operator_multiplies_f16_4(const __half* lhs, const __half* rhs, __half* out) +{ + const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f16_4 lhs_vec(lhs_values); + const Vec_f16_4 rhs_vec(rhs_values); + const Vec_f16_4 result = lhs_vec * rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_multiplies_f16_4 +; SM80: {{.*(HMUL2|HFMA2).*}} +; SM80: {{.*(HMUL2|HFMA2).*}} +; SM90: {{.*(HMUL2|HFMA2).*}} +; SM90: {{.*(HMUL2|HFMA2).*}} +; SM100: {{.*(HMUL2|HFMA2).*}} +; SM100: {{.*(HMUL2|HFMA2).*}} +; SM120: {{.*(HMUL2|HFMA2).*}} +; SM120: {{.*(HMUL2|HFMA2).*}} + +*/ + +#endif // _CCCL_HAS_NVFP16() diff --git a/libcudacxx/test/simd_codegen/plus_bf16.cu b/libcudacxx/test/simd_codegen/plus_bf16.cu new file mode 100644 index 00000000000..dcffca128b0 --- /dev/null +++ b/libcudacxx/test/simd_codegen/plus_bf16.cu @@ -0,0 +1,42 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVBF16() + +#include + +namespace 
simd = cuda::std::simd; + +using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +extern "C" __global__ void +test_operator_plus_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, __nv_bfloat16* out) +{ + const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_bf16_4 lhs_vec(lhs_values); + const Vec_bf16_4 rhs_vec(rhs_values); + const Vec_bf16_4 result = lhs_vec + rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_bf16_4 +; SM80: {{.*HFMA2.*BF16.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HADD2.*BF16.*}} +; SM100: {{.*HFMA2.*BF16.*}} +; SM100: {{.*HADD2.*BF16.*}} +; SM120: {{.*HFMA2.*BF16.*}} +; SM120: {{.*HADD2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/plus_f16.cu b/libcudacxx/test/simd_codegen/plus_f16.cu new file mode 100644 index 00000000000..03bb80bbf5c --- /dev/null +++ b/libcudacxx/test/simd_codegen/plus_f16.cu @@ -0,0 +1,35 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVFP16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; + +extern "C" __global__ void test_operator_plus_f16_4(const __half* lhs, const __half* rhs, __half* out) +{ + const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f16_4 lhs_vec(lhs_values); + const Vec_f16_4 rhs_vec(rhs_values); + const Vec_f16_4 result = lhs_vec + rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_f16_4 +; SMXX: {{.*(HADD2|HFMA2).*}} +; SMXX: 
{{.*(HADD2|HFMA2).*}} + +*/ + +#endif // _CCCL_HAS_NVFP16() From fc02d50f7179c9455d3c4d7501ffca4fdd9112ea Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 30 Apr 2026 17:02:15 -0700 Subject: [PATCH 2/7] check comparison --- libcudacxx/test/simd_codegen/less_bf16.cu | 44 +++++++++++++++++++++++ libcudacxx/test/simd_codegen/less_f16.cu | 41 +++++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 libcudacxx/test/simd_codegen/less_bf16.cu create mode 100644 libcudacxx/test/simd_codegen/less_f16.cu diff --git a/libcudacxx/test/simd_codegen/less_bf16.cu b/libcudacxx/test/simd_codegen/less_bf16.cu new file mode 100644 index 00000000000..bb61ebdc672 --- /dev/null +++ b/libcudacxx/test/simd_codegen/less_bf16.cu @@ -0,0 +1,44 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVBF16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +extern "C" __global__ void +test_less_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, bool* out) +{ + const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_bf16_4 lhs_vec(lhs_values); + const Vec_bf16_4 rhs_vec(rhs_values); + const auto result = lhs_vec < rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_less_bf16_4 +; SM80: {{.*FSETP\.LT.*}} +; SM80: {{.*FSETP\.LT.*}} +; SM80: {{.*FSETP\.LT.*}} +; SM80: {{.*FSETP\.LT.*}} +; SM90: {{.*HSETP2.*BF16.*}} +; SM90: {{.*HSETP2.*BF16.*}} +; SM100: {{.*HSETP2.*BF16.*}} +; SM100: {{.*HSETP2.*BF16.*}} +; SM120: {{.*HSETP2.*BF16.*}} +; SM120: {{.*HSETP2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/less_f16.cu b/libcudacxx/test/simd_codegen/less_f16.cu new file mode 100644 index 00000000000..408d90e46d3 --- 
/dev/null +++ b/libcudacxx/test/simd_codegen/less_f16.cu @@ -0,0 +1,41 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVFP16() + +#include + +namespace simd = cuda::std::simd; + +using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; + +extern "C" __global__ void test_less_f16_4(const __half* lhs, const __half* rhs, bool* out) +{ + const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f16_4 lhs_vec(lhs_values); + const Vec_f16_4 rhs_vec(rhs_values); + const auto result = lhs_vec < rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_less_f16_4 +; SM80: {{.*HSETP2.*}} +; SM80: {{.*HSETP2.*}} +; SM90: {{.*HSETP2.*}} +; SM90: {{.*HSETP2.*}} +; SM100: {{.*HSETP2.*}} +; SM100: {{.*HSETP2.*}} +; SM120: {{.*HSETP2.*}} +; SM120: {{.*HSETP2.*}} + +*/ + +#endif // _CCCL_HAS_NVFP16() From 4ef18dbc9ef5785c04b8285b28e734541c3161f5 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 30 Apr 2026 18:17:13 -0700 Subject: [PATCH 3/7] test f32x2 --- .../__simd/specializations/fixed_size_vec.h | 14 ++++ .../std/__simd/specializations/intrinsic.h | 80 +++++++++++++++++++ libcudacxx/test/simd_codegen/CMakeLists.txt | 9 ++- libcudacxx/test/simd_codegen/fma_bf16.cu | 6 +- libcudacxx/test/simd_codegen/fma_f16.cu | 2 +- libcudacxx/test/simd_codegen/less_bf16.cu | 5 +- libcudacxx/test/simd_codegen/less_f16.cu | 2 +- .../test/simd_codegen/multiplies_bf16.cu | 2 +- .../test/simd_codegen/multiplies_f16.cu | 2 +- libcudacxx/test/simd_codegen/plus_bf16.cu | 2 +- libcudacxx/test/simd_codegen/plus_f16.cu | 2 +- libcudacxx/test/simd_codegen/plus_f32.cu | 33 ++++++++ 12 files changed, 146 insertions(+), 13 deletions(-) create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h create mode 100644 libcudacxx/test/simd_codegen/plus_f32.cu 
diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h index c4ede831227..c87325688e6 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -164,6 +165,19 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (is_same_v<_Tp, float> && _Np >= 2) + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (_SimdStorage __result; // + ::cuda::std::simd::__plus_f32x2(__lhs.__data, __rhs.__data, __result.__data); + return __result;)) + } + } +#endif // _CCCL_HAS_SIMD_F32X2() + #if 0 // _CCCL_HAS_NVFP16() if constexpr (is_same_v<_Tp, ::__half> && _Np >= 2) { diff --git a/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h b/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h new file mode 100644 index 00000000000..3571b01248d --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h @@ -0,0 +1,80 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_INTRINSIC_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_INTRINSIC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || __cccl_ptx_isa >= 860ULL +# define _CCCL_HAS_SIMD_F32X2() 1 +#else +# define _CCCL_HAS_SIMD_F32X2() 0 +#endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || __cccl_ptx_isa >= 860ULL + +#if _CCCL_HAS_SIMD_F32X2() + +# include + +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +_CCCL_API inline void __add_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + float& __result1, + float& __result2) noexcept +{ +# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) + const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); + __result1 = __result.x; + __result2 = __result.y; +# elif __cccl_ptx_isa >= 860ULL + asm("add.f32x2 {%0, %1}, {%2, %3}, {%4, %5};" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); +# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +} + +template +_CCCL_API constexpr void +__plus_f32x2(const float (&__lhs)[_Np], const float (&__rhs)[_Np], float (&__result)[_Np]) noexcept +{ + _CCCL_PRAGMA_UNROLL_FULL() + for (size_t __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__add_f32x2( + __lhs[__i], __lhs[__i + 1], __rhs[__i], __rhs[__i + 1], __result[__i], __result[__i + 1]); + } + if (_Np % 2 != 0) + { + __result[_Np - 1] = __lhs[_Np - 1] + __rhs[_Np - 1]; + } +} + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +# include + +#endif // _CCCL_HAS_SIMD_F32X2() +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_INTRINSIC_H diff --git 
a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt index 4c25e72b8d6..6e4250695d4 100644 --- a/libcudacxx/test/simd_codegen/CMakeLists.txt +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -1,6 +1,9 @@ add_custom_target(libcudacxx.test.simd.sass) -find_program(filecheck NAMES FileCheck FileCheck-21 FileCheck-20 FileCheck-19 FileCheck-18) +find_program( + filecheck + NAMES FileCheck FileCheck-21 FileCheck-20 FileCheck-19 FileCheck-18 +) if (filecheck) message("-- ${filecheck} found... building simd codegen tests") @@ -26,6 +29,10 @@ function(simd_codegen_add_tests arch) cmake_path(GET test_path FILENAME test_file) cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) + if (test_name STREQUAL "plus_f32" AND NOT arch EQUAL 100) + continue() + endif() + set(target_name "simd_codegen_sm${arch}_${test_name}") set(check_prefixes "SMXX,SM${arch}") diff --git a/libcudacxx/test/simd_codegen/fma_bf16.cu b/libcudacxx/test/simd_codegen/fma_bf16.cu index 1b02e8ba22a..e9f9182bd6e 100644 --- a/libcudacxx/test/simd_codegen/fma_bf16.cu +++ b/libcudacxx/test/simd_codegen/fma_bf16.cu @@ -3,14 +3,14 @@ #if _CCCL_HAS_NVBF16() -#include +# include namespace simd = cuda::std::simd; using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; -extern "C" __global__ void test_fma_bf16_4( - const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, const __nv_bfloat16* add, __nv_bfloat16* out) +extern "C" __global__ void +test_fma_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, const __nv_bfloat16* add, __nv_bfloat16* out) { const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; diff --git a/libcudacxx/test/simd_codegen/fma_f16.cu b/libcudacxx/test/simd_codegen/fma_f16.cu index 62af327f9c3..901d341fe46 100644 --- a/libcudacxx/test/simd_codegen/fma_f16.cu +++ b/libcudacxx/test/simd_codegen/fma_f16.cu 
@@ -3,7 +3,7 @@ #if _CCCL_HAS_NVFP16() -#include +# include namespace simd = cuda::std::simd; diff --git a/libcudacxx/test/simd_codegen/less_bf16.cu b/libcudacxx/test/simd_codegen/less_bf16.cu index bb61ebdc672..3fc6287a32c 100644 --- a/libcudacxx/test/simd_codegen/less_bf16.cu +++ b/libcudacxx/test/simd_codegen/less_bf16.cu @@ -3,14 +3,13 @@ #if _CCCL_HAS_NVBF16() -#include +# include namespace simd = cuda::std::simd; using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; -extern "C" __global__ void -test_less_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, bool* out) +extern "C" __global__ void test_less_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, bool* out) { const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; diff --git a/libcudacxx/test/simd_codegen/less_f16.cu b/libcudacxx/test/simd_codegen/less_f16.cu index 408d90e46d3..97e5070f5a3 100644 --- a/libcudacxx/test/simd_codegen/less_f16.cu +++ b/libcudacxx/test/simd_codegen/less_f16.cu @@ -3,7 +3,7 @@ #if _CCCL_HAS_NVFP16() -#include +# include namespace simd = cuda::std::simd; diff --git a/libcudacxx/test/simd_codegen/multiplies_bf16.cu b/libcudacxx/test/simd_codegen/multiplies_bf16.cu index 37d0a3d9533..c66bc326b60 100644 --- a/libcudacxx/test/simd_codegen/multiplies_bf16.cu +++ b/libcudacxx/test/simd_codegen/multiplies_bf16.cu @@ -3,7 +3,7 @@ #if _CCCL_HAS_NVBF16() -#include +# include namespace simd = cuda::std::simd; diff --git a/libcudacxx/test/simd_codegen/multiplies_f16.cu b/libcudacxx/test/simd_codegen/multiplies_f16.cu index 06f40655f22..aa0c32fbe35 100644 --- a/libcudacxx/test/simd_codegen/multiplies_f16.cu +++ b/libcudacxx/test/simd_codegen/multiplies_f16.cu @@ -3,7 +3,7 @@ #if _CCCL_HAS_NVFP16() -#include +# include namespace simd = cuda::std::simd; diff --git a/libcudacxx/test/simd_codegen/plus_bf16.cu 
b/libcudacxx/test/simd_codegen/plus_bf16.cu index dcffca128b0..4492a4edca1 100644 --- a/libcudacxx/test/simd_codegen/plus_bf16.cu +++ b/libcudacxx/test/simd_codegen/plus_bf16.cu @@ -3,7 +3,7 @@ #if _CCCL_HAS_NVBF16() -#include +# include namespace simd = cuda::std::simd; diff --git a/libcudacxx/test/simd_codegen/plus_f16.cu b/libcudacxx/test/simd_codegen/plus_f16.cu index 03bb80bbf5c..5a56242da89 100644 --- a/libcudacxx/test/simd_codegen/plus_f16.cu +++ b/libcudacxx/test/simd_codegen/plus_f16.cu @@ -3,7 +3,7 @@ #if _CCCL_HAS_NVFP16() -#include +# include namespace simd = cuda::std::simd; diff --git a/libcudacxx/test/simd_codegen/plus_f32.cu b/libcudacxx/test/simd_codegen/plus_f32.cu new file mode 100644 index 00000000000..b8b11732c55 --- /dev/null +++ b/libcudacxx/test/simd_codegen/plus_f32.cu @@ -0,0 +1,33 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_SIMD_F32X2() + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_plus_f32_4(const float* lhs, const float* rhs, float* out) +{ + const cuda::std::array lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f32_4 lhs_vec(lhs_values); + const Vec_f32_4 rhs_vec(rhs_values); + const Vec_f32_4 result = lhs_vec + rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ + +#endif // _CCCL_HAS_SIMD_F32X2() From 7bf2ed96a07acd5f25b98fd4ee5b1c72ee12a4f5 Mon Sep 17 00:00:00 2001 From: fbusato Date: Thu, 30 Apr 2026 18:25:02 -0700 Subject: [PATCH 4/7] fix _CCCL_DEVICE_API --- .../cuda/std/__simd/specializations/intrinsic.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h 
b/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h index 3571b01248d..016f43b55eb 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h @@ -37,7 +37,7 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD -_CCCL_API inline void __add_f32x2( +_CCCL_DEVICE_API inline void __add_f32x2( const float __lhs1, const float __lhs2, const float __rhs1, @@ -46,9 +46,12 @@ _CCCL_API inline void __add_f32x2( float& __result2) noexcept { # if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) - const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); - __result1 = __result.x; - __result2 = __result.y; + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on # elif __cccl_ptx_isa >= 860ULL asm("add.f32x2 {%0, %1}, {%2, %3}, {%4, %5};" : "=f"(__result1), "=f"(__result2) @@ -57,7 +60,7 @@ _CCCL_API inline void __add_f32x2( } template -_CCCL_API constexpr void +_CCCL_DEVICE_API constexpr void __plus_f32x2(const float (&__lhs)[_Np], const float (&__rhs)[_Np], float (&__result)[_Np]) noexcept { _CCCL_PRAGMA_UNROLL_FULL() From 3dd3298e83a6c34709bdfa0b4b97ffd82054b764 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 1 May 2026 11:32:05 -0700 Subject: [PATCH 5/7] extend f32x2 optimization to sub/increment/decrement/unary minus --- .../__simd/specializations/fixed_size_mask.h | 1 + .../specializations/fixed_size_storage.h | 68 +++++ .../__simd/specializations/fixed_size_vec.h | 122 ++++----- .../specializations/fp32x2_intrinsics.h | 255 ++++++++++++++++++ .../std/__simd/specializations/intrinsic.h | 83 ------ libcudacxx/test/simd_codegen/CMakeLists.txt | 2 +- .../test/simd_codegen/decrement_f32x2.cu | 31 +++ .../test/simd_codegen/increment_f32x2.cu | 31 +++ libcudacxx/test/simd_codegen/minus_f32x2.cu | 33 +++ 
.../{plus_f32.cu => plus_f32x2.cu} | 0 .../test/simd_codegen/unary_minus_f32x2.cu | 31 +++ 11 files changed, 513 insertions(+), 144 deletions(-) create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h delete mode 100644 libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h create mode 100644 libcudacxx/test/simd_codegen/decrement_f32x2.cu create mode 100644 libcudacxx/test/simd_codegen/increment_f32x2.cu create mode 100644 libcudacxx/test/simd_codegen/minus_f32x2.cu rename libcudacxx/test/simd_codegen/{plus_f32.cu => plus_f32x2.cu} (100%) create mode 100644 libcudacxx/test/simd_codegen/unary_minus_f32x2.cu diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h index a60aa034bbc..ca07cee583b 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h new file mode 100644 index 00000000000..fc88e3ff7e2 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h @@ -0,0 +1,68 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_STORAGE_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_STORAGE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +template <__simd_size_type _Np> +struct __fixed_size +{ + static_assert(_Np > 0, "_Np must be greater than 0"); + + static constexpr __simd_size_type __simd_size = _Np; +}; + +// Element-per-slot simd storage for fixed_size ABI +template +struct __simd_storage<_Tp, __fixed_size<_Np>> +{ + using value_type = _Tp; + + _Tp __data[_Np]{}; + + _CCCL_HIDE_FROM_ABI constexpr __simd_storage() = default; + _CCCL_HIDE_FROM_ABI constexpr __simd_storage(const __simd_storage&) = default; + _CCCL_HIDE_FROM_ABI constexpr __simd_storage& operator=(const __simd_storage&) = default; + + [[nodiscard]] _CCCL_API constexpr _Tp __get(const __simd_size_type __idx) const noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + return __data[__idx]; + } + + _CCCL_API constexpr void __set(const __simd_size_type __idx, const _Tp __v) noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + __data[__idx] = __v; + } +}; + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_STORAGE_H diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h index c87325688e6..a35d08baf48 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h +++ 
b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h @@ -21,12 +21,9 @@ # pragma system_header #endif // no system header -#include -#include -#include -#include #include -#include +#include +#include #include #include @@ -34,39 +31,6 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD -template <__simd_size_type _Np> -struct __fixed_size -{ - static_assert(_Np > 0, "_Np must be greater than 0"); - - static constexpr __simd_size_type __simd_size = _Np; -}; - -// Element-per-slot simd storage for fixed_size ABI -template -struct __simd_storage<_Tp, __fixed_size<_Np>> -{ - using value_type = _Tp; - - _Tp __data[_Np]{}; - - _CCCL_HIDE_FROM_ABI constexpr __simd_storage() = default; - _CCCL_HIDE_FROM_ABI constexpr __simd_storage(const __simd_storage&) = default; - _CCCL_HIDE_FROM_ABI constexpr __simd_storage& operator=(const __simd_storage&) = default; - - [[nodiscard]] _CCCL_API constexpr _Tp __get(const __simd_size_type __idx) const noexcept - { - _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); - return __data[__idx]; - } - - _CCCL_API constexpr void __set(const __simd_size_type __idx, const _Tp __v) noexcept - { - _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); - __data[__idx] = __v; - } -}; - // Simd operations for fixed_size ABI template struct __simd_operations<_Tp, __fixed_size<_Np>> @@ -108,6 +72,20 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (is_same_v<_Tp, float> && _Np >= 2) + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __one = __broadcast(1.0f); + __s = ::cuda::std::simd::__plus_f32x2(__s, __one); + return;)) + // clang-format on + } + } +#endif // _CCCL_HAS_SIMD_F32X2() _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) { @@ -117,6 +95,20 @@ 
struct __simd_operations<_Tp, __fixed_size<_Np>> _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (is_same_v<_Tp, float> && _Np >= 2) + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __one = __broadcast(1.0f); + __s = ::cuda::std::simd::__minus_f32x2(__s, __one); + return;)) + // clang-format on + } + } +#endif // _CCCL_HAS_SIMD_F32X2() _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) { @@ -150,6 +142,19 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to unsigned type [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (is_same_v<_Tp, float> && _Np >= 2) + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __zero = __broadcast(0.0f); + return ::cuda::std::simd::__minus_f32x2(__zero, __s);)) + // clang-format on + } + } +#endif // _CCCL_HAS_SIMD_F32X2() _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) @@ -170,31 +175,10 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> { if constexpr (is_same_v<_Tp, float> && _Np >= 2) { - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, - (_SimdStorage __result; // - ::cuda::std::simd::__plus_f32x2(__lhs.__data, __rhs.__data, __result.__data); - return __result;)) + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__plus_f32x2(__lhs, __rhs);)) } } #endif // _CCCL_HAS_SIMD_F32X2() - -#if 0 // _CCCL_HAS_NVFP16() - if constexpr (is_same_v<_Tp, ::__half> && _Np >= 2) - { - constexpr auto __half_size = ::cuda::ceil_div(_Np, __simd_size_type{2}); - using __half2_storage = __simd_storage<::__half2, __fixed_size<__half_size>>; - using __half2_operations = __simd_operations<::__half2, 
__fixed_size<__half_size>>; - __half2_storage __lhs_half2; - __half2_storage __rhs_half2; - ::cuda::std::memcpy((void*) &__lhs_half2, (void*) &__lhs, sizeof(__lhs)); - ::cuda::std::memcpy((void*) &__rhs_half2, (void*) &__rhs, sizeof(__rhs)); - const auto __result_half2 = __half2_operations::__plus(__lhs_half2, __rhs_half2); - _SimdStorage __result; - ::cuda::std::memcpy((void*) &__result, (void*) &__result_half2, sizeof(__result)); - return __result; - } - else -#endif // _CCCL_HAS_NVFP16() { _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() @@ -209,6 +193,15 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __minus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (is_same_v<_Tp, float> && _Np >= 2) + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__minus_f32x2(__lhs, __rhs);)) + } + } +#endif // _CCCL_HAS_SIMD_F32X2() _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) @@ -221,6 +214,15 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + if constexpr (is_same_v<_Tp, float> && _Np >= 2) + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__multiplies_f32x2(__lhs, __rhs);)) + } + } +#endif // _CCCL_HAS_SIMD_F32X2() _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h b/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h new file mode 100644 index 00000000000..8fdd7773c1e --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h @@ -0,0 +1,255 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FP32X2_INTRINSICS_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_FP32X2_INTRINSICS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || (__cccl_ptx_isa >= 860ULL) +# define _CCCL_HAS_SIMD_F32X2() 1 +#else +# define _CCCL_HAS_SIMD_F32X2() 0 +#endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || (__cccl_ptx_isa >= 860ULL) + +#if _CCCL_HAS_SIMD_F32X2() + +# include + +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +_CCCL_DEVICE_API inline void __add_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + float& __result1, + float& __result2) noexcept +{ +# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on +# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 + asm("{.reg .b64 __lhs, __rhs, __result;" + "mov.b64 __lhs, {%2, %3};" + "mov.b64 __rhs, {%4, %5};" + "add.f32x2 __result, __lhs, __rhs;" + "mov.b64 {%0, %1}, __result;}" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); +# endif // 
_CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +} + +_CCCL_DEVICE_API inline void __mul_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + float& __result1, + float& __result2) noexcept +{ +# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = ::__fmul2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on +# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 + asm("{.reg .b64 __lhs, __rhs, __result;" + "mov.b64 __lhs, {%2, %3};" + "mov.b64 __rhs, {%4, %5};" + "mul.f32x2 __result, __lhs, __rhs;" + "mov.b64 {%0, %1}, __result;}" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); +# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +} + +_CCCL_DEVICE_API inline void __sub_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + float& __result1, + float& __result2) noexcept +{ +# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{-__rhs1, -__rhs2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on +# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (asm("{.reg .b64 __lhs, __rhs, __result;" + "mov.b64 __lhs, {%2, %3};" + "mov.b64 __rhs, {%4, %5};" + "sub.f32x2 __result, __lhs, __rhs;" + "mov.b64 {%0, %1}, __result;}" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2));)) + // clang-format on +# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +} + +_CCCL_DEVICE_API inline void __fma_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + const float __add1, + const float __add2, + float& __result1, + float& __result2) noexcept +{ +# 
if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = + ::__ffma2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}, ::float2{__add1, __add2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on +# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 + asm("{.reg .b64 __lhs, __rhs, __add, __result;" + "mov.b64 __lhs, {%2, %3};" + "mov.b64 __rhs, {%4, %5};" + "mov.b64 __add, {%6, %7};" + "fma.rn.f32x2 __result, __lhs, __rhs, __add;" + "mov.b64 {%0, %1}, __result;}" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2), "f"(__add1), "f"(__add2)); +# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +} + +template <__simd_size_type _Np> +using __simd_storage_f32 = __simd_storage>; + +template <__simd_size_type _Np> +[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np> +__plus_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept +{ + __simd_storage_f32<_Np> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__add_f32x2( + __lhs.__data[__i], + __lhs.__data[__i + 1], + __rhs.__data[__i], + __rhs.__data[__i + 1], + __result.__data[__i], + __result.__data[__i + 1]); + } + if (_Np % 2 != 0) + { + __result.__data[_Np - 1] = __lhs.__data[_Np - 1] + __rhs.__data[_Np - 1]; + } + return __result; +} + +template <__simd_size_type _Np> +[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np> +__minus_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept +{ + __simd_storage_f32<_Np> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__sub_f32x2( + __lhs.__data[__i], + __lhs.__data[__i + 1], + __rhs.__data[__i], + __rhs.__data[__i + 1], + __result.__data[__i], + __result.__data[__i + 1]); + } + if (_Np % 2 != 0) + 
{ + __result.__data[_Np - 1] = __lhs.__data[_Np - 1] - __rhs.__data[_Np - 1]; + } + return __result; +} + +template <__simd_size_type _Np> +[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np> +__multiplies_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept +{ + __simd_storage_f32<_Np> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__mul_f32x2( + __lhs.__data[__i], + __lhs.__data[__i + 1], + __rhs.__data[__i], + __rhs.__data[__i + 1], + __result.__data[__i], + __result.__data[__i + 1]); + } + if (_Np % 2 != 0) + { + __result.__data[_Np - 1] = __lhs.__data[_Np - 1] * __rhs.__data[_Np - 1]; + } + return __result; +} + +template <__simd_size_type _Np> +[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np> +__fma_f32x2(const __simd_storage_f32<_Np>& __lhs, + const __simd_storage_f32<_Np>& __rhs, + const __simd_storage_f32<_Np>& __add) noexcept +{ + __simd_storage_f32<_Np> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__fma_f32x2( + __lhs.__data[__i], + __lhs.__data[__i + 1], + __rhs.__data[__i], + __rhs.__data[__i + 1], + __add.__data[__i], + __add.__data[__i + 1], + __result.__data[__i], + __result.__data[__i + 1]); + } + if (_Np % 2 != 0) + { + __result.__data[_Np - 1] = __lhs.__data[_Np - 1] * __rhs.__data[_Np - 1] + __add.__data[_Np - 1]; + } + return __result; +} + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +# include + +#endif // _CCCL_HAS_SIMD_F32X2() +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FP32X2_INTRINSICS_H diff --git a/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h b/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h deleted file mode 100644 index 016f43b55eb..00000000000 --- a/libcudacxx/include/cuda/std/__simd/specializations/intrinsic.h +++ /dev/null @@ -1,83 +0,0 @@ 
-//===----------------------------------------------------------------------===// -// -// Part of libcu++ in the CUDA C++ Core Libraries, -// under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. -// -//===----------------------------------------------------------------------===// - -#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_INTRINSIC_H -#define _CUDA_STD___SIMD_SPECIALIZATIONS_INTRINSIC_H - -#include - -#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) -# pragma GCC system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) -# pragma clang system_header -#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) -# pragma system_header -#endif // no system header - -#if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || __cccl_ptx_isa >= 860ULL -# define _CCCL_HAS_SIMD_F32X2() 1 -#else -# define _CCCL_HAS_SIMD_F32X2() 0 -#endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || __cccl_ptx_isa >= 860ULL - -#if _CCCL_HAS_SIMD_F32X2() - -# include - -# include - -# include - -_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD - -_CCCL_DEVICE_API inline void __add_f32x2( - const float __lhs1, - const float __lhs2, - const float __rhs1, - const float __rhs2, - float& __result1, - float& __result2) noexcept -{ -# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) - // clang-format off - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, - (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); - __result1 = __result.x; - __result2 = __result.y;)) - // clang-format on -# elif __cccl_ptx_isa >= 860ULL - asm("add.f32x2 {%0, %1}, {%2, %3}, {%4, %5};" - : "=f"(__result1), "=f"(__result2) - : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); -# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) -} - -template -_CCCL_DEVICE_API constexpr void -__plus_f32x2(const float (&__lhs)[_Np], const float (&__rhs)[_Np], float 
(&__result)[_Np]) noexcept -{ - _CCCL_PRAGMA_UNROLL_FULL() - for (size_t __i = 0; __i < (_Np / 2) * 2; __i += 2) - { - ::cuda::std::simd::__add_f32x2( - __lhs[__i], __lhs[__i + 1], __rhs[__i], __rhs[__i + 1], __result[__i], __result[__i + 1]); - } - if (_Np % 2 != 0) - { - __result[_Np - 1] = __lhs[_Np - 1] + __rhs[_Np - 1]; - } -} - -_CCCL_END_NAMESPACE_CUDA_STD_SIMD - -# include - -#endif // _CCCL_HAS_SIMD_F32X2() -#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_INTRINSIC_H diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt index 6e4250695d4..1ef17d67e24 100644 --- a/libcudacxx/test/simd_codegen/CMakeLists.txt +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -29,7 +29,7 @@ function(simd_codegen_add_tests arch) cmake_path(GET test_path FILENAME test_file) cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) - if (test_name STREQUAL "plus_f32" AND NOT arch EQUAL 100) + if (test_name MATCHES "_f32x2$" AND NOT arch EQUAL 100) continue() endif() diff --git a/libcudacxx/test/simd_codegen/decrement_f32x2.cu b/libcudacxx/test/simd_codegen/decrement_f32x2.cu new file mode 100644 index 00000000000..14272c13c0c --- /dev/null +++ b/libcudacxx/test/simd_codegen/decrement_f32x2.cu @@ -0,0 +1,31 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_SIMD_F32X2() + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_decrement_f32_4(const float* in, float* out) +{ + const cuda::std::array values{in[0], in[1], in[2], in[3]}; + + Vec_f32_4 vec(values); + --vec; + + out[0] = vec[0]; + out[1] = vec[1]; + out[2] = vec[2]; + out[3] = vec[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_decrement_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ + +#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/increment_f32x2.cu b/libcudacxx/test/simd_codegen/increment_f32x2.cu new file mode 100644 
index 00000000000..58ffc9e6225 --- /dev/null +++ b/libcudacxx/test/simd_codegen/increment_f32x2.cu @@ -0,0 +1,31 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_SIMD_F32X2() + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_increment_f32_4(const float* in, float* out) +{ + const cuda::std::array values{in[0], in[1], in[2], in[3]}; + + Vec_f32_4 vec(values); + ++vec; + + out[0] = vec[0]; + out[1] = vec[1]; + out[2] = vec[2]; + out[3] = vec[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_increment_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ + +#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/minus_f32x2.cu b/libcudacxx/test/simd_codegen/minus_f32x2.cu new file mode 100644 index 00000000000..8393de408aa --- /dev/null +++ b/libcudacxx/test/simd_codegen/minus_f32x2.cu @@ -0,0 +1,33 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_SIMD_F32X2() + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_minus_f32_4(const float* lhs, const float* rhs, float* out) +{ + const cuda::std::array lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f32_4 lhs_vec(lhs_values); + const Vec_f32_4 rhs_vec(rhs_values); + const Vec_f32_4 result = lhs_vec - rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_minus_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ + +#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/plus_f32.cu b/libcudacxx/test/simd_codegen/plus_f32x2.cu similarity index 100% rename from libcudacxx/test/simd_codegen/plus_f32.cu rename to libcudacxx/test/simd_codegen/plus_f32x2.cu diff --git 
a/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu b/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu new file mode 100644 index 00000000000..d70e7b26b83 --- /dev/null +++ b/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu @@ -0,0 +1,31 @@ +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_SIMD_F32X2() + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_unary_minus_f32_4(const float* in, float* out) +{ + const cuda::std::array values{in[0], in[1], in[2], in[3]}; + + const Vec_f32_4 vec(values); + const Vec_f32_4 result = -vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_unary_minus_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ + +#endif // _CCCL_HAS_SIMD_F32X2() From 6a29aa2c60bf4e53cf05d545a0871f5298033241 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 1 May 2026 14:55:29 -0700 Subject: [PATCH 6/7] address comments --- .../specializations/fixed_size_storage.h | 4 --- .../__simd/specializations/fixed_size_vec.h | 4 +-- .../specializations/fp32x2_intrinsics.h | 33 +++++++++---------- .../test/atomic_codegen/dump_and_check.bash | 10 +++--- libcudacxx/test/simd_codegen/CMakeLists.txt | 12 ++++++- .../test/simd_codegen/decrement_f32x2.cu | 14 +++++--- libcudacxx/test/simd_codegen/fma_bf16.cu | 10 ++++++ libcudacxx/test/simd_codegen/fma_f16.cu | 10 ++++++ .../test/simd_codegen/increment_f32x2.cu | 14 +++++--- libcudacxx/test/simd_codegen/less_bf16.cu | 10 ++++++ libcudacxx/test/simd_codegen/less_f16.cu | 10 ++++++ libcudacxx/test/simd_codegen/minus_f32x2.cu | 14 +++++--- .../test/simd_codegen/multiplies_bf16.cu | 10 ++++++ .../test/simd_codegen/multiplies_f16.cu | 10 ++++++ libcudacxx/test/simd_codegen/plus_bf16.cu | 10 ++++++ libcudacxx/test/simd_codegen/plus_f16.cu | 10 ++++++ libcudacxx/test/simd_codegen/plus_f32x2.cu | 14 +++++--- 
.../test/simd_codegen/unary_minus_f32x2.cu | 14 +++++--- 18 files changed, 164 insertions(+), 49 deletions(-) diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h index fc88e3ff7e2..846142cb62f 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h @@ -44,10 +44,6 @@ struct __simd_storage<_Tp, __fixed_size<_Np>> _Tp __data[_Np]{}; - _CCCL_HIDE_FROM_ABI constexpr __simd_storage() = default; - _CCCL_HIDE_FROM_ABI constexpr __simd_storage(const __simd_storage&) = default; - _CCCL_HIDE_FROM_ABI constexpr __simd_storage& operator=(const __simd_storage&) = default; - [[nodiscard]] _CCCL_API constexpr _Tp __get(const __simd_size_type __idx) const noexcept { _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h index a35d08baf48..16bf2fbd459 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h @@ -80,7 +80,7 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (constexpr _SimdStorage __one = __broadcast(1.0f); - __s = ::cuda::std::simd::__plus_f32x2(__s, __one); + __s = ::cuda::std::simd::__plus_f32x2(__s, __one); return;)) // clang-format on } @@ -103,7 +103,7 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (constexpr _SimdStorage __one = __broadcast(1.0f); - __s = ::cuda::std::simd::__minus_f32x2(__s, __one); + __s = ::cuda::std::simd::__minus_f32x2(__s, __one); return;)) // clang-format on } diff --git 
a/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h b/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h index 8fdd7773c1e..ae80855c662 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h @@ -21,11 +21,10 @@ # pragma system_header #endif // no system header -#if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || (__cccl_ptx_isa >= 860ULL) -# define _CCCL_HAS_SIMD_F32X2() 1 -#else -# define _CCCL_HAS_SIMD_F32X2() 0 -#endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) || (__cccl_ptx_isa >= 860ULL) +#define _CCCL_HAS_SIMD_F32X2_INTRINSICS() (_CCCL_CUDACC_AT_LEAST(12, 8) && _CCCL_HAS_CTK()) +#define _CCCL_HAS_SIMD_F32X2_PTX() (__cccl_ptx_isa >= 860ULL) + +#define _CCCL_HAS_SIMD_F32X2() (_CCCL_HAS_SIMD_F32X2_INTRINSICS() || _CCCL_HAS_SIMD_F32X2_PTX()) #if _CCCL_HAS_SIMD_F32X2() @@ -45,14 +44,14 @@ _CCCL_DEVICE_API inline void __add_f32x2( float& __result1, float& __result2) noexcept { -# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# if _CCCL_HAS_SIMD_F32X2_INTRINSICS() // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); __result1 = __result.x; __result2 = __result.y;)) // clang-format on -# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 +# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6 asm("{.reg .b64 __lhs, __rhs, __result;" "mov.b64 __lhs, {%2, %3};" "mov.b64 __rhs, {%4, %5};" @@ -60,7 +59,7 @@ _CCCL_DEVICE_API inline void __add_f32x2( "mov.b64 {%0, %1}, __result;}" : "=f"(__result1), "=f"(__result2) : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); -# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS() } _CCCL_DEVICE_API inline void __mul_f32x2( @@ -71,14 +70,14 @@ _CCCL_DEVICE_API inline void __mul_f32x2( float& __result1, float& __result2) noexcept { -# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# if 
_CCCL_HAS_SIMD_F32X2_INTRINSICS() // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (const auto __result = ::__fmul2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); __result1 = __result.x; __result2 = __result.y;)) // clang-format on -# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 +# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6 asm("{.reg .b64 __lhs, __rhs, __result;" "mov.b64 __lhs, {%2, %3};" "mov.b64 __rhs, {%4, %5};" @@ -86,7 +85,7 @@ _CCCL_DEVICE_API inline void __mul_f32x2( "mov.b64 {%0, %1}, __result;}" : "=f"(__result1), "=f"(__result2) : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); -# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS() } _CCCL_DEVICE_API inline void __sub_f32x2( @@ -97,14 +96,14 @@ _CCCL_DEVICE_API inline void __sub_f32x2( float& __result1, float& __result2) noexcept { -# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# if _CCCL_HAS_SIMD_F32X2_INTRINSICS() // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{-__rhs1, -__rhs2}); __result1 = __result.x; __result2 = __result.y;)) // clang-format on -# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 +# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6 // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (asm("{.reg .b64 __lhs, __rhs, __result;" @@ -115,7 +114,7 @@ _CCCL_DEVICE_API inline void __sub_f32x2( : "=f"(__result1), "=f"(__result2) : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2));)) // clang-format on -# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS() } _CCCL_DEVICE_API inline void __fma_f32x2( @@ -128,7 +127,7 @@ _CCCL_DEVICE_API inline void __fma_f32x2( float& __result1, float& __result2) noexcept { -# if _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# if _CCCL_HAS_SIMD_F32X2_INTRINSICS() // clang-format off NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (const auto __result = @@ -136,7 +135,7 @@ _CCCL_DEVICE_API inline void 
__fma_f32x2( __result1 = __result.x; __result2 = __result.y;)) // clang-format on -# elif (__cccl_ptx_isa >= 860ULL) // PTX ISA 8.6 +# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6 asm("{.reg .b64 __lhs, __rhs, __add, __result;" "mov.b64 __lhs, {%2, %3};" "mov.b64 __rhs, {%4, %5};" @@ -145,7 +144,7 @@ _CCCL_DEVICE_API inline void __fma_f32x2( "mov.b64 {%0, %1}, __result;}" : "=f"(__result1), "=f"(__result2) : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2), "f"(__add1), "f"(__add2)); -# endif // _CCCL_CUDA_COMPILER(NVCC, >=, 12, 8) +# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS() } template <__simd_size_type _Np> diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash index 51e9978dc57..525c188700e 100755 --- a/libcudacxx/test/atomic_codegen/dump_and_check.bash +++ b/libcudacxx/test/atomic_codegen/dump_and_check.bash @@ -2,10 +2,10 @@ set -euo pipefail ## Usage: dump_and_check test.a test.cu PREFIXES [cuobjdump-mode] -input_archive=$1 -input_testfile=$2 -input_prefix=$3 -dump_mode=${4:---dump-ptx} -filecheck=${FILECHECK:-FileCheck} +input_archive="${1}" +input_testfile="${2}" +input_prefix="${3}" +dump_mode="${4:---dump-ptx}" +filecheck="${FILECHECK:-FileCheck}" cuobjdump "${dump_mode}" "${input_archive}" | "${filecheck}" --match-full-lines --check-prefixes "${input_prefix}" "${input_testfile}" diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt index 1ef17d67e24..ab882c76ef0 100644 --- a/libcudacxx/test/simd_codegen/CMakeLists.txt +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -1,3 +1,13 @@ +##===----------------------------------------------------------------------===## +## +## Part of libcu++ in the CUDA C++ Core Libraries, +## under the Apache License v2.0 with LLVM Exceptions. +## See https://llvm.org/LICENSE.txt for license information. 
+## SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +## SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +## +##===----------------------------------------------------------------------===## + add_custom_target(libcudacxx.test.simd.sass) find_program( @@ -20,7 +30,7 @@ if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") endif() set(simd_codegen_cuda_archs 80 90) -if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.9) +if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) list(APPEND simd_codegen_cuda_archs 100 120) endif() diff --git a/libcudacxx/test/simd_codegen/decrement_f32x2.cu b/libcudacxx/test/simd_codegen/decrement_f32x2.cu index 14272c13c0c..41ec986c9c8 100644 --- a/libcudacxx/test/simd_codegen/decrement_f32x2.cu +++ b/libcudacxx/test/simd_codegen/decrement_f32x2.cu @@ -1,8 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include -#if _CCCL_HAS_SIMD_F32X2() - namespace simd = cuda::std::simd; using Vec_f32_4 = simd::basic_vec>; @@ -27,5 +35,3 @@ extern "C" __global__ void test_operator_decrement_f32_4(const float* in, float* ; SM100: {{.*FADD2.*}} */ - -#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/fma_bf16.cu b/libcudacxx/test/simd_codegen/fma_bf16.cu index e9f9182bd6e..f608f209e09 100644 --- a/libcudacxx/test/simd_codegen/fma_bf16.cu +++ b/libcudacxx/test/simd_codegen/fma_bf16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/fma_f16.cu b/libcudacxx/test/simd_codegen/fma_f16.cu index 901d341fe46..a0dad310ac9 100644 --- a/libcudacxx/test/simd_codegen/fma_f16.cu +++ b/libcudacxx/test/simd_codegen/fma_f16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/increment_f32x2.cu b/libcudacxx/test/simd_codegen/increment_f32x2.cu index 58ffc9e6225..9c696f11e40 100644 --- a/libcudacxx/test/simd_codegen/increment_f32x2.cu +++ b/libcudacxx/test/simd_codegen/increment_f32x2.cu @@ -1,8 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include -#if _CCCL_HAS_SIMD_F32X2() - namespace simd = cuda::std::simd; using Vec_f32_4 = simd::basic_vec>; @@ -27,5 +35,3 @@ extern "C" __global__ void test_operator_increment_f32_4(const float* in, float* ; SM100: {{.*FADD2.*}} */ - -#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/less_bf16.cu b/libcudacxx/test/simd_codegen/less_bf16.cu index 3fc6287a32c..fff63e06d1c 100644 --- a/libcudacxx/test/simd_codegen/less_bf16.cu +++ b/libcudacxx/test/simd_codegen/less_bf16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/less_f16.cu b/libcudacxx/test/simd_codegen/less_f16.cu index 97e5070f5a3..beef111b99e 100644 --- a/libcudacxx/test/simd_codegen/less_f16.cu +++ b/libcudacxx/test/simd_codegen/less_f16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/minus_f32x2.cu b/libcudacxx/test/simd_codegen/minus_f32x2.cu index 8393de408aa..2c00e62b9ee 100644 --- a/libcudacxx/test/simd_codegen/minus_f32x2.cu +++ b/libcudacxx/test/simd_codegen/minus_f32x2.cu @@ -1,8 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include -#if _CCCL_HAS_SIMD_F32X2() - namespace simd = cuda::std::simd; using Vec_f32_4 = simd::basic_vec>; @@ -29,5 +37,3 @@ extern "C" __global__ void test_operator_minus_f32_4(const float* lhs, const flo ; SM100: {{.*FADD2.*}} */ - -#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/multiplies_bf16.cu b/libcudacxx/test/simd_codegen/multiplies_bf16.cu index c66bc326b60..3036c4fd526 100644 --- a/libcudacxx/test/simd_codegen/multiplies_bf16.cu +++ b/libcudacxx/test/simd_codegen/multiplies_bf16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/multiplies_f16.cu b/libcudacxx/test/simd_codegen/multiplies_f16.cu index aa0c32fbe35..bf721009051 100644 --- a/libcudacxx/test/simd_codegen/multiplies_f16.cu +++ b/libcudacxx/test/simd_codegen/multiplies_f16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/plus_bf16.cu b/libcudacxx/test/simd_codegen/plus_bf16.cu index 4492a4edca1..3d192fdad83 100644 --- a/libcudacxx/test/simd_codegen/plus_bf16.cu +++ b/libcudacxx/test/simd_codegen/plus_bf16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/plus_f16.cu b/libcudacxx/test/simd_codegen/plus_f16.cu index 5a56242da89..4a044a19d1c 100644 --- a/libcudacxx/test/simd_codegen/plus_f16.cu +++ b/libcudacxx/test/simd_codegen/plus_f16.cu @@ -1,3 +1,13 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include diff --git a/libcudacxx/test/simd_codegen/plus_f32x2.cu b/libcudacxx/test/simd_codegen/plus_f32x2.cu index b8b11732c55..b0caf15341b 100644 --- a/libcudacxx/test/simd_codegen/plus_f32x2.cu +++ b/libcudacxx/test/simd_codegen/plus_f32x2.cu @@ -1,8 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include -#if _CCCL_HAS_SIMD_F32X2() - namespace simd = cuda::std::simd; using Vec_f32_4 = simd::basic_vec>; @@ -29,5 +37,3 @@ extern "C" __global__ void test_operator_plus_f32_4(const float* lhs, const floa ; SM100: {{.*FADD2.*}} */ - -#endif // _CCCL_HAS_SIMD_F32X2() diff --git a/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu b/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu index d70e7b26b83..02d9c14f18b 100644 --- a/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu +++ b/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu @@ -1,8 +1,16 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
+// +//===----------------------------------------------------------------------===// + #include // IWYU pragma: keep #include -#if _CCCL_HAS_SIMD_F32X2() - namespace simd = cuda::std::simd; using Vec_f32_4 = simd::basic_vec>; @@ -27,5 +35,3 @@ extern "C" __global__ void test_operator_unary_minus_f32_4(const float* in, floa ; SM100: {{.*FADD2.*}} */ - -#endif // _CCCL_HAS_SIMD_F32X2() From 58e44c59e9bbbf30ccace8971516db94c11bd678 Mon Sep 17 00:00:00 2001 From: fbusato Date: Fri, 1 May 2026 17:05:46 -0700 Subject: [PATCH 7/7] improve organization --- libcudacxx/include/cuda/std/__fwd/simd.h | 2 +- .../include/cuda/std/__simd/basic_vec.h | 1 + .../specializations/fixed_size_float_vec.h | 128 ++++++++++++++++++ .../__simd/specializations/fixed_size_vec.h | 98 +++----------- .../test/atomic_codegen/dump_and_check.bash | 2 +- libcudacxx/test/simd_codegen/CMakeLists.txt | 41 ++++-- 6 files changed, 182 insertions(+), 90 deletions(-) create mode 100644 libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h diff --git a/libcudacxx/include/cuda/std/__fwd/simd.h b/libcudacxx/include/cuda/std/__fwd/simd.h index 282d5288914..9619f0c8998 100644 --- a/libcudacxx/include/cuda/std/__fwd/simd.h +++ b/libcudacxx/include/cuda/std/__fwd/simd.h @@ -46,7 +46,7 @@ using mask = basic_mask>; template struct __simd_storage; -template +template struct __simd_operations; template diff --git a/libcudacxx/include/cuda/std/__simd/basic_vec.h b/libcudacxx/include/cuda/std/__simd/basic_vec.h index 8bda465ed17..f5f1459ecf3 100644 --- a/libcudacxx/include/cuda/std/__simd/basic_vec.h +++ b/libcudacxx/include/cuda/std/__simd/basic_vec.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h new file mode 100644 index 00000000000..b5378fb21e1 --- /dev/null +++ 
b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h @@ -0,0 +1,128 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_FLOAT_VEC_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_FLOAT_VEC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +// Simd operations for fixed_size ABI with float elements and F32x2 fast paths. 
+template <__simd_size_type _Np> +struct __simd_operations, enable_if_t<__is_fixed_size_float_v>> + : __fixed_size_operations +{ + using __base = __fixed_size_operations; + using _SimdStorage = __simd_storage>; + + _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __one = __base::__broadcast(1.0f); + __s = ::cuda::std::simd::__plus_f32x2(__s, __one); + return;)) + // clang-format on + } +#endif // _CCCL_HAS_SIMD_F32X2() + __base::__increment(__s); + } + + _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __one = __base::__broadcast(1.0f); + __s = ::cuda::std::simd::__minus_f32x2(__s, __one); + return;)) + // clang-format on + } +#endif // _CCCL_HAS_SIMD_F32X2() + __base::__decrement(__s); + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __zero = __base::__broadcast(0.0f); + return ::cuda::std::simd::__minus_f32x2(__zero, __s);)) + // clang-format on + } +#endif // _CCCL_HAS_SIMD_F32X2() + return __base::__unary_minus(__s); + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__plus_f32x2(__lhs, __rhs);)) + } +#endif // _CCCL_HAS_SIMD_F32X2() + return __base::__plus(__lhs, __rhs); + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __minus(const _SimdStorage& __lhs, const _SimdStorage& 
__rhs) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__minus_f32x2(__lhs, __rhs);)) + } +#endif // _CCCL_HAS_SIMD_F32X2() + return __base::__minus(__lhs, __rhs); + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__multiplies_f32x2(__lhs, __rhs);)) + } +#endif // _CCCL_HAS_SIMD_F32X2() + return __base::__multiplies(__lhs, __rhs); + } +}; + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_FLOAT_VEC_H diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h index 16bf2fbd459..703d1617eb7 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h @@ -22,9 +22,11 @@ #endif // no system header #include +#include #include -#include +#include #include +#include #include #include @@ -33,7 +35,10 @@ _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD // Simd operations for fixed_size ABI template -struct __simd_operations<_Tp, __fixed_size<_Np>> +inline constexpr bool __is_fixed_size_float_v = is_same_v<_Tp, float> && _Np >= 2; + +template +struct __fixed_size_operations { using _SimdStorage = __simd_storage<_Tp, __fixed_size<_Np>>; using _MaskStorage = __mask_storage>; @@ -58,7 +63,7 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> ((__result.__data[_Is] = __g(integral_constant<__simd_size_type, _Is>())), ...); return __result; #else // ^^^ C++20 ^^^ / vvv C++17 vvv - return _SimdStorage{{ __g(integral_constant<__simd_size_type, _Is>())... 
}}; + return _SimdStorage{{__g(integral_constant<__simd_size_type, _Is>())...}}; #endif // _CCCL_STD_VER < 2020 } @@ -72,20 +77,6 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept { -#if _CCCL_HAS_SIMD_F32X2() - _CCCL_IF_NOT_CONSTEVAL_DEFAULT - { - if constexpr (is_same_v<_Tp, float> && _Np >= 2) - { - // clang-format off - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, - (constexpr _SimdStorage __one = __broadcast(1.0f); - __s = ::cuda::std::simd::__plus_f32x2(__s, __one); - return;)) - // clang-format on - } - } -#endif // _CCCL_HAS_SIMD_F32X2() _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) { @@ -95,20 +86,6 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept { -#if _CCCL_HAS_SIMD_F32X2() - _CCCL_IF_NOT_CONSTEVAL_DEFAULT - { - if constexpr (is_same_v<_Tp, float> && _Np >= 2) - { - // clang-format off - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, - (constexpr _SimdStorage __one = __broadcast(1.0f); - __s = ::cuda::std::simd::__minus_f32x2(__s, __one); - return;)) - // clang-format on - } - } -#endif // _CCCL_HAS_SIMD_F32X2() _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) { @@ -142,19 +119,6 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> _CCCL_DIAG_SUPPRESS_MSVC(4146) // unary minus applied to unsigned type [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept { -#if _CCCL_HAS_SIMD_F32X2() - _CCCL_IF_NOT_CONSTEVAL_DEFAULT - { - if constexpr (is_same_v<_Tp, float> && _Np >= 2) - { - // clang-format off - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, - (constexpr _SimdStorage __zero = __broadcast(0.0f); - return ::cuda::std::simd::__minus_f32x2(__zero, __s);)) - // clang-format on - } - } -#endif // _CCCL_HAS_SIMD_F32X2() _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) @@ -170,38 +134,18 @@ 
struct __simd_operations<_Tp, __fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { -#if _CCCL_HAS_SIMD_F32X2() - _CCCL_IF_NOT_CONSTEVAL_DEFAULT - { - if constexpr (is_same_v<_Tp, float> && _Np >= 2) - { - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__plus_f32x2(__lhs, __rhs);)) - } - } -#endif // _CCCL_HAS_SIMD_F32X2() + _SimdStorage __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < _Np; ++__i) { - _SimdStorage __result; - _CCCL_PRAGMA_UNROLL_FULL() - for (__simd_size_type __i = 0; __i < _Np; ++__i) - { - __result.__data[__i] = (__lhs.__data[__i] + __rhs.__data[__i]); - } - return __result; + __result.__data[__i] = (__lhs.__data[__i] + __rhs.__data[__i]); } + return __result; } [[nodiscard]] _CCCL_API static constexpr _SimdStorage __minus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { -#if _CCCL_HAS_SIMD_F32X2() - _CCCL_IF_NOT_CONSTEVAL_DEFAULT - { - if constexpr (is_same_v<_Tp, float> && _Np >= 2) - { - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__minus_f32x2(__lhs, __rhs);)) - } - } -#endif // _CCCL_HAS_SIMD_F32X2() _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) @@ -214,15 +158,6 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> [[nodiscard]] _CCCL_API static constexpr _SimdStorage __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept { -#if _CCCL_HAS_SIMD_F32X2() - _CCCL_IF_NOT_CONSTEVAL_DEFAULT - { - if constexpr (is_same_v<_Tp, float> && _Np >= 2) - { - NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__multiplies_f32x2(__lhs, __rhs);)) - } - } -#endif // _CCCL_HAS_SIMD_F32X2() _SimdStorage __result; _CCCL_PRAGMA_UNROLL_FULL() for (__simd_size_type __i = 0; __i < _Np; ++__i) @@ -392,6 +327,13 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> return __result; } }; + +// Default path (no 
optimizations) +template +struct __simd_operations<_Tp, __fixed_size<_Np>, enable_if_t>> + : __fixed_size_operations<_Tp, _Np> +{}; + _CCCL_END_NAMESPACE_CUDA_STD_SIMD #include diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash index 525c188700e..326fe0dd186 100755 --- a/libcudacxx/test/atomic_codegen/dump_and_check.bash +++ b/libcudacxx/test/atomic_codegen/dump_and_check.bash @@ -8,4 +8,4 @@ input_prefix="${3}" dump_mode="${4:---dump-ptx}" filecheck="${FILECHECK:-FileCheck}" -cuobjdump "${dump_mode}" "${input_archive}" | "${filecheck}" --match-full-lines --check-prefixes "${input_prefix}" "${input_testfile}" +cuobjdump "${dump_mode}" "${input_archive}" | "${filecheck}" --match-full-lines --check-prefixes="${input_prefix}" "${input_testfile}" diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt index ab882c76ef0..7a82933ba97 100644 --- a/libcudacxx/test/simd_codegen/CMakeLists.txt +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -34,17 +34,38 @@ if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) list(APPEND simd_codegen_cuda_archs 100 120) endif() -function(simd_codegen_add_tests arch) - foreach (test_path IN LISTS libcudacxx_simd_codegen_tests) - cmake_path(GET test_path FILENAME test_file) - cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) +function(simd_codegen_add_tests test_path) + cmake_path(GET test_path FILENAME test_file) + cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) - if (test_name MATCHES "_f32x2$" AND NOT arch EQUAL 100) - continue() - endif() + file(READ "${test_path}" test_contents) + string( + REGEX MATCH + "SM[0-9][0-9]*" + has_arch_specific_prefix + "${test_contents}" + ) + + # search for SMXX prefixes in the test contents + set(test_archs) + if (has_arch_specific_prefix) + foreach (arch IN LISTS simd_codegen_cuda_archs) + string(FIND "${test_contents}" "SM${arch}" 
arch_specific_prefix) + if (NOT arch_specific_prefix EQUAL -1) + list(APPEND test_archs "${arch}") + endif() + endforeach() + else() + set(test_archs ${simd_codegen_cuda_archs}) + endif() + # Run tests for each architecture specified in the test file + foreach (arch IN LISTS test_archs) set(target_name "simd_codegen_sm${arch}_${test_name}") - set(check_prefixes "SMXX,SM${arch}") + set(check_prefixes "SMXX") + if (has_arch_specific_prefix) + string(APPEND check_prefixes ",SM${arch}") + endif() add_library(${target_name} STATIC "${test_path}") @@ -77,6 +98,6 @@ function(simd_codegen_add_tests arch) endforeach() endfunction() -foreach (arch IN LISTS simd_codegen_cuda_archs) - simd_codegen_add_tests(${arch}) +foreach (test_path IN LISTS libcudacxx_simd_codegen_tests) + simd_codegen_add_tests("${test_path}") endforeach()