diff --git a/libcudacxx/include/cuda/std/__fwd/simd.h b/libcudacxx/include/cuda/std/__fwd/simd.h index 282d5288914..9619f0c8998 100644 --- a/libcudacxx/include/cuda/std/__fwd/simd.h +++ b/libcudacxx/include/cuda/std/__fwd/simd.h @@ -46,7 +46,7 @@ using mask = basic_mask>; template struct __simd_storage; -template +template struct __simd_operations; template diff --git a/libcudacxx/include/cuda/std/__simd/basic_vec.h b/libcudacxx/include/cuda/std/__simd/basic_vec.h index 8bda465ed17..f5f1459ecf3 100644 --- a/libcudacxx/include/cuda/std/__simd/basic_vec.h +++ b/libcudacxx/include/cuda/std/__simd/basic_vec.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h new file mode 100644 index 00000000000..b5378fb21e1 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_float_vec.h @@ -0,0 +1,128 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_FLOAT_VEC_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_FLOAT_VEC_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +// Simd operations for fixed_size ABI with float elements and F32x2 fast paths. +template <__simd_size_type _Np> +struct __simd_operations, enable_if_t<__is_fixed_size_float_v>> + : __fixed_size_operations +{ + using __base = __fixed_size_operations; + using _SimdStorage = __simd_storage>; + + _CCCL_API static constexpr void __increment(_SimdStorage& __s) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __one = __base::__broadcast(1.0f); + __s = ::cuda::std::simd::__plus_f32x2(__s, __one); + return;)) + // clang-format on + } +#endif // _CCCL_HAS_SIMD_F32X2() + __base::__increment(__s); + } + + _CCCL_API static constexpr void __decrement(_SimdStorage& __s) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __one = __base::__broadcast(1.0f); + __s = ::cuda::std::simd::__minus_f32x2(__s, __one); + return;)) + // clang-format on + } +#endif // _CCCL_HAS_SIMD_F32X2() + __base::__decrement(__s); + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage __unary_minus(const _SimdStorage& __s) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (constexpr _SimdStorage __zero = __base::__broadcast(0.0f); + return ::cuda::std::simd::__minus_f32x2(__zero, __s);)) + // clang-format on + } +#endif // _CCCL_HAS_SIMD_F32X2() + return __base::__unary_minus(__s); + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __plus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__plus_f32x2(__lhs, __rhs);)) + } +#endif // _CCCL_HAS_SIMD_F32X2() + return __base::__plus(__lhs, __rhs); + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __minus(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__minus_f32x2(__lhs, __rhs);)) + } +#endif // _CCCL_HAS_SIMD_F32X2() + return __base::__minus(__lhs, __rhs); + } + + [[nodiscard]] _CCCL_API static constexpr _SimdStorage + __multiplies(const _SimdStorage& __lhs, const _SimdStorage& __rhs) noexcept + { +#if _CCCL_HAS_SIMD_F32X2() + _CCCL_IF_NOT_CONSTEVAL_DEFAULT + { + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, (return ::cuda::std::simd::__multiplies_f32x2(__lhs, __rhs);)) + } +#endif // _CCCL_HAS_SIMD_F32X2() + return __base::__multiplies(__lhs, __rhs); + } +}; + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_FLOAT_VEC_H diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h index a60aa034bbc..ca07cee583b 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_mask.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h new file mode 100644 index 00000000000..846142cb62f --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_storage.h @@ -0,0 +1,64 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_STORAGE_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_STORAGE_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include +#include + +#include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +template <__simd_size_type _Np> +struct __fixed_size +{ + static_assert(_Np > 0, "_Np must be greater than 0"); + + static constexpr __simd_size_type __simd_size = _Np; +}; + +// Element-per-slot simd storage for fixed_size ABI +template +struct __simd_storage<_Tp, __fixed_size<_Np>> +{ + using value_type = _Tp; + + _Tp __data[_Np]{}; + + [[nodiscard]] _CCCL_API constexpr _Tp __get(const __simd_size_type __idx) const noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + return __data[__idx]; + } + + _CCCL_API constexpr void __set(const __simd_size_type __idx, const _Tp __v) noexcept + { + _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); + __data[__idx] = __v; + } +}; + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +#include + +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FIXED_SIZE_STORAGE_H diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h index 0dc5576d935..703d1617eb7 100644 --- a/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h +++ b/libcudacxx/include/cuda/std/__simd/specializations/fixed_size_vec.h @@ -21,51 +21,24 @@ # pragma system_header #endif // no system header -#include #include +#include +#include +#include #include +#include #include #include _CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD -template <__simd_size_type _Np> -struct __fixed_size -{ - static_assert(_Np > 0, "_Np must be greater than 0"); - - static constexpr __simd_size_type __simd_size = _Np; -}; - -// Element-per-slot simd storage for fixed_size ABI +// Simd operations for fixed_size ABI template -struct __simd_storage<_Tp, __fixed_size<_Np>> -{ - using value_type = _Tp; - - _Tp __data[_Np]{}; - - _CCCL_HIDE_FROM_ABI constexpr __simd_storage() = default; - _CCCL_HIDE_FROM_ABI constexpr __simd_storage(const __simd_storage&) = default; - _CCCL_HIDE_FROM_ABI constexpr __simd_storage& operator=(const __simd_storage&) = default; +inline constexpr bool __is_fixed_size_float_v = is_same_v<_Tp, float> && _Np >= 2; - [[nodiscard]] _CCCL_API constexpr _Tp __get(const __simd_size_type __idx) const noexcept - { - _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); - return __data[__idx]; - } - - _CCCL_API constexpr void __set(const __simd_size_type __idx, const _Tp __v) noexcept - { - _CCCL_ASSERT(::cuda::in_range(__idx, __simd_size_type{0}, _Np), "Index is out of bounds"); - __data[__idx] = __v; - } -}; - -// Simd operations for fixed_size ABI template -struct __simd_operations<_Tp, __fixed_size<_Np>> +struct __fixed_size_operations { using _SimdStorage = __simd_storage<_Tp, __fixed_size<_Np>>; using _MaskStorage = __mask_storage>; @@ -90,7 +63,7 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> ((__result.__data[_Is] = __g(integral_constant<__simd_size_type, _Is>())), ...); return __result; #else // ^^^ C++20 ^^^ / vvv C++17 vvv - return _SimdStorage{{ __g(integral_constant<__simd_size_type, _Is>())... }}; + return _SimdStorage{{__g(integral_constant<__simd_size_type, _Is>())...}}; #endif // _CCCL_STD_VER < 2020 } @@ -354,6 +327,13 @@ struct __simd_operations<_Tp, __fixed_size<_Np>> return __result; } }; + +// Default path (no optimizations) +template +struct __simd_operations<_Tp, __fixed_size<_Np>, enable_if_t>> + : __fixed_size_operations<_Tp, _Np> +{}; + _CCCL_END_NAMESPACE_CUDA_STD_SIMD #include diff --git a/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h b/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h new file mode 100644 index 00000000000..ae80855c662 --- /dev/null +++ b/libcudacxx/include/cuda/std/__simd/specializations/fp32x2_intrinsics.h @@ -0,0 +1,254 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_STD___SIMD_SPECIALIZATIONS_FP32X2_INTRINSICS_H +#define _CUDA_STD___SIMD_SPECIALIZATIONS_FP32X2_INTRINSICS_H + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#define _CCCL_HAS_SIMD_F32X2_INTRINSICS() (_CCCL_CUDACC_AT_LEAST(12, 8) && _CCCL_HAS_CTK()) +#define _CCCL_HAS_SIMD_F32X2_PTX() (__cccl_ptx_isa >= 860ULL) + +#define _CCCL_HAS_SIMD_F32X2() (_CCCL_HAS_SIMD_F32X2_INTRINSICS() || _CCCL_HAS_SIMD_F32X2_PTX()) + +#if _CCCL_HAS_SIMD_F32X2() + +# include + +# include + +# include + +_CCCL_BEGIN_NAMESPACE_CUDA_STD_SIMD + +_CCCL_DEVICE_API inline void __add_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + float& __result1, + float& __result2) noexcept +{ +# if _CCCL_HAS_SIMD_F32X2_INTRINSICS() + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on +# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6 + asm("{.reg .b64 __lhs, __rhs, __result;" + "mov.b64 __lhs, {%2, %3};" + "mov.b64 __rhs, {%4, %5};" + "add.f32x2 __result, __lhs, __rhs;" + "mov.b64 {%0, %1}, __result;}" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); +# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS() +} + +_CCCL_DEVICE_API inline void __mul_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + float& __result1, + float& __result2) noexcept +{ +# if _CCCL_HAS_SIMD_F32X2_INTRINSICS() + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = ::__fmul2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on +# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6 + asm("{.reg .b64 __lhs, __rhs, __result;" + "mov.b64 __lhs, {%2, %3};" + "mov.b64 __rhs, {%4, %5};" + "mul.f32x2 __result, __lhs, __rhs;" + "mov.b64 {%0, %1}, __result;}" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2)); +# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS() +} + +_CCCL_DEVICE_API inline void __sub_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + float& __result1, + float& __result2) noexcept +{ +# if _CCCL_HAS_SIMD_F32X2_INTRINSICS() + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = ::__fadd2_rn(::float2{__lhs1, __lhs2}, ::float2{-__rhs1, -__rhs2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on +# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6 + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (asm("{.reg .b64 __lhs, __rhs, __result;" + "mov.b64 __lhs, {%2, %3};" + "mov.b64 __rhs, {%4, %5};" + "sub.f32x2 __result, __lhs, __rhs;" + "mov.b64 {%0, %1}, __result;}" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2));)) + // clang-format on +# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS() +} + +_CCCL_DEVICE_API inline void __fma_f32x2( + const float __lhs1, + const float __lhs2, + const float __rhs1, + const float __rhs2, + const float __add1, + const float __add2, + float& __result1, + float& __result2) noexcept +{ +# if _CCCL_HAS_SIMD_F32X2_INTRINSICS() + // clang-format off + NV_IF_TARGET(NV_IS_EXACTLY_SM_100, + (const auto __result = + ::__ffma2_rn(::float2{__lhs1, __lhs2}, ::float2{__rhs1, __rhs2}, ::float2{__add1, __add2}); + __result1 = __result.x; + __result2 = __result.y;)) + // clang-format on +# elif _CCCL_HAS_SIMD_F32X2_PTX() // PTX ISA 8.6 + asm("{.reg .b64 __lhs, __rhs, __add, __result;" + "mov.b64 __lhs, {%2, %3};" + "mov.b64 __rhs, {%4, %5};" + "mov.b64 __add, {%6, %7};" + "fma.rn.f32x2 __result, __lhs, __rhs, __add;" + "mov.b64 {%0, %1}, __result;}" + : "=f"(__result1), "=f"(__result2) + : "f"(__lhs1), "f"(__lhs2), "f"(__rhs1), "f"(__rhs2), "f"(__add1), "f"(__add2)); +# endif // _CCCL_HAS_SIMD_F32X2_INTRINSICS() +} + +template <__simd_size_type _Np> +using __simd_storage_f32 = __simd_storage>; + +template <__simd_size_type _Np> +[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np> +__plus_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept +{ + __simd_storage_f32<_Np> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__add_f32x2( + __lhs.__data[__i], + __lhs.__data[__i + 1], + __rhs.__data[__i], + __rhs.__data[__i + 1], + __result.__data[__i], + __result.__data[__i + 1]); + } + if (_Np % 2 != 0) + { + __result.__data[_Np - 1] = __lhs.__data[_Np - 1] + __rhs.__data[_Np - 1]; + } + return __result; +} + +template <__simd_size_type _Np> +[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np> +__minus_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept +{ + __simd_storage_f32<_Np> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__sub_f32x2( + __lhs.__data[__i], + __lhs.__data[__i + 1], + __rhs.__data[__i], + __rhs.__data[__i + 1], + __result.__data[__i], + __result.__data[__i + 1]); + } + if (_Np % 2 != 0) + { + __result.__data[_Np - 1] = __lhs.__data[_Np - 1] - __rhs.__data[_Np - 1]; + } + return __result; +} + +template <__simd_size_type _Np> +[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np> +__multiplies_f32x2(const __simd_storage_f32<_Np>& __lhs, const __simd_storage_f32<_Np>& __rhs) noexcept +{ + __simd_storage_f32<_Np> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__mul_f32x2( + __lhs.__data[__i], + __lhs.__data[__i + 1], + __rhs.__data[__i], + __rhs.__data[__i + 1], + __result.__data[__i], + __result.__data[__i + 1]); + } + if (_Np % 2 != 0) + { + __result.__data[_Np - 1] = __lhs.__data[_Np - 1] * __rhs.__data[_Np - 1]; + } + return __result; +} + +template <__simd_size_type _Np> +[[nodiscard]] _CCCL_DEVICE_API constexpr __simd_storage_f32<_Np> +__fma_f32x2(const __simd_storage_f32<_Np>& __lhs, + const __simd_storage_f32<_Np>& __rhs, + const __simd_storage_f32<_Np>& __add) noexcept +{ + __simd_storage_f32<_Np> __result; + _CCCL_PRAGMA_UNROLL_FULL() + for (__simd_size_type __i = 0; __i < (_Np / 2) * 2; __i += 2) + { + ::cuda::std::simd::__fma_f32x2( + __lhs.__data[__i], + __lhs.__data[__i + 1], + __rhs.__data[__i], + __rhs.__data[__i + 1], + __add.__data[__i], + __add.__data[__i + 1], + __result.__data[__i], + __result.__data[__i + 1]); + } + if (_Np % 2 != 0) + { + __result.__data[_Np - 1] = __lhs.__data[_Np - 1] * __rhs.__data[_Np - 1] + __add.__data[_Np - 1]; + } + return __result; +} + +_CCCL_END_NAMESPACE_CUDA_STD_SIMD + +# include + +#endif // _CCCL_HAS_SIMD_F32X2() +#endif // _CUDA_STD___SIMD_SPECIALIZATIONS_FP32X2_INTRINSICS_H diff --git a/libcudacxx/test/CMakeLists.txt b/libcudacxx/test/CMakeLists.txt index 039c917fa7d..20fb09610ed 100644 --- a/libcudacxx/test/CMakeLists.txt +++ b/libcudacxx/test/CMakeLists.txt @@ -98,3 +98,4 @@ endif() add_subdirectory(nvtarget) add_subdirectory(atomic_codegen) +add_subdirectory(simd_codegen) diff --git a/libcudacxx/test/atomic_codegen/dump_and_check.bash b/libcudacxx/test/atomic_codegen/dump_and_check.bash index d1c5f16b949..326fe0dd186 100755 --- a/libcudacxx/test/atomic_codegen/dump_and_check.bash +++ b/libcudacxx/test/atomic_codegen/dump_and_check.bash @@ -1,9 +1,11 @@ #!/bin/bash set -euo pipefail -## Usage: dump_and_check test.a test.cu PREFIX -input_archive="$1" -input_testfile="$2" -input_prefix="$3" +## Usage: dump_and_check test.a test.cu PREFIXES [cuobjdump-mode] +input_archive="${1}" +input_testfile="${2}" +input_prefix="${3}" +dump_mode="${4:---dump-ptx}" +filecheck="${FILECHECK:-FileCheck}" -cuobjdump --dump-ptx "$input_archive" | FileCheck --match-full-lines --check-prefix "$input_prefix" "$input_testfile" +cuobjdump "${dump_mode}" "${input_archive}" | "${filecheck}" --match-full-lines --check-prefixes="${input_prefix}" "${input_testfile}" diff --git a/libcudacxx/test/simd_codegen/CMakeLists.txt b/libcudacxx/test/simd_codegen/CMakeLists.txt new file mode 100644 index 00000000000..7a82933ba97 --- /dev/null +++ b/libcudacxx/test/simd_codegen/CMakeLists.txt @@ -0,0 +1,103 @@ +##===----------------------------------------------------------------------===## +## +## Part of libcu++ in the CUDA C++ Core Libraries, +## under the Apache License v2.0 with LLVM Exceptions. +## See https://llvm.org/LICENSE.txt for license information. +## SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +## SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +## +##===----------------------------------------------------------------------===## + +add_custom_target(libcudacxx.test.simd.sass) + +find_program( + filecheck + NAMES FileCheck FileCheck-21 FileCheck-20 FileCheck-19 FileCheck-18 +) + +if (filecheck) + message("-- ${filecheck} found... building simd codegen tests") +else() + return() +endif() + +find_program(cuobjdump "cuobjdump" REQUIRED) +find_program(bash "bash" REQUIRED) + +set(libcudacxx_simd_codegen_tests) +if (NOT "NVHPC" STREQUAL "${CMAKE_CXX_COMPILER_ID}") + file(GLOB libcudacxx_simd_codegen_tests "*.cu") +endif() + +set(simd_codegen_cuda_archs 80 90) +if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) + list(APPEND simd_codegen_cuda_archs 100 120) +endif() + +function(simd_codegen_add_tests test_path) + cmake_path(GET test_path FILENAME test_file) + cmake_path(REMOVE_EXTENSION test_file LAST_ONLY OUTPUT_VARIABLE test_name) + + file(READ "${test_path}" test_contents) + string( + REGEX MATCH + "SM[0-9][0-9]*" + has_arch_specific_prefix + "${test_contents}" + ) + + # search for SMXX prefixes in the test contents + set(test_archs) + if (has_arch_specific_prefix) + foreach (arch IN LISTS simd_codegen_cuda_archs) + string(FIND "${test_contents}" "SM${arch}" arch_specific_prefix) + if (NOT arch_specific_prefix EQUAL -1) + list(APPEND test_archs "${arch}") + endif() + endforeach() + else() + set(test_archs ${simd_codegen_cuda_archs}) + endif() + + # Run tests for each architecture specified in the test file + foreach (arch IN LISTS test_archs) + set(target_name "simd_codegen_sm${arch}_${test_name}") + set(check_prefixes "SMXX") + if (has_arch_specific_prefix) + string(APPEND check_prefixes ",SM${arch}") + endif() + + add_library(${target_name} STATIC "${test_path}") + + set_target_properties( + ${target_name} + PROPERTIES CUDA_ARCHITECTURES "${arch}" + ) + + target_compile_options(${target_name} PRIVATE "-Wno-comment") + + target_include_directories( + ${target_name} + PRIVATE "${libcudacxx_SOURCE_DIR}/include" + ) + add_dependencies(libcudacxx.test.simd.sass ${target_name}) + + add_custom_command( + TARGET libcudacxx.test.simd.sass + POST_BUILD + # gersemi: off + COMMAND + ${CMAKE_COMMAND} -E env "FILECHECK=${filecheck}" + "${CMAKE_CURRENT_SOURCE_DIR}/../atomic_codegen/dump_and_check.bash" + $ + "${test_path}" + "${check_prefixes}" + --dump-sass + # gersemi: on + ) + endforeach() +endfunction() + +foreach (test_path IN LISTS libcudacxx_simd_codegen_tests) + simd_codegen_add_tests("${test_path}") +endforeach() diff --git a/libcudacxx/test/simd_codegen/decrement_f32x2.cu b/libcudacxx/test/simd_codegen/decrement_f32x2.cu new file mode 100644 index 00000000000..41ec986c9c8 --- /dev/null +++ b/libcudacxx/test/simd_codegen/decrement_f32x2.cu @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep +#include + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_decrement_f32_4(const float* in, float* out) +{ + const cuda::std::array values{in[0], in[1], in[2], in[3]}; + + Vec_f32_4 vec(values); + --vec; + + out[0] = vec[0]; + out[1] = vec[1]; + out[2] = vec[2]; + out[3] = vec[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_decrement_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ diff --git a/libcudacxx/test/simd_codegen/fma_bf16.cu b/libcudacxx/test/simd_codegen/fma_bf16.cu new file mode 100644 index 00000000000..f608f209e09 --- /dev/null +++ b/libcudacxx/test/simd_codegen/fma_bf16.cu @@ -0,0 +1,54 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVBF16() + +# include + +namespace simd = cuda::std::simd; + +using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +extern "C" __global__ void +test_fma_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, const __nv_bfloat16* add, __nv_bfloat16* out) +{ + const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> add_values{add[0], add[1], add[2], add[3]}; + + const Vec_bf16_4 lhs_vec(lhs_values); + const Vec_bf16_4 rhs_vec(rhs_values); + const Vec_bf16_4 add_vec(add_values); + const Vec_bf16_4 result = lhs_vec * rhs_vec + add_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_fma_bf16_4 +; SM80: {{.*HFMA2.*BF16.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM100: {{.*HFMA2.*BF16.*}} +; SM100: {{.*HFMA2.*BF16.*}} +; SM120: {{.*HFMA2.*BF16.*}} +; SM120: {{.*HFMA2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/fma_f16.cu b/libcudacxx/test/simd_codegen/fma_f16.cu new file mode 100644 index 00000000000..a0dad310ac9 --- /dev/null +++ b/libcudacxx/test/simd_codegen/fma_f16.cu @@ -0,0 +1,53 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVFP16() + +# include + +namespace simd = cuda::std::simd; + +using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; + +extern "C" __global__ void test_fma_f16_4(const __half* lhs, const __half* rhs, const __half* add, __half* out) +{ + const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + const cuda::std::array<__half, 4> add_values{add[0], add[1], add[2], add[3]}; + + const Vec_f16_4 lhs_vec(lhs_values); + const Vec_f16_4 rhs_vec(rhs_values); + const Vec_f16_4 add_vec(add_values); + const Vec_f16_4 result = lhs_vec * rhs_vec + add_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_fma_f16_4 +; SM80: {{.*HFMA2.*}} +; SM80: {{.*HFMA2.*}} +; SM90: {{.*HFMA2.*}} +; SM90: {{.*HFMA2.*}} +; SM100: {{.*HFMA2.*}} +; SM100: {{.*HFMA2.*}} +; SM120: {{.*HFMA2.*}} +; SM120: {{.*HFMA2.*}} + +*/ + +#endif // _CCCL_HAS_NVFP16() diff --git a/libcudacxx/test/simd_codegen/increment_f32x2.cu b/libcudacxx/test/simd_codegen/increment_f32x2.cu new file mode 100644 index 00000000000..9c696f11e40 --- /dev/null +++ b/libcudacxx/test/simd_codegen/increment_f32x2.cu @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep +#include + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_increment_f32_4(const float* in, float* out) +{ + const cuda::std::array values{in[0], in[1], in[2], in[3]}; + + Vec_f32_4 vec(values); + ++vec; + + out[0] = vec[0]; + out[1] = vec[1]; + out[2] = vec[2]; + out[3] = vec[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_increment_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ diff --git a/libcudacxx/test/simd_codegen/less_bf16.cu b/libcudacxx/test/simd_codegen/less_bf16.cu new file mode 100644 index 00000000000..fff63e06d1c --- /dev/null +++ b/libcudacxx/test/simd_codegen/less_bf16.cu @@ -0,0 +1,53 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVBF16() + +# include + +namespace simd = cuda::std::simd; + +using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +extern "C" __global__ void test_less_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, bool* out) +{ + const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_bf16_4 lhs_vec(lhs_values); + const Vec_bf16_4 rhs_vec(rhs_values); + const auto result = lhs_vec < rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_less_bf16_4 +; SM80: {{.*FSETP\.LT.*}} +; SM80: {{.*FSETP\.LT.*}} +; SM80: {{.*FSETP\.LT.*}} +; SM80: {{.*FSETP\.LT.*}} +; SM90: {{.*HSETP2.*BF16.*}} +; SM90: {{.*HSETP2.*BF16.*}} +; SM100: {{.*HSETP2.*BF16.*}} +; SM100: {{.*HSETP2.*BF16.*}} +; SM120: {{.*HSETP2.*BF16.*}} +; SM120: {{.*HSETP2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/less_f16.cu b/libcudacxx/test/simd_codegen/less_f16.cu new file mode 100644 index 00000000000..beef111b99e --- /dev/null +++ b/libcudacxx/test/simd_codegen/less_f16.cu @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVFP16() + +# include + +namespace simd = cuda::std::simd; + +using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; + +extern "C" __global__ void test_less_f16_4(const __half* lhs, const __half* rhs, bool* out) +{ + const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f16_4 lhs_vec(lhs_values); + const Vec_f16_4 rhs_vec(rhs_values); + const auto result = lhs_vec < rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_less_f16_4 +; SM80: {{.*HSETP2.*}} +; SM80: {{.*HSETP2.*}} +; SM90: {{.*HSETP2.*}} +; SM90: {{.*HSETP2.*}} +; SM100: {{.*HSETP2.*}} +; SM100: {{.*HSETP2.*}} +; SM120: {{.*HSETP2.*}} +; SM120: {{.*HSETP2.*}} + +*/ + +#endif // _CCCL_HAS_NVFP16() diff --git a/libcudacxx/test/simd_codegen/minus_f32x2.cu b/libcudacxx/test/simd_codegen/minus_f32x2.cu new file mode 100644 index 00000000000..2c00e62b9ee --- /dev/null +++ b/libcudacxx/test/simd_codegen/minus_f32x2.cu @@ -0,0 +1,39 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep +#include + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_minus_f32_4(const float* lhs, const float* rhs, float* out) +{ + const cuda::std::array lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f32_4 lhs_vec(lhs_values); + const Vec_f32_4 rhs_vec(rhs_values); + const Vec_f32_4 result = lhs_vec - rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_minus_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ diff --git a/libcudacxx/test/simd_codegen/multiplies_bf16.cu b/libcudacxx/test/simd_codegen/multiplies_bf16.cu new file mode 100644 index 00000000000..3036c4fd526 --- /dev/null +++ b/libcudacxx/test/simd_codegen/multiplies_bf16.cu @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVBF16() + +# include + +namespace simd = cuda::std::simd; + +using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +extern "C" __global__ void +test_operator_multiplies_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, __nv_bfloat16* out) +{ + const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_bf16_4 lhs_vec(lhs_values); + const Vec_bf16_4 rhs_vec(rhs_values); + const Vec_bf16_4 result = lhs_vec * rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_multiplies_bf16_4 +; SM80: {{.*HFMA2.*BF16.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HMUL2.*BF16.*}} +; SM100: {{.*HFMA2.*BF16.*}} +; SM100: {{.*HMUL2.*BF16.*}} +; SM120: {{.*HFMA2.*BF16.*}} +; SM120: {{.*HMUL2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/multiplies_f16.cu b/libcudacxx/test/simd_codegen/multiplies_f16.cu new file mode 100644 index 00000000000..bf721009051 --- /dev/null +++ b/libcudacxx/test/simd_codegen/multiplies_f16.cu @@ -0,0 +1,51 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVFP16() + +# include + +namespace simd = cuda::std::simd; + +using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; + +extern "C" __global__ void test_operator_multiplies_f16_4(const __half* lhs, const __half* rhs, __half* out) +{ + const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f16_4 lhs_vec(lhs_values); + const Vec_f16_4 rhs_vec(rhs_values); + const Vec_f16_4 result = lhs_vec * rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_multiplies_f16_4 +; SM80: {{.*(HMUL2|HFMA2).*}} +; SM80: {{.*(HMUL2|HFMA2).*}} +; SM90: {{.*(HMUL2|HFMA2).*}} +; SM90: {{.*(HMUL2|HFMA2).*}} +; SM100: {{.*(HMUL2|HFMA2).*}} +; SM100: {{.*(HMUL2|HFMA2).*}} +; SM120: {{.*(HMUL2|HFMA2).*}} +; SM120: {{.*(HMUL2|HFMA2).*}} + +*/ + +#endif // _CCCL_HAS_NVFP16() diff --git a/libcudacxx/test/simd_codegen/plus_bf16.cu b/libcudacxx/test/simd_codegen/plus_bf16.cu new file mode 100644 index 00000000000..3d192fdad83 --- /dev/null +++ b/libcudacxx/test/simd_codegen/plus_bf16.cu @@ -0,0 +1,52 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVBF16() + +# include + +namespace simd = cuda::std::simd; + +using Vec_bf16_4 = simd::basic_vec<__nv_bfloat16, simd::fixed_size<4>>; + +extern "C" __global__ void +test_operator_plus_bf16_4(const __nv_bfloat16* lhs, const __nv_bfloat16* rhs, __nv_bfloat16* out) +{ + const cuda::std::array<__nv_bfloat16, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__nv_bfloat16, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_bf16_4 lhs_vec(lhs_values); + const Vec_bf16_4 rhs_vec(rhs_values); + const Vec_bf16_4 result = lhs_vec + rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_bf16_4 +; SM80: {{.*HFMA2.*BF16.*}} +; SM80: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HFMA2.*BF16.*}} +; SM90: {{.*HADD2.*BF16.*}} +; SM100: {{.*HFMA2.*BF16.*}} +; SM100: {{.*HADD2.*BF16.*}} +; SM120: {{.*HFMA2.*BF16.*}} +; SM120: {{.*HADD2.*BF16.*}} + +*/ + +#endif // _CCCL_HAS_NVBF16() diff --git a/libcudacxx/test/simd_codegen/plus_f16.cu b/libcudacxx/test/simd_codegen/plus_f16.cu new file mode 100644 index 00000000000..4a044a19d1c --- /dev/null +++ b/libcudacxx/test/simd_codegen/plus_f16.cu @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep +#include + +#if _CCCL_HAS_NVFP16() + +# include + +namespace simd = cuda::std::simd; + +using Vec_f16_4 = simd::basic_vec<__half, simd::fixed_size<4>>; + +extern "C" __global__ void test_operator_plus_f16_4(const __half* lhs, const __half* rhs, __half* out) +{ + const cuda::std::array<__half, 4> lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array<__half, 4> rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f16_4 lhs_vec(lhs_values); + const Vec_f16_4 rhs_vec(rhs_values); + const Vec_f16_4 result = lhs_vec + rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_f16_4 +; SMXX: {{.*(HADD2|HFMA2).*}} +; SMXX: {{.*(HADD2|HFMA2).*}} + +*/ + +#endif // _CCCL_HAS_NVFP16() diff --git a/libcudacxx/test/simd_codegen/plus_f32x2.cu b/libcudacxx/test/simd_codegen/plus_f32x2.cu new file mode 100644 index 00000000000..b0caf15341b --- /dev/null +++ b/libcudacxx/test/simd_codegen/plus_f32x2.cu @@ -0,0 +1,39 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep +#include + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_plus_f32_4(const float* lhs, const float* rhs, float* out) +{ + const cuda::std::array lhs_values{lhs[0], lhs[1], lhs[2], lhs[3]}; + const cuda::std::array rhs_values{rhs[0], rhs[1], rhs[2], rhs[3]}; + + const Vec_f32_4 lhs_vec(lhs_values); + const Vec_f32_4 rhs_vec(rhs_values); + const Vec_f32_4 result = lhs_vec + rhs_vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_plus_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/ diff --git a/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu b/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu new file mode 100644 index 00000000000..02d9c14f18b --- /dev/null +++ b/libcudacxx/test/simd_codegen/unary_minus_f32x2.cu @@ -0,0 +1,37 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++ in the CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include // IWYU pragma: keep +#include + +namespace simd = cuda::std::simd; + +using Vec_f32_4 = simd::basic_vec>; + +extern "C" __global__ void test_operator_unary_minus_f32_4(const float* in, float* out) +{ + const cuda::std::array values{in[0], in[1], in[2], in[3]}; + + const Vec_f32_4 vec(values); + const Vec_f32_4 result = -vec; + + out[0] = result[0]; + out[1] = result[1]; + out[2] = result[2]; + out[3] = result[3]; +} + +/* + +; SMXX-LABEL: {{[[:space:]]*}}Function : test_operator_unary_minus_f32_4 +; SM100: {{.*FADD2.*}} +; SM100: {{.*FADD2.*}} + +*/