From 21d763db127900ef0d84c08033d54d54e546760f Mon Sep 17 00:00:00 2001
From: "zhongxiao.yzx" <zhongxiao.yzx@alibaba-inc.com>
Date: Thu, 18 Feb 2021 09:50:03 +0800
Subject: [PATCH] [simd] add gather/scatter methods for simd (#32929292)

Summary:
gather and scatter methods of simd have not been supported; those need to
be implemented.

Test Plan: make test

Reviewers: chengbin.cb, liangbin.mj, yifeng.dongyifeng, longfei.alf, chuanqi.xcq

Issue: https://aone.alibaba-inc.com/req/32929292

CR: https://code.aone.alibaba-inc.com/cpp_libs/std-simd/codereview/4783624
---
 experimental/bits/simd.h            | 51 +++++++++++++++++++++++++
 experimental/bits/simd_builtin.h    | 59 +++++++++++++++++++++++++++--
 experimental/bits/simd_fixed_size.h | 59 +++++++++++++++++++++++++++++
 experimental/bits/simd_scalar.h     | 36 ++++++++++++++++++
 tests/simd.cpp                      | 45 ++++++++++++++++++++++
 5 files changed, 246 insertions(+), 4 deletions(-)

diff --git a/experimental/bits/simd.h b/experimental/bits/simd.h
index 4c893114f..9eb59c9dc 100644
--- a/experimental/bits/simd.h
+++ b/experimental/bits/simd.h
@@ -32,6 +32,7 @@
 #include "simd_detail.h"
 #include "numeric_traits.h"
+#include <array>
 #include <bitset>
 #include <cstring>
 #ifdef _GLIBCXX_DEBUG_UB
@@ -4966,6 +4967,15 @@ template <typename _Tp, typename _Abi>
           _Impl::_S_load(_Flags::template _S_apply<simd>(__mem), _S_type_tag))
       {}

+    // gather constructor
+    template <typename _Up, typename _Flags>
+      _GLIBCXX_SIMD_ALWAYS_INLINE
+      simd(const _Up* __mem, const __int_for_sizeof_t<_Up>* __idx, _Flags)
+      : _M_data(
+          _Impl::_S_gather(_Flags::template _S_apply<simd>(__mem),
+                           __idx, _S_type_tag))
+      {}
+
     // loads [simd.load]
     template <typename _Up, typename _Flags>
       _GLIBCXX_SIMD_ALWAYS_INLINE void
@@ -4984,6 +4994,47 @@ template <typename _Tp, typename _Abi>
                        _S_type_tag);
       }

+    // gather [simd.gather]
+    template <typename _Up, typename _Flags>
+      _GLIBCXX_SIMD_ALWAYS_INLINE void
+      gather(const _Vectorizable<_Up>* __mem,
+             const std::array<__int_for_sizeof_t<_Up>, size()>& __idx, _Flags)
+      {
+        _M_data = static_cast<decltype(_M_data)>(
+          _Impl::_S_gather(_Flags::template _S_apply<simd>(__mem), __idx.data(),
+                           _S_type_tag));
+      }
+
+    template <typename _Up, typename _Flags>
+      _GLIBCXX_SIMD_ALWAYS_INLINE void
+      gather(const _Vectorizable<_Up>* __mem,
+             const __int_for_sizeof_t<_Up>* __idx, _Flags)
+      {
+        _M_data = static_cast<decltype(_M_data)>(
+          _Impl::_S_gather(_Flags::template _S_apply<simd>(__mem), __idx,
+                           _S_type_tag));
+      }
+
+    // scatter [simd.scatter]
+    template <typename _Up, typename _Flags>
+      _GLIBCXX_SIMD_ALWAYS_INLINE void
+      scatter(_Vectorizable<_Up>* __mem,
+              std::array<__int_for_sizeof_t<_Up>, size()>& __idx,
+              _Flags) const
+      {
+        _Impl::_S_scatter(_M_data, _Flags::template _S_apply<simd>(__mem),
+                          __idx.data(), _S_type_tag);
+      }
+
+    // scatter [simd.scatter]
+    template <typename _Up, typename _Flags>
+      _GLIBCXX_SIMD_ALWAYS_INLINE void
+      scatter(_Vectorizable<_Up>* __mem, const __int_for_sizeof_t<_Up>* __idx,
+              _Flags) const
+      {
+        _Impl::_S_scatter(_M_data, _Flags::template _S_apply<simd>(__mem),
+                          __idx, _S_type_tag);
+      }
+
     // scalar access
     _GLIBCXX_SIMD_ALWAYS_INLINE _GLIBCXX_SIMD_CONSTEXPR reference
     operator[](size_t __i)
diff --git a/experimental/bits/simd_builtin.h b/experimental/bits/simd_builtin.h
index f048c07e6..1e7937c38 100644
--- a/experimental/bits/simd_builtin.h
+++ b/experimental/bits/simd_builtin.h
@@ -1431,6 +1431,54 @@ template <typename _Abi>
           });
       }

+    // _S_gather {{{2
+    template <typename _Tp, typename _Up>
+      _GLIBCXX_SIMD_INTRINSIC static _SimdMember<_Tp>
+      _S_gather(const _Up* __mem, const __int_for_sizeof_t<_Up>* __idx,
+                _TypeTag<_Tp>) noexcept
+      {
+        constexpr size_t _Np = _S_size<_Tp>;
+        return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>([&](
+          auto __i) constexpr {
+          return static_cast<_Tp>(__i < _Np ? __mem[__idx[__i]] : 0);
+        });
+      }
+
+    // _S_gather
+    template <typename _Tp, typename _Up>
+      _GLIBCXX_SIMD_INTRINSIC static _SimdMember<_Tp>
+      _S_gather(const _Up* __mem, const _SimdMember<_Tp>& __idx,
+                _TypeTag<_Tp>) noexcept
+      {
+        constexpr size_t _Np = _S_size<_Tp>;
+        return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>([&](
+          auto __i) constexpr {
+          return static_cast<_Tp>(__i < _Np ? __mem[__idx[__i]] : 0);
+        });
+      } // }}}
+
+    // _S_scatter {{{2
+    template <typename _Tp, typename _Up>
+      _GLIBCXX_SIMD_INTRINSIC static void
+      _S_scatter(_SimdMember<_Tp> __v, _Up* __mem,
+                 const __int_for_sizeof_t<_Up>* __idx, _TypeTag<_Tp>) noexcept
+      {
+        constexpr size_t _Np = _S_size<_Tp>;
+        __execute_n_times<_Np>([&](auto __i) constexpr {
+          __mem[__idx[__i]] = static_cast<_Up>(__v[__i]);
+        });
+      }
+
+    // _S_scatter
+    template <typename _Tp, typename _Up>
+      _GLIBCXX_SIMD_INTRINSIC static void
+      _S_scatter(_SimdMember<_Tp> __v, _Up* __mem,
+                 const _SimdMember<_Tp>& __idx, _TypeTag<_Tp>) noexcept
+      {
+        constexpr size_t _Np = _S_size<_Tp>;
+        __execute_n_times<_Np>([&](auto __i) constexpr {
+          __mem[__idx[__i]] = static_cast<_Up>(__v[__i]);
+        });
+      } // }}}
+
     // _S_load {{{2
     template <typename _Tp, typename _Up>
       _GLIBCXX_SIMD_INTRINSIC static _SimdMember<_Tp>
@@ -2284,7 +2332,8 @@ template <typename _Abi>
           const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
           const auto __maxn
             = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
-          return __absn <= __maxn;
+          return _MaskImpl::template _S_convert<_Tp>(
+            _MaskImpl::_S_to_bits(__as_wrapper<_Np>(__absn <= __maxn)));
 #endif
       }

@@ -2342,11 +2391,13 @@ template <typename _Abi>
           const auto __minn
             = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__norm_min_v<_Tp>));
 #if __FINITE_MATH_ONLY__
-          return __absn >= __minn;
+          return _MaskImpl::template _S_convert<_Tp>(
+            _MaskImpl::_S_to_bits(__as_wrapper<_Np>(__absn >= __minn)));
 #else
           const auto __maxn
             = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
-          return __minn <= __absn && __absn <= __maxn;
+          return _MaskImpl::template _S_convert<_Tp>(_MaskImpl::_S_to_bits(
+            __as_wrapper<_Np>(__minn <= __absn && __absn <= __maxn)));
 #endif
       }

@@ -2837,7 +2888,7 @@ template <typename _Abi>
     // smart_reference access {{{2
     template <typename _Tp, size_t _Np>
-      static constexpr void _S_set(_SimdWrapper<_Tp, _Np>& __k, int __i,
+      static constexpr void _S_set(_SimdWrapper<_Tp, _Np>& __k, size_t __i,
                                    bool __x) noexcept
       {
         if constexpr (is_same_v<_Tp, bool>)
diff --git a/experimental/bits/simd_fixed_size.h b/experimental/bits/simd_fixed_size.h
index 9988313ff..be5b029d2 100644
--- a/experimental/bits/simd_fixed_size.h
+++ b/experimental/bits/simd_fixed_size.h
@@ -395,6 +395,15 @@ template <typename _Tp, typename _Abi0, typename... _Abis>
       return second.template _M_simd_at<_Np - 1>();
     }

+  template <size_t _Offset>
+    _GLIBCXX_SIMD_INTRINSIC constexpr auto _M_tuple_at() const
+    {
+      if constexpr (_Offset == 0)
+        return first;
+      else
+        return second.template _M_tuple_at<_Offset - simd_size_v<_Tp, _Abi0>>();
+    }
+
   template <typename _Fp, size_t _Offset = 0>
     _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdTuple
     _S_generate(_Fp&& __gen, _SizeConstant<_Offset> = {})
@@ -1331,6 +1340,56 @@ template <int _Np>
         });
     }

+    // _S_gather {{{2
+    template <typename _Tp, typename _Up>
+      static inline _SimdMember<_Tp>
+      _S_gather(const _Up* __mem, const __int_for_sizeof_t<_Up>* __idx,
+                _TypeTag<_Tp>) noexcept
+      {
+        return _SimdMember<_Tp>::_S_generate([&](auto __meta) {
+          return __meta._S_gather(__mem, &__idx[__meta._S_offset],
+                                  _TypeTag<_Tp>());
+        });
+      }
+
+    // _S_gather {{{2
+    template <typename _Tp, typename _Up>
+      static inline _SimdMember<_Tp>
+      _S_gather(const _Up* __mem, const _SimdMember<_Tp>& __idx,
+                _TypeTag<_Tp>) noexcept
+      {
+        return _SimdMember<_Tp>::_S_generate([&](auto __meta) {
+          return __meta._S_gather(
+            __mem, __idx.template _M_tuple_at<__meta._S_offset>(),
+            _TypeTag<_Tp>());
+        });
+      }
+
+    // _S_scatter {{{2
+    template <typename _Tp, typename _Up>
+      static inline void
+      _S_scatter(const _SimdMember<_Tp>& __v, _Up* __mem,
+                 const __int_for_sizeof_t<_Up>* __idx, _TypeTag<_Tp>) noexcept
+      {
+        __for_each(__v, [&](auto __meta, auto __native) {
+          __meta._S_scatter(__native, __mem, &__idx[__meta._S_offset],
+                            _TypeTag<_Tp>());
+        });
+      }
+
+    // _S_scatter {{{2
+    template <typename _Tp, typename _Up>
+      static inline void
+      _S_scatter(const _SimdMember<_Tp>& __v, _Up* __mem,
+                 const _SimdMember<_Tp>& __idx, _TypeTag<_Tp>) noexcept
+      {
+        __for_each(__v, __idx,
+                   [&](auto __meta, auto __v_tuple, auto __idx_tuple) {
+                     __meta._S_scatter(__v_tuple, __mem, __idx_tuple,
+                                       _TypeTag<_Tp>());
+                   });
+      }
+
     // _S_load {{{2
     template <typename _Tp, typename _Up>
       static inline _SimdMember<_Tp> _S_load(const _Up* __mem,
diff --git a/experimental/bits/simd_scalar.h b/experimental/bits/simd_scalar.h
index 4d4fe7350..47f587195 100644
--- a/experimental/bits/simd_scalar.h
+++ b/experimental/bits/simd_scalar.h
@@ -150,6 +150,42 @@ struct _SimdImplScalar
               _TypeTag<_Tp>)
   { return __gen(_SizeConstant<0>()); }

+  // _S_gather {{{2
+  template <typename _Tp, typename _Up>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp
+    _S_gather(const _Up* __mem, const __int_for_sizeof_t<_Up>* __idx,
+              _TypeTag<_Tp>) noexcept
+    {
+      return static_cast<_Tp>(__mem[__idx[0]]);
+    }
+
+  // _S_gather
+  template <typename _Tp, typename _Up>
+    _GLIBCXX_SIMD_INTRINSIC static _Tp
+    _S_gather(const _Up* __mem, const _Tp& __idx, _TypeTag<_Tp>) noexcept
+    {
+      return static_cast<_Tp>(__mem[__idx]);
+    } // }}}
+
+  // _S_scatter {{{2
+  template <typename _Tp, typename _Up>
+    _GLIBCXX_SIMD_INTRINSIC static void
+    _S_scatter(const _Tp& __v, _Up* __mem,
+               [[maybe_unused]] const __int_for_sizeof_t<_Up>* __idx,
+               _TypeTag<_Tp>) noexcept
+    {
+      __mem[__idx[0]] = static_cast<_Up>(__v);
+    }
+
+  // _S_scatter
+  template <typename _Tp, typename _Up>
+    _GLIBCXX_SIMD_INTRINSIC static void
+    _S_scatter(_Tp& __v, _Up* __mem, [[maybe_unused]] const _Tp& __idx,
+               _TypeTag<_Tp>) noexcept
+    {
+      __mem[__idx] = static_cast<_Up>(__v);
+    } // }}}
+
   // _S_load {{{2
   template <typename _Tp, typename _Up>
     _GLIBCXX_SIMD_INTRINSIC static _Tp _S_load(const _Up* __mem,
diff --git a/tests/simd.cpp b/tests/simd.cpp
index c4c918bbc..709a76b1c 100644
--- a/tests/simd.cpp
+++ b/tests/simd.cpp
@@ -1273,3 +1273,48 @@ TEST_TYPES(V, algorithms, all_test_types)
   COMPARE(min(a, b), V{0});
   COMPARE(max(a, b), V{1});
 }
+
+
+TEST_TYPES(V, gather, all_test_types)
+{
+  using namespace std::experimental::parallelism_v2;
+  using T = typename V::value_type;
+
+  // generate simd value data
+  std::vector<T> vec_data(V::size(), 0);
+  __execute_n_times<V::size()>(
+    [&](size_t __i) { vec_data[__i] = static_cast<T>(__i); });
+  // generate index
+  using index_type = __int_for_sizeof_t<T>;
+  std::vector<index_type> index(V::size(), 0);
+  __execute_n_times<V::size()>(
+    [&](auto __i) { index[__i] = ((__i + 3) % V::size()); });
+
+  V x(vec_data.data(), index.data(), vector_aligned);
+  V y([&](auto __i) { return vec_data[index[__i]]; });
+
+  COMPARE(x, y);
+}
+
+TEST_TYPES(V, scatter, all_test_types)
+{
+  using namespace std::experimental::parallelism_v2;
+  using T = typename V::value_type;
+
+  // generate simd value data
+  std::vector<T> vec_data(V::size(), 0);
+  __execute_n_times<V::size()>(
+    [&](size_t __i) { vec_data[__i] = static_cast<T>(__i); });
+  // generate index
+  using index_type = __int_for_sizeof_t<T>;
+  std::vector<index_type> index(V::size(), 0);
+  __execute_n_times<V::size()>(
+    [&](auto __i) { index[__i] = ((__i + 3) % V::size()); });
+
+  V x(vec_data.data(), index.data(), vector_aligned);
+
+  std::vector<T> vec_out(V::size(), 0);
+  x.scatter(&vec_out[0], index.data(), vector_aligned);
+  V y(vec_out.data(), vector_aligned);
+  COMPARE(vec_data, vec_out) << x << " => " << y;
+}