diff --git a/clang/runtime/dpct-rt/include/dpct/dpl_extras/vector.h b/clang/runtime/dpct-rt/include/dpct/dpl_extras/vector.h index 49f050dc08ba..e61a88542b04 100644 --- a/clang/runtime/dpct-rt/include/dpct/dpl_extras/vector.h +++ b/clang/runtime/dpct-rt/include/dpct/dpl_extras/vector.h @@ -11,6 +11,7 @@ #include #include +#include #include @@ -41,6 +42,114 @@ struct is_iterator : std::true_type {}; #ifndef DPCT_USM_LEVEL_NONE +// device_allocator_traits is a helper struct which makes use of custom +// allocator constructor routines when they are specified serially on the host, +// while enabling oneDPL pstl accelleration when such custom constructors are +// not provided. +template struct device_allocator_traits { + + // taken from libc++ + template + struct __has_construct_impl : ::std::false_type {}; + + template + struct __has_construct_impl().construct( + std::declval<_Args>()...)), + _Alloc, _Args...> : ::std::true_type {}; + + template + struct __has_construct : __has_construct_impl {}; + + template + struct __has_destroy : ::std::false_type {}; + + template + struct __has_destroy<_Alloc, _Pointer, + decltype((void)std::declval<_Alloc>().destroy( + std::declval<_Pointer>()))> : ::std::true_type {}; + // end of taken from libc++ + + template + static void uninitialized_value_construct_n(_Allocator alloc, T *p, Size n) { + assert(p != nullptr && "value constructing null data"); + if constexpr (__has_construct<_Allocator, T *>::value) { + for (Size i = 0; i < n; i++) { + ::std::allocator_traits<_Allocator>::construct(alloc, p + i); + } + } else { + ::std::uninitialized_value_construct_n( + oneapi::dpl::execution::make_device_policy(get_default_queue()), p, + n); + } + } + + template + static void uninitialized_fill_n(_Allocator alloc, T *first, Size n, + const Value &value) { + assert(first != nullptr && "filling null data"); + if constexpr (__has_construct<_Allocator, T *, const Value &>::value) { + for (Size i = 0; i < n; i++) { + ::std::allocator_traits<_Allocator>::construct(alloc, first + i, value); + } + } else { + ::std::uninitialized_fill_n( + oneapi::dpl::execution::make_device_policy(get_default_queue()), + first, n, value); + } + } + + template + static void __uninitialized_custom_copy_n(_Allocator alloc, Iter1 first, + Size n, T *d_first) { + for (Size i = 0; i < n; i++) { + ::std::allocator_traits<_Allocator>::construct(alloc, d_first + i, + *(first + i)); + } + } + + template + static void uninitialized_device_copy_n(_Allocator alloc, Iter1 first, Size n, + T *d_first) { + assert(d_first != nullptr && "copying into null data"); + if constexpr (__has_construct<_Allocator, T *, + typename ::std::iterator_traits< + Iter1>::value_type>::value) { + __uninitialized_custom_copy_n(alloc, first, n, d_first); + } else { + ::std::uninitialized_copy_n( + oneapi::dpl::execution::make_device_policy(get_default_queue()), + first, n, d_first); + } + } + + template + static void uninitialized_host_copy_n(_Allocator alloc, Iter1 first, Size n, + T *d_first) { + assert(d_first != nullptr && "copying into null data"); + if constexpr (__has_construct<_Allocator, T *, + typename ::std::iterator_traits< + Iter1>::value_type>::value) { + __uninitialized_custom_copy_n(alloc, first, n, d_first); + } else { + ::std::uninitialized_copy_n(first, n, d_first); + } + } + + template + static void destroy_n(_Allocator alloc, T *p, Size n) { + assert(p != nullptr && "destroying null data"); + if constexpr (__has_destroy<_Allocator, T *>::value) { + for (Size i = 0; i < n; i++) { + ::std::allocator_traits<_Allocator>::destroy(alloc, p + i); + } + } else { + ::std::destroy_n( + oneapi::dpl::execution::make_device_policy(get_default_queue()), p, + n); + } + } +}; + template > class device_vector { @@ -55,6 +164,8 @@ class device_vector { using difference_type = typename ::std::iterator_traits::difference_type; using size_type = ::std::size_t; + using allocator_type = Allocator; + using alloc_traits = ::std::allocator_traits; private: Allocator _alloc; @@ -66,145 +177,213 @@ class device_vector { void _set_capacity_and_alloc() { _capacity = ::std::max(_size * 2, _min_capacity()); - _storage = _alloc.allocate(_capacity); + _storage = alloc_traits::allocate(_alloc, _capacity); + } + + void _construct(size_type n, size_type start_idx = 0) { + if (n > 0) { + device_allocator_traits::uninitialized_value_construct_n( + _alloc, _storage + start_idx, n); + } + } + + void _construct(size_type n, const T &value, size_type start_idx = 0) { + if (n > 0) { + device_allocator_traits::uninitialized_fill_n( + _alloc, _storage + start_idx, n, value); + } + } + + template + void _construct_iter(Iter first, size_type n, size_type start_idx = 0) { + if (n > 0) { + device_allocator_traits::uninitialized_device_copy_n( + _alloc, first, n, _storage + start_idx); + } + } + + template + void _construct_iter_host(Iter first, size_type n, size_type start_idx = 0) { + if (n > 0) { + device_allocator_traits::uninitialized_host_copy_n( + _alloc, first, n, _storage + start_idx); + } + } + + void _destroy(size_type n, size_type start_idx = 0) { + if (n > 0) { + device_allocator_traits::destroy_n(_alloc, + _storage + start_idx, n); + } + } + + void _assign_elements(const device_vector &other) { + if (other.size() <= _size) { + // if incoming elements fit within existing elements, copy then destroy + // excess + ::std::copy( + oneapi::dpl::execution::make_device_policy(get_default_queue()), + other.begin(), other.end(), begin()); + resize(other.size()); + } else if (other.size() < _capacity) { + // if incoming elements don't fit within existing elements but do fit + // within total capacity + // copy elements that fit, then use uninitialized copy to ge the rest + // and adjust size + std::copy_n( + oneapi::dpl::execution::make_device_policy(get_default_queue()), + other.begin(), _size, begin()); + device_allocator_traits::uninitialized_device_copy_n( + _alloc, other.begin() + _size, other.size() - _size, + _storage + _size); + _size = other.size(); + } else { + // If incoming elements exceed current capacity, destroy all existing + // elements, then allocate incoming vectors capacity, then use + // uninitialized copy + clear(); + reserve(other.capacity()); + device_allocator_traits::uninitialized_device_copy_n( + _alloc, other.begin(), other.size(), _storage); + _size = other.size(); + } } public: template operator ::std::vector() const { auto __tmp = ::std::vector(this->size()); ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - this->begin(), this->end(), __tmp.begin()); + this->begin(), this->end(), __tmp.begin()); return __tmp; } - device_vector() - : _alloc(get_default_queue()), _size(0), _capacity(_min_capacity()) { + + device_vector(const Allocator &alloc = Allocator(get_default_queue())) + : _alloc(alloc), _size(0), _capacity(_min_capacity()) { _set_capacity_and_alloc(); } - ~device_vector() /*= default*/ { _alloc.deallocate(_storage, _capacity); }; - explicit device_vector(size_type n) : device_vector(n, T()) {} - explicit device_vector(size_type n, const T &value) - : _alloc(get_default_queue()), _size(n) { + + ~device_vector() /*= default*/ { + clear(); + alloc_traits::deallocate(_alloc, _storage, _capacity); + } + + explicit device_vector( + size_type n, const Allocator &alloc = Allocator(get_default_queue())) + : _alloc(alloc), _size(n) { _set_capacity_and_alloc(); - if (_size > 0) { - ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()), - begin(), end(), T(value)); - } + _construct(n); } - device_vector(const device_vector &other) : _alloc(get_default_queue()) { - _size = other.size(); - _capacity = other.capacity(); - _storage = _alloc.allocate(_capacity); - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - other.begin(), other.end(), begin()); + + explicit device_vector( + size_type n, const T &value, + const Allocator &alloc = Allocator(get_default_queue())) + : _alloc(alloc), _size(n) { + _set_capacity_and_alloc(); + _construct(n, value); } + device_vector(device_vector &&other) - : _alloc(get_default_queue()), _size(other.size()), + : _alloc(std::move(other._alloc)), _size(other.size()), _capacity(other.capacity()), _storage(other._storage) { other._size = 0; - other._capacity = 0; + other._capacity = 0; other._storage = nullptr; } - template - device_vector(InputIterator first, - typename ::std::enable_if< - internal::is_iterator::value && - !::std::is_pointer::value && - ::std::is_same::iterator_category, - ::std::random_access_iterator_tag>::value, - InputIterator>::type last) - : _alloc(get_default_queue()) { - _size = ::std::distance(first, last); - _set_capacity_and_alloc(); - if (_size > 0) { - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - first, last, begin()); - } + device_vector(device_vector &&other, const Allocator &alloc) + : _alloc(alloc), _size(other.size()), _capacity(other.capacity()) { + _storage = alloc_traits::allocate(_alloc, _capacity); + _construct_iter(other.begin(), _size); // ok to parallelize + other._size = 0; + other._capacity = 0; + other._storage = nullptr; } template - device_vector(InputIterator first, - typename ::std::enable_if<::std::is_pointer::value, - InputIterator>::type last) - : _alloc(get_default_queue()) { + device_vector( + InputIterator first, + typename ::std::enable_if_t< + dpct::internal::is_iterator::value, InputIterator> + last, + const Allocator &alloc = Allocator(get_default_queue())) + : _alloc(alloc) { _size = ::std::distance(first, last); _set_capacity_and_alloc(); - if (_size > 0) { - auto ptr_type = sycl::get_pointer_type(first, get_default_context()); - if (ptr_type != sycl::usm::alloc::host && - ptr_type != sycl::usm::alloc::unknown) { - ::std::copy( - oneapi::dpl::execution::make_device_policy(get_default_queue()), - first, last, begin()); - } else { - sycl::buffer::value_type, - 1> - buf(first, last); - auto buf_first = oneapi::dpl::begin(buf); - auto buf_last = oneapi::dpl::end(buf); - ::std::copy( - oneapi::dpl::execution::make_device_policy(get_default_queue()), - buf_first, buf_last, begin()); - } - } + // unsafe to parallelize on device as we dont know if InputIterator is + // valid oneDPL input type + _construct_iter_host(first, _size); } - template - device_vector(InputIterator first, - typename ::std::enable_if< - internal::is_iterator::value && - !::std::is_pointer::value && - !::std::is_same::iterator_category, - ::std::random_access_iterator_tag>::value, - InputIterator>::type last) - : _alloc(get_default_queue()), _size(::std::distance(first, last)) { - _set_capacity_and_alloc(); - ::std::vector _tmp(first, last); - if (_size > 0) { - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - _tmp.begin(), _tmp.end(), this->begin()); - } + device_vector(const device_vector &other, const Allocator &alloc) + : _alloc(alloc) { + _size = other.size(); + _capacity = other.capacity(); + _storage = alloc_traits::allocate(_alloc, _capacity); + _construct_iter(other.begin(), _size); + } + + device_vector(const device_vector &other) + : device_vector( + other, + alloc_traits::select_on_container_copy_construction(other._alloc)) { } template - device_vector(const device_vector &v) - : _alloc(get_default_queue()), _storage(v.real_begin()), _size(v.size()), - _capacity(v.capacity()) {} + device_vector(const device_vector &other, + const Allocator &alloc = Allocator(get_default_queue())) + : _alloc(alloc) { + _size = other.size(); + _capacity = other.capacity(); + _storage = alloc_traits::allocate(_alloc, _capacity); + _construct_iter(other.begin(), _size); + } template device_vector(const ::std::vector &v) - : _alloc(get_default_queue()), _size(v.size()) { - _set_capacity_and_alloc(); - if (_size > 0) { - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - v.begin(), v.end(), this->begin()); - } - } + : device_vector(v.begin(), v.end()) {} + + template + device_vector(const ::std::vector &v, + const Allocator &alloc) + : device_vector(v.begin(), v.end(), alloc) {} template device_vector &operator=(const ::std::vector &v) { resize(v.size()); - if (_size > 0) { - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), v.begin(), v.end(), begin()); - } return *this; } + device_vector &operator=(const device_vector &other) { // Copy assignment operator: - resize(other.size()); - if (_size > 0) { - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - other.begin(), other.end(), begin()); + if constexpr (alloc_traits::propagate_on_container_copy_assignment::value) { + clear(); + alloc_traits::deallocate(_alloc, _storage, _capacity); + _capacity = 0; + _alloc = other._alloc; } + _assign_elements(other); return *this; } device_vector &operator=(device_vector &&other) { // Move assignment operator: - device_vector dummy(::std::move(other)); - this->swap(dummy); + if constexpr (alloc_traits::propagate_on_container_move_assignment::value) { + clear(); + alloc_traits::deallocate(_alloc, _storage, _capacity); + _alloc = ::std::move(other._alloc); + _storage = ::std::move(other._storage); + _capacity = ::std::move(other._capacity); + _size = ::std::move(other._size); + } else { + _assign_elements(other); + // destroy and deallocate other vector + other.clear(); + alloc_traits::deallocate(other._alloc, other._storage, other._capacity); + } + other._size = 0; + other._capacity = 0; + other._storage = nullptr; return *this; } size_type size() const { return _size; } @@ -219,31 +398,59 @@ class device_vector { T *real_begin() { return _storage; } const T *real_begin() const { return _storage; } void swap(device_vector &v) { - ::std::swap(_size, v._size); - ::std::swap(_capacity, v._capacity); - ::std::swap(_storage, v._storage); - ::std::swap(_alloc, v._alloc); + if constexpr (::std::allocator_traits< + Allocator>::propagate_on_container_swap::value) { + ::std::swap(_alloc, v._alloc); + ::std::swap(_size, v._size); + ::std::swap(_capacity, v._capacity); + ::std::swap(_storage, v._storage); + } else { + // swap all elements up to the minimum size between the two vectors + size_type min_size = ::std::min(size(), v.size()); + auto zip = oneapi::dpl::make_zip_iterator(begin(), v.begin()); + ::std::for_each( + oneapi::dpl::execution::make_device_policy(get_default_queue()), zip, + zip + min_size, [](auto zip_ele) { + std::swap(::std::get<0>(zip_ele), ::std::get<1>(zip_ele)); + }); + // then copy the elements beyond the end of the smaller list, and resize + if (size() > v.size()) { + v.reserve(capacity()); + device_allocator_traits::uninitialized_device_copy_n( + _alloc, begin() + min_size, begin() + size(), size() - min_size, + v.begin() + min_size); + v._size = size(); + } else if (size() < v.size()) { + reserve(v.capacity()); + device_allocator_traits::uninitialized_device_copy_n( + _alloc, v.begin() + min_size, v.begin() + v.size(), + v.size() - min_size, begin() + min_size); + _size = v.size(); + } + } } reference operator[](size_type n) { return _storage[n]; } const_reference operator[](size_type n) const { return _storage[n]; } void reserve(size_type n) { if (n > capacity()) { // allocate buffer for new size - auto tmp = _alloc.allocate(2 * n); + auto tmp = alloc_traits::allocate(_alloc, n); // copy content (old buffer to new buffer) - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - begin(), end(), tmp); - // deallocate old memory - _alloc.deallocate(_storage, _capacity); + if (capacity() > 0) { + device_allocator_traits::uninitialized_device_copy_n( + _alloc, begin(), n, tmp); + alloc_traits::deallocate(_alloc, _storage, _capacity); + } _storage = tmp; - _capacity = 2 * n; + _capacity = n; } } void resize(size_type new_size, const T &x = T()) { reserve(new_size); - if (_size < new_size) { - ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()), - begin() + _size, begin() + new_size, x); + if (new_size > size()) { + _construct(new_size - size(), x, size()); + } else { + _destroy(_size - new_size, new_size); } _size = new_size; } @@ -258,15 +465,15 @@ class device_vector { pointer data(void) { return _storage; } const_pointer data(void) const { return _storage; } void shrink_to_fit(void) { - if (_size != capacity()) { + if (_size != capacity() && capacity() > _min_capacity()) { size_type tmp_capacity = ::std::max(_size, _min_capacity()); - auto tmp = _alloc.allocate(tmp_capacity); + auto tmp = alloc_traits::allocate(_alloc, tmp_capacity); if (_size > 0) { ::std::copy( oneapi::dpl::execution::make_device_policy(get_default_queue()), begin(), end(), tmp); } - _alloc.deallocate(_storage, _capacity); + alloc_traits::deallocate(_alloc, _storage, _capacity); _storage = tmp; _capacity = tmp_capacity; } @@ -274,47 +481,62 @@ class device_vector { void assign(size_type n, const T &x) { resize(n); if (_size > 0) { - ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()), - begin(), begin() + n, x); + ::std::fill( + oneapi::dpl::execution::make_device_policy(get_default_queue()), + begin(), begin() + n, x); } } template void assign(InputIterator first, typename ::std::enable_if::value, - InputIterator>::type last) { + InputIterator>::type last) { auto n = ::std::distance(first, last); resize(n); if (_size > 0) { - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - first, last, begin()); + // unsafe to call on device as we don't know the InputIterator type + ::std::copy(first, last, begin()); } } - void clear(void) { _size = 0; } + void clear(void) { + _destroy(_size); + _size = 0; + } bool empty(void) const { return (size() == 0); } void push_back(const T &x) { insert(end(), size_type(1), x); } void pop_back(void) { - if (_size > 0) + if (_size > 0) { + _destroy(1, _size - 1); --_size; + } } iterator erase(iterator first, iterator last) { auto n = ::std::distance(first, last); if (last == end()) { + _destroy(n, _size - n); _size = _size - n; return end(); } auto m = ::std::distance(last, end()); if (m <= 0) { return end(); + } else if (n >= m) { + ::std::copy( + oneapi::dpl::execution::make_device_policy(get_default_queue()), last, + last + m, first); + } else { + auto tmp = alloc_traits::allocate(_alloc, m); + + device_allocator_traits::uninitialized_device_copy_n( + _alloc, last, m, tmp); + + std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), + tmp, tmp + m, first); + device_allocator_traits::destroy_n(_alloc, tmp, m); + alloc_traits::deallocate(_alloc, tmp, m); } - auto tmp = _alloc.allocate(m); - // copy remainder to temporary buffer. - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - last, end(), tmp); - // override (erase) subsequence in storage. - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - tmp, tmp + m, first); - _alloc.deallocate(tmp, m); + // now destroy the remaining elements + _destroy(n, size() - n); _size -= n; return begin() + first.get_idx() + n; } @@ -326,58 +548,61 @@ class device_vector { } void insert(iterator position, size_type n, const T &x) { if (position == end()) { - resize(size() + n); - ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()), - end() - n, end(), x); + reserve(size() + n); + device_allocator_traits::uninitialized_fill_n( + _alloc, _storage + size(), n, x); + _size += n; } else { auto i_n = ::std::distance(begin(), position); // allocate temporary storage auto m = ::std::distance(position, end()); // will throw if position is not inside active vector - auto tmp = _alloc.allocate(m); + auto tmp = alloc_traits::allocate(_alloc, m); // copy remainder - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - position, end(), tmp); - + device_allocator_traits::uninitialized_device_copy_n( + _alloc, position, m, tmp); resize(size() + n); // resizing might invalidate position position = begin() + position.get_idx(); - - ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()), - position, position + n, x); - - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - tmp, tmp + m, position + n); - _alloc.deallocate(tmp, m); + ::std::fill( + oneapi::dpl::execution::make_device_policy(get_default_queue()), + position, position + n, x); + ::std::copy( + oneapi::dpl::execution::make_device_policy(get_default_queue()), tmp, + tmp + m, position + n); + device_allocator_traits::destroy_n(_alloc, tmp, m); + alloc_traits::deallocate(_alloc, tmp, m); } } template void insert(iterator position, InputIterator first, typename ::std::enable_if::value, - InputIterator>::type last) { + InputIterator>::type last) { auto n = ::std::distance(first, last); if (position == end()) { - resize(size() + n); - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - first, last, end()); + reserve(size() + n); + // unsafe to call on device as we dont know the InputIterator type + ::std::uninitialized_copy(first, last, end()); + _size += n; } else { auto m = ::std::distance(position, end()); // will throw if position is not inside active vector - auto tmp = _alloc.allocate(m); + auto tmp = alloc_traits::allocate(_alloc, m); - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - position, end(), tmp); + device_allocator_traits::uninitialized_device_copy_n( + _alloc, position, m, tmp); resize(size() + n); // resizing might invalidate position position = begin() + position.get_idx(); - - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - first, last, position); - ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()), - tmp, tmp + m, position + n); - _alloc.deallocate(tmp, m); + // unsafe to call on device as we dont know the InputIterator type + ::std::copy(first, last, position); + ::std::copy( + oneapi::dpl::execution::make_device_policy(get_default_queue()), tmp, + tmp + m, position + n); + device_allocator_traits::destroy_n(_alloc, tmp, m); + alloc_traits::deallocate(_alloc, tmp, m); } } Allocator get_allocator() const { return _alloc; }