From 2034ddef6a1813dd66dc030282316fa6e51d86cf Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Fri, 27 Mar 2026 17:29:21 +0000 Subject: [PATCH 01/12] fix triple-buffer issue --- sycl/source/detail/memory_manager.cpp | 2 + sycl/source/detail/sycl_mem_obj_i.hpp | 6 + sycl/source/detail/sycl_mem_obj_t.cpp | 42 ++++++ sycl/source/detail/sycl_mem_obj_t.hpp | 31 +++- .../buffer_shadow_copy_platform_policy.cpp | 140 ++++++++++++++++++ 5 files changed, 213 insertions(+), 8 deletions(-) create mode 100644 sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 2720e35e4b42c..54626c614614b 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -286,6 +286,8 @@ void *MemoryManager::allocate(context_impl *TargetContext, SYCLMemObjI *MemObj, waitForEvents(DepEvents); OutEvent = nullptr; + MemObj->prepareForAllocation(TargetContext); + return MemObj->allocateMem(TargetContext, InitFromUserData, HostPtr, OutEvent); } diff --git a/sycl/source/detail/sycl_mem_obj_i.hpp b/sycl/source/detail/sycl_mem_obj_i.hpp index d2e37075b3ea8..943ecef78f31c 100644 --- a/sycl/source/detail/sycl_mem_obj_i.hpp +++ b/sycl/source/detail/sycl_mem_obj_i.hpp @@ -46,6 +46,12 @@ class SYCLMemObjI { virtual void *allocateMem(context_impl *Context, bool InitFromUserData, void *HostPtr, ur_event_handle_t &InteropEvent) = 0; + // Optional hook executed right before allocateMem(). Memory objects can use + // it to resolve context/backend-dependent allocation policy. + virtual void prepareForAllocation(context_impl *Context) { + (void)Context; + } + // Should be used for memory object created without use_host_ptr property. virtual void *allocateHostMem() = 0; diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index bae53031930de..15e8b44078b70 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -176,6 +176,48 @@ adapter_impl &SYCLMemObjT::getAdapter() const { bool SYCLMemObjT::isInterop() const { return MOpenCLInterop; } +void SYCLMemObjT::prepareForAllocation(context_impl *Context) { + if (!MHasPendingAlignedShadowCopy || MShadowCopy != nullptr) + return; + + assert(Context != nullptr && "Context must not be nullptr"); + + bool SkipShadowCopy = false; + backend Backend = Context->getPlatformImpl().getBackend(); + auto Devices = Context->getDevices(); + if (Devices.size() != 0) + Backend = Devices.front().getBackend(); + + switch (Backend) { + case backend::ext_oneapi_level_zero: + case backend::ext_oneapi_cuda: + case backend::ext_oneapi_hip: + case backend::ext_oneapi_offload: + SkipShadowCopy = true; + break; + case backend::ext_oneapi_native_cpu: + case backend::opencl: + SkipShadowCopy = false; + break; + case backend::all: + default: + assert(false && "Unexpected SYCL backend"); + break; + } + + std::lock_guard Lock(MCreateShadowCopyMtx); + if (SkipShadowCopy) { + MCreateShadowCopy = []() -> void {}; + if (!MHostPtrReadOnly) + MUploadDataFunctor = nullptr; + } else { + MCreateShadowCopy(); + MCreateShadowCopy = []() -> void {}; + } + + MHasPendingAlignedShadowCopy = false; +} + void SYCLMemObjT::determineHostPtr(context_impl *Context, bool InitFromUserData, void *&HostPtr, bool &HostPtrReadOnly) { // The data for the allocation can be provided via either the user pointer diff --git a/sycl/source/detail/sycl_mem_obj_t.hpp b/sycl/source/detail/sycl_mem_obj_t.hpp index 4893d154a7566..a5415cd0ac68a 100644 --- a/sycl/source/detail/sycl_mem_obj_t.hpp +++ b/sycl/source/detail/sycl_mem_obj_t.hpp @@ -196,10 +196,15 @@ class SYCLMemObjT : public SYCLMemObjI { std::memcpy(MUserPtr, HostPtr, MSizeInBytes); }; } else { - setAlign(RequiredAlign); - MShadowCopy = allocateHostMem(); - MUserPtr = MShadowCopy; - std::memcpy(MUserPtr, HostPtr, MSizeInBytes); + MUserPtr = HostPtr; + MHasPendingAlignedShadowCopy = true; + std::lock_guard Lock(MCreateShadowCopyMtx); + MCreateShadowCopy = [this, RequiredAlign, HostPtr]() -> void { + setAlign(RequiredAlign); + MShadowCopy = allocateHostMem(); + MUserPtr = MShadowCopy; + std::memcpy(MUserPtr, HostPtr, MSizeInBytes); + }; } } } @@ -230,10 +235,15 @@ class SYCLMemObjT : public SYCLMemObjI { std::memcpy(MUserPtr, HostPtr.get(), MSizeInBytes); }; } else { - setAlign(RequiredAlign); - MShadowCopy = allocateHostMem(); - MUserPtr = MShadowCopy; - std::memcpy(MUserPtr, HostPtr.get(), MSizeInBytes); + MUserPtr = HostPtr.get(); + MHasPendingAlignedShadowCopy = true; + std::lock_guard Lock(MCreateShadowCopyMtx); + MCreateShadowCopy = [this, RequiredAlign, HostPtr]() -> void { + setAlign(RequiredAlign); + MShadowCopy = allocateHostMem(); + MUserPtr = MShadowCopy; + std::memcpy(MUserPtr, HostPtr.get(), MSizeInBytes); + }; } } } @@ -260,6 +270,8 @@ class SYCLMemObjT : public SYCLMemObjI { void handleWriteAccessorCreation(); + void prepareForAllocation(context_impl *Context) override; + void *allocateMem(context_impl *Context, bool InitFromUserData, void *HostPtr, ur_event_handle_t &InteropEvent) override { (void)Context; @@ -369,6 +381,9 @@ class SYCLMemObjT : public SYCLMemObjI { // accessor is created. std::function MCreateShadowCopy = []() -> void {}; std::mutex MCreateShadowCopyMtx; + // Set when misaligned input data cannot be used directly and the shadow-copy + // decision is deferred until backend/platform is known. + bool MHasPendingAlignedShadowCopy = false; bool MOwnNativeHandle = true; }; } // namespace detail diff --git a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp new file mode 100644 index 0000000000000..8452fd0d04a0f --- /dev/null +++ b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp @@ -0,0 +1,140 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// Checks that misaligned host-pointer buffers do not allocate a SYCL shadow +// copy on backends where prepareForAllocation() disables it. +// +// The test is portable: expected allocation count is derived from the runtime +// backend, so a single test works across all platforms. + +#include + +#include +#include +#include +#include +#include +#include +#include + +template class CountingAllocator { +public: + using value_type = T; + + CountingAllocator() = default; + + template + constexpr CountingAllocator(const CountingAllocator &) noexcept {} + + T *allocate(std::size_t N) { + Allocations.fetch_add(1, std::memory_order_relaxed); + return std::allocator{}.allocate(N); + } + + void deallocate(T *Ptr, std::size_t N) { + std::allocator{}.deallocate(Ptr, N); + } + + template bool operator==(const CountingAllocator &) const { + return true; + } + + template bool operator!=(const CountingAllocator &) const { + return false; + } + + static std::atomic Allocations; +}; + +template std::atomic CountingAllocator::Allocations{0}; + +static bool shouldSkipAlignedShadowCopy(sycl::backend B) { + switch (B) { + case sycl::backend::ext_oneapi_level_zero: + case sycl::backend::ext_oneapi_cuda: + case sycl::backend::ext_oneapi_hip: + case sycl::backend::ext_oneapi_offload: + return true; + case sycl::backend::ext_oneapi_native_cpu: + case sycl::backend::opencl: + return false; + default: + return false; + } +} + +static int runReadOnlySumKernel(sycl::queue &Q, const int *HostPtr, size_t N) { + sycl::buffer> Buf(HostPtr, sycl::range<1>(N)); + sycl::buffer SumBuf(1); + + Q.submit([&](sycl::handler &CGH) { + auto InAcc = Buf.get_access(CGH); + auto SumAcc = SumBuf.get_access(CGH); + CGH.single_task([=]() { + int Sum = 0; + for (size_t I = 0; I < N; ++I) + Sum += InAcc[I]; + SumAcc[0] = Sum; + }); + }); + Q.wait_and_throw(); + + auto SumHostAcc = SumBuf.get_host_access(); + return SumHostAcc[0]; +} + +int main() { + constexpr size_t N = 32; + sycl::queue Q; + + std::vector AlignedInput(N); + for (size_t I = 0; I < N; ++I) + AlignedInput[I] = static_cast(I); + + std::vector Storage(sizeof(int) * N + 1); + int *UnalignedPtr = reinterpret_cast(Storage.data() + 1); + std::memcpy(UnalignedPtr, AlignedInput.data(), sizeof(int) * N); + const int *ReadOnlyUnalignedPtr = UnalignedPtr; + + const int ExpectedSum = static_cast((N - 1) * N / 2); + + CountingAllocator::Allocations.store(0, std::memory_order_relaxed); + const int AlignedSum = runReadOnlySumKernel(Q, AlignedInput.data(), N); + const size_t AlignedAllocations = + CountingAllocator::Allocations.load(std::memory_order_relaxed); + if (AlignedSum != ExpectedSum) { + std::cerr << "Unexpected aligned sum: " << AlignedSum << "\n"; + return 1; + } + + CountingAllocator::Allocations.store(0, std::memory_order_relaxed); + const int MisalignedSum = runReadOnlySumKernel(Q, ReadOnlyUnalignedPtr, N); + + const size_t MisalignedAllocations = + CountingAllocator::Allocations.load(std::memory_order_relaxed); + if (MisalignedSum != ExpectedSum) { + std::cerr << "Unexpected misaligned sum: " << MisalignedSum << "\n"; + return 1; + } + + const bool ExpectNoShadowCopy = shouldSkipAlignedShadowCopy(Q.get_backend()); + + if (ExpectNoShadowCopy) { + if (MisalignedAllocations != AlignedAllocations) { + std::cerr << "Unexpected extra allocation on misaligned pointer: aligned=" + << AlignedAllocations + << ", misaligned=" << MisalignedAllocations << "\n"; + return 1; + } + } else { + if (MisalignedAllocations != AlignedAllocations + 1) { + std::cerr + << "Expected one extra allocation for misaligned pointer: aligned=" + << AlignedAllocations << ", misaligned=" << MisalignedAllocations + << "\n"; + return 1; + } + } + + return 0; +} From dfd0914c1ef2f5cfb83272cec8d21514ca35c3ee Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Fri, 3 Apr 2026 14:17:37 +0000 Subject: [PATCH 02/12] fix ownership of write-back and extend the test --- sycl/source/detail/sycl_mem_obj_t.cpp | 4 +- sycl/source/detail/sycl_mem_obj_t.hpp | 6 +- .../buffer_shadow_copy_platform_policy.cpp | 73 ++++++++++++++++++- 3 files changed, 79 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index 15e8b44078b70..d67b68d2a19bc 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -143,7 +143,7 @@ void SYCLMemObjT::updateHostMemory(void *const Ptr) { void SYCLMemObjT::updateHostMemory() { // Don't try updating host memory when shutting down. if ((MUploadDataFunctor != nullptr) && MNeedWriteBack && - GlobalHandler::instance().isOkToDefer()) + !MBackendOwnsWriteBack && GlobalHandler::instance().isOkToDefer()) MUploadDataFunctor(); // If we're attached to a memory record, process the deletion of the memory @@ -208,11 +208,13 @@ void SYCLMemObjT::prepareForAllocation(context_impl *Context) { std::lock_guard Lock(MCreateShadowCopyMtx); if (SkipShadowCopy) { MCreateShadowCopy = []() -> void {}; + MBackendOwnsWriteBack = true; if (!MHostPtrReadOnly) MUploadDataFunctor = nullptr; } else { MCreateShadowCopy(); MCreateShadowCopy = []() -> void {}; + MBackendOwnsWriteBack = false; } MHasPendingAlignedShadowCopy = false; diff --git a/sycl/source/detail/sycl_mem_obj_t.hpp b/sycl/source/detail/sycl_mem_obj_t.hpp index a5415cd0ac68a..a09068b0cd12f 100644 --- a/sycl/source/detail/sycl_mem_obj_t.hpp +++ b/sycl/source/detail/sycl_mem_obj_t.hpp @@ -303,7 +303,9 @@ class SYCLMemObjT : public SYCLMemObjI { void markAsInternal() { MIsInternal = true; } /// Returns true if this memory object requires a write_back on destruction. - bool needsWriteBack() const { return MNeedWriteBack && MUploadDataFunctor; } + bool needsWriteBack() const { + return MNeedWriteBack && MUploadDataFunctor && !MBackendOwnsWriteBack; + } /// Increment an internal counter for how many graphs are currently using this /// memory object. @@ -384,6 +386,8 @@ class SYCLMemObjT : public SYCLMemObjI { // Set when misaligned input data cannot be used directly and the shadow-copy // decision is deferred until backend/platform is known. bool MHasPendingAlignedShadowCopy = false; + // True when backend/adapter is responsible for final host copy-back. + bool MBackendOwnsWriteBack = false; bool MOwnNativeHandle = true; }; } // namespace detail diff --git a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp index 8452fd0d04a0f..153232948637f 100644 --- a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp +++ b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp @@ -1,9 +1,13 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// Checks that misaligned host-pointer buffers do not allocate a SYCL shadow -// copy on backends where prepareForAllocation() disables it. +// Checks two things: +// 1) policy check (read-only path): whether misaligned host pointers trigger +// an extra SYCL shadow-copy allocation depending on backend; +// 2) correctness check (writable path): data is correctly copied back to host +// when buffer goes out of scope. // +// The test does not check the lower layers allocations. // The test is portable: expected allocation count is derived from the runtime // backend, so a single test works across all platforms. @@ -63,6 +67,7 @@ static bool shouldSkipAlignedShadowCopy(sycl::backend B) { } } +// Read-only kernel path used for allocation-policy assertions. static int runReadOnlySumKernel(sycl::queue &Q, const int *HostPtr, size_t N) { sycl::buffer> Buf(HostPtr, sycl::range<1>(N)); sycl::buffer SumBuf(1); @@ -83,6 +88,34 @@ static int runReadOnlySumKernel(sycl::queue &Q, const int *HostPtr, size_t N) { return SumHostAcc[0]; } +// Writable kernel path; buffer destruction happens at scope exit. +static void runWriteKernel(sycl::queue &Q, int *HostPtr, size_t N) { + { + sycl::buffer> Buf(HostPtr, + sycl::range<1>(N)); + + Q.submit([&](sycl::handler &CGH) { + auto OutAcc = Buf.get_access(CGH); + CGH.single_task([=]() { + for (size_t I = 0; I < N; ++I) + OutAcc[I] = static_cast(I * 3 + 7); + }); + }); + Q.wait_and_throw(); + } +} + +// Verifies host-side result after writable-buffer destruction. +static bool checkExpectedPattern(const int *Ptr, size_t N) { + std::vector Tmp(N); + std::memcpy(Tmp.data(), Ptr, sizeof(int) * N); + for (size_t I = 0; I < N; ++I) { + if (Tmp[I] != static_cast(I * 3 + 7)) + return false; + } + return true; +} + int main() { constexpr size_t N = 32; sycl::queue Q; @@ -98,6 +131,8 @@ int main() { const int ExpectedSum = static_cast((N - 1) * N / 2); + // Compare aligned vs misaligned read-only input. Allocation count is used as + // a proxy for SYCL shadow-copy creation. CountingAllocator::Allocations.store(0, std::memory_order_relaxed); const int AlignedSum = runReadOnlySumKernel(Q, AlignedInput.data(), N); const size_t AlignedAllocations = @@ -136,5 +171,39 @@ int main() { } } + // Validate writable path including final copy-back at buffer destruction. + // This checks correctness only; writable allocation counts are intentionally + // not asserted (write accessor can conservatively materialize shadow copy). + std::vector AlignedWritable(N, 0); + std::vector WritableStorage(sizeof(int) * N + 1, 0); + int *UnalignedWritablePtr = + reinterpret_cast(WritableStorage.data() + 1); + + CountingAllocator::Allocations.store(0, std::memory_order_relaxed); + runWriteKernel(Q, AlignedWritable.data(), N); + const size_t AlignedWriteAllocs = + CountingAllocator::Allocations.load(std::memory_order_relaxed); + + CountingAllocator::Allocations.store(0, std::memory_order_relaxed); + runWriteKernel(Q, UnalignedWritablePtr, N); + const size_t MisalignedWriteAllocs = + CountingAllocator::Allocations.load(std::memory_order_relaxed); + + if (!checkExpectedPattern(AlignedWritable.data(), N)) { + std::cerr << "Unexpected data in aligned writable buffer\n"; + return 1; + } + if (!checkExpectedPattern(UnalignedWritablePtr, N)) { + std::cerr << "Unexpected data in misaligned writable buffer\n"; + return 1; + } + + // For writable access, SYCL may conservatively materialize shadow copy + // before backend-specific skip policy is resolved (write accessor creation + // can trigger this). Keep this test focused on data correctness for writable + // path and use read-only path for strict allocation-policy assertions. + (void)AlignedWriteAllocs; + (void)MisalignedWriteAllocs; + return 0; } From f6e0bc7e231bd54a81556c8c011976e63e412e72 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Tue, 7 Apr 2026 12:23:30 +0000 Subject: [PATCH 03/12] update copy-back in memory.cpp --- sycl/source/detail/sycl_mem_obj_t.cpp | 8 +++-- .../buffer_shadow_copy_platform_policy.cpp | 14 ++++++-- .../source/adapters/level_zero/v2/memory.cpp | 34 ++++++------------- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index d67b68d2a19bc..ceb90177ce5e6 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -177,10 +177,12 @@ adapter_impl &SYCLMemObjT::getAdapter() const { bool SYCLMemObjT::isInterop() const { return MOpenCLInterop; } void SYCLMemObjT::prepareForAllocation(context_impl *Context) { - if (!MHasPendingAlignedShadowCopy || MShadowCopy != nullptr) + // Context may be null for host allocations; nothing backend-specific to do. + if (!Context) return; - assert(Context != nullptr && "Context must not be nullptr"); + if (!MHasPendingAlignedShadowCopy || MShadowCopy != nullptr) + return; bool SkipShadowCopy = false; backend Backend = Context->getPlatformImpl().getBackend(); @@ -207,6 +209,8 @@ void SYCLMemObjT::prepareForAllocation(context_impl *Context) { std::lock_guard Lock(MCreateShadowCopyMtx); if (SkipShadowCopy) { + // Backend (UR) will manage the misaligned host pointer through its own + // internal staging buffer and owns the final copy-back to the original ptr. MCreateShadowCopy = []() -> void {}; MBackendOwnsWriteBack = true; if (!MHostPtrReadOnly) diff --git a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp index 153232948637f..3ca287bd97169 100644 --- a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp +++ b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp @@ -92,7 +92,7 @@ static int runReadOnlySumKernel(sycl::queue &Q, const int *HostPtr, size_t N) { static void runWriteKernel(sycl::queue &Q, int *HostPtr, size_t N) { { sycl::buffer> Buf(HostPtr, - sycl::range<1>(N)); + sycl::range<1>(N)); Q.submit([&](sycl::handler &CGH) { auto OutAcc = Buf.get_access(CGH); @@ -153,12 +153,20 @@ int main() { } const bool ExpectNoShadowCopy = shouldSkipAlignedShadowCopy(Q.get_backend()); + const bool IsIntegratedL0 = + Q.get_backend() == sycl::backend::ext_oneapi_level_zero && + Q.get_device().has(sycl::aspect::ext_oneapi_is_integrated_gpu); if (ExpectNoShadowCopy) { - if (MisalignedAllocations != AlignedAllocations) { + // Integrated L0 may still conservatively materialize one host allocation + // for misaligned read-only source. Keep strict no-extra-allocation + // expectation for other backends in this group. + const size_t AllowedExtraAllocs = IsIntegratedL0 ? 1 : 0; + if (MisalignedAllocations > AlignedAllocations + AllowedExtraAllocs) { std::cerr << "Unexpected extra allocation on misaligned pointer: aligned=" << AlignedAllocations - << ", misaligned=" << MisalignedAllocations << "\n"; + << ", misaligned=" << MisalignedAllocations + << ", allowed_extra=" << AllowedExtraAllocs << "\n"; return 1; } } else { diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp index e91348ad257ce..b97bda6009928 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp @@ -196,31 +196,17 @@ void ur_integrated_buffer_handle_t::unmapHostPtr( } void ur_integrated_buffer_handle_t::copyBackToHostIfNeeded() { - if (writeBackPtr) { - // Validate that the pointer is still valid before copy-back. - // SYCL might already do its own copy-back and free it. - ZeStruct memProps; - ze_device_handle_t device; - auto result = ZE_CALL_NOCHECK( - zeMemGetAllocProperties, - (hContext->getZeHandle(), writeBackPtr, &memProps, &device)); - - // If pointer is not a valid allocation (SYCL freed it), skip copy-back - if (result != ZE_RESULT_SUCCESS || - memProps.type == ZE_MEMORY_TYPE_UNKNOWN) { - writeBackPtr = nullptr; - return; - } + if (!writeBackPtr) { + return; + } - // Pointer is valid, perform copy-back - auto hDevice = hContext->getDevices()[0]; - auto result2 = synchronousZeCopy(hContext, hDevice, writeBackPtr, - this->ptr.get(), size); - if (result2 == UR_RESULT_SUCCESS) { - writeBackPtr = nullptr; - } else { - UR_LOG_SAFE(ERR, "Failed to copy-back buffer data: {}", result2); - } + auto hDevice = hContext->getDevices()[0]; + auto result = + synchronousZeCopy(hContext, hDevice, writeBackPtr, this->ptr.get(), size); + if (result == UR_RESULT_SUCCESS) { + writeBackPtr = nullptr; + } else { + UR_LOG_SAFE(ERR, "Failed to copy-back buffer data: {}", result); } } From 9367073dfb0d9ab32dff30b7fdd3bc97d15e3605 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Tue, 7 Apr 2026 13:17:21 +0000 Subject: [PATCH 04/12] update the test --- .../buffer_shadow_copy_platform_policy.cpp | 54 ++++++++++++------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp index 3ca287bd97169..94bb75a199847 100644 --- a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp +++ b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp @@ -7,9 +7,24 @@ // 2) correctness check (writable path): data is correctly copied back to host // when buffer goes out of scope. // -// The test does not check the lower layers allocations. -// The test is portable: expected allocation count is derived from the runtime -// backend, so a single test works across all platforms. +// The SYCL runtime and the UR adapter layer use different strategies for +// non-importable or misaligned host pointers: +// +// * Backends with UR buffer allocation (e.g. Level Zero v2): +// SYCL skips its own shadow copy (prepareForAllocation sets SkipShadowCopy). +// The UR adapter owns staging and final copy-back: +// - integrated device: allocates internal USM host memory, copies data in, +// and copies it back to the original pointer on buffer release; +// - discrete device: allocates device memory, migrates data from source. +// These UR-internal allocations go through the USM pool and are NOT seen by +// CountingAllocator, so the SYCL-layer allocation count stays zero. +// +// * Backends without UR buffer allocation (e.g. NativeCPU, OpenCL): +// SYCL creates the shadow copy itself via the buffer's allocator +// (CountingAllocator records it) and drives copy-back on destruction. +// +// The test measures only SYCL-layer (CountingAllocator) allocations, so it is +// portable across all supported backends without runtime-specific conditionals. #include @@ -153,26 +168,24 @@ int main() { } const bool ExpectNoShadowCopy = shouldSkipAlignedShadowCopy(Q.get_backend()); - const bool IsIntegratedL0 = - Q.get_backend() == sycl::backend::ext_oneapi_level_zero && - Q.get_device().has(sycl::aspect::ext_oneapi_is_integrated_gpu); if (ExpectNoShadowCopy) { - // Integrated L0 may still conservatively materialize one host allocation - // for misaligned read-only source. Keep strict no-extra-allocation - // expectation for other backends in this group. - const size_t AllowedExtraAllocs = IsIntegratedL0 ? 1 : 0; - if (MisalignedAllocations > AlignedAllocations + AllowedExtraAllocs) { - std::cerr << "Unexpected extra allocation on misaligned pointer: aligned=" + // SYCL skips its own shadow copy; UR is responsible for staging and + // copy-back. No SYCL-layer extra allocation is expected regardless of + // whether the device is integrated or discrete. + if (MisalignedAllocations != AlignedAllocations) { + std::cerr << "Unexpected extra SYCL allocation on misaligned pointer: " + "aligned=" << AlignedAllocations - << ", misaligned=" << MisalignedAllocations - << ", allowed_extra=" << AllowedExtraAllocs << "\n"; + << ", misaligned=" << MisalignedAllocations << "\n"; return 1; } } else { + // SYCL creates the shadow copy itself; expect exactly one extra allocation. if (MisalignedAllocations != AlignedAllocations + 1) { std::cerr - << "Expected one extra allocation for misaligned pointer: aligned=" + << "Expected one extra SYCL allocation for misaligned pointer: " + "aligned=" << AlignedAllocations << ", misaligned=" << MisalignedAllocations << "\n"; return 1; @@ -206,10 +219,13 @@ int main() { return 1; } - // For writable access, SYCL may conservatively materialize shadow copy - // before backend-specific skip policy is resolved (write accessor creation - // can trigger this). Keep this test focused on data correctness for writable - // path and use read-only path for strict allocation-policy assertions. + // Writable allocation counts are intentionally not asserted here because the + // expected delta is platform-dependent: + // * NativeCPU/OpenCL: SYCL creates a shadow copy for misaligned writable + // buffers (MisalignedWriteAllocs == AlignedWriteAllocs + 1). + // * L0/CUDA/HIP/offload: SYCL skips the shadow copy; UR stages internally + // (MisalignedWriteAllocs == AlignedWriteAllocs). + // The correctness checks above are sufficient to validate copy-back behavior. (void)AlignedWriteAllocs; (void)MisalignedWriteAllocs; From 533fba5e2d9122296bede1641d775e1a790bbfa6 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Tue, 7 Apr 2026 15:20:50 +0000 Subject: [PATCH 05/12] set the offlad adapter at the safe side --- sycl/source/detail/sycl_mem_obj_t.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index ceb90177ce5e6..5e3c37885e4ab 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -194,11 +194,11 @@ void SYCLMemObjT::prepareForAllocation(context_impl *Context) { case backend::ext_oneapi_level_zero: case backend::ext_oneapi_cuda: case backend::ext_oneapi_hip: - case backend::ext_oneapi_offload: SkipShadowCopy = true; break; - case backend::ext_oneapi_native_cpu: case backend::opencl: + case backend::ext_oneapi_native_cpu: + case backend::ext_oneapi_offload: SkipShadowCopy = false; break; case backend::all: From 13c8c697509b761ff7add2bff4d5be22e60163c7 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Tue, 7 Apr 2026 15:24:08 +0000 Subject: [PATCH 06/12] fix the test --- .../Regression/buffer_shadow_copy_platform_policy.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp index 94bb75a199847..b633c04524b22 100644 --- a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp +++ b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp @@ -72,10 +72,10 @@ static bool shouldSkipAlignedShadowCopy(sycl::backend B) { case sycl::backend::ext_oneapi_level_zero: case sycl::backend::ext_oneapi_cuda: case sycl::backend::ext_oneapi_hip: - case sycl::backend::ext_oneapi_offload: return true; - case sycl::backend::ext_oneapi_native_cpu: case sycl::backend::opencl: + case sycl::backend::ext_oneapi_native_cpu: + case sycl::backend::ext_oneapi_offload: return false; default: return false; From 48e52a1f5ac43a6c5c311d6d380148d609d4a475 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Thu, 9 Apr 2026 11:46:53 +0000 Subject: [PATCH 07/12] fix the e2e test issues --- sycl/source/detail/buffer_impl.cpp | 9 ++- sycl/source/detail/memory_manager.cpp | 36 +++++----- sycl/source/detail/memory_manager.hpp | 7 +- sycl/source/detail/sycl_mem_obj_t.hpp | 2 + .../source/adapters/level_zero/v2/memory.cpp | 68 ++++++++++++++++--- .../source/adapters/level_zero/v2/memory.hpp | 4 ++ 6 files changed, 93 insertions(+), 33 deletions(-) diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index 0db81e3f02129..8327a13b56859 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -23,13 +23,16 @@ void *buffer_impl::allocateMem(context_impl *Context, bool InitFromUserData, ur_event_handle_t &OutEventToWait) { bool HostPtrReadOnly = false; BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly); + const bool BackendOwnedWriteBack = HostPtr != nullptr && + HostPtr == BaseT::getUserPtr() && + BaseT::backendOwnsWriteBack(); assert(!(nullptr == HostPtr && BaseT::useHostPtr() && !Context) && "Internal error. Allocating memory on the host " "while having use_host_ptr property"); return MemoryManager::allocateMemBuffer( - Context, this, HostPtr, HostPtrReadOnly, BaseT::getSizeInBytes(), - BaseT::MInteropEvent, BaseT::MInteropContext.get(), MProps, - OutEventToWait); + Context, this, HostPtr, HostPtrReadOnly, BackendOwnedWriteBack, + BaseT::getSizeInBytes(), BaseT::MInteropEvent, + BaseT::MInteropContext.get(), MProps, OutEventToWait); } void buffer_impl::constructorNotification(const detail::code_location &CodeLoc, void *UserObj, const void *HostObj, diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 54626c614614b..ec6a043cfaaf4 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -324,13 +324,17 @@ void *MemoryManager::allocateInteropMemObject( return UserPtr; } -static ur_mem_flags_t getMemObjCreationFlags(void *UserPtr, - bool HostPtrReadOnly) { +static ur_mem_flags_t +getMemObjCreationFlags(void *UserPtr, bool HostPtrReadOnly, + bool BackendOwnedWriteBack = false) { // Create read_write mem object to handle arbitrary uses. ur_mem_flags_t Result = HostPtrReadOnly ? UR_MEM_FLAG_READ_ONLY : UR_MEM_FLAG_READ_WRITE; - if (UserPtr) + if (UserPtr) { Result |= UR_MEM_FLAG_USE_HOST_POINTER; + if (BackendOwnedWriteBack) + Result |= UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER; + } return Result; } @@ -350,12 +354,12 @@ void *MemoryManager::allocateImageObject(context_impl *TargetContext, return NewMem; } -void * -MemoryManager::allocateBufferObject(context_impl *TargetContext, void *UserPtr, - bool HostPtrReadOnly, const size_t Size, - const sycl::property_list &PropsList) { +void *MemoryManager::allocateBufferObject( + context_impl *TargetContext, void *UserPtr, bool HostPtrReadOnly, + bool BackendOwnedWriteBack, const size_t Size, + const sycl::property_list &PropsList) { ur_mem_flags_t CreationFlags = - getMemObjCreationFlags(UserPtr, HostPtrReadOnly); + getMemObjCreationFlags(UserPtr, HostPtrReadOnly, BackendOwnedWriteBack); if (PropsList.has_property< sycl::ext::oneapi::property::buffer::use_pinned_host_memory>()) CreationFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; @@ -371,13 +375,11 @@ MemoryManager::allocateBufferObject(context_impl *TargetContext, void *UserPtr, return NewMem; } -void *MemoryManager::allocateMemBuffer(context_impl *TargetContext, - SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, - const EventImplPtr &InteropEvent, - context_impl *InteropContext, - const sycl::property_list &PropsList, - ur_event_handle_t &OutEventToWait) { +void *MemoryManager::allocateMemBuffer( + context_impl *TargetContext, SYCLMemObjI *MemObj, void *UserPtr, + bool HostPtrReadOnly, bool BackendOwnedWriteBack, size_t Size, + const EventImplPtr &InteropEvent, context_impl *InteropContext, + const sycl::property_list &PropsList, ur_event_handle_t &OutEventToWait) { void *MemPtr; if (!TargetContext) MemPtr = @@ -387,8 +389,8 @@ void *MemoryManager::allocateMemBuffer(context_impl *TargetContext, allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); else - MemPtr = allocateBufferObject(TargetContext, UserPtr, HostPtrReadOnly, Size, - PropsList); + MemPtr = allocateBufferObject(TargetContext, UserPtr, HostPtrReadOnly, + BackendOwnedWriteBack, Size, PropsList); XPTIRegistry::bufferAssociateNotification(MemObj, MemPtr); return MemPtr; } diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index 1ab72d1975055..b4549aad9bdc0 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -62,7 +62,8 @@ class MemoryManager { // one(not host). static void *allocateMemBuffer(context_impl *TargetContext, SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, + bool HostPtrReadOnly, + bool BackendOwnedWriteBack, size_t Size, const EventImplPtr &InteropEvent, context_impl *InteropContext, const sycl::property_list &PropsList, @@ -101,7 +102,9 @@ class MemoryManager { const sycl::property_list &PropsList); static void *allocateBufferObject(context_impl *TargetContext, void *UserPtr, - bool HostPtrReadOnly, const size_t Size, + bool HostPtrReadOnly, + bool BackendOwnedWriteBack, + const size_t Size, const sycl::property_list &PropsList); // Copies memory between: host and device, host and host, diff --git a/sycl/source/detail/sycl_mem_obj_t.hpp b/sycl/source/detail/sycl_mem_obj_t.hpp index a09068b0cd12f..fdca42bcf196a 100644 --- a/sycl/source/detail/sycl_mem_obj_t.hpp +++ b/sycl/source/detail/sycl_mem_obj_t.hpp @@ -307,6 +307,8 @@ class SYCLMemObjT : public SYCLMemObjI { return MNeedWriteBack && MUploadDataFunctor && !MBackendOwnsWriteBack; } + bool backendOwnsWriteBack() const { return MBackendOwnsWriteBack; } + /// Increment an internal counter for how many graphs are currently using this /// memory object. void markBeingUsedInGraph() { MGraphUseCount += 1; } diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp index b97bda6009928..c148bda18160b 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp @@ -82,6 +82,19 @@ static ur_result_t synchronousZeCopy(ur_context_handle_t hContext, return exceptionToResult(std::current_exception()); } +static ur_result_t +initializeBufferFromHost(ur_context_handle_t hContext, ur_mem_handle_t hBuffer, + ur_mem_buffer_t::device_access_mode_t accessMode, + const void *hostPtr, size_t size) try { + wait_list_view waitList(nullptr, 0); + void *dst = hBuffer->getBuffer()->getDevicePtr(nullptr, accessMode, 0, size, + nullptr, waitList); + auto hDevice = hContext->getDevices()[0]; + return synchronousZeCopy(hContext, hDevice, dst, hostPtr, size); +} catch (...) { + return exceptionToResult(std::current_exception()); +} + ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( ur_context_handle_t hContext, void *hostPtr, size_t size, device_access_mode_t accessMode) @@ -113,9 +126,7 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( auto hDevice = hContext->getDevices()[0]; UR_CALL_THROWS( synchronousZeCopy(hContext, hDevice, this->ptr.get(), hostPtr, size)); - // Store writeBackPtr for copy-back - needed when original pointer - // cannot be imported (e.g., misaligned, wrong allocation type) - writeBackPtr = hostPtr; + mapToPtr = hostPtr; } } } @@ -142,9 +153,9 @@ void *ur_integrated_buffer_handle_t::getDevicePtr( void *ur_integrated_buffer_handle_t::mapHostPtr( ur_map_flags_t flags, size_t offset, size_t mapSize, ze_command_list_handle_t cmdList, wait_list_view & /*waitListView*/) { - if (writeBackPtr) { - // Copy-back path: user gets back their original pointer - void *mappedPtr = ur_cast(writeBackPtr) + offset; + if (mapToPtr) { + // Staging path: map through the host pointer associated with this buffer. + void *mappedPtr = ur_cast(mapToPtr) + offset; if (flags & UR_MAP_FLAG_READ) { // Use Level Zero copy for USM HOST memory to ensure GPU visibility @@ -167,7 +178,7 @@ void *ur_integrated_buffer_handle_t::mapHostPtr( void ur_integrated_buffer_handle_t::unmapHostPtr( void *pMappedPtr, ze_command_list_handle_t cmdList, wait_list_view & /*waitListView*/) { - if (writeBackPtr) { + if (mapToPtr) { // Copy-back path: find the mapped region and copy data back if needed auto mappedRegion = std::find_if(mappedRegions.begin(), mappedRegions.end(), @@ -623,6 +634,8 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext, } void *hostPtr = pProperties ? pProperties->pHost : nullptr; + const bool useHostPtr = (flags & UR_MEM_FLAG_USE_HOST_POINTER) != 0; + const bool copyHostPtr = (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) != 0; auto accessMode = ur_mem_buffer_t::getDeviceAccessMode(flags); // For integrated devices, use zero-copy host buffers. The integrated buffer @@ -632,11 +645,44 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext, // 3. Host pointer can be imported - import it // 4. Otherwise - allocate USM and copy-back through map/unmap operations if (useHostBuffer(hContext)) { - *phBuffer = ur_mem_handle_t_::create( - hContext, hostPtr, size, accessMode); + if (useHostPtr) { + auto buffer = std::unique_ptr( + ur_mem_handle_t_::create( + hContext, hostPtr, size, accessMode)); + if (copyHostPtr && hostPtr) { + static_cast(buffer->getBuffer()) + ->setWriteBackPtr(hostPtr); + } + *phBuffer = buffer.release(); + } else if (copyHostPtr && hostPtr) { + auto buffer = std::unique_ptr( + ur_mem_handle_t_::create( + hContext, nullptr, size, accessMode)); + UR_CALL(initializeBufferFromHost(hContext, buffer.get(), accessMode, + hostPtr, size)); + *phBuffer = buffer.release(); + } else { + *phBuffer = ur_mem_handle_t_::create( + hContext, hostPtr, size, accessMode); + } } else { - *phBuffer = ur_mem_handle_t_::create( - hContext, hostPtr, size, accessMode); + if (useHostPtr && copyHostPtr && hostPtr) { + *phBuffer = ur_mem_handle_t_::create( + hContext, nullptr, nullptr, size, accessMode, hostPtr, false); + } else if (useHostPtr) { + *phBuffer = ur_mem_handle_t_::create( + hContext, hostPtr, size, accessMode); + } else if (copyHostPtr && hostPtr) { + auto buffer = std::unique_ptr( + ur_mem_handle_t_::create( + hContext, nullptr, size, accessMode)); + UR_CALL(initializeBufferFromHost(hContext, buffer.get(), accessMode, + hostPtr, size)); + *phBuffer = buffer.release(); + } else { + *phBuffer = ur_mem_handle_t_::create( + hContext, hostPtr, size, accessMode); + } } return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.hpp b/unified-runtime/source/adapters/level_zero/v2/memory.hpp index c793cf97d79eb..3c4bcdcc99022 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.hpp @@ -123,8 +123,12 @@ struct ur_integrated_buffer_handle_t : ur_mem_buffer_t { // Perform final copy-back to original host pointer if needed void copyBackToHostIfNeeded(); + void setMapToPtr(void *hostPtr) { mapToPtr = hostPtr; } + void setWriteBackPtr(void *hostPtr) { writeBackPtr = hostPtr; } + private: usm_unique_ptr_t ptr; + void *mapToPtr = nullptr; void *writeBackPtr = nullptr; std::vector mappedRegions; }; From 88408662659c2b5d8593148fb85cbe4a0163f1ae Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Thu, 9 Apr 2026 12:08:27 +0000 Subject: [PATCH 08/12] --amend --- .../Regression/buffer_shadow_copy_platform_policy.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp index b633c04524b22..ff91698d8be2a 100644 --- a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp +++ b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp @@ -183,11 +183,10 @@ int main() { } else { // SYCL creates the shadow copy itself; expect exactly one extra allocation. if (MisalignedAllocations != AlignedAllocations + 1) { - std::cerr - << "Expected one extra SYCL allocation for misaligned pointer: " - "aligned=" - << AlignedAllocations << ", misaligned=" << MisalignedAllocations - << "\n"; + std::cerr << "Expected one extra SYCL allocation for misaligned pointer: " + "aligned=" + << AlignedAllocations + << ", misaligned=" << MisalignedAllocations << "\n"; return 1; } } From fdda47b8c0eab4f2d0d123cbcd78a995703c3a91 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Fri, 10 Apr 2026 15:03:16 +0000 Subject: [PATCH 09/12] fix for opencl and simplify shadow alloc --- sycl/source/detail/memory_manager.hpp | 13 +- sycl/source/detail/sycl_mem_obj_i.hpp | 4 +- sycl/source/detail/sycl_mem_obj_t.cpp | 110 +++++++++++-- sycl/source/detail/sycl_mem_obj_t.hpp | 47 +++--- .../buffer_shadow_copy_platform_policy.cpp | 149 +++--------------- 5 files changed, 149 insertions(+), 174 deletions(-) diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index b4549aad9bdc0..5b27d0e155001 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -60,14 +60,11 @@ class MemoryManager { // Allocates buffer in specified context taking into account situations such // as host ptr or cl_mem provided by user. TargetContext should be device // one(not host). - static void *allocateMemBuffer(context_impl *TargetContext, - SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, - bool BackendOwnedWriteBack, size_t Size, - const EventImplPtr &InteropEvent, - context_impl *InteropContext, - const sycl::property_list &PropsList, - ur_event_handle_t &OutEventToWait); + static void *allocateMemBuffer( + context_impl *TargetContext, SYCLMemObjI *MemObj, void *UserPtr, + bool HostPtrReadOnly, bool BackendOwnedWriteBack, size_t Size, + const EventImplPtr &InteropEvent, context_impl *InteropContext, + const sycl::property_list &PropsList, ur_event_handle_t &OutEventToWait); // Allocates images in specified context taking into account situations such // as host ptr or cl_mem provided by user. TargetContext should be device diff --git a/sycl/source/detail/sycl_mem_obj_i.hpp b/sycl/source/detail/sycl_mem_obj_i.hpp index 943ecef78f31c..3f325c7653cbf 100644 --- a/sycl/source/detail/sycl_mem_obj_i.hpp +++ b/sycl/source/detail/sycl_mem_obj_i.hpp @@ -48,9 +48,7 @@ class SYCLMemObjI { // Optional hook executed right before allocateMem(). Memory objects can use // it to resolve context/backend-dependent allocation policy. - virtual void prepareForAllocation(context_impl *Context) { - (void)Context; - } + virtual void prepareForAllocation(context_impl *Context) { (void)Context; } // Should be used for memory object created without use_host_ptr property. virtual void *allocateHostMem() = 0; diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index 5e3c37885e4ab..e1a0f3978174d 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -13,10 +13,32 @@ #include #include +#include + namespace sycl { inline namespace _V1 { namespace detail { +namespace { + +size_t getBackendShadowCopyAlignment(context_impl *Context) { + size_t RequiredAlign = 1; + for (const auto &Device : Context->getDevices()) { + const uint32_t AlignBits = + Device.get_info(); + if (AlignBits == 0) + continue; + + // UR reports MEM_BASE_ADDR_ALIGN in bits. + const size_t AlignBytes = (static_cast(AlignBits) + 7) / 8; + if (AlignBytes > RequiredAlign) + RequiredAlign = AlignBytes; + } + return RequiredAlign; +} + +} // namespace + SYCLMemObjT::SYCLMemObjT(ur_native_handle_t MemObject, const context &SyclContext, const size_t, event AvailableEvent, @@ -162,12 +184,69 @@ void SYCLMemObjT::updateHostMemory() { (Result || !GlobalHandler::instance().isOkToDefer()) && "removeMemoryObject should not return false in mem object destructor"); } - releaseHostMem(MShadowCopy); + detail::OSUtil::alignedFree(MShadowCopy); if (MOpenCLInterop) { getAdapter().call(MInteropMemObject); } } + +void SYCLMemObjT::materializeShadowCopy(const void *SourcePtr, + size_t RequiredAlign) { + if (MPendingShadowCopyAlignment > RequiredAlign) + RequiredAlign = MPendingShadowCopyAlignment; + + if (RequiredAlign == 0) + RequiredAlign = 1; + + MPendingShadowCopyAlignment = RequiredAlign; + + void *OldUserPtr = MUserPtr; + void *OldShadowCopy = MShadowCopy; + const void *CopySource = SourcePtr; + if (OldShadowCopy) { + if ((reinterpret_cast(OldShadowCopy) % RequiredAlign) == + 0) { + MUserPtr = OldShadowCopy; + return; + } + CopySource = OldShadowCopy; + } + + assert(CopySource != nullptr && + "Cannot materialize a shadow copy without source data"); + + // Allocate the shadow copy via the platform-aligned allocator directly, + // bypassing the user-provided allocator. Shadow copies are an internal + // runtime detail; the user allocator cannot be relied upon to satisfy + // backend alignment requirements (e.g. CL_DEVICE_MEM_BASE_ADDR_ALIGN). + const size_t AllocBytes = + MSizeInBytes == 0 ? RequiredAlign + : ((MSizeInBytes + RequiredAlign - 1) / RequiredAlign) * + RequiredAlign; + void *NewShadowCopy = detail::OSUtil::alignedAlloc(RequiredAlign, AllocBytes); + if (!NewShadowCopy) + throw std::bad_alloc(); + if (MSizeInBytes != 0) + std::memcpy(NewShadowCopy, CopySource, MSizeInBytes); + + MShadowCopy = NewShadowCopy; + MUserPtr = NewShadowCopy; + updateRecordedMemAllocation(OldUserPtr, NewShadowCopy); + + detail::OSUtil::alignedFree(OldShadowCopy); +} + +void SYCLMemObjT::updateRecordedMemAllocation(void *OldPtr, void *NewPtr) { + if (MRecord == nullptr || OldPtr == nullptr || OldPtr == NewPtr) + return; + + for (auto *AllocaCmd : MRecord->MAllocaCommands) { + if (AllocaCmd->MMemAllocation == OldPtr) + AllocaCmd->MMemAllocation = NewPtr; + } +} + adapter_impl &SYCLMemObjT::getAdapter() const { assert((MInteropContext != nullptr) && "Trying to get Adapter from SYCLMemObjT with nullptr ContextImpl."); @@ -181,7 +260,7 @@ void SYCLMemObjT::prepareForAllocation(context_impl *Context) { if (!Context) return; - if (!MHasPendingAlignedShadowCopy || MShadowCopy != nullptr) + if (!MHasPendingAlignedShadowCopy) return; bool SkipShadowCopy = false; @@ -190,6 +269,10 @@ void SYCLMemObjT::prepareForAllocation(context_impl *Context) { if (Devices.size() != 0) Backend = Devices.front().getBackend(); + const size_t BackendRequiredAlign = getBackendShadowCopyAlignment(Context); + if (BackendRequiredAlign > MPendingShadowCopyAlignment) + MPendingShadowCopyAlignment = BackendRequiredAlign; + switch (Backend) { case backend::ext_oneapi_level_zero: case backend::ext_oneapi_cuda: @@ -209,18 +292,25 @@ void SYCLMemObjT::prepareForAllocation(context_impl *Context) { std::lock_guard Lock(MCreateShadowCopyMtx); if (SkipShadowCopy) { + if (MShadowCopy != nullptr) { + // A writable host accessor already forced a SYCL shadow copy. Keep using + // that path so the final copy-back still targets the original user ptr. + return; + } + // Backend (UR) will manage the misaligned host pointer through its own // internal staging buffer and owns the final copy-back to the original ptr. MCreateShadowCopy = []() -> void {}; MBackendOwnsWriteBack = true; if (!MHostPtrReadOnly) MUploadDataFunctor = nullptr; - } else { - MCreateShadowCopy(); - MCreateShadowCopy = []() -> void {}; - MBackendOwnsWriteBack = false; + MHasPendingAlignedShadowCopy = false; + return; } + materializeShadowCopy(MUserPtr, BackendRequiredAlign); + MCreateShadowCopy = []() -> void {}; + MBackendOwnsWriteBack = false; MHasPendingAlignedShadowCopy = false; } @@ -280,13 +370,7 @@ void SYCLMemObjT::handleWriteAccessorCreation() { MCreateShadowCopy(); MCreateShadowCopy = []() -> void {}; } - if (MRecord != nullptr && MUserPtr != InitialUserPtr) { - for (auto &it : MRecord->MAllocaCommands) { - if (it->MMemAllocation == InitialUserPtr) { - it->MMemAllocation = MUserPtr; - } - } - } + updateRecordedMemAllocation(InitialUserPtr, MUserPtr); } } // namespace detail diff --git a/sycl/source/detail/sycl_mem_obj_t.hpp b/sycl/source/detail/sycl_mem_obj_t.hpp index fdca42bcf196a..55f39f82c1f1e 100644 --- a/sycl/source/detail/sycl_mem_obj_t.hpp +++ b/sycl/source/detail/sycl_mem_obj_t.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -151,6 +152,8 @@ class SYCLMemObjT : public SYCLMemObjI { protected: void updateHostMemory(void *const Ptr); + void materializeShadowCopy(const void *SourcePtr, size_t RequiredAlign); + void updateRecordedMemAllocation(void *OldPtr, void *NewPtr); // Update host with the latest data + notify scheduler that the memory object // is going to die. After this method is finished no further operations with @@ -190,20 +193,16 @@ class SYCLMemObjT : public SYCLMemObjI { MUserPtr = HostPtr; std::lock_guard Lock(MCreateShadowCopyMtx); MCreateShadowCopy = [this, RequiredAlign, HostPtr]() -> void { - setAlign(RequiredAlign); - MShadowCopy = allocateHostMem(); - MUserPtr = MShadowCopy; - std::memcpy(MUserPtr, HostPtr, MSizeInBytes); + materializeShadowCopy(HostPtr, RequiredAlign); }; } else { MUserPtr = HostPtr; + if (RequiredAlign > MPendingShadowCopyAlignment) + MPendingShadowCopyAlignment = RequiredAlign; MHasPendingAlignedShadowCopy = true; std::lock_guard Lock(MCreateShadowCopyMtx); MCreateShadowCopy = [this, RequiredAlign, HostPtr]() -> void { - setAlign(RequiredAlign); - MShadowCopy = allocateHostMem(); - MUserPtr = MShadowCopy; - std::memcpy(MUserPtr, HostPtr, MSizeInBytes); + materializeShadowCopy(HostPtr, RequiredAlign); }; } } @@ -229,20 +228,16 @@ class SYCLMemObjT : public SYCLMemObjI { MUserPtr = HostPtr.get(); std::lock_guard Lock(MCreateShadowCopyMtx); MCreateShadowCopy = [this, RequiredAlign, HostPtr]() -> void { - setAlign(RequiredAlign); - MShadowCopy = allocateHostMem(); - MUserPtr = MShadowCopy; - std::memcpy(MUserPtr, HostPtr.get(), MSizeInBytes); + materializeShadowCopy(HostPtr.get(), RequiredAlign); }; } else { MUserPtr = HostPtr.get(); + if (RequiredAlign > MPendingShadowCopyAlignment) + MPendingShadowCopyAlignment = RequiredAlign; MHasPendingAlignedShadowCopy = true; std::lock_guard Lock(MCreateShadowCopyMtx); MCreateShadowCopy = [this, RequiredAlign, HostPtr]() -> void { - setAlign(RequiredAlign); - MShadowCopy = allocateHostMem(); - MUserPtr = MShadowCopy; - std::memcpy(MUserPtr, HostPtr.get(), MSizeInBytes); + materializeShadowCopy(HostPtr.get(), RequiredAlign); }; } } @@ -257,8 +252,13 @@ class SYCLMemObjT : public SYCLMemObjI { "Buffer constructor from a pair of iterator values does " "not support use_host_ptr property."); - setAlign(RequiredAlign); - MShadowCopy = allocateHostMem(); + // Shadow copies are an internal runtime detail; always allocate via the + // platform-aligned allocator so all MShadowCopy frees are uniform. + MShadowCopy = detail::OSUtil::alignedAlloc( + RequiredAlign, std::max(MSizeInBytes, RequiredAlign)); + if (!MShadowCopy) + throw exception(make_error_code(errc::runtime), + "Failed to allocate shadow copy"); MUserPtr = MShadowCopy; CopyFromInput(MUserPtr); @@ -331,9 +331,9 @@ class SYCLMemObjT : public SYCLMemObjI { /// Returns true if any graphs are currently using this memory object. bool isUsedInGraph() const { return MGraphUseCount > 0; } - + const property_list &getPropList() const { return MProps; } - + protected: // An allocateMem helper that determines which host ptr to use void determineHostPtr(context_impl *Context, bool InitFromUserData, @@ -362,7 +362,8 @@ class SYCLMemObjT : public SYCLMemObjI { size_t MSizeInBytes = 0; // User's pointer passed to constructor. void *MUserPtr; - // Copy of memory passed by user to constructor. + // Copy of memory passed by user to constructor. Always allocated via + // OSUtil::alignedAlloc (never via MAllocator) so teardown is uniform. void *MShadowCopy; // Function which update host with final data on memory object destruction. std::function MUploadDataFunctor; @@ -385,6 +386,10 @@ class SYCLMemObjT : public SYCLMemObjI { // accessor is created. std::function MCreateShadowCopy = []() -> void {}; std::mutex MCreateShadowCopyMtx; + // The strongest backend alignment requirement observed so far. Deferred + // shadow-copy materialization uses this to upgrade from frontend alignment + // (e.g. alignof(T)) to the backend host-pointer requirement. + size_t MPendingShadowCopyAlignment = 0; // Set when misaligned input data cannot be used directly and the shadow-copy // decision is deferred until backend/platform is known. bool MHasPendingAlignedShadowCopy = false; diff --git a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp index ff91698d8be2a..677244aca4a83 100644 --- a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp +++ b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp @@ -1,90 +1,27 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// Checks two things: -// 1) policy check (read-only path): whether misaligned host pointers trigger -// an extra SYCL shadow-copy allocation depending on backend; -// 2) correctness check (writable path): data is correctly copied back to host -// when buffer goes out of scope. +// Regression test for the triple-buffer issue: when a SYCL buffer is +// constructed with a misaligned or non-USM host pointer, the runtime must +// ensure that: +// 1) (read path) the kernel observes the original host data correctly; +// 2) (write path) kernel-side modifications are written back to the original +// host pointer once the buffer goes out of scope. // -// The SYCL runtime and the UR adapter layer use different strategies for -// non-importable or misaligned host pointers: -// -// * Backends with UR buffer allocation (e.g. Level Zero v2): -// SYCL skips its own shadow copy (prepareForAllocation sets SkipShadowCopy). -// The UR adapter owns staging and final copy-back: -// - integrated device: allocates internal USM host memory, copies data in, -// and copies it back to the original pointer on buffer release; -// - discrete device: allocates device memory, migrates data from source. -// These UR-internal allocations go through the USM pool and are NOT seen by -// CountingAllocator, so the SYCL-layer allocation count stays zero. -// -// * Backends without UR buffer allocation (e.g. NativeCPU, OpenCL): -// SYCL creates the shadow copy itself via the buffer's allocator -// (CountingAllocator records it) and drives copy-back on destruction. -// -// The test measures only SYCL-layer (CountingAllocator) allocations, so it is -// portable across all supported backends without runtime-specific conditionals. +// These two invariants must hold regardless of whether the SYCL runtime or the +// UR adapter is responsible for the internal copy/write-back. The test is +// intentionally backend-agnostic. #include -#include -#include #include #include #include -#include #include -template class CountingAllocator { -public: - using value_type = T; - - CountingAllocator() = default; - - template - constexpr CountingAllocator(const CountingAllocator &) noexcept {} - - T *allocate(std::size_t N) { - Allocations.fetch_add(1, std::memory_order_relaxed); - return std::allocator{}.allocate(N); - } - - void deallocate(T *Ptr, std::size_t N) { - std::allocator{}.deallocate(Ptr, N); - } - - template bool operator==(const CountingAllocator &) const { - return true; - } - - template bool operator!=(const CountingAllocator &) const { - return false; - } - - static std::atomic Allocations; -}; - -template std::atomic CountingAllocator::Allocations{0}; - -static bool shouldSkipAlignedShadowCopy(sycl::backend B) { - switch (B) { - case sycl::backend::ext_oneapi_level_zero: - case sycl::backend::ext_oneapi_cuda: - case sycl::backend::ext_oneapi_hip: - return true; - case sycl::backend::opencl: - case sycl::backend::ext_oneapi_native_cpu: - case sycl::backend::ext_oneapi_offload: - return false; - default: - return false; - } -} - -// Read-only kernel path used for allocation-policy assertions. +// Read-only kernel: sum all elements and return the result. static int runReadOnlySumKernel(sycl::queue &Q, const int *HostPtr, size_t N) { - sycl::buffer> Buf(HostPtr, sycl::range<1>(N)); + sycl::buffer Buf(HostPtr, sycl::range<1>(N)); sycl::buffer SumBuf(1); Q.submit([&](sycl::handler &CGH) { @@ -106,8 +43,7 @@ static int runReadOnlySumKernel(sycl::queue &Q, const int *HostPtr, size_t N) { // Writable kernel path; buffer destruction happens at scope exit. static void runWriteKernel(sycl::queue &Q, int *HostPtr, size_t N) { { - sycl::buffer> Buf(HostPtr, - sycl::range<1>(N)); + sycl::buffer Buf(HostPtr, sycl::range<1>(N)); Q.submit([&](sycl::handler &CGH) { auto OutAcc = Buf.get_access(CGH); @@ -135,10 +71,13 @@ int main() { constexpr size_t N = 32; sycl::queue Q; + // Build aligned reference data. std::vector AlignedInput(N); for (size_t I = 0; I < N; ++I) AlignedInput[I] = static_cast(I); + // Build a deliberately misaligned copy: offset by 1 byte so that the int* + // is not naturally aligned. std::vector Storage(sizeof(int) * N + 1); int *UnalignedPtr = reinterpret_cast(Storage.data() + 1); std::memcpy(UnalignedPtr, AlignedInput.data(), sizeof(int) * N); @@ -146,68 +85,30 @@ int main() { const int ExpectedSum = static_cast((N - 1) * N / 2); - // Compare aligned vs misaligned read-only input. Allocation count is used as - // a proxy for SYCL shadow-copy creation. - CountingAllocator::Allocations.store(0, std::memory_order_relaxed); + // --- Read path correctness --- + // Both aligned and misaligned host pointers must produce the correct sum. const int AlignedSum = runReadOnlySumKernel(Q, AlignedInput.data(), N); - const size_t AlignedAllocations = - CountingAllocator::Allocations.load(std::memory_order_relaxed); if (AlignedSum != ExpectedSum) { std::cerr << "Unexpected aligned sum: " << AlignedSum << "\n"; return 1; } - CountingAllocator::Allocations.store(0, std::memory_order_relaxed); const int MisalignedSum = runReadOnlySumKernel(Q, ReadOnlyUnalignedPtr, N); - - const size_t MisalignedAllocations = - CountingAllocator::Allocations.load(std::memory_order_relaxed); if (MisalignedSum != ExpectedSum) { std::cerr << "Unexpected misaligned sum: " << MisalignedSum << "\n"; return 1; } - const bool ExpectNoShadowCopy = shouldSkipAlignedShadowCopy(Q.get_backend()); - - if (ExpectNoShadowCopy) { - // SYCL skips its own shadow copy; UR is responsible for staging and - // copy-back. No SYCL-layer extra allocation is expected regardless of - // whether the device is integrated or discrete. - if (MisalignedAllocations != AlignedAllocations) { - std::cerr << "Unexpected extra SYCL allocation on misaligned pointer: " - "aligned=" - << AlignedAllocations - << ", misaligned=" << MisalignedAllocations << "\n"; - return 1; - } - } else { - // SYCL creates the shadow copy itself; expect exactly one extra allocation. - if (MisalignedAllocations != AlignedAllocations + 1) { - std::cerr << "Expected one extra SYCL allocation for misaligned pointer: " - "aligned=" - << AlignedAllocations - << ", misaligned=" << MisalignedAllocations << "\n"; - return 1; - } - } - - // Validate writable path including final copy-back at buffer destruction. - // This checks correctness only; writable allocation counts are intentionally - // not asserted (write accessor can conservatively materialize shadow copy). + // --- Write-back correctness --- + // After the buffer goes out of scope the kernel-written pattern must be + // visible at the original host pointer, even when that pointer is misaligned. std::vector AlignedWritable(N, 0); std::vector WritableStorage(sizeof(int) * N + 1, 0); int *UnalignedWritablePtr = reinterpret_cast(WritableStorage.data() + 1); - CountingAllocator::Allocations.store(0, std::memory_order_relaxed); runWriteKernel(Q, AlignedWritable.data(), N); - const size_t AlignedWriteAllocs = - CountingAllocator::Allocations.load(std::memory_order_relaxed); - - CountingAllocator::Allocations.store(0, std::memory_order_relaxed); runWriteKernel(Q, UnalignedWritablePtr, N); - const size_t MisalignedWriteAllocs = - CountingAllocator::Allocations.load(std::memory_order_relaxed); if (!checkExpectedPattern(AlignedWritable.data(), N)) { std::cerr << "Unexpected data in aligned writable buffer\n"; @@ -218,15 +119,5 @@ int main() { return 1; } - // Writable allocation counts are intentionally not asserted here because the - // expected delta is platform-dependent: - // * NativeCPU/OpenCL: SYCL creates a shadow copy for misaligned writable - // buffers (MisalignedWriteAllocs == AlignedWriteAllocs + 1). - // * L0/CUDA/HIP/offload: SYCL skips the shadow copy; UR stages internally - // (MisalignedWriteAllocs == AlignedWriteAllocs). - // The correctness checks above are sufficient to validate copy-back behavior. - (void)AlignedWriteAllocs; - (void)MisalignedWriteAllocs; - return 0; } From 77e781e8412a82aad9e4ab0471430639277045cf Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Tue, 28 Apr 2026 15:11:59 +0000 Subject: [PATCH 10/12] apply comments --- sycl/source/detail/buffer_impl.cpp | 9 ++--- sycl/source/detail/memory_manager.cpp | 36 +++++++++---------- sycl/source/detail/memory_manager.hpp | 16 ++++----- sycl/source/detail/sycl_mem_obj_t.cpp | 6 ++-- .../source/adapters/level_zero/v2/memory.cpp | 19 +++++----- .../source/adapters/level_zero/v2/memory.hpp | 3 -- 6 files changed, 42 insertions(+), 47 deletions(-) diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index 8327a13b56859..0db81e3f02129 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -23,16 +23,13 @@ void *buffer_impl::allocateMem(context_impl *Context, bool InitFromUserData, ur_event_handle_t &OutEventToWait) { bool HostPtrReadOnly = false; BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly); - const bool BackendOwnedWriteBack = HostPtr != nullptr && - HostPtr == BaseT::getUserPtr() && - BaseT::backendOwnsWriteBack(); assert(!(nullptr == HostPtr && BaseT::useHostPtr() && !Context) && "Internal error. Allocating memory on the host " "while having use_host_ptr property"); return MemoryManager::allocateMemBuffer( - Context, this, HostPtr, HostPtrReadOnly, BackendOwnedWriteBack, - BaseT::getSizeInBytes(), BaseT::MInteropEvent, - BaseT::MInteropContext.get(), MProps, OutEventToWait); + Context, this, HostPtr, HostPtrReadOnly, BaseT::getSizeInBytes(), + BaseT::MInteropEvent, BaseT::MInteropContext.get(), MProps, + OutEventToWait); } void buffer_impl::constructorNotification(const detail::code_location &CodeLoc, void *UserObj, const void *HostObj, diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index ec6a043cfaaf4..54626c614614b 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -324,17 +324,13 @@ void *MemoryManager::allocateInteropMemObject( return UserPtr; } -static ur_mem_flags_t -getMemObjCreationFlags(void *UserPtr, bool HostPtrReadOnly, - bool BackendOwnedWriteBack = false) { +static ur_mem_flags_t getMemObjCreationFlags(void *UserPtr, + bool HostPtrReadOnly) { // Create read_write mem object to handle arbitrary uses. ur_mem_flags_t Result = HostPtrReadOnly ? UR_MEM_FLAG_READ_ONLY : UR_MEM_FLAG_READ_WRITE; - if (UserPtr) { + if (UserPtr) Result |= UR_MEM_FLAG_USE_HOST_POINTER; - if (BackendOwnedWriteBack) - Result |= UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER; - } return Result; } @@ -354,12 +350,12 @@ void *MemoryManager::allocateImageObject(context_impl *TargetContext, return NewMem; } -void *MemoryManager::allocateBufferObject( - context_impl *TargetContext, void *UserPtr, bool HostPtrReadOnly, - bool BackendOwnedWriteBack, const size_t Size, - const sycl::property_list &PropsList) { +void * +MemoryManager::allocateBufferObject(context_impl *TargetContext, void *UserPtr, + bool HostPtrReadOnly, const size_t Size, + const sycl::property_list &PropsList) { ur_mem_flags_t CreationFlags = - getMemObjCreationFlags(UserPtr, HostPtrReadOnly, BackendOwnedWriteBack); + getMemObjCreationFlags(UserPtr, HostPtrReadOnly); if (PropsList.has_property< sycl::ext::oneapi::property::buffer::use_pinned_host_memory>()) CreationFlags |= UR_MEM_FLAG_ALLOC_HOST_POINTER; @@ -375,11 +371,13 @@ void *MemoryManager::allocateBufferObject( return NewMem; } -void *MemoryManager::allocateMemBuffer( - context_impl *TargetContext, SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, bool BackendOwnedWriteBack, size_t Size, - const EventImplPtr &InteropEvent, context_impl *InteropContext, - const sycl::property_list &PropsList, ur_event_handle_t &OutEventToWait) { +void *MemoryManager::allocateMemBuffer(context_impl *TargetContext, + SYCLMemObjI *MemObj, void *UserPtr, + bool HostPtrReadOnly, size_t Size, + const EventImplPtr &InteropEvent, + context_impl *InteropContext, + const sycl::property_list &PropsList, + ur_event_handle_t &OutEventToWait) { void *MemPtr; if (!TargetContext) MemPtr = @@ -389,8 +387,8 @@ void *MemoryManager::allocateMemBuffer( allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); else - MemPtr = allocateBufferObject(TargetContext, UserPtr, HostPtrReadOnly, - BackendOwnedWriteBack, Size, PropsList); + MemPtr = allocateBufferObject(TargetContext, UserPtr, HostPtrReadOnly, Size, + PropsList); XPTIRegistry::bufferAssociateNotification(MemObj, MemPtr); return MemPtr; } diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index 5b27d0e155001..1ab72d1975055 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -60,11 +60,13 @@ class MemoryManager { // Allocates buffer in specified context taking into account situations such // as host ptr or cl_mem provided by user. TargetContext should be device // one(not host). - static void *allocateMemBuffer( - context_impl *TargetContext, SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, bool BackendOwnedWriteBack, size_t Size, - const EventImplPtr &InteropEvent, context_impl *InteropContext, - const sycl::property_list &PropsList, ur_event_handle_t &OutEventToWait); + static void *allocateMemBuffer(context_impl *TargetContext, + SYCLMemObjI *MemObj, void *UserPtr, + bool HostPtrReadOnly, size_t Size, + const EventImplPtr &InteropEvent, + context_impl *InteropContext, + const sycl::property_list &PropsList, + ur_event_handle_t &OutEventToWait); // Allocates images in specified context taking into account situations such // as host ptr or cl_mem provided by user. TargetContext should be device @@ -99,9 +101,7 @@ class MemoryManager { const sycl::property_list &PropsList); static void *allocateBufferObject(context_impl *TargetContext, void *UserPtr, - bool HostPtrReadOnly, - bool BackendOwnedWriteBack, - const size_t Size, + bool HostPtrReadOnly, const size_t Size, const sycl::property_list &PropsList); // Copies memory between: host and device, host and host, diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index e1a0f3978174d..a79acc5ddcfdb 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -275,10 +275,10 @@ void SYCLMemObjT::prepareForAllocation(context_impl *Context) { switch (Backend) { case backend::ext_oneapi_level_zero: - case backend::ext_oneapi_cuda: - case backend::ext_oneapi_hip: SkipShadowCopy = true; break; + case backend::ext_oneapi_cuda: + case backend::ext_oneapi_hip: case backend::opencl: case backend::ext_oneapi_native_cpu: case backend::ext_oneapi_offload: @@ -291,6 +291,8 @@ void SYCLMemObjT::prepareForAllocation(context_impl *Context) { } std::lock_guard Lock(MCreateShadowCopyMtx); + if (!MHasPendingAlignedShadowCopy) + return; if (SkipShadowCopy) { if (MShadowCopy != nullptr) { // A writable host accessor already forced a SYCL shadow copy. Keep using diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp index c148bda18160b..1ffa08777c5d4 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp @@ -122,11 +122,12 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( }); if (hostPtr) { - // Initial copy using Level Zero for USM HOST memory auto hDevice = hContext->getDevices()[0]; UR_CALL_THROWS( synchronousZeCopy(hContext, hDevice, this->ptr.get(), hostPtr, size)); mapToPtr = hostPtr; + if (accessMode != device_access_mode_t::read_only) + writeBackPtr = hostPtr; } } } @@ -646,15 +647,12 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext, // 4. Otherwise - allocate USM and copy-back through map/unmap operations if (useHostBuffer(hContext)) { if (useHostPtr) { - auto buffer = std::unique_ptr( - ur_mem_handle_t_::create( - hContext, hostPtr, size, accessMode)); - if (copyHostPtr && hostPtr) { - static_cast(buffer->getBuffer()) - ->setWriteBackPtr(hostPtr); - } - *phBuffer = buffer.release(); + *phBuffer = ur_mem_handle_t_::create( + hContext, hostPtr, size, accessMode); } else if (copyHostPtr && hostPtr) { + // Not used by SYCL today. Kept for direct UR calls to + // urMemBufferCreate(..., UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER, ...) + // without UR_MEM_FLAG_USE_HOST_POINTER. auto buffer = std::unique_ptr( ur_mem_handle_t_::create( hContext, nullptr, size, accessMode)); @@ -673,6 +671,9 @@ ur_result_t urMemBufferCreate(ur_context_handle_t hContext, *phBuffer = ur_mem_handle_t_::create( hContext, hostPtr, size, accessMode); } else if (copyHostPtr && hostPtr) { + // Not used by SYCL today. Kept for direct UR calls to + // urMemBufferCreate(..., UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER, ...) + // without UR_MEM_FLAG_USE_HOST_POINTER. auto buffer = std::unique_ptr( ur_mem_handle_t_::create( hContext, nullptr, size, accessMode)); diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.hpp b/unified-runtime/source/adapters/level_zero/v2/memory.hpp index 3c4bcdcc99022..702b91811419d 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.hpp @@ -123,9 +123,6 @@ struct ur_integrated_buffer_handle_t : ur_mem_buffer_t { // Perform final copy-back to original host pointer if needed void copyBackToHostIfNeeded(); - void setMapToPtr(void *hostPtr) { mapToPtr = hostPtr; } - void setWriteBackPtr(void *hostPtr) { writeBackPtr = hostPtr; } - private: usm_unique_ptr_t ptr; void *mapToPtr = nullptr; From 94f0ae79184f4b7555c5ca4f744f9881e3b18c78 Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Fri, 8 May 2026 11:46:14 +0000 Subject: [PATCH 11/12] bugfix --- unified-runtime/source/adapters/level_zero/v2/memory.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp index 1ffa08777c5d4..d22535203cb0d 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp @@ -126,8 +126,6 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( UR_CALL_THROWS( synchronousZeCopy(hContext, hDevice, this->ptr.get(), hostPtr, size)); mapToPtr = hostPtr; - if (accessMode != device_access_mode_t::read_only) - writeBackPtr = hostPtr; } } } From 33d56cfebde7271008a77bd793d4b6f79a88f55e Mon Sep 17 00:00:00 2001 From: "Mateusz P. Nowak" Date: Fri, 15 May 2026 15:41:10 +0000 Subject: [PATCH 12/12] apply comments --- sycl/source/detail/sycl_mem_obj_t.cpp | 13 ++++++---- sycl/source/detail/sycl_mem_obj_t.hpp | 2 -- .../buffer_shadow_copy_platform_policy.cpp | 24 ++++++++----------- .../source/adapters/level_zero/v2/memory.cpp | 4 ++++ 4 files changed, 22 insertions(+), 21 deletions(-) diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index a79acc5ddcfdb..be4b0e46ffff2 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -221,9 +221,10 @@ void SYCLMemObjT::materializeShadowCopy(const void *SourcePtr, // runtime detail; the user allocator cannot be relied upon to satisfy // backend alignment requirements (e.g. CL_DEVICE_MEM_BASE_ADDR_ALIGN). const size_t AllocBytes = - MSizeInBytes == 0 ? RequiredAlign - : ((MSizeInBytes + RequiredAlign - 1) / RequiredAlign) * - RequiredAlign; + MSizeInBytes == 0 + ? RequiredAlign + : MSizeInBytes + + (RequiredAlign - (MSizeInBytes % RequiredAlign)) % RequiredAlign; void *NewShadowCopy = detail::OSUtil::alignedAlloc(RequiredAlign, AllocBytes); if (!NewShadowCopy) throw std::bad_alloc(); @@ -270,8 +271,6 @@ void SYCLMemObjT::prepareForAllocation(context_impl *Context) { Backend = Devices.front().getBackend(); const size_t BackendRequiredAlign = getBackendShadowCopyAlignment(Context); - if (BackendRequiredAlign > MPendingShadowCopyAlignment) - MPendingShadowCopyAlignment = BackendRequiredAlign; switch (Backend) { case backend::ext_oneapi_level_zero: @@ -293,10 +292,14 @@ void SYCLMemObjT::prepareForAllocation(context_impl *Context) { std::lock_guard Lock(MCreateShadowCopyMtx); if (!MHasPendingAlignedShadowCopy) return; + if (BackendRequiredAlign > MPendingShadowCopyAlignment) + MPendingShadowCopyAlignment = BackendRequiredAlign; if (SkipShadowCopy) { if (MShadowCopy != nullptr) { // A writable host accessor already forced a SYCL shadow copy. Keep using // that path so the final copy-back still targets the original user ptr. + MBackendOwnsWriteBack = false; + MHasPendingAlignedShadowCopy = false; return; } diff --git a/sycl/source/detail/sycl_mem_obj_t.hpp b/sycl/source/detail/sycl_mem_obj_t.hpp index 55f39f82c1f1e..c26490e2c8954 100644 --- a/sycl/source/detail/sycl_mem_obj_t.hpp +++ b/sycl/source/detail/sycl_mem_obj_t.hpp @@ -307,8 +307,6 @@ class SYCLMemObjT : public SYCLMemObjI { return MNeedWriteBack && MUploadDataFunctor && !MBackendOwnsWriteBack; } - bool backendOwnsWriteBack() const { return MBackendOwnsWriteBack; } - /// Increment an internal counter for how many graphs are currently using this /// memory object. void markBeingUsedInGraph() { MGraphUseCount += 1; } diff --git a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp index 677244aca4a83..669174d98d1f3 100644 --- a/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp +++ b/sycl/test-e2e/Regression/buffer_shadow_copy_platform_policy.cpp @@ -42,26 +42,22 @@ static int runReadOnlySumKernel(sycl::queue &Q, const int *HostPtr, size_t N) { // Writable kernel path; buffer destruction happens at scope exit. static void runWriteKernel(sycl::queue &Q, int *HostPtr, size_t N) { - { - sycl::buffer Buf(HostPtr, sycl::range<1>(N)); - - Q.submit([&](sycl::handler &CGH) { - auto OutAcc = Buf.get_access(CGH); - CGH.single_task([=]() { - for (size_t I = 0; I < N; ++I) - OutAcc[I] = static_cast(I * 3 + 7); - }); + sycl::buffer Buf(HostPtr, sycl::range<1>(N)); + + Q.submit([&](sycl::handler &CGH) { + auto OutAcc = Buf.get_access(CGH); + CGH.single_task([=]() { + for (size_t I = 0; I < N; ++I) + OutAcc[I] = static_cast(I * 3 + 7); }); - Q.wait_and_throw(); - } + }); + Q.wait_and_throw(); } // Verifies host-side result after writable-buffer destruction. static bool checkExpectedPattern(const int *Ptr, size_t N) { - std::vector Tmp(N); - std::memcpy(Tmp.data(), Ptr, sizeof(int) * N); for (size_t I = 0; I < N; ++I) { - if (Tmp[I] != static_cast(I * 3 + 7)) + if (Ptr[I] != static_cast(I * 3 + 7)) return false; } return true; diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp index d22535203cb0d..185693f0d059b 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp @@ -126,6 +126,10 @@ ur_integrated_buffer_handle_t::ur_integrated_buffer_handle_t( UR_CALL_THROWS( synchronousZeCopy(hContext, hDevice, this->ptr.get(), hostPtr, size)); mapToPtr = hostPtr; + // Mirror mapToPtr so that ~ur_integrated_buffer_handle_t copies device + // updates back to the original host pointer even without an explicit + // map/unmap. + writeBackPtr = hostPtr; } } }