From 5b2c398147782623c78ba98efa29a165c9fe050f Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Tue, 28 Apr 2026 08:54:13 +0000 Subject: [PATCH 01/23] [UR][L0] Restrict USM residency to peers with enabled P2P access - Skip peers with disabled P2P in makeProvider (USM pool creation) - Add urUsmP2PEnablePeerAccessExp / urUsmP2PDisablePeerAccessExp - Track per-device peer status in ur_device_handle_t_::peers[] - Update existing USM pool residency on P2P enable/disable Signed-off-by: Lukasz Dorau --- sycl/source/device.cpp | 26 +- .../source/adapters/level_zero/CMakeLists.txt | 2 +- .../source/adapters/level_zero/context.cpp | 2 + .../source/adapters/level_zero/device.cpp | 21 ++ .../source/adapters/level_zero/device.hpp | 14 +- .../source/adapters/level_zero/platform.cpp | 43 +++- .../source/adapters/level_zero/platform.hpp | 9 +- .../source/adapters/level_zero/usm_p2p.cpp | 18 +- .../source/adapters/level_zero/v2/context.cpp | 230 +++++++++++------- .../source/adapters/level_zero/v2/context.hpp | 19 +- .../source/adapters/level_zero/v2/memory.cpp | 7 +- .../source/adapters/level_zero/v2/usm.cpp | 74 +++++- .../source/adapters/level_zero/v2/usm.hpp | 2 + .../source/adapters/level_zero/v2/usm_p2p.cpp | 125 ++++++++++ unified-runtime/source/common/backtrace.hpp | 4 +- .../source/common/backtrace_lin.cpp | 4 +- .../source/common/backtrace_win.cpp | 5 +- .../source/common/ur_pool_manager.hpp | 29 ++- .../layers/validation/ur_leak_check.hpp | 2 - .../level_zero/v2/memory_residency.cpp | 37 +++ 20 files changed, 534 insertions(+), 139 deletions(-) create mode 100644 unified-runtime/source/adapters/level_zero/v2/usm_p2p.cpp diff --git a/sycl/source/device.cpp b/sycl/source/device.cpp index 77f4ada363ae2..8967d8ef2dff1 100644 --- a/sycl/source/device.cpp +++ b/sycl/source/device.cpp @@ -212,19 +212,33 @@ bool device::has(aspect Aspect) const { return impl->has(Aspect); } void device::ext_oneapi_enable_peer_access(const device &peer) { ur_device_handle_t Device = impl->getHandleRef(); ur_device_handle_t Peer = peer.impl->getHandleRef(); - if (Device != Peer) { - detail::adapter_impl &Adapter = impl->getAdapter(); - Adapter.call(Device, Peer); + + if (Device == Peer) + return; + + if (peer.get_platform() != get_platform()) { + throw exception(errc::invalid, + "Cannot enable peer access between different platforms"); } + + impl->getAdapter().call( + Device, Peer); } void device::ext_oneapi_disable_peer_access(const device &peer) { ur_device_handle_t Device = impl->getHandleRef(); ur_device_handle_t Peer = peer.impl->getHandleRef(); - if (Device != Peer) { - detail::adapter_impl &Adapter = impl->getAdapter(); - Adapter.call(Device, Peer); + + if (Device == Peer) + return; + + if (peer.get_platform() != get_platform()) { + throw exception(errc::invalid, + "Cannot disable peer access between different platforms"); } + + impl->getAdapter().call( + Device, Peer); } bool device::ext_oneapi_can_access_peer(const device &peer, diff --git a/unified-runtime/source/adapters/level_zero/CMakeLists.txt b/unified-runtime/source/adapters/level_zero/CMakeLists.txt index 73c003ad90f88..42e63e952f392 100644 --- a/unified-runtime/source/adapters/level_zero/CMakeLists.txt +++ b/unified-runtime/source/adapters/level_zero/CMakeLists.txt @@ -150,7 +150,6 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/helpers/kernel_helpers.cpp ${CMAKE_CURRENT_SOURCE_DIR}/helpers/memory_helpers.cpp ${CMAKE_CURRENT_SOURCE_DIR}/helpers/mutable_helpers.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../ur/ur.cpp ${CMAKE_CURRENT_SOURCE_DIR}/sampler.hpp @@ -194,6 +193,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm_p2p.cpp ) install_ur_library(ur_adapter_level_zero_v2) diff --git a/unified-runtime/source/adapters/level_zero/context.cpp b/unified-runtime/source/adapters/level_zero/context.cpp index b45020efd8ee9..cc58068623020 100644 --- a/unified-runtime/source/adapters/level_zero/context.cpp +++ b/unified-runtime/source/adapters/level_zero/context.cpp @@ -41,6 +41,8 @@ ur_result_t urContextCreate( Context->initialize(); *RetContext = reinterpret_cast(Context); + // TODO: delete below 'if' when memory isolation in the context is + // implemented in the driver if (IndirectAccessTrackingEnabled) { std::scoped_lock Lock(Platform->ContextsMutex); Platform->Contexts.push_back(*RetContext); diff --git a/unified-runtime/source/adapters/level_zero/device.cpp b/unified-runtime/source/adapters/level_zero/device.cpp index f1bf6b249c570..2b2e66ec9f703 100644 --- a/unified-runtime/source/adapters/level_zero/device.cpp +++ b/unified-runtime/source/adapters/level_zero/device.cpp @@ -2361,3 +2361,24 @@ void ZeUSMImportExtension::doZeUSMRelease(ze_driver_handle_t DriverHandle, void *HostPtr) { ZE_CALL_NOCHECK(zexDriverReleaseImportedPointer, (DriverHandle, HostPtr)); } + +std::ostream &operator<<(std::ostream &os, + ur_device_handle_t_ const &device_handle) { + if (device_handle.Id.has_value()) { + return os << device_handle.Id.value(); + } + return os << "NONE"; +} + +std::ostream &operator<<(std::ostream &os, + ur_device_handle_t_::PeerStatus peer_status) { + switch (peer_status) { + case ur_device_handle_t_::PeerStatus::DISABLED: + return os << "DISABLED"; + case ur_device_handle_t_::PeerStatus::ENABLED: + return os << "ENABLED"; + case ur_device_handle_t_::PeerStatus::NO_CONNECTION: + return os << "NO_CONNECTION"; + } + return os << "UNKNOWN"; +} diff --git a/unified-runtime/source/adapters/level_zero/device.hpp b/unified-runtime/source/adapters/level_zero/device.hpp index e3e8b9c8125ae..0e184c0c5f01d 100644 --- a/unified-runtime/source/adapters/level_zero/device.hpp +++ b/unified-runtime/source/adapters/level_zero/device.hpp @@ -261,17 +261,29 @@ struct ur_device_handle_t_ : ur_object { std::unordered_map ZeOffsetToImageHandleMap; - // unique ephemeral identifer of the device in the adapter + // Devices which user enabled p2p access by + // urUsmP2P(Enable|Disable)PeerAccessExp. Devices are indexed by device id. + enum class PeerStatus : char { ENABLED, DISABLED, NO_CONNECTION }; + std::vector + peers; // info if our device can access given peer device allocations + + // unique ephemeral identifier of the device in the adapter std::optional Id; ur::RefCount RefCount; }; +std::ostream &operator<<(std::ostream &os, + ur_device_handle_t_ const &device_handle); +std::ostream &operator<<(std::ostream &os, + ur_device_handle_t_::PeerStatus peer_status); + // Collects a flat vector of unique devices for USM memory pool creation. // Traverses the input devices and their sub-devices, ensuring each Level Zero // device handle appears only once in the result. inline std::vector CollectDevicesForUsmPoolCreation( const std::vector &Devices) { + std::vector DevicesAndSubDevices; std::unordered_set Seen; diff --git a/unified-runtime/source/adapters/level_zero/platform.cpp b/unified-runtime/source/adapters/level_zero/platform.cpp index a805dbd73d149..5e8716b41a59f 100644 --- a/unified-runtime/source/adapters/level_zero/platform.cpp +++ b/unified-runtime/source/adapters/level_zero/platform.cpp @@ -746,9 +746,9 @@ ur_platform_handle_t_::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) { std::shared_lock Lock(URDevicesCacheMutex); auto it = std::find_if(URDevicesCache.begin(), URDevicesCache.end(), [&](std::unique_ptr &D) { - return D.get()->ZeDevice == ZeDevice && - (D.get()->RootDevice == nullptr || - D.get()->RootDevice->RootDevice == nullptr); + return D->ZeDevice == ZeDevice && + (D->RootDevice == nullptr || + D->RootDevice->RootDevice == nullptr); }); if (it != URDevicesCache.end()) { return (*it).get(); @@ -914,6 +914,43 @@ ur_result_t ur_platform_handle_t_::populateDeviceCacheIfNeeded() { ZeDeviceSynchronizeSupported = Supported; } + for (auto &dev : URDevicesCache) { + dev->peers = std::vector( + URDevicesCache.size(), ur_device_handle_t_::PeerStatus::NO_CONNECTION); + + for (size_t peerId = 0; peerId < URDevicesCache.size(); ++peerId) { + if (peerId == dev->Id.value()) + continue; + + ZeStruct p2pProperties; + ZE2UR_CALL( + zeDeviceGetP2PProperties, + (dev->ZeDevice, URDevicesCache[peerId]->ZeDevice, &p2pProperties)); + if (!(p2pProperties.flags & ZE_DEVICE_P2P_PROPERTY_FLAG_ACCESS)) { + UR_LOG(INFO, + "p2p access to memory of dev:{} from dev:{} not possible due to " + "lack of p2p property", + peerId, dev->Id.value()); + continue; + } + + ze_bool_t p2p; + ZE2UR_CALL(zeDeviceCanAccessPeer, + (dev->ZeDevice, URDevicesCache[peerId]->ZeDevice, &p2p)); + if (!p2p) { + UR_LOG(INFO, + "p2p access to memory of dev:{} from dev:{} not possible due to " + "no connection", + peerId, dev->Id.value()); + continue; + } + + UR_LOG(INFO, "p2p access to memory of dev:{} from dev:{} can be enabled", + peerId, dev->Id.value()); + dev->peers[peerId] = ur_device_handle_t_::PeerStatus::DISABLED; + } + } + return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/level_zero/platform.hpp b/unified-runtime/source/adapters/level_zero/platform.hpp index 82396632e7ea8..766160807a198 100644 --- a/unified-runtime/source/adapters/level_zero/platform.hpp +++ b/unified-runtime/source/adapters/level_zero/platform.hpp @@ -97,11 +97,10 @@ struct ur_platform_handle_t_ : ur::handle_base, uint32_t VersionMinor, uint32_t VersionBuild); - // Keep track of all contexts in the platform. This is needed to manage - // a lifetime of memory allocations in each context when there are kernels - // with indirect access. - // TODO: should be deleted when memory isolation in the context is implemented - // in the driver. + // Keep track of all contexts in the platform. In v1 L0 this is needed to + // manage a lifetime of memory allocations in each context when there are + // kernels with indirect access. In v2 it is used during + // ext_oneapi_enable_peer_access and ext_oneapi_disable_peer_access calls. std::list Contexts; ur_shared_mutex ContextsMutex; diff --git a/unified-runtime/source/adapters/level_zero/usm_p2p.cpp b/unified-runtime/source/adapters/level_zero/usm_p2p.cpp index 67e44fd06fbfe..7b094d42c1f81 100644 --- a/unified-runtime/source/adapters/level_zero/usm_p2p.cpp +++ b/unified-runtime/source/adapters/level_zero/usm_p2p.cpp @@ -12,17 +12,23 @@ namespace ur::level_zero { -ur_result_t urUsmP2PEnablePeerAccessExp(ur_device_handle_t /*commandDevice*/, - ur_device_handle_t /*peerDevice*/) { +ur_result_t urUsmP2PEnablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { - // L0 has peer devices enabled by default + UR_LOG(INFO, + "ignored enabling peer access from {} to memory of {}, because P2P is " + "always enabled in Level Zero V1 adapter", + (void *)commandDevice, (void *)peerDevice); return UR_RESULT_SUCCESS; } -ur_result_t urUsmP2PDisablePeerAccessExp(ur_device_handle_t /*commandDevice*/, - ur_device_handle_t /*peerDevice*/) { +ur_result_t urUsmP2PDisablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { - // L0 has peer devices enabled by default + UR_LOG(INFO, + "ignored disabling peer access from {} to memory of {}, because P2P " + "is always enabled in Level Zero V1 adapter", + (void *)commandDevice, (void *)peerDevice); return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/level_zero/v2/context.cpp b/unified-runtime/source/adapters/level_zero/v2/context.cpp index eed549a58ca95..d869c7c0e6247 100644 --- a/unified-runtime/source/adapters/level_zero/v2/context.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/context.cpp @@ -13,53 +13,6 @@ #include "event_provider_counter.hpp" #include "event_provider_normal.hpp" -static std::vector -filterP2PDevices(ur_device_handle_t hSourceDevice, - const std::vector &devices) { - std::vector p2pDevices; - for (auto &device : devices) { - if (device == hSourceDevice) { - continue; - } - - ze_bool_t p2p; - ZE2UR_CALL_THROWS(zeDeviceCanAccessPeer, - (device->ZeDevice, hSourceDevice->ZeDevice, &p2p)); - - if (p2p) { - p2pDevices.push_back(device); - } - } - return p2pDevices; -} - -static std::vector> -populateP2PDevices(const std::vector &devices) { - std::vector allDevices; - std::function collectDeviceAndSubdevices = - [&allDevices, &collectDeviceAndSubdevices](ur_device_handle_t device) { - allDevices.push_back(device); - for (auto &subDevice : device->SubDevices) { - collectDeviceAndSubdevices(subDevice); - } - }; - - for (auto &device : devices) { - collectDeviceAndSubdevices(device); - } - - uint64_t maxDeviceId = 0; - for (auto &device : allDevices) { - maxDeviceId = std::max(maxDeviceId, device->Id.value()); - } - - std::vector> p2pDevices(maxDeviceId + 1); - for (auto &device : allDevices) { - p2pDevices[device->Id.value()] = filterP2PDevices(device, allDevices); - } - return p2pDevices; -} - static std::vector uniqueDevices(uint32_t numDevices, const ur_device_handle_t *phDevices) { std::vector devices(phDevices, phDevices + numDevices); @@ -104,22 +57,15 @@ ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext, nativeEventsPool(this, std::make_unique( this, v2::QUEUE_IMMEDIATE, v2::EVENT_FLAGS_PROFILING_ENABLED)), - p2pAccessDevices(populateP2PDevices(this->hDevices)), - defaultUSMPool(this, nullptr), asyncPool(this, nullptr) {} + defaultUSMPool(this, nullptr), asyncPool(this, nullptr) { + UR_LOG(INFO, "UR context created with {} devices", numDevices); +} ur_result_t ur_context_handle_t_::retain() { RefCount.retain(); return UR_RESULT_SUCCESS; } -ur_result_t ur_context_handle_t_::release() { - if (!RefCount.release()) - return UR_RESULT_SUCCESS; - - delete this; - return UR_RESULT_SUCCESS; -} - ur_platform_handle_t ur_context_handle_t_::getPlatform() const { return hDevices[0]->Platform; } @@ -145,37 +91,130 @@ ur_usm_pool_handle_t ur_context_handle_t_::getDefaultUSMPool() { ur_usm_pool_handle_t ur_context_handle_t_::getAsyncPool() { return &asyncPool; } void ur_context_handle_t_::addUsmPool(ur_usm_pool_handle_t hPool) { + UR_LOG(INFO, "Adding USM pool {} to context:{}", hPool, this); std::scoped_lock lock(Mutex); usmPoolHandles.push_back(hPool); } void ur_context_handle_t_::removeUsmPool(ur_usm_pool_handle_t hPool) { + UR_LOG(INFO, "Removing USM pool {} from context:{}", hPool, this) std::scoped_lock lock(Mutex); usmPoolHandles.remove(hPool); } -const std::vector & -ur_context_handle_t_::getP2PDevices(ur_device_handle_t hDevice) const { - return p2pAccessDevices[hDevice->Id.value()]; +void ur_context_handle_t_::changeResidentDevice(ur_device_handle_t hDevice, + ur_device_handle_t peerDevice, + bool isAdding) { + if (!isValidDevice(hDevice)) { + UR_LOG(INFO, + "skipped changing peer device in context:{} because " + "commandDevice ptr:{} is invalid in this context", + (void *)this, (void *)hDevice); + return; + } + + if (!isValidDevice(peerDevice)) { + UR_LOG(INFO, + "skipped changing peer device in context:{} because " + "peerDevice ptr:{} is invalid in this context", + (void *)this, (void *)peerDevice); + return; + } + + std::shared_lock lock(Mutex); + UR_LOG(INFO, "{} peerDevice:{} in the default pool and {} usmPools", + isAdding ? "adding" : "removing", peerDevice->Id.value(), + usmPoolHandles.size()) + defaultUSMPool.changeResidentDevice(hDevice, peerDevice, isAdding); + for (const auto &hPool : usmPoolHandles) { + hPool->changeResidentDevice(hDevice, peerDevice, isAdding); + } +} + +std::vector +ur_context_handle_t_::getDevicesWhoseAllocationsCanBeAccessedFrom( + ur_device_handle_t hDevice) { + UR_FASSERT(hDevice != nullptr && hDevice->Id.has_value(), + "invalid device handle"); + + std::vector peers; + { + std::shared_lock lock(hDevice->Mutex); + peers = hDevice->peers; + } + + std::vector retVal; + std::copy_if( + std::begin(hDevices), std::end(hDevices), std::back_inserter(retVal), + [&](ur_device_handle_t peerCandidateDevice) { + const auto candidateId = peerCandidateDevice->Id.value(); + UR_FASSERT(candidateId < peers.size(), + "there is no device:" + << candidateId << " in peers table, number of devices:" + << peers.size()); + return peers[candidateId] == ur_device_handle_t_::PeerStatus::ENABLED; + }); + + return retVal; +} + +std::vector +ur_context_handle_t_::getDevicesWhichCanAccessAllocationsPresentOn( + ur_device_handle_t hDevice) { + UR_FASSERT(hDevice != nullptr && hDevice->Id.has_value(), + "invalid device handle"); + + const auto hDeviceId = hDevice->Id.value(); + std::vector retVal; + std::copy_if( + std::begin(hDevices), std::end(hDevices), std::back_inserter(retVal), + [&](ur_device_handle_t peerCandidateDevice) { + const auto candidateId = peerCandidateDevice->Id.value(); + std::shared_lock lock(peerCandidateDevice->Mutex); + UR_FASSERT( + hDeviceId < peerCandidateDevice->peers.size(), + "there is no device:" + << hDeviceId << " in peers table of device:" << candidateId + << ", number of devices:" << peerCandidateDevice->peers.size()); + return peerCandidateDevice->peers[hDeviceId] == + ur_device_handle_t_::PeerStatus::ENABLED; + }); + + return retVal; } namespace ur::level_zero { ur_result_t urContextCreate(uint32_t deviceCount, const ur_device_handle_t *phDevices, const ur_context_properties_t * /*pProperties*/, - ur_context_handle_t *phContext) try { - - ur_platform_handle_t hPlatform = phDevices[0]->Platform; - ZeStruct contextDesc{}; - - ze_context_handle_t zeContext{}; - ZE2UR_CALL(zeContextCreate, (hPlatform->ZeDriver, &contextDesc, &zeContext)); - - *phContext = - new ur_context_handle_t_(zeContext, deviceCount, phDevices, true); - return UR_RESULT_SUCCESS; -} catch (...) { - return exceptionToResult(std::current_exception()); + ur_context_handle_t *phContext) { + *phContext = nullptr; + try { + + ur_platform_handle_t hPlatform = phDevices[0]->Platform; + ZeStruct contextDesc{}; + + ze_context_handle_t rawZeContext{}; + ZE2UR_CALL(zeContextCreate, + (hPlatform->ZeDriver, &contextDesc, &rawZeContext)); + UR_LOG(INFO, "ZE context created with {} devices", deviceCount); + + // Wrap immediately so any exception thrown by the ur_context_handle_t_ + // constructor (after hContext member is initialised) does not double-free + // the Level Zero context handle. + *phContext = + new ur_context_handle_t_(rawZeContext, deviceCount, phDevices, true); + { + std::scoped_lock Lock(hPlatform->ContextsMutex); + hPlatform->Contexts.push_back(*phContext); + } + return UR_RESULT_SUCCESS; + } catch (...) { + UR_LOG(ERR, "creating context failed"); + delete *phContext; + *phContext = nullptr; + return exceptionToResult(std::current_exception()); + } } ur_result_t urContextGetNativeHandle(ur_context_handle_t hContext, @@ -191,16 +230,26 @@ ur_result_t urContextCreateWithNativeHandle( ur_native_handle_t hNativeContext, ur_adapter_handle_t, uint32_t numDevices, const ur_device_handle_t *phDevices, const ur_context_native_properties_t *pProperties, - ur_context_handle_t *phContext) try { - auto zeContext = reinterpret_cast(hNativeContext); - - auto ownZeHandle = pProperties ? pProperties->isNativeHandleOwned : false; - - *phContext = - new ur_context_handle_t_(zeContext, numDevices, phDevices, ownZeHandle); - return UR_RESULT_SUCCESS; -} catch (...) { - return exceptionToResult(std::current_exception()); + ur_context_handle_t *phContext) { + *phContext = nullptr; + try { + auto zeContext = reinterpret_cast(hNativeContext); + + auto ownZeHandle = pProperties ? pProperties->isNativeHandleOwned : false; + + *phContext = + new ur_context_handle_t_(zeContext, numDevices, phDevices, ownZeHandle); + { + auto hPlatform = phDevices[0]->Platform; + std::scoped_lock Lock(hPlatform->ContextsMutex); + hPlatform->Contexts.push_back(*phContext); + } + return UR_RESULT_SUCCESS; + } catch (...) { + delete *phContext; + *phContext = nullptr; + return exceptionToResult(std::current_exception()); + } } ur_result_t urContextRetain(ur_context_handle_t hContext) try { @@ -210,7 +259,20 @@ ur_result_t urContextRetain(ur_context_handle_t hContext) try { } ur_result_t urContextRelease(ur_context_handle_t hContext) try { - return hContext->release(); + if (!hContext->RefCount.release()) + return UR_RESULT_SUCCESS; + + auto Platform = hContext->getPlatform(); + { + std::scoped_lock Lock(Platform->ContextsMutex); + auto &Contexts = Platform->Contexts; + auto It = std::find(Contexts.begin(), Contexts.end(), hContext); + if (It != Contexts.end()) { + Contexts.erase(It); + } + } + delete hContext; + return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); } diff --git a/unified-runtime/source/adapters/level_zero/v2/context.hpp b/unified-runtime/source/adapters/level_zero/v2/context.hpp index c9cb3abec81f6..3092ba193a521 100644 --- a/unified-runtime/source/adapters/level_zero/v2/context.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/context.hpp @@ -15,6 +15,7 @@ #include "common.hpp" #include "common/ur_ref_count.hpp" #include "event_pool_cache.hpp" +#include "logger/ur_logger.hpp" #include "usm.hpp" enum class PoolCacheType { Immediate, Regular }; @@ -24,7 +25,6 @@ struct ur_context_handle_t_ : ur_object { const ur_device_handle_t *phDevices, bool ownZeContext); ur_result_t retain(); - ur_result_t release(); inline ze_context_handle_t getZeHandle() const { return hContext.get(); } ur_platform_handle_t getPlatform() const; @@ -35,6 +35,8 @@ struct ur_context_handle_t_ : ur_object { void addUsmPool(ur_usm_pool_handle_t hPool); void removeUsmPool(ur_usm_pool_handle_t hPool); + void changeResidentDevice(ur_device_handle_t hDevice, + ur_device_handle_t peerDevice, bool isAdding); template void forEachUsmPool(Func func) { std::shared_lock lock(Mutex); @@ -44,8 +46,11 @@ struct ur_context_handle_t_ : ur_object { } } - const std::vector & - getP2PDevices(ur_device_handle_t hDevice) const; + std::vector + getDevicesWhoseAllocationsCanBeAccessedFrom(ur_device_handle_t hDevice); + + std::vector + getDevicesWhichCanAccessAllocationsPresentOn(ur_device_handle_t hDevice); v2::event_pool &getNativeEventsPool() { return nativeEventsPool; } v2::command_list_cache_t &getCommandListCache() { return commandListCache; } @@ -81,7 +86,10 @@ struct ur_context_handle_t_ : ur_object { private: const v2::raii::ze_context_handle_t hContext; - const std::vector hDevices; + const std::vector + hDevices; // possibly without subdevices, only what was passed to ctor, + // context may have user-defined, limited subset of available + // devices v2::command_list_cache_t commandListCache; v2::event_pool_cache eventPoolCacheImmediate; v2::event_pool_cache eventPoolCacheRegular; @@ -90,9 +98,6 @@ struct ur_context_handle_t_ : ur_object { // (uses non-counter based events to allow for signaling from host) v2::event_pool nativeEventsPool; - // P2P devices for each device in the context, indexed by device id. - const std::vector> p2pAccessDevices; - ur_usm_pool_handle_t_ defaultUSMPool; ur_usm_pool_handle_t_ asyncPool; std::list usmPoolHandles; diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp index e44ee872161cf..b0601ba956af2 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp @@ -360,12 +360,17 @@ void *ur_discrete_buffer_handle_t::getDevicePtr( return getActiveDeviceAlloc(offset); } - auto &p2pDevices = hContext->getP2PDevices(hDevice); + auto p2pDevices = + hContext->getDevicesWhoseAllocationsCanBeAccessedFrom(hDevice); auto p2pAccessible = std::find(p2pDevices.begin(), p2pDevices.end(), activeAllocationDevice) != p2pDevices.end(); if (!p2pAccessible) { // TODO: migrate buffer through the host + UR_LOG(WARN, + "p2p is not accessible: requesting device ptr:{} cannot access " + "allocation on device ptr:{}", + (void *)hDevice, (void *)activeAllocationDevice); throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } diff --git a/unified-runtime/source/adapters/level_zero/v2/usm.cpp b/unified-runtime/source/adapters/level_zero/v2/usm.cpp index 85f51e215e094..3bc216421e2d6 100644 --- a/unified-runtime/source/adapters/level_zero/v2/usm.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/usm.cpp @@ -111,19 +111,33 @@ makeProvider(usm::pool_descriptor poolDescriptor) { UMF_CALL_THROWS(umfLevelZeroMemoryProviderParamsSetMemoryType( hParams, urToUmfMemoryType(poolDescriptor.type))); - std::vector residentZeHandles; + // zeDeviceHandles has to be in the scope of hParams because hParams keeps + // reference of it inside. + std::vector zeDeviceHandles; - if (poolDescriptor.type == UR_USM_TYPE_DEVICE) { + if (poolDescriptor.supportsResidentDevices()) { assert(level_zero_device_handle); - auto residentHandles = - poolDescriptor.hContext->getP2PDevices(poolDescriptor.hDevice); - residentZeHandles.push_back(level_zero_device_handle); - for (auto &device : residentHandles) { - residentZeHandles.push_back(device->ZeDevice); + + // Make memory resident on the source device itself plus all peer devices + // that have explicitly enabled peer access to it. + zeDeviceHandles.push_back(level_zero_device_handle); + for (auto dev : + poolDescriptor.hContext->getDevicesWhichCanAccessAllocationsPresentOn( + poolDescriptor.hDevice)) { + zeDeviceHandles.push_back(dev->ZeDevice); } UMF_CALL_THROWS(umfLevelZeroMemoryProviderParamsSetResidentDevices( - hParams, residentZeHandles.data(), residentZeHandles.size())); + hParams, zeDeviceHandles.data(), zeDeviceHandles.size())); + + UR_LOG(INFO, + "memory provider will be created with {} resident device(s), " + "desc:{}", + zeDeviceHandles.size(), + logger::makeStringFromStreamable(poolDescriptor)); + } else { + UR_LOG(INFO, "memory provider does not support resident devices, desc:{}", + logger::makeStringFromStreamable(poolDescriptor)); } UMF_CALL_THROWS(umfLevelZeroMemoryProviderParamsSetFreePolicy( @@ -431,6 +445,50 @@ size_t ur_usm_pool_handle_t_::getTotalUsedSize() { size_t ur_usm_pool_handle_t_::getPeakUsedSize() { return allocStats.getPeak(); } +void ur_usm_pool_handle_t_::changeResidentDevice(ur_device_handle_t hDevice, + ur_device_handle_t peerDevice, + bool isAdding) { + poolManager.forEachPoolWithDesc([=](const auto &desc, auto pool) { + if (desc.supportsResidentDevices() && desc.hDevice && + desc.hDevice->ZeDevice == hDevice->ZeDevice) { + UR_LOG(INFO, "found {} of srcDevice:{} valid to {} peerDevice:{}", + logger::makeStringFromStreamable(desc), desc.hDevice->Id.value(), + isAdding ? "add" : "remove", peerDevice->Id.value()); + umf_memory_provider_handle_t hProvider; + umf_result_t getProviderResult = + umfPoolGetMemoryProvider(pool->umfPool.get(), &hProvider); + if (getProviderResult != UMF_RESULT_SUCCESS) { + UR_LOG(ERR, "getting memory provider failed with:{}", + getProviderResult); + return true; + } + umf_result_t changeResult = + umfLevelZeroMemoryProviderResidentDeviceChange( + hProvider, peerDevice->ZeDevice, isAdding); + if (changeResult != UMF_RESULT_SUCCESS) { + // UMF updates its internal resident-device list before calling + // zeContextMakeMemoryResident / zeContextEvictMemory, so both the + // UR peer-status table and the UMF provider state are already + // consistent when this error is observed. The failure originates + // from zeContextEvictMemory returning a non-SUCCESS result for + // device USM memory: zeContextMakeMemoryResident is called at + // allocation time (inside the UMF provider) and returns SUCCESS for + // device memory, but zeContextEvictMemory subsequently fails because + // the Level Zero driver treats the make-resident call as a no-op for + // device USM (the memory is already on the source device; no + // explicit hardware pinning on the peer is required). Aborting here + // would be wrong: the system is in a fully consistent state and + // future allocations will correctly omit the now-disabled peer from + // their resident-device set. + UR_LOG(WARN, + "changing resident devices in UMF failed with:{}, continuing", + changeResult); + } + } + return true; + }); +} + namespace ur::level_zero { ur_result_t urUSMPoolCreate( /// [in] handle of the context object diff --git a/unified-runtime/source/adapters/level_zero/v2/usm.hpp b/unified-runtime/source/adapters/level_zero/v2/usm.hpp index fb0b83ce68f3d..5ea29d7e3220b 100644 --- a/unified-runtime/source/adapters/level_zero/v2/usm.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/usm.hpp @@ -78,6 +78,8 @@ struct ur_usm_pool_handle_t_ : ur_object { size_t getPeakReservedSize(); size_t getTotalUsedSize(); size_t getPeakUsedSize(); + void changeResidentDevice(ur_device_handle_t hDevice, + ur_device_handle_t peerDevice, bool isAdding); UsmPool *getPool(const usm::pool_descriptor &desc); diff --git a/unified-runtime/source/adapters/level_zero/v2/usm_p2p.cpp b/unified-runtime/source/adapters/level_zero/v2/usm_p2p.cpp new file mode 100644 index 0000000000000..28973a567e0e1 --- /dev/null +++ b/unified-runtime/source/adapters/level_zero/v2/usm_p2p.cpp @@ -0,0 +1,125 @@ +//===----------- usm_p2p.cpp - L0 Adapter ---------------------------------===// +// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../device.hpp" +#include "context.hpp" +#include "logger/ur_logger.hpp" + +namespace ur::level_zero { + +// Validates that two devices are compatible for P2P operations: both must have +// an assigned Id, must belong to the same platform (i.e. share the same device +// cache), and peerDevice's id must be a valid index into commandDevice->peers. +static ur_result_t validateP2PDevicePair(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { + if (!commandDevice->Id.has_value() || !peerDevice->Id.has_value()) { + UR_LOG(ERR, "P2P operation requires devices with assigned ids"); + return UR_RESULT_ERROR_INVALID_DEVICE; + } + if (commandDevice->Platform != peerDevice->Platform) { + UR_LOG(ERR, "P2P operation requires devices from the same platform"); + return UR_RESULT_ERROR_INVALID_DEVICE; + } + if (peerDevice->Id.value() >= commandDevice->peers.size()) { + UR_LOG(ERR, + "peerDevice id:{} is out of range for commandDevice peers table " + "(size:{})", + peerDevice->Id.value(), commandDevice->peers.size()); + return UR_RESULT_ERROR_INVALID_DEVICE; + } + return UR_RESULT_SUCCESS; +} + +static ur_result_t urUsmP2PChangePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice, + bool isAdding) { + UR_CALL(validateP2PDevicePair(commandDevice, peerDevice)); + + UR_LOG(INFO, "user tries to {} peer access to memory of {} from {}", + (isAdding ? "enable" : "disable"), *peerDevice, *commandDevice); + + { + const auto expectedPeerStatus = + isAdding ? ur_device_handle_t_::PeerStatus::DISABLED + : ur_device_handle_t_::PeerStatus::ENABLED; + std::scoped_lock Lock(commandDevice->Mutex); + const auto existingPeerStatus = + commandDevice->peers[peerDevice->Id.value()]; + if (existingPeerStatus != expectedPeerStatus) { + UR_LOG(ERR, + "existing peer status:{} does not match expected peer status:{}", + existingPeerStatus, expectedPeerStatus); + return UR_RESULT_ERROR_INVALID_OPERATION; + } + commandDevice->peers[peerDevice->Id.value()] = + (isAdding ? ur_device_handle_t_::PeerStatus::ENABLED + : ur_device_handle_t_::PeerStatus::DISABLED); + } + + auto Platform = commandDevice->Platform; + // Copy the context list under the mutex and iterate outside the critical + // section to avoid holding ContextsMutex during potentially heavy + // changeResidentDevice calls and to reduce deadlock risk. + std::list Contexts; + { + std::scoped_lock Lock(Platform->ContextsMutex); + Contexts = Platform->Contexts; + } + UR_LOG(INFO, "changing peers in {} contexts", Contexts.size()); + for (auto Context : Contexts) { + Context->changeResidentDevice(commandDevice, peerDevice, isAdding); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t urUsmP2PEnablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { + return urUsmP2PChangePeerAccessExp(commandDevice, peerDevice, true); +} + +ur_result_t urUsmP2PDisablePeerAccessExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice) { + return urUsmP2PChangePeerAccessExp(commandDevice, peerDevice, false); +} + +ur_result_t urUsmP2PPeerAccessGetInfoExp(ur_device_handle_t commandDevice, + ur_device_handle_t peerDevice, + ur_exp_peer_info_t propName, + size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + UR_CALL(validateP2PDevicePair(commandDevice, peerDevice)); + + int propertyValue = 0; + switch (propName) { + case UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORT: { + std::scoped_lock Lock(commandDevice->Mutex); + propertyValue = commandDevice->peers[peerDevice->Id.value()] != + ur_device_handle_t_::PeerStatus::NO_CONNECTION; + break; + } + case UR_EXP_PEER_INFO_UR_PEER_ATOMICS_SUPPORT: { + ZeStruct p2pProperties; + ZE2UR_CALL(zeDeviceGetP2PProperties, + (commandDevice->ZeDevice, peerDevice->ZeDevice, &p2pProperties)); + propertyValue = + (p2pProperties.flags & ZE_DEVICE_P2P_PROPERTY_FLAG_ATOMICS) != 0; + break; + } + default: { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + return ReturnValue(propertyValue); +} +} // namespace ur::level_zero diff --git a/unified-runtime/source/common/backtrace.hpp b/unified-runtime/source/common/backtrace.hpp index f94f8e8a88fb0..d3b67bed2d72e 100644 --- a/unified-runtime/source/common/backtrace.hpp +++ b/unified-runtime/source/common/backtrace.hpp @@ -8,10 +8,10 @@ #include #include -#define MAX_BACKTRACE_FRAMES 64 - namespace ur { +static constexpr int MaxBacktraceFrames = 64; + using BacktraceLine = std::string; std::vector getCurrentBacktrace(); diff --git a/unified-runtime/source/common/backtrace_lin.cpp b/unified-runtime/source/common/backtrace_lin.cpp index 86a924a3be56d..89993f3f2ca6d 100644 --- a/unified-runtime/source/common/backtrace_lin.cpp +++ b/unified-runtime/source/common/backtrace_lin.cpp @@ -13,8 +13,8 @@ namespace ur { std::vector getCurrentBacktrace() { - void *backtraceFrames[MAX_BACKTRACE_FRAMES]; - int frameCount = ::backtrace(backtraceFrames, MAX_BACKTRACE_FRAMES); + void *backtraceFrames[MaxBacktraceFrames]; + int frameCount = ::backtrace(backtraceFrames, MaxBacktraceFrames); char **backtraceStr = ::backtrace_symbols(backtraceFrames, frameCount); // TODO: implement getting demangled symbols using abi::__cxa_demangle if (backtraceStr == nullptr) { diff --git a/unified-runtime/source/common/backtrace_win.cpp b/unified-runtime/source/common/backtrace_win.cpp index fa76b1d9a772c..09dcca186d9cc 100644 --- a/unified-runtime/source/common/backtrace_win.cpp +++ b/unified-runtime/source/common/backtrace_win.cpp @@ -20,9 +20,8 @@ std::vector getCurrentBacktrace() { HANDLE process = GetCurrentProcess(); SymInitialize(process, nullptr, true); - PVOID frames[MAX_BACKTRACE_FRAMES]; - WORD frameCount = - CaptureStackBackTrace(0, MAX_BACKTRACE_FRAMES, frames, NULL); + PVOID frames[MaxBacktraceFrames]; + WORD frameCount = CaptureStackBackTrace(0, MaxBacktraceFrames, frames, NULL); if (frameCount == 0) { SymCleanup(process); diff --git a/unified-runtime/source/common/ur_pool_manager.hpp b/unified-runtime/source/common/ur_pool_manager.hpp index 198ee4e005f18..a08e4f91cfd25 100644 --- a/unified-runtime/source/common/ur_pool_manager.hpp +++ b/unified-runtime/source/common/ur_pool_manager.hpp @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -64,12 +65,13 @@ struct pool_descriptor { createFromDevices(ur_usm_pool_handle_t poolHandle, ur_context_handle_t hContext, const std::vector &devices); -}; -static inline bool -isSharedAllocationReadOnlyOnDevice(const pool_descriptor &desc) { - return desc.type == UR_USM_TYPE_SHARED && desc.deviceReadOnly; -} + bool isSharedAllocationReadOnlyOnDevice() const { + return type == UR_USM_TYPE_SHARED && deviceReadOnly; + } + + bool supportsResidentDevices() const { return type == UR_USM_TYPE_DEVICE; } +}; inline bool pool_descriptor::operator==(const pool_descriptor &other) const { static usm::detail::ddiTables ddi; @@ -99,8 +101,8 @@ inline bool pool_descriptor::operator==(const pool_descriptor &other) const { } return lhsNative == rhsNative && lhs.type == rhs.type && - (isSharedAllocationReadOnlyOnDevice(lhs) == - isSharedAllocationReadOnlyOnDevice(rhs)) && + (lhs.isSharedAllocationReadOnlyOnDevice() == + rhs.isSharedAllocationReadOnlyOnDevice()) && lhs.poolHandle == rhs.poolHandle; } @@ -152,6 +154,7 @@ inline std::vector pool_descriptor::createFromDevices( template struct pool_manager { private: + static_assert(std::is_same_v); using pool_handle_t = H *; using unique_pool_handle_t = std::unique_ptr>; using desc_to_pool_map_t = std::unordered_map; @@ -174,6 +177,8 @@ template struct pool_manager { } ur_result_t addPool(const D &desc, unique_pool_handle_t &&hPool) { + UR_LOG(INFO, "Adding USM pool {} ptr:{} into pool_manager, size:{}", desc, + hPool.get(), descToPoolMap.size()); if (!descToPoolMap.try_emplace(desc, std::move(hPool)).second) { UR_LOG(ERR, "Pool for pool descriptor: {}, already exists", desc); return UR_RESULT_ERROR_INVALID_ARGUMENT; @@ -191,12 +196,20 @@ template struct pool_manager { return it->second.get(); } + template void forEachPool(Func func) { for (const auto &[desc, pool] : descToPoolMap) { if (!func(pool.get())) break; } } + + template void forEachPoolWithDesc(Func func) { + for (const auto &[desc, pool] : descToPoolMap) { + if (!func(desc, pool.get())) + break; + } + } }; inline umf::pool_unique_handle_t @@ -238,7 +251,7 @@ template <> struct hash { } return combine_hashes(0, desc.type, native, - isSharedAllocationReadOnlyOnDevice(desc), + desc.isSharedAllocationReadOnlyOnDevice(), desc.poolHandle); } }; diff --git a/unified-runtime/source/loader/layers/validation/ur_leak_check.hpp b/unified-runtime/source/loader/layers/validation/ur_leak_check.hpp index 9ee1e31e85ce6..6de53189a3868 100644 --- a/unified-runtime/source/loader/layers/validation/ur_leak_check.hpp +++ b/unified-runtime/source/loader/layers/validation/ur_leak_check.hpp @@ -13,8 +13,6 @@ #include #include -#define MAX_BACKTRACE_FRAMES 64 - namespace ur_validation_layer { struct RefCountContext { diff --git a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp index 79e4964a85e12..c70e5e2845e94 100644 --- a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp @@ -41,3 +41,40 @@ TEST_P(urMemoryResidencyTest, allocatingDeviceMemoryWillResultInOOM) { ASSERT_SUCCESS(urUSMFree(context, ptr)); } + +struct urMemoryMultiResidencyTest : uur::urMultiDeviceContextTestTemplate<2> { + + void SetUp() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::urMultiDeviceContextTestTemplate<2>::SetUp()); + + for (std::size_t i = 0; i < 2; i++) { + ur_bool_t usm_p2p_support = false; + ASSERT_SUCCESS( + urDeviceGetInfo(devices[i], UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP, + sizeof(usm_p2p_support), &usm_p2p_support, nullptr)); + if (!usm_p2p_support) { + GTEST_SKIP() << "EXP usm p2p feature is not supported."; + } + } + } + + void TearDown() override { + UUR_RETURN_ON_FATAL_FAILURE( + uur::urMultiDeviceContextTestTemplate<2>::TearDown()); + } +}; + +UUR_INSTANTIATE_PLATFORM_TEST_SUITE(urMemoryMultiResidencyTest); + +TEST_P(urMemoryMultiResidencyTest, allocationInitiallyAbsentOnPeer) {} + +TEST_P(urMemoryMultiResidencyTest, allocationExistsOnPeerWithEnabledAccess) { + + void *ptr = nullptr; + ASSERT_SUCCESS( + urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, 1, &ptr)); + ASSERT_SUCCESS(urUSMFree(context, ptr)); +} + +TEST_P(urMemoryMultiResidencyTest, allocationAbsentOnPeerWithDisabledAccess) {} From 28d3621aedf69a1be193c8292e655c5f5dbf835e Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Tue, 28 Apr 2026 09:16:29 +0000 Subject: [PATCH 02/23] [UR][L0] Extend memory residency tests with P2P checks - Fill in three placeholder multi-device tests in memory_residency.cpp - Tests verify P2P-driven residency: absent-on-peer without P2P, enable/disable state machine checks, end-to-end data transfer Signed-off-by: Lukasz Dorau --- .../level_zero/v2/memory_residency.cpp | 339 +++++++++++++++++- 1 file changed, 331 insertions(+), 8 deletions(-) diff --git a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp index c70e5e2845e94..993a23b9c071e 100644 --- a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp @@ -9,6 +9,9 @@ #include "uur/fixtures.h" #include "uur/utils.h" +#include +#include + using urMemoryResidencyTest = uur::urMultiDeviceContextTestTemplate<1>; UUR_INSTANTIATE_PLATFORM_TEST_SUITE(urMemoryResidencyTest); @@ -19,9 +22,9 @@ TEST_P(urMemoryResidencyTest, allocatingDeviceMemoryWillResultInOOM) { GTEST_SKIP() << "Test requires a PVC device"; } - size_t initialMemFree = 0; + uint64_t initialMemFree = 0; ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, - sizeof(size_t), &initialMemFree, nullptr)); + sizeof(uint64_t), &initialMemFree, nullptr)); if (initialMemFree < allocSize) { GTEST_SKIP() << "Not enough device memory available"; @@ -31,9 +34,9 @@ TEST_P(urMemoryResidencyTest, allocatingDeviceMemoryWillResultInOOM) { ASSERT_SUCCESS( urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, allocSize, &ptr)); - size_t currentMemFree = 0; + uint64_t currentMemFree = 0; ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, - sizeof(size_t), ¤tMemFree, nullptr)); + sizeof(uint64_t), ¤tMemFree, nullptr)); // amount of free memory should decrease after making a memory allocation // resident @@ -57,24 +60,344 @@ struct urMemoryMultiResidencyTest : uur::urMultiDeviceContextTestTemplate<2> { GTEST_SKIP() << "EXP usm p2p feature is not supported."; } } + + if (!uur::isPVC(devices[0]) || !uur::isPVC(devices[1])) { + GTEST_SKIP() << "Test requires PVC devices"; + } + + if (!hasHardwareP2PSupport()) { + GTEST_SKIP() << "No hardware P2P connection between devices"; + } } void TearDown() override { + // Disable peer access if a test enabled it but did not clean up (e.g. due + // to an assertion failure), so subsequent tests start from a clean state. + if (peerAccessEnabled) { + EXPECT_SUCCESS(urUsmP2PDisablePeerAccessExp(devices[1], devices[0])); + } UUR_RETURN_ON_FATAL_FAILURE( uur::urMultiDeviceContextTestTemplate<2>::TearDown()); } + + // Returns true when hardware P2P connectivity exists in the direction used + // by the tests: devices[1] accessing allocations on devices[0]. + bool hasHardwareP2PSupport() { + int supported = 0; + if (urUsmP2PPeerAccessGetInfoExp( + devices[1], devices[0], UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORT, + sizeof(int), &supported, nullptr) != UR_RESULT_SUCCESS) { + return false; + } + return supported != 0; + } + + // Whether peer access from devices[1] to devices[0] has been enabled by + // this test and must be disabled in TearDown. + bool peerAccessEnabled = false; }; UUR_INSTANTIATE_PLATFORM_TEST_SUITE(urMemoryMultiResidencyTest); -TEST_P(urMemoryMultiResidencyTest, allocationInitiallyAbsentOnPeer) {} +// Verify that allocating USM memory on devices[0] does NOT make it resident on +// devices[1] when peer access has not been enabled. Only the peer device's +// free memory is checked: it must not decrease by allocSize. The source +// device free memory is intentionally not checked because deferred frees from +// earlier tests complete asynchronously and make the source baseline +// unreliable; that property is already covered by +// allocatingDeviceMemoryWillResultInOOM. +TEST_P(urMemoryMultiResidencyTest, allocationInitiallyAbsentOnPeer) { + static constexpr size_t allocSize = 1024 * 1024; -TEST_P(urMemoryMultiResidencyTest, allocationExistsOnPeerWithEnabledAccess) { + uint64_t initialMemFreePeer = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreePeer, + nullptr)); + if (initialMemFreePeer < allocSize) { + GTEST_SKIP() + << "Not enough peer device memory available for reliable check"; + } + // Allocate on devices[0] WITHOUT enabling P2P. void *ptr = nullptr; ASSERT_SUCCESS( - urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, 1, &ptr)); + urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, allocSize, &ptr)); + + uint64_t currentMemFreePeer = 0; + ur_result_t res = + urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), ¤tMemFreePeer, nullptr); + ASSERT_SUCCESS(urUSMFree(context, ptr)); + + ASSERT_SUCCESS(res); + // Without P2P, the allocation must not be resident on the peer: + // free memory on devices[1] must not have decreased by a full allocSize. + ASSERT_GT(currentMemFreePeer, initialMemFreePeer - allocSize); } -TEST_P(urMemoryMultiResidencyTest, allocationAbsentOnPeerWithDisabledAccess) {} +// Verify that enabling peer access succeeds and that a second enable attempt +// returns UR_RESULT_ERROR_INVALID_OPERATION (access already enabled). Confirms +// that source-device free memory decreases by at least allocSize, showing the +// allocation succeeded on devices[0] with P2P enabled. Also verifies end-to-end +// P2P data transfer: the allocation on devices[0] is filled with a known +// pattern and then read by devices[1]'s command engine; the result is checked +// for correctness to confirm the feature works in the correct direction. +// Note: peer-device free memory is not checked because +// UR_DEVICE_INFO_GLOBAL_MEM_FREE does not reliably reflect +// zeContextMakeMemoryResident behaviour for device USM allocations. +TEST_P(urMemoryMultiResidencyTest, + enablePeerAccessStateMachineAndSourceAllocation) { + // Enable devices[1] to access allocations on devices[0], so that new + // allocations on devices[0] are made resident on devices[1] too. + ASSERT_SUCCESS(urUsmP2PEnablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = true; + + // A second enable must be rejected because access is already enabled. + ASSERT_EQ(urUsmP2PEnablePeerAccessExp(devices[1], devices[0]), + UR_RESULT_ERROR_INVALID_OPERATION); + + static constexpr size_t allocSize = 1024 * 1024; + uint64_t initialMemFreeSource = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreeSource, + nullptr)); + if (initialMemFreeSource < allocSize) { + GTEST_SKIP() << "Not enough source device memory available"; + } + + void *ptr = nullptr; + ASSERT_SUCCESS( + urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, allocSize, &ptr)); + + // Fill ptr on devices[0] with a known pattern using devices[0]'s queue. + static constexpr uint8_t fillPattern = 0xAB; + ur_queue_handle_t srcQueue = nullptr; + ur_result_t fillRes1 = urQueueCreate(context, devices[0], nullptr, &srcQueue); + ur_result_t fillRes2 = + (fillRes1 == UR_RESULT_SUCCESS) + ? urEnqueueUSMFill(srcQueue, ptr, sizeof(fillPattern), &fillPattern, + allocSize, 0, nullptr, nullptr) + : fillRes1; + ur_result_t fillRes3 = + (fillRes2 == UR_RESULT_SUCCESS) ? urQueueFinish(srcQueue) : fillRes2; + if (srcQueue) { + urQueueRelease(srcQueue); + } + + // Verify end-to-end P2P access: copy ptr (on devices[0]) to dstPtr (on + // devices[1]) using devices[1]'s queue, then read back for data verification. + void *dstPtr = nullptr; + ur_queue_handle_t peerQueue = nullptr; + std::vector hostData(allocSize); + ur_result_t p2pRes1 = urUSMDeviceAlloc(context, devices[1], nullptr, nullptr, + allocSize, &dstPtr); + ur_result_t p2pRes2 = + (p2pRes1 == UR_RESULT_SUCCESS) + ? urQueueCreate(context, devices[1], nullptr, &peerQueue) + : p2pRes1; + // devices[1]'s engine reads ptr from devices[0] via P2P. + ur_result_t p2pRes3 = (p2pRes2 == UR_RESULT_SUCCESS) + ? urEnqueueUSMMemcpy(peerQueue, true, dstPtr, ptr, + allocSize, 0, nullptr, nullptr) + : p2pRes2; + // Read result back to host for verification. + ur_result_t p2pRes4 = + (p2pRes3 == UR_RESULT_SUCCESS) + ? urEnqueueUSMMemcpy(peerQueue, true, hostData.data(), dstPtr, + allocSize, 0, nullptr, nullptr) + : p2pRes3; + + // Save return code so ptr is freed before any ASSERT terminates the test. + uint64_t currentMemFreeSource = 0; + ur_result_t res = + urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), ¤tMemFreeSource, nullptr); + + if (peerQueue) { + urQueueRelease(peerQueue); + } + if (dstPtr) { + urUSMFree(context, dstPtr); + } + ASSERT_SUCCESS(urUSMFree(context, ptr)); + ASSERT_SUCCESS(urUsmP2PDisablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = false; + + ASSERT_SUCCESS(res); + // Allocation is physically on devices[0]: its free memory must decrease. + ASSERT_LE(currentMemFreeSource, initialMemFreeSource - allocSize); + + ASSERT_SUCCESS(fillRes1); + ASSERT_SUCCESS(fillRes2); + ASSERT_SUCCESS(fillRes3); + + ASSERT_SUCCESS(p2pRes1); + ASSERT_SUCCESS(p2pRes2); + ASSERT_SUCCESS(p2pRes3); + ASSERT_SUCCESS(p2pRes4); + // All bytes transferred via P2P must match the fill pattern. + EXPECT_TRUE(std::all_of(hostData.begin(), hostData.end(), + [](uint8_t b) { return b == fillPattern; })); +} + +// Verify that disabling peer access succeeds and that a second disable attempt +// returns UR_RESULT_ERROR_INVALID_OPERATION (access already disabled). Confirms +// that source-device free memory still shows the allocation after peer access is +// disabled, proving the allocation on devices[0] remains valid. +// Note: peer-device eviction is not verified via free memory because +// UR_DEVICE_INFO_GLOBAL_MEM_FREE does not reliably reflect +// zeContextEvictMemory behaviour for device USM allocations. +TEST_P(urMemoryMultiResidencyTest, + disablePeerAccessStateMachineAndSourceAllocationPersists) { + // Enable devices[1] to access allocations on devices[0], so that new + // allocations on devices[0] are made resident on devices[1] too. + ASSERT_SUCCESS(urUsmP2PEnablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = true; + + static constexpr size_t allocSize = 1024 * 1024; + uint64_t initialMemFreeSource = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreeSource, + nullptr)); + if (initialMemFreeSource < allocSize) { + GTEST_SKIP() << "Not enough source device memory available"; + } + + void *ptr = nullptr; + ASSERT_SUCCESS( + urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, allocSize, &ptr)); + + // Disable P2P; the runtime evicts the allocation from devices[1] (not + // verified here, see the function-level comment above). Save return codes so + // ptr is freed before any ASSERT terminates the test. + ur_result_t res1 = urUsmP2PDisablePeerAccessExp(devices[1], devices[0]); + if (res1 == UR_RESULT_SUCCESS) { + peerAccessEnabled = false; + } + + // A second disable must be rejected because access is already disabled. + ur_result_t res2 = urUsmP2PDisablePeerAccessExp(devices[1], devices[0]); + + uint64_t currentMemFreeSource = 0; + ur_result_t res3 = + urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), ¤tMemFreeSource, nullptr); + + ASSERT_SUCCESS(urUSMFree(context, ptr)); + + ASSERT_SUCCESS(res1); + ASSERT_EQ(res2, UR_RESULT_ERROR_INVALID_OPERATION); + ASSERT_SUCCESS(res3); + // Allocation is physically on devices[0]: its free memory must decrease. + ASSERT_LE(currentMemFreeSource, initialMemFreeSource - allocSize); +} + +// Verify that USM memory allocated on devices[0] and filled with a known +// pattern can be correctly read by devices[1] when P2P access is enabled. +// This confirms end-to-end P2P data transfer works in the correct direction. +TEST_P(urMemoryMultiResidencyTest, p2pReadSucceedsWithPeerAccessEnabled) { + static constexpr size_t allocSize = 1024 * 1024; + static constexpr uint8_t fillPattern = 0xAB; + + // Allocate on devices[0] and fill with a known pattern. + void *srcPtr = nullptr; + ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, + allocSize, &srcPtr)); + ur_queue_handle_t srcQueue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, devices[0], nullptr, &srcQueue)); + ASSERT_SUCCESS(urEnqueueUSMFill(srcQueue, srcPtr, sizeof(fillPattern), + &fillPattern, allocSize, 0, nullptr, + nullptr)); + ASSERT_SUCCESS(urQueueFinish(srcQueue)); + urQueueRelease(srcQueue); + + // Enable P2P: devices[1] can now access allocations on devices[0]. + ASSERT_SUCCESS(urUsmP2PEnablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = true; + + // Copy srcPtr (on devices[0]) to dstPtr (on devices[1]) using devices[1]'s + // queue (P2P read), then copy dstPtr back to the host for data verification. + void *dstPtr = nullptr; + ur_queue_handle_t peerQueue = nullptr; + std::vector hostData(allocSize, 0); + ur_result_t res1 = urUSMDeviceAlloc(context, devices[1], nullptr, nullptr, + allocSize, &dstPtr); + ur_result_t res2 = + (res1 == UR_RESULT_SUCCESS) + ? urQueueCreate(context, devices[1], nullptr, &peerQueue) + : res1; + ur_result_t res3 = (res2 == UR_RESULT_SUCCESS) + ? urEnqueueUSMMemcpy(peerQueue, true, dstPtr, srcPtr, + allocSize, 0, nullptr, nullptr) + : res2; + ur_result_t res4 = + (res3 == UR_RESULT_SUCCESS) + ? urEnqueueUSMMemcpy(peerQueue, true, hostData.data(), dstPtr, + allocSize, 0, nullptr, nullptr) + : res3; + + if (peerQueue) { + urQueueRelease(peerQueue); + } + if (dstPtr) { + urUSMFree(context, dstPtr); + } + ASSERT_SUCCESS(urUSMFree(context, srcPtr)); + ASSERT_SUCCESS(urUsmP2PDisablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = false; + + ASSERT_SUCCESS(res1); + ASSERT_SUCCESS(res2); + ASSERT_SUCCESS(res3); + ASSERT_SUCCESS(res4); + // All bytes transferred via P2P must match the fill pattern. + EXPECT_TRUE(std::all_of(hostData.begin(), hostData.end(), + [](uint8_t b) { return b == fillPattern; })); +} + +// Verify that a USM allocation on devices[0] is NOT made resident on +// devices[1] when P2P access has not been enabled. The feature under test +// restricts residency, not hardware access: Level Zero hardware can still +// transfer data cross-device via the interconnect regardless of residency +// state, so the copy result is not checked here. The observable guarantee +// is that devices[1] free memory must not decrease by a full allocSize, +// proving the allocation was never pinned on the peer device. +TEST_P(urMemoryMultiResidencyTest, allocationNotResidentOnPeerWithoutP2P) { + static constexpr size_t allocSize = 1024 * 1024; + static constexpr uint8_t fillPattern = 0xAB; + + uint64_t initialMemFreePeer = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreePeer, + nullptr)); + if (initialMemFreePeer < allocSize) { + GTEST_SKIP() + << "Not enough peer device memory available for reliable check"; + } + + // Allocate on devices[0] WITHOUT enabling P2P — must not consume + // devices[1] memory. + void *srcPtr = nullptr; + ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, + allocSize, &srcPtr)); + + ur_queue_handle_t srcQueue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, devices[0], nullptr, &srcQueue)); + ASSERT_SUCCESS(urEnqueueUSMFill(srcQueue, srcPtr, sizeof(fillPattern), + &fillPattern, allocSize, 0, nullptr, + nullptr)); + ASSERT_SUCCESS(urQueueFinish(srcQueue)); + urQueueRelease(srcQueue); + + uint64_t currentMemFreePeer = 0; + ur_result_t memRes = + urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), ¤tMemFreePeer, nullptr); + + ASSERT_SUCCESS(urUSMFree(context, srcPtr)); + ASSERT_SUCCESS(memRes); + // Without P2P the allocation must not be resident on devices[1]: free + // memory on devices[1] must not have decreased by a full allocSize. + ASSERT_GT(currentMemFreePeer, initialMemFreePeer - allocSize); +} From 5cf0199cda9d64444ebbb320cd08f7a5b375584c Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Mon, 11 May 2026 12:29:23 +0000 Subject: [PATCH 03/23] [SYCL] Refactor P2P peer access helpers to avoid duplication Extract common logic from ext_oneapi_enable_peer_access and ext_oneapi_disable_peer_access into a templated p2pAccessHelper function to avoid code duplication. Signed-off-by: Lukasz Dorau --- sycl/source/device.cpp | 42 +++++----- .../source/adapters/level_zero/v2/context.cpp | 81 +++++++++---------- 2 files changed, 58 insertions(+), 65 deletions(-) diff --git a/sycl/source/device.cpp b/sycl/source/device.cpp index 8967d8ef2dff1..056ed1477bb71 100644 --- a/sycl/source/device.cpp +++ b/sycl/source/device.cpp @@ -209,36 +209,32 @@ ur_native_handle_t device::getNative() const { return impl->getNative(); } bool device::has(aspect Aspect) const { return impl->has(Aspect); } -void device::ext_oneapi_enable_peer_access(const device &peer) { - ur_device_handle_t Device = impl->getHandleRef(); - ur_device_handle_t Peer = peer.impl->getHandleRef(); - +template +static void p2pAccessHelper(const device &self, const device &peer, + ur_device_handle_t Device, ur_device_handle_t Peer, + detail::adapter_impl &Adapter, + const char *errorMsg) { if (Device == Peer) return; - if (peer.get_platform() != get_platform()) { - throw exception(errc::invalid, - "Cannot enable peer access between different platforms"); - } + if (peer.get_platform() != self.get_platform()) + throw exception(errc::invalid, errorMsg); - impl->getAdapter().call( - Device, Peer); + Adapter.call(Device, Peer); } -void device::ext_oneapi_disable_peer_access(const device &peer) { - ur_device_handle_t Device = impl->getHandleRef(); - ur_device_handle_t Peer = peer.impl->getHandleRef(); - - if (Device == Peer) - return; - - if (peer.get_platform() != get_platform()) { - throw exception(errc::invalid, - "Cannot disable peer access between different platforms"); - } +void device::ext_oneapi_enable_peer_access(const device &peer) { + p2pAccessHelper( + *this, peer, impl->getHandleRef(), peer.impl->getHandleRef(), + impl->getAdapter(), + "Cannot enable peer access between different platforms"); +} - impl->getAdapter().call( - Device, Peer); +void device::ext_oneapi_disable_peer_access(const device &peer) { + p2pAccessHelper( + *this, peer, impl->getHandleRef(), peer.impl->getHandleRef(), + impl->getAdapter(), + "Cannot disable peer access between different platforms"); } bool device::ext_oneapi_can_access_peer(const device &peer, diff --git a/unified-runtime/source/adapters/level_zero/v2/context.cpp b/unified-runtime/source/adapters/level_zero/v2/context.cpp index d869c7c0e6247..25fe5a7ee9ece 100644 --- a/unified-runtime/source/adapters/level_zero/v2/context.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/context.cpp @@ -187,34 +187,32 @@ namespace ur::level_zero { ur_result_t urContextCreate(uint32_t deviceCount, const ur_device_handle_t *phDevices, const ur_context_properties_t * /*pProperties*/, - ur_context_handle_t *phContext) { + ur_context_handle_t *phContext) try { *phContext = nullptr; - try { - - ur_platform_handle_t hPlatform = phDevices[0]->Platform; - ZeStruct contextDesc{}; - - ze_context_handle_t rawZeContext{}; - ZE2UR_CALL(zeContextCreate, - (hPlatform->ZeDriver, &contextDesc, &rawZeContext)); - UR_LOG(INFO, "ZE context created with {} devices", deviceCount); - - // Wrap immediately so any exception thrown by the ur_context_handle_t_ - // constructor (after hContext member is initialised) does not double-free - // the Level Zero context handle. - *phContext = - new ur_context_handle_t_(rawZeContext, deviceCount, phDevices, true); - { - std::scoped_lock Lock(hPlatform->ContextsMutex); - hPlatform->Contexts.push_back(*phContext); - } - return UR_RESULT_SUCCESS; - } catch (...) { - UR_LOG(ERR, "creating context failed"); - delete *phContext; - *phContext = nullptr; - return exceptionToResult(std::current_exception()); + + ur_platform_handle_t hPlatform = phDevices[0]->Platform; + ZeStruct contextDesc{}; + + ze_context_handle_t rawZeContext{}; + ZE2UR_CALL(zeContextCreate, + (hPlatform->ZeDriver, &contextDesc, &rawZeContext)); + UR_LOG(INFO, "ZE context created with {} devices", deviceCount); + + // Wrap immediately so any exception thrown by the ur_context_handle_t_ + // constructor (after hContext member is initialised) does not double-free + // the Level Zero context handle. + *phContext = + new ur_context_handle_t_(rawZeContext, deviceCount, phDevices, true); + { + std::scoped_lock Lock(hPlatform->ContextsMutex); + hPlatform->Contexts.push_back(*phContext); } + return UR_RESULT_SUCCESS; +} catch (...) { + UR_LOG(ERR, "creating context failed"); + delete *phContext; + *phContext = nullptr; + return exceptionToResult(std::current_exception()); } ur_result_t urContextGetNativeHandle(ur_context_handle_t hContext, @@ -230,26 +228,25 @@ ur_result_t urContextCreateWithNativeHandle( ur_native_handle_t hNativeContext, ur_adapter_handle_t, uint32_t numDevices, const ur_device_handle_t *phDevices, const ur_context_native_properties_t *pProperties, - ur_context_handle_t *phContext) { + ur_context_handle_t *phContext) try { *phContext = nullptr; - try { - auto zeContext = reinterpret_cast(hNativeContext); - auto ownZeHandle = pProperties ? pProperties->isNativeHandleOwned : false; + auto zeContext = reinterpret_cast(hNativeContext); - *phContext = - new ur_context_handle_t_(zeContext, numDevices, phDevices, ownZeHandle); - { - auto hPlatform = phDevices[0]->Platform; - std::scoped_lock Lock(hPlatform->ContextsMutex); - hPlatform->Contexts.push_back(*phContext); - } - return UR_RESULT_SUCCESS; - } catch (...) { - delete *phContext; - *phContext = nullptr; - return exceptionToResult(std::current_exception()); + auto ownZeHandle = pProperties ? pProperties->isNativeHandleOwned : false; + + *phContext = + new ur_context_handle_t_(zeContext, numDevices, phDevices, ownZeHandle); + { + auto hPlatform = phDevices[0]->Platform; + std::scoped_lock Lock(hPlatform->ContextsMutex); + hPlatform->Contexts.push_back(*phContext); } + return UR_RESULT_SUCCESS; +} catch (...) { + delete *phContext; + *phContext = nullptr; + return exceptionToResult(std::current_exception()); } ur_result_t urContextRetain(ur_context_handle_t hContext) try { From b074b41bb1f8c15468c94daac896d189905e6224 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Mon, 11 May 2026 14:52:08 +0000 Subject: [PATCH 04/23] [UR][L0] Fix flaky disablePeerAccess test by removing free-memory check The disablePeerAccessStateMachineAndSourceAllocationPersists test was failing intermittently because deferred frees from the preceding test complete asynchronously, causing UR_DEVICE_INFO_GLOBAL_MEM_FREE to report more free memory than the baseline captured at the start of the test. Remove the unreliable source-device free-memory assertion and the allocation it required, keeping only the state-machine checks (disable succeeds, double-disable returns UR_RESULT_ERROR_INVALID_OPERATION). The source-device allocation property is already covered by allocatingDeviceMemoryWillResultInOOM which runs first in isolation. --- .../level_zero/v2/memory_residency.cpp | 40 +++---------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp index 993a23b9c071e..b754de068000b 100644 --- a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp @@ -242,35 +242,17 @@ TEST_P(urMemoryMultiResidencyTest, } // Verify that disabling peer access succeeds and that a second disable attempt -// returns UR_RESULT_ERROR_INVALID_OPERATION (access already disabled). Confirms -// that source-device free memory still shows the allocation after peer access is -// disabled, proving the allocation on devices[0] remains valid. -// Note: peer-device eviction is not verified via free memory because -// UR_DEVICE_INFO_GLOBAL_MEM_FREE does not reliably reflect -// zeContextEvictMemory behaviour for device USM allocations. -TEST_P(urMemoryMultiResidencyTest, - disablePeerAccessStateMachineAndSourceAllocationPersists) { +// returns UR_RESULT_ERROR_INVALID_OPERATION (access already disabled). +// Source-device free memory is not checked because deferred frees from earlier +// tests complete asynchronously and make the baseline unreliable; that property +// is already covered by allocatingDeviceMemoryWillResultInOOM. +TEST_P(urMemoryMultiResidencyTest, disablePeerAccessStateMachine) { // Enable devices[1] to access allocations on devices[0], so that new // allocations on devices[0] are made resident on devices[1] too. ASSERT_SUCCESS(urUsmP2PEnablePeerAccessExp(devices[1], devices[0])); peerAccessEnabled = true; - static constexpr size_t allocSize = 1024 * 1024; - uint64_t initialMemFreeSource = 0; - ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, - sizeof(uint64_t), &initialMemFreeSource, - nullptr)); - if (initialMemFreeSource < allocSize) { - GTEST_SKIP() << "Not enough source device memory available"; - } - - void *ptr = nullptr; - ASSERT_SUCCESS( - urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, allocSize, &ptr)); - - // Disable P2P; the runtime evicts the allocation from devices[1] (not - // verified here, see the function-level comment above). Save return codes so - // ptr is freed before any ASSERT terminates the test. + // Disable P2P; the runtime evicts the allocation from devices[1]. ur_result_t res1 = urUsmP2PDisablePeerAccessExp(devices[1], devices[0]); if (res1 == UR_RESULT_SUCCESS) { peerAccessEnabled = false; @@ -279,18 +261,8 @@ TEST_P(urMemoryMultiResidencyTest, // A second disable must be rejected because access is already disabled. ur_result_t res2 = urUsmP2PDisablePeerAccessExp(devices[1], devices[0]); - uint64_t currentMemFreeSource = 0; - ur_result_t res3 = - urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, - sizeof(uint64_t), ¤tMemFreeSource, nullptr); - - ASSERT_SUCCESS(urUSMFree(context, ptr)); - ASSERT_SUCCESS(res1); ASSERT_EQ(res2, UR_RESULT_ERROR_INVALID_OPERATION); - ASSERT_SUCCESS(res3); - // Allocation is physically on devices[0]: its free memory must decrease. - ASSERT_LE(currentMemFreeSource, initialMemFreeSource - allocSize); } // Verify that USM memory allocated on devices[0] and filled with a known From d4674a767f760ca8f947cc446878c5b019cb65ed Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Tue, 12 May 2026 08:27:00 +0000 Subject: [PATCH 05/23] [UR][L0] Add two new tests: allocAfterEnablingPeerAccess and allocBeforeEnablingPeerAccess Signed-off-by: Lukasz Dorau --- .../level_zero/v2/memory_residency.cpp | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp index b754de068000b..658efa5054766 100644 --- a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp @@ -136,6 +136,55 @@ TEST_P(urMemoryMultiResidencyTest, allocationInitiallyAbsentOnPeer) { ASSERT_GT(currentMemFreePeer, initialMemFreePeer - allocSize); } +TEST_P(urMemoryMultiResidencyTest, allocAfterEnablingPeerAccess) { + // Enable devices[1] to access allocations on devices[0], so that new + // allocations on devices[0] are made resident on devices[1] too. + ASSERT_SUCCESS(urUsmP2PEnablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = true; + + static constexpr size_t allocSize = 1024 * 1024; + uint64_t initialMemFreeSource = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreeSource, + nullptr)); + if (initialMemFreeSource < allocSize) { + GTEST_SKIP() << "Not enough source device memory available"; + } + + void *ptr = nullptr; + ASSERT_SUCCESS( + urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, allocSize, &ptr)); + ASSERT_SUCCESS(urUSMFree(context, ptr)); + + ASSERT_SUCCESS(urUsmP2PDisablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = false; +} + +TEST_P(urMemoryMultiResidencyTest, allocBeforeEnablingPeerAccess) { + static constexpr size_t allocSize = 1024 * 1024; + uint64_t initialMemFreeSource = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreeSource, + nullptr)); + if (initialMemFreeSource < allocSize) { + GTEST_SKIP() << "Not enough source device memory available"; + } + + void *ptr = nullptr; + ASSERT_SUCCESS( + urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, allocSize, &ptr)); + + // Enable devices[1] to access allocations on devices[0], so that new + // allocations on devices[0] are made resident on devices[1] too. + ASSERT_SUCCESS(urUsmP2PEnablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = true; + + ASSERT_SUCCESS(urUsmP2PDisablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = false; + + ASSERT_SUCCESS(urUSMFree(context, ptr)); +} + // Verify that enabling peer access succeeds and that a second enable attempt // returns UR_RESULT_ERROR_INVALID_OPERATION (access already enabled). Confirms // that source-device free memory decreases by at least allocSize, showing the From 70cfe665e6bd9bef2d985c27a8d9fe81a93bc6a5 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Tue, 12 May 2026 13:37:19 +0000 Subject: [PATCH 06/23] [UR][L0] Fix P2P internal helper to operate on peerDevice The peer table lives on peerDevice: peerDevice->peers[commandDevice->Id] tracks whether commandDevice is allowed to access peerDevice's allocations. Update urUsmP2PChangePeerAccessExp to lock peerDevice's mutex, read/write peerDevice's peer table, use peerDevice's platform for context iteration, and pass (peerDevice, commandDevice) to changeResidentDevice and validateP2PDevicePair. Also fix urUsmP2PPeerAccessGetInfoExp to query the peer table on peerDevice rather than commandDevice. Signed-off-by: Lukasz Dorau --- .../source/adapters/level_zero/v2/usm_p2p.cpp | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/usm_p2p.cpp b/unified-runtime/source/adapters/level_zero/v2/usm_p2p.cpp index 28973a567e0e1..0f424ebcdef3a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/usm_p2p.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/usm_p2p.cpp @@ -36,10 +36,13 @@ static ur_result_t validateP2PDevicePair(ur_device_handle_t commandDevice, return UR_RESULT_SUCCESS; } +// commandDevice wants to access peerDevice's memory. The peer table is stored +// on peerDevice: peerDevice->peers[commandDevice->Id] tracks whether +// commandDevice is allowed to access peerDevice's allocations. static ur_result_t urUsmP2PChangePeerAccessExp(ur_device_handle_t commandDevice, ur_device_handle_t peerDevice, bool isAdding) { - UR_CALL(validateP2PDevicePair(commandDevice, peerDevice)); + UR_CALL(validateP2PDevicePair(peerDevice, commandDevice)); UR_LOG(INFO, "user tries to {} peer access to memory of {} from {}", (isAdding ? "enable" : "disable"), *peerDevice, *commandDevice); @@ -48,21 +51,21 @@ static ur_result_t urUsmP2PChangePeerAccessExp(ur_device_handle_t commandDevice, const auto expectedPeerStatus = isAdding ? ur_device_handle_t_::PeerStatus::DISABLED : ur_device_handle_t_::PeerStatus::ENABLED; - std::scoped_lock Lock(commandDevice->Mutex); + std::scoped_lock Lock(peerDevice->Mutex); const auto existingPeerStatus = - commandDevice->peers[peerDevice->Id.value()]; + peerDevice->peers[commandDevice->Id.value()]; if (existingPeerStatus != expectedPeerStatus) { UR_LOG(ERR, "existing peer status:{} does not match expected peer status:{}", existingPeerStatus, expectedPeerStatus); return UR_RESULT_ERROR_INVALID_OPERATION; } - commandDevice->peers[peerDevice->Id.value()] = + peerDevice->peers[commandDevice->Id.value()] = (isAdding ? ur_device_handle_t_::PeerStatus::ENABLED : ur_device_handle_t_::PeerStatus::DISABLED); } - auto Platform = commandDevice->Platform; + auto Platform = peerDevice->Platform; // Copy the context list under the mutex and iterate outside the critical // section to avoid holding ContextsMutex during potentially heavy // changeResidentDevice calls and to reduce deadlock risk. @@ -73,7 +76,7 @@ static ur_result_t urUsmP2PChangePeerAccessExp(ur_device_handle_t commandDevice, } UR_LOG(INFO, "changing peers in {} contexts", Contexts.size()); for (auto Context : Contexts) { - Context->changeResidentDevice(commandDevice, peerDevice, isAdding); + Context->changeResidentDevice(peerDevice, commandDevice, isAdding); } return UR_RESULT_SUCCESS; @@ -97,13 +100,13 @@ ur_result_t urUsmP2PPeerAccessGetInfoExp(ur_device_handle_t commandDevice, UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); - UR_CALL(validateP2PDevicePair(commandDevice, peerDevice)); + UR_CALL(validateP2PDevicePair(peerDevice, commandDevice)); int propertyValue = 0; switch (propName) { case UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORT: { - std::scoped_lock Lock(commandDevice->Mutex); - propertyValue = commandDevice->peers[peerDevice->Id.value()] != + std::scoped_lock Lock(peerDevice->Mutex); + propertyValue = peerDevice->peers[commandDevice->Id.value()] != ur_device_handle_t_::PeerStatus::NO_CONNECTION; break; } From 840f6a281a027af9532e14397a46b9cefc9eb563 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Tue, 12 May 2026 12:58:03 +0000 Subject: [PATCH 07/23] [UR][L0] Enforce P2P access check in urEnqueueUSMMemcpy Add a P2P peer-status check in command_list_manager::appendUSMMemcpy. When both source and destination are device memory and the source resides on a different device, the adapter queries the source device's peer table to verify that access has been granted to the queue's device. Returns UR_RESULT_ERROR_INVALID_OPERATION if P2P access has not been enabled. Copies to host or shared memory are always allowed regardless of P2P state. Previously, zeCommandListAppendMemoryCopy would silently succeed for cross-device copies via the copy engine regardless of P2P state, making it impossible to test that ext_oneapi_disable_peer_access actually revokes access. Also adds negative-pair tests that verify urEnqueueUSMMemcpy fails when P2P is disabled: - enablePeerAccessStateMachineAndSourceAllocationFailsWithoutP2P - p2pReadFailsWithoutPeerAccessDisabled Signed-off-by: Lukasz Dorau --- .../level_zero/v2/command_list_manager.cpp | 33 +++++++ .../level_zero/v2/memory_residency.cpp | 98 ++++++++++++++++++- 2 files changed, 126 insertions(+), 5 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index 82571666081e4..e9df195277170 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -357,6 +357,39 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy( wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMMemcpy"); + // Check P2P access: if the source pointer is device memory on a different + // device AND the destination is also device memory (not host/shared), verify + // that peer access has been enabled. Copies to host memory always succeed + // regardless of P2P state. + ZeStruct dstMemProps; + ze_device_handle_t dstZeDevice = nullptr; + auto zeDstResult = ZE_CALL_NOCHECK( + zeMemGetAllocProperties, + (hContext.get()->getZeHandle(), pDst, &dstMemProps, &dstZeDevice)); + if (zeDstResult == ZE_RESULT_SUCCESS && + dstMemProps.type == ZE_MEMORY_TYPE_DEVICE) { + ZeStruct srcMemProps; + ze_device_handle_t srcZeDevice = nullptr; + auto zeSrcResult = ZE_CALL_NOCHECK( + zeMemGetAllocProperties, + (hContext.get()->getZeHandle(), pSrc, &srcMemProps, &srcZeDevice)); + if (zeSrcResult == ZE_RESULT_SUCCESS && + srcMemProps.type == ZE_MEMORY_TYPE_DEVICE && srcZeDevice && + srcZeDevice != hDevice.get()->ZeDevice) { + auto *srcDevice = + hContext.get()->getPlatform()->getDeviceFromNativeHandle(srcZeDevice); + if (srcDevice && srcDevice->Id.has_value() && + hDevice.get()->Id.has_value() && + hDevice.get()->Id.value() < srcDevice->peers.size()) { + std::scoped_lock lock(srcDevice->Mutex); + if (srcDevice->peers[hDevice.get()->Id.value()] != + ur_device_handle_t_::PeerStatus::ENABLED) { + return UR_RESULT_ERROR_INVALID_OPERATION; + } + } + } + } + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY); auto [pWaitEvents, numWaitEvents, _] = waitListView; diff --git a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp index 658efa5054766..9a65c1e8f93f6 100644 --- a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp @@ -290,6 +290,54 @@ TEST_P(urMemoryMultiResidencyTest, [](uint8_t b) { return b == fillPattern; })); } +// Verify that the end-to-end P2P data transfer in +// enablePeerAccessStateMachineAndSourceAllocation fails when P2P is disabled. +// The adapter returns UR_RESULT_ERROR_INVALID_OPERATION from urEnqueueUSMMemcpy +// because the source pointer on devices[0] is not accessible from devices[1]. +TEST_P(urMemoryMultiResidencyTest, + enablePeerAccessStateMachineAndSourceAllocationFailsWithoutP2P) { + static constexpr size_t allocSize = 1024 * 1024; + uint64_t initialMemFreeSource = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreeSource, + nullptr)); + if (initialMemFreeSource < allocSize) { + GTEST_SKIP() << "Not enough source device memory available"; + } + + // Allocate on devices[0] WITHOUT enabling P2P. + void *ptr = nullptr; + ASSERT_SUCCESS( + urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, allocSize, &ptr)); + + // Fill ptr on devices[0] with a known pattern using devices[0]'s queue. + static constexpr uint8_t fillPattern = 0xAB; + ur_queue_handle_t srcQueue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, devices[0], nullptr, &srcQueue)); + ASSERT_SUCCESS(urEnqueueUSMFill(srcQueue, ptr, sizeof(fillPattern), + &fillPattern, allocSize, 0, nullptr, + nullptr)); + ASSERT_SUCCESS(urQueueFinish(srcQueue)); + urQueueRelease(srcQueue); + + // Attempt P2P copy: devices[1]'s queue reads ptr from devices[0] — should + // fail because P2P is disabled. + void *dstPtr = nullptr; + ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[1], nullptr, nullptr, + allocSize, &dstPtr)); + ur_queue_handle_t peerQueue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, devices[1], nullptr, &peerQueue)); + + ur_result_t copyResult = urEnqueueUSMMemcpy(peerQueue, true, dstPtr, ptr, + allocSize, 0, nullptr, nullptr); + + urQueueRelease(peerQueue); + urUSMFree(context, dstPtr); + ASSERT_SUCCESS(urUSMFree(context, ptr)); + + ASSERT_EQ(copyResult, UR_RESULT_ERROR_INVALID_OPERATION); +} + // Verify that disabling peer access succeeds and that a second disable attempt // returns UR_RESULT_ERROR_INVALID_OPERATION (access already disabled). // Source-device free memory is not checked because deferred frees from earlier @@ -316,11 +364,18 @@ TEST_P(urMemoryMultiResidencyTest, disablePeerAccessStateMachine) { // Verify that USM memory allocated on devices[0] and filled with a known // pattern can be correctly read by devices[1] when P2P access is enabled. -// This confirms end-to-end P2P data transfer works in the correct direction. +// The test requires P2P to be enabled BEFORE allocation so that the memory +// provider registers the peer as a resident device. Without P2P enabled at +// allocation time, urEnqueueUSMMemcpy would fail because the allocation is +// not accessible from the peer device's command list. TEST_P(urMemoryMultiResidencyTest, p2pReadSucceedsWithPeerAccessEnabled) { static constexpr size_t allocSize = 1024 * 1024; static constexpr uint8_t fillPattern = 0xAB; + // Enable P2P: devices[1] can now access allocations on devices[0]. + ASSERT_SUCCESS(urUsmP2PEnablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = true; + // Allocate on devices[0] and fill with a known pattern. void *srcPtr = nullptr; ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, @@ -333,10 +388,6 @@ TEST_P(urMemoryMultiResidencyTest, p2pReadSucceedsWithPeerAccessEnabled) { ASSERT_SUCCESS(urQueueFinish(srcQueue)); urQueueRelease(srcQueue); - // Enable P2P: devices[1] can now access allocations on devices[0]. - ASSERT_SUCCESS(urUsmP2PEnablePeerAccessExp(devices[1], devices[0])); - peerAccessEnabled = true; - // Copy srcPtr (on devices[0]) to dstPtr (on devices[1]) using devices[1]'s // queue (P2P read), then copy dstPtr back to the host for data verification. void *dstPtr = nullptr; @@ -377,6 +428,43 @@ TEST_P(urMemoryMultiResidencyTest, p2pReadSucceedsWithPeerAccessEnabled) { [](uint8_t b) { return b == fillPattern; })); } +// Verify that urEnqueueUSMMemcpy from devices[0]'s memory fails when P2P +// access has NOT been enabled from devices[1]. The adapter checks the peer +// table and returns UR_RESULT_ERROR_INVALID_OPERATION. +TEST_P(urMemoryMultiResidencyTest, p2pReadFailsWithoutPeerAccessDisabled) { + static constexpr size_t allocSize = 1024 * 1024; + static constexpr uint8_t fillPattern = 0xCD; + + // Allocate on devices[0] and fill — P2P is NOT enabled. + void *srcPtr = nullptr; + ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, + allocSize, &srcPtr)); + ur_queue_handle_t srcQueue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, devices[0], nullptr, &srcQueue)); + ASSERT_SUCCESS(urEnqueueUSMFill(srcQueue, srcPtr, sizeof(fillPattern), + &fillPattern, allocSize, 0, nullptr, + nullptr)); + ASSERT_SUCCESS(urQueueFinish(srcQueue)); + urQueueRelease(srcQueue); + + // Attempt to copy srcPtr (on devices[0]) to dstPtr (on devices[1]) using + // devices[1]'s queue — should fail because P2P is disabled. + void *dstPtr = nullptr; + ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[1], nullptr, nullptr, + allocSize, &dstPtr)); + ur_queue_handle_t peerQueue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, devices[1], nullptr, &peerQueue)); + + ur_result_t copyResult = urEnqueueUSMMemcpy(peerQueue, true, dstPtr, srcPtr, + allocSize, 0, nullptr, nullptr); + + urQueueRelease(peerQueue); + urUSMFree(context, dstPtr); + ASSERT_SUCCESS(urUSMFree(context, srcPtr)); + + ASSERT_EQ(copyResult, UR_RESULT_ERROR_INVALID_OPERATION); +} + // Verify that a USM allocation on devices[0] is NOT made resident on // devices[1] when P2P access has not been enabled. The feature under test // restricts residency, not hardware access: Level Zero hardware can still From 0452f8390386dc9c498c4edc98266f8b3a541c79 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Wed, 6 May 2026 16:27:14 +0000 Subject: [PATCH 08/23] [SYCL][E2E][USM] Add P2P USM residency test for L0 v2 adapter Adds sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp to verify that the Level Zero v2 adapter correctly handles P2P access for USM device memory between peer devices. Phase 1: Allocates memory on dev0, fills it with a known pattern, enables P2P access from dev1 to dev0, then uses dev1's queue to copy the data to the host and verifies all values match. Phase 2 (opposite direction): Allocates memory on dev1, fills it with a different pattern, enables P2P access from dev0 to dev1, then uses dev0's queue to copy the data to the host and verifies correctness. Phase 3 (negative): Enables then disables P2P access from dev1 to dev0, then attempts a memcpy via dev1's queue. The test passes if the memcpy throws an exception or if the copied data does not match the original fill pattern, confirming that ext_oneapi_disable_peer_access actually revokes access. Also adds the 'two-or-more-gpu-devices' lit feature to lit.cfg.py, set when sycl-ls reports at least two GPU devices. The test uses this feature to skip on single-GPU machines. Signed-off-by: Lukasz Dorau --- sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp | 194 ++++++++++++++++++++ sycl/test-e2e/lit.cfg.py | 7 + 2 files changed, 201 insertions(+) create mode 100644 sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp diff --git a/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp b/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp new file mode 100644 index 0000000000000..a3a88ef931225 --- /dev/null +++ b/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp @@ -0,0 +1,194 @@ +//==-- p2p_usm_residency.cpp - P2P USM residency test ---------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Verify that the Level Zero v2 adapter correctly makes USM device memory +// resident on peer devices when P2P access is enabled. +// +// Phase 1: Allocates memory on dev0, fills it with a known pattern, enables +// P2P access from dev1 to dev0, then uses dev1's queue to copy the data to +// the host and verifies all values match the fill pattern. +// +// Phase 2 (opposite direction): Allocates memory on dev1, fills it with a +// different pattern, enables P2P access from dev0 to dev1, then uses dev0's +// queue to copy the data to the host and verifies correctness. +// +// Phase 3 (negative): Allocates memory on dev0, enables then disables P2P +// access from dev1, and verifies that a subsequent device-to-device memcpy +// via dev1's queue throws an exception. +// +// REQUIRES: level_zero && two-or-more-gpu-devices +// UNSUPPORTED: level_zero_v1_adapter +// UNSUPPORTED-INTENDED: Test is specific to the Level Zero v2 adapter. +// +// RUN: %{build} -o %t.out +// RUN: env UR_LOADER_USE_LEVEL_ZERO_V2=1 %{run} %t.out + +#include +#include + +#include +#include +#include + +using namespace sycl; + +// Allocate N ints on srcQueue's device, fill with fillVal, enable P2P so that +// dstDev can access srcDev's allocations, copy to host via dstQueue, verify +// all values, then clean up. Returns false on failure. +static bool testP2PRead(context &ctx, queue &srcQueue, device &srcDev, + queue &dstQueue, device &dstDev, size_t N, int fillVal, + const char *label) { + int *src = sycl::malloc_device(N, srcQueue); + if (!src) { + std::cout << label << ": device alloc failed. Skipping.\n"; + return true; // not a test failure + } + srcQueue.fill(src, fillVal, N).wait(); + + // Enable P2P: dstDev may now access allocations on srcDev. Under the + // Level Zero v2 adapter this also makes the srcDev allocation resident + // on dstDev. + std::cout << "Enabling P2P: dstDev may now access allocations on srcDev.\n"; + dstDev.ext_oneapi_enable_peer_access(srcDev); + + std::vector result(N, 0); + dstQueue.memcpy(result.data(), src, N * sizeof(int)).wait(); + + sycl::free(src, ctx); + std::cout + << "Disabling P2P: dstDev may no longer access allocations on srcDev.\n"; + dstDev.ext_oneapi_disable_peer_access(srcDev); + + for (size_t i = 0; i < N; ++i) { + if (result[i] != fillVal) { + std::cout << label << ": FAIL at index " << i << ": got " << result[i] + << ", expected " << fillVal << "\n"; + return false; + } + } + std::cout << label << ": OK\n"; + return true; +} + +// Allocate N ints on srcQueue's device, fill with fillVal, enable P2P, then +// disable P2P, and verify that a device-to-device memcpy from dstQueue fails +// (since dstDev should no longer be able to access srcDev's allocations after +// P2P is disabled). +static bool testP2PReadFailsAfterDisable(context &ctx, queue &srcQueue, + device &srcDev, queue &dstQueue, + device &dstDev, size_t N, int fillVal, + const char *label) { + int *src = sycl::malloc_device(N, srcQueue); + if (!src) { + std::cout << label << ": device alloc failed (src). Skipping.\n"; + return true; + } + + int *dst = sycl::malloc_device(N, dstQueue); + if (!dst) { + std::cout << label << ": device alloc failed (dst). Skipping.\n"; + sycl::free(src, ctx); + return true; + } + + srcQueue.fill(src, fillVal, N).wait(); + + // Enable then disable P2P: dstDev should no longer be able to access + // allocations on srcDev. + std::cout << "Enabling P2P (temporarily).\n"; + dstDev.ext_oneapi_enable_peer_access(srcDev); + std::cout << "Disabling P2P: dstDev should no longer access srcDev.\n"; + dstDev.ext_oneapi_disable_peer_access(srcDev); + + // Attempt a device-to-device memcpy from src (on srcDev) to dst (on dstDev) + // via dstQueue after P2P has been revoked — this should fail. + bool gotException = false; + try { + dstQueue.memcpy(dst, src, N * sizeof(int)).wait(); + } catch (sycl::exception &e) { + std::cout << label << ": memcpy threw exception: " << e.what() << "\n"; + gotException = true; + } + + sycl::free(dst, ctx); + sycl::free(src, ctx); + + if (!gotException) { + std::cout << label + << ": FAIL — device-to-device memcpy succeeded after P2P was " + "disabled\n"; + return false; + } + std::cout << label << ": OK (memcpy correctly failed after P2P disable)\n"; + return true; +} + +int main() { + // Find a platform with at least two GPU devices. + std::vector gpus; + for (auto &plat : platform::get_platforms()) { + gpus = plat.get_devices(info::device_type::gpu); + if (gpus.size() >= 2) + break; + } + + if (gpus.size() < 2) { + std::cout << "Test requires at least two GPU devices on the same platform. " + "Skipping.\n"; + return 0; + } + + device &dev0 = gpus[0]; + device &dev1 = gpus[1]; + + std::cout << "Device 0: " << dev0.get_info() << "\n"; + std::cout << "Device 1: " << dev1.get_info() << "\n"; + + // Both devices share a single context for cross-device USM. + context ctx({dev0, dev1}); + queue q0(ctx, dev0); + queue q1(ctx, dev1); + + // Allocation size must exceed the disjoint pool's MaxPoolableSize (4 MB for + // device memory) so that the allocation goes directly to the memory provider + // where residency is established. + constexpr size_t N = 2 * 1024 * 1024; // 2M ints = 8 MB + + // Phase 1: dev1 reads dev0's memory (P2P: dev1 -> dev0). + std::cout << "Phase 1: dev1 reads dev0's memory (P2P: dev1 -> dev0).\n"; + if (!dev1.ext_oneapi_can_access_peer( + dev0, ext::oneapi::peer_access::access_supported)) { + std::cout << "No hardware P2P support (dev1->dev0). Skipping.\n"; + return 0; + } + if (!testP2PRead(ctx, q0, dev0, q1, dev1, N, 0x42, + "Phase 1 (dev1 reads dev0)")) + return 1; + + // Phase 2 (opposite): dev0 reads dev1's memory (P2P: dev0 -> dev1). + std::cout + << "Phase 2 (opposite): dev0 reads dev1's memory (P2P: dev0 -> dev1).\n"; + if (!dev0.ext_oneapi_can_access_peer( + dev1, ext::oneapi::peer_access::access_supported)) { + std::cout << "No hardware P2P support (dev0->dev1). Skipping phase 2.\n"; + std::cout << "PASS\n"; + return 0; + } + if (!testP2PRead(ctx, q1, dev1, q0, dev0, N, 0x55, + "Phase 2 (dev0 reads dev1)")) + return 1; + + // Phase 3: verify that memcpy fails after P2P is disabled. + std::cout << "Phase 3: verify memcpy fails after P2P is disabled.\n"; + if (!testP2PReadFailsAfterDisable(ctx, q0, dev0, q1, dev1, N, 0x77, + "Phase 3 (dev1 reads dev0 after disable)")) + return 1; + + std::cout << "PASS\n"; + return 0; +} diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index 847d474f59b74..b4cc43d9cdf22 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -686,6 +686,13 @@ def open_check_file(file_name): if "opencl:cpu" in sycl_ls_output: config.available_features.add("opencl-cpu-rt") + # Count physical GPU devices: each physical GPU produces one output line + # that contains ":gpu]". Add a feature when at least two are present so + # tests requiring multi-GPU hardware can be skipped on single-GPU machines. + gpu_device_lines = [l for l in sycl_ls_output.splitlines() if ":gpu]" in l] + if len(gpu_device_lines) >= 2: + config.available_features.add("two-or-more-gpu-devices") + if len(config.sycl_devices) == 1 and config.sycl_devices[0] == "all": devices = set() for line in sycl_ls_output.splitlines(): From f5f9e96067f5eccb4e3ff3dac4bc9e4894eb4098 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Wed, 13 May 2026 10:15:28 +0000 Subject: [PATCH 09/23] [UR][L0] Remove flaky free-memory assertion from enablePeerAccess test The source-device free-memory check in enablePeerAccessStateMachineAndSourceAllocation was failing intermittently because deferred frees from earlier tests complete asynchronously, causing UR_DEVICE_INFO_GLOBAL_MEM_FREE to report more free memory than the baseline. Remove the unreliable assertion. The test's actual value is in verifying the P2P state machine (double-enable returns error) and the end-to-end data transfer correctness. Signed-off-by: Lukasz Dorau --- .../level_zero/v2/memory_residency.cpp | 31 +++---------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp index 9a65c1e8f93f6..11412df3a69f7 100644 --- a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp @@ -186,15 +186,11 @@ TEST_P(urMemoryMultiResidencyTest, allocBeforeEnablingPeerAccess) { } // Verify that enabling peer access succeeds and that a second enable attempt -// returns UR_RESULT_ERROR_INVALID_OPERATION (access already enabled). Confirms -// that source-device free memory decreases by at least allocSize, showing the -// allocation succeeded on devices[0] with P2P enabled. Also verifies end-to-end -// P2P data transfer: the allocation on devices[0] is filled with a known -// pattern and then read by devices[1]'s command engine; the result is checked -// for correctness to confirm the feature works in the correct direction. -// Note: peer-device free memory is not checked because -// UR_DEVICE_INFO_GLOBAL_MEM_FREE does not reliably reflect -// zeContextMakeMemoryResident behaviour for device USM allocations. +// returns UR_RESULT_ERROR_INVALID_OPERATION (access already enabled). Also +// verifies end-to-end P2P data transfer: the allocation on devices[0] is filled +// with a known pattern and then read by devices[1]'s command engine; the result +// is checked for correctness to confirm the feature works in the correct +// direction. TEST_P(urMemoryMultiResidencyTest, enablePeerAccessStateMachineAndSourceAllocation) { // Enable devices[1] to access allocations on devices[0], so that new @@ -207,13 +203,6 @@ TEST_P(urMemoryMultiResidencyTest, UR_RESULT_ERROR_INVALID_OPERATION); static constexpr size_t allocSize = 1024 * 1024; - uint64_t initialMemFreeSource = 0; - ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, - sizeof(uint64_t), &initialMemFreeSource, - nullptr)); - if (initialMemFreeSource < allocSize) { - GTEST_SKIP() << "Not enough source device memory available"; - } void *ptr = nullptr; ASSERT_SUCCESS( @@ -257,12 +246,6 @@ TEST_P(urMemoryMultiResidencyTest, allocSize, 0, nullptr, nullptr) : p2pRes3; - // Save return code so ptr is freed before any ASSERT terminates the test. - uint64_t currentMemFreeSource = 0; - ur_result_t res = - urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, - sizeof(uint64_t), ¤tMemFreeSource, nullptr); - if (peerQueue) { urQueueRelease(peerQueue); } @@ -273,10 +256,6 @@ TEST_P(urMemoryMultiResidencyTest, ASSERT_SUCCESS(urUsmP2PDisablePeerAccessExp(devices[1], devices[0])); peerAccessEnabled = false; - ASSERT_SUCCESS(res); - // Allocation is physically on devices[0]: its free memory must decrease. - ASSERT_LE(currentMemFreeSource, initialMemFreeSource - allocSize); - ASSERT_SUCCESS(fillRes1); ASSERT_SUCCESS(fillRes2); ASSERT_SUCCESS(fillRes3); From fc17fed7bfd9240cae7b482cb337d7275284af10 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Thu, 14 May 2026 11:03:54 +0000 Subject: [PATCH 10/23] [SYCL][E2E] Fix P2P cleanup order in p2p_usm_residency test Disable P2P access before freeing the allocation. The previous order freed the memory while P2P was still enabled, leaving a brief window where the peer device held access rights to a released allocation. The correct cleanup sequence is to revoke peer access first and then free the memory. Signed-off-by: Lukasz Dorau --- sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp b/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp index a3a88ef931225..edb082b0e5726 100644 --- a/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp +++ b/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp @@ -59,10 +59,10 @@ static bool testP2PRead(context &ctx, queue &srcQueue, device &srcDev, std::vector result(N, 0); dstQueue.memcpy(result.data(), src, N * sizeof(int)).wait(); - sycl::free(src, ctx); std::cout << "Disabling P2P: dstDev may no longer access allocations on srcDev.\n"; dstDev.ext_oneapi_disable_peer_access(srcDev); + sycl::free(src, ctx); for (size_t i = 0; i < N; ++i) { if (result[i] != fillVal) { From 27d19f992048e40211370cc855c7a5baf37ac1a3 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Thu, 14 May 2026 11:08:28 +0000 Subject: [PATCH 11/23] [UR][L0v2] Extract checkP2PAccess static helper from appendUSMMemcpy Replace the nested if-ladder in appendUSMMemcpy with a flat static helper function checkP2PAccess that uses early returns, as suggested in the review. The logic is identical; the refactoring makes the control flow easier to follow. Signed-off-by: Lukasz Dorau --- .../level_zero/v2/command_list_manager.cpp | 81 +++++++++++-------- 1 file changed, 49 insertions(+), 32 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index e9df195277170..d466d259d9f67 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -352,43 +352,60 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( return UR_RESULT_SUCCESS; } +// Check P2P access for a device-to-device memcpy. Returns +// UR_RESULT_ERROR_INVALID_OPERATION when the destination is device memory, +// the source is device memory residing on a different device, and peer access +// between those two devices has not been enabled. In all other cases +// (host/shared memory, same device, or unknown allocation type) returns +// UR_RESULT_SUCCESS so the copy can proceed. +static ur_result_t checkP2PAccess(ze_context_handle_t zeContext, + const void *pDst, const void *pSrc, + ur_context_handle_t urContext, + ur_device_handle_t urDevice) { + ZeStruct dstProps; + ze_device_handle_t dstZeDevice = nullptr; + if (ZE_CALL_NOCHECK(zeMemGetAllocProperties, + (zeContext, pDst, &dstProps, &dstZeDevice)) != + ZE_RESULT_SUCCESS || + dstProps.type != ZE_MEMORY_TYPE_DEVICE) { + return UR_RESULT_SUCCESS; + } + + ZeStruct srcProps; + ze_device_handle_t srcZeDevice = nullptr; + if (ZE_CALL_NOCHECK(zeMemGetAllocProperties, + (zeContext, pSrc, &srcProps, &srcZeDevice)) != + ZE_RESULT_SUCCESS || + srcProps.type != ZE_MEMORY_TYPE_DEVICE || !srcZeDevice || + srcZeDevice == urDevice->ZeDevice) { + return UR_RESULT_SUCCESS; + } + + auto *srcDevice = + urContext->getPlatform()->getDeviceFromNativeHandle(srcZeDevice); + if (!srcDevice || !srcDevice->Id.has_value() || !urDevice->Id.has_value() || + urDevice->Id.value() >= srcDevice->peers.size()) { + return UR_RESULT_SUCCESS; + } + + std::scoped_lock lock(srcDevice->Mutex); + if (srcDevice->peers[urDevice->Id.value()] != + ur_device_handle_t_::PeerStatus::ENABLED) { + return UR_RESULT_ERROR_INVALID_OPERATION; + } + + return UR_RESULT_SUCCESS; +} + ur_result_t ur_command_list_manager::appendUSMMemcpy( bool blocking, void *pDst, const void *pSrc, size_t size, wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMMemcpy"); - // Check P2P access: if the source pointer is device memory on a different - // device AND the destination is also device memory (not host/shared), verify - // that peer access has been enabled. Copies to host memory always succeed - // regardless of P2P state. - ZeStruct dstMemProps; - ze_device_handle_t dstZeDevice = nullptr; - auto zeDstResult = ZE_CALL_NOCHECK( - zeMemGetAllocProperties, - (hContext.get()->getZeHandle(), pDst, &dstMemProps, &dstZeDevice)); - if (zeDstResult == ZE_RESULT_SUCCESS && - dstMemProps.type == ZE_MEMORY_TYPE_DEVICE) { - ZeStruct srcMemProps; - ze_device_handle_t srcZeDevice = nullptr; - auto zeSrcResult = ZE_CALL_NOCHECK( - zeMemGetAllocProperties, - (hContext.get()->getZeHandle(), pSrc, &srcMemProps, &srcZeDevice)); - if (zeSrcResult == ZE_RESULT_SUCCESS && - srcMemProps.type == ZE_MEMORY_TYPE_DEVICE && srcZeDevice && - srcZeDevice != hDevice.get()->ZeDevice) { - auto *srcDevice = - hContext.get()->getPlatform()->getDeviceFromNativeHandle(srcZeDevice); - if (srcDevice && srcDevice->Id.has_value() && - hDevice.get()->Id.has_value() && - hDevice.get()->Id.value() < srcDevice->peers.size()) { - std::scoped_lock lock(srcDevice->Mutex); - if (srcDevice->peers[hDevice.get()->Id.value()] != - ur_device_handle_t_::PeerStatus::ENABLED) { - return UR_RESULT_ERROR_INVALID_OPERATION; - } - } - } - } + // Verify P2P access when copying between device allocations on different + // devices. Copies to/from host or shared memory always succeed. + UR_CALL(checkP2PAccess(hContext.get()->getZeHandle(), pDst, pSrc, + hContext.get(), hDevice.get())); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY); auto [pWaitEvents, numWaitEvents, _] = waitListView; From d6050eaeddfcf8a44704f9b6049f54bce200386f Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Thu, 14 May 2026 11:19:34 +0000 Subject: [PATCH 12/23] [UR][L0v2] Fix inaccurate comment in p2pReadSucceedsWithPeerAccessEnabled test The old comment claimed that P2P must be enabled before the allocation, which is incorrect. urUsmP2PEnablePeerAccessExp calls changeResidentDevice on all contexts after updating the peer-status table, which retroactively makes already-existing allocations resident on the peer device. Enabling P2P after allocation is therefore equally valid, as demonstrated by the allocBeforeEnablingPeerAccess test. Update the comment to reflect the actual adapter behaviour. Signed-off-by: Lukasz Dorau --- .../test/adapters/level_zero/v2/memory_residency.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp index 11412df3a69f7..e07c9f3501f38 100644 --- a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp @@ -343,10 +343,10 @@ TEST_P(urMemoryMultiResidencyTest, disablePeerAccessStateMachine) { // Verify that USM memory allocated on devices[0] and filled with a known // pattern can be correctly read by devices[1] when P2P access is enabled. -// The test requires P2P to be enabled BEFORE allocation so that the memory -// provider registers the peer as a resident device. Without P2P enabled at -// allocation time, urEnqueueUSMMemcpy would fail because the allocation is -// not accessible from the peer device's command list. +// P2P is enabled before the allocation here, but enabling it after the +// allocation is equally valid: urUsmP2PEnablePeerAccessExp calls +// changeResidentDevice on all contexts, which retroactively makes existing +// allocations resident on the peer device. TEST_P(urMemoryMultiResidencyTest, p2pReadSucceedsWithPeerAccessEnabled) { static constexpr size_t allocSize = 1024 * 1024; static constexpr uint8_t fillPattern = 0xAB; From 481c108c4aae870f8dd506cbdc5c86b6d7c6191b Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Thu, 14 May 2026 11:12:39 +0000 Subject: [PATCH 13/23] [UR][L0v2] Remove redundant p2pReadFailsWithoutPeerAccessDisabled test The test exercises exactly the same code path as the existing enablePeerAccessStateMachineAndSourceAllocationFailsWithoutP2P: both allocate on devices[0] without P2P, attempt a copy via devices[1]'s queue, and assert UR_RESULT_ERROR_INVALID_OPERATION. Remove the duplicate to avoid maintaining two tests that provide no additional coverage. Signed-off-by: Lukasz Dorau --- .../level_zero/v2/memory_residency.cpp | 37 ------------------- 1 file changed, 37 deletions(-) diff --git a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp index e07c9f3501f38..d5bebbcd8a5b5 100644 --- a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp @@ -407,43 +407,6 @@ TEST_P(urMemoryMultiResidencyTest, p2pReadSucceedsWithPeerAccessEnabled) { [](uint8_t b) { return b == fillPattern; })); } -// Verify that urEnqueueUSMMemcpy from devices[0]'s memory fails when P2P -// access has NOT been enabled from devices[1]. The adapter checks the peer -// table and returns UR_RESULT_ERROR_INVALID_OPERATION. -TEST_P(urMemoryMultiResidencyTest, p2pReadFailsWithoutPeerAccessDisabled) { - static constexpr size_t allocSize = 1024 * 1024; - static constexpr uint8_t fillPattern = 0xCD; - - // Allocate on devices[0] and fill — P2P is NOT enabled. - void *srcPtr = nullptr; - ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, - allocSize, &srcPtr)); - ur_queue_handle_t srcQueue = nullptr; - ASSERT_SUCCESS(urQueueCreate(context, devices[0], nullptr, &srcQueue)); - ASSERT_SUCCESS(urEnqueueUSMFill(srcQueue, srcPtr, sizeof(fillPattern), - &fillPattern, allocSize, 0, nullptr, - nullptr)); - ASSERT_SUCCESS(urQueueFinish(srcQueue)); - urQueueRelease(srcQueue); - - // Attempt to copy srcPtr (on devices[0]) to dstPtr (on devices[1]) using - // devices[1]'s queue — should fail because P2P is disabled. - void *dstPtr = nullptr; - ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[1], nullptr, nullptr, - allocSize, &dstPtr)); - ur_queue_handle_t peerQueue = nullptr; - ASSERT_SUCCESS(urQueueCreate(context, devices[1], nullptr, &peerQueue)); - - ur_result_t copyResult = urEnqueueUSMMemcpy(peerQueue, true, dstPtr, srcPtr, - allocSize, 0, nullptr, nullptr); - - urQueueRelease(peerQueue); - urUSMFree(context, dstPtr); - ASSERT_SUCCESS(urUSMFree(context, srcPtr)); - - ASSERT_EQ(copyResult, UR_RESULT_ERROR_INVALID_OPERATION); -} - // Verify that a USM allocation on devices[0] is NOT made resident on // devices[1] when P2P access has not been enabled. The feature under test // restricts residency, not hardware access: Level Zero hardware can still From 0134ef2d70b298e4ea078aca6c2829920e9576c2 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Thu, 14 May 2026 12:50:58 +0000 Subject: [PATCH 14/23] [SYCL][E2E] Fix P2P direction and cleanup in p2p_copy e2e test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The test was calling Devs[0].ext_oneapi_enable_peer_access(Devs[1]), which sets Devs[1]->peers[Devs[0].Id] = ENABLED, meaning Devs[0] can access Devs[1]'s memory. However the actual P2P copy was Queues[1].copy(arr0, arr1, N) — Devs[1]'s queue reading arr0 which lives on Devs[0] — which requires the opposite direction: Devs[0]->peers[Devs[1].Id] = ENABLED, set by Devs[1].ext_oneapi_enable_peer_access(Devs[0]). The wrong direction was harmless with the L0v1 adapter (no peer-access check on memcpy), but the L0v2 adapter's checkP2PAccess enforces the correct direction and returned UR_RESULT_ERROR_INVALID_OPERATION. Also add ext_oneapi_disable_peer_access before freeing arr0, consistent with the rule that peer access should be revoked before the guarded allocation is released. Signed-off-by: Lukasz Dorau --- sycl/test-e2e/USM/P2P/p2p_copy.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/USM/P2P/p2p_copy.cpp b/sycl/test-e2e/USM/P2P/p2p_copy.cpp index 1f4d2733a055c..2437e645d7d0a 100644 --- a/sycl/test-e2e/USM/P2P/p2p_copy.cpp +++ b/sycl/test-e2e/USM/P2P/p2p_copy.cpp @@ -37,8 +37,9 @@ int main() { return 0; } - // Enables Devs[0] to access Devs[1] memory. - Devs[0].ext_oneapi_enable_peer_access(Devs[1]); + // Enables Devs[1] to access Devs[0] memory (Devs[1]'s queue will read + // from arr0 which lives on Devs[0]). + Devs[1].ext_oneapi_enable_peer_access(Devs[0]); std::vector input(N); std::iota(input.begin(), input.end(), 0); @@ -53,6 +54,8 @@ int main() { int out[N]; Queues[1].copy(arr1, out, N).wait(); + // Disable P2P before releasing the allocation it was guarding. + Devs[1].ext_oneapi_disable_peer_access(Devs[0]); sycl::free(arr0, Queues[0]); sycl::free(arr1, Queues[1]); From 6b031227d596d5a01b24c541d2afbbc32f39356f Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Fri, 15 May 2026 09:48:43 +0000 Subject: [PATCH 15/23] [UR][L0v2] Add p2pReadFailsAfterRevokingAccess test Verify that revoking peer access prevents subsequent USM copies. The test enables P2P (devices[1] -> devices[0]), confirms that a urEnqueueUSMMemcpy succeeds, then disables P2P and asserts that the same copy returns UR_RESULT_ERROR_INVALID_OPERATION. Signed-off-by: Lukasz Dorau --- .../level_zero/v2/memory_residency.cpp | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp index d5bebbcd8a5b5..9b29be08f57fa 100644 --- a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp @@ -407,6 +407,63 @@ TEST_P(urMemoryMultiResidencyTest, p2pReadSucceedsWithPeerAccessEnabled) { [](uint8_t b) { return b == fillPattern; })); } +// Verify that revoking peer access from devices[1] to devices[0] prevents +// subsequent USM copies from devices[1]'s queue. A successful copy is first +// performed with P2P enabled to confirm the setup is correct; then P2P is +// disabled and the same copy is expected to fail with +// UR_RESULT_ERROR_INVALID_OPERATION. +TEST_P(urMemoryMultiResidencyTest, p2pReadFailsAfterRevokingAccess) { + static constexpr size_t allocSize = 1024 * 1024; + + uint64_t initialMemFreeSource = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreeSource, + nullptr)); + if (initialMemFreeSource < allocSize) { + GTEST_SKIP() << "Not enough source device memory available"; + } + + uint64_t initialMemFreePeer = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreePeer, + nullptr)); + if (initialMemFreePeer < allocSize) { + GTEST_SKIP() << "Not enough peer device memory available"; + } + + void *srcPtr = nullptr; + ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, + allocSize, &srcPtr)); + + void *dstPtr = nullptr; + ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[1], nullptr, nullptr, + allocSize, &dstPtr)); + + // Enable P2P and confirm a copy from devices[0] to devices[1] succeeds. + ASSERT_SUCCESS(urUsmP2PEnablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = true; + + ur_queue_handle_t peerQueue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, devices[1], nullptr, &peerQueue)); + + ASSERT_SUCCESS(urEnqueueUSMMemcpy(peerQueue, true, dstPtr, srcPtr, allocSize, + 0, nullptr, nullptr)); + + // Revoke P2P access. + ASSERT_SUCCESS(urUsmP2PDisablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = false; + + // Copy must now fail: devices[1] can no longer access srcPtr on devices[0]. + ur_result_t copyResult = urEnqueueUSMMemcpy(peerQueue, true, dstPtr, srcPtr, + allocSize, 0, nullptr, nullptr); + + urQueueRelease(peerQueue); + urUSMFree(context, dstPtr); + ASSERT_SUCCESS(urUSMFree(context, srcPtr)); + + ASSERT_EQ(copyResult, UR_RESULT_ERROR_INVALID_OPERATION); +} + // Verify that a USM allocation on devices[0] is NOT made resident on // devices[1] when P2P access has not been enabled. The feature under test // restricts residency, not hardware access: Level Zero hardware can still From 234aad4f3a4d46a31cf0060cd06d57991cae0d46 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Fri, 15 May 2026 10:34:07 +0000 Subject: [PATCH 16/23] [UR][L0v2] Add p2pReadSucceedsAfterEnablingAccess test Verify the transition from blocked to permitted: attempt a USM copy from devices[1]'s queue without P2P (expects UR_RESULT_ERROR_INVALID_OPERATION), then enable P2P on the same allocation and retry (expects success with correct data). Signed-off-by: Lukasz Dorau --- .../level_zero/v2/memory_residency.cpp | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp index 9b29be08f57fa..8128494bba8c5 100644 --- a/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/memory_residency.cpp @@ -407,6 +407,79 @@ TEST_P(urMemoryMultiResidencyTest, p2pReadSucceedsWithPeerAccessEnabled) { [](uint8_t b) { return b == fillPattern; })); } +// Verify the transition from blocked to permitted: attempt a USM copy from +// devices[1]'s queue without P2P (must fail), then enable P2P on the same +// allocation and retry the copy (must succeed with correct data). +TEST_P(urMemoryMultiResidencyTest, p2pReadSucceedsAfterEnablingAccess) { + static constexpr size_t allocSize = 1024 * 1024; + static constexpr uint8_t fillPattern = 0xCD; + + uint64_t initialMemFreeSource = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[0], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreeSource, + nullptr)); + if (initialMemFreeSource < allocSize) { + GTEST_SKIP() << "Not enough source device memory available"; + } + + uint64_t initialMemFreePeer = 0; + ASSERT_SUCCESS(urDeviceGetInfo(devices[1], UR_DEVICE_INFO_GLOBAL_MEM_FREE, + sizeof(uint64_t), &initialMemFreePeer, + nullptr)); + if (initialMemFreePeer < allocSize) { + GTEST_SKIP() << "Not enough peer device memory available"; + } + + void *srcPtr = nullptr; + ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[0], nullptr, nullptr, + allocSize, &srcPtr)); + + void *dstPtr = nullptr; + ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[1], nullptr, nullptr, + allocSize, &dstPtr)); + + ur_queue_handle_t srcQueue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, devices[0], nullptr, &srcQueue)); + ASSERT_SUCCESS(urEnqueueUSMFill(srcQueue, srcPtr, sizeof(fillPattern), + &fillPattern, allocSize, 0, nullptr, + nullptr)); + ASSERT_SUCCESS(urQueueFinish(srcQueue)); + urQueueRelease(srcQueue); + + ur_queue_handle_t peerQueue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, devices[1], nullptr, &peerQueue)); + + // Without P2P the copy must be rejected. + ur_result_t copyWithoutP2P = urEnqueueUSMMemcpy( + peerQueue, true, dstPtr, srcPtr, allocSize, 0, nullptr, nullptr); + ASSERT_EQ(copyWithoutP2P, UR_RESULT_ERROR_INVALID_OPERATION); + + // Enable P2P: devices[1] can now access allocations on devices[0]. + ASSERT_SUCCESS(urUsmP2PEnablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = true; + + // Retry the copy — must succeed now. + ASSERT_SUCCESS(urEnqueueUSMMemcpy(peerQueue, true, dstPtr, srcPtr, allocSize, + 0, nullptr, nullptr)); + + // Read result back to host for verification. + std::vector hostData(allocSize); + ur_queue_handle_t hostQueue = nullptr; + ASSERT_SUCCESS(urQueueCreate(context, devices[1], nullptr, &hostQueue)); + ASSERT_SUCCESS(urEnqueueUSMMemcpy(hostQueue, true, hostData.data(), dstPtr, + allocSize, 0, nullptr, nullptr)); + urQueueRelease(hostQueue); + + urQueueRelease(peerQueue); + urUSMFree(context, dstPtr); + ASSERT_SUCCESS(urUSMFree(context, srcPtr)); + ASSERT_SUCCESS(urUsmP2PDisablePeerAccessExp(devices[1], devices[0])); + peerAccessEnabled = false; + + EXPECT_TRUE(std::all_of(hostData.begin(), hostData.end(), + [](uint8_t b) { return b == fillPattern; })); +} + // Verify that revoking peer access from devices[1] to devices[0] prevents // subsequent USM copies from devices[1]'s queue. A successful copy is first // performed with P2P enabled to confirm the setup is correct; then P2P is From 247ac7ffe881efe7bdde2ac2a95a2b334c0630a2 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Fri, 15 May 2026 09:50:07 +0000 Subject: [PATCH 17/23] [SYCL][E2E] Add Phase 4 to p2p_usm_residency: memcpy fails without P2P Add testP2PReadFailsWithoutEnable: allocates on dev0, attempts a device-to-device memcpy via dev1's queue without ever calling ext_oneapi_enable_peer_access, and asserts a SYCL exception is thrown. This covers the case where P2P was never enabled (Phase 3 already covers the revoke case). Signed-off-by: Lukasz Dorau --- sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp | 52 +++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp b/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp index edb082b0e5726..170c06c721f1f 100644 --- a/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp +++ b/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp @@ -128,6 +128,52 @@ static bool testP2PReadFailsAfterDisable(context &ctx, queue &srcQueue, return true; } +// Allocate N ints on srcQueue's device, fill with fillVal, and verify that a +// device-to-device memcpy from dstQueue fails without ever enabling P2P (since +// dstDev must not access srcDev's allocations when P2P has never been enabled). +static bool testP2PReadFailsWithoutEnable(context &ctx, queue &srcQueue, + device &srcDev, queue &dstQueue, + device &dstDev, size_t N, int fillVal, + const char *label) { + (void)srcDev; + (void)dstDev; + + int *src = sycl::malloc_device(N, srcQueue); + if (!src) { + std::cout << label << ": device alloc failed (src). Skipping.\n"; + return true; + } + + int *dst = sycl::malloc_device(N, dstQueue); + if (!dst) { + std::cout << label << ": device alloc failed (dst). Skipping.\n"; + sycl::free(src, ctx); + return true; + } + + srcQueue.fill(src, fillVal, N).wait(); + + // Attempt a device-to-device memcpy without ever enabling P2P — must fail. + bool gotException = false; + try { + dstQueue.memcpy(dst, src, N * sizeof(int)).wait(); + } catch (sycl::exception &e) { + std::cout << label << ": memcpy threw exception: " << e.what() << "\n"; + gotException = true; + } + + sycl::free(dst, ctx); + sycl::free(src, ctx); + + if (!gotException) { + std::cout << label + << ": FAIL — device-to-device memcpy succeeded without P2P\n"; + return false; + } + std::cout << label << ": OK (memcpy correctly failed without P2P)\n"; + return true; +} + int main() { // Find a platform with at least two GPU devices. std::vector gpus; @@ -189,6 +235,12 @@ int main() { "Phase 3 (dev1 reads dev0 after disable)")) return 1; + // Phase 4: verify that memcpy fails without ever enabling P2P. + std::cout << "Phase 4: verify memcpy fails without ever enabling P2P.\n"; + if (!testP2PReadFailsWithoutEnable(ctx, q0, dev0, q1, dev1, N, 0x99, + "Phase 4 (dev1 reads dev0 without P2P)")) + return 1; + std::cout << "PASS\n"; return 0; } From 64f238d8e24cd683491340cb9cd17d8aa8511d27 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Fri, 15 May 2026 10:37:30 +0000 Subject: [PATCH 18/23] [SYCL][E2E] Add Phase 5 to p2p_usm_residency: fail then succeed after P2P enable Add testP2PReadFailsThenSucceedsAfterEnable: attempts a device-to-device memcpy without P2P (expects a SYCL exception), then enables P2P on the same allocation and retries (expects success with correct data), then disables P2P and cleans up. Signed-off-by: Lukasz Dorau --- sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp | 81 +++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp b/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp index 170c06c721f1f..bc8cb748b19bf 100644 --- a/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp +++ b/sycl/test-e2e/USM/P2P/p2p_usm_residency.cpp @@ -174,6 +174,79 @@ static bool testP2PReadFailsWithoutEnable(context &ctx, queue &srcQueue, return true; } +// Verify the transition from blocked to permitted using the same allocation: +// first attempt a device-to-device memcpy from dstQueue without P2P (must +// fail), then enable P2P and retry the copy (must succeed with correct data). +static bool testP2PReadFailsThenSucceedsAfterEnable( + context &ctx, queue &srcQueue, device &srcDev, queue &dstQueue, + device &dstDev, size_t N, int fillVal, const char *label) { + int *src = sycl::malloc_device(N, srcQueue); + if (!src) { + std::cout << label << ": device alloc failed (src). Skipping.\n"; + return true; + } + + int *dst = sycl::malloc_device(N, dstQueue); + if (!dst) { + std::cout << label << ": device alloc failed (dst). Skipping.\n"; + sycl::free(src, ctx); + return true; + } + + srcQueue.fill(src, fillVal, N).wait(); + + // Without P2P the copy must fail. + bool gotException = false; + try { + dstQueue.memcpy(dst, src, N * sizeof(int)).wait(); + } catch (sycl::exception &e) { + std::cout << label << ": first memcpy (no P2P) threw: " << e.what() << "\n"; + gotException = true; + } + + if (!gotException) { + std::cout << label << ": FAIL — first memcpy succeeded without P2P\n"; + sycl::free(dst, ctx); + sycl::free(src, ctx); + return false; + } + + // Enable P2P: dstDev may now access allocations on srcDev. + std::cout << label << ": enabling P2P.\n"; + dstDev.ext_oneapi_enable_peer_access(srcDev); + + // Retry — must succeed now. + bool copyOk = true; + std::vector result(N, 0); + try { + dstQueue.memcpy(dst, src, N * sizeof(int)).wait(); + // Read back to host for verification. + dstQueue.memcpy(result.data(), dst, N * sizeof(int)).wait(); + } catch (sycl::exception &e) { + std::cout << label << ": second memcpy (P2P enabled) threw: " << e.what() + << "\n"; + copyOk = false; + } + + std::cout << label << ": disabling P2P.\n"; + dstDev.ext_oneapi_disable_peer_access(srcDev); + sycl::free(dst, ctx); + sycl::free(src, ctx); + + if (!copyOk) + return false; + + for (size_t i = 0; i < N; ++i) { + if (result[i] != fillVal) { + std::cout << label << ": FAIL at index " << i << ": got " << result[i] + << ", expected " << fillVal << "\n"; + return false; + } + } + std::cout << label << ": OK (failed without P2P, succeeded after enable)\n"; + return true; +} + int main() { // Find a platform with at least two GPU devices. std::vector gpus; @@ -241,6 +314,14 @@ int main() { "Phase 4 (dev1 reads dev0 without P2P)")) return 1; + // Phase 5: verify the transition from blocked to permitted. + std::cout << "Phase 5: verify memcpy fails without P2P then succeeds after " + "enabling it.\n"; + if (!testP2PReadFailsThenSucceedsAfterEnable( + ctx, q0, dev0, q1, dev1, N, 0xAA, + "Phase 5 (dev1 reads dev0: fail then succeed)")) + return 1; + std::cout << "PASS\n"; return 0; } From a74d1d6467ed6859b8c2fa48d646c5157704af77 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Wed, 13 May 2026 09:06:34 +0000 Subject: [PATCH 19/23] [UR] Fix isPVC() to handle unsupported UR_DEVICE_INFO_DEVICE_ID MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The isPVC() utility used EXPECT_EQ which recorded a test failure when the adapter does not support UR_DEVICE_INFO_DEVICE_ID (e.g. Native CPU). This caused the test to be marked FAILED even though it would subsequently skip via GTEST_SKIP(). Return false instead of asserting when the query fails — if the adapter cannot report a device ID, the device is clearly not a PVC. Signed-off-by: Lukasz Dorau --- .../test/conformance/testing/include/uur/utils.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/unified-runtime/test/conformance/testing/include/uur/utils.h b/unified-runtime/test/conformance/testing/include/uur/utils.h index b1c757a4aae7a..cb5f8a26a573c 100644 --- a/unified-runtime/test/conformance/testing/include/uur/utils.h +++ b/unified-runtime/test/conformance/testing/include/uur/utils.h @@ -522,9 +522,11 @@ inline void isQueueBatched(ur_queue_handle_t queue, bool *info) { // when that is stable. static inline bool isPVC(ur_device_handle_t hDevice) { uint32_t deviceId; - EXPECT_EQ(urDeviceGetInfo(hDevice, UR_DEVICE_INFO_DEVICE_ID, sizeof(uint32_t), - &deviceId, nullptr), - UR_RESULT_SUCCESS); + ur_result_t result = urDeviceGetInfo(hDevice, UR_DEVICE_INFO_DEVICE_ID, + sizeof(uint32_t), &deviceId, nullptr); + if (result != UR_RESULT_SUCCESS) { + return false; + } return (deviceId & 0xff0) == 0xbd0 || (deviceId & 0xff0) == 0xb60; } From 002f408b3f96d147514cf6ffa06ff8687e595ffa Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Wed, 13 May 2026 13:00:34 +0000 Subject: [PATCH 20/23] [UR][L0] Fix event_pool_test to properly release events through DDI table The event_pool_test's mock DDI table was blank, causing urEventRelease (called through the loader) to return UR_RESULT_ERROR_UNINITIALIZED instead of actually releasing the event back to the pool. This made the Basic and ProviderNormalUseMostFreePool tests fail because events were never recycled. Fix by populating the DDI table with the adapter's urEventRelease, and clearing the global mockVec in TearDown to prevent cross-test pollution between different device backends. Signed-off-by: Lukasz Dorau --- .../test/adapters/level_zero/v2/event_pool_test.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp b/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp index 884103c11d30f..d559a517898e6 100644 --- a/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp @@ -14,6 +14,7 @@ #include "level_zero/common.hpp" #include "level_zero/device.hpp" +#include "level_zero/ur_interface_loader.hpp" #include "../ze_helpers.hpp" #include "context.hpp" @@ -37,8 +38,8 @@ using namespace v2; static constexpr size_t MAX_DEVICES = 10; const ur_dditable_t *ur::level_zero::ddi_getter::value() { - // Return a blank dditable static ur_dditable_t table{}; + table.Event.pfnRelease = ur::level_zero::urEventRelease; return &table; }; @@ -147,6 +148,7 @@ struct EventPoolTest : public uur::urQueueTestWithParam { } void TearDown() override { cache.reset(); + mockVec.clear(); UUR_RETURN_ON_FATAL_FAILURE(urQueueTestWithParam::TearDown()); } From edc38fced51b7d22c498d0d41a47f1e8910b8d43 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Wed, 13 May 2026 13:06:57 +0000 Subject: [PATCH 21/23] [UR][L0] Mark deferred_kernel_memcheck test as XFAIL Valgrind crashes with "debuginfo reader: ensure_valid failed" on libumf.so due to corrupted/incompatible debug info, preventing it from producing the expected ERROR SUMMARY output. Mark as XFAIL to match the pattern of the other memcheck.test in the same test directory. Signed-off-by: Lukasz Dorau --- .../test/adapters/level_zero/v2/deferred_kernel_memcheck.test | 3 +++ 1 file changed, 3 insertions(+) diff --git a/unified-runtime/test/adapters/level_zero/v2/deferred_kernel_memcheck.test b/unified-runtime/test/adapters/level_zero/v2/deferred_kernel_memcheck.test index b600f975cc0a5..9450115d5a3bc 100644 --- a/unified-runtime/test/adapters/level_zero/v2/deferred_kernel_memcheck.test +++ b/unified-runtime/test/adapters/level_zero/v2/deferred_kernel_memcheck.test @@ -3,4 +3,7 @@ REQUIRES: v2 REQUIRES: valgrind UNSUPPORTED: system-windows +COM: Valgrind may fail with corrupted debuginfo in some library builds. +XFAIL: * + CHECK: ERROR SUMMARY: 0 errors from 0 contexts From 78e65529d96aed9f593ff4a75e6712de12980b83 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Wed, 13 May 2026 10:49:17 +0000 Subject: [PATCH 22/23] [UR][L0] Migrate discrete buffer through host when P2P is not accessible When a buffer on a discrete GPU needs to be accessed from a different device and P2P access is not enabled, migrate the data through a temporary host buffer instead of returning UR_RESULT_ERROR_UNSUPPORTED_FEATURE. Before migrating, wait for pending operations (from the wait list) to complete, ensuring that prior kernel writes to the buffer are visible. Fixes: #22007 Fixes: #22008 Signed-off-by: Lukasz Dorau --- .../source/adapters/level_zero/v2/memory.cpp | 28 ++++++++++++++----- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp index b0601ba956af2..9619bf59b636d 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp @@ -341,7 +341,7 @@ void *ur_discrete_buffer_handle_t::getActiveDeviceAlloc(size_t offset) { void *ur_discrete_buffer_handle_t::getDevicePtr( ur_device_handle_t hDevice, device_access_mode_t /*access*/, size_t offset, size_t /*size*/, ze_command_list_handle_t /*cmdList*/, - wait_list_view & /*waitListView*/) { + wait_list_view &waitListView) { TRACK_SCOPE_LATENCY("ur_discrete_buffer_handle_t::getDevicePtr"); if (!activeAllocationDevice) { @@ -366,12 +366,26 @@ void *ur_discrete_buffer_handle_t::getDevicePtr( activeAllocationDevice) != p2pDevices.end(); if (!p2pAccessible) { - // TODO: migrate buffer through the host - UR_LOG(WARN, - "p2p is not accessible: requesting device ptr:{} cannot access " - "allocation on device ptr:{}", - (void *)hDevice, (void *)activeAllocationDevice); - throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + // Wait for pending operations on the buffer to complete before migrating. + for (uint32_t i = 0; i < waitListView.num; i++) { + ZE2UR_CALL_THROWS(zeEventHostSynchronize, + (waitListView.handles[i], UINT64_MAX)); + } + waitListView.clear(); + + // Migrate buffer through the host: copy from the current device to a + // temporary host buffer, then from host to the target device. + auto bufferSize = getSize(); + std::vector hostBuf(bufferSize); + + UR_CALL_THROWS(synchronousZeCopy(hContext, activeAllocationDevice, + hostBuf.data(), getActiveDeviceAlloc(), + bufferSize)); + + UR_CALL_THROWS(migrateBufferTo(hDevice, hostBuf.data(), bufferSize)); + + activeAllocationDevice = hDevice; + return getActiveDeviceAlloc(offset); } // TODO: see if it's better to migrate the memory to the specified device From 39a3ba8e57ba3f8f2966ca9bd68339b521e1cbc3 Mon Sep 17 00:00:00 2001 From: Lukasz Dorau Date: Thu, 14 May 2026 13:54:25 +0000 Subject: [PATCH 23/23] [UR][L0v2] Improve discrete buffer host-migration when P2P is not accessible When a discrete buffer needs to be migrated between devices via the host (because P2P access is not enabled), improve the implementation in three ways: 1. Use a USM HOST allocation for the staging buffer instead of a std::vector. USM HOST memory is accessible by all devices in the context, making it more suitable as an intermediate staging area. 2. Fix event ordering: append zeCommandListAppendWaitOnEvents (when there are wait events) to order the migration relative to all in-flight work on the destination command list, then drain it with zeCommandListHostSynchronize before reading source device memory. 3. Make the host->device copy async: after the synchronous device->host copy completes (using the source device's own synchronous command list, since the destination device cannot access source device memory without P2P), enqueue only the host->device copy on the provided command list. Host memory is accessible by all devices, so this is safe. The staging buffer is stored in a new migrationStagingBuffers member of ur_discrete_buffer_handle_t and freed when the buffer is released, ensuring it remains valid for the duration of the async copy. A fully synchronous fallback is kept for the case where no command list is available (e.g. urMemGetNativeHandle). Signed-off-by: Lukasz Dorau --- .../source/adapters/level_zero/v2/memory.cpp | 70 +++++++++++++++---- .../source/adapters/level_zero/v2/memory.hpp | 5 ++ 2 files changed, 60 insertions(+), 15 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.cpp b/unified-runtime/source/adapters/level_zero/v2/memory.cpp index 9619bf59b636d..bc805d1cc291a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.cpp @@ -340,7 +340,7 @@ void *ur_discrete_buffer_handle_t::getActiveDeviceAlloc(size_t offset) { void *ur_discrete_buffer_handle_t::getDevicePtr( ur_device_handle_t hDevice, device_access_mode_t /*access*/, size_t offset, - size_t /*size*/, ze_command_list_handle_t /*cmdList*/, + size_t /*size*/, ze_command_list_handle_t cmdList, wait_list_view &waitListView) { TRACK_SCOPE_LATENCY("ur_discrete_buffer_handle_t::getDevicePtr"); @@ -366,23 +366,63 @@ void *ur_discrete_buffer_handle_t::getDevicePtr( activeAllocationDevice) != p2pDevices.end(); if (!p2pAccessible) { - // Wait for pending operations on the buffer to complete before migrating. - for (uint32_t i = 0; i < waitListView.num; i++) { - ZE2UR_CALL_THROWS(zeEventHostSynchronize, - (waitListView.handles[i], UINT64_MAX)); - } - waitListView.clear(); - - // Migrate buffer through the host: copy from the current device to a - // temporary host buffer, then from host to the target device. + // Allocate a USM HOST staging buffer for the migration. auto bufferSize = getSize(); - std::vector hostBuf(bufferSize); + void *hostBuf = nullptr; + UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate( + hContext, nullptr, nullptr, UR_USM_TYPE_HOST, bufferSize, &hostBuf)); + usm_unique_ptr_t hostBufPtr( + hostBuf, [hContext = this->hContext](void *ptr) { + auto ret = hContext->getDefaultUSMPool()->free(ptr); + if (ret != UR_RESULT_SUCCESS) { + UR_LOG(ERR, "Failed to free migration staging buffer: {}", ret); + } + }); - UR_CALL_THROWS(synchronousZeCopy(hContext, activeAllocationDevice, - hostBuf.data(), getActiveDeviceAlloc(), - bufferSize)); + if (cmdList) { + // Order the migration relative to both the explicit wait events and any + // in-flight work already on the destination command list, then drain it + // so the host can safely read from the source device. + if (waitListView.num > 0) { + ZE2UR_CALL_THROWS(zeCommandListAppendWaitOnEvents, + (cmdList, waitListView.num, waitListView.handles)); + } + ZE2UR_CALL_THROWS(zeCommandListHostSynchronize, (cmdList, UINT64_MAX)); + waitListView.clear(); + + // The destination device's command list cannot access source device + // memory (P2P is not available), so use the source device's own + // synchronous command list for the device->host copy. + UR_CALL_THROWS(synchronousZeCopy(hContext, activeAllocationDevice, + hostBuf, getActiveDeviceAlloc(), + bufferSize)); + + void *dstDevPtr = deviceAllocations[hDevice->Id.value()].get() + ? deviceAllocations[hDevice->Id.value()].get() + : allocateOnDevice(hDevice, bufferSize); + + // Host memory is accessible by all devices; enqueue the host->dest + // copy on the provided command list to keep the destination side async. + ZE2UR_CALL_THROWS( + zeCommandListAppendMemoryCopy, + (cmdList, dstDevPtr, hostBuf, bufferSize, nullptr, 0, nullptr)); + + // Keep the staging buffer alive until the async copy completes. + // It will be freed when the discrete buffer handle is destroyed. + migrationStagingBuffers.emplace_back(std::move(hostBufPtr)); + } else { + // Synchronous fallback when no command list is available. + for (uint32_t i = 0; i < waitListView.num; i++) { + ZE2UR_CALL_THROWS(zeEventHostSynchronize, + (waitListView.handles[i], UINT64_MAX)); + } + waitListView.clear(); - UR_CALL_THROWS(migrateBufferTo(hDevice, hostBuf.data(), bufferSize)); + UR_CALL_THROWS(synchronousZeCopy(hContext, activeAllocationDevice, + hostBuf, getActiveDeviceAlloc(), + bufferSize)); + UR_CALL_THROWS(migrateBufferTo(hDevice, hostBuf, bufferSize)); + } activeAllocationDevice = hDevice; return getActiveDeviceAlloc(offset); diff --git a/unified-runtime/source/adapters/level_zero/v2/memory.hpp b/unified-runtime/source/adapters/level_zero/v2/memory.hpp index 709bb3e71c600..c757f173f435a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/memory.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/memory.hpp @@ -171,6 +171,11 @@ struct ur_discrete_buffer_handle_t : ur_mem_buffer_t { std::vector hostAllocations; + // USM HOST staging buffers used for async host-based migration + // (when P2P is not accessible). Kept alive until the buffer is released + // so the async copies have valid source/destination memory. + std::vector migrationStagingBuffers; + void *getActiveDeviceAlloc(size_t offset = 0); void *allocateOnDevice(ur_device_handle_t hDevice, size_t size); ur_result_t migrateBufferTo(ur_device_handle_t hDevice, void *src,