From 177edf20321132e90473f3b2a1f1824cdf503e40 Mon Sep 17 00:00:00 2001
From: Katarzyna Kaczmarska <katarzynax.e.kaczmarska@intel.com>
Date: Fri, 22 May 2026 09:43:08 +0200
Subject: [PATCH 1/3] [TEST] Reproduce urUSMContextMemcpyExp CUDA failure

This commit sets up CI to reproduce the sporadic failure in
urUSMContextMemcpyExpTestDevice::Success on CUDA.

Changes:
- Remove UUR_KNOWN_FAILURE_ON(uur::CUDA{}) from test
- Add a loop with 20 iterations to increase the probability of catching
  the race condition (the test fails roughly once every 3-5 runs)
- Re-initialize allocations between iterations
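- Temporarily comment out non-CUDA jobs in ur-precommit.yml to minimize
  CI run time and focus only on CUDA testing
  NOTE: patches 2/3 and 3/3 do not restore these jobs; this workflow
  change must be reverted before merging.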
---
 .github/workflows/ur-precommit.yml            | 148 +++++++++---------
 .../urUSMContextMemcpyExp.cpp                 |  17 +-
 2 files changed, 87 insertions(+), 78 deletions(-)

diff --git a/.github/workflows/ur-precommit.yml b/.github/workflows/ur-precommit.yml
index 403ac4d285826..6437a9e291485 100644
--- a/.github/workflows/ur-precommit.yml
+++ b/.github/workflows/ur-precommit.yml
@@ -56,39 +56,39 @@ jobs:
       # Extra native CPU jobs are here to force the loader to be used.
       # UR will not use the loader if there is only one target.
         include:
-          - name: L0
-            runner: UR_L0
-            image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
-          - name: L0_V2
-            runner: UR_L0
-            image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
-          - name: L0
-            runner: UR_L0
-            static: ON
-            image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
-          - name: L0
-            runner: UR_L0
-            other_adapter: NATIVE_CPU
-            image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
-          - name: L0_V2
-            runner: UR_L0_BMG
-            image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
+          # - name: L0
+          #   runner: UR_L0
+          #   image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
+          # - name: L0_V2
+          #   runner: UR_L0
+          #   image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
+          # - name: L0
+          #   runner: UR_L0
+          #   static: ON
+          #   image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
+          # - name: L0
+          #   runner: UR_L0
+          #   other_adapter: NATIVE_CPU
+          #   image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
+          # - name: L0_V2
+          #   runner: UR_L0_BMG
+          #   image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN
           - name: CUDA
             runner: UR_CUDA
             image_options: -u 1001 --privileged --cap-add SYS_ADMIN --gpus all
-          - name: OPENCL
-            runner: UR_OPENCL
-            docker_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest"
-            image_options: -u 1001 --device=/dev/dri --device=/dev/kfd --privileged --cap-add SYS_ADMIN
-          - name: OPENCL
-            runner: UR_OPENCL
-            other_adapter: NATIVE_CPU
-            docker_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest"
-            image_options: -u 1001 --device=/dev/dri --device=/dev/kfd --privileged --cap-add SYS_ADMIN
-          - name: NATIVE_CPU
-            runner: UR_NATIVE_CPU
-            docker_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest"
-            image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN
+          # - name: OPENCL
+          #   runner: UR_OPENCL
+          #   docker_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest"
+          #   image_options: -u 1001 --device=/dev/dri --device=/dev/kfd --privileged --cap-add SYS_ADMIN
+          # - name: OPENCL
+          #   runner: UR_OPENCL
+          #   other_adapter: NATIVE_CPU
+          #   docker_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest"
+          #   image_options: -u 1001 --device=/dev/dri --device=/dev/kfd --privileged --cap-add SYS_ADMIN
+          # - name: NATIVE_CPU
+          #   runner: UR_NATIVE_CPU
+          #   docker_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest"
+          #   image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN
     uses: ./.github/workflows/ur-build-hw.yml
     with:
       adapter_name: ${{ matrix.name }}
@@ -100,48 +100,48 @@ jobs:
       image_options: ${{ matrix.image_options || '' }}
       install_igc_driver: ${{ contains(needs.detect_changes.outputs.filters, 'drivers') }}
 
-  offload_build:
-    name: Adapters (Offload)
-    needs: [detect_changes, source_checks]
-    if: ${{ !cancelled() && contains(needs.detect_changes.outputs.filters, 'ur_offload_adapter') }}
-    uses: ./.github/workflows/ur-build-offload.yml
-
-  macos:
-    name: MacOS build only
-    needs: [detect_changes, source_checks]
-    if: ${{ !cancelled() && contains(needs.detect_changes.outputs.filters, 'ur') }}
-    strategy:
-      matrix:
-        os: ['macos-latest']
-    runs-on: ${{matrix.os}}
-
-    steps:
-    - name: Checkout LLVM
-      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-
-    - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.1.0
-      with:
-        python-version: "3.10"
-
-    - name: Install prerequisites
-      working-directory: ${{github.workspace}}/unified-runtime
-      run: |
-        python3 -m pip install -r third_party/requirements.txt
-        python3 -m pip install -r third_party/requirements_testing.txt
-
-    - name: Install hwloc
-      run: brew install hwloc
-
-    - name: Configure Unified Runtime project
-      working-directory: ${{github.workspace}}/unified-runtime
-      run: >
-        cmake
-        -B${{github.workspace}}/build
-        -DUR_ENABLE_TRACING=ON
-        -DUR_DEVELOPER_MODE=ON
-        -DCMAKE_BUILD_TYPE=Release
-        -DUR_BUILD_TESTS=ON
-        -DUR_FORMAT_CPP_STYLE=ON
+  # offload_build:
+  #   name: Adapters (Offload)
+  #   needs: [detect_changes, source_checks]
+  #   if: ${{ !cancelled() && contains(needs.detect_changes.outputs.filters, 'ur_offload_adapter') }}
+  #   uses: ./.github/workflows/ur-build-offload.yml
 
-    - name: Build
-      run: cmake --build ${{github.workspace}}/build -j $(sysctl -n hw.logicalcpu)
+  # macos:
+  #   name: MacOS build only
+  #   needs: [detect_changes, source_checks]
+  #   if: ${{ !cancelled() && contains(needs.detect_changes.outputs.filters, 'ur') }}
+  #   strategy:
+  #     matrix:
+  #       os: ['macos-latest']
+  #   runs-on: ${{matrix.os}}
+  #
+  #   steps:
+  #   - name: Checkout LLVM
+  #     uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+  #
+  #   - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.1.0
+  #     with:
+  #       python-version: "3.10"
+  #
+  #   - name: Install prerequisites
+  #     working-directory: ${{github.workspace}}/unified-runtime
+  #     run: |
+  #       python3 -m pip install -r third_party/requirements.txt
+  #       python3 -m pip install -r third_party/requirements_testing.txt
+  #
+  #   - name: Install hwloc
+  #     run: brew install hwloc
+  #
+  #   - name: Configure Unified Runtime project
+  #     working-directory: ${{github.workspace}}/unified-runtime
+  #     run: >
+  #       cmake
+  #       -B${{github.workspace}}/build
+  #       -DUR_ENABLE_TRACING=ON
+  #       -DUR_DEVELOPER_MODE=ON
+  #       -DCMAKE_BUILD_TYPE=Release
+  #       -DUR_BUILD_TESTS=ON
+  #       -DUR_FORMAT_CPP_STYLE=ON
+  #
+  #   - name: Build
+  #     run: cmake --build ${{github.workspace}}/build -j $(sysctl -n hw.logicalcpu)
diff --git a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp
index e051d8a33cd21..55368f6066aba 100644
--- a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp
+++ b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp
@@ -81,10 +81,19 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_MULTI_QUEUE(urUSMContextMemcpyExpTestDevice);
 
 TEST_P(urUSMContextMemcpyExpTestDevice, Success) {
   // https://github.com/intel/llvm/issues/19688
-  UUR_KNOWN_FAILURE_ON(uur::CUDA{});
-  ASSERT_SUCCESS(
-      urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size));
-  verifyData();
+  // Testing without xfail to reproduce sporadic failure
+  // Run multiple iterations to increase chance of catching race condition
+  constexpr int NumIterations = 20;
+  for (int i = 0; i < NumIterations; ++i) {
+    ASSERT_SUCCESS(
+        urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size));
+    verifyData();
+
+    // Re-initialize for next iteration
+    if (i < NumIterations - 1) {
+      initAllocations();
+    }
+  }
 }
 
 // Arbitrarily do the negative tests with device allocations. These are mostly a

From 9919525a46ee002ce237cc2d1924f12304cd78ad Mon Sep 17 00:00:00 2001
From: Katarzyna Kaczmarska <katarzynax.e.kaczmarska@intel.com>
Date: Fri, 22 May 2026 10:09:47 +0200
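Subject: [PATCH 2/3] [UR][CUDA] Fix urUSMContextMemcpyExp synchronization
 issue

urUSMContextMemcpyExp called cuMemcpy without first waiting for work
already submitted to the streams of the context. Activate the context of
the first device in hContext and call cuCtxSynchronize() before the copy.

Also drop the 20-iteration reproduction loop from the conformance test;
the test keeps running on CUDA without UUR_KNOWN_FAILURE_ON.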
---
 unified-runtime/source/adapters/cuda/usm.cpp    | 16 ++++++++++++----
 .../urUSMContextMemcpyExp.cpp                   | 17 ++++-------------
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/unified-runtime/source/adapters/cuda/usm.cpp b/unified-runtime/source/adapters/cuda/usm.cpp
index 7a56030c8978a..510f2f101f567 100644
--- a/unified-runtime/source/adapters/cuda/usm.cpp
+++ b/unified-runtime/source/adapters/cuda/usm.cpp
@@ -573,10 +573,18 @@ urUSMPoolTrimToExp(ur_context_handle_t hContext, ur_device_handle_t hDevice,
   return UR_RESULT_SUCCESS;
 }
 
-UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t,
-                                                          void *pDst,
-                                                          const void *pSrc,
-                                                          size_t Size) {
+UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(
+    ur_context_handle_t hContext, void *pDst, const void *pSrc, size_t Size) {
+  // cuMemcpy is synchronous with respect to the host, but it does not
+  // synchronize with any device streams. We need to synchronize all streams
+  // in the context before performing the copy to ensure all previous
+  // operations have completed.
+  //
+  // Set the context and synchronize all streams
+  ScopedContext Active(hContext->getDevices().front());
+  UR_CHECK_ERROR(cuCtxSynchronize());
+
+  // Now perform the synchronous copy
   UR_CHECK_ERROR(cuMemcpy((CUdeviceptr)pDst, (CUdeviceptr)pSrc, Size));
   return UR_RESULT_SUCCESS;
 }
diff --git a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp
index 55368f6066aba..7a7af397180f3 100644
--- a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp
+++ b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp
@@ -81,19 +81,10 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_MULTI_QUEUE(urUSMContextMemcpyExpTestDevice);
 
 TEST_P(urUSMContextMemcpyExpTestDevice, Success) {
   // https://github.com/intel/llvm/issues/19688
-  // Testing without xfail to reproduce sporadic failure
-  // Run multiple iterations to increase chance of catching race condition
-  constexpr int NumIterations = 20;
-  for (int i = 0; i < NumIterations; ++i) {
-    ASSERT_SUCCESS(
-        urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size));
-    verifyData();
-
-    // Re-initialize for next iteration
-    if (i < NumIterations - 1) {
-      initAllocations();
-    }
-  }
+  // CUDA adapter now synchronizes the context before the copy; no xfail.
+  ASSERT_SUCCESS(
+      urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size));
+  verifyData();
 }
 
 // Arbitrarily do the negative tests with device allocations. These are mostly a

From 2afc8c47c604cfd3535b492af2ceeefb99f31b8c Mon Sep 17 00:00:00 2001
From: Katarzyna Kaczmarska <katarzynax.e.kaczmarska@intel.com>
Date: Fri, 22 May 2026 11:04:57 +0200
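Subject: [PATCH 3/3] [UR][CUDA] Use cuMemcpyAsync with full synchronization
 for urUSMContextMemcpyExp

Replace the cuMemcpy call with cuMemcpyAsync on the default stream
followed by cuStreamSynchronize(0), so that the function returns only
after the copy has completed. The cuCtxSynchronize() call before the
copy is kept.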
---
 unified-runtime/source/adapters/cuda/usm.cpp | 27 ++++++++++++++------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/unified-runtime/source/adapters/cuda/usm.cpp b/unified-runtime/source/adapters/cuda/usm.cpp
index 510f2f101f567..aada917372b10 100644
--- a/unified-runtime/source/adapters/cuda/usm.cpp
+++ b/unified-runtime/source/adapters/cuda/usm.cpp
@@ -575,17 +575,28 @@ urUSMPoolTrimToExp(ur_context_handle_t hContext, ur_device_handle_t hDevice,
 
 UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(
     ur_context_handle_t hContext, void *pDst, const void *pSrc, size_t Size) {
-  // cuMemcpy is synchronous with respect to the host, but it does not
-  // synchronize with any device streams. We need to synchronize all streams
-  // in the context before performing the copy to ensure all previous
-  // operations have completed.
+  // cuMemcpy for device-to-device copies may NOT synchronize with the host
+  // or with other streams. According to CUDA documentation, device-to-device
+  // copies using cuMemcpy can execute asynchronously.
   //
-  // Set the context and synchronize all streams
+  // Solution:
+  // 1. Synchronize all streams in context to ensure prior operations complete
+  // 2. Use cuMemcpyAsync with default stream for the copy
+  // 3. Synchronize the stream to ensure copy completes before returning
+  //
+  // Set the context
   ScopedContext Active(hContext->getDevices().front());
+
+  // Ensure all pending operations in all streams have completed
   UR_CHECK_ERROR(cuCtxSynchronize());
-
-  // Now perform the synchronous copy
-  UR_CHECK_ERROR(cuMemcpy((CUdeviceptr)pDst, (CUdeviceptr)pSrc, Size));
+
+  // Perform the copy using async API with default stream
+  UR_CHECK_ERROR(
+      cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, Size, 0));
+
+  // Synchronize the stream to ensure the copy has completed
+  UR_CHECK_ERROR(cuStreamSynchronize(0));
+
   return UR_RESULT_SUCCESS;
 }