From 177edf20321132e90473f3b2a1f1824cdf503e40 Mon Sep 17 00:00:00 2001 From: Katarzyna Kaczmarska Date: Fri, 22 May 2026 09:43:08 +0200 Subject: [PATCH 1/3] [TEST] Reproduce urUSMContextMemcpyExp CUDA failure This commit sets up CI to reproduce the sporadic failure in urUSMContextMemcpyExpTestDevice::Success on CUDA. Changes: - Remove UUR_KNOWN_FAILURE_ON(uur::CUDA{}) from test - Add a loop of 20 iterations to increase the probability of catching the race condition (the test fails roughly once every 3-5 runs) - Re-initialize allocations between iterations - Comment out non-CUDA jobs in ur-precommit.yml to minimize CI run time and focus only on CUDA testing --- .github/workflows/ur-precommit.yml | 148 +++++++++--------- .../urUSMContextMemcpyExp.cpp | 17 +- 2 files changed, 87 insertions(+), 78 deletions(-) diff --git a/.github/workflows/ur-precommit.yml b/.github/workflows/ur-precommit.yml index 403ac4d285826..6437a9e291485 100644 --- a/.github/workflows/ur-precommit.yml +++ b/.github/workflows/ur-precommit.yml @@ -56,39 +56,39 @@ jobs: # Extra native CPU jobs are here to force the loader to be used. # UR will not use the loader if there is only one target. 
include: - - name: L0 - runner: UR_L0 - image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN - - name: L0_V2 - runner: UR_L0 - image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN - - name: L0 - runner: UR_L0 - static: ON - image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN - - name: L0 - runner: UR_L0 - other_adapter: NATIVE_CPU - image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN - - name: L0_V2 - runner: UR_L0_BMG - image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN + # - name: L0 + # runner: UR_L0 + # image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN + # - name: L0_V2 + # runner: UR_L0 + # image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN + # - name: L0 + # runner: UR_L0 + # static: ON + # image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN + # - name: L0 + # runner: UR_L0 + # other_adapter: NATIVE_CPU + # image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN + # - name: L0_V2 + # runner: UR_L0_BMG + # image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN - name: CUDA runner: UR_CUDA image_options: -u 1001 --privileged --cap-add SYS_ADMIN --gpus all - - name: OPENCL - runner: UR_OPENCL - docker_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest" - image_options: -u 1001 --device=/dev/dri --device=/dev/kfd --privileged --cap-add SYS_ADMIN - - name: OPENCL - runner: UR_OPENCL - other_adapter: NATIVE_CPU - docker_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest" - image_options: -u 
1001 --device=/dev/dri --device=/dev/kfd --privileged --cap-add SYS_ADMIN - - name: NATIVE_CPU - runner: UR_NATIVE_CPU - docker_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest" - image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN + # - name: OPENCL + # runner: UR_OPENCL + # docker_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest" + # image_options: -u 1001 --device=/dev/dri --device=/dev/kfd --privileged --cap-add SYS_ADMIN + # - name: OPENCL + # runner: UR_OPENCL + # other_adapter: NATIVE_CPU + # docker_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest" + # image_options: -u 1001 --device=/dev/dri --device=/dev/kfd --privileged --cap-add SYS_ADMIN + # - name: NATIVE_CPU + # runner: UR_NATIVE_CPU + # docker_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest" + # image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN uses: ./.github/workflows/ur-build-hw.yml with: adapter_name: ${{ matrix.name }} @@ -100,48 +100,48 @@ jobs: image_options: ${{ matrix.image_options || '' }} install_igc_driver: ${{ contains(needs.detect_changes.outputs.filters, 'drivers') }} - offload_build: - name: Adapters (Offload) - needs: [detect_changes, source_checks] - if: ${{ !cancelled() && contains(needs.detect_changes.outputs.filters, 'ur_offload_adapter') }} - uses: ./.github/workflows/ur-build-offload.yml - - macos: - name: MacOS build only - needs: [detect_changes, source_checks] - if: ${{ !cancelled() && contains(needs.detect_changes.outputs.filters, 'ur') }} - strategy: - matrix: - os: ['macos-latest'] - runs-on: ${{matrix.os}} - - steps: - - name: Checkout LLVM - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.1.0 - with: - python-version: "3.10" - - - name: Install prerequisites - working-directory: ${{github.workspace}}/unified-runtime - run: | - python3 -m pip install -r third_party/requirements.txt - python3 -m pip install -r 
third_party/requirements_testing.txt - - - name: Install hwloc - run: brew install hwloc - - - name: Configure Unified Runtime project - working-directory: ${{github.workspace}}/unified-runtime - run: > - cmake - -B${{github.workspace}}/build - -DUR_ENABLE_TRACING=ON - -DUR_DEVELOPER_MODE=ON - -DCMAKE_BUILD_TYPE=Release - -DUR_BUILD_TESTS=ON - -DUR_FORMAT_CPP_STYLE=ON + # offload_build: + # name: Adapters (Offload) + # needs: [detect_changes, source_checks] + # if: ${{ !cancelled() && contains(needs.detect_changes.outputs.filters, 'ur_offload_adapter') }} + # uses: ./.github/workflows/ur-build-offload.yml - - name: Build - run: cmake --build ${{github.workspace}}/build -j $(sysctl -n hw.logicalcpu) + # macos: + # name: MacOS build only + # needs: [detect_changes, source_checks] + # if: ${{ !cancelled() && contains(needs.detect_changes.outputs.filters, 'ur') }} + # strategy: + # matrix: + # os: ['macos-latest'] + # runs-on: ${{matrix.os}} + # + # steps: + # - name: Checkout LLVM + # uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + # + # - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.1.0 + # with: + # python-version: "3.10" + # + # - name: Install prerequisites + # working-directory: ${{github.workspace}}/unified-runtime + # run: | + # python3 -m pip install -r third_party/requirements.txt + # python3 -m pip install -r third_party/requirements_testing.txt + # + # - name: Install hwloc + # run: brew install hwloc + # + # - name: Configure Unified Runtime project + # working-directory: ${{github.workspace}}/unified-runtime + # run: > + # cmake + # -B${{github.workspace}}/build + # -DUR_ENABLE_TRACING=ON + # -DUR_DEVELOPER_MODE=ON + # -DCMAKE_BUILD_TYPE=Release + # -DUR_BUILD_TESTS=ON + # -DUR_FORMAT_CPP_STYLE=ON + # + # - name: Build + # run: cmake --build ${{github.workspace}}/build -j $(sysctl -n hw.logicalcpu) diff --git a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp 
b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp index e051d8a33cd21..55368f6066aba 100644 --- a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp +++ b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp @@ -81,10 +81,19 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_MULTI_QUEUE(urUSMContextMemcpyExpTestDevice); TEST_P(urUSMContextMemcpyExpTestDevice, Success) { // https://github.com/intel/llvm/issues/19688 - UUR_KNOWN_FAILURE_ON(uur::CUDA{}); - ASSERT_SUCCESS( - urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size)); - verifyData(); + // Testing without xfail to reproduce sporadic failure + // Run multiple iterations to increase chance of catching race condition + constexpr int NumIterations = 20; + for (int i = 0; i < NumIterations; ++i) { + ASSERT_SUCCESS( + urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size)); + verifyData(); + + // Re-initialize for next iteration + if (i < NumIterations - 1) { + initAllocations(); + } + } } // Arbitrarily do the negative tests with device allocations. 
These are mostly a From 9919525a46ee002ce237cc2d1924f12304cd78ad Mon Sep 17 00:00:00 2001 From: Katarzyna Kaczmarska Date: Fri, 22 May 2026 10:09:47 +0200 Subject: [PATCH 2/3] [UR][CUDA] Fix urUSMContextMemcpyExp synchronization issue --- unified-runtime/source/adapters/cuda/usm.cpp | 16 ++++++++++++---- .../urUSMContextMemcpyExp.cpp | 17 ++++------------- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/unified-runtime/source/adapters/cuda/usm.cpp b/unified-runtime/source/adapters/cuda/usm.cpp index 7a56030c8978a..510f2f101f567 100644 --- a/unified-runtime/source/adapters/cuda/usm.cpp +++ b/unified-runtime/source/adapters/cuda/usm.cpp @@ -573,10 +573,18 @@ urUSMPoolTrimToExp(ur_context_handle_t hContext, ur_device_handle_t hDevice, return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(ur_context_handle_t, - void *pDst, - const void *pSrc, - size_t Size) { +UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp( + ur_context_handle_t hContext, void *pDst, const void *pSrc, size_t Size) { + // cuMemcpy is synchronous with respect to the host, but it does not + // synchronize with any device streams. We need to synchronize all streams + // in the context before performing the copy to ensure all previous + // operations have completed. 
+ // + // Set the context and synchronize all streams + ScopedContext Active(hContext->getDevices().front()); + UR_CHECK_ERROR(cuCtxSynchronize()); + + // Now perform the synchronous copy UR_CHECK_ERROR(cuMemcpy((CUdeviceptr)pDst, (CUdeviceptr)pSrc, Size)); return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp index 55368f6066aba..7a7af397180f3 100644 --- a/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp +++ b/unified-runtime/test/conformance/exp_usm_context_memcpy/urUSMContextMemcpyExp.cpp @@ -81,19 +81,10 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE_MULTI_QUEUE(urUSMContextMemcpyExpTestDevice); TEST_P(urUSMContextMemcpyExpTestDevice, Success) { // https://github.com/intel/llvm/issues/19688 - // Testing without xfail to reproduce sporadic failure - // Run multiple iterations to increase chance of catching race condition - constexpr int NumIterations = 20; - for (int i = 0; i < NumIterations; ++i) { - ASSERT_SUCCESS( - urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size)); - verifyData(); - - // Re-initialize for next iteration - if (i < NumIterations - 1) { - initAllocations(); - } - } + // Fixed by adding cuCtxSynchronize() before cuMemcpy in CUDA adapter + ASSERT_SUCCESS( + urUSMContextMemcpyExp(context, dst_ptr, src_ptr, allocation_size)); + verifyData(); } // Arbitrarily do the negative tests with device allocations. 
These are mostly a From 2afc8c47c604cfd3535b492af2ceeefb99f31b8c Mon Sep 17 00:00:00 2001 From: Katarzyna Kaczmarska Date: Fri, 22 May 2026 11:04:57 +0200 Subject: [PATCH 3/3] [UR][CUDA] Use cuMemcpyAsync with full synchronization for urUSMContextMemcpyExp --- unified-runtime/source/adapters/cuda/usm.cpp | 27 ++++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/unified-runtime/source/adapters/cuda/usm.cpp b/unified-runtime/source/adapters/cuda/usm.cpp index 510f2f101f567..aada917372b10 100644 --- a/unified-runtime/source/adapters/cuda/usm.cpp +++ b/unified-runtime/source/adapters/cuda/usm.cpp @@ -575,17 +575,28 @@ urUSMPoolTrimToExp(ur_context_handle_t hContext, ur_device_handle_t hDevice, UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp( ur_context_handle_t hContext, void *pDst, const void *pSrc, size_t Size) { - // cuMemcpy is synchronous with respect to the host, but it does not - // synchronize with any device streams. We need to synchronize all streams - // in the context before performing the copy to ensure all previous - // operations have completed. + // cuMemcpy for device-to-device copies may NOT synchronize with the host + // or with other streams. According to CUDA documentation, device-to-device + // copies using cuMemcpy can execute asynchronously. // - // Set the context and synchronize all streams + // Solution: + // 1. Synchronize all streams in context to ensure prior operations complete + // 2. Use cuMemcpyAsync with default stream for the copy + // 3. 
Synchronize the stream to ensure copy completes before returning + // + // Set the context ScopedContext Active(hContext->getDevices().front()); + + // Ensure all pending operations in all streams have completed UR_CHECK_ERROR(cuCtxSynchronize()); - - // Now perform the synchronous copy - UR_CHECK_ERROR(cuMemcpy((CUdeviceptr)pDst, (CUdeviceptr)pSrc, Size)); + + // Perform the copy using async API with default stream + UR_CHECK_ERROR( + cuMemcpyAsync((CUdeviceptr)pDst, (CUdeviceptr)pSrc, Size, 0)); + + // Synchronize the stream to ensure the copy has completed + UR_CHECK_ERROR(cuStreamSynchronize(0)); + return UR_RESULT_SUCCESS; }