diff --git a/.gitignore b/.gitignore index d329731..6278da7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,8 @@ .*.sw* build +build-*/ tags ishmem/config.h +*.pyc +__pycache__/ +*.egg-info/ diff --git a/CMakeLists.txt b/CMakeLists.txt index abb9ab4..add742b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,8 @@ option(BUILD_UNIT_TESTS "Build unit tests" FALSE) option(BUILD_PERF_TESTS "Build performance tests" FALSE) option(BUILD_EXAMPLES "Build examples" FALSE) option(BUILD_APPS "Build apps" FALSE) +option(BUILD_PYTHON_BINDINGS "Build Python bindings" FALSE) +option(ISHMEM4PY_BUILD_TORCH_INTEROP "Build optional Torch/XPU interop for ishmem4py" FALSE) option(BUILD_CMAKE_CONFIG "Build CMake config files" TRUE) option(ENABLE_ERROR_CHECKING "Validate API inputs" FALSE) option(ENABLE_DLMALLOC "Enable dlmalloc for shared heap" TRUE) @@ -79,6 +81,8 @@ message(STATUS "Build unit tests: ${BUILD_UNIT_TESTS}") message(STATUS "Build performance tests: ${BUILD_PERF_TESTS}") message(STATUS "Build examples: ${BUILD_EXAMPLES}") message(STATUS "Build apps: ${BUILD_APPS}") +message(STATUS "Build Python bindings: ${BUILD_PYTHON_BINDINGS}") +message(STATUS "ishmem4py Torch interop: ${ISHMEM4PY_BUILD_TORCH_INTEROP}") message(STATUS "Build CMake configs: ${BUILD_CMAKE_CONFIG}") message(STATUS "Enable input validation: ${ENABLE_ERROR_CHECKING}") message(STATUS "Enable dlmalloc: ${ENABLE_DLMALLOC}") @@ -161,6 +165,10 @@ add_subdirectory(src) # Build options for tests are handled directly in test/CMakeLists.txt add_subdirectory(test) +if (BUILD_PYTHON_BINDINGS) + add_subdirectory(ishmem4py) +endif() + if (BUILD_EXAMPLES) add_subdirectory(examples) endif() diff --git a/README.md b/README.md index b2822e1..3cedcd2 100644 --- a/README.md +++ b/README.md @@ -146,8 +146,8 @@ ISHMEM_RUNTIME= mpiexec.hydra -n 2 -hosts ./scripts - *Note:* Current supported launchers include: MPI process launchers (i.e. `mpiexec`, `mpiexec.hydra`, `mpirun`, etc.), Slurm (i.e. 
`srun`, `salloc`, etc.), and PBS (i.e. `qsub`). -- *Note:* Intel® SHMEM execution model requires applications to use a 1:1 mapping between PEs and GPU devices. Attempting to run an application without the `ishmrun` launch script may result in failure if this mapping is not maintained. - - For further details on device selection, please see [the ONEAPI_DEVICE_SELECTOR](https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#oneapi_device_selector). +- *Note:* Intel® SHMEM execution model requires applications to use a 1:1 mapping between PEs and GPU devices. The default `ishmrun` launcher maintains that mapping by restricting each PE to a single visible GPU device. When multiple GPUs remain visible to a PE, set `ishmemx_attr_t.device_id` explicitly before calling `ishmemx_init_attr`. + - For further details on launcher-based device selection, please see [the ONEAPI_DEVICE_SELECTOR](https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#oneapi_device_selector). 3. Validate the application ran successfully; example output: diff --git a/docs/source/conf.py b/docs/source/conf.py index d60fa83..47df9cd 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -10,6 +10,8 @@ # -- Packages ---------------------------------------------------------------- import datetime +import os +import sys # -- Project information ----------------------------------------------------- @@ -24,9 +26,16 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. +sys.path.insert(0, os.path.abspath('../../ishmem4py/python')) + extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', ] +autosummary_generate = False +autodoc_member_order = 'bysource' + # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
diff --git a/docs/source/execution_model.rst b/docs/source/execution_model.rst index f9bfdcf..06bd01b 100644 --- a/docs/source/execution_model.rst +++ b/docs/source/execution_model.rst @@ -21,7 +21,9 @@ library`. that assigns the environment variable **ZE_AFFINITY_MASK** so that each PE is assigned a single SYCL device. Usage of this script is described in Section :ref:`Compiling and Running - Programs`. + Programs`. When multiple GPUs remain + visible to a PE, applications must select one explicitly with + ``ishmemx_attr_t.device_id`` before calling ``ishmemx_init_attr``. .. note:: Intel® Data Center GPU Max Series devices utilize a multi-tile architecture (as of Intel® SHMEM v1.0.0 with 1 or 2 tiles). By default, @@ -110,4 +112,3 @@ operations, see :ref:`Memory Ordering`. .. segment. .. For example, objects located in the symmetric data segment and objects .. located in the symmetric heap can be provided as arguments to the same OpenSHMEM operation. - diff --git a/docs/source/index.rst b/docs/source/index.rst index 6520a62..bc509f3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -45,6 +45,12 @@ Intel® SHMEM memory_ordering utility +.. toctree:: + :maxdepth: 2 + :caption: Python Bindings + + ishmem4py + .. --------------- .. Team Management .. --------------- diff --git a/docs/source/ishmem4py.rst b/docs/source/ishmem4py.rst new file mode 100644 index 0000000..51e3780 --- /dev/null +++ b/docs/source/ishmem4py.rst @@ -0,0 +1,297 @@ +.. _ishmem4py: + +-------------------- +ishmem4py Bindings +-------------------- + +``ishmem4py`` is the in-tree Python interface for Intel(R) SHMEM. +It targets the same host-side workflow that the C and C++ Intel SHMEM APIs already support, +while adopting a small, Python-oriented surface inspired by ``nvshmem4py`` where the models +overlap. + +The current binding is intentionally host-driven. 
+It now exposes queue-based host ``put`` and ``get`` operations for XPU workflows, but it does +not yet expose device-initiated Python APIs. + +.. currentmodule:: ishmem4py + +Import styles: + +.. code-block:: python + + import ishmem4py as ishmem + # or + import ishmem4py.core as ishmem + +API Reference +^^^^^^^^^^^^^ + +Initialization and Queries +"""""""""""""""""""""""""" + +.. autosummary:: + + init + finalize + init_status + is_initialized + my_pe + n_pes + info_get_version + info_get_name + get_version + Version + +Memory Management +""""""""""""""""" + +.. autosummary:: + + malloc + calloc + buffer + free + tensor + free_tensor + get_peer_tensor + tensor_base + is_symmetric_tensor + ptr + ishmem_ptr + SymmetricMemory + MemoryPointer + +Remote Memory Access +"""""""""""""""""""" + +.. autosummary:: + + put + get + putmem + getmem + fence + quiet + +Collectives +""""""""""" + +.. autosummary:: + + barrier_all + sync_all + barrier + sync + broadcast + collect + fcollect + alltoall + reduce + reducescatter + +Teams +""""" + +.. autosummary:: + + TEAM_WORLD + TEAM_SHARED + TEAM_INVALID + Team + team_my_pe + team_n_pes + team_translate_pe + team_sync + +Basic Example +^^^^^^^^^^^^^ + +.. literalinclude:: ../../ishmem4py/examples/init_fini.py + :language: python + +RMA Example +^^^^^^^^^^^ + +.. literalinclude:: ../../ishmem4py/examples/ring_put_get.py + :language: python + +Memory Model +^^^^^^^^^^^^ + +``ishmem4py`` keeps symmetric allocations as explicit Python objects instead of exposing the +symmetric heap as a general Python buffer. +That is deliberate: + +- Intel SHMEM symmetric memory may not be directly host-accessible in every runtime mode. +- Explicit ``put`` and ``get`` calls are the portable host-side mechanism. +- Python can still use ordinary local buffers such as ``bytes``, ``bytearray``, and + ``memoryview`` as the source or destination for host-initiated RMA. + +Example: + +.. 
code-block:: python + + ishmem.init() + try: + buf = ishmem.malloc(16) + buf.write(b"abcd") + + host = bytearray(16) + ishmem.get(host, buf, pe=ishmem.my_pe()) + finally: + ishmem.free(buf) + ishmem.finalize() + +When multiple XPU devices are visible to a process, pass ``device_id=...`` to +``ishmem.init(...)`` to select the visible device ordinal for Intel SHMEM. +``device_id`` may be an integer visible-device ordinal, a string like +``"xpu:3"``, or a ``torch.device("xpu", 3)``: + +.. code-block:: python + + local_rank = int(os.environ["MPI_LOCALRANKID"]) + ishmem.init(device_id=local_rank) + +Torch/XPU interop is also available as an optional build-time feature. +Those helpers allocate symmetric memory directly as ``torch.Tensor`` objects backed by the +Intel SHMEM heap, which is useful for host-driven one-sided XPU workflows: + +.. code-block:: python + + import torch + import ishmem4py as ishmem + + ishmem.init() + try: + src = ishmem.tensor((4,), dtype=torch.float32, device="xpu") + dst = ishmem.tensor((4,), dtype=torch.float32, device="xpu") + + src.fill_(7.0) + stream = torch.xpu.Stream() + ishmem.put(dst, src, pe=ishmem.my_pe(), queue=stream) + ishmem.quiet(queue=stream) + finally: + ishmem.free_tensor(dst) + ishmem.free_tensor(src) + ishmem.finalize() + +``get_peer_tensor`` provides the tensor-level analogue of ``ishmem_ptr`` for Torch/XPU interop: + +.. code-block:: python + + peer = ishmem.get_peer_tensor(src, pe=neighbor) + peer.add_(update) + +The returned tensor is a non-owning XPU alias of a remote symmetric allocation. It is valid for +device-side loads and stores, but must not be passed to ``ishmem.free_tensor(...)``. + +When torch interop is built, the top-level module exposes ``tensor``, ``free_tensor``, +``get_peer_tensor``, ``tensor_base``, and ``is_symmetric_tensor`` lazily so that plain +``import ishmem4py`` does +not require importing PyTorch up front. 
+ +Queue-Based RMA +^^^^^^^^^^^^^^^ + +``ishmem4py.put`` and ``ishmem4py.get`` accept an optional ``queue=...`` argument for +host-initiated queue-based Intel SHMEM operations. +This is intended for XPU workflows that pass a ``torch.xpu.Stream`` or a raw SYCL queue handle. + +Completion remains host-driven: after issuing queue-based RMA, wait for the queue and then call +``ishmem.quiet(queue=stream)``. + +Example: + +.. code-block:: python + + stream = torch.xpu.Stream() + ishmem.get(dst, src, pe=peer, queue=stream) + ishmem.quiet(queue=stream) + +Collectives and Teams +^^^^^^^^^^^^^^^^^^^^^ + +World-team collectives are available directly from Python: + +.. code-block:: python + + reduce_src = ishmem.malloc(4) + reduce_dst = ishmem.calloc(1, 4) + + reduce_src.write((ishmem.my_pe() + 1).to_bytes(4, "little", signed=True)) + ishmem.reduce("sum", reduce_dst, reduce_src, dtype="int32") + +``reducescatter`` is also available for API compatibility with ``nvshmem4py``. +Intel SHMEM does not currently expose a host-side primitive for it, so ``ishmem4py`` implements +that operation as a small software fallback on top of ``reduce``. + +The binding also exposes team objects and team query or synchronization routines. +Those APIs follow Intel SHMEM handles rather than CUDA-object wrappers. + +Building and Installing +^^^^^^^^^^^^^^^^^^^^^^^ + +Build Intel SHMEM with Python bindings enabled: + +.. code-block:: bash + + cmake -S /path/to/ishmem -B /path/to/build -DBUILD_PYTHON_BINDINGS=ON ... + cmake --build /path/to/build --target ishmem4py -j4 + +To include optional Torch/XPU symmetric tensor support in the package, enable the extra build: + +.. code-block:: bash + + cmake -S /path/to/ishmem -B /path/to/build \ + -DBUILD_PYTHON_BINDINGS=ON \ + -DISHMEM4PY_BUILD_TORCH_INTEROP=ON \ + -DPython3_EXECUTABLE=/path/to/python-from-torch-xpu-env \ + -DCMAKE_PREFIX_PATH="$(python -c 'import torch; print(torch.utils.cmake_prefix_path)')" ... 
+ cmake --build /path/to/build --target ishmem4py -j4 + +Install from the build tree: + +.. code-block:: bash + + pip install /path/to/build/ishmem4py/python + +For editable development, install the source tree and point it at the built runtime: + +.. code-block:: bash + + pip install -e /path/to/ishmem/ishmem4py/python + export ISHMEM4PY_RUNTIME_LIBRARY=/path/to/build/ishmem4py/python/ishmem4py/_ishmem4py_runtime.so + +Testing +^^^^^^^ + +Typical source-tree test environment: + +.. code-block:: bash + + unset ISHMEM_DIR + export PYTHONPATH=/path/to/ishmem/ishmem4py/python:/path/to/ishmem/ishmem4py/test + export ISHMEM4PY_RUNTIME_LIBRARY=/path/to/build/ishmem4py/python/ishmem4py/_ishmem4py_runtime.so + export LD_LIBRARY_PATH=/path/to/openshmem/lib:$LD_LIBRARY_PATH + export ISHMEM_RUNTIME=OPENSHMEM + +Examples: + +.. code-block:: bash + + mpiexec -n 1 ishmrun python3 /path/to/ishmem/ishmem4py/test/smoke_test.py + mpiexec -n 2 ishmrun python3 /path/to/ishmem/ishmem4py/test/ring_test.py + mpiexec -n 2 ishmrun python3 /path/to/ishmem/ishmem4py/test/collective_test.py + mpiexec -n 1 ishmrun python3 /path/to/ishmem/ishmem4py/test/torch_tensor_test.py + mpiexec -n 2 ishmrun python3 /path/to/ishmem/ishmem4py/test/torch_queue_get_test.py + +Use real script files for multi-PE validation rather than ``python - <<'PY'`` heredoc launches; +the latter can appear to hang under ``mpiexec`` because of stdin interaction with the process +manager. + +Current Scope +^^^^^^^^^^^^^ + +The exported Python API is limited to functionality that has been validated in this environment. +Host atomics and dynamic team-split helpers were intentionally removed from the public bindings +after backend-level failures in the current OpenSHMEM runtime. 
diff --git a/docs/source/library_setup_exit_query.rst b/docs/source/library_setup_exit_query.rst index 3374fd1..222acb7 100644 --- a/docs/source/library_setup_exit_query.rst +++ b/docs/source/library_setup_exit_query.rst @@ -58,6 +58,7 @@ OpenSHMEM, MPI, or PMI. .. c:var:: ishmemx_runtime_type_t runtime .. c:var:: bool initialize_runtime = true .. c:var:: bool gpu = true + .. c:var:: int device_id = -1 .. c:var:: void *mpi_comm **Description:** @@ -68,7 +69,11 @@ library. By default, the parallel runtime is initialized by Intel® SHMEM (**initialize_runtime** default is ``true``). The **gpu** boolean indicates whether to use GPU memory for the symmetric -heap (default is ``true``). **mpi_comm** is a pointer to the corresponding +heap (default is ``true``). ``device_id`` selects the visible GPU ordinal to +use when multiple GPU devices are visible to a PE. The default value of ``-1`` +preserves the traditional single-visible-device behavior: Intel® SHMEM will +auto-select the device only when exactly one GPU is visible. **mpi_comm** is a +pointer to the corresponding MPI communicator for representing ``ISHMEM_TEAM_WORLD`` when used with ``ISHMEM_RUNTIME_MPI`` (default is ``MPI_COMM_WORLD``). @@ -283,4 +288,3 @@ character. If the **name** memory buffer is provided with size less than ISHMEM_MAX_NAME_LEN, behavior is undefined. For a given library implementation, the vendor string returned is consistent with the library constant ISHMEM_VENDOR_STRING. - diff --git a/examples/3_library_apis.cpp b/examples/3_library_apis.cpp index a25836a..33758b2 100644 --- a/examples/3_library_apis.cpp +++ b/examples/3_library_apis.cpp @@ -16,8 +16,10 @@ int main() << std::endl; /* Initialize ISHMEM - * The ISHMEM launch script will set things up so that ishmem uses - * the same GPU device as the SYCL queue above + * The default ishmrun launch script will set things up so that ishmem uses + * the same GPU device as the SYCL queue above. 
When multiple GPUs are + * visible to each PE, use ishmemx_attr_t.device_id to select the device + * explicitly before calling ishmemx_init_attr(). */ ishmem_init(); diff --git a/ishmem4py/CMakeLists.txt b/ishmem4py/CMakeLists.txt new file mode 100644 index 0000000..0de2aae --- /dev/null +++ b/ishmem4py/CMakeLists.txt @@ -0,0 +1,120 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +cmake_minimum_required(VERSION 3.17) + +set(PROJECT_NAME "ishmem4py") +set(PROJECT_FULL_NAME "Intel(R) SHMEM Python bindings") + +project(${PROJECT_NAME}) + +find_package(Python3 COMPONENTS Interpreter REQUIRED) + +set(ISHMEM4PY_BUILD_ROOT "${CMAKE_CURRENT_BINARY_DIR}") +set(ISHMEM4PY_PYTHON_BUILD_ROOT "${ISHMEM4PY_BUILD_ROOT}/python") +set(ISHMEM4PY_PKG_BUILD_DIR "${ISHMEM4PY_PYTHON_BUILD_ROOT}/ishmem4py") +set(ISHMEM4PY_SOURCE_PYTHON_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/python") +set(ISHMEM4PY_SOURCE_PKG_DIR "${ISHMEM4PY_SOURCE_PYTHON_ROOT}/ishmem4py") + +add_custom_target(ishmem4py-package-files + COMMAND ${CMAKE_COMMAND} -E make_directory "${ISHMEM4PY_PYTHON_BUILD_ROOT}" + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${ISHMEM4PY_SOURCE_PYTHON_ROOT}/pyproject.toml" + "${ISHMEM4PY_PYTHON_BUILD_ROOT}/pyproject.toml" + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${ISHMEM4PY_SOURCE_PYTHON_ROOT}/README.md" + "${ISHMEM4PY_PYTHON_BUILD_ROOT}/README.md" + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${ISHMEM4PY_SOURCE_PYTHON_ROOT}/setup.py" + "${ISHMEM4PY_PYTHON_BUILD_ROOT}/setup.py" + COMMAND ${CMAKE_COMMAND} -E copy_directory + "${CMAKE_CURRENT_SOURCE_DIR}/test" + "${ISHMEM4PY_BUILD_ROOT}/test" + COMMAND ${CMAKE_COMMAND} -E copy_directory + "${CMAKE_CURRENT_SOURCE_DIR}/examples" + "${ISHMEM4PY_BUILD_ROOT}/examples" + COMMAND ${CMAKE_COMMAND} -E copy_directory + "${ISHMEM4PY_SOURCE_PKG_DIR}" + "${ISHMEM4PY_PKG_BUILD_DIR}" + VERBATIM) + +add_library(ishmem4py-runtime SHARED + csrc/ishmem4py_runtime.cpp) + 
+target_include_directories(ishmem4py-runtime PRIVATE + "${SYCL_INCLUDE_SYCL_DIR}" + "${LEVEL_ZERO_INCLUDEDIR}" + "${ISHMEM_BUILD_ROOT}/include" + "${PROJECT_SOURCE_DIR}/src" + "${CMAKE_CURRENT_SOURCE_DIR}/csrc") + +target_link_libraries(ishmem4py-runtime PRIVATE ishmem-static) +set_target_properties(ishmem4py-runtime PROPERTIES + OUTPUT_NAME "_ishmem4py_runtime" + PREFIX "" + LIBRARY_OUTPUT_DIRECTORY "${ISHMEM4PY_PKG_BUILD_DIR}" + RUNTIME_OUTPUT_DIRECTORY "${ISHMEM4PY_PKG_BUILD_DIR}" + POSITION_INDEPENDENT_CODE 1) + +if (ISHMEM4PY_BUILD_TORCH_INTEROP) + find_package(Python3 COMPONENTS Development.Module REQUIRED) + + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import torch; print(torch.utils.cmake_prefix_path)" + OUTPUT_VARIABLE TORCH_CMAKE_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE TORCH_PREFIX_RESULT) + if (NOT TORCH_PREFIX_RESULT EQUAL 0) + message(FATAL_ERROR + "ISHMEM4PY_BUILD_TORCH_INTEROP=ON requires PyTorch to be importable from " + "Python3_EXECUTABLE (${Python3_EXECUTABLE})") + endif() + + execute_process( + COMMAND ${Python3_EXECUTABLE} -c + "import torch.utils.cpp_extension as ce; print(ce.library_paths()[0])" + OUTPUT_VARIABLE TORCH_LIBRARY_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE TORCH_LIBRARY_RESULT) + if (NOT TORCH_LIBRARY_RESULT EQUAL 0) + message(FATAL_ERROR "failed to determine the PyTorch library directory") + endif() + + list(PREPEND CMAKE_PREFIX_PATH "${TORCH_CMAKE_PREFIX}") + find_package(Torch REQUIRED) + + add_library(ishmem4py-torch MODULE + csrc/ishmem4py_torch.cpp) + + target_include_directories(ishmem4py-torch PRIVATE + "${CMAKE_CURRENT_SOURCE_DIR}/csrc") + target_link_directories(ishmem4py-torch PRIVATE "${TORCH_LIBRARY_DIR}") + target_link_libraries(ishmem4py-torch PRIVATE + Python3::Module + ishmem4py-runtime + torch_python + ${TORCH_LIBRARIES}) + target_compile_features(ishmem4py-torch PRIVATE cxx_std_17) + target_compile_options(ishmem4py-torch PRIVATE ${TORCH_CXX_FLAGS}) + 
set_target_properties(ishmem4py-torch PROPERTIES + OUTPUT_NAME "_ishmem4py_torch" + PREFIX "" + BUILD_RPATH "${TORCH_LIBRARY_DIR}" + LIBRARY_OUTPUT_DIRECTORY "${ISHMEM4PY_PKG_BUILD_DIR}" + RUNTIME_OUTPUT_DIRECTORY "${ISHMEM4PY_PKG_BUILD_DIR}" + POSITION_INDEPENDENT_CODE 1) +endif() + +add_custom_target(ishmem4py ALL + DEPENDS ishmem4py-package-files ishmem4py-runtime) + +if (ISHMEM4PY_BUILD_TORCH_INTEROP) + add_dependencies(ishmem4py ishmem4py-torch) +endif() + +message(STATUS "\n======= ishmem4py summary ========") +message(STATUS "Python interpreter: ${Python3_EXECUTABLE}") +message(STATUS "Python package root: ${ISHMEM4PY_PYTHON_BUILD_ROOT}") +message(STATUS "Runtime library output: ${ISHMEM4PY_PKG_BUILD_DIR}") +message(STATUS "Torch/XPU interop enabled: ${ISHMEM4PY_BUILD_TORCH_INTEROP}") +message(STATUS "==================================\n") diff --git a/ishmem4py/README.md b/ishmem4py/README.md new file mode 100644 index 0000000..7db42fe --- /dev/null +++ b/ishmem4py/README.md @@ -0,0 +1,125 @@ +# ishmem4py + +`ishmem4py` is the in-tree Python binding layer for Intel SHMEM. + +The binding stays close to Intel SHMEM/OpenSHMEM naming where that produces a clear Python API, +but it also mirrors the high-level shape of `nvshmem4py` where practical: + +- synchronous host-side `put` / `get` +- symmetric-memory handles managed from Python +- world/team queries and team objects +- host collectives over symmetric buffers + +The current implementation is intentionally host-driven. Stream-based and device-initiated APIs +remain future work. 
+ +## Public API + +Main imports: + +```python +import ishmem4py as ishmem +# or +import ishmem4py.core as ishmem +``` + +Primary entry points: + +- setup and queries + - `init`, `finalize`, `init_status`, `is_initialized` + - `my_pe`, `n_pes` + - `info_get_version`, `info_get_name`, `get_version` +- symmetric memory + - `malloc`, `calloc`, `buffer`, `free` + - `SymmetricMemory` + - `ptr`, `ishmem_ptr` +- RMA + - `put`, `get`, `quiet`, `fence` + - `putmem`, `getmem` compatibility aliases +- collectives + - `barrier_all`, `sync_all` + - `barrier`, `sync` + - `broadcast`, `collect`, `fcollect`, `alltoall`, `reduce`, `reducescatter` + - `reducescatter` is implemented as a software fallback on top of `reduce` +- teams + - `TEAM_WORLD`, `TEAM_SHARED`, `TEAM_INVALID` + - `Team` + - `team_my_pe`, `team_n_pes`, `team_translate_pe`, `team_sync` + +Supported reduction dtypes: + +- `int32` +- `int64` +- `uint32` +- `uint64` +- `float32` and `float64` for arithmetic reductions + +## Design Notes + +`ishmem4py` keeps symmetric allocations as explicit Python objects instead of exposing the +symmetric heap as a direct Python buffer. That matches Intel SHMEM more closely than pretending +every symmetric allocation is safely CPU-dereferenceable. 
+ +Data movement is therefore centered on ordinary Python buffers and explicit SHMEM operations: + +```python +buf = ishmem.malloc(16) +buf.write(b"abcd") + +host = bytearray(16) +ishmem.get(host, buf, pe=0) +``` + +## Build + +Typical build: + +```bash +source /opt/intel/oneapi/setvars.sh + +cmake -S /docker-mount/ishmem \ + -B /docker-mount/ishmem/build-ishmem4py-icpx-noaot \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DENABLE_OPENSHMEM=ON \ + -DSHMEM_DIR=/home/xiii/pkg/SOS-2026-03-06 \ + -DBUILD_PYTHON_BINDINGS=ON \ + -DBUILD_UNIT_TESTS=OFF \ + -DBUILD_PERF_TESTS=OFF \ + -DBUILD_EXAMPLES=OFF \ + -DBUILD_APPS=OFF \ + -DENABLE_AOT_COMPILATION=OFF + +cmake --build /docker-mount/ishmem/build-ishmem4py-icpx-noaot --target ishmem4py -j4 +``` + +## Run / Test Environment + +Source-tree development: + +```bash +unset ISHMEM_DIR +export PYTHONPATH=/docker-mount/ishmem/ishmem4py/python:/docker-mount/ishmem/ishmem4py/test +export ISHMEM4PY_RUNTIME_LIBRARY=/docker-mount/ishmem/build-ishmem4py-icpx-noaot/ishmem4py/python/ishmem4py/_ishmem4py_runtime.so +export LD_LIBRARY_PATH=/home/xiii/pkg/SOS-2026-03-06/lib:$LD_LIBRARY_PATH +export ISHMEM_RUNTIME=OPENSHMEM +``` + +Example test runs: + +```bash +mpiexec -n 1 /docker-mount/ishmem/scripts/ishmrun \ + python3 /docker-mount/ishmem/ishmem4py/test/smoke_test.py + +mpiexec -n 2 /docker-mount/ishmem/scripts/ishmrun \ + python3 /docker-mount/ishmem/ishmem4py/test/ring_test.py + +mpiexec -n 2 /docker-mount/ishmem/scripts/ishmrun \ + python3 /docker-mount/ishmem/ishmem4py/test/collective_test.py +``` + +## Current Scope + +The public Python surface is limited to functionality that has been validated in this environment. +Host atomics and dynamic team-split helpers were intentionally removed from the exported API after +backend-level failures in the current OpenSHMEM runtime. 
diff --git a/ishmem4py/csrc/ishmem4py_runtime.cpp b/ishmem4py/csrc/ishmem4py_runtime.cpp new file mode 100644 index 0000000..25dc1b8 --- /dev/null +++ b/ishmem4py/csrc/ishmem4py_runtime.cpp @@ -0,0 +1,531 @@ +/* Copyright (C) 2026 Intel Corporation + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include "ishmem4py_runtime.h" + +#include +#include + +#include + +namespace { + +template +uint64_t ishmem4py_to_bits(T value) +{ + uint64_t bits = 0; + static_assert(sizeof(T) <= sizeof(bits)); + std::memcpy(&bits, &value, sizeof(T)); + return bits; +} + +template +T ishmem4py_from_bits(uint64_t bits) +{ + T value{}; + std::memcpy(&value, &bits, sizeof(T)); + return value; +} + +const ishmem_team_config_t *ishmem4py_convert_team_config(const ishmem4py_team_config_t *config, + ishmem_team_config_t *tmp) +{ + if (config == nullptr) return nullptr; + tmp->num_contexts = config->num_contexts; + return tmp; +} + +#define ISHMEM4PY_ATOMIC_FETCH_CASE(DTYPE_ENUM, CPP_TYPE, SUFFIX) \ + case DTYPE_ENUM: \ + return ishmem4py_to_bits( \ + ishmem_##SUFFIX##_atomic_fetch(reinterpret_cast(source), pe)) + +#define ISHMEM4PY_ATOMIC_SET_CASE(DTYPE_ENUM, CPP_TYPE, SUFFIX) \ + case DTYPE_ENUM: \ + ishmem_##SUFFIX##_atomic_set(reinterpret_cast(dest), \ + ishmem4py_from_bits(value_bits), pe); \ + return + +#define ISHMEM4PY_ATOMIC_SWAP_CASE(DTYPE_ENUM, CPP_TYPE, SUFFIX) \ + case DTYPE_ENUM: \ + return ishmem4py_to_bits( \ + ishmem_##SUFFIX##_atomic_swap(reinterpret_cast(dest), \ + ishmem4py_from_bits(value_bits), pe)) + +#define ISHMEM4PY_ATOMIC_COMPARE_SWAP_CASE(DTYPE_ENUM, CPP_TYPE, SUFFIX) \ + case DTYPE_ENUM: \ + return ishmem4py_to_bits( \ + ishmem_##SUFFIX##_atomic_compare_swap( \ + reinterpret_cast(dest), ishmem4py_from_bits(cond_bits), \ + ishmem4py_from_bits(value_bits), pe)) + +#define ISHMEM4PY_ATOMIC_FETCH_INC_CASE(DTYPE_ENUM, CPP_TYPE, SUFFIX) \ + case DTYPE_ENUM: \ + return ishmem4py_to_bits( \ + ishmem_##SUFFIX##_atomic_fetch_inc(reinterpret_cast(dest), pe)) + +#define 
ISHMEM4PY_ATOMIC_INC_CASE(DTYPE_ENUM, CPP_TYPE, SUFFIX) \ + case DTYPE_ENUM: \ + ishmem_##SUFFIX##_atomic_inc(reinterpret_cast(dest), pe); \ + return + +#define ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(DTYPE_ENUM, CPP_TYPE, SUFFIX, OP) \ + case DTYPE_ENUM: \ + return ishmem4py_to_bits( \ + ishmem_##SUFFIX##_atomic_fetch_##OP(reinterpret_cast(dest), \ + ishmem4py_from_bits(value_bits), pe)) + +#define ISHMEM4PY_ATOMIC_BINARY_CASE(DTYPE_ENUM, CPP_TYPE, SUFFIX, OP) \ + case DTYPE_ENUM: \ + ishmem_##SUFFIX##_atomic_##OP(reinterpret_cast(dest), \ + ishmem4py_from_bits(value_bits), pe); \ + return + +#define ISHMEM4PY_REDUCE_CASE(DTYPE_ENUM, CPP_TYPE, SUFFIX, OP) \ + case DTYPE_ENUM: \ + return ishmem_##SUFFIX##_##OP##_reduce( \ + static_cast(team), reinterpret_cast(dest), \ + reinterpret_cast(src), count) + +} // namespace + +extern "C" { + +void ishmem4py_init(void) +{ + ishmem_init(); +} + +void ishmem4py_init_with_device(int device_id) +{ + ishmemx_attr_t attr{}; + attr.device_id = device_id; + ishmemx_init_attr(&attr); +} + +void ishmem4py_finalize(void) +{ + ishmem_finalize(); +} + +int ishmem4py_my_pe(void) +{ + return ishmem_my_pe(); +} + +int ishmem4py_n_pes(void) +{ + return ishmem_n_pes(); +} + +void ishmem4py_info_get_version(int *major, int *minor) +{ + ishmem_info_get_version(major, minor); +} + +void ishmem4py_info_get_name(char *name) +{ + ishmem_info_get_name(name); +} + +void ishmem4py_vendor_get_version(int *major, int *minor, int *patch) +{ + if (major != nullptr) *major = ISHMEM_MAJOR_VERSION; + if (minor != nullptr) *minor = ISHMEM_MINOR_VERSION; + if (patch != nullptr) *patch = ISHMEM_PATCH_VERSION; +} + +void ishmem4py_barrier_all(void) +{ + ishmem_barrier_all(); +} + +void ishmem4py_sync_all(void) +{ + ishmem_sync_all(); +} + +void ishmem4py_fence(void) +{ + ishmem_fence(); +} + +void ishmem4py_quiet(void) +{ + ishmem_quiet(); +} + +void *ishmem4py_malloc(size_t size) +{ + return ishmem_malloc(size); +} + +void *ishmem4py_calloc(size_t count, size_t 
size) +{ + return ishmem_calloc(count, size); +} + +void ishmem4py_free(void *ptr) +{ + ishmem_free(ptr); +} + +void ishmem4py_putmem(void *dest, const void *src, size_t nbytes, int pe) +{ + ishmem_putmem(dest, src, nbytes, pe); +} + +void ishmem4py_getmem(void *dest, const void *src, size_t nbytes, int pe) +{ + ishmem_getmem(dest, src, nbytes, pe); +} + +void ishmem4py_putmem_on_queue(void *dest, const void *src, size_t nbytes, int pe, void *queue) +{ + auto *q = reinterpret_cast(queue); + ishmemx_putmem_on_queue(dest, src, nbytes, pe, *q, {}); +} + +void ishmem4py_getmem_on_queue(void *dest, const void *src, size_t nbytes, int pe, void *queue) +{ + auto *q = reinterpret_cast(queue); + ishmemx_getmem_on_queue(dest, src, nbytes, pe, *q, {}); +} + +void ishmem4py_quiet_on_queue(void *queue) +{ + auto *q = reinterpret_cast(queue); + ishmemx_quiet_on_queue(*q, {}); +} + +void ishmem4py_queue_sync(void *queue) +{ + auto *q = reinterpret_cast(queue); + q->wait_and_throw(); +} + +void *ishmem4py_ptr(const void *dest, int pe) +{ + return ishmem_ptr(dest, pe); +} + +int ishmem4py_team_my_pe(int team) +{ + return ishmem_team_my_pe(static_cast(team)); +} + +int ishmem4py_team_n_pes(int team) +{ + return ishmem_team_n_pes(static_cast(team)); +} + +int ishmem4py_team_translate_pe(int src_team, int src_pe, int dest_team) +{ + return ishmem_team_translate_pe(static_cast(src_team), src_pe, + static_cast(dest_team)); +} + +int ishmem4py_team_sync(int team) +{ + return ishmem_team_sync(static_cast(team)); +} + +int ishmem4py_team_split_strided(int parent_team, int start, int stride, int size, + const ishmem4py_team_config_t *config, long config_mask, + int *new_team) +{ + ishmem_team_t result_team = ISHMEM_TEAM_INVALID; + ishmem_team_config_t tmp_config{}; + const ishmem_team_config_t *converted_config = + ishmem4py_convert_team_config(config, &tmp_config); + int result = ishmem_team_split_strided(static_cast(parent_team), start, stride, + size, converted_config, config_mask, 
&result_team); + if (new_team != nullptr) *new_team = result_team; + return result; +} + +int ishmem4py_team_split_2d(int parent_team, int xrange, + const ishmem4py_team_config_t *xaxis_config, long xaxis_mask, + int *xaxis_team, const ishmem4py_team_config_t *yaxis_config, + long yaxis_mask, int *yaxis_team) +{ + ishmem_team_t xaxis_result = ISHMEM_TEAM_INVALID; + ishmem_team_t yaxis_result = ISHMEM_TEAM_INVALID; + ishmem_team_config_t xaxis_tmp{}; + ishmem_team_config_t yaxis_tmp{}; + const ishmem_team_config_t *converted_xaxis = + ishmem4py_convert_team_config(xaxis_config, &xaxis_tmp); + const ishmem_team_config_t *converted_yaxis = + ishmem4py_convert_team_config(yaxis_config, &yaxis_tmp); + + int result = ishmem_team_split_2d(static_cast(parent_team), xrange, + converted_xaxis, xaxis_mask, &xaxis_result, converted_yaxis, + yaxis_mask, &yaxis_result); + if (xaxis_team != nullptr) *xaxis_team = xaxis_result; + if (yaxis_team != nullptr) *yaxis_team = yaxis_result; + return result; +} + +void ishmem4py_team_destroy(int team) +{ + ishmem_team_destroy(static_cast(team)); +} + +int ishmem4py_broadcastmem(int team, void *dest, const void *src, size_t nbytes, int root) +{ + return ishmem_broadcastmem(static_cast(team), dest, src, nbytes, root); +} + +int ishmem4py_collectmem(int team, void *dest, const void *src, size_t nbytes) +{ + return ishmem_collectmem(static_cast(team), dest, src, nbytes); +} + +int ishmem4py_fcollectmem(int team, void *dest, const void *src, size_t nbytes) +{ + return ishmem_fcollectmem(static_cast(team), dest, src, nbytes); +} + +int ishmem4py_alltoallmem(int team, void *dest, const void *src, size_t nbytes) +{ + return ishmem_alltoallmem(static_cast(team), dest, src, nbytes); +} + +int ishmem4py_reduce(int op, int dtype, int team, void *dest, const void *src, size_t count) +{ + switch (op) { + case ISHMEM4PY_REDUCE_SUM: + switch (dtype) { + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, sum); + 
ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, sum); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, sum); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, sum); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_FLOAT32, float, float, sum); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_FLOAT64, double, double, sum); + } + break; + case ISHMEM4PY_REDUCE_PROD: + switch (dtype) { + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, prod); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, prod); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, prod); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, prod); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_FLOAT32, float, float, prod); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_FLOAT64, double, double, prod); + } + break; + case ISHMEM4PY_REDUCE_AND: + switch (dtype) { + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, and); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, and); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, and); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, and); + } + break; + case ISHMEM4PY_REDUCE_OR: + switch (dtype) { + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, or); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, or); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, or); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, or); + } + break; + case ISHMEM4PY_REDUCE_XOR: + switch (dtype) { + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, xor); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, xor); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, xor); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, xor); + } + break; + case ISHMEM4PY_REDUCE_MIN: + switch (dtype) { + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, min); + 
ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, min); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, min); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, min); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_FLOAT32, float, float, min); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_FLOAT64, double, double, min); + } + break; + case ISHMEM4PY_REDUCE_MAX: + switch (dtype) { + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, max); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, max); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, max); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, max); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_FLOAT32, float, float, max); + ISHMEM4PY_REDUCE_CASE(ISHMEM4PY_DTYPE_FLOAT64, double, double, max); + } + break; + } + return -1; +} + +uint64_t ishmem4py_atomic_fetch(int dtype, void *source, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_FETCH_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32); + ISHMEM4PY_ATOMIC_FETCH_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64); + ISHMEM4PY_ATOMIC_FETCH_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32); + ISHMEM4PY_ATOMIC_FETCH_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64); + ISHMEM4PY_ATOMIC_FETCH_CASE(ISHMEM4PY_DTYPE_FLOAT32, float, float); + ISHMEM4PY_ATOMIC_FETCH_CASE(ISHMEM4PY_DTYPE_FLOAT64, double, double); + } + return 0; +} + +void ishmem4py_atomic_set(int dtype, void *dest, uint64_t value_bits, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_SET_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32); + ISHMEM4PY_ATOMIC_SET_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64); + ISHMEM4PY_ATOMIC_SET_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32); + ISHMEM4PY_ATOMIC_SET_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64); + ISHMEM4PY_ATOMIC_SET_CASE(ISHMEM4PY_DTYPE_FLOAT32, float, float); + ISHMEM4PY_ATOMIC_SET_CASE(ISHMEM4PY_DTYPE_FLOAT64, double, double); + } +} + +uint64_t ishmem4py_atomic_swap(int dtype, void *dest, uint64_t 
value_bits, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_SWAP_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32); + ISHMEM4PY_ATOMIC_SWAP_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64); + ISHMEM4PY_ATOMIC_SWAP_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32); + ISHMEM4PY_ATOMIC_SWAP_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64); + ISHMEM4PY_ATOMIC_SWAP_CASE(ISHMEM4PY_DTYPE_FLOAT32, float, float); + ISHMEM4PY_ATOMIC_SWAP_CASE(ISHMEM4PY_DTYPE_FLOAT64, double, double); + } + return 0; +} + +uint64_t ishmem4py_atomic_compare_swap(int dtype, void *dest, uint64_t cond_bits, + uint64_t value_bits, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_COMPARE_SWAP_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32); + ISHMEM4PY_ATOMIC_COMPARE_SWAP_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64); + ISHMEM4PY_ATOMIC_COMPARE_SWAP_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32); + ISHMEM4PY_ATOMIC_COMPARE_SWAP_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64); + } + return 0; +} + +uint64_t ishmem4py_atomic_fetch_inc(int dtype, void *dest, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_FETCH_INC_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32); + ISHMEM4PY_ATOMIC_FETCH_INC_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64); + ISHMEM4PY_ATOMIC_FETCH_INC_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32); + ISHMEM4PY_ATOMIC_FETCH_INC_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64); + } + return 0; +} + +void ishmem4py_atomic_inc(int dtype, void *dest, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_INC_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32); + ISHMEM4PY_ATOMIC_INC_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64); + ISHMEM4PY_ATOMIC_INC_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32); + ISHMEM4PY_ATOMIC_INC_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64); + } +} + +uint64_t ishmem4py_atomic_fetch_add(int dtype, void *dest, uint64_t value_bits, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, add); + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, 
int64, add); + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, add); + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, add); + } + return 0; +} + +void ishmem4py_atomic_add(int dtype, void *dest, uint64_t value_bits, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, add); + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, add); + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, add); + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, add); + } +} + +uint64_t ishmem4py_atomic_fetch_and(int dtype, void *dest, uint64_t value_bits, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, and); + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, and); + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, and); + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, and); + } + return 0; +} + +void ishmem4py_atomic_and(int dtype, void *dest, uint64_t value_bits, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, and); + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, and); + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, and); + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, and); + } +} + +uint64_t ishmem4py_atomic_fetch_or(int dtype, void *dest, uint64_t value_bits, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, or); + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, or); + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, or); + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, or); + } + return 0; +} + +void 
ishmem4py_atomic_or(int dtype, void *dest, uint64_t value_bits, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, or); + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, or); + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, or); + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, or); + } +} + +uint64_t ishmem4py_atomic_fetch_xor(int dtype, void *dest, uint64_t value_bits, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, xor); + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, xor); + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, xor); + ISHMEM4PY_ATOMIC_FETCH_BINARY_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, xor); + } + return 0; +} + +void ishmem4py_atomic_xor(int dtype, void *dest, uint64_t value_bits, int pe) +{ + switch (dtype) { + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_INT32, int32_t, int32, xor); + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_INT64, int64_t, int64, xor); + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_UINT32, uint32_t, uint32, xor); + ISHMEM4PY_ATOMIC_BINARY_CASE(ISHMEM4PY_DTYPE_UINT64, uint64_t, uint64, xor); + } +} + +uintptr_t ishmem4py_cast_ptr_to_uintptr(const void *ptr) +{ + return reinterpret_cast<uintptr_t>(ptr); +} + +void *ishmem4py_cast_uintptr_to_ptr(uintptr_t value) +{ + return reinterpret_cast<void *>(value); +} + +} // extern "C" diff --git a/ishmem4py/csrc/ishmem4py_runtime.h b/ishmem4py/csrc/ishmem4py_runtime.h new file mode 100644 index 0000000..3f60107 --- /dev/null +++ b/ishmem4py/csrc/ishmem4py_runtime.h @@ -0,0 +1,114 @@ +/* Copyright (C) 2026 Intel Corporation + * SPDX-License-Identifier: BSD-3-Clause + */ + +#ifndef ISHMEM4PY_RUNTIME_H +#define ISHMEM4PY_RUNTIME_H + +#include <stddef.h> +#include <stdint.h> + +#if defined(__GNUC__) +#define ISHMEM4PY_EXPORT __attribute__((visibility("default"))) +#else +#define 
ISHMEM4PY_EXPORT +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum ishmem4py_dtype_t { + ISHMEM4PY_DTYPE_INT32 = 0, + ISHMEM4PY_DTYPE_INT64 = 1, + ISHMEM4PY_DTYPE_UINT32 = 2, + ISHMEM4PY_DTYPE_UINT64 = 3, + ISHMEM4PY_DTYPE_FLOAT32 = 4, + ISHMEM4PY_DTYPE_FLOAT64 = 5, +} ishmem4py_dtype_t; + +typedef enum ishmem4py_reduce_op_t { + ISHMEM4PY_REDUCE_SUM = 0, + ISHMEM4PY_REDUCE_PROD = 1, + ISHMEM4PY_REDUCE_AND = 2, + ISHMEM4PY_REDUCE_OR = 3, + ISHMEM4PY_REDUCE_XOR = 4, + ISHMEM4PY_REDUCE_MIN = 5, + ISHMEM4PY_REDUCE_MAX = 6, +} ishmem4py_reduce_op_t; + +typedef struct ishmem4py_team_config_t { + int num_contexts; +} ishmem4py_team_config_t; + +ISHMEM4PY_EXPORT void ishmem4py_init(void); +ISHMEM4PY_EXPORT void ishmem4py_init_with_device(int device_id); +ISHMEM4PY_EXPORT void ishmem4py_finalize(void); +ISHMEM4PY_EXPORT int ishmem4py_my_pe(void); +ISHMEM4PY_EXPORT int ishmem4py_n_pes(void); +ISHMEM4PY_EXPORT void ishmem4py_info_get_version(int *major, int *minor); +ISHMEM4PY_EXPORT void ishmem4py_info_get_name(char *name); +ISHMEM4PY_EXPORT void ishmem4py_vendor_get_version(int *major, int *minor, int *patch); +ISHMEM4PY_EXPORT void ishmem4py_barrier_all(void); +ISHMEM4PY_EXPORT void ishmem4py_sync_all(void); +ISHMEM4PY_EXPORT void ishmem4py_fence(void); +ISHMEM4PY_EXPORT void ishmem4py_quiet(void); +ISHMEM4PY_EXPORT void *ishmem4py_malloc(size_t size); +ISHMEM4PY_EXPORT void *ishmem4py_calloc(size_t count, size_t size); +ISHMEM4PY_EXPORT void ishmem4py_free(void *ptr); +ISHMEM4PY_EXPORT void ishmem4py_putmem(void *dest, const void *src, size_t nbytes, int pe); +ISHMEM4PY_EXPORT void ishmem4py_getmem(void *dest, const void *src, size_t nbytes, int pe); +ISHMEM4PY_EXPORT void ishmem4py_putmem_on_queue(void *dest, const void *src, size_t nbytes, + int pe, void *queue); +ISHMEM4PY_EXPORT void ishmem4py_getmem_on_queue(void *dest, const void *src, size_t nbytes, + int pe, void *queue); +ISHMEM4PY_EXPORT void ishmem4py_quiet_on_queue(void *queue); 
+ISHMEM4PY_EXPORT void ishmem4py_queue_sync(void *queue); +ISHMEM4PY_EXPORT void *ishmem4py_ptr(const void *dest, int pe); +ISHMEM4PY_EXPORT int ishmem4py_team_my_pe(int team); +ISHMEM4PY_EXPORT int ishmem4py_team_n_pes(int team); +ISHMEM4PY_EXPORT int ishmem4py_team_translate_pe(int src_team, int src_pe, int dest_team); +ISHMEM4PY_EXPORT int ishmem4py_team_sync(int team); +ISHMEM4PY_EXPORT int ishmem4py_team_split_strided(int parent_team, int start, int stride, int size, + const ishmem4py_team_config_t *config, + long config_mask, int *new_team); +ISHMEM4PY_EXPORT int ishmem4py_team_split_2d(int parent_team, int xrange, + const ishmem4py_team_config_t *xaxis_config, + long xaxis_mask, int *xaxis_team, + const ishmem4py_team_config_t *yaxis_config, + long yaxis_mask, int *yaxis_team); +ISHMEM4PY_EXPORT void ishmem4py_team_destroy(int team); +ISHMEM4PY_EXPORT int ishmem4py_broadcastmem(int team, void *dest, const void *src, size_t nbytes, + int root); +ISHMEM4PY_EXPORT int ishmem4py_collectmem(int team, void *dest, const void *src, size_t nbytes); +ISHMEM4PY_EXPORT int ishmem4py_fcollectmem(int team, void *dest, const void *src, size_t nbytes); +ISHMEM4PY_EXPORT int ishmem4py_alltoallmem(int team, void *dest, const void *src, size_t nbytes); +ISHMEM4PY_EXPORT int ishmem4py_reduce(int op, int dtype, int team, void *dest, const void *src, + size_t count); +ISHMEM4PY_EXPORT uint64_t ishmem4py_atomic_fetch(int dtype, void *source, int pe); +ISHMEM4PY_EXPORT void ishmem4py_atomic_set(int dtype, void *dest, uint64_t value_bits, int pe); +ISHMEM4PY_EXPORT uint64_t ishmem4py_atomic_swap(int dtype, void *dest, uint64_t value_bits, int pe); +ISHMEM4PY_EXPORT uint64_t ishmem4py_atomic_compare_swap(int dtype, void *dest, uint64_t cond_bits, + uint64_t value_bits, int pe); +ISHMEM4PY_EXPORT uint64_t ishmem4py_atomic_fetch_inc(int dtype, void *dest, int pe); +ISHMEM4PY_EXPORT void ishmem4py_atomic_inc(int dtype, void *dest, int pe); +ISHMEM4PY_EXPORT uint64_t 
ishmem4py_atomic_fetch_add(int dtype, void *dest, uint64_t value_bits, + int pe); +ISHMEM4PY_EXPORT void ishmem4py_atomic_add(int dtype, void *dest, uint64_t value_bits, int pe); +ISHMEM4PY_EXPORT uint64_t ishmem4py_atomic_fetch_and(int dtype, void *dest, uint64_t value_bits, + int pe); +ISHMEM4PY_EXPORT void ishmem4py_atomic_and(int dtype, void *dest, uint64_t value_bits, int pe); +ISHMEM4PY_EXPORT uint64_t ishmem4py_atomic_fetch_or(int dtype, void *dest, uint64_t value_bits, + int pe); +ISHMEM4PY_EXPORT void ishmem4py_atomic_or(int dtype, void *dest, uint64_t value_bits, int pe); +ISHMEM4PY_EXPORT uint64_t ishmem4py_atomic_fetch_xor(int dtype, void *dest, uint64_t value_bits, + int pe); +ISHMEM4PY_EXPORT void ishmem4py_atomic_xor(int dtype, void *dest, uint64_t value_bits, int pe); +ISHMEM4PY_EXPORT uintptr_t ishmem4py_cast_ptr_to_uintptr(const void *ptr); +ISHMEM4PY_EXPORT void *ishmem4py_cast_uintptr_to_ptr(uintptr_t value); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/ishmem4py/csrc/ishmem4py_torch.cpp b/ishmem4py/csrc/ishmem4py_torch.cpp new file mode 100644 index 0000000..38be51c --- /dev/null +++ b/ishmem4py/csrc/ishmem4py_torch.cpp @@ -0,0 +1,68 @@ +/* Copyright (C) 2026 Intel Corporation + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include "ishmem4py_runtime.h" + +#include +#include + +#include +#include +#include + +namespace { + +size_t ishmem4py_numel(c10::IntArrayRef sizes) +{ + return std::accumulate(sizes.begin(), sizes.end(), static_cast(1), + std::multiplies()); +} + +at::Tensor ishmem4py_wrap_tensor(void *ptr, c10::IntArrayRef sizes, c10::ScalarType scalar_type, + c10::DeviceIndex device_index) +{ + const auto device = c10::Device(c10::DeviceType::XPU, device_index); + const auto options = at::TensorOptions().dtype(scalar_type).device(device); + return at::for_blob(ptr, sizes).options(options).target_device(device).make_tensor(); +} + +} // namespace + +at::Tensor ishmem4py_alloc_tensor(const std::vector &sizes, int64_t 
scalar_type, + int64_t device_index) +{ + TORCH_CHECK(device_index >= 0, "device index must be >= 0"); + const auto dtype = static_cast<c10::ScalarType>(scalar_type); + const auto nbytes = ishmem4py_numel(sizes) * c10::elementSize(dtype); + if (nbytes == 0) { + return at::empty(sizes, at::TensorOptions().dtype(dtype).device(c10::DeviceType::XPU, + device_index)); + } + + void *ptr = ishmem4py_malloc(nbytes); + TORCH_CHECK(ptr != nullptr, "ishmem_malloc(", nbytes, ") returned NULL"); + return ishmem4py_wrap_tensor(ptr, sizes, dtype, static_cast<c10::DeviceIndex>(device_index)); +} + +at::Tensor ishmem4py_tensor_from_ptr(uint64_t ptr, const std::vector<int64_t> &sizes, + int64_t scalar_type, int64_t device_index) +{ + TORCH_CHECK(device_index >= 0, "device index must be >= 0"); + return ishmem4py_wrap_tensor(reinterpret_cast<void *>(ptr), sizes, + static_cast<c10::ScalarType>(scalar_type), + static_cast<c10::DeviceIndex>(device_index)); +} + +void ishmem4py_free_ptr(uint64_t ptr) +{ + if (ptr == 0) return; + ishmem4py_free(reinterpret_cast<void *>(ptr)); +} + +PYBIND11_MODULE(_ishmem4py_torch, m) +{ + m.def("alloc_tensor", &ishmem4py_alloc_tensor); + m.def("tensor_from_ptr", &ishmem4py_tensor_from_ptr); + m.def("free_ptr", &ishmem4py_free_ptr); +} diff --git a/ishmem4py/examples/init_fini.py b/ishmem4py/examples/init_fini.py new file mode 100644 index 0000000..b0c4a56 --- /dev/null +++ b/ishmem4py/examples/init_fini.py @@ -0,0 +1,17 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +import ishmem4py as ishmem + + +def main() -> None: + ishmem.init() + try: + major, minor = ishmem.info_get_version() + print(f"PE {ishmem.my_pe()} / {ishmem.n_pes()} using Intel SHMEM {major}.{minor}") + finally: + ishmem.finalize() + + +if __name__ == "__main__": + main() diff --git a/ishmem4py/examples/ring_put_get.py b/ishmem4py/examples/ring_put_get.py new file mode 100644 index 0000000..c9c2427 --- /dev/null +++ b/ishmem4py/examples/ring_put_get.py @@ -0,0 +1,45 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: 
BSD-3-Clause + +import struct + +import ishmem4py as ishmem + + +def main() -> None: + ishmem.init() + try: + my_pe = ishmem.my_pe() + npes = ishmem.n_pes() + next_pe = (my_pe + 1) % npes + prev_pe = (my_pe + npes - 1) % npes + + src = ishmem.malloc(4) + dst = ishmem.calloc(1, 4) + try: + src.write(struct.pack("=i", my_pe)) + ishmem.barrier_all() + + ishmem.put(dst, src.read(4), pe=next_pe) + ishmem.barrier_all() + + received = struct.unpack("=i", dst.read(4))[0] + + fetched = bytearray(4) + ishmem.get(fetched, src, pe=next_pe) + fetched_value = struct.unpack("=i", fetched)[0] + + print( + f"PE {my_pe}: dst after put={received} (expected {prev_pe}), " + f"get from PE {next_pe} returned {fetched_value}" + ) + ishmem.barrier_all() + finally: + ishmem.free(dst) + ishmem.free(src) + finally: + ishmem.finalize() + + +if __name__ == "__main__": + main() diff --git a/ishmem4py/python/README.md b/ishmem4py/python/README.md new file mode 100644 index 0000000..8a608b7 --- /dev/null +++ b/ishmem4py/python/README.md @@ -0,0 +1,85 @@ +# ishmem4py Python Package + +`ishmem4py` supports two install modes. The base package provides host-side Intel SHMEM +allocation, RMA, collectives, and team APIs. An optional Torch/XPU extension adds symmetric +`torch.Tensor` allocation plus queue-aware XPU RMA. + +## Standard Install + +Build `ishmem` with `-DBUILD_PYTHON_BINDINGS=ON`, then install from the build tree: + +```bash +pip install /path/to/ishmem-build/ishmem4py/python +``` + +That package directory includes `_ishmem4py_runtime.so`, so `ISHMEM4PY_RUNTIME_LIBRARY` is not +needed. 
+ +To include the optional Torch/XPU interop module in the package, build with +`-DISHMEM4PY_BUILD_TORCH_INTEROP=ON` and use a Python executable from an environment where +PyTorch XPU is already installed: + +```bash +cmake -S /path/to/ishmem-src -B /path/to/ishmem-build \ + -DBUILD_PYTHON_BINDINGS=ON \ + -DISHMEM4PY_BUILD_TORCH_INTEROP=ON \ + -DPython3_EXECUTABLE=/path/to/python \ + -DCMAKE_PREFIX_PATH="$(python -c 'import torch; print(torch.utils.cmake_prefix_path)')" +cmake --build /path/to/ishmem-build --target ishmem4py -j4 +``` + +## Editable / Dev Install + +For active development: + +```bash +pip install -e /path/to/ishmem-src/ishmem4py/python +``` + +Point the editable package at the CMake-built runtime: + +```bash +export ISHMEM4PY_RUNTIME_LIBRARY=/path/to/ishmem-build/ishmem4py/python/ishmem4py/_ishmem4py_runtime.so +``` + +If Torch/XPU interop is enabled in the build tree, the editable package will also find +`_ishmem4py_torch.so` from that package directory through `PYTHONPATH`. + +## Runtime Environment + +The package still depends on the surrounding Intel SHMEM runtime setup: + +```bash +source /opt/intel/oneapi/setvars.sh +unset ISHMEM_DIR +export ISHMEM_RUNTIME=OPENSHMEM +export LD_LIBRARY_PATH=/path/to/openshmem/lib:$LD_LIBRARY_PATH +``` + +For source-tree testing: + +```bash +export PYTHONPATH=/path/to/ishmem-src/ishmem4py/python:/path/to/ishmem-src/ishmem4py/test +``` + +## Notes + +- `import ishmem4py` and `import ishmem4py.core` both expose the full public API. +- Torch/XPU interop is imported lazily through `ishmem4py.tensor(...)`, + `ishmem4py.free_tensor(...)`, `ishmem4py.get_peer_tensor(...)`, + `ishmem4py.tensor_base(...)`, and + `ishmem4py.is_symmetric_tensor(...)` so that plain package import does not eagerly import + PyTorch. +- `ishmem4py.get_peer_tensor(tensor, pe=...)` mirrors `nvshmem4py`'s peer-tensor helper for + directly addressable XPU aliases returned by `ishmem_ptr`. 
The returned tensor is a non-owning + alias and must not be passed to `ishmem4py.free_tensor(...)`. +- The runtime library is loaded lazily, so importing the package for documentation or static + inspection does not require a live Intel SHMEM runtime. +- `ishmem.init(device_id=...)` selects an explicit visible XPU ordinal for multi-device launch + modes. `device_id` may be an integer, an XPU device string such as `xpu:1`, or a + `torch.device("xpu", 1)`. Leaving it unset preserves the single-visible-device behavior. +- `put`, `get`, and `quiet` accept `queue=` for queue-based host-initiated XPU RMA. Supported + queue objects are `torch.xpu.Stream`, `ctypes.c_void_p`, and raw integer SYCL queue pointers. +- The current Torch/XPU MVP is focused on XPU device memory and XPU-to-XPU one-sided transfers. +- The exported API is limited to functionality that has been validated in the current + OpenSHMEM-backed test environment. diff --git a/ishmem4py/python/ishmem4py/__init__.py b/ishmem4py/python/ishmem4py/__init__.py new file mode 100644 index 0000000..be5d295 --- /dev/null +++ b/ishmem4py/python/ishmem4py/__init__.py @@ -0,0 +1,10 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +"""Public top-level imports for the Intel SHMEM Python bindings.""" + +from . 
import core +from .core import * +from .version import __version__ + +__all__ = list(core.__all__) + ["__version__"] diff --git a/ishmem4py/python/ishmem4py/_common.py b/ishmem4py/python/ishmem4py/_common.py new file mode 100644 index 0000000..17f987a --- /dev/null +++ b/ishmem4py/python/ishmem4py/_common.py @@ -0,0 +1,478 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +"""Shared types and validation helpers for the public ``ishmem4py`` API.""" + +from __future__ import annotations + +import ctypes +import sys +from dataclasses import dataclass +from enum import IntEnum +from typing import Optional + + +class IshmemError(RuntimeError): + """Base exception for ``ishmem4py`` failures.""" + + pass + + +class IshmemStateError(IshmemError): + """Raised when Intel SHMEM is used in an invalid process state.""" + + pass + + +class Comparison(IntEnum): + """Comparison operators for wait/test style APIs.""" + + EQ = 1 + NE = 2 + GT = 3 + GE = 4 + LT = 5 + LE = 6 + + +class SignalOp(IntEnum): + """Signal operations retained for API compatibility.""" + + SET = 0 + ADD = 1 + + +class InitStatus(IntEnum): + """Process-local Intel SHMEM initialization state.""" + + UNINITIALIZED = 0 + INITIALIZED = 1 + FINALIZED = 2 + + +@dataclass(frozen=True) +class Version: + """Combined OpenSHMEM, Intel SHMEM, and Python binding version metadata.""" + + openshmem_spec_version: str + ishmem4py_version: str + libishmem_version: str + vendor_name: str + + +@dataclass(frozen=True) +class Team: + """Python wrapper for an Intel SHMEM team handle.""" + + handle: int + name: Optional[str] = None + + def __int__(self) -> int: + return self.handle + + @property + def valid(self) -> bool: + return self.handle != -1 + + def __repr__(self) -> str: + label = self.name if self.name is not None else f"TEAM_{self.handle}" + return f"Team(handle={self.handle}, name={label!r})" + + +@dataclass(frozen=True) +class TeamConfig: + """Subset of ``ishmem_team_config_t`` exposed 
by the MVP bindings.""" + + num_contexts: int = 0 + + +@dataclass(frozen=True) +class MemoryPointer: + """A PE-relative address returned by ``ptr`` or ``ishmem_ptr``.""" + + address: int + size: int + pe: int + base: "SymmetricMemory" + + def __int__(self) -> int: + return self.address + + def __bool__(self) -> bool: + return self.address != 0 + + +@dataclass(frozen=True) +class _DTypeInfo: + name: str + code: int + ctype: type[ctypes._SimpleCData] + itemsize: int + + +_DTYPES = { + "int32": _DTypeInfo("int32", 0, ctypes.c_int32, 4), + "int64": _DTypeInfo("int64", 1, ctypes.c_int64, 8), + "uint32": _DTypeInfo("uint32", 2, ctypes.c_uint32, 4), + "uint64": _DTypeInfo("uint64", 3, ctypes.c_uint64, 8), + "float32": _DTypeInfo("float32", 4, ctypes.c_float, 4), + "float64": _DTypeInfo("float64", 5, ctypes.c_double, 8), +} + +_DTYPE_ALIASES = { + "i": "int32", + "int": "int32", + "int32": "int32", + "l": "int64", + "q": "int64", + "long": "int64", + "int64": "int64", + "i4": "int32", + "i8": "int64", + "u": "uint32", + "I": "uint32", + "uint": "uint32", + "uint32": "uint32", + "Q": "uint64", + "L": "uint64", + "ulong": "uint64", + "uint64": "uint64", + "u4": "uint32", + "u8": "uint64", + "f": "float32", + "float": "float32", + "float32": "float32", + "f4": "float32", + "d": "float64", + "double": "float64", + "float64": "float64", + "f8": "float64", +} + +_CTYPE_ALIASES = { + ctypes.c_int32: "int32", + ctypes.c_int64: "int64", + ctypes.c_uint32: "uint32", + ctypes.c_uint64: "uint64", + ctypes.c_float: "float32", + ctypes.c_double: "float64", +} + +_ATOMIC_FETCH_DTYPES = frozenset(_DTYPES) +_ATOMIC_STANDARD_DTYPES = frozenset({"int32", "int64", "uint32", "uint64"}) +_ATOMIC_BITWISE_DTYPES = frozenset({"int32", "int64", "uint32", "uint64"}) +_REDUCTION_DTYPES = { + "sum": frozenset(_DTYPES), + "prod": frozenset(_DTYPES), + "and": frozenset({"int32", "int64", "uint32", "uint64"}), + "or": frozenset({"int32", "int64", "uint32", "uint64"}), + "xor": frozenset({"int32", 
"int64", "uint32", "uint64"}), + "min": frozenset(_DTYPES), + "max": frozenset(_DTYPES), +} +_REDUCTION_CODES = { + "sum": 0, + "prod": 1, + "and": 2, + "or": 3, + "xor": 4, + "min": 5, + "max": 6, +} + + +class SymmetricMemory: + """An opaque handle to a symmetric heap allocation.""" + + def __init__(self, ptr: int, size: int): + self._ptr = int(ptr) + self._size = int(size) + self._freed = False + + @property + def ptr(self) -> int: + return self._ptr + + @property + def size(self) -> int: + return self._size + + @property + def freed(self) -> bool: + return self._freed + + def _mark_freed(self) -> None: + self._freed = True + + def read(self, size: Optional[int] = None, *, offset: int = 0, pe: Optional[int] = None) -> bytes: + """Fetch bytes from this symmetric allocation into a new ``bytes`` object.""" + from .rma import get + from .init_fini import my_pe + + if pe is None: + pe = my_pe() + length = _normalize_span(self, size=size, offset=offset) + dst = bytearray(length) + get(dst, self, pe=pe, size=length, dest_offset=0, src_offset=offset) + return bytes(dst) + + def write(self, data, *, offset: int = 0, pe: Optional[int] = None) -> int: + """Store bytes from a Python buffer into this symmetric allocation.""" + from .rma import put + from .init_fini import my_pe + + if pe is None: + pe = my_pe() + return put(self, data, pe=pe, dest_offset=offset) + + def __int__(self) -> int: + return self._ptr + + def __repr__(self) -> str: + state = "freed" if self._freed else "active" + return f"SymmetricMemory(ptr=0x{self._ptr:x}, size={self._size}, state={state})" + + +TEAM_INVALID = Team(-1, "TEAM_INVALID") +TEAM_WORLD = Team(0, "TEAM_WORLD") +TEAM_SHARED = Team(1, "TEAM_SHARED") + + +_initialized = False +_finalized = False +_live_allocations: dict[int, SymmetricMemory] = {} + + +def _register_allocation(symm: SymmetricMemory) -> None: + _live_allocations[id(symm)] = symm + + +def _unregister_allocation(symm: SymmetricMemory) -> None: + _live_allocations.pop(id(symm), 
None) + + +def _require_initialized() -> None: + if not _initialized: + raise IshmemStateError("Intel SHMEM is not initialized") + + +def _set_initialized() -> None: + global _initialized + _initialized = True + + +def _set_finalized() -> None: + global _initialized, _finalized + _initialized = False + _finalized = True + + +def _get_init_status() -> InitStatus: + if _initialized: + return InitStatus.INITIALIZED + if _finalized: + return InitStatus.FINALIZED + return InitStatus.UNINITIALIZED + + +def _check_can_init() -> None: + if _initialized: + raise IshmemStateError("Intel SHMEM is already initialized") + if _finalized: + raise IshmemStateError( + "Intel SHMEM has already been finalized in this process and cannot be reinitialized" + ) + + +def _check_can_finalize() -> None: + if not _initialized: + raise IshmemStateError("Intel SHMEM is not initialized") + leaked_ptrs = [ + f"0x{symm.ptr:x}" for symm in sorted(_live_allocations.values(), key=lambda item: item.ptr) + ] + try: + from . 
import torch as _ishmem_torch + + leaked_ptrs.extend(f"0x{ptr:x}" for ptr in _ishmem_torch._live_tensor_pointers()) + except Exception: + pass + if leaked_ptrs: + leaked = ", ".join(leaked_ptrs) + raise IshmemStateError( + "cannot finalize while symmetric allocations are still live; free them first: " f"{leaked}" + ) + + +def _require_active(symm: SymmetricMemory) -> None: + if not isinstance(symm, SymmetricMemory): + raise TypeError("expected a SymmetricMemory instance") + if symm.freed: + raise IshmemStateError("symmetric allocation has already been freed") + + +def _normalize_span(obj, *, size: Optional[int], offset: int) -> int: + if offset < 0: + raise ValueError("offset must be >= 0") + + available = obj.size - offset + if available < 0: + raise ValueError("offset is past the end of the object") + + if size is None: + return available + if size < 0: + raise ValueError("size must be >= 0") + if size > available: + raise ValueError("requested size extends past the end of the object") + return size + + +def _ensure_contiguous_memoryview(obj, *, writable: bool): + mv = memoryview(obj) + if not mv.contiguous: + raise ValueError("buffer must be contiguous") + if writable and mv.readonly: + raise ValueError("destination buffer must be writable") + return mv + + +@dataclass(frozen=True) +class _PointerInfo: + ptr: int + size: int + keepalive: object + + +def _pointer_from_local_buffer(obj, *, writable: bool, offset: int = 0) -> _PointerInfo: + if isinstance(obj, SymmetricMemory): + _require_active(obj) + size = _normalize_span(obj, size=None, offset=offset) + return _PointerInfo(ptr=obj.ptr + offset, size=size, keepalive=obj) + + try: + from . 
import torch as _ishmem_torch + + tensor_info = _ishmem_torch._pointer_from_tensor(obj, writable=writable, offset=offset) + if tensor_info is not None: + return tensor_info + except Exception: + pass + + mv = _ensure_contiguous_memoryview(obj, writable=writable) + + if offset < 0: + raise ValueError("offset must be >= 0") + if offset > mv.nbytes: + raise ValueError("offset is past the end of the buffer") + + if mv.readonly: + if writable: + raise ValueError("destination buffer must be writable") + copied = (ctypes.c_char * mv.nbytes).from_buffer_copy(mv) + return _PointerInfo(ptr=ctypes.addressof(copied) + offset, size=mv.nbytes - offset, keepalive=copied) + + raw = (ctypes.c_char * mv.nbytes).from_buffer(mv) + return _PointerInfo(ptr=ctypes.addressof(raw) + offset, size=mv.nbytes - offset, keepalive=(mv, raw)) + + +def _symmetric_target(obj): + if isinstance(obj, SymmetricMemory): + _require_active(obj) + return obj + + try: + from . import torch as _ishmem_torch + + target = _ishmem_torch._symmetric_tensor_target(obj) + if target is not None: + return target + except Exception: + pass + + raise TypeError("expected a SymmetricMemory instance or a live ishmem4py symmetric tensor") + + +def _check_status(status: int | None, opname: str) -> None: + if status not in (0, None): + raise IshmemError(f"{opname} failed with status {status}") + + +def _normalize_team(team: Team | int | None) -> int: + if team is None: + return TEAM_WORLD.handle + if isinstance(team, Team): + return team.handle + return int(team) + + +def _team_from_handle(handle: int, *, name: Optional[str] = None) -> Team | None: + if int(handle) == TEAM_INVALID.handle: + return None + return Team(int(handle), name) + + +def _normalize_dtype(dtype) -> _DTypeInfo: + if isinstance(dtype, _DTypeInfo): + return dtype + if dtype in _CTYPE_ALIASES: + return _DTYPES[_CTYPE_ALIASES[dtype]] + + if isinstance(dtype, str): + alias = dtype.strip() + if alias.startswith("="): + alias = alias[1:] + normalized = 
_DTYPE_ALIASES.get(alias.lower(), _DTYPE_ALIASES.get(alias)) + if normalized is not None: + return _DTYPES[normalized] + + raise TypeError("unsupported dtype; use one of int32, int64, uint32, uint64, float32, float64") + + +def _require_atomic_dtype(dtype, allowed: frozenset[str], opname: str) -> _DTypeInfo: + info = _normalize_dtype(dtype) + if info.name not in allowed: + supported = ", ".join(sorted(allowed)) + raise TypeError(f"{opname} does not support dtype {info.name}; supported dtypes: {supported}") + return info + + +def _require_reduction_dtype(op: str, dtype) -> _DTypeInfo: + normalized_op = op.strip().lower() + if normalized_op not in _REDUCTION_CODES: + supported = ", ".join(sorted(_REDUCTION_CODES)) + raise ValueError(f"unsupported reduction op {op!r}; supported ops: {supported}") + + info = _normalize_dtype(dtype) + allowed = _REDUCTION_DTYPES[normalized_op] + if info.name not in allowed: + supported = ", ".join(sorted(allowed)) + raise TypeError(f"reduction {normalized_op!r} does not support dtype {info.name}; supported dtypes: {supported}") + return info + + +def _normalize_reduction_op(op: str) -> int: + return _REDUCTION_CODES[op.strip().lower()] + + +def _dtype_element_count(src: SymmetricMemory, dest: SymmetricMemory, dtype, count: Optional[int]) -> tuple[_DTypeInfo, int]: + info = _normalize_dtype(dtype) + available = min(src.size, dest.size) // info.itemsize + if count is None: + return info, available + if count < 0: + raise ValueError("count must be >= 0") + if count > available: + raise ValueError("count exceeds the available symmetric storage") + return info, count + + +def _scalar_to_bits(value, dtype_info: _DTypeInfo) -> int: + c_value = dtype_info.ctype(value) + raw = ctypes.string_at(ctypes.byref(c_value), dtype_info.itemsize) + return int.from_bytes(raw, sys.byteorder, signed=False) + + +def _bits_to_scalar(bits: int, dtype_info: _DTypeInfo): + raw = int(bits).to_bytes(dtype_info.itemsize, sys.byteorder, signed=False) + value = 
dtype_info.ctype.from_buffer_copy(raw) + return value.value diff --git a/ishmem4py/python/ishmem4py/_lib.py b/ishmem4py/python/ishmem4py/_lib.py new file mode 100644 index 0000000..f44b984 --- /dev/null +++ b/ishmem4py/python/ishmem4py/_lib.py @@ -0,0 +1,301 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +import ctypes +import os +from pathlib import Path + +_RUNTIME_ENVVAR = "ISHMEM4PY_RUNTIME_LIBRARY" + + +def _candidate_paths(): + env_path = os.environ.get(_RUNTIME_ENVVAR) + if env_path: + yield Path(env_path) + + package_dir = Path(__file__).resolve().parent + yield package_dir / "_ishmem4py_runtime.so" + + +def _configure_runtime(runtime): + runtime.ishmem4py_init.argtypes = [] + runtime.ishmem4py_init.restype = None + + runtime.ishmem4py_init_with_device.argtypes = [ctypes.c_int] + runtime.ishmem4py_init_with_device.restype = None + + runtime.ishmem4py_finalize.argtypes = [] + runtime.ishmem4py_finalize.restype = None + + runtime.ishmem4py_my_pe.argtypes = [] + runtime.ishmem4py_my_pe.restype = ctypes.c_int + + runtime.ishmem4py_n_pes.argtypes = [] + runtime.ishmem4py_n_pes.restype = ctypes.c_int + + runtime.ishmem4py_info_get_version.argtypes = [ + ctypes.POINTER(ctypes.c_int), + ctypes.POINTER(ctypes.c_int), + ] + runtime.ishmem4py_info_get_version.restype = None + + runtime.ishmem4py_info_get_name.argtypes = [ctypes.c_void_p] + runtime.ishmem4py_info_get_name.restype = None + + runtime.ishmem4py_vendor_get_version.argtypes = [ + ctypes.POINTER(ctypes.c_int), + ctypes.POINTER(ctypes.c_int), + ctypes.POINTER(ctypes.c_int), + ] + runtime.ishmem4py_vendor_get_version.restype = None + + runtime.ishmem4py_barrier_all.argtypes = [] + runtime.ishmem4py_barrier_all.restype = None + + runtime.ishmem4py_sync_all.argtypes = [] + runtime.ishmem4py_sync_all.restype = None + + runtime.ishmem4py_fence.argtypes = [] + runtime.ishmem4py_fence.restype = None + + runtime.ishmem4py_quiet.argtypes = [] 
+ runtime.ishmem4py_quiet.restype = None + + runtime.ishmem4py_malloc.argtypes = [ctypes.c_size_t] + runtime.ishmem4py_malloc.restype = ctypes.c_void_p + + runtime.ishmem4py_calloc.argtypes = [ctypes.c_size_t, ctypes.c_size_t] + runtime.ishmem4py_calloc.restype = ctypes.c_void_p + + runtime.ishmem4py_free.argtypes = [ctypes.c_void_p] + runtime.ishmem4py_free.restype = None + + runtime.ishmem4py_putmem.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_size_t, + ctypes.c_int, + ] + runtime.ishmem4py_putmem.restype = None + + runtime.ishmem4py_getmem.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_size_t, + ctypes.c_int, + ] + runtime.ishmem4py_getmem.restype = None + + runtime.ishmem4py_putmem_on_queue.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_size_t, + ctypes.c_int, + ctypes.c_void_p, + ] + runtime.ishmem4py_putmem_on_queue.restype = None + + runtime.ishmem4py_getmem_on_queue.argtypes = [ + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_size_t, + ctypes.c_int, + ctypes.c_void_p, + ] + runtime.ishmem4py_getmem_on_queue.restype = None + + runtime.ishmem4py_quiet_on_queue.argtypes = [ctypes.c_void_p] + runtime.ishmem4py_quiet_on_queue.restype = None + + runtime.ishmem4py_queue_sync.argtypes = [ctypes.c_void_p] + runtime.ishmem4py_queue_sync.restype = None + + runtime.ishmem4py_ptr.argtypes = [ctypes.c_void_p, ctypes.c_int] + runtime.ishmem4py_ptr.restype = ctypes.c_void_p + + runtime.ishmem4py_team_my_pe.argtypes = [ctypes.c_int] + runtime.ishmem4py_team_my_pe.restype = ctypes.c_int + + runtime.ishmem4py_team_n_pes.argtypes = [ctypes.c_int] + runtime.ishmem4py_team_n_pes.restype = ctypes.c_int + + runtime.ishmem4py_team_translate_pe.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int] + runtime.ishmem4py_team_translate_pe.restype = ctypes.c_int + + runtime.ishmem4py_team_sync.argtypes = [ctypes.c_int] + runtime.ishmem4py_team_sync.restype = ctypes.c_int + + runtime.ishmem4py_team_split_strided.argtypes = [ + ctypes.c_int, 
+ ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_long, + ctypes.POINTER(ctypes.c_int), + ] + runtime.ishmem4py_team_split_strided.restype = ctypes.c_int + + runtime.ishmem4py_team_split_2d.argtypes = [ + ctypes.c_int, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_long, + ctypes.POINTER(ctypes.c_int), + ctypes.c_void_p, + ctypes.c_long, + ctypes.POINTER(ctypes.c_int), + ] + runtime.ishmem4py_team_split_2d.restype = ctypes.c_int + + runtime.ishmem4py_team_destroy.argtypes = [ctypes.c_int] + runtime.ishmem4py_team_destroy.restype = None + + runtime.ishmem4py_broadcastmem.argtypes = [ + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_size_t, + ctypes.c_int, + ] + runtime.ishmem4py_broadcastmem.restype = ctypes.c_int + + runtime.ishmem4py_collectmem.argtypes = [ + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_size_t, + ] + runtime.ishmem4py_collectmem.restype = ctypes.c_int + + runtime.ishmem4py_fcollectmem.argtypes = [ + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_size_t, + ] + runtime.ishmem4py_fcollectmem.restype = ctypes.c_int + + runtime.ishmem4py_alltoallmem.argtypes = [ + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_size_t, + ] + runtime.ishmem4py_alltoallmem.restype = ctypes.c_int + + runtime.ishmem4py_reduce.argtypes = [ + ctypes.c_int, + ctypes.c_int, + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_size_t, + ] + runtime.ishmem4py_reduce.restype = ctypes.c_int + + runtime.ishmem4py_atomic_fetch.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] + runtime.ishmem4py_atomic_fetch.restype = ctypes.c_uint64 + + runtime.ishmem4py_atomic_set.argtypes = [ + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_uint64, + ctypes.c_int, + ] + runtime.ishmem4py_atomic_set.restype = None + + runtime.ishmem4py_atomic_swap.argtypes = [ + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_uint64, + ctypes.c_int, + ] + runtime.ishmem4py_atomic_swap.restype = ctypes.c_uint64 + 
+ runtime.ishmem4py_atomic_compare_swap.argtypes = [ + ctypes.c_int, + ctypes.c_void_p, + ctypes.c_uint64, + ctypes.c_uint64, + ctypes.c_int, + ] + runtime.ishmem4py_atomic_compare_swap.restype = ctypes.c_uint64 + + runtime.ishmem4py_atomic_fetch_inc.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] + runtime.ishmem4py_atomic_fetch_inc.restype = ctypes.c_uint64 + + runtime.ishmem4py_atomic_inc.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_int] + runtime.ishmem4py_atomic_inc.restype = None + + for name in ( + "ishmem4py_atomic_fetch_add", + "ishmem4py_atomic_fetch_and", + "ishmem4py_atomic_fetch_or", + "ishmem4py_atomic_fetch_xor", + ): + func = getattr(runtime, name) + func.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_uint64, ctypes.c_int] + func.restype = ctypes.c_uint64 + + for name in ( + "ishmem4py_atomic_add", + "ishmem4py_atomic_and", + "ishmem4py_atomic_or", + "ishmem4py_atomic_xor", + ): + func = getattr(runtime, name) + func.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_uint64, ctypes.c_int] + func.restype = None + + runtime.ishmem4py_cast_ptr_to_uintptr.argtypes = [ctypes.c_void_p] + runtime.ishmem4py_cast_ptr_to_uintptr.restype = ctypes.c_uint64 + + runtime.ishmem4py_cast_uintptr_to_ptr.argtypes = [ctypes.c_uint64] + runtime.ishmem4py_cast_uintptr_to_ptr.restype = ctypes.c_void_p + + return runtime + + +def _load_runtime(): + last_error = None + for candidate in _candidate_paths(): + if not candidate.exists(): + continue + try: + runtime = ctypes.CDLL(str(candidate)) + except OSError as exc: + last_error = exc + continue + return _configure_runtime(runtime) + + if last_error is not None: + raise RuntimeError( + "Found ishmem4py runtime library candidate but failed to load it. " + "Ensure oneAPI runtime libraries are in LD_LIBRARY_PATH and set " + f"{_RUNTIME_ENVVAR} if needed." + ) from last_error + + raise RuntimeError( + "Could not locate _ishmem4py_runtime.so. 
For a standard install, build ishmem with " + "-DBUILD_PYTHON_BINDINGS=ON and pip install the build-tree package directory. " + f"For an editable/dev install, set {_RUNTIME_ENVVAR} to the built runtime library." + ) + + +class _RuntimeProxy: + def __init__(self): + self._runtime = None + + def _get_runtime(self): + if self._runtime is None: + self._runtime = _load_runtime() + return self._runtime + + def __getattr__(self, name): + return getattr(self._get_runtime(), name) + + +RUNTIME = _RuntimeProxy() diff --git a/ishmem4py/python/ishmem4py/collective.py b/ishmem4py/python/ishmem4py/collective.py new file mode 100644 index 0000000..4c9c868 --- /dev/null +++ b/ishmem4py/python/ishmem4py/collective.py @@ -0,0 +1,288 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +"""Host-side collective operations over symmetric allocations.""" + +from __future__ import annotations + +import ctypes +from typing import Optional + +from ._common import ( + IshmemError, + SymmetricMemory, + TEAM_WORLD, + _check_status, + _dtype_element_count, + _normalize_reduction_op, + _normalize_span, + _normalize_team, + _require_active, + _require_initialized, + _require_reduction_dtype, +) +from ._lib import RUNTIME +from .teams import team_my_pe, team_n_pes, team_sync + +__all__ = [ + "alltoall", + "barrier", + "barrier_all", + "broadcast", + "collect", + "fcollect", + "reduce", + "reducescatter", + "sync", + "sync_all", +] + + +def _collective_span( + dest: SymmetricMemory, + src: SymmetricMemory, + *, + size: Optional[int], + dest_offset: int, + src_offset: int, +) -> tuple[int, int, int]: + _require_active(dest) + _require_active(src) + dest_size = _normalize_span(dest, size=None, offset=dest_offset) + src_size = _normalize_span(src, size=None, offset=src_offset) + nbytes = min(dest_size, src_size) if size is None else size + if nbytes < 0: + raise ValueError("size must be >= 0") + if nbytes > dest_size: + raise ValueError("size extends past the 
destination symmetric object") + if nbytes > src_size: + raise ValueError("size extends past the source symmetric object") + return dest_size, src_size, nbytes + + +def barrier_all() -> None: + """Block until all PEs in ``TEAM_WORLD`` arrive.""" + _require_initialized() + RUNTIME.ishmem4py_barrier_all() + + +def sync_all() -> None: + """Synchronize all PEs in ``TEAM_WORLD``.""" + _require_initialized() + RUNTIME.ishmem4py_sync_all() + + +def sync(team=None) -> None: + """Synchronize all PEs in ``team``.""" + team_sync(team) + + +def barrier(team=None) -> None: + """Barrier on ``team``.""" + if team is None or _normalize_team(team) == TEAM_WORLD.handle: + barrier_all() + return + team_sync(team) + + +def broadcast( + dest: SymmetricMemory, + src: SymmetricMemory, + *, + root: int, + team=None, + size: Optional[int] = None, + dest_offset: int = 0, + src_offset: int = 0, +) -> int: + """Broadcast ``src`` from ``root`` into ``dest`` across ``team``.""" + _require_initialized() + _, _, nbytes = _collective_span(dest, src, size=size, dest_offset=dest_offset, src_offset=src_offset) + status = RUNTIME.ishmem4py_broadcastmem( + _normalize_team(team), + ctypes.c_void_p(dest.ptr + dest_offset), + ctypes.c_void_p(src.ptr + src_offset), + nbytes, + root, + ) + _check_status(status, "ishmem_broadcastmem") + return nbytes + + +def collect( + dest: SymmetricMemory, + src: SymmetricMemory, + *, + team=None, + size: Optional[int] = None, + dest_offset: int = 0, + src_offset: int = 0, +) -> int: + """Concatenate each PE's contribution from ``src`` into ``dest``.""" + _require_initialized() + participants = team_n_pes(team) + dest_size = _normalize_span(dest, size=None, offset=dest_offset) + src_size = _normalize_span(src, size=None, offset=src_offset) + per_pe = min(src_size, dest_size // max(1, participants)) if size is None else size + if per_pe < 0: + raise ValueError("size must be >= 0") + if per_pe > src_size: + raise ValueError("size extends past the source symmetric object") + 
if per_pe * max(1, participants) > dest_size: + raise ValueError("destination symmetric object is too small for collect") + status = RUNTIME.ishmem4py_collectmem( + _normalize_team(team), + ctypes.c_void_p(dest.ptr + dest_offset), + ctypes.c_void_p(src.ptr + src_offset), + per_pe, + ) + _check_status(status, "ishmem_collectmem") + return per_pe + + +def fcollect( + dest: SymmetricMemory, + src: SymmetricMemory, + *, + team=None, + size: Optional[int] = None, + dest_offset: int = 0, + src_offset: int = 0, +) -> int: + """Fixed-size collect from ``src`` into ``dest``.""" + _require_initialized() + participants = team_n_pes(team) + dest_size = _normalize_span(dest, size=None, offset=dest_offset) + src_size = _normalize_span(src, size=None, offset=src_offset) + per_pe = min(src_size, dest_size // max(1, participants)) if size is None else size + if per_pe < 0: + raise ValueError("size must be >= 0") + if per_pe > src_size: + raise ValueError("size extends past the source symmetric object") + if per_pe * max(1, participants) > dest_size: + raise ValueError("destination symmetric object is too small for fcollect") + status = RUNTIME.ishmem4py_fcollectmem( + _normalize_team(team), + ctypes.c_void_p(dest.ptr + dest_offset), + ctypes.c_void_p(src.ptr + src_offset), + per_pe, + ) + _check_status(status, "ishmem_fcollectmem") + return per_pe + + +def alltoall( + dest: SymmetricMemory, + src: SymmetricMemory, + *, + team=None, + size: Optional[int] = None, + dest_offset: int = 0, + src_offset: int = 0, +) -> int: + """Exchange equally sized blocks among all PEs in ``team``.""" + _require_initialized() + participants = team_n_pes(team) + if participants <= 0: + raise IshmemError("alltoall requires a valid team") + dest_size = _normalize_span(dest, size=None, offset=dest_offset) + src_size = _normalize_span(src, size=None, offset=src_offset) + per_pe = min(src_size // participants, dest_size // participants) if size is None else size + if per_pe < 0: + raise ValueError("size 
must be >= 0") + if per_pe * participants > src_size: + raise ValueError("source symmetric object is too small for alltoall") + if per_pe * participants > dest_size: + raise ValueError("destination symmetric object is too small for alltoall") + status = RUNTIME.ishmem4py_alltoallmem( + _normalize_team(team), + ctypes.c_void_p(dest.ptr + dest_offset), + ctypes.c_void_p(src.ptr + src_offset), + per_pe, + ) + _check_status(status, "ishmem_alltoallmem") + return per_pe + + +def reduce( + op: str, + dest: SymmetricMemory, + src: SymmetricMemory, + *, + dtype, + count: Optional[int] = None, + team=None, +) -> int: + """Reduce ``src`` into ``dest`` across ``team`` using ``op`` and ``dtype``.""" + _require_initialized() + _require_active(dest) + _require_active(src) + info = _require_reduction_dtype(op, dtype) + _, count = _dtype_element_count(src, dest, info, count) + status = RUNTIME.ishmem4py_reduce( + _normalize_reduction_op(op), + info.code, + _normalize_team(team), + ctypes.c_void_p(dest.ptr), + ctypes.c_void_p(src.ptr), + count, + ) + _check_status(status, f"ishmem_{info.name}_{op}_reduce") + return count + + +def reducescatter( + op: str, + dest: SymmetricMemory, + src: SymmetricMemory, + *, + dtype, + count: Optional[int] = None, + team=None, +) -> int: + """Reduce ``src`` across ``team`` and return this PE's block in ``dest``. + + Intel SHMEM does not currently expose a host-side reducescatter routine. + ``ishmem4py`` therefore implements the operation as a small software + fallback: perform a full ``reduce`` into a temporary symmetric buffer, then + copy the local team's slice into ``dest``. 
+ """ + from .init_fini import my_pe + from .memory import free, malloc + from .rma import get + + _require_initialized() + _require_active(dest) + _require_active(src) + info = _require_reduction_dtype(op, dtype) + participants = team_n_pes(team) + if participants <= 0: + raise IshmemError("reducescatter requires a valid team") + + dest_available = dest.size // info.itemsize + src_available = src.size // info.itemsize + if count is None: + count = min(dest_available, src_available // participants) + if count < 0: + raise ValueError("count must be >= 0") + if count > dest_available: + raise ValueError("count exceeds the available destination symmetric storage") + + total_count = count * participants + if total_count > src_available: + raise ValueError("count exceeds the available source symmetric storage for reducescatter") + + scratch = malloc(total_count * info.itemsize) + try: + reduce(op, scratch, src, dtype=info, count=total_count, team=team) + get( + dest, + scratch, + pe=my_pe(), + size=count * info.itemsize, + src_offset=team_my_pe(team) * count * info.itemsize, + ) + finally: + free(scratch) + + return count diff --git a/ishmem4py/python/ishmem4py/core.py b/ishmem4py/python/ishmem4py/core.py new file mode 100644 index 0000000..cb427c8 --- /dev/null +++ b/ishmem4py/python/ishmem4py/core.py @@ -0,0 +1,85 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +"""Aggregated public ``ishmem4py`` API.""" + +from . 
import collective, init_fini, memory, rma, teams +from ._common import ( + Comparison, + InitStatus, + IshmemError, + IshmemStateError, + MemoryPointer, + SignalOp, + SymmetricMemory, + TEAM_INVALID, + TEAM_SHARED, + TEAM_WORLD, + Team, + Version, +) +from .collective import * +from .init_fini import * +from .memory import * +from .rma import * +from .teams import * + +__all__ = [ + "Comparison", + "InitStatus", + "IshmemError", + "IshmemStateError", + "MemoryPointer", + "SignalOp", + "SymmetricMemory", + "TEAM_INVALID", + "TEAM_SHARED", + "TEAM_WORLD", + "Team", + "Version", +] + +__all__ += collective.__all__ +__all__ += init_fini.__all__ +__all__ += memory.__all__ +__all__ += rma.__all__ +__all__ += teams.__all__ + + +def tensor(*args, **kwargs): + from . import torch as torch_interop + + return torch_interop.tensor(*args, **kwargs) + + +def free_tensor(*args, **kwargs): + from . import torch as torch_interop + + return torch_interop.free_tensor(*args, **kwargs) + + +def get_peer_tensor(*args, **kwargs): + from . import torch as torch_interop + + return torch_interop.get_peer_tensor(*args, **kwargs) + + +def tensor_base(*args, **kwargs): + from . import torch as torch_interop + + return torch_interop.tensor_base(*args, **kwargs) + + +def is_symmetric_tensor(*args, **kwargs): + from . 
import torch as torch_interop + + return torch_interop.is_symmetric_tensor(*args, **kwargs) + + +__all__ += [ + "free_tensor", + "get_peer_tensor", + "is_symmetric_tensor", + "tensor", + "tensor_base", +] diff --git a/ishmem4py/python/ishmem4py/init_fini.py b/ishmem4py/python/ishmem4py/init_fini.py new file mode 100644 index 0000000..58d4312 --- /dev/null +++ b/ishmem4py/python/ishmem4py/init_fini.py @@ -0,0 +1,173 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +"""Initialization, finalization, and runtime queries.""" + +from __future__ import annotations + +import ctypes +from typing import Any + +from ._common import ( + InitStatus, + Version, + _check_can_finalize, + _check_can_init, + _get_init_status, + _require_initialized, + _set_finalized, + _set_initialized, +) +from ._lib import RUNTIME +from .version import __version__ + +_INFO_NAME_BYTES = 256 + +__all__ = [ + "InitStatus", + "Version", + "finalize", + "get_version", + "info_get_name", + "info_get_version", + "init", + "init_status", + "is_initialized", + "my_pe", + "n_pes", +] + + +def init(*, device_id: Any = None) -> None: + """Initialize Intel SHMEM for the current process. + + When ``device_id`` is provided, Intel SHMEM selects that visible XPU ordinal + through ``ishmemx_init_attr``. The default ``None`` preserves the legacy + single-visible-device behavior. 
+ """ + _check_can_init() + normalized_device_id = _normalize_device_id(device_id) + if normalized_device_id is None: + RUNTIME.ishmem4py_init() + else: + RUNTIME.ishmem4py_init_with_device(normalized_device_id) + _set_initialized() + + +def _normalize_device_id(device_id: Any) -> int | None: + if device_id is None: + return None + + if isinstance(device_id, bool): + raise TypeError("device_id must be an int, an XPU device string, a torch.device, or None") + + if isinstance(device_id, int): + if device_id < -1: + raise ValueError("device_id must be >= -1") + return int(device_id) + + if isinstance(device_id, str): + if device_id == "xpu": + return _current_torch_xpu_device() + if device_id.startswith("xpu:"): + index_str = device_id.split(":", 1)[1] + if not index_str: + raise ValueError("device string 'xpu:' is missing a device index") + try: + index = int(index_str) + except ValueError as exc: + raise ValueError(f"invalid XPU device string: {device_id!r}") from exc + if index < 0: + raise ValueError("device_id must be >= -1") + return index + raise ValueError(f"unsupported device string for ishmem.init: {device_id!r}") + + torch = _import_torch() + if torch is not None and isinstance(device_id, torch.device): + if device_id.type != "xpu": + raise ValueError(f"unsupported torch.device for ishmem.init: {device_id}") + if device_id.index is None: + return _current_torch_xpu_device() + if device_id.index < 0: + raise ValueError("device_id must be >= -1") + return int(device_id.index) + + raise TypeError("device_id must be an int, an XPU device string, a torch.device, or None") + + +def _import_torch(): + try: + import torch + except Exception: + return None + return torch + + +def _current_torch_xpu_device() -> int: + torch = _import_torch() + if torch is None: + raise RuntimeError("normalizing an XPU device requires PyTorch to be installed") + return int(torch.xpu.current_device()) + + +def finalize() -> None: + """Finalize Intel SHMEM for the current process.""" + 
_check_can_finalize() + RUNTIME.ishmem4py_finalize() + _set_finalized() + + +def init_status() -> InitStatus: + """Return the process-local initialization state.""" + return _get_init_status() + + +def is_initialized() -> bool: + """Return ``True`` if Intel SHMEM is currently initialized.""" + return init_status() == InitStatus.INITIALIZED + + +def my_pe() -> int: + """Return the calling PE's global rank.""" + _require_initialized() + return int(RUNTIME.ishmem4py_my_pe()) + + +def n_pes() -> int: + """Return the number of PEs in ``TEAM_WORLD``.""" + _require_initialized() + return int(RUNTIME.ishmem4py_n_pes()) + + +def info_get_version() -> tuple[int, int]: + """Return the OpenSHMEM specification version reported by the runtime.""" + major = ctypes.c_int() + minor = ctypes.c_int() + RUNTIME.ishmem4py_info_get_version(ctypes.byref(major), ctypes.byref(minor)) + return int(major.value), int(minor.value) + + +def info_get_name() -> str: + """Return the runtime vendor name.""" + name = ctypes.create_string_buffer(_INFO_NAME_BYTES) + RUNTIME.ishmem4py_info_get_name(ctypes.cast(name, ctypes.c_void_p)) + return name.value.decode("utf-8") + + +def get_version() -> Version: + """Return combined OpenSHMEM, Intel SHMEM, and ``ishmem4py`` version metadata.""" + spec_major, spec_minor = info_get_version() + lib_major = ctypes.c_int() + lib_minor = ctypes.c_int() + lib_patch = ctypes.c_int() + RUNTIME.ishmem4py_vendor_get_version( + ctypes.byref(lib_major), + ctypes.byref(lib_minor), + ctypes.byref(lib_patch), + ) + return Version( + openshmem_spec_version=f"{spec_major}.{spec_minor}", + ishmem4py_version=__version__, + libishmem_version=f"{lib_major.value}.{lib_minor.value}.{lib_patch.value}", + vendor_name=info_get_name(), + ) diff --git a/ishmem4py/python/ishmem4py/memory.py b/ishmem4py/python/ishmem4py/memory.py new file mode 100644 index 0000000..048ed9b --- /dev/null +++ b/ishmem4py/python/ishmem4py/memory.py @@ -0,0 +1,88 @@ +# Copyright (C) 2026 Intel Corporation +# 
SPDX-License-Identifier: BSD-3-Clause + +"""Symmetric memory allocation and address helpers.""" + +from __future__ import annotations + +import ctypes + +from ._common import ( + IshmemError, + MemoryPointer, + SymmetricMemory, + _register_allocation, + _require_active, + _require_initialized, + _unregister_allocation, +) +from ._lib import RUNTIME + +__all__ = [ + "MemoryPointer", + "SymmetricMemory", + "buffer", + "calloc", + "free", + "ishmem_ptr", + "malloc", + "ptr", +] + + +def malloc(size: int) -> SymmetricMemory: + """Allocate ``size`` bytes from the symmetric heap.""" + _require_initialized() + if size < 0: + raise ValueError("size must be >= 0") + ptr_value = int(RUNTIME.ishmem4py_malloc(size) or 0) + if size > 0 and not ptr_value: + raise IshmemError(f"ishmem_malloc({size}) returned NULL") + result = SymmetricMemory(ptr=ptr_value, size=size) + _register_allocation(result) + return result + + +def calloc(count: int, size: int) -> SymmetricMemory: + """Allocate ``count * size`` zeroed bytes from the symmetric heap.""" + _require_initialized() + if count < 0 or size < 0: + raise ValueError("count and size must be >= 0") + total_size = count * size + ptr_value = int(RUNTIME.ishmem4py_calloc(count, size) or 0) + if total_size > 0 and not ptr_value: + raise IshmemError(f"ishmem_calloc({count}, {size}) returned NULL") + result = SymmetricMemory(ptr=ptr_value, size=total_size) + _register_allocation(result) + return result + + +def buffer(size: int, *, zero: bool = False) -> SymmetricMemory: + """Allocate a symmetric byte buffer.""" + if zero: + return calloc(1, size) + return malloc(size) + + +def free(symm: SymmetricMemory) -> None: + """Free a symmetric allocation previously returned by ``malloc`` or ``calloc``.""" + _require_initialized() + _require_active(symm) + RUNTIME.ishmem4py_free(ctypes.c_void_p(symm.ptr)) + _unregister_allocation(symm) + symm._mark_freed() + + +def ptr(symm: SymmetricMemory, *, pe: int) -> MemoryPointer | None: + """Return a directly 
addressable pointer for ``symm`` on ``pe`` when available.""" + _require_initialized() + _require_active(symm) + address = int(RUNTIME.ishmem4py_ptr(ctypes.c_void_p(symm.ptr), pe) or 0) + if address == 0: + return None + return MemoryPointer(address=address, size=symm.size, pe=pe, base=symm) + + +def ishmem_ptr(symm: SymmetricMemory, *, pe: int) -> MemoryPointer | None: + """Alias for ``ptr`` for Intel SHMEM naming compatibility.""" + return ptr(symm, pe=pe) diff --git a/ishmem4py/python/ishmem4py/rma.py b/ishmem4py/python/ishmem4py/rma.py new file mode 100644 index 0000000..5f25bbb --- /dev/null +++ b/ishmem4py/python/ishmem4py/rma.py @@ -0,0 +1,156 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +"""Host-initiated one-sided RMA operations.""" + +from __future__ import annotations + +import ctypes +from typing import Optional + +from ._common import ( + SymmetricMemory, + _normalize_span, + _pointer_from_local_buffer, + _require_initialized, + _symmetric_target, +) +from ._lib import RUNTIME + +__all__ = [ + "fence", + "get", + "getmem", + "put", + "putmem", + "quiet", +] + + +def _queue_handle(queue) -> int: + if queue is None: + raise ValueError("queue must not be None") + if isinstance(queue, int): + handle = int(queue) + elif isinstance(queue, ctypes.c_void_p): + handle = int(queue.value or 0) + elif hasattr(queue, "sycl_queue"): + handle = int(queue.sycl_queue) + elif hasattr(queue, "_as_parameter_"): + handle = int(getattr(queue._as_parameter_, "value", queue._as_parameter_)) + else: + raise TypeError("queue must be a torch.xpu.Stream, ctypes.c_void_p, or integer queue pointer") + if handle == 0: + raise ValueError("queue pointer must be non-zero") + return handle + + +def fence() -> None: + """Order previously issued put-like operations.""" + _require_initialized() + RUNTIME.ishmem4py_fence() + + +def quiet(queue=None) -> None: + """Wait for completion of previously issued RMA operations.""" + _require_initialized() + 
if queue is not None: + queue_ptr = ctypes.c_void_p(_queue_handle(queue)) + RUNTIME.ishmem4py_queue_sync(queue_ptr) + RUNTIME.ishmem4py_quiet() + return + RUNTIME.ishmem4py_quiet() + + +def put( + dest: SymmetricMemory, + src, + *, + pe: int, + size: Optional[int] = None, + dest_offset: int = 0, + src_offset: int = 0, + queue=None, +) -> int: + """Copy bytes from a local buffer into symmetric memory on ``pe``.""" + _require_initialized() + dest = _symmetric_target(dest) + src_info = _pointer_from_local_buffer(src, writable=False, offset=src_offset) + dest_size = _normalize_span(dest, size=None, offset=dest_offset) + nbytes = min(dest_size, src_info.size) if size is None else size + + if nbytes < 0: + raise ValueError("size must be >= 0") + if nbytes > dest_size: + raise ValueError("size extends past the destination symmetric object") + if nbytes > src_info.size: + raise ValueError("size extends past the source buffer") + + if queue is None: + RUNTIME.ishmem4py_putmem( + ctypes.c_void_p(dest.ptr + dest_offset), + ctypes.c_void_p(src_info.ptr), + nbytes, + pe, + ) + else: + RUNTIME.ishmem4py_putmem_on_queue( + ctypes.c_void_p(dest.ptr + dest_offset), + ctypes.c_void_p(src_info.ptr), + nbytes, + pe, + ctypes.c_void_p(_queue_handle(queue)), + ) + return nbytes + + +def get( + dest, + src: SymmetricMemory, + *, + pe: int, + size: Optional[int] = None, + dest_offset: int = 0, + src_offset: int = 0, + queue=None, +) -> int: + """Copy bytes from symmetric memory on ``pe`` into a local buffer.""" + _require_initialized() + src = _symmetric_target(src) + dest_info = _pointer_from_local_buffer(dest, writable=True, offset=dest_offset) + src_size = _normalize_span(src, size=None, offset=src_offset) + nbytes = min(dest_info.size, src_size) if size is None else size + + if nbytes < 0: + raise ValueError("size must be >= 0") + if nbytes > dest_info.size: + raise ValueError("size extends past the destination buffer") + if nbytes > src_size: + raise ValueError("size extends past the 
source symmetric object") + + if queue is None: + RUNTIME.ishmem4py_getmem( + ctypes.c_void_p(dest_info.ptr), + ctypes.c_void_p(src.ptr + src_offset), + nbytes, + pe, + ) + else: + RUNTIME.ishmem4py_getmem_on_queue( + ctypes.c_void_p(dest_info.ptr), + ctypes.c_void_p(src.ptr + src_offset), + nbytes, + pe, + ctypes.c_void_p(_queue_handle(queue)), + ) + return nbytes + + +def putmem(*args, **kwargs) -> int: + """Alias for ``put`` for Intel SHMEM naming compatibility.""" + return put(*args, **kwargs) + + +def getmem(*args, **kwargs) -> int: + """Alias for ``get`` for Intel SHMEM naming compatibility.""" + return get(*args, **kwargs) diff --git a/ishmem4py/python/ishmem4py/teams.py b/ishmem4py/python/ishmem4py/teams.py new file mode 100644 index 0000000..ae5d79d --- /dev/null +++ b/ishmem4py/python/ishmem4py/teams.py @@ -0,0 +1,56 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +"""Team queries and synchronization helpers.""" + +from __future__ import annotations + +from ._common import ( + TEAM_SHARED, + TEAM_WORLD, + Team, + _check_status, + _normalize_team, + _require_initialized, +) +from ._lib import RUNTIME + +__all__ = [ + "TEAM_SHARED", + "TEAM_WORLD", + "Team", + "team_my_pe", + "team_n_pes", + "team_sync", + "team_translate_pe", +] + + +def team_my_pe(team: Team | int | None = None) -> int: + """Return the calling PE's rank within ``team``.""" + _require_initialized() + return int(RUNTIME.ishmem4py_team_my_pe(_normalize_team(team))) + + +def team_n_pes(team: Team | int | None = None) -> int: + """Return the number of PEs in ``team``.""" + _require_initialized() + return int(RUNTIME.ishmem4py_team_n_pes(_normalize_team(team))) + + +def team_translate_pe(src_team: Team | int | None, src_pe: int, dest_team: Team | int | None) -> int: + """Translate ``src_pe`` from ``src_team`` numbering into ``dest_team`` numbering.""" + _require_initialized() + return int( + RUNTIME.ishmem4py_team_translate_pe( + _normalize_team(src_team), 
+ src_pe, + _normalize_team(dest_team), + ) + ) + + +def team_sync(team: Team | int | None = None) -> None: + """Synchronize all PEs in ``team``.""" + _require_initialized() + _check_status(RUNTIME.ishmem4py_team_sync(_normalize_team(team)), "ishmem_team_sync") diff --git a/ishmem4py/python/ishmem4py/torch.py b/ishmem4py/python/ishmem4py/torch.py new file mode 100644 index 0000000..fb2c52f --- /dev/null +++ b/ishmem4py/python/ishmem4py/torch.py @@ -0,0 +1,287 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +"""Optional Torch/XPU interoperability helpers for ``ishmem4py``.""" + +from __future__ import annotations + +import ctypes +from dataclasses import dataclass +from typing import Iterable + +from ._common import IshmemStateError, _PointerInfo, _require_initialized +from ._lib import RUNTIME + +try: + import torch as _torch +except Exception: + _torch = None + +try: + from . import _ishmem4py_torch as _TORCH_EXT +except Exception: + _TORCH_EXT = None + +__all__ = [ + "free_tensor", + "get_peer_tensor", + "is_symmetric_tensor", + "tensor", + "tensor_base", +] + + +_DTYPE_TO_SCALAR_TYPE = { + _torch.uint8 if _torch is not None else object(): 0, + _torch.int8 if _torch is not None else object(): 1, + _torch.int16 if _torch is not None else object(): 2, + _torch.int32 if _torch is not None else object(): 3, + _torch.int64 if _torch is not None else object(): 4, + _torch.float16 if _torch is not None else object(): 5, + _torch.float32 if _torch is not None else object(): 6, + _torch.float64 if _torch is not None else object(): 7, + _torch.bool if _torch is not None else object(): 11, + _torch.bfloat16 if _torch is not None else object(): 15, +} + + +@dataclass +class _TensorAllocation: + tensor: "_torch.Tensor" + ptr: int + size_bytes: int + + +_live_tensor_allocations: dict[int, _TensorAllocation] = {} + + +def _require_torch() -> None: + if _torch is None: + raise RuntimeError("Torch/XPU interop requires PyTorch to be 
installed") + if _TORCH_EXT is None: + raise RuntimeError( + "Torch/XPU interop is unavailable in this ishmem4py build; rebuild ishmem with " + "-DISHMEM4PY_BUILD_TORCH_INTEROP=ON using a Python executable from a PyTorch XPU environment" + ) + + +def _normalize_shape(size) -> tuple[int, ...]: + if isinstance(size, int): + return (int(size),) + if isinstance(size, Iterable): + return tuple(int(dim) for dim in size) + raise TypeError("size must be an int or an iterable of ints") + + +def _normalize_device(device) -> "_torch.device": + if _torch is None: + raise RuntimeError("PyTorch is not available") + if device is None: + return _torch.device("xpu", _torch.xpu.current_device()) + normalized = _torch.device(device) + if normalized.type != "xpu": + raise ValueError(f"ishmem4py.torch.tensor only supports XPU devices, got {normalized}") + if normalized.index is None: + return _torch.device("xpu", _torch.xpu.current_device()) + return normalized + + +def _tensor_nbytes(tensor: "_torch.Tensor") -> int: + return int(tensor.numel()) * int(tensor.element_size()) + + +def _register_tensor(base: "_torch.Tensor") -> None: + base_ptr = int(base.data_ptr()) + _live_tensor_allocations[base_ptr] = _TensorAllocation( + tensor=base, + ptr=base_ptr, + size_bytes=_tensor_nbytes(base), + ) + base._ishmem_alloc = True # type: ignore[attr-defined] + base._ishmem_base = base # type: ignore[attr-defined] + base._ishmem_free_owner = True # type: ignore[attr-defined] + + +def _allocation_from_base(base: "_torch.Tensor") -> _TensorAllocation: + allocation = _live_tensor_allocations.get(int(base.data_ptr())) + if allocation is None: + raise IshmemStateError("symmetric tensor allocation is not live") + return allocation + + +def _find_allocation_for_tensor(tensor: "_torch.Tensor") -> _TensorAllocation | None: + ptr = int(tensor.data_ptr()) + size_bytes = _tensor_nbytes(tensor) + for allocation in _live_tensor_allocations.values(): + start = allocation.ptr + end = allocation.ptr + 
allocation.size_bytes + if ptr < start: + continue + if ptr > end: + continue + if size_bytes > 0 and ptr + size_bytes > end: + continue + return allocation + return None + + +def _assert_contiguous_tensor(tensor: "_torch.Tensor") -> None: + if tensor.numel() != 0 and not tensor.is_contiguous(): + raise ValueError("tensor must be contiguous") + + +def _scalar_type_from_dtype(dtype) -> int: + scalar_type = _DTYPE_TO_SCALAR_TYPE.get(dtype) + if scalar_type is None: + raise TypeError(f"unsupported torch dtype for ishmem4py Torch interop: {dtype}") + return scalar_type + + +def _remote_tensor_ptr(base: "_torch.Tensor", *, pe: int) -> int: + address = int(RUNTIME.ishmem4py_ptr(ctypes.c_void_p(int(base.data_ptr())), int(pe)) or 0) + if address == 0: + raise IshmemStateError(f"ishmem_ptr returned NULL for PE {pe}") + return address + + +def _mark_tensor_alias(tensor: "_torch.Tensor", *, base: "_torch.Tensor", owner: bool) -> None: + tensor._ishmem_alias = True # type: ignore[attr-defined] + tensor._ishmem_alloc = True # type: ignore[attr-defined] + tensor._ishmem_base = base # type: ignore[attr-defined] + tensor._ishmem_free_owner = owner # type: ignore[attr-defined] + + +def _pointer_from_tensor(tensor, *, writable: bool, offset: int = 0) -> _PointerInfo | None: + if _torch is None or not isinstance(tensor, _torch.Tensor): + return None + _assert_contiguous_tensor(tensor) + size_bytes = _tensor_nbytes(tensor) + if offset < 0: + raise ValueError("offset must be >= 0") + if offset > size_bytes: + raise ValueError("offset is past the end of the tensor") + if writable and not tensor.is_leaf and tensor.requires_grad: + raise ValueError("destination tensor must be writable") + return _PointerInfo(ptr=int(tensor.data_ptr()) + offset, size=size_bytes - offset, keepalive=tensor) + + +def _symmetric_tensor_target(obj) -> _PointerInfo | None: + if _torch is None or not isinstance(obj, _torch.Tensor): + return None + _assert_contiguous_tensor(obj) + allocation = 
_find_allocation_for_tensor(obj) + if allocation is None: + return None + return _PointerInfo(ptr=int(obj.data_ptr()), size=_tensor_nbytes(obj), keepalive=obj) + + +def _live_tensor_pointers() -> list[int]: + return sorted(_live_tensor_allocations) + + +def tensor( + size, + *, + dtype=None, + device=None, + requires_grad: bool = False, +): + """Allocate a symmetric XPU tensor from the Intel SHMEM heap.""" + _require_initialized() + _require_torch() + shape = _normalize_shape(size) + if dtype is None: + dtype = _torch.get_default_dtype() + scalar_type = _scalar_type_from_dtype(dtype) + + normalized_device = _normalize_device(device) + with _torch.xpu.device(normalized_device): + result = _TORCH_EXT.alloc_tensor(shape, scalar_type, normalized_device.index) + + if result.numel() > 0: + _register_tensor(result) + else: + result._ishmem_alloc = True # type: ignore[attr-defined] + result._ishmem_base = result # type: ignore[attr-defined] + result._ishmem_free_owner = True # type: ignore[attr-defined] + if requires_grad: + result.requires_grad_(True) + return result + + +def tensor_base(tensor): + """Return the base symmetric allocation for ``tensor`` when available.""" + if _torch is None or not isinstance(tensor, _torch.Tensor): + raise TypeError("expected a torch.Tensor") + base = getattr(tensor, "_ishmem_base", None) + if base is not None: + return base + allocation = _find_allocation_for_tensor(tensor) + if allocation is None: + return tensor + return allocation.tensor + + +def get_peer_tensor(tensor, pe: int): + """Return an XPU tensor alias for ``tensor`` on remote PE ``pe``.""" + _require_initialized() + _require_torch() + if not isinstance(tensor, _torch.Tensor): + raise TypeError("expected a torch.Tensor") + if tensor.device.type != "xpu": + raise ValueError(f"get_peer_tensor only supports XPU tensors, got {tensor.device}") + if tensor.device.index is None: + raise ValueError("get_peer_tensor requires a concrete XPU device index") + + 
_assert_contiguous_tensor(tensor) + base = tensor_base(tensor) + if not is_symmetric_tensor(base): + raise IshmemStateError("tensor is not backed by ishmem4py symmetric memory") + + byte_offset = int(tensor.data_ptr()) - int(base.data_ptr()) + if byte_offset < 0: + raise IshmemStateError("tensor points before its tracked symmetric base") + remote_ptr = _remote_tensor_ptr(base, pe=pe) + byte_offset + result = _TORCH_EXT.tensor_from_ptr( + remote_ptr, + list(tensor.shape), + _scalar_type_from_dtype(tensor.dtype), + tensor.device.index, + ) + _mark_tensor_alias(result, base=base, owner=False) + result._ishmem_peer_pe = int(pe) # type: ignore[attr-defined] + result._ishmem_keepalive = tensor # type: ignore[attr-defined] + return result + + +def is_symmetric_tensor(tensor) -> bool: + """Return ``True`` when ``tensor`` references live symmetric memory.""" + if _torch is None or not isinstance(tensor, _torch.Tensor): + return False + return _find_allocation_for_tensor(tensor) is not None or bool( + getattr(tensor, "_ishmem_alloc", False) + ) + + +def free_tensor(tensor) -> None: + """Free a symmetric tensor previously returned by :func:`tensor`.""" + _require_initialized() + _require_torch() + if not isinstance(tensor, _torch.Tensor): + raise TypeError("expected a torch.Tensor") + if bool(getattr(tensor, "_ishmem_alias", False)) and not bool( + getattr(tensor, "_ishmem_free_owner", False) + ): + raise IshmemStateError("tensor is a non-owning symmetric alias and cannot be freed") + base = tensor_base(tensor) + if not getattr(base, "_ishmem_alloc", False): + raise IshmemStateError("tensor is not backed by ishmem4py symmetric memory") + base_ptr = int(base.data_ptr()) + allocation = _live_tensor_allocations.pop(base_ptr, None) + if allocation is None: + if base_ptr == 0: + base._ishmem_alloc = False # type: ignore[attr-defined] + return + raise IshmemStateError("symmetric tensor allocation has already been freed") + _TORCH_EXT.free_ptr(base_ptr) + 
allocation.tensor._ishmem_alloc = False  # type: ignore[attr-defined] diff --git a/ishmem4py/python/ishmem4py/version.py b/ishmem4py/python/ishmem4py/version.py new file mode 100644 index 0000000..1ddab53 --- /dev/null +++ b/ishmem4py/python/ishmem4py/version.py @@ -0,0 +1,4 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +__version__ = "0.2.0a0" diff --git a/ishmem4py/python/pyproject.toml b/ishmem4py/python/pyproject.toml new file mode 100644 index 0000000..899e5ed --- /dev/null +++ b/ishmem4py/python/pyproject.toml @@ -0,0 +1,37 @@ +[build-system] +requires = ["setuptools>=68", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "ishmem4py" +version = "0.2.0a0" +description = "Python bindings for Intel SHMEM" +readme = "README.md" +requires-python = ">=3.10" +authors = [ + { name = "Intel SHMEM contributors" } +] +license = { text = "BSD-3-Clause" } +classifiers = [ + "Development Status :: 3 - Alpha", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", +] + +[tool.setuptools] +include-package-data = false + +[tool.setuptools.packages.find] +where = ["."] +include = ["ishmem4py"] + +[tool.setuptools.package-data] +ishmem4py = [ + "_ishmem4py_runtime*.so", + "_ishmem4py_runtime*.dylib", + "_ishmem4py_runtime*.pyd", + "_ishmem4py_torch*.so", + "_ishmem4py_torch*.dylib", + "_ishmem4py_torch*.pyd", +] diff --git a/ishmem4py/python/setup.py b/ishmem4py/python/setup.py new file mode 100644 index 0000000..57568da --- /dev/null +++ b/ishmem4py/python/setup.py @@ -0,0 +1,30 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +from setuptools import setup +from setuptools.dist import Distribution + +try: + from wheel.bdist_wheel import bdist_wheel as _bdist_wheel +except ImportError: + _bdist_wheel = None + + +class BinaryDistribution(Distribution): + def has_ext_modules(self): + return True + + 
+cmdclass = {} + +if _bdist_wheel is not None: + class bdist_wheel(_bdist_wheel): + def finalize_options(self): + super().finalize_options() + self.root_is_pure = False + + + cmdclass["bdist_wheel"] = bdist_wheel + + +setup(distclass=BinaryDistribution, cmdclass=cmdclass) diff --git a/ishmem4py/test/collective_test.py b/ishmem4py/test/collective_test.py new file mode 100644 index 0000000..e17c8d1 --- /dev/null +++ b/ishmem4py/test/collective_test.py @@ -0,0 +1,88 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +import sys + +import ishmem4py as ishmem + +from utils import expect_equal, pack_int32, unpack_int32, unpack_int32_list + + +def main() -> int: + ishmem.init() + try: + my_pe = ishmem.my_pe() + npes = ishmem.n_pes() + if npes < 2: + raise RuntimeError("collective_test.py requires at least 2 PEs") + + broadcast_src = ishmem.malloc(4) + broadcast_dst = ishmem.calloc(1, 4) + collect_src = ishmem.malloc(4) + collect_dst = ishmem.calloc(npes, 4) + fcollect_dst = ishmem.calloc(npes, 4) + alltoall_src = ishmem.malloc(npes * 4) + alltoall_dst = ishmem.calloc(npes, 4) + reduce_src = ishmem.malloc(4) + reduce_sum_dst = ishmem.calloc(1, 4) + reduce_max_dst = ishmem.calloc(1, 4) + reducescatter_src = ishmem.malloc(npes * 4) + reducescatter_dst = ishmem.calloc(1, 4) + try: + broadcast_src.write(pack_int32(1000 + my_pe)) + collect_src.write(pack_int32(my_pe)) + alltoall_src.write(pack_int32(*[(my_pe * 100) + peer for peer in range(npes)])) + reduce_src.write(pack_int32(my_pe + 1)) + reducescatter_src.write(pack_int32(*[(my_pe + 1) * (peer + 1) for peer in range(npes)])) + + ishmem.barrier_all() + ishmem.sync_all() + + ishmem.broadcast(broadcast_dst, broadcast_src, root=0) + ishmem.collect(collect_dst, collect_src) + ishmem.fcollect(fcollect_dst, collect_src) + ishmem.alltoall(alltoall_dst, alltoall_src) + ishmem.reduce("sum", reduce_sum_dst, reduce_src, dtype="int32") + ishmem.reduce("max", 
reduce_max_dst, reduce_src, dtype="int32") + ishmem.reducescatter("sum", reducescatter_dst, reducescatter_src, dtype="int32", count=1) + + ishmem.barrier_all() + + expect_equal("broadcast result", unpack_int32(broadcast_dst.read(4)), 1000) + expect_equal("collect result", unpack_int32_list(collect_dst.read(npes * 4)), list(range(npes))) + expect_equal("fcollect result", unpack_int32_list(fcollect_dst.read(npes * 4)), list(range(npes))) + expect_equal( + "alltoall result", + unpack_int32_list(alltoall_dst.read(npes * 4)), + [(peer * 100) + my_pe for peer in range(npes)], + ) + expect_equal("sum reduce result", unpack_int32(reduce_sum_dst.read(4)), npes * (npes + 1) // 2) + expect_equal("max reduce result", unpack_int32(reduce_max_dst.read(4)), npes) + expect_equal( + "reducescatter result", + unpack_int32(reducescatter_dst.read(4)), + (my_pe + 1) * (npes * (npes + 1) // 2), + ) + finally: + ishmem.free(reducescatter_dst) + ishmem.free(reducescatter_src) + ishmem.free(reduce_max_dst) + ishmem.free(reduce_sum_dst) + ishmem.free(reduce_src) + ishmem.free(alltoall_dst) + ishmem.free(alltoall_src) + ishmem.free(fcollect_dst) + ishmem.free(collect_dst) + ishmem.free(collect_src) + ishmem.free(broadcast_dst) + ishmem.free(broadcast_src) + finally: + ishmem.finalize() + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ishmem4py/test/init_device_id_test.py b/ishmem4py/test/init_device_id_test.py new file mode 100644 index 0000000..db89679 --- /dev/null +++ b/ishmem4py/test/init_device_id_test.py @@ -0,0 +1,122 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +import os +import subprocess +import sys + +import ishmem4py as ishmem +from ishmem4py import init_fini as _init_fini + +from utils import expect_equal, pack_int32, unpack_int32 + +try: + import torch +except Exception: + torch = None + + +def _visible_level_zero_gpu_count() -> int: + try: + output = 
subprocess.check_output(["sycl-ls"], text=True) + except Exception: + return 0 + return sum(1 for line in output.splitlines() if "[level_zero:" in line) + + +def _local_rank() -> int: + for name in ("MPI_LOCALRANKID", "OMPI_COMM_WORLD_LOCAL_RANK", "PMI_LOCAL_RANK", "SLURM_LOCALID"): + value = os.environ.get(name) + if value is not None: + return int(value) + return 0 + + +def main() -> int: + try: + ishmem.init(device_id="0") # type: ignore[arg-type] + except ValueError: + pass + else: + raise AssertionError("init(device_id='0') should raise ValueError") + + try: + ishmem.init(device_id=-2) + except ValueError: + pass + else: + raise AssertionError("init(device_id=-2) should raise ValueError") + + try: + ishmem.init(device_id="cuda:0") + except ValueError: + pass + else: + raise AssertionError("init(device_id='cuda:0') should raise ValueError") + + visible_gpus = _visible_level_zero_gpu_count() + local_rank = _local_rank() + if visible_gpus < 2: + print("Skipping init_device_id_test.py because fewer than 2 visible Level Zero GPUs were found") + return 77 + if local_rank >= visible_gpus: + print( + f"Skipping init_device_id_test.py because local rank {local_rank} exceeds " + f"visible Level Zero GPU count {visible_gpus}" + ) + return 77 + + if torch is not None: + with torch.xpu.device(local_rank): + expect_equal("xpu string current device normalization", _init_fini._normalize_device_id("xpu"), local_rank) + expect_equal( + "torch.device current device normalization", + _init_fini._normalize_device_id(torch.device("xpu")), + local_rank, + ) + expect_equal( + "torch.device explicit normalization", + _init_fini._normalize_device_id(torch.device("xpu", local_rank)), + local_rank, + ) + expect_equal( + "xpu explicit string normalization", + _init_fini._normalize_device_id(f"xpu:{local_rank}"), + local_rank, + ) + + init_device = torch.device("xpu", local_rank) if torch is not None else f"xpu:{local_rank}" + ishmem.init(device_id=init_device) + try: + my_pe = 
ishmem.my_pe() + npes = ishmem.n_pes() + if npes < 2: + raise RuntimeError("init_device_id_test.py requires at least 2 PEs") + + next_pe = (my_pe + 1) % npes + prev_pe = (my_pe + npes - 1) % npes + + src = ishmem.malloc(4) + dst = ishmem.calloc(1, 4) + try: + src.write(pack_int32(my_pe)) + + ishmem.barrier_all() + ishmem.put(dst, src.read(4), pe=next_pe) + ishmem.barrier_all() + + received = unpack_int32(dst.read(4)) + expect_equal("put ring result with explicit device_id", received, prev_pe) + finally: + ishmem.free(dst) + ishmem.free(src) + finally: + ishmem.finalize() + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ishmem4py/test/ring_test.py b/ishmem4py/test/ring_test.py new file mode 100644 index 0000000..76b1ac8 --- /dev/null +++ b/ishmem4py/test/ring_test.py @@ -0,0 +1,57 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +import sys + +import ishmem4py as ishmem + +from utils import expect_equal, pack_int32, unpack_int32 + + +def main() -> int: + ishmem.init() + try: + my_pe = ishmem.my_pe() + npes = ishmem.n_pes() + if npes < 2: + raise RuntimeError("ring_test.py requires at least 2 PEs") + + next_pe = (my_pe + 1) % npes + prev_pe = (my_pe + npes - 1) % npes + + src = ishmem.malloc(4) + dst = ishmem.calloc(1, 4) + try: + src.write(pack_int32(my_pe)) + + ishmem.barrier_all() + + ishmem.put(dst, src.read(4), pe=next_pe) + ishmem.barrier_all() + + received = unpack_int32(dst.read(4)) + expect_equal("put ring result", received, prev_pe) + + host_value = bytearray(4) + ishmem.get(host_value, src, pe=next_pe) + expect_equal("get ring result", unpack_int32(host_value), next_pe) + + remote_ptr = ishmem.ishmem_ptr(src, pe=next_pe) + if remote_ptr is not None: + expect_equal("ptr metadata size", remote_ptr.size, src.size) + expect_equal("ptr metadata pe", remote_ptr.pe, next_pe) + + ishmem.barrier_all() + finally: + ishmem.free(dst) + ishmem.free(src) + finally: + 
ishmem.finalize() + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ishmem4py/test/smoke_test.py b/ishmem4py/test/smoke_test.py new file mode 100644 index 0000000..49abead --- /dev/null +++ b/ishmem4py/test/smoke_test.py @@ -0,0 +1,65 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +import sys + +import ishmem4py as ishmem + +from utils import expect_equal, expect_true, pack_int32, unpack_int32, unpack_int32_list + + +def main() -> int: + version = ishmem.get_version() + expect_true("version vendor string is populated", bool(version.vendor_name)) + expect_true("version strings are populated", bool(version.libishmem_version)) + expect_equal("pre-init status", ishmem.init_status(), ishmem.InitStatus.UNINITIALIZED) + + ishmem.init() + try: + expect_equal("post-init status", ishmem.init_status(), ishmem.InitStatus.INITIALIZED) + expect_equal("n_pes", ishmem.n_pes(), 1) + expect_equal("my_pe", ishmem.my_pe(), 0) + + major, minor = ishmem.info_get_version() + expect_true("OpenSHMEM spec major version", major >= 1) + expect_true("OpenSHMEM spec minor version", minor >= 0) + expect_true("library name", ishmem.info_get_name().startswith("Intel")) + + buf = ishmem.malloc(16) + zeroed = ishmem.calloc(4, 4) + alias = ishmem.buffer(8, zero=True) + try: + buf.write(pack_int32(1, 2, 3, 4)) + expect_equal("local put/get", unpack_int32_list(buf.read(16)), [1, 2, 3, 4]) + expect_equal("calloc zeroed", zeroed.read(16), b"\x00" * 16) + expect_equal("buffer(zero=True)", alias.read(8), b"\x00" * 8) + + host_value = bytearray(4) + ishmem.get(host_value, buf, pe=0, size=4, src_offset=8) + expect_equal("get into host buffer", unpack_int32(host_value), 3) + + ishmem.put(buf, pack_int32(9), pe=0, dest_offset=4) + expect_equal("put alias", unpack_int32(buf.read(4, offset=4)), 9) + + local_ptr = ishmem.ptr(buf, pe=0) + expect_true("ptr result type", local_ptr is None or local_ptr.size == 
buf.size) + if local_ptr is not None: + expect_equal("ptr pe", local_ptr.pe, 0) + expect_true("ptr address", int(local_ptr) != 0) + + ishmem.barrier_all() + finally: + ishmem.free(alias) + ishmem.free(zeroed) + ishmem.free(buf) + finally: + ishmem.finalize() + + expect_equal("post-finalize status", ishmem.init_status(), ishmem.InitStatus.FINALIZED) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ishmem4py/test/torch_peer_tensor_test.py b/ishmem4py/test/torch_peer_tensor_test.py new file mode 100644 index 0000000..83131c1 --- /dev/null +++ b/ishmem4py/test/torch_peer_tensor_test.py @@ -0,0 +1,61 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +import sys + +import torch + +import ishmem4py as ishmem + +from utils import expect_equal, expect_true + + +def main() -> int: + ishmem.init() + try: + my_pe = ishmem.my_pe() + npes = ishmem.n_pes() + if npes < 2: + raise RuntimeError("torch_peer_tensor_test.py requires at least 2 PEs") + + next_pe = (my_pe + 1) % npes + prev_pe = (my_pe + npes - 1) % npes + + target = ishmem.tensor((4,), dtype=torch.float32, device="xpu") + try: + target.fill_(10.0 * my_pe) + torch.xpu.synchronize() + ishmem.barrier_all() + + peer = ishmem.get_peer_tensor(target, pe=next_pe) + expect_true("peer tensor is symmetric", ishmem.is_symmetric_tensor(peer)) + expect_true("peer tensor base returns original allocation", ishmem.tensor_base(peer) is target) + + peer.add_(torch.full_like(peer, float(my_pe + 1))) + torch.xpu.synchronize() + + try: + ishmem.free_tensor(peer) + except ishmem.IshmemStateError: + pass + else: + raise AssertionError("free_tensor(peer) should fail for non-owning aliases") + + ishmem.barrier_all() + expect_equal( + "remote peer alias update", + target.cpu().tolist(), + [10.0 * my_pe + float(prev_pe + 1)] * 4, + ) + finally: + ishmem.free_tensor(target) + finally: + ishmem.finalize() + + return 0 + + +if __name__ == "__main__": 
+ sys.exit(main()) diff --git a/ishmem4py/test/torch_queue_get_test.py b/ishmem4py/test/torch_queue_get_test.py new file mode 100644 index 0000000..a444cca --- /dev/null +++ b/ishmem4py/test/torch_queue_get_test.py @@ -0,0 +1,50 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +import sys + +import torch + +import ishmem4py as ishmem + +from utils import expect_equal + + +def main() -> int: + ishmem.init() + try: + my_pe = ishmem.my_pe() + npes = ishmem.n_pes() + if npes < 2: + raise RuntimeError("torch_queue_get_test.py requires at least 2 PEs") + + next_pe = (my_pe + 1) % npes + + src = ishmem.tensor((4,), dtype=torch.float32, device="xpu") + recv = ishmem.tensor((4,), dtype=torch.float32, device="xpu") + try: + src.fill_(float(my_pe + 10)) + recv.zero_() + torch.xpu.synchronize() + + ishmem.barrier_all() + + stream = torch.xpu.Stream() + ishmem.get(recv, src, pe=next_pe, queue=stream) + ishmem.quiet(queue=stream) + torch.xpu.synchronize() + + expect_equal("queued symmetric tensor get ring", recv.cpu().tolist(), [float(next_pe + 10)] * 4) + finally: + ishmem.free_tensor(recv) + ishmem.free_tensor(src) + finally: + ishmem.finalize() + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ishmem4py/test/torch_queue_put_test.py b/ishmem4py/test/torch_queue_put_test.py new file mode 100644 index 0000000..e960d83 --- /dev/null +++ b/ishmem4py/test/torch_queue_put_test.py @@ -0,0 +1,52 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +import sys + +import torch + +import ishmem4py as ishmem + +from utils import expect_equal + + +def main() -> int: + ishmem.init() + try: + my_pe = ishmem.my_pe() + npes = ishmem.n_pes() + if npes < 2: + raise RuntimeError("torch_queue_put_test.py requires at least 2 PEs") + + next_pe = (my_pe + 1) % npes + prev_pe = (my_pe + npes - 1) % npes + + src = 
ishmem.tensor((4,), dtype=torch.float32, device="xpu") + dst = ishmem.tensor((4,), dtype=torch.float32, device="xpu") + try: + src.fill_(float(my_pe + 10)) + dst.zero_() + torch.xpu.synchronize() + + ishmem.barrier_all() + + stream = torch.xpu.Stream() + ishmem.put(dst, src, pe=next_pe, queue=stream) + ishmem.quiet(queue=stream) + + ishmem.barrier_all() + torch.xpu.synchronize() + expect_equal("queued symmetric tensor put ring", dst.cpu().tolist(), [float(prev_pe + 10)] * 4) + finally: + ishmem.free_tensor(dst) + ishmem.free_tensor(src) + finally: + ishmem.finalize() + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ishmem4py/test/torch_ring_tensor_test.py b/ishmem4py/test/torch_ring_tensor_test.py new file mode 100644 index 0000000..d5523f5 --- /dev/null +++ b/ishmem4py/test/torch_ring_tensor_test.py @@ -0,0 +1,57 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +import sys + +import torch + +import ishmem4py as ishmem + +from utils import expect_equal + + +def main() -> int: + ishmem.init() + try: + my_pe = ishmem.my_pe() + npes = ishmem.n_pes() + if npes < 2: + raise RuntimeError("torch_ring_tensor_test.py requires at least 2 PEs") + + next_pe = (my_pe + 1) % npes + prev_pe = (my_pe + npes - 1) % npes + + src = ishmem.tensor((4,), dtype=torch.float32, device="xpu") + dst = ishmem.tensor((4,), dtype=torch.float32, device="xpu") + recv = ishmem.tensor((4,), dtype=torch.float32, device="xpu") + try: + src.fill_(float(my_pe + 10)) + dst.zero_() + recv.zero_() + torch.xpu.synchronize() + + ishmem.barrier_all() + + ishmem.put(dst, src, pe=next_pe) + + ishmem.barrier_all() + torch.xpu.synchronize() + expect_equal("symmetric tensor put ring", dst.cpu().tolist(), [float(prev_pe + 10)] * 4) + + ishmem.get(recv, src, pe=next_pe) + torch.xpu.synchronize() + expect_equal("symmetric tensor get ring", recv.cpu().tolist(), [float(next_pe + 10)] * 4) + finally: + 
ishmem.free_tensor(recv) + ishmem.free_tensor(dst) + ishmem.free_tensor(src) + finally: + ishmem.finalize() + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ishmem4py/test/torch_tensor_test.py b/ishmem4py/test/torch_tensor_test.py new file mode 100644 index 0000000..f87f2d8 --- /dev/null +++ b/ishmem4py/test/torch_tensor_test.py @@ -0,0 +1,66 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +import sys + +import torch + +import ishmem4py as ishmem + +from utils import expect_equal, expect_true + + +def main() -> int: + ishmem.init() + try: + tensor = ishmem.tensor((4,), dtype=torch.float32, device="xpu") + mirror = ishmem.tensor((4,), dtype=torch.float32, device="xpu") + try: + tensor.fill_(3.0) + torch.xpu.synchronize() + + expect_true("tensor allocation is symmetric", ishmem.is_symmetric_tensor(tensor)) + expect_true("tensor base returns self for base allocation", ishmem.tensor_base(tensor) is tensor) + expect_equal("tensor contents", tensor.cpu().tolist(), [3.0, 3.0, 3.0, 3.0]) + + peer = ishmem.get_peer_tensor(tensor, pe=ishmem.my_pe()) + expect_true("peer tensor is symmetric", ishmem.is_symmetric_tensor(peer)) + expect_true("peer tensor base returns original allocation", ishmem.tensor_base(peer) is tensor) + peer.add_(2.0) + torch.xpu.synchronize() + expect_equal("peer tensor self alias update", tensor.cpu().tolist(), [5.0, 5.0, 5.0, 5.0]) + + try: + ishmem.free_tensor(peer) + except ishmem.IshmemStateError: + pass + else: + raise AssertionError("free_tensor(peer) should fail for non-owning aliases") + + stream = torch.xpu.Stream() + ishmem.put(mirror, tensor, pe=ishmem.my_pe(), queue=stream) + ishmem.quiet(queue=stream) + expect_equal("queued self put", mirror.cpu().tolist(), [5.0, 5.0, 5.0, 5.0]) + + recv = ishmem.tensor((4,), dtype=torch.float32, device="xpu") + try: + recv.zero_() + stream = torch.xpu.Stream() + ishmem.get(recv, tensor, pe=ishmem.my_pe(), 
queue=stream) + ishmem.quiet(queue=stream) + expect_equal("queued self get", recv.cpu().tolist(), [5.0, 5.0, 5.0, 5.0]) + finally: + ishmem.free_tensor(recv) + finally: + ishmem.free_tensor(mirror) + ishmem.free_tensor(tensor) + finally: + ishmem.finalize() + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/ishmem4py/test/utils.py b/ishmem4py/test/utils.py new file mode 100644 index 0000000..2e97a9b --- /dev/null +++ b/ishmem4py/test/utils.py @@ -0,0 +1,46 @@ +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +import struct + + +def expect_equal(label: str, actual, expected) -> None: + if actual != expected: + raise AssertionError(f"{label}: expected {expected!r}, got {actual!r}") + + +def expect_true(label: str, condition: bool) -> None: + if not condition: + raise AssertionError(label) + + +def pack_int32(*values: int) -> bytes: + return struct.pack(f"={len(values)}i", *values) + + +def pack_uint32(*values: int) -> bytes: + return struct.pack(f"={len(values)}I", *values) + + +def unpack_int32(data: bytes) -> int: + return struct.unpack("=i", data)[0] + + +def unpack_uint32(data: bytes) -> int: + return struct.unpack("=I", data)[0] + + +def unpack_int32_list(data: bytes) -> list[int]: + if len(data) % 4 != 0: + raise ValueError("int32 payload size must be a multiple of 4 bytes") + count = len(data) // 4 + return list(struct.unpack(f"={count}i", data)) + + +def unpack_uint32_list(data: bytes) -> list[int]: + if len(data) % 4 != 0: + raise ValueError("uint32 payload size must be a multiple of 4 bytes") + count = len(data) // 4 + return list(struct.unpack(f"={count}I", data)) diff --git a/scripts/ctest/all_visible_wrapper b/scripts/ctest/all_visible_wrapper new file mode 100755 index 0000000..90e59a4 --- /dev/null +++ b/scripts/ctest/all_visible_wrapper @@ -0,0 +1,8 @@ +#!/bin/bash + +export ZE_FLAT_DEVICE_HIERARCHY=FLAT +export SYCL_DEVICE_FILTER=:gpu +unset 
ZE_AFFINITY_MASK +unset ONEAPI_DEVICE_SELECTOR + +exec "$@" diff --git a/scripts/ishmrun b/scripts/ishmrun index 1986807..92877b9 100755 --- a/scripts/ishmrun +++ b/scripts/ishmrun @@ -179,8 +179,8 @@ gpu_binding() # Determine how to set ZE_AFFINITY_MASK if [ $which_sycl_ls -eq 0 ]; then - local root_count=$(ONEAPI_DEVICE_SELECTOR=level_zero:* sycl-ls 2>/dev/null | wc -l) - local sub_count=$(ONEAPI_DEVICE_SELECTOR=level_zero:*.* sycl-ls 2>/dev/null | wc -l) + local root_count=$(ONEAPI_DEVICE_SELECTOR=level_zero:* sycl-ls 2>/dev/null | grep -c "\[level_zero") + local sub_count=$(ONEAPI_DEVICE_SELECTOR=level_zero:*.* sycl-ls 2>/dev/null | grep -c "\[level_zero") elif [ $which_clinfo -eq 0 ]; then local platforms=$(clinfo -l | grep -i Platform | wc -l) @@ -313,6 +313,8 @@ gpu_env() # Intel(R) Arc(TM) B-Series GPU Family # Necessary for GPU IPC export RenderCompressedBuffersEnabled=0 + # Atomic ops from the GPU require Xe-Links - instead route atomic ops through the host + export ISHMEM_ENABLE_DEVICE_ATOMICS=0 elif [ $(echo $output | grep -Po "Intel.*Data.*Center.*GPU" | wc -l) -gt 0 ]; then # Intel(R) Data Center GPU Max Series # No support for Implicit Scaling diff --git a/src/accelerator.cpp b/src/accelerator.cpp index 26f0274..158a95f 100644 --- a/src/accelerator.cpp +++ b/src/accelerator.cpp @@ -6,6 +6,7 @@ #include "accelerator.h" #include #include +#include /* TODO: Workaround to resolve compiler limitation. 
Need to be fixed later */ #if __INTEL_CLANG_COMPILER <= 20210400 @@ -15,12 +16,21 @@ #endif namespace { + struct ishmemi_visible_gpu_t { + ze_driver_handle_t driver = nullptr; + ze_device_handle_t device = nullptr; + ze_device_properties_t properties = {}; + uint32_t driver_idx = 0; + }; + /* L0 driver */ ze_driver_handle_t *all_drivers = nullptr; ze_device_handle_t **all_devices = nullptr; uint32_t driver_count = 0; uint32_t driver_idx = 0; bool driver_found = false; + std::vector visible_gpus; + int selected_device_id = -1; /* L0 device */ ze_device_properties_t device_properties = {}; @@ -95,10 +105,30 @@ static inline uint32_t get_next_link_index() return index; } +sycl::device ishmemi_get_selected_sycl_device() +{ + return sycl::make_device(ishmemi_gpu_device); +} + +void ishmemi_validate_queue_device(const sycl::queue &q) +{ + try { + auto queue_device = + sycl::get_native(q.get_device()); + if (queue_device != ishmemi_gpu_device) { + RAISE_ERROR_MSG( + "Queue device does not match the selected ISHMEM device. 
Set " + "ishmemx_attr_t.device_id to the queue device ordinal.\n"); + } + } catch (const sycl::exception &) { + RAISE_ERROR_MSG("Queue device is not a Level Zero GPU device\n"); + } +} + int ishmemi_accelerator_preinit() { int ret = 0; - uint32_t i; + uint32_t i, j; uint32_t device_count = 0; ze_init_flag_t flags = ZE_INIT_FLAG_GPU_ONLY; @@ -134,44 +164,44 @@ int ishmemi_accelerator_preinit() ZE_CHECK(zeDriverGet(&driver_count, all_drivers)); ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - /* Parse the drivers for a suitable driver */ + visible_gpus.clear(); + + /* Parse the drivers for visible GPU devices */ for (i = 0; i < driver_count; i++) { device_count = 0; ZE_CHECK(zeDeviceGet(all_drivers[i], &device_count, nullptr)); ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); if (device_count == 0) continue; - /* Ensure a single device is detected */ - ISHMEM_CHECK_GOTO_MSG(device_count != 1, fn_fail, "Detected more than one device\n"); all_devices[i] = (ze_device_handle_t *) ::malloc(device_count * sizeof(ze_device_handle_t)); - ISHMEM_CHECK_GOTO_MSG(all_devices == nullptr, fn_fail, + ISHMEM_CHECK_GOTO_MSG(all_devices[i] == nullptr, fn_fail, "Allocation of all_drivers[%d] failed\n", i); ZE_CHECK(zeDeviceGet(all_drivers[i], &device_count, all_devices[i])); ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - ZE_CHECK(zeDeviceGetProperties(all_devices[i][0], &device_properties)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - - if (ZE_DEVICE_TYPE_GPU == device_properties.type && !driver_found) { - ishmemi_gpu_driver = all_drivers[i]; - driver_idx = i; - driver_found = true; + for (j = 0; j < device_count; ++j) { + ze_device_properties_t props = {}; + ZE_CHECK(zeDeviceGetProperties(all_devices[i][j], &props)); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); + + if (ZE_DEVICE_TYPE_GPU == props.type) { + visible_gpus.push_back({ + .driver = all_drivers[i], + .device = all_devices[i][j], + .properties = props, + .driver_idx = i, + }); + } } } - if (!driver_found) { + if (visible_gpus.empty()) { 
ISHMEM_ERROR_MSG("No ZE driver found for GPU\n"); ret = ISHMEMI_NO_DEVICES; goto fn_fail; } - /* Create the ZE context */ - ishmemi_ze_context_desc.stype = ZE_STRUCTURE_TYPE_CONTEXT_DESC; - - ZE_CHECK(zeContextCreate(ishmemi_gpu_driver, &ishmemi_ze_context_desc, &ishmemi_ze_context)); - ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); - fn_exit: ishmemi_accelerator_preinitialized = true; return ret; @@ -181,7 +211,7 @@ int ishmemi_accelerator_preinit() goto fn_exit; } -int ishmemi_accelerator_init() +int ishmemi_accelerator_init(const ishmemx_attr_t *attr) { int ret = 0; uint32_t i, j; @@ -192,9 +222,37 @@ int ishmemi_accelerator_init() ret = ishmemi_accelerator_preinit(); ISHMEMI_CHECK_RESULT(ret, 0, fn_exit); - if (driver_found) { - /* Set the default GPU */ - ishmemi_gpu_device = all_devices[driver_idx][0]; + if (!ishmemi_accelerator_initialized) { + ISHMEM_CHECK_GOTO_MSG(attr == nullptr, fn_fail, + "Accelerator initialization requires non-null attributes\n"); + ISHMEM_CHECK_GOTO_MSG(attr->device_id < -1, fn_fail, + "Invalid device_id %d provided in ishmemx_attr_t\n", + attr->device_id); + + if (attr->device_id == -1) { + ISHMEM_CHECK_GOTO_MSG( + visible_gpus.size() != 1, fn_fail, + "Detected %zu visible GPU devices. 
Set ishmemx_attr_t.device_id to select one.\n", + visible_gpus.size()); + selected_device_id = 0; + } else { + ISHMEM_CHECK_GOTO_MSG( + static_cast(attr->device_id) >= visible_gpus.size(), fn_fail, + "Requested device_id %d is out of range for %zu visible GPU devices\n", + attr->device_id, visible_gpus.size()); + selected_device_id = attr->device_id; + } + + const auto &selected_device = visible_gpus[static_cast(selected_device_id)]; + ishmemi_gpu_driver = selected_device.driver; + ishmemi_gpu_device = selected_device.device; + device_properties = selected_device.properties; + driver_idx = selected_device.driver_idx; + driver_found = true; + + ishmemi_ze_context_desc.stype = ZE_STRUCTURE_TYPE_CONTEXT_DESC; + ZE_CHECK(zeContextCreate(ishmemi_gpu_driver, &ishmemi_ze_context_desc, &ishmemi_ze_context)); + ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); /* Discover command queue groups */ ZE_CHECK( @@ -289,10 +347,11 @@ int ishmemi_accelerator_init() ISHMEMI_CHECK_RESULT(ret, 0, fn_fail); fn_exit: - ishmemi_accelerator_initialized = true; + ishmemi_accelerator_initialized = (ret == 0); return ret; fn_fail: ishmemi_accelerator_fini(); + if (!ret) ret = 1; goto fn_exit; } @@ -326,12 +385,17 @@ int ishmemi_accelerator_fini(void) ISHMEMI_FREE(::free, all_devices[i]); ISHMEMI_FREE(::free, all_devices); ISHMEMI_FREE(::free, all_drivers); + visible_gpus.clear(); ishmemi_accelerator_preinitialized = false; ishmemi_accelerator_initialized = false; driver_found = false; driver_idx = 0; driver_count = 0; + selected_device_id = -1; + ishmemi_gpu_driver = nullptr; + ishmemi_gpu_device = nullptr; + device_properties = {}; if (ishmemi_ze_context) { ZE_CHECK(zeContextDestroy(ishmemi_ze_context)); diff --git a/src/accelerator.h b/src/accelerator.h index 9512e9e..fa138b9 100644 --- a/src/accelerator.h +++ b/src/accelerator.h @@ -59,12 +59,16 @@ static inline void ishmemi_print_device_properties(const ze_device_properties_t /* Initialize accelerator */ int ishmemi_accelerator_preinit(void); -int 
ishmemi_accelerator_init(void); +int ishmemi_accelerator_init(const ishmemx_attr_t *attr); /* Finalize accelerator */ int ishmemi_accelerator_fini(void); int ishmemi_accelerator_postfini(void); +/* Selected-device helpers */ +sycl::device ishmemi_get_selected_sycl_device(); +void ishmemi_validate_queue_device(const sycl::queue &q); + /* Query allocation memory type */ int ishmemi_get_memory_type(const void *ptr, ze_memory_type_t *type); diff --git a/src/ishmem.cpp b/src/ishmem.cpp index 44b6571..42d8d76 100644 --- a/src/ishmem.cpp +++ b/src/ishmem.cpp @@ -280,7 +280,7 @@ static void ishmemi_init(ishmemx_attr_t *attr, bool user_attr) ishmemi_cpu_info->n_pes = ishmemi_n_pes; if (attr->gpu) { - ret = ishmemi_accelerator_init(); + ret = ishmemi_accelerator_init(attr); if (ret == ISHMEMI_NO_DEVICE_ACCESS) { attr->gpu = false; /* TODO need to enable SHARED HEAP config */ diff --git a/src/ishmemx.h b/src/ishmemx.h index de87a1c..e988287 100644 --- a/src/ishmemx.h +++ b/src/ishmemx.h @@ -25,6 +25,8 @@ typedef struct ishmemx_attr_t { bool initialize_runtime = true; /* By default, gpu is used */ bool gpu = true; + /* By default, select the only visible GPU device */ + int device_id = -1; /* By default, the base team/comm is uninitialized, representing the default global team/comm */ union { /* TODO: add support for user-provided shmem_team as global team */ diff --git a/src/memory.cpp b/src/memory.cpp index 4f35408..b650be4 100644 --- a/src/memory.cpp +++ b/src/memory.cpp @@ -98,7 +98,7 @@ int ishmemi_memory_init() /* SYCL queue to initialize global_info */ try { - sycl::queue q; + sycl::queue q(ishmemi_get_selected_sycl_device()); q.copy(&ishmemi_gpu_info, global_info).wait_and_throw(); } catch (...) 
{ ret = -1; diff --git a/src/on_queue.h b/src/on_queue.h index 2fe3e6a..ba02181 100644 --- a/src/on_queue.h +++ b/src/on_queue.h @@ -5,6 +5,7 @@ #ifndef ISHMEM_ON_QUEUE_H #define ISHMEM_ON_QUEUE_H +#include "accelerator.h" #include struct ishmemi_on_queue_map_entry_t { @@ -38,6 +39,8 @@ class ishmemi_on_queue_map : public std::map +#include +#include + +namespace { +std::vector get_visible_level_zero_gpus() +{ + std::vector devices; + + for (const auto &platform : sycl::platform::get_platforms()) { + if (platform.get_backend() != sycl::backend::ext_oneapi_level_zero) continue; + + for (const auto &device : platform.get_devices()) { + if (device.is_gpu()) { + devices.push_back(device); + } + } + } + + return devices; +} + +int get_local_rank() +{ + constexpr const char *env_names[] = { + "MPI_LOCALRANKID", + "OMPI_COMM_WORLD_LOCAL_RANK", + "PMI_LOCAL_RANK", + "SLURM_LOCALID", + }; + + for (const char *name : env_names) { + const char *value = std::getenv(name); + if (value != nullptr) { + return std::atoi(value); + } + } + + return 0; +} +} // namespace + +int main() +{ + validate_runtime(); + + auto devices = get_visible_level_zero_gpus(); + if (devices.size() < 2) { + std::cout << "Skipping explicit device_id test because fewer than 2 visible Level Zero GPUs " + "were detected" + << std::endl; + return 77; + } + + int local_rank = get_local_rank(); + if ((local_rank < 0) || (static_cast(local_rank) >= devices.size())) { + std::cerr << "Invalid local rank " << local_rank << " for " << devices.size() + << " visible GPU devices" << std::endl; + return EXIT_FAILURE; + } + + ishmemx_attr_t attr; + attr.initialize_runtime = true; + attr.runtime = ishmemi_test_runtime->get_type(); + attr.device_id = local_rank; + ishmemx_init_attr(&attr); + + int my_pe = ishmem_my_pe(); + int npes = ishmem_n_pes(); + int peer = (my_pe + 1) % npes; + + sycl::queue q(devices[static_cast(local_rank)]); + + std::cout << "PE " << my_pe << " selected device_id " << local_rank << ": " + << 
q.get_device().get_info() << std::endl; + + int *source = (int *) ishmem_malloc(sizeof(int)); + CHECK_ALLOC(source); + int *target = (int *) ishmem_malloc(sizeof(int)); + CHECK_ALLOC(target); + int *host_value = sycl::malloc_host(1, q); + CHECK_ALLOC(host_value); + + q.fill(source, my_pe, 1).wait_and_throw(); + q.fill(target, -1, 1).wait_and_throw(); + ishmem_barrier_all(); + + auto e = ishmemx_int_get_on_queue(target, source, 1, peer, q); + e.wait_and_throw(); + ishmemx_quiet_on_queue(q).wait_and_throw(); + q.copy(target, host_value, 1).wait_and_throw(); + + int rc = EXIT_SUCCESS; + if (*host_value != peer) { + std::cerr << "PE " << my_pe << " expected " << peer << " but received " << *host_value + << std::endl; + rc = EXIT_FAILURE; + } else if (my_pe == 0) { + std::cout << "Test Passed" << std::endl; + } + + ishmem_barrier_all(); + sycl::free(host_value, q); + ishmem_free(target); + ishmem_free(source); + ishmem_finalize(); + + return rc; +} diff --git a/test/unit/init_attr_device_id_required.cpp b/test/unit/init_attr_device_id_required.cpp new file mode 100644 index 0000000..938f93e --- /dev/null +++ b/test/unit/init_attr_device_id_required.cpp @@ -0,0 +1,53 @@ +/* Copyright (C) 2025 Intel Corporation + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include "accelerator.h" +#include +#include +#include + +namespace { +std::vector get_visible_level_zero_gpus() +{ + std::vector devices; + + for (const auto &platform : sycl::platform::get_platforms()) { + if (platform.get_backend() != sycl::backend::ext_oneapi_level_zero) continue; + + for (const auto &device : platform.get_devices()) { + if (device.is_gpu()) { + devices.push_back(device); + } + } + } + + return devices; +} +} // namespace + +int main() +{ + auto devices = get_visible_level_zero_gpus(); + if (devices.size() < 2) { + std::cout << "Skipping device_id-required test because fewer than 2 visible Level Zero " + "GPUs were detected" + << std::endl; + return 77; + } + + ishmemx_attr_t attr; + int ret = 
ishmemi_accelerator_init(&attr); + + if (ret == 0) { + std::cerr << "Expected accelerator initialization to fail when multiple GPUs are visible " + "and device_id is not set" + << std::endl; + ishmemi_accelerator_fini(); + return EXIT_FAILURE; + } + + std::cout << "Detected required explicit device selection with " << devices.size() + << " visible GPUs" << std::endl; + return EXIT_SUCCESS; +}