68 changes: 64 additions & 4 deletions docs/GPU_ACCELERATION.md
@@ -33,8 +33,8 @@ backend selection is per-call (`backend="cpu" | "gpu" | "auto"`); the auto path
nvidia-smi | grep "CUDA Version"

# Install linumpy with GPU support (choose your CUDA version)
uv pip install 'linumpy[gpu]' # CUDA 12.x (default)
uv pip install 'linumpy[gpu-cuda13]' # CUDA 13.x
uv pip install 'linumpy[gpu]' # CUDA 13.x (default)
uv pip install 'linumpy[gpu-cuda12]' # CUDA 12.x

# Verify GPU
linum_gpu_info.py
@@ -57,8 +57,8 @@ linum_diagnose_pipeline.py --benchmark

| CUDA Version | CuPy Package | linumpy extra |
|--------------|--------------|---------------|
| CUDA 12.x | `cupy-cuda12x` | `linumpy[gpu]` |
| CUDA 13.x | `cupy-cuda13x` | `linumpy[gpu-cuda13]` |
| CUDA 13.x | `cupy-cuda13x` | `linumpy[gpu]` (default) |
| CUDA 12.x | `cupy-cuda12x` | `linumpy[gpu-cuda12]` |

---

@@ -232,4 +232,64 @@ from linumpy.gpu.fft_ops import fft2, ifft2, phase_correlation
from linumpy.gpu.interpolation import resize, affine_transform
from linumpy.gpu.morphology import binary_closing, gaussian_filter
from linumpy.gpu.array_ops import normalize_percentile, clip_percentile
from linumpy.gpu.zarr_io import read_zarr_to_gpu
```

---

## Fast zarr → GPU loading

For zarr arrays on local NVMe, `linumpy.gpu.zarr_io.read_zarr_to_gpu` is the recommended entry point. It dispatches to the fastest backend available at runtime:

```python
from linumpy.gpu.zarr_io import read_zarr_to_gpu

dev = read_zarr_to_gpu("/scratch_nvme/volume.zarr")
# dev is a cupy.ndarray
```

Backend implementations live in their own modules:

- `linumpy.gpu.kvikio_zarr` — kvikio / GPUDirect Storage reader (uncompressed zarr v2/v3 only).

Selection order (when `prefer="auto"`):

1. **kvikio (GPUDirect Storage, native mode)** — chunks DMA'd directly from NVMe into GPU memory. Fastest. Requires `kvikio` installed, GDS native mode enabled (`/etc/cufile.json`: `allow_compat_mode=false`), and an uncompressed zarr.
2. **`zarr.config.enable_gpu()`** — host I/O with on-host decode, single H→D copy. Works for any zarr (compressed or not) and is the automatic fallback when GDS is unavailable, in compat mode, or the array is compressed.

You can force a specific path with `prefer="kvikio"` or `prefer="zarr-gpu"`. The legacy `cupy.asarray(zarr.open_array(...)[:])` path is kept only as a reference baseline in `linum_benchmark_kvikio_zarr.py`.
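The `prefer="auto"` decision above can be sketched as a pure function. The flag names below (`kvikio_available`, `gds_native`, `compressed`) are illustrative stand-ins for the runtime checks, not linumpy internals:

```python
def choose_backend(prefer: str = "auto",
                   kvikio_available: bool = True,
                   gds_native: bool = True,
                   compressed: bool = False) -> str:
    """Illustrative sketch of the prefer='auto' selection; not linumpy's code."""
    if prefer in ("kvikio", "zarr-gpu"):
        return prefer  # explicit override, no detection
    # The kvikio/GDS path only wins when native mode is on and the
    # array is uncompressed; everything else falls back to zarr-gpu.
    if kvikio_available and gds_native and not compressed:
        return "kvikio"
    return "zarr-gpu"  # universal fallback: works for any zarr
```

Compressed arrays and compat-mode GDS both route to the `zarr-gpu` fallback, matching the selection order described above.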

### Reference benchmark

`scripts/linum_benchmark_kvikio_zarr.py` measures all three paths. On a 16 GiB float32 zarr v3 (256³ chunks) on local NVMe ext4 with an RTX A6000:

| Path | Cold | Warm |
|---|---|---|
| kvikio (GDS native) | 8.2 GiB/s | **9.9 GiB/s** |
| `zarr.config.enable_gpu()` | 6.3 GiB/s | 7.1 GiB/s |
| `zarr → numpy → cupy.asarray` | 1.1 GiB/s | 2.8 GiB/s |

In compat mode (GDS bounce-buffer), kvikio drops to ~4 GiB/s — slower than the `zarr-gpu` path. The auto selector detects this via `kvikio.defaults.compat_mode()` and prefers `zarr-gpu` in that case.

### Installing the GDS path

```bash
# CuPy + linumpy GPU support
uv pip install 'linumpy[gpu]' # CUDA 13.x (default)
uv pip install 'linumpy[gpu-cuda12]' # CUDA 12.x

# kvikio (optional, only needed for the GDS fast path)
uv pip install 'linumpy[gds]' # CUDA 13.x (default)
uv pip install 'linumpy[gds-cuda12]' # CUDA 12.x
```

### Enabling native GDS

Native GDS additionally requires:

- A GDS-supported filesystem on the source path (ext4 / xfs on local NVMe is fine; NFS, overlayfs, and encrypted filesystems are not).
- `/etc/cufile.json`: `properties.allow_compat_mode = false`.
- IOMMU disabled or in passthrough (`amd_iommu=off` on AMD, `intel_iommu=off` on Intel, or `iommu=pt` for passthrough).
- nvidia-fs DKMS module loaded with matching ABI; verify `dmesg | grep nvidia_fs` shows no "no extended symbol version" warnings after driver/kernel updates.

If any of these are missing, kvikio silently falls back to a POSIX bounce-buffer (compat mode) and `read_zarr_to_gpu` will route around it.
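As a quick sanity check on the second item, the relevant cufile.json key can be read with a few lines of Python. The helper below is a hypothetical convenience, not part of linumpy; note that the stock `/etc/cufile.json` ships with `//`-style comments, so it is not strict JSON:

```python
import json
import re


def cufile_allows_compat(config_text: str) -> bool:
    """True if cuFile is allowed to fall back to compat (bounce-buffer) mode.

    Naive comment stripping: fine for cufile.json, which has no // inside
    string values.
    """
    text = re.sub(r"//.*", "", config_text)
    cfg = json.loads(text)
    # cuFile allows compat mode by default when the key is absent.
    return bool(cfg.get("properties", {}).get("allow_compat_mode", True))


# e.g. cufile_allows_compat(open("/etc/cufile.json").read())
```

A return value of `True` means kvikio may silently take the slow POSIX path even when GDS native mode would otherwise work.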
17 changes: 13 additions & 4 deletions docs/LIBRARY_MODULES.md
@@ -40,15 +40,21 @@ OME-Zarr is the canonical on-disk format. Two writers are provided:
Common entry points:

```python
from linumpy.io.zarr import read_omezarr, save_omezarr, OmeZarrWriter, AnalysisOmeZarrWriter
from linumpy.io.zarr import read_omezarr, read_omezarr_array, save_omezarr, OmeZarrWriter, AnalysisOmeZarrWriter
from linumpy.io.test_data import get_data
from linumpy.io import slice_config # slice_config.json reader/writer
from linumpy.io.thorlabs import ThorImageOCT
```

`read_omezarr(path, level=0)` returns a `(zarr.Array, voxel_size)` tuple. The
voxel size is ordered to match the array axes (Z, Y, X for 3D volumes;
Y, X for 2D mosaics).
`read_omezarr(path, level=0)` returns a `(zarr.Array, voxel_size)` tuple (lazy).
`read_omezarr_array(path, level=0, use_gpu=False)` is the high-level entry
point: it materialises the requested level into memory and returns
`(array, voxel_size)`. With ``use_gpu=False`` (default) the array is a
``numpy.ndarray``; with ``use_gpu=True`` it dispatches through
[`linumpy.gpu.zarr_io.read_zarr_to_gpu`](GPU_ACCELERATION.md) and returns a
``cupy.ndarray`` (kvikio/GPUDirect Storage when available, otherwise the
``zarr.config.enable_gpu`` fallback). Voxel size is ordered to match the array
axes (Z, Y, X for 3D volumes; Y, X for 2D mosaics).

See [Mosaic Grid Format](MOSAIC_GRID_FORMAT.md) and
[Slice Config Feature](SLICE_CONFIG_FEATURE.md) for format details.
@@ -136,6 +142,9 @@ CuPy-backed versions of hot paths. Each public entry point either takes a
* `gpu.registration`, `gpu.corrections` — GPU registration and correction
passes used by `linum_estimate_transform.py`,
`linum_normalize_intensities_per_slice.py`, etc.
* `gpu.zarr_io` — high-level `read_zarr_to_gpu` dispatcher: picks the
fastest backend available (kvikio/GDS native → `zarr.config.enable_gpu`).
* `gpu.kvikio_zarr` — kvikio / GPUDirect Storage backend for the dispatcher.

See [GPU Acceleration](GPU_ACCELERATION.md) and [N4 GPU](N4_GPU.md).

19 changes: 19 additions & 0 deletions linumpy/geometry/interface.py
@@ -116,6 +116,7 @@ def find_tissue_interface(
mask: np.ndarray | None = None,
order: int = 1,
detect_cutting_errors: bool = False,
use_gpu: bool = False,
) -> np.ndarray:
"""Detect the tissue interface.

@@ -135,13 +136,31 @@
Gaussian filter order.
detect_cutting_errors : bool
If True, detect and correct cutting artefacts.
use_gpu : bool
If True, use the GPU implementation when CuPy is available.
Ignored when ``mask`` is provided (the mask path stays on CPU).

Returns
-------
ndarray
Tissue interface depth

"""
if use_gpu and mask is None:
from linumpy.gpu import GPU_AVAILABLE

if GPU_AVAILABLE:
from linumpy.gpu.interface import find_tissue_interface_gpu

return find_tissue_interface_gpu(
vol,
s_xy=s_xy,
s_z=s_z,
use_log=use_log,
order=order,
detect_cutting_errors=detect_cutting_errors,
)

if use_log:
vol_p = np.copy(vol)
vol_p[vol > 0] = np.log(vol[vol > 0])
15 changes: 15 additions & 0 deletions linumpy/gpu/__init__.py
@@ -165,6 +165,21 @@ def to_cpu(array: Any) -> Any:
return array


def is_cupy_array(array: Any) -> bool:
"""Return True iff ``array`` is a CuPy ``ndarray``.

Importing CuPy is gated on availability so this stays cheap to call from
code paths that may be hit without CUDA.
"""
if not GPU_AVAILABLE:
return False
try:
import cupy as cp
except ImportError:
return False
return isinstance(array, cp.ndarray)


def gpu_info() -> Any:
"""
Get information about GPU availability and configuration.
50 changes: 50 additions & 0 deletions linumpy/gpu/interface.py
@@ -0,0 +1,50 @@
"""GPU implementation of tissue interface detection.

Mirrors the CPU path in :func:`linumpy.geometry.interface.find_tissue_interface`
but routes the heavy filters through ``cupyx.scipy.ndimage`` to avoid
host-device round trips when the caller already holds GPU data or when
the volume is large enough that the transfer cost is amortised.
"""

from typing import Any

import numpy as np

from . import to_cpu


def find_tissue_interface_gpu(
vol: Any,
s_xy: int = 15,
s_z: int = 2,
use_log: bool = True,
order: int = 1,
detect_cutting_errors: bool = False,
) -> np.ndarray:
"""GPU equivalent of ``find_tissue_interface`` (no mask path).

Always returns a NumPy array of integer depths so callers do not need
to be aware of the device.
"""
import cupy as cp
from cupyx.scipy.ndimage import gaussian_filter1d, uniform_filter

vol_gpu = cp.asarray(vol)
if use_log:
vol_p = vol_gpu.astype(cp.float32, copy=True)
positive = vol_gpu > 0
vol_p[positive] = cp.log(vol_gpu[positive])
else:
vol_p = vol_gpu

vol_p = uniform_filter(vol_p, (s_xy, s_xy, 0))
vol_g = gaussian_filter1d(vol_p, s_z, axis=2, order=order)
z0 = cp.ceil(vol_g.argmax(axis=2) + s_z * 0.5).astype(cp.int64)

if detect_cutting_errors:
vol_p = gaussian_filter1d(vol_p, s_z, axis=2, order=0)
z0_p = cp.abs(vol_p).argmax(axis=2)
mask_max = z0_p < z0
z0 = cp.where(mask_max, z0_p, z0)

return to_cpu(z0)
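The detection above reduces to an argmax of the axial intensity rise per (x, y) column. A minimal NumPy sketch, with a plain finite difference standing in for the order-1 Gaussian derivative and no lateral smoothing, shows the idea:

```python
import numpy as np


def interface_depth_sketch(vol: np.ndarray) -> np.ndarray:
    """Per-(x, y) depth of the strongest axial intensity increase.

    Simplified stand-in for find_tissue_interface: np.diff along axis 2
    replaces the smoothed Gaussian derivative, and the s_xy lateral
    smoothing is omitted.
    """
    grad = np.diff(vol.astype(np.float32), axis=2)  # axial first difference
    return grad.argmax(axis=2) + 1  # +1: diff[i] sits between slices i and i+1


# Synthetic volume: background 0, tissue 1 starting at depth 12.
vol = np.zeros((4, 4, 32), dtype=np.float32)
vol[:, :, 12:] = 1.0
assert (interface_depth_sketch(vol) == 12).all()
```

The real implementations add log-compression, lateral uniform filtering, and the cutting-error correction pass on top of this core.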
22 changes: 18 additions & 4 deletions linumpy/gpu/interpolation.py
@@ -9,7 +9,7 @@

import numpy as np

from . import GPU_AVAILABLE, to_cpu
from . import GPU_AVAILABLE, is_cupy_array, to_cpu


def affine_transform(image: Any, matrix: Any, output_shape: Any = None, order: Any = 1, use_gpu: Any = True) -> Any:
@@ -44,15 +44,18 @@ def affine_transform(image: Any, matrix: Any, output_shape: Any = None, order: A


def _affine_transform_gpu(image: Any, matrix: Any, output_shape: Any, order: Any) -> Any:
"""GPU implementation of affine transform."""
"""GPU implementation of affine transform. Device-preserving."""
import cupy as cp
from cupyx.scipy.ndimage import affine_transform as cp_affine

input_was_gpu = is_cupy_array(image)
img_gpu = cp.asarray(image.astype(np.float32))
matrix_gpu = cp.asarray(matrix.astype(np.float32))

result = cp_affine(img_gpu, matrix_gpu, output_shape=output_shape, order=order)

if input_was_gpu:
return result
return to_cpu(result)


@@ -90,15 +93,18 @@ def map_coordinates(image: Any, coordinates: Any, order: Any = 1, use_gpu: Any =


def _map_coordinates_gpu(image: Any, coordinates: Any, order: Any) -> Any:
"""GPU implementation of map_coordinates."""
"""GPU implementation of map_coordinates. Device-preserving."""
import cupy as cp
from cupyx.scipy.ndimage import map_coordinates as cp_map

input_was_gpu = is_cupy_array(image)
img_gpu = cp.asarray(image.astype(np.float32))
coords_gpu = cp.asarray(coordinates.astype(np.float32))

result = cp_map(img_gpu, coords_gpu, order=order)

if input_was_gpu:
return result
return to_cpu(result)


@@ -138,11 +144,17 @@ def resize(image: Any, output_shape: Any, order: Any = 1, anti_aliasing: Any = T


def _resize_gpu(image: Any, output_shape: Any, order: Any, anti_aliasing: Any) -> Any:
"""GPU implementation of resize using zoom."""
"""GPU implementation of resize using zoom.

Device-preserving: returns a CuPy array if *image* was already on device,
a NumPy array if *image* was on host. This avoids forcing a D->H copy when
callers want to keep tiles resident for downstream GPU compute / writes.
"""
import cupy as cp
from cupyx.scipy.ndimage import gaussian_filter as cp_gaussian
from cupyx.scipy.ndimage import zoom as cp_zoom

input_was_gpu = is_cupy_array(image)
img_gpu = cp.asarray(image if image.dtype == np.float32 else image.astype(np.float32))

# Scale factors: input/output for Gaussian sigma, output/input for zoom.
@@ -158,6 +170,8 @@ def _resize_gpu(image: Any, output_shape: Any, order: Any, anti_aliasing: Any) -

result = cp_zoom(img_gpu, zoom_factors, order=order)

if input_was_gpu:
return result
return to_cpu(result)


Expand Down
Loading