diff --git a/docs/GPU_ACCELERATION.md b/docs/GPU_ACCELERATION.md index c41d871f..d863bb64 100644 --- a/docs/GPU_ACCELERATION.md +++ b/docs/GPU_ACCELERATION.md @@ -33,8 +33,8 @@ backend selection is per-call (`backend="cpu" | "gpu" | "auto"`); the auto path nvidia-smi | grep "CUDA Version" # Install linumpy with GPU support (choose your CUDA version) -uv pip install 'linumpy[gpu]' # CUDA 12.x (default) -uv pip install 'linumpy[gpu-cuda13]' # CUDA 13.x +uv pip install 'linumpy[gpu]' # CUDA 13.x (default) +uv pip install 'linumpy[gpu-cuda12]' # CUDA 12.x # Verify GPU linum_gpu_info.py @@ -57,8 +57,8 @@ linum_diagnose_pipeline.py --benchmark | CUDA Version | CuPy Package | linumpy extra | |--------------|--------------|---------------| -| CUDA 12.x | `cupy-cuda12x` | `linumpy[gpu]` | -| CUDA 13.x | `cupy-cuda13x` | `linumpy[gpu-cuda13]` | +| CUDA 13.x | `cupy-cuda13x` | `linumpy[gpu]` (default) | +| CUDA 12.x | `cupy-cuda12x` | `linumpy[gpu-cuda12]` | --- @@ -232,4 +232,64 @@ from linumpy.gpu.fft_ops import fft2, ifft2, phase_correlation from linumpy.gpu.interpolation import resize, affine_transform from linumpy.gpu.morphology import binary_closing, gaussian_filter from linumpy.gpu.array_ops import normalize_percentile, clip_percentile +from linumpy.gpu.zarr_io import read_zarr_to_gpu ``` + +--- + +## Fast zarr → GPU loading + +For zarr arrays on local NVMe, `linumpy.gpu.zarr_io.read_zarr_to_gpu` is the recommended entry point. It dispatches to the fastest backend available at runtime: + +```python +from linumpy.gpu.zarr_io import read_zarr_to_gpu + +dev = read_zarr_to_gpu("/scratch_nvme/volume.zarr") +# dev is a cupy.ndarray +``` + +Backend implementations live in their own modules: + +- `linumpy.gpu.kvikio_zarr` — kvikio / GPUDirect Storage reader (uncompressed zarr v2/v3 only). + +Selection order (when `prefer="auto"`): + +1. **kvikio (GPUDirect Storage, native mode)** — chunks DMA'd directly from NVMe into GPU memory. Fastest. 
Requires `kvikio` installed, GDS native mode enabled (`/etc/cufile.json`: `allow_compat_mode=false`), and an uncompressed zarr. +2. **`zarr.config.enable_gpu()`** — host I/O with on-host decode, single H→D copy. Works for any zarr (compressed or not) and is the automatic fallback when GDS is unavailable, in compat mode, or the array is compressed. + +You can force a specific path with `prefer="kvikio"` or `prefer="zarr-gpu"`. The legacy `cupy.asarray(zarr.open_array(...)[:])` path is kept only as a reference baseline in `linum_benchmark_kvikio_zarr.py`. + +### Reference benchmark + +`scripts/linum_benchmark_kvikio_zarr.py` measures all three paths. On a 16 GiB float32 zarr v3 (256³ chunks) on local NVMe ext4 with an RTX A6000: + +| Path | Cold | Warm | +|---|---|---| +| kvikio (GDS native) | 8.2 GiB/s | **9.9 GiB/s** | +| `zarr.config.enable_gpu()` | 6.3 GiB/s | 7.1 GiB/s | +| `zarr → numpy → cupy.asarray` | 1.1 GiB/s | 2.8 GiB/s | + +In compat mode (GDS bounce-buffer), kvikio drops to ~4 GiB/s — slower than the `zarr-gpu` path. The auto selector detects this via `kvikio.defaults.compat_mode()` and prefers `zarr-gpu` in that case. + +### Installing the GDS path + +```bash +# CuPy + linumpy GPU support +uv pip install 'linumpy[gpu]'        # CUDA 13.x (default) +uv pip install 'linumpy[gpu-cuda12]' # CUDA 12.x + +# kvikio (optional, only needed for the GDS fast path) +uv pip install 'linumpy[gds]'        # CUDA 13.x (default) +uv pip install 'linumpy[gds-cuda12]' # CUDA 12.x +``` + +### Enabling native GDS + +Native GDS additionally requires: + +- A CUDA-aware filesystem on the source path (ext4 / xfs on local NVMe is fine; NFS, overlayfs, encrypted FS are not). +- `/etc/cufile.json`: `properties.allow_compat_mode = false`. +- IOMMU disabled or in passthrough (`amd_iommu=off iommu=off` on AMD; `intel_iommu=off` on Intel). +- nvidia-fs DKMS module loaded with matching ABI; verify `dmesg | grep nvidia_fs` shows no "no extended symbol version" warnings after driver/kernel updates. 
+ +If any of these are missing, kvikio silently falls back to a POSIX bounce-buffer (compat mode) and `read_zarr_to_gpu` will route around it. diff --git a/docs/LIBRARY_MODULES.md b/docs/LIBRARY_MODULES.md index 943bde24..7ac2e53c 100644 --- a/docs/LIBRARY_MODULES.md +++ b/docs/LIBRARY_MODULES.md @@ -40,15 +40,21 @@ OME-Zarr is the canonical on-disk format. Two writers are provided: Common entry points: ```python -from linumpy.io.zarr import read_omezarr, save_omezarr, OmeZarrWriter, AnalysisOmeZarrWriter +from linumpy.io.zarr import read_omezarr, read_omezarr_array, save_omezarr, OmeZarrWriter, AnalysisOmeZarrWriter from linumpy.io.test_data import get_data from linumpy.io import slice_config # slice_config.json reader/writer from linumpy.io.thorlabs import ThorImageOCT ``` -`read_omezarr(path, level=0)` returns a `(zarr.Array, voxel_size)` tuple. The -voxel size is ordered to match the array axes (Z, Y, X for 3D volumes; -Y, X for 2D mosaics). +`read_omezarr(path, level=0)` returns a `(zarr.Array, voxel_size)` tuple (lazy). +`read_omezarr_array(path, level=0, use_gpu=False)` is the high-level entry +point: it materialises the requested level into memory and returns +`(array, voxel_size)`. With ``use_gpu=False`` (default) the array is a +``numpy.ndarray``; with ``use_gpu=True`` it dispatches through +[`linumpy.gpu.zarr_io.read_zarr_to_gpu`](GPU_ACCELERATION.md) and returns a +``cupy.ndarray`` (kvikio/GPUDirect Storage when available, otherwise the +``zarr.config.enable_gpu`` fallback). Voxel size is ordered to match the array +axes (Z, Y, X for 3D volumes; Y, X for 2D mosaics). See [Mosaic Grid Format](MOSAIC_GRID_FORMAT.md) and [Slice Config Feature](SLICE_CONFIG_FEATURE.md) for format details. @@ -136,6 +142,9 @@ CuPy-backed versions of hot paths. Each public entry point either takes a * `gpu.registration`, `gpu.corrections` — GPU registration and correction passes used by `linum_estimate_transform.py`, `linum_normalize_intensities_per_slice.py`, etc. 
+* `gpu.zarr_io` — high-level `read_zarr_to_gpu` dispatcher: picks the + fastest backend available (kvikio/GDS native → `zarr.config.enable_gpu`). +* `gpu.kvikio_zarr` — kvikio / GPUDirect Storage backend for the dispatcher. See [GPU Acceleration](GPU_ACCELERATION.md) and [N4 GPU](N4_GPU.md). diff --git a/linumpy/geometry/interface.py b/linumpy/geometry/interface.py index 47996b59..9fe943eb 100644 --- a/linumpy/geometry/interface.py +++ b/linumpy/geometry/interface.py @@ -116,6 +116,7 @@ def find_tissue_interface( mask: np.ndarray | None = None, order: int = 1, detect_cutting_errors: bool = False, + use_gpu: bool = False, ) -> np.ndarray: """Detect the tissue interface. @@ -135,6 +136,9 @@ def find_tissue_interface( Gaussian filter order. detect_cutting_errors : bool If True, detect and correct cutting artefacts. + use_gpu : bool + If True, use the GPU implementation when CuPy is available. + Ignored when ``mask`` is provided (the mask path stays on CPU). Returns ------- @@ -142,6 +146,21 @@ def find_tissue_interface( Tissue interface depth """ + if use_gpu and mask is None: + from linumpy.gpu import GPU_AVAILABLE + + if GPU_AVAILABLE: + from linumpy.gpu.interface import find_tissue_interface_gpu + + return find_tissue_interface_gpu( + vol, + s_xy=s_xy, + s_z=s_z, + use_log=use_log, + order=order, + detect_cutting_errors=detect_cutting_errors, + ) + if use_log: vol_p = np.copy(vol) vol_p[vol > 0] = np.log(vol[vol > 0]) diff --git a/linumpy/gpu/__init__.py b/linumpy/gpu/__init__.py index 49fae783..499f8ead 100644 --- a/linumpy/gpu/__init__.py +++ b/linumpy/gpu/__init__.py @@ -165,6 +165,21 @@ def to_cpu(array: Any) -> Any: return array +def is_cupy_array(array: Any) -> bool: + """Return True iff ``array`` is a CuPy ``ndarray``. + + Importing CuPy is gated on availability so this stays cheap to call from + code paths that may be hit without CUDA. 
+ """ + if not GPU_AVAILABLE: + return False + try: + import cupy as cp + except ImportError: + return False + return isinstance(array, cp.ndarray) + + def gpu_info() -> Any: """ Get information about GPU availability and configuration. diff --git a/linumpy/gpu/interface.py b/linumpy/gpu/interface.py new file mode 100644 index 00000000..c717e6b7 --- /dev/null +++ b/linumpy/gpu/interface.py @@ -0,0 +1,50 @@ +"""GPU implementation of tissue interface detection. + +Mirrors the CPU path in :func:`linumpy.geometry.interface.find_tissue_interface` +but routes the heavy filters through ``cupyx.scipy.ndimage`` to avoid +host-device round trips when the caller already holds GPU data or when +the volume is large enough that the transfer cost is amortised. +""" + +from typing import Any + +import numpy as np + +from . import to_cpu + + +def find_tissue_interface_gpu( + vol: Any, + s_xy: int = 15, + s_z: int = 2, + use_log: bool = True, + order: int = 1, + detect_cutting_errors: bool = False, +) -> np.ndarray: + """GPU equivalent of ``find_tissue_interface`` (no mask path). + + Always returns a NumPy array of integer depths so callers do not need + to be aware of the device. 
+ """ + import cupy as cp + from cupyx.scipy.ndimage import gaussian_filter1d, uniform_filter + + vol_gpu = cp.asarray(vol) + if use_log: + vol_p = vol_gpu.astype(cp.float32, copy=True) + positive = vol_gpu > 0 + vol_p[positive] = cp.log(vol_gpu[positive]) + else: + vol_p = vol_gpu + + vol_p = uniform_filter(vol_p, (s_xy, s_xy, 0)) + vol_g = gaussian_filter1d(vol_p, s_z, axis=2, order=order) + z0 = cp.ceil(vol_g.argmax(axis=2) + s_z * 0.5).astype(cp.int64) + + if detect_cutting_errors: + vol_p = gaussian_filter1d(vol_p, s_z, axis=2, order=0) + z0_p = cp.abs(vol_p).argmax(axis=2) + mask_max = z0_p < z0 + z0 = cp.where(mask_max, z0_p, z0) + + return to_cpu(z0) diff --git a/linumpy/gpu/interpolation.py b/linumpy/gpu/interpolation.py index 728367c0..97da205d 100644 --- a/linumpy/gpu/interpolation.py +++ b/linumpy/gpu/interpolation.py @@ -9,7 +9,7 @@ import numpy as np -from . import GPU_AVAILABLE, to_cpu +from . import GPU_AVAILABLE, is_cupy_array, to_cpu def affine_transform(image: Any, matrix: Any, output_shape: Any = None, order: Any = 1, use_gpu: Any = True) -> Any: @@ -44,15 +44,18 @@ def affine_transform(image: Any, matrix: Any, output_shape: Any = None, order: A def _affine_transform_gpu(image: Any, matrix: Any, output_shape: Any, order: Any) -> Any: - """GPU implementation of affine transform.""" + """GPU implementation of affine transform. 
Device-preserving.""" import cupy as cp from cupyx.scipy.ndimage import affine_transform as cp_affine + input_was_gpu = is_cupy_array(image) img_gpu = cp.asarray(image.astype(np.float32)) matrix_gpu = cp.asarray(matrix.astype(np.float32)) result = cp_affine(img_gpu, matrix_gpu, output_shape=output_shape, order=order) + if input_was_gpu: + return result return to_cpu(result) @@ -90,15 +93,18 @@ def map_coordinates(image: Any, coordinates: Any, order: Any = 1, use_gpu: Any = def _map_coordinates_gpu(image: Any, coordinates: Any, order: Any) -> Any: - """GPU implementation of map_coordinates.""" + """GPU implementation of map_coordinates. Device-preserving.""" import cupy as cp from cupyx.scipy.ndimage import map_coordinates as cp_map + input_was_gpu = is_cupy_array(image) img_gpu = cp.asarray(image.astype(np.float32)) coords_gpu = cp.asarray(coordinates.astype(np.float32)) result = cp_map(img_gpu, coords_gpu, order=order) + if input_was_gpu: + return result return to_cpu(result) @@ -138,11 +144,17 @@ def resize(image: Any, output_shape: Any, order: Any = 1, anti_aliasing: Any = T def _resize_gpu(image: Any, output_shape: Any, order: Any, anti_aliasing: Any) -> Any: - """GPU implementation of resize using zoom.""" + """GPU implementation of resize using zoom. + + Device-preserving: returns a CuPy array if *image* was already on device, + a NumPy array if *image* was on host. This avoids forcing a D->H copy when + callers want to keep tiles resident for downstream GPU compute / writes. + """ import cupy as cp from cupyx.scipy.ndimage import gaussian_filter as cp_gaussian from cupyx.scipy.ndimage import zoom as cp_zoom + input_was_gpu = is_cupy_array(image) img_gpu = cp.asarray(image if image.dtype == np.float32 else image.astype(np.float32)) # Scale factors: input/output for Gaussian sigma, output/input for zoom. 
@@ -158,6 +170,8 @@ def _resize_gpu(image: Any, output_shape: Any, order: Any, anti_aliasing: Any) - result = cp_zoom(img_gpu, zoom_factors, order=order) + if input_was_gpu: + return result return to_cpu(result) diff --git a/linumpy/gpu/kvikio_zarr.py b/linumpy/gpu/kvikio_zarr.py new file mode 100644 index 00000000..93a65b49 --- /dev/null +++ b/linumpy/gpu/kvikio_zarr.py @@ -0,0 +1,228 @@ +"""kvikio / GPUDirect Storage reader for uncompressed zarr arrays. + +This module implements the :func:`read_zarr_via_kvikio` backend used by +:mod:`linumpy.gpu.zarr_io`. It reads zarr v2 *or* zarr v3 chunks directly +into GPU memory using ``kvikio.CuFile``. The version is auto-detected from +the on-disk metadata file (``zarr.json`` for v3, ``.zarray`` for v2). + +Most callers should use :func:`linumpy.gpu.zarr_io.read_zarr_to_gpu`, which +dispatches to this backend only when GDS native mode is available and the +array is uncompressed; otherwise it falls back to ``zarr.config.enable_gpu``. + +Supported on-disk formats +------------------------- + +* zarr v3: ``codecs=[{"name": "bytes"}]`` only (or empty) — raw little/big- + endian bytes. Any compression codec (blosc, gzip, zstd, ...) is rejected + because GDS reads bytes verbatim and on-device decompression would require + nvCOMP. +* zarr v2: ``compressor=None``, ``filters=None``, ``order='C'``. + +Notes +----- +* Requires ``kvikio`` and ``cupy``. Both are imported lazily so the rest of + linumpy is unaffected if they are not installed. +* For the GDS fast path the source filesystem must support GDS natively + (ext4 on local NVMe, IOMMU disabled or in passthrough, and + ``properties.use_compat_mode=false`` in ``/etc/cufile.json``). Otherwise + kvikio falls back to a posix bounce-buffer path with no speed-up. 
+""" + +from __future__ import annotations + +import json +import sys +from itertools import product +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import numpy as np + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator + + +def _require_kvikio() -> tuple[Any, Any]: + """Import kvikio + cupy lazily and return (kvikio, cupy).""" + try: + import cupy + import kvikio + except ImportError as exc: # pragma: no cover - hardware-dependent + raise RuntimeError( + "kvikio + cupy are required for the GDS prototype. Install with:\n" + " pip install kvikio-cu13 cupy-cuda13x # or matching your CUDA" + ) from exc + return kvikio, cupy + + +def _parse_v3_dtype(dt: Any) -> np.dtype: + """Map a zarr v3 ``data_type`` field to a numpy dtype.""" + if isinstance(dt, str): + return np.dtype(dt) + raise NotImplementedError(f"unsupported v3 data_type: {dt!r}") + + +def _v3_chunk_path(array_path: Path, idx: tuple[int, ...], encoding: dict) -> Path: + """Build the on-disk chunk path for a zarr v3 array.""" + name = encoding.get("name", "default") + sep = encoding.get("configuration", {}).get("separator", "/") + parts = sep.join(str(i) for i in idx) + if name == "default": + return array_path / "c" / parts if sep == "/" else array_path / f"c{sep}{parts}" + if name == "v2": + return array_path / parts + raise NotImplementedError(f"unsupported chunk_key_encoding: {name!r}") + + +def _v2_chunk_path(array_path: Path, idx: tuple[int, ...], dim_separator: str) -> Path: + return array_path / dim_separator.join(str(i) for i in idx) + + +class _ArraySpec: + """Resolved, format-agnostic view of a zarr array on disk.""" + + __slots__ = ("_v2_dim_sep", "_v3_encoding", "chunks", "dtype", "fill_value", "format", "path", "shape") + + def __init__( + self, + *, + path: Path, + shape: tuple[int, ...], + chunks: tuple[int, ...], + dtype: np.dtype, + fill_value: Any, + format: int, + v3_encoding: dict | None = None, + v2_dim_sep: str | None = None, + ) -> None: + self.path 
= path + self.shape = shape + self.chunks = chunks + self.dtype = dtype + self.fill_value = fill_value + self.format = format + self._v3_encoding = v3_encoding + self._v2_dim_sep = v2_dim_sep + + def chunk_path(self, idx: tuple[int, ...]) -> Path: + if self.format == 3: + assert self._v3_encoding is not None + return _v3_chunk_path(self.path, idx, self._v3_encoding) + assert self._v2_dim_sep is not None + return _v2_chunk_path(self.path, idx, self._v2_dim_sep) + + +def _load_array_spec(array_path: Path) -> _ArraySpec: + """Inspect ``array_path`` and return a normalized spec for v2 or v3.""" + v3_meta = array_path / "zarr.json" + v2_meta = array_path / ".zarray" + + if v3_meta.exists(): + meta = json.loads(v3_meta.read_text()) + if meta.get("zarr_format") != 3: + raise ValueError(f"{array_path}: zarr.json has zarr_format != 3") + if meta.get("node_type") != "array": + raise ValueError(f"{array_path}: zarr.json is not an array node") + codecs = meta.get("codecs", []) + non_bytes = [c for c in codecs if c.get("name") != "bytes"] + if non_bytes: + raise NotImplementedError( + f"{array_path}: codecs={[c.get('name') for c in codecs]!r}; this prototype " + "requires raw bytes only (no compression). On-device decompression needs nvCOMP." 
+ ) + host_endian = "big" if sys.byteorder == "big" else "little" + for c in codecs: + endian = c.get("configuration", {}).get("endian", "little") + if endian != host_endian: + raise NotImplementedError(f"{array_path}: endian={endian!r} differs from host") + chunk_grid = meta.get("chunk_grid", {}) + if chunk_grid.get("name") != "regular": + raise NotImplementedError(f"{array_path}: chunk_grid={chunk_grid.get('name')!r}") + return _ArraySpec( + path=array_path, + shape=tuple(meta["shape"]), + chunks=tuple(chunk_grid["configuration"]["chunk_shape"]), + dtype=_parse_v3_dtype(meta["data_type"]), + fill_value=meta.get("fill_value", 0), + format=3, + v3_encoding=meta.get("chunk_key_encoding", {"name": "default", "configuration": {"separator": "/"}}), + ) + + if v2_meta.exists(): + meta = json.loads(v2_meta.read_text()) + if meta.get("zarr_format") != 2: + raise ValueError(f"{array_path}: .zarray has zarr_format != 2") + if meta.get("compressor") is not None: + raise NotImplementedError( + f"{array_path}: compressor={meta['compressor']!r}; this prototype " + "requires uncompressed chunks. On-device decompression needs nvCOMP." 
+ ) + if meta.get("order", "C") != "C": + raise NotImplementedError(f"{array_path}: order={meta['order']!r} unsupported") + if meta.get("filters"): + raise NotImplementedError(f"{array_path}: filters unsupported in prototype") + return _ArraySpec( + path=array_path, + shape=tuple(meta["shape"]), + chunks=tuple(meta["chunks"]), + dtype=np.dtype(meta["dtype"]), + fill_value=meta.get("fill_value", 0), + format=2, + v2_dim_sep=meta.get("dimension_separator", "."), + ) + + raise FileNotFoundError(f"{array_path}: no zarr.json (v3) or .zarray (v2) found") + + +def _iter_chunk_indices(shape: Iterable[int], chunks: Iterable[int]) -> Iterator[tuple[int, ...]]: + n = [(s + c - 1) // c for s, c in zip(shape, chunks, strict=True)] + yield from product(*[range(k) for k in n]) + + +def read_zarr_via_kvikio(array_path: str | Path) -> Any: + """Load a full uncompressed zarr (v2 or v3) array into a CuPy array via GDS. + + Parameters + ---------- + array_path + Path to the zarr array directory (containing ``zarr.json`` for v3 or + ``.zarray`` for v2). + + Returns + ------- + cupy.ndarray + Device-resident array of shape and dtype matching the zarr metadata. 
+ """ + kvikio, cupy = _require_kvikio() + spec = _load_array_spec(Path(array_path)) + + out = cupy.full(spec.shape, spec.fill_value, dtype=spec.dtype) + chunk_nbytes_full = int(np.prod(spec.chunks) * spec.dtype.itemsize) + scratch = cupy.empty(spec.chunks, dtype=spec.dtype) + + for idx in _iter_chunk_indices(spec.shape, spec.chunks): + cf_path = spec.chunk_path(idx) + if not cf_path.exists(): + continue # zarr fill-value semantics + + slices: list[slice] = [] + edge_shape: list[int] = [] + for k, c, s in zip(idx, spec.chunks, spec.shape, strict=True): + start = k * c + stop = min(start + c, s) + slices.append(slice(start, stop)) + edge_shape.append(stop - start) + edge_shape_t = tuple(edge_shape) + + # kvikio.CuFile.pread requires a contiguous device buffer; a slice + # into ``out`` is generally not contiguous, so we read into a + # chunk-shaped scratch and copy the valid region into ``out``. + with kvikio.CuFile(str(cf_path), "r") as f: + f.pread(scratch, chunk_nbytes_full).get() + if edge_shape_t == spec.chunks: + out[tuple(slices)] = scratch + else: + sub = tuple(slice(0, e) for e in edge_shape_t) + out[tuple(slices)] = scratch[sub] + + return out diff --git a/linumpy/gpu/zarr_io.py b/linumpy/gpu/zarr_io.py new file mode 100644 index 00000000..43b2e97e --- /dev/null +++ b/linumpy/gpu/zarr_io.py @@ -0,0 +1,179 @@ +"""High-level zarr → GPU loading with automatic backend selection. + +Public entry points: + +* :func:`read_zarr_to_gpu` — load an entire zarr array onto the GPU using the + fastest available path (kvikio / GDS, falling back to ``zarr.config.enable_gpu``). +* :func:`gpu_zarr_context` — context manager that flips zarr into GPU mode for + its duration, so subsequent ``zarr.open_array(...)`` calls return arrays whose + slicing materialises directly into ``cupy.ndarray``. Use this for tile-by-tile + / per-slab access patterns where loading the whole volume at once is wasteful. + +Selection order (when ``prefer='auto'``): + +1. 
**kvikio (GPUDirect Storage, native mode)** — chunks DMA'd directly from + NVMe into GPU memory. Requires ``kvikio`` installed, GDS in native mode, + and an uncompressed zarr v2/v3. +2. **zarr.config.enable_gpu()** — host I/O with on-host decode then a single + H→D copy. Works for any zarr (compressed or not). The fallback when GDS + is unavailable, in compat mode, or the array is compressed. + +Backend implementations live in their own modules: + +* :mod:`linumpy.gpu.kvikio_zarr` — kvikio / GDS reader. + +Reference numbers on a 16 GiB float32 zarr v3 (RTX A6000, ext4, GDS native) +warm cache: kvikio ~9.9 GiB/s, zarr-gpu ~7.1 GiB/s. +""" + +from __future__ import annotations + +from contextlib import contextmanager +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal + +if TYPE_CHECKING: + from collections.abc import Iterator + +Backend = Literal["auto", "kvikio", "zarr-gpu"] + + +def _kvikio_native_mode_available() -> bool: + """Return True iff kvikio is importable and not stuck in compat mode. + + kvikio always succeeds at import; the relevant question is whether + ``cufile.json`` (or env vars) allow native GDS. ``compat_mode`` returns a + ``CompatMode`` enum: ``OFF`` means native is forced/used, ``ON`` means + compat-only, ``AUTO`` means kvikio picks per-mount. 
+ """ + try: + import kvikio # noqa: F401 + from kvikio.defaults import compat_mode + except ImportError: + return False + try: + mode = compat_mode() + except Exception: # pragma: no cover - older kvikio variants + return False + name = getattr(mode, "name", str(mode)).upper() + return name in {"OFF", "AUTO"} + + +def _array_is_kvikio_compatible(array_path: Path) -> bool: + """Return True iff the on-disk array meets kvikio's raw-bytes constraints.""" + from linumpy.gpu.kvikio_zarr import _load_array_spec + + try: + _load_array_spec(array_path) + except (NotImplementedError, FileNotFoundError, ValueError): + return False + return True + + +def read_zarr_via_zarr_gpu(array_path: str | Path) -> Any: + """Load a zarr array onto the GPU using ``zarr.config.enable_gpu()``. + + Host I/O with on-host decode, then a single H→D copy. Works for any zarr + array (including compressed) and is the recommended fallback when GDS is + unavailable or stuck in compat mode. + + Parameters + ---------- + array_path + Path to the zarr array directory. + + Returns + ------- + cupy.ndarray + Device-resident array. + """ + try: + import cupy + import zarr + except ImportError as exc: # pragma: no cover - hardware-dependent + raise RuntimeError("cupy + zarr are required for the zarr-gpu fallback path.") from exc + + with zarr.config.enable_gpu(): + z = zarr.open_array(str(array_path), mode="r") + dev = z[:] + cupy.cuda.Stream.null.synchronize() + return dev + + +def read_zarr_to_gpu(array_path: str | Path, *, prefer: Backend = "auto") -> Any: + """Load a zarr array onto the GPU using the fastest available path. + + Selection order (when ``prefer='auto'``): + + 1. kvikio / GDS — only if kvikio is in native or auto mode AND the array + is uncompressed v2/v3. + 2. ``zarr.config.enable_gpu()`` — works for any zarr. + + Parameters + ---------- + array_path + Path to the zarr array directory. + prefer + ``'auto'`` (default), ``'kvikio'``, or ``'zarr-gpu'``. 
Forcing a path + will raise if that path is unavailable for this array. + + Returns + ------- + cupy.ndarray + Device-resident array of shape and dtype matching the zarr metadata. + """ + path = Path(array_path) + + if prefer == "kvikio": + from linumpy.gpu.kvikio_zarr import read_zarr_via_kvikio + + return read_zarr_via_kvikio(path) + if prefer == "zarr-gpu": + return read_zarr_via_zarr_gpu(path) + if prefer != "auto": + raise ValueError(f"unknown prefer={prefer!r}; expected 'auto', 'kvikio', or 'zarr-gpu'") + + if _kvikio_native_mode_available() and _array_is_kvikio_compatible(path): + from linumpy.gpu.kvikio_zarr import read_zarr_via_kvikio + + try: + return read_zarr_via_kvikio(path) + except (RuntimeError, OSError, NotImplementedError): + pass + + return read_zarr_via_zarr_gpu(path) + + +@contextmanager +def gpu_zarr_context() -> Iterator[None]: + """Context manager that puts zarr into GPU mode for arbitrary slice reads. + + Inside this context, any subsequent ``zarr.open_array(...)`` returns an + array whose slicing produces ``cupy.ndarray`` results — chunks are decoded + on host then transferred to device on each ``vol[slice]`` operation. This + is the right pattern for tile-by-tile or per-slab work where loading the + full volume at once is wasteful. + + Outside this context, zarr falls back to its normal numpy-backed mode. + + Examples + -------- + >>> from linumpy.gpu.zarr_io import gpu_zarr_context + >>> from linumpy.io.zarr import read_omezarr + >>> with gpu_zarr_context(): + ... vol, _ = read_omezarr(path, level=0) + ... for tile_region in regions: + ... tile_gpu = vol[tile_region] # already cupy + + Raises + ------ + RuntimeError + If zarr is unavailable. 
+ """ + try: + import zarr + except ImportError as exc: # pragma: no cover - zarr is a hard dep + raise RuntimeError("zarr is required for gpu_zarr_context().") from exc + + with zarr.config.enable_gpu(): + yield diff --git a/linumpy/intensity/bias_field.py b/linumpy/intensity/bias_field.py index 62cdf815..16cf95b2 100644 --- a/linumpy/intensity/bias_field.py +++ b/linumpy/intensity/bias_field.py @@ -180,7 +180,9 @@ def compute_tissue_mask( use_gpu : bool If True, run the dominant 3-D ``gaussian_filter`` on GPU via CuPy (Z-chunked for memory safety). Falls back to CPU silently - if CuPy is unavailable. Otsu and morphology stay on CPU. + if CuPy is unavailable. Otsu and morphology stay on CPU. When + *vol* is already a ``cupy.ndarray`` the GPU path is used + regardless and slabs are read with no host round-trip. Returns ------- @@ -191,7 +193,11 @@ def compute_tissue_mask( from skimage.filters import threshold_otsu from skimage.morphology import disk - if use_gpu: + from linumpy.gpu import is_cupy_array + + # Cupy input always goes through the GPU path; the CPU fallback uses + # numpy/scipy ops that don't accept cupy arrays. + if use_gpu or is_cupy_array(vol): try: return _compute_tissue_mask_gpu( vol, @@ -202,7 +208,9 @@ def compute_tissue_mask( z_closing_sections=z_closing_sections, ) except ImportError: - pass # CuPy missing -- fall back to CPU below. + if is_cupy_array(vol): + raise # cupy installed but cupyx missing — surface clearly + # CuPy missing -- fall back to CPU below. # Anisotropic 3-D smoothing: stronger in XY, light in Z to preserve # oblique tissue boundaries without per-Z Otsu noise. diff --git a/linumpy/intensity/normalization.py b/linumpy/intensity/normalization.py index f66bfebc..ba30fcd7 100644 --- a/linumpy/intensity/normalization.py +++ b/linumpy/intensity/normalization.py @@ -6,6 +6,8 @@ based on agarose background detection. 
""" +from typing import Any + import numpy as np @@ -289,25 +291,38 @@ def apply_histogram_matching( use_gpu : bool If True, run the per-chunk matching loop on GPU via CuPy. Falls back to CPU silently if CuPy is unavailable. The volume itself is moved to - GPU one chunk at a time, so memory usage stays bounded. + GPU one chunk at a time, so memory usage stays bounded. When *vol* is + already a ``cupy.ndarray`` the GPU path is used regardless and slabs + are read with no host round-trip. Returns ------- np.ndarray Histogram-matched volume. """ - flat_all = vol.ravel() - ref_bins, ref_cdf, tissue_count = _build_tissue_cdf(flat_all, n_bins, tissue_threshold) + from linumpy.gpu import is_cupy_array + + cupy_input = is_cupy_array(vol) + + if cupy_input: + # Build the reference CDF on device so we don't D→H the whole volume. + ref_bins, ref_cdf, tissue_count = _build_tissue_cdf_gpu(vol, n_bins, tissue_threshold) + else: + flat_all = vol.ravel() + ref_bins, ref_cdf, tissue_count = _build_tissue_cdf(flat_all, n_bins, tissue_threshold) if tissue_count < 500: - return vol + from linumpy.gpu import to_cpu + + return to_cpu(vol) bounds = _chunk_boundaries(vol.shape[0], n_serial_slices) - if use_gpu: + if cupy_input or use_gpu: try: return _apply_histogram_matching_gpu(vol, bounds, ref_bins, ref_cdf, n_bins, tissue_threshold) except ImportError: - pass + if cupy_input: + raise # cupy is installed but cupyx missing — surface clearly out = np.empty_like(vol) for s, e in bounds: @@ -317,6 +332,26 @@ def apply_histogram_matching( return out +def _build_tissue_cdf_gpu(vol: Any, n_bins: int, tissue_threshold: float) -> tuple[np.ndarray, np.ndarray, int]: + """GPU equivalent of :func:`_build_tissue_cdf` for a CuPy-resident volume. + + Computes the histogram on device and returns small host-side arrays + so callers can keep the rest of their state on CPU. 
+ """ + import cupy as cp + + lo = tissue_threshold + max(1e-6, tissue_threshold * 1e-6) + lo = min(lo, 1.0) + hist = cp.histogram(vol.ravel(), bins=n_bins, range=(lo, 1.0))[0] + edges = cp.linspace(lo, 1.0, n_bins + 1) + bin_centers = 0.5 * (edges[:-1] + edges[1:]) + total = int(hist.sum().item()) + cdf = cp.cumsum(hist).astype(cp.float64) + if cdf[-1] > 0: + cdf /= cdf[-1] + return cp.asnumpy(bin_centers), cp.asnumpy(cdf), total + + def _apply_histogram_matching_gpu( vol: np.ndarray, bounds: list[tuple[int, int]], @@ -330,7 +365,9 @@ def _apply_histogram_matching_gpu( Each chunk is moved to GPU, has its tissue CDF computed, an ``n_bins``-sized LUT built, and the per-voxel mapping applied. Result is moved back to CPU per chunk so the host array fills - incrementally without holding the whole volume on GPU. + incrementally without holding the whole volume on GPU. Accepts + either a host (numpy) volume or a device (cupy) volume; in the + cupy case ``cp.asarray`` is a no-op so no extra H→D copy occurs. """ import cupy as cp @@ -340,7 +377,10 @@ def _apply_histogram_matching_gpu( lo = tissue_threshold + max(1e-6, tissue_threshold * 1e-6) lo = min(lo, 1.0) - out = np.empty_like(vol) + # Always allocate the output on the host: the next pipeline stage + # (z-profile smoothing, N4) operates on numpy. 
+ out = np.empty(vol.shape, dtype=np.float32) + cupy_input = isinstance(vol, cp.ndarray) for s, e in bounds: chunk_g = cp.asarray(vol[s:e], dtype=cp.float32) flat = chunk_g.ravel() @@ -348,7 +388,7 @@ def _apply_histogram_matching_gpu( hist = cp.histogram(flat, bins=n_bins, range=(lo, 1.0))[0] tissue_count = int(hist.sum().item()) if tissue_count < 500: - out[s:e] = vol[s:e] + out[s:e] = cp.asnumpy(vol[s:e]) if cupy_input else vol[s:e] continue edges = cp.linspace(lo, 1.0, n_bins + 1, dtype=cp.float32) @@ -403,6 +443,12 @@ def apply_zprofile_smoothing( """ from scipy.ndimage import gaussian_filter1d + from linumpy.gpu import to_cpu + + # Pure-CPU op; pull cupy inputs to host so scipy/numpy can run unchanged. + vol = to_cpu(vol) + mask = to_cpu(mask) + if sigma <= 0: return vol n_z = vol.shape[0] diff --git a/linumpy/io/zarr.py b/linumpy/io/zarr.py index 21015a65..da391567 100644 --- a/linumpy/io/zarr.py +++ b/linumpy/io/zarr.py @@ -18,9 +18,8 @@ from ome_zarr.format import CurrentFormat from ome_zarr.io import parse_url from ome_zarr.reader import Multiscales, Reader -from ome_zarr.scale import Scaler +from ome_zarr.scale import Methods from ome_zarr.writer import write_image, write_multiscales_metadata -from skimage.transform import resize configure_dask() @@ -41,80 +40,6 @@ def create_tempstore(dir: str | None = None, suffix: str | None = None) -> zarr. return zarr_store -class CustomScaler(Scaler): - """ - Custom ome_zarr.scale.Scaler class for handling downscaling up to 3D. - - Only `resize_image` method is implemented. Interpolation is ALWAYS done - using 1-st order (linear) interpolation. - """ - - def resize_image(self, image: np.ndarray | da.Array) -> np.ndarray | da.Array: - """ - Resize a numpy array OR a dask array to a smaller array (not pyramid). - - This method is the only method from the Scaler class called from - `ome_zarr.writer.write_image`. 
- """ - if isinstance(image, da.Array): - - def _resize(image: Any, out_shape: tuple, **kwargs: Any) -> da.Array: # type: ignore[override] - return da_resize(image, out_shape, **kwargs) - - else: - _resize = resize - - # downsample in X, Y, and Z. - new_shape = list(image.shape) - new_shape[-1] = image.shape[-1] // self.downscale - new_shape[-2] = image.shape[-2] // self.downscale - if len(new_shape) > 2: - new_shape[-3] = image.shape[-3] // self.downscale - out_shape = tuple(new_shape) - - dtype = image.dtype - if np.iscomplexobj(image): - image = _resize( - image.real.astype(np.float64), - out_shape, - order=1, - mode="reflect", - anti_aliasing=False, - ) + 1j * _resize( - image.imag.astype(np.float64), - out_shape, - order=1, - mode="reflect", - anti_aliasing=False, - ) - else: - image = _resize( - image.astype(np.float64), - out_shape, - order=1, - mode="reflect", - anti_aliasing=False, - ) - return image.astype(dtype) - - def linear(self, base: np.ndarray) -> list: - """Downsample using :func:`skimage.transform.resize` with linear interpolation.""" - pyramid: list[np.ndarray | da.Array] = [base] - max_axes_resize = min(len(base.shape), 3) - level = self.max_layer - while level > 0 and np.all(np.asarray(pyramid[-1].shape[:-max_axes_resize]) >= self.downscale): - pyramid.append(self.resize_image(pyramid[-1])) - level -= 1 - return pyramid - - def _by_plane(self, base: np.ndarray, func: object) -> np.ndarray: - # This method is called by base class when interpolation methods (e.g. nearest) - # are called directly. Because `write_image` never call these methods, we don't - # need to implement it here. We raise an error to make sure the CustomScaler class - # is not used for this purpose. - raise NotImplementedError("_by_plane method not implemented for CustomScaler") - - def create_transformation_dict(nlevels: int, voxel_size: Sequence, ndims: int = 3) -> list: """Create a list of coordinate transformation dicts for OME-Zarr pyramid levels. 
@@ -217,6 +142,7 @@ def save_omezarr( chunks: tuple | Sequence = (128, 128, 128), n_levels: int = 5, overwrite: bool = True, + shards: tuple | Sequence | None = None, ) -> zarr.Group: """Save array to disk in OME-NGFF zarr format. @@ -231,27 +157,49 @@ def save_omezarr( :type chunks: tuple of n `int`, with n the number of dimensions. :param chunks: Chunk size on disk. :type n_levels: int - :param n_levels: Number of levels in Gaussian pyramid. + :param n_levels: Number of pyramid levels (downsamples by 2 along all spatial + axes including Z). :type overwrite: bool :param overwrite: Overwrite `store_path` if it already exists. + :type shards: tuple of n `int` or None + :param shards: If provided, group multiple chunks together into a single + shard on disk (zarr v3 native sharding). Must be a multiple of + ``chunks`` along every dimension. Sharding reduces the number of files + on disk dramatically for large pyramids while keeping per-chunk random + access; useful for stacked volumes whose chunk size is far smaller than + the per-axis extent. ``None`` disables sharding. :type zarr_group: zarr.hierarchy.group :return zarr_group: Resulting zarr group saved to disk. """ n_levels = validate_n_levels(n_levels, data.shape) - # pyramidal decomposition (ome_zarr.scale.Scaler) keywords - pyramid_kw = {"max_layer": int(n_levels), "method": "linear", "downscale": 2} - - ome_zarr_version = version("ome-zarr") - metadata = {"method": "ome_zarr.scale.Scaler", "version": ome_zarr_version, "args": pyramid_kw} - - # # axes and coordinate transformations + # axes and coordinate transformations (c, z, y, x order) ndims = len(data.shape) axes = generate_axes_dict(ndims) coordinate_transformations = create_transformation_dict(n_levels + 1, voxel_size=voxel_size, ndims=ndims) - # create directory for zarr storage + # ome-zarr's default ``scale_factors`` (e.g. ``(2, 4, 8, 16)``) applies + # downsampling only to spatial axes *except z*. 
linumpy datasets are + # acquired with isotropic-ish voxel sizes, so we want true 3D downsampling + # at every level. Build the per-level dict explicitly. + spatial_axes = [a["name"] for a in axes if a.get("type") == "space"] + scale_factors: list[dict[str, int]] = [dict.fromkeys(spatial_axes, 2**i) for i in range(1, n_levels + 1)] + + # storage_options: one dict per dataset (level0 + n_levels). Each level + # gets the same chunk shape; sharding (when requested) likewise applies + # to every level. + storage_options: list[dict[str, Any]] = [] + for _ in range(n_levels + 1): + opts: dict[str, Any] = {"chunks": tuple(chunks)} + if shards is not None: + opts["shards"] = tuple(shards) + storage_options.append(opts) + + pyramid_kw = {"max_layer": int(n_levels), "method": Methods.RESIZE.value, "downscale": 2} + ome_zarr_version = version("ome-zarr") + metadata = {"method": "ome_zarr.writer.write_image", "version": ome_zarr_version, "args": pyramid_kw} + create_directory(store_path, overwrite) _loc = parse_url(store_path, mode="w") assert _loc is not None @@ -261,18 +209,52 @@ def save_omezarr( write_image( data, zarr_group, + scale_factors=scale_factors, + method=Methods.RESIZE, axes=axes, - scaler=CustomScaler(max_layer=int(n_levels), method="linear", downscale=2), - storage_options={"chunks": chunks}, + storage_options=storage_options, coordinate_transformations=coordinate_transformations, compute=True, metadata=metadata, ) - # return zarr group containing saved data return zarr_group +def resolve_omezarr_level_path(zarr_path: Path, level: int = 0) -> Path: + """Return the on-disk path of the array for *level* of an OME-Zarr group. + + Resolves the multiscale dataset path advertised by the OME-Zarr metadata + (``zarr_path / multiscale.datasets[level]``) without opening the array. + This is the backend-agnostic way to locate the actual chunked store on + disk — independent of any particular ``zarr.storage`` implementation. 
+ + Parameters + ---------- + zarr_path + Path to the OME-Zarr group. + level + Pyramid level (0 = full resolution). + + Returns + ------- + Path + On-disk path of the level's zarr array. + """ + _zarr_loc = parse_url(zarr_path) + assert _zarr_loc is not None + reader = Reader(_zarr_loc) + nodes = list(reader()) + image_node = nodes[0] + + multiscale = None + for spec in image_node.specs: + if isinstance(spec, Multiscales): + multiscale = spec + assert multiscale is not None, "No Multiscales spec found in zarr file" + return Path(zarr_path) / multiscale.datasets[level] + + def read_omezarr(zarr_path: Path, level: int = 0) -> tuple: """Read OME-Zarr image at *zarr_path* and return the array and voxel size. @@ -288,18 +270,12 @@ def read_omezarr(zarr_path: Path, level: int = 0) -> tuple: :type res: tuple (3,) :return res: Voxel size of zarr array. """ - # read the image data _zarr_loc = parse_url(zarr_path) assert _zarr_loc is not None reader = Reader(_zarr_loc) - # nodes may include images, labels etc nodes = list(reader()) - - # first node will be the image pixel data image_node = nodes[0] - # By default omezarr will return dask array (vol = image_node.data[level]). - # Here we load a zarr array directly and let the user convert to dask if needed. multiscale = None for spec in image_node.specs: if isinstance(spec, Multiscales): @@ -317,6 +293,55 @@ def read_omezarr(zarr_path: Path, level: int = 0) -> tuple: return vol, scale +def read_omezarr_array( + zarr_path: Path, + level: int = 0, + *, + use_gpu: bool = False, +) -> tuple[Any, list[float]]: + """Read an OME-Zarr image and materialise it into a single in-memory array. + + Unlike :func:`read_omezarr` (which returns a lazy ``zarr.Array``), this helper + returns a fully-loaded array sized to fit in host or device memory, plus the + voxel size for *level*. 
+ + When ``use_gpu=True`` the underlying multiscale array is loaded via + :func:`linumpy.gpu.zarr_io.read_zarr_to_gpu`, which selects the fastest path + available at runtime (kvikio / GPUDirect Storage when native mode is on and + the array is uncompressed; otherwise ``zarr.config.enable_gpu``). The + returned array is a ``cupy.ndarray``. + + When ``use_gpu=False`` (default) the array is returned as a contiguous + ``numpy.ndarray`` — keeps existing CPU-only pipelines working unchanged. + + Parameters + ---------- + zarr_path + Path to the OME-Zarr group. + level + Pyramid level to load (0 = full resolution). + use_gpu + If True and CuPy is available, return a device-resident ``cupy.ndarray`` + loaded through the GPU dispatcher. + + Returns + ------- + array : numpy.ndarray or cupy.ndarray + Fully-materialised volume. + scale : list of float + Voxel size for the requested pyramid level. + """ + vol, scale = read_omezarr(zarr_path, level=level) + if use_gpu: + # Resolve the on-disk path of the actual array (level subdirectory) so + # the GPU dispatcher can hit kvikio's raw-bytes fast path when possible. 
+ from linumpy.gpu.zarr_io import read_zarr_to_gpu + + array_path = resolve_omezarr_level_path(zarr_path, level=level) + return read_zarr_to_gpu(array_path), scale + return np.asarray(vol[:]), scale + + class OmeZarrWriter: """Write OME-Zarr files to disk in a pyramidal format, chunk by chunk.""" @@ -468,10 +493,18 @@ def finalize(self, res: list | tuple, n_levels: int = 5) -> None: for p, t in zip(paths, transformations, strict=False): datasets.append({"path": p, "coordinateTransformations": t}) - pyramid_kw = {"max_layer": n_levels, "method": "linear", "downscale": self.downscale_factor} + pyramid_kw = { + "max_layer": n_levels, + "method": "resize", + "downscale": self.downscale_factor, + } ome_zarr_version = version("ome-zarr") - metadata = {"method": "ome_zarr.scale.Scaler", "version": ome_zarr_version, "args": pyramid_kw} + metadata = { + "method": "linumpy.io.zarr.OmeZarrWriter._downsample_pyramid_on_disk", + "version": ome_zarr_version, + "args": pyramid_kw, + } write_multiscales_metadata(self.root, datasets, axes=self.axes, metadata=metadata) diff --git a/linumpy/mosaic/stacking.py b/linumpy/mosaic/stacking.py index e357bc71..63036ba3 100644 --- a/linumpy/mosaic/stacking.py +++ b/linumpy/mosaic/stacking.py @@ -96,7 +96,12 @@ def enforce_z_consistency( def find_z_overlap( - fixed_vol: np.ndarray, moving_vol: np.ndarray, slicing_interval_mm: float, search_range_mm: float, resolution_um: float + fixed_vol: np.ndarray, + moving_vol: np.ndarray, + slicing_interval_mm: float, + search_range_mm: float, + resolution_um: float, + use_gpu: bool = False, ) -> tuple[int, float]: """Find optimal Z-overlap between consecutive slices using cross-correlation. @@ -115,6 +120,8 @@ def find_z_overlap( Search range around expected position in mm. resolution_um : float Z resolution in microns per voxel. + use_gpu : bool + If True, run the NCC sweep on the GPU when CuPy is available. 
Returns ------- @@ -138,17 +145,29 @@ def find_z_overlap( y_slice = slice(margin, h - margin) x_slice = slice(margin, w - margin) + xp: Any = np + if use_gpu: + from linumpy.gpu import GPU_AVAILABLE + + if GPU_AVAILABLE: + import cupy as cp + + xp = cp + + fixed_xp = xp.asarray(fixed_vol[-max_overlap:, y_slice, x_slice].astype(np.float32)) + moving_xp = xp.asarray(moving_vol[:max_overlap, y_slice, x_slice].astype(np.float32)) + best_overlap = expected_overlap_vox - best_corr = -np.inf + best_corr = -float("inf") for overlap in range(min_overlap, max_overlap + 1): - fixed_region = fixed_vol[-overlap:, y_slice, x_slice] - moving_region = moving_vol[:overlap, y_slice, x_slice] + fixed_region = fixed_xp[-overlap:] + moving_region = moving_xp[:overlap] fixed_norm = (fixed_region - fixed_region.mean()) / (fixed_region.std() + 1e-8) moving_norm = (moving_region - moving_region.mean()) / (moving_region.std() + 1e-8) - corr = np.mean(fixed_norm * moving_norm) + corr = float((fixed_norm * moving_norm).mean()) if corr > best_corr: best_corr = corr best_overlap = overlap diff --git a/pyproject.toml b/pyproject.toml index f6b75bbf..a12c823c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "hyperactive", "basicpy", "zarr>=3.0.10", - "ome-zarr>=0.9.0", + "ome-zarr>=0.16.0", "napari", "napari-ome-zarr", "pynrrd", @@ -53,10 +53,16 @@ Repository = "https://github.com/linum-uqam/linumpy" [project.optional-dependencies] gpu = [ + "cupy-cuda13x>=13.0.0", +] +gpu-cuda12 = [ "cupy-cuda12x>=12.0.0", ] -gpu-cuda13 = [ - "cupy-cuda13x>=13.0.0", +gds = [ + "kvikio-cu13", +] +gds-cuda12 = [ + "kvikio-cu12", ] docs = [ "sphinx>=9.1", @@ -270,7 +276,7 @@ possibly-unresolved-reference = "error" # Optional GPU deps — not installed in the standard dev env. # Using replace-imports-with-any so downstream attribute/subscript access # on the imported module is typed as Any rather than causing cascading errors. 
-replace-imports-with-any = ["cupy.**", "cupyx.**", "numba.**"] +replace-imports-with-any = ["cupy.**", "cupyx.**", "numba.**", "kvikio.**"] # --- ty overrides --- diff --git a/scripts/linum_aip.py b/scripts/linum_aip.py index 23873d3c..b0d20e17 100644 --- a/scripts/linum_aip.py +++ b/scripts/linum_aip.py @@ -1,17 +1,22 @@ #!/usr/bin/env python3 -"""Compute the average intensity projection of a 3D zarr volume.""" +"""Compute the average intensity projection of a 3D zarr volume. + +Falls back to CPU if GPU is not available or --no-use_gpu is passed. +""" # Configure thread limits before numpy/scipy imports import linumpy.config.threads # noqa: F401 import argparse from pathlib import Path +from typing import Any import dask.array as da import numpy as np import zarr +from linumpy.gpu import GPU_AVAILABLE, is_cupy_array, print_gpu_info, to_cpu from linumpy.io.zarr import create_tempstore, read_omezarr, save_omezarr @@ -19,32 +24,25 @@ def _build_arg_parser() -> argparse.ArgumentParser: p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter) p.add_argument("input_zarr", type=Path, help="Full path to the zarr volume.") p.add_argument("output_image", type=Path, default=None, help="Full path to the output zarr image") - + p.add_argument( + "--use_gpu", + default=True, + action=argparse.BooleanOptionalAction, + help="Use GPU acceleration if available. 
[%(default)s]", + ) + p.add_argument("--verbose", "-v", action="store_true", help="Print GPU information.") return p -def main() -> None: - """Run the average intensity projection script.""" - # Parse arguments - p = _build_arg_parser() - args = p.parse_args() - - # Parameters - input_file = Path(args.input_zarr) - output_file = Path(args.output_image) - - # Open the zarr volume - vol, resolution = read_omezarr(input_file, level=0) - - # Prepare the output +def _compute_aip(vol: Any, use_gpu: bool) -> tuple[zarr.Array, tuple]: + """Compute the per-tile mean and store it into a temporary 2D zarr array.""" shape = vol.shape[1:3] + tile_shape = vol.chunks zarr_store = create_tempstore(suffix=".zarr") _aip = zarr.open(zarr_store, mode="w", shape=shape, dtype=np.float32, chunks=vol.chunks[1:3]) assert isinstance(_aip, zarr.Array) aip = _aip - # Process every tile - tile_shape = vol.chunks nx = vol.shape[1] // tile_shape[1] ny = vol.shape[2] // tile_shape[2] for i in range(nx): @@ -53,8 +51,48 @@ def main() -> None: rmax = (i + 1) * tile_shape[1] cmin = j * tile_shape[2] cmax = (j + 1) * tile_shape[2] - tile = np.asarray(vol[:, rmin:rmax, cmin:cmax]).mean(axis=0) - aip[rmin:rmax, cmin:cmax] = tile + tile = vol[:, rmin:rmax, cmin:cmax] + if use_gpu: + import cupy as cp + + # Tile is already cupy when the reader is inside gpu_zarr_context; + # otherwise transfer it once. 
+ tile_gpu = tile if is_cupy_array(tile) else cp.asarray(np.asarray(tile)) + aip[rmin:rmax, cmin:cmax] = to_cpu(cp.mean(tile_gpu.astype(cp.float32), axis=0)) + del tile_gpu + else: + aip[rmin:rmax, cmin:cmax] = np.asarray(tile).mean(axis=0) + return aip, tile_shape + + +def main() -> None: + """Run the average intensity projection script.""" + p = _build_arg_parser() + args = p.parse_args() + + input_file = Path(args.input_zarr) + output_file = Path(args.output_image) + + use_gpu = args.use_gpu and GPU_AVAILABLE + + if args.verbose: + print_gpu_info() + if args.use_gpu and not GPU_AVAILABLE: + print("WARNING: GPU requested but not available, falling back to CPU") + elif use_gpu: + print("GPU: ENABLED") + else: + print("GPU: DISABLED (using CPU)") + + if use_gpu: + from linumpy.gpu.zarr_io import gpu_zarr_context + + with gpu_zarr_context(): + vol, resolution = read_omezarr(input_file, level=0) + aip, tile_shape = _compute_aip(vol, use_gpu=True) + else: + vol, resolution = read_omezarr(input_file, level=0) + aip, tile_shape = _compute_aip(vol, use_gpu=False) out_dask = da.from_zarr(aip) save_omezarr(out_dask, output_file, resolution[1:], tile_shape[1:]) diff --git a/scripts/linum_aip_png.py b/scripts/linum_aip_png.py index 23b56a2c..76356f3d 100755 --- a/scripts/linum_aip_png.py +++ b/scripts/linum_aip_png.py @@ -51,16 +51,19 @@ def compute_aip(vol: Any, use_gpu: bool = True) -> np.ndarray: cmin = j * tile_shape[2] cmax = (j + 1) * tile_shape[2] - tile = np.asarray(vol[:, rmin:rmax, cmin:cmax]) + tile = vol[:, rmin:rmax, cmin:cmax] if use_gpu: import cupy as cp - tile_gpu = cp.asarray(tile.astype(np.float32)) - aip[rmin:rmax, cmin:cmax] = to_cpu(cp.mean(tile_gpu, axis=0)) + # Slices may already be cupy when the read happens inside + # ``gpu_zarr_context`` (no extra H→D copy). Otherwise we + # transfer the host tile once. 
+                tile_gpu = tile if isinstance(tile, cp.ndarray) else cp.asarray(np.asarray(tile))
+                aip[rmin:rmax, cmin:cmax] = to_cpu(cp.mean(tile_gpu.astype(cp.float32), axis=0))
                 del tile_gpu
             else:
-                aip[rmin:rmax, cmin:cmax] = tile.mean(axis=0)
+                aip[rmin:rmax, cmin:cmax] = np.asarray(tile).mean(axis=0)
 
     if use_gpu:
         try:
@@ -127,8 +130,17 @@ def main() -> None:
     else:
         print("GPU: DISABLED (using CPU)")
 
-    vol, _ = read_omezarr(input_file, level=0)
-    aip = compute_aip(vol, use_gpu=use_gpu)
+    if use_gpu:
+        # Open the array inside the GPU context so each tile slice is read
+        # directly into device memory (no host round-trip per tile).
+        from linumpy.gpu.zarr_io import gpu_zarr_context
+
+        with gpu_zarr_context():
+            vol, _ = read_omezarr(input_file, level=0)
+            aip = compute_aip(vol, use_gpu=True)
+    else:
+        vol, _ = read_omezarr(input_file, level=0)
+        aip = compute_aip(vol, use_gpu=False)
 
     save_aip_png(aip, output_file)
 
diff --git a/scripts/linum_benchmark_kvikio_zarr.py b/scripts/linum_benchmark_kvikio_zarr.py
new file mode 100755
index 00000000..8147d6ff
--- /dev/null
+++ b/scripts/linum_benchmark_kvikio_zarr.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+r"""Benchmark zarr → GPU loading: kvikio (GDS) vs zarr+cupy vs zarr.enable_gpu.
+
+Four paths to get an uncompressed zarr v3 array onto the GPU are compared:
+
+    A) ``zarr.open(...)[:]`` → ``np.asarray`` → ``cupy.asarray`` (legacy path).
+    B) ``zarr.config.enable_gpu(); arr[:]`` returns a ``cupy.ndarray`` directly,
+       but the codec pipeline still decodes on the CPU before the H→D copy.
+    C) ``linumpy.gpu.kvikio_zarr.read_zarr_via_kvikio`` — kvikio CuFile / GDS.
+    D) ``linumpy.gpu.zarr_io.read_zarr_to_gpu`` — auto-dispatch (kvikio if GDS
+       native + uncompressed, else zarr-gpu).
+ +Reference numbers on a 16 GiB float32 zarr v3 (256³ chunks) on /scratch_nvme, +RTX A6000, ext4, GDS native mode (warm cache): + +* kvikio (GDS native): ~9.9 GiB/s +* zarr.config.enable_gpu: ~7.1 GiB/s +* zarr → numpy → cupy: ~2.8 GiB/s + +For production code, prefer :func:`linumpy.gpu.zarr_io.read_zarr_to_gpu` +which picks the best available path at runtime. + +Examples +-------- +Generate a 16 GiB random uncompressed zarr v3 on /scratch_nvme and bench:: + + linum_benchmark_kvikio_zarr.py \ + --generate /scratch_nvme/bench.zarr \ + --shape 2048 2048 1024 --chunks 256 256 128 --dtype float32 \ + --runs 3 +""" + +from __future__ import annotations + +import argparse +import contextlib +import sys +import time +from pathlib import Path + +import numpy as np + + +def _human_bytes(n: float) -> str: + units = ["B", "KiB", "MiB", "GiB", "TiB"] + i = 0 + while n >= 1024 and i < len(units) - 1: + n /= 1024 + i += 1 + return f"{n:.2f} {units[i]}" + + +def _generate_dataset(path: Path, shape: tuple[int, ...], chunks: tuple[int, ...], dtype: str) -> None: + import zarr + + print(f"[generate] shape={shape} chunks={chunks} dtype={dtype} path={path}") + if path.exists(): + raise SystemExit(f"refusing to overwrite existing {path}") + + # zarr v3 with raw bytes only (no compression) so the kvikio path works. 
+ arr = zarr.create_array( + store=str(path), + shape=shape, + chunks=chunks, + dtype=dtype, + compressors=None, + zarr_format=3, + fill_value=0, + ) + rng = np.random.default_rng(0) + z_step = chunks[0] + for z0 in range(0, shape[0], z_step): + z1 = min(z0 + z_step, shape[0]) + block = rng.standard_normal((z1 - z0, *shape[1:])).astype(dtype, copy=False) + arr[z0:z1] = block + print(f"[generate] done: {_human_bytes(arr.nbytes)} written") + + +def _bench_zarr_cupy(array_path: Path) -> tuple[float, int]: + import cupy + import zarr + + t0 = time.perf_counter() + z = zarr.open_array(str(array_path), mode="r") + host = np.asarray(z[:]) + dev = cupy.asarray(host) + cupy.cuda.Stream.null.synchronize() + t1 = time.perf_counter() + return t1 - t0, dev.nbytes + + +def _bench_zarr_gpu(array_path: Path) -> tuple[float, int]: + """zarr.config.enable_gpu(): host I/O, host decode, final buffer is CuPy.""" + import cupy + import zarr + + with zarr.config.enable_gpu(): + t0 = time.perf_counter() + z = zarr.open_array(str(array_path), mode="r") + dev = z[:] + cupy.cuda.Stream.null.synchronize() + t1 = time.perf_counter() + nbytes = int(np.prod(z.shape)) * np.dtype(z.dtype).itemsize + del dev # keep reference alive through synchronize + return t1 - t0, nbytes + + +def _bench_kvikio(array_path: Path) -> tuple[float, int]: + import cupy + + from linumpy.gpu.kvikio_zarr import read_zarr_via_kvikio + + t0 = time.perf_counter() + dev = read_zarr_via_kvikio(array_path) + cupy.cuda.Stream.null.synchronize() + t1 = time.perf_counter() + return t1 - t0, dev.nbytes + + +def _bench_auto(array_path: Path) -> tuple[float, int]: + import cupy + + from linumpy.gpu.zarr_io import read_zarr_to_gpu + + t0 = time.perf_counter() + dev = read_zarr_to_gpu(array_path) + cupy.cuda.Stream.null.synchronize() + t1 = time.perf_counter() + return t1 - t0, dev.nbytes + + +def _drop_caches() -> None: + """Best-effort page-cache drop (requires root). 
Silently skipped otherwise.""" + with contextlib.suppress(PermissionError, FileNotFoundError, OSError): + Path("/proc/sys/vm/drop_caches").write_text("3\n") + + +PATHS = { + "zarr+cupy": _bench_zarr_cupy, + "zarr-gpu": _bench_zarr_gpu, + "kvikio": _bench_kvikio, + "auto": _bench_auto, +} + + +def main() -> None: + """Run the benchmark from the command line.""" + p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + p.add_argument("array_path", type=Path, nargs="?", help="zarr array directory") + p.add_argument("--generate", type=Path, help="create a synthetic uncompressed zarr v3 at this path") + p.add_argument("--shape", type=int, nargs="+", default=[2048, 2048, 512]) + p.add_argument("--chunks", type=int, nargs="+", default=[256, 256, 128]) + p.add_argument("--dtype", default="float32") + p.add_argument("--runs", type=int, default=3) + p.add_argument("--skip-cache-drop", action="store_true", help="don't try to drop page cache between runs") + p.add_argument("--only", action="append", choices=list(PATHS), help="only benchmark these paths (repeatable)") + args = p.parse_args() + + if args.generate is not None: + _generate_dataset(args.generate, tuple(args.shape), tuple(args.chunks), args.dtype) + if args.array_path is None: + args.array_path = args.generate + + if args.array_path is None: + p.error("array_path is required (unless --generate also gives one)") + + selected = args.only or list(PATHS) + print(f"[bench] array={args.array_path} runs={args.runs}") + for name in selected: + fn = PATHS[name] + times: list[float] = [] + nbytes = 0 + for r in range(args.runs): + if not args.skip_cache_drop: + _drop_caches() + try: + dt, nbytes = fn(args.array_path) + except Exception as exc: + print(f" [{name}] run {r}: ERROR {exc!r}") + break + tput = nbytes / dt / (1024**3) + times.append(dt) + print(f" [{name}] run {r}: {dt:.3f}s {tput:.2f} GiB/s") + if times: + best = min(times) + print(f" [{name}] best: {best:.3f}s 
{nbytes / best / (1024**3):.2f} GiB/s") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/linum_compensate_attenuation.py b/scripts/linum_compensate_attenuation.py index 94566964..cc075f27 100644 --- a/scripts/linum_compensate_attenuation.py +++ b/scripts/linum_compensate_attenuation.py @@ -1,9 +1,5 @@ #!/usr/bin/env python -""" -Compensate the tissue attenuation using a precomputed attenuation. - -bias field. -""" +"""Compensate the tissue attenuation using a precomputed attenuation bias field.""" # Configure thread limits before numpy/scipy imports import linumpy.config.threads # noqa: F401 diff --git a/scripts/linum_correct_bias_field.py b/scripts/linum_correct_bias_field.py index c217b8c7..46360137 100644 --- a/scripts/linum_correct_bias_field.py +++ b/scripts/linum_correct_bias_field.py @@ -29,7 +29,7 @@ n4_correct_per_section, ) from linumpy.intensity.normalization import apply_histogram_matching, apply_zprofile_smoothing -from linumpy.io.zarr import AnalysisOmeZarrWriter, read_omezarr +from linumpy.io.zarr import AnalysisOmeZarrWriter, read_omezarr_array logger = logging.getLogger(__name__) @@ -207,12 +207,9 @@ def main() -> None: n_processes = parse_processes_arg(args.n_processes) - # Load volume - vol_da, res = read_omezarr(args.in_image, level=0) - vol = np.asarray(vol_da).astype(np.float32) - logger.info("Loaded volume %s from %s", vol.shape, args.in_image) - - # Resolve GPU usage from --backend choice for non-N4 stages. + # Resolve GPU usage from --backend choice for non-N4 stages. We resolve + # this BEFORE reading so we can stream the volume directly into device + # memory through the GDS / zarr-gpu fast path when the GPU is in play. if args.backend == "gpu": use_gpu_pre = True elif args.backend == "auto": @@ -222,6 +219,11 @@ def main() -> None: else: use_gpu_pre = False + # Load volume — onto GPU directly when use_gpu_pre, else host. 
+ vol, res = read_omezarr_array(args.in_image, level=0, use_gpu=use_gpu_pre) + vol = vol.astype(np.float32) # works on both numpy and cupy arrays + logger.info("Loaded volume %s from %s (gpu=%s)", vol.shape, args.in_image, use_gpu_pre) + # Tissue mask (per serial section) mask = compute_tissue_mask( vol, @@ -260,7 +262,7 @@ def main() -> None: n4_kwargs = { "shrink_factor": args.shrink_factor, "n_iterations": args.n_iterations, - "voxel_size_mm": tuple(res), + "voxel_size_mm": (float(res[0]), float(res[1]), float(res[2])), "backend": args.backend, } diff --git a/scripts/linum_detect_focal_curvature.py b/scripts/linum_detect_focal_curvature.py index d09fd1c8..d9d8bff3 100644 --- a/scripts/linum_detect_focal_curvature.py +++ b/scripts/linum_detect_focal_curvature.py @@ -7,6 +7,7 @@ import argparse from pathlib import Path +from typing import Any import dask.array as da import numpy as np @@ -14,11 +15,10 @@ from basicpy import BaSiC from linumpy.geometry.interface import find_tissue_interface +from linumpy.gpu import GPU_AVAILABLE, to_cpu from linumpy.io.zarr import create_tempstore, read_omezarr, save_omezarr -# TODO: Replace by interpolation using deformation field -# TODO: optimize for full resolution data -# TODO: parallelize the correction +# TODO: Replace integer roll by sub-voxel interpolation using a deformation field def _build_arg_parser() -> argparse.ArgumentParser: @@ -36,6 +36,12 @@ def _build_arg_parser() -> argparse.ArgumentParser: "--sigma_z", type=float, default=2.0, help="Gaussian smoothing sigma in Z before interface detection [%(default)s]" ) p.add_argument("--use_log", action="store_true", help="Apply log transform before gradient detection") + p.add_argument( + "--use_gpu", + action=argparse.BooleanOptionalAction, + default=True, + help="Use GPU acceleration for the per-tile focal-plane shift if available [%(default)s].", + ) return p @@ -54,7 +60,13 @@ def main() -> None: dtype = vol.dtype data = np.moveaxis(np.asarray(vol), 0, -1) # Estimate 
the water-tissue interface - z0 = find_tissue_interface(np.abs(data), s_xy=args.sigma_xy, s_z=args.sigma_z, use_log=args.use_log) + z0 = find_tissue_interface( + np.abs(data), + s_xy=args.sigma_xy, + s_z=args.sigma_z, + use_log=args.use_log, + use_gpu=args.use_gpu, + ) # Extract the tile shape from the filename tile_shape = vol.chunks @@ -84,6 +96,20 @@ def main() -> None: # Apply the correction to a tile corr = ((flatfield - 1) * z0.mean()).astype(int) + use_gpu = args.use_gpu and GPU_AVAILABLE + if use_gpu: + import cupy as cp + + xp: Any = cp + else: + xp = np + + nz_full = vol.shape[0] + z_arange = xp.arange(nz_full)[:, None, None] + corr_xp = xp.asarray(corr) + # Per-(m, n) circular shift along Z, vectorised via take_along_axis. + # Equivalent to: tile[:, m, n] = roll(tile[:, m, n], -corr[m, n]). + temp_store = create_tempstore() _vol_corr = zarr.open(temp_store, mode="w", shape=vol.shape, dtype=dtype, chunks=tile_shape) assert isinstance(_vol_corr, zarr.Array) @@ -95,14 +121,11 @@ def main() -> None: rmax = (i + 1) * tile_shape[1] cmin = j * tile_shape[2] cmax = (j + 1) * tile_shape[2] - tile = np.asarray(vol[:, rmin:rmax, cmin:cmax]) - - # Apply the correction (shift the focal plane to flatten it) - for m in range(tile.shape[1]): - for n in range(tile.shape[2]): - tile[:, m, n] = np.roll(tile[:, m, n], -corr[m, n]) - - vol_corr[:, rmin:rmax, cmin:cmax] = tile + tile_np = np.asarray(vol[:, rmin:rmax, cmin:cmax]) + tile_xp = xp.asarray(tile_np) if use_gpu else tile_np + z_idx_tile = (z_arange + corr_xp[None, rmin:rmax, cmin:cmax]) % nz_full + tile_rolled = xp.take_along_axis(tile_xp, z_idx_tile, axis=0) + vol_corr[:, rmin:rmax, cmin:cmax] = to_cpu(tile_rolled) if use_gpu else tile_rolled # save to ome-zarr dask_arr = da.from_zarr(vol_corr) diff --git a/scripts/linum_generate_mosaic_aips.py b/scripts/linum_generate_mosaic_aips.py index 320cb455..c2513897 100755 --- a/scripts/linum_generate_mosaic_aips.py +++ b/scripts/linum_generate_mosaic_aips.py @@ -62,16 
+62,19 @@ def compute_aip(vol: Any, use_gpu: bool = True) -> np.ndarray: cmin = j * tile_shape[2] cmax = (j + 1) * tile_shape[2] - tile = np.asarray(vol[:, rmin:rmax, cmin:cmax]) + tile = vol[:, rmin:rmax, cmin:cmax] if use_gpu: import cupy as cp - tile_gpu = cp.asarray(tile.astype(np.float32)) - aip[rmin:rmax, cmin:cmax] = to_cpu(cp.mean(tile_gpu, axis=0)) + # Slices may already be cupy when the read happens inside + # ``gpu_zarr_context`` (no extra H→D copy). Otherwise we + # transfer the host tile once. + tile_gpu = tile if isinstance(tile, cp.ndarray) else cp.asarray(np.asarray(tile)) + aip[rmin:rmax, cmin:cmax] = to_cpu(cp.mean(tile_gpu.astype(cp.float32), axis=0)) del tile_gpu else: - aip[rmin:rmax, cmin:cmax] = tile.mean(axis=0) + aip[rmin:rmax, cmin:cmax] = np.asarray(tile).mean(axis=0) if use_gpu: try: @@ -166,12 +169,25 @@ def main() -> None: f"No mosaic grid files found in {input_dir}.\nExpected files matching 'mosaic_grid_3d_z*.ome.zarr'." ) - for mosaic_file in tqdm(mosaic_files, desc="Generating AIPs"): - slice_id = mosaic_file.name[len("mosaic_grid_3d_z") : -len(".ome.zarr")] - output_file = output_dir / f"aip_z{slice_id}.png" - vol, _ = read_omezarr(mosaic_file, level=args.level) - aip = compute_aip(vol, use_gpu=use_gpu) - save_aip_png(aip, output_file) + if use_gpu: + # Open each volume inside the GPU context so tile slices land on + # device memory directly (no host round-trip per tile). 
+ from linumpy.gpu.zarr_io import gpu_zarr_context + + for mosaic_file in tqdm(mosaic_files, desc="Generating AIPs"): + slice_id = mosaic_file.name[len("mosaic_grid_3d_z") : -len(".ome.zarr")] + output_file = output_dir / f"aip_z{slice_id}.png" + with gpu_zarr_context(): + vol, _ = read_omezarr(mosaic_file, level=args.level) + aip = compute_aip(vol, use_gpu=True) + save_aip_png(aip, output_file) + else: + for mosaic_file in tqdm(mosaic_files, desc="Generating AIPs"): + slice_id = mosaic_file.name[len("mosaic_grid_3d_z") : -len(".ome.zarr")] + output_file = output_dir / f"aip_z{slice_id}.png" + vol, _ = read_omezarr(mosaic_file, level=args.level) + aip = compute_aip(vol, use_gpu=False) + save_aip_png(aip, output_file) if __name__ == "__main__": diff --git a/scripts/linum_interpolate_missing_slice.py b/scripts/linum_interpolate_missing_slice.py index ecd9d7c9..15de1320 100644 --- a/scripts/linum_interpolate_missing_slice.py +++ b/scripts/linum_interpolate_missing_slice.py @@ -71,6 +71,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: " zmorph - z-aware morphing (respects serial-section geometry; recommended)\n" " average - Simple average of adjacent slices\n" " weighted - Weighted average with distance falloff\n" + "\n" "[default: %(default)s]", ) p.add_argument( @@ -80,6 +81,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: help="Blending method for combining warped slices:\n" " linear - Equal 50/50 blend (may show edges)\n" " gaussian - Feathered blend using distance transform (recommended)\n" + "\n" "[default: %(default)s]", ) p.add_argument( diff --git a/scripts/linum_resample_mosaic_grid.py b/scripts/linum_resample_mosaic_grid.py index 329b0d41..f346266a 100644 --- a/scripts/linum_resample_mosaic_grid.py +++ b/scripts/linum_resample_mosaic_grid.py @@ -22,6 +22,7 @@ from linumpy.geometry.resampling import resolution_is_mm from linumpy.gpu import GPU_AVAILABLE, print_gpu_info from linumpy.gpu.interpolation import resize +from linumpy.gpu.zarr_io 
import gpu_zarr_context from linumpy.io import OmeZarrWriter, read_omezarr @@ -66,8 +67,12 @@ def rescale(image: Any, scale: float | Sequence[float], order: int = 1, use_gpu: def _read_tile(vol: Any, i: Any, j: Any, tile_shape: Any) -> Any: - """Read one tile from the input zarr array (I/O stage of the pipeline).""" - return np.asarray(vol[:, i * tile_shape[1] : (i + 1) * tile_shape[1], j * tile_shape[2] : (j + 1) * tile_shape[2]]) + """Read one tile from the input zarr array (I/O stage of the pipeline). + + Returns whatever array type the zarr backend yields: NumPy by default, or + CuPy when the volume was opened inside :func:`gpu_zarr_context`. + """ + return vol[:, i * tile_shape[1] : (i + 1) * tile_shape[1], j * tile_shape[2] : (j + 1) * tile_shape[2]] def _run_pipelined( @@ -90,15 +95,10 @@ def _run_pipelined( if not tile_iter: return - cp: Any = None - cupy_available = False - if use_gpu: - try: - import cupy as cp - - cupy_available = True - except Exception: - pass + # Note: do NOT periodically call cp.get_default_memory_pool().free_all_blocks(). + # All tiles are the same size, so the pool reuses GPU buffers across iterations. + # Flushing forces cudaMalloc on the next tile and stalls the GPU; CuPy already + # retries with free_all_blocks() automatically on OOM. 
with ThreadPoolExecutor(max_workers=1) as prefetch_executor: i0, j0 = tile_iter[0] @@ -116,9 +116,6 @@ def _run_pipelined( :, i * out_tile_shape[1] : (i + 1) * out_tile_shape[1], j * out_tile_shape[2] : (j + 1) * out_tile_shape[2] ] = resampled - if cupy_available and cp is not None and k % 10 == 9: - cp.get_default_memory_pool().free_all_blocks() - def main() -> None: """Run function.""" @@ -172,11 +169,24 @@ def main() -> None: out_shape = (out_tile_shape[0], nx * out_tile_shape[1], ny * out_tile_shape[2]) print(f" Output shape: {out_shape} ({total_tiles} tiles)") - out_zarr = OmeZarrWriter(args.out_mosaic, out_shape, out_tile_shape, dtype=vol.dtype, overwrite=True) - tile_iter = list(itertools.product(range(nx), range(ny))) - _run_pipelined(vol, out_zarr, tile_iter, tile_shape, out_tile_shape, scaling_factor, use_gpu) + if use_gpu: + # Open BOTH the input volume and the output writer inside the GPU zarr + # context. That way per-tile reads land directly on device and per-tile + # writes accept cupy arrays through zarr's GPU buffer prototype, so the + # resample loop is fully device-resident: + # read tile -> cupy gaussian + zoom -> write cupy slice + with gpu_zarr_context(): + vol_gpu, _ = read_omezarr(args.in_mosaic) + out_zarr = OmeZarrWriter(args.out_mosaic, out_shape, out_tile_shape, dtype=vol.dtype, overwrite=True) + _run_pipelined(vol_gpu, out_zarr, tile_iter, tile_shape, out_tile_shape, scaling_factor, use_gpu) + else: + out_zarr = OmeZarrWriter(args.out_mosaic, out_shape, out_tile_shape, dtype=vol.dtype, overwrite=True) + _run_pipelined(vol, out_zarr, tile_iter, tile_shape, out_tile_shape, scaling_factor, use_gpu) + + # Pyramid build reads the just-written level0 back via dask; run it outside + # the GPU context so dask uses the normal numpy buffer prototype. 
print("Building pyramid...") out_res = [target_res] * 3 out_zarr.finalize(out_res, args.n_levels) diff --git a/scripts/linum_stack_slices_motor.py b/scripts/linum_stack_slices_motor.py index b2884358..709a8633 100644 --- a/scripts/linum_stack_slices_motor.py +++ b/scripts/linum_stack_slices_motor.py @@ -288,6 +288,13 @@ def _build_arg_parser() -> argparse.ArgumentParser: "Default: none (use only automated transforms).", ) + p.add_argument( + "--use_gpu", + action=argparse.BooleanOptionalAction, + default=True, + help="Use GPU acceleration for the Z-overlap NCC sweep when available [%(default)s].", + ) + add_overwrite_arg(p) return p @@ -940,7 +947,9 @@ def main() -> None: else: # find_z_overlap expects resolution in µm for its internal calculation res_z_um = res_z_mm * 1000 - overlap, corr = find_z_overlap(prev_vol, vol, args.slicing_interval_mm, args.search_range_mm, res_z_um) + overlap, corr = find_z_overlap( + prev_vol, vol, args.slicing_interval_mm, args.search_range_mm, res_z_um, use_gpu=args.use_gpu + ) # Fall back to expected overlap when correlation is too low to trust if args.z_overlap_min_corr > 0 and corr < args.z_overlap_min_corr: interval_voxels = int(args.slicing_interval_mm / res_z_mm) diff --git a/shell_scripts/server_setup/nvfs_kernel7_patch.sh b/shell_scripts/server_setup/nvfs_kernel7_patch.sh new file mode 100755 index 00000000..e5226053 --- /dev/null +++ b/shell_scripts/server_setup/nvfs_kernel7_patch.sh @@ -0,0 +1,169 @@ +#!/bin/bash +# Patch nvidia-fs 2.28.4 to build against Linux kernel 7.0 (Ubuntu 26.04). +# +# REMOVAL TRIGGER +# --------------- +# Delete this script as soon as nvidia-fs-dkms ships an upstream version +# that builds cleanly on the running kernel. 
Quick check:
+#
+# sudo dkms add /usr/src/nvidia-fs-<version> # try the stock sources
+# sudo dkms build nvidia-fs/<version> # if this succeeds, drop the patch
+#
+# Three upstream API changes this patch works around:
+# 1) struct vm_area_struct: __vm_flags removed, vm_flags is now a const
+# field of an anonymous union (writes still go through vm_flags_set).
+# Fix: read via vma->vm_flags.
+# 2) struct blk_dma_iter::iter is now struct blk_map_iter (was
+# struct req_iterator). They share .iter (bvec_iter) and .bio in the
+# same order, so we add a configure probe HAVE_BLK_MAP_ITER and
+# typedef nvfs_req_iter_t accordingly. Helper signatures use the typedef.
+# 3) create_nv.symvers.sh only decompressed *.ko.xz; Ubuntu 26.04 ships
+# kernel modules as *.ko.zst, so nm fails and Module.symvers ends up
+# empty. Without CRCs the runtime symbol-version check on
+# nvidia_p2p_dma_unmap_pages fails and nvidia-fs falls back to the
+# no-symver path -> GDS stays in compat mode. Fix: add a *.ko.zst
+# branch that decompresses with zstd before nm runs.
+#
+# USAGE (server-side, as root)
+# ----------------------------
+# sudo bash nvfs_kernel7_patch.sh
+# sudo dkms remove nvidia-fs/2.28.4 --all 2>/dev/null || true
+# sudo dkms add /usr/src/nvidia-fs-2.28.4
+# sudo dkms build nvidia-fs/2.28.4
+# sudo dkms install nvidia-fs/2.28.4
+# sudo modprobe nvidia-fs
+# /usr/local/cuda/gds/tools/gdscheck -p
+#
+# The script is idempotent (creates *.orig backups, skips reapplying if the
+# probe is already present). Safe to re-run after kernel header upgrades.
+
+set -e
+SRC=${NVFS_SRC:-/usr/src/nvidia-fs-2.28.4}
+if [ ! -d "$SRC" ]; then
+ echo "ERROR: $SRC not found. Set NVFS_SRC=/usr/src/nvidia-fs-<version> if version differs."
>&2 + exit 2 +fi +cd "$SRC" + +# Idempotency: keep one .orig backup +for f in nvfs-mmap.c nvfs-dma.c configure create_nv.symvers.sh; do + [ -f "$f.orig" ] || cp -p "$f" "$f.orig" +done + +# --- nvfs-mmap.c: drop ACCESS_PRIVATE(__vm_flags) -------------------------- +# kernel 7.0 dropped the __vm_flags private name; vm_flags is the public +# (const) field name again. +python3 - <<'PY' +from pathlib import Path +p = Path("nvfs-mmap.c") +s = p.read_text() +old = "\tvm_flags = ACCESS_PRIVATE(vma, __vm_flags);" +new = "\tvm_flags = vma->vm_flags;" +assert old in s, "expected ACCESS_PRIVATE line not found" +p.write_text(s.replace(old, new, 1)) +print("nvfs-mmap.c: patched vm_flags read") +PY + +# --- configure: add HAVE_BLK_MAP_ITER probe -------------------------------- +python3 - <<'PY' +from pathlib import Path +p = Path("configure") +s = p.read_text() +marker = 'if compile_prog "Checking if blk_rq_dma_map_iter_start is present..."; then\n output_sym "HAVE_BLK_RQ_DMA_MAP_ITER_START"\nfi\n' +addition = ''' +cat > $TEST_C < +#include +#include +#include "test.h" + +int test (void) +{ + struct blk_map_iter iter; + (void)iter; + return 0; +} + +EOF +if compile_prog "Checking if struct blk_map_iter is present..."; then + output_sym "HAVE_BLK_MAP_ITER" +fi +''' +if "HAVE_BLK_MAP_ITER" in s: + print("configure: HAVE_BLK_MAP_ITER probe already present") +else: + assert marker in s, "anchor not found in configure" + s = s.replace(marker, marker + addition, 1) + p.write_text(s) + print("configure: added HAVE_BLK_MAP_ITER probe") +PY + +# --- nvfs-dma.c: typedef nvfs_req_iter_t and rewire V2 path ---------------- +python3 - <<'PY' +from pathlib import Path +p = Path("nvfs-dma.c") +s = p.read_text() + +# Insert typedef just after the V2 #ifdef +anchor = "// V2 ops implementations for kernels with iterator API\n#ifdef HAVE_BLK_RQ_DMA_MAP_ITER_START\n" +typedef_block = ''' +/* + * Kernel 6.16+ (e.g. 
7.0) replaced struct req_iterator inside struct + * blk_dma_iter with struct blk_map_iter. Both layouts share .iter + * (bvec_iter) and .bio in the same order, so the helpers below only + * use those two fields. + */ +#ifdef HAVE_BLK_MAP_ITER +typedef struct blk_map_iter nvfs_req_iter_t; +#else +typedef struct req_iterator nvfs_req_iter_t; +#endif + +''' +if "nvfs_req_iter_t" not in s: + assert anchor in s, "V2 #ifdef anchor not found" + s = s.replace(anchor, anchor + typedef_block, 1) + print("nvfs-dma.c: inserted typedef") + +# Replace `struct req_iterator` with the typedef *only* inside the V2 path +# (i.e. between the V2 #ifdef and the matching #endif). The V1 path lives +# above this block and still expects struct req_iterator. +v2_start = s.index("// V2 ops implementations for kernels with iterator API\n#ifdef HAVE_BLK_RQ_DMA_MAP_ITER_START\n") +# match closing #endif that terminates the V2 section: it's the last #endif in file +# but to be safe locate the next "#endif /* HAVE_BLK_RQ_DMA_MAP_ITER_START */" +# Try canonical first, then fall back to last #endif before EOF. 
+end_marker = "#endif /* HAVE_BLK_RQ_DMA_MAP_ITER_START */" +if end_marker in s: + v2_end = s.index(end_marker, v2_start) +else: + v2_end = s.rfind("#endif") + +before = s[:v2_start] +v2 = s[v2_start:v2_end] +after = s[v2_end:] + +new_v2 = v2.replace("struct req_iterator", "nvfs_req_iter_t") +if new_v2 != v2: + s = before + new_v2 + after + p.write_text(s) + print("nvfs-dma.c: rewired V2 path to nvfs_req_iter_t") +else: + print("nvfs-dma.c: no further substitution needed (already patched)") +PY + +# --- create_nv.symvers.sh: handle *.ko.zst (Ubuntu 26.04) ------------------ +python3 - <<'PY' +from pathlib import Path +p = Path("create_nv.symvers.sh") +s = p.read_text() +anchor = '''\tcase "$nvidia_mod" in\n\t\t*ko.xz)\n\t\t\t/bin/cp -fv $nvidia_mod .\n\t\t\tnvidia_mod=$(basename $nvidia_mod | sed -e "s/.xz//g")\n\t\t\txz -d ${nvidia_mod}.xz\n\t\t\t;;\n\tesac''' +addition = '''\tcase "$nvidia_mod" in\n\t\t*ko.xz)\n\t\t\t/bin/cp -fv $nvidia_mod .\n\t\t\tnvidia_mod=$(basename $nvidia_mod | sed -e "s/.xz//g")\n\t\t\txz -d ${nvidia_mod}.xz\n\t\t\t;;\n\t\t*ko.zst)\n\t\t\t/bin/cp -fv $nvidia_mod .\n\t\t\tnvidia_mod=$(basename $nvidia_mod | sed -e "s/.zst//g")\n\t\t\tzstd -d --rm ${nvidia_mod}.zst\n\t\t\t;;\n\tesac''' +if "*ko.zst" in s: + print("create_nv.symvers.sh: zst branch already present") +else: + assert anchor in s, "xz case anchor not found in create_nv.symvers.sh" + p.write_text(s.replace(anchor, addition, 1)) + print("create_nv.symvers.sh: added zst decompression branch") +PY + +echo "patch done" diff --git a/uv.lock b/uv.lock index b93b7436..4c96bff3 100644 --- a/uv.lock +++ b/uv.lock @@ -1551,6 +1551,36 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b5/91/53255615acd2a1eaca307ede3c90eb550bae9c94581f8c00081b6b1c8f44/kiwisolver-1.5.0-graalpy312-graalpy250_312_native-win_amd64.whl", hash = "sha256:1f1489f769582498610e015a8ef2d36f28f505ab3096d0e16b4858a9ec214f57", size = 75987, upload-time = "2026-03-09T13:15:39.65Z" }, ] +[[package]] +name = 
"kvikio-cu12" +version = "26.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cupy-cuda12x" }, + { name = "libkvikio-cu12" }, + { name = "numpy" }, + { name = "packaging" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/24/95abc4335e8db5b8e140dbe7bdaa9bd7c8620cc087b33ccf748f474c43b0/kvikio_cu12-26.4.0-cp311-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72cf92d7786b9cb34c60d616a25ef1c802ee465ebc20dd4b5ca2a365b409be16", size = 587150, upload-time = "2026-04-09T11:38:07.743Z" }, + { url = "https://files.pythonhosted.org/packages/1f/2e/08785cce15da72c9185285b8933d0a14a040f2350241beeb92c7a3dd8ee1/kvikio_cu12-26.4.0-cp311-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:58fa8aa9782f5e426c0568a05cbf232946e80f89efc6d5ef0eaec78f0771580a", size = 626286, upload-time = "2026-04-09T11:25:20.927Z" }, +] + +[[package]] +name = "kvikio-cu13" +version = "26.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cupy-cuda13x" }, + { name = "libkvikio-cu13" }, + { name = "numpy" }, + { name = "packaging" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/4c/ef90121df05681f697dcb5a7629b270406189b5c69ec3fb36b039d699252/kvikio_cu13-26.4.0-cp311-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f0f9ca47618d8c258456faf763e5c7a1c2dc6e94ba1cea7a7e87439c8aad9cbd", size = 587146, upload-time = "2026-04-09T11:37:19.209Z" }, + { url = "https://files.pythonhosted.org/packages/79/8c/350a0d40ae61875cd6bea3bc74cd6938d6f499bdaa92634e681d665dc3ca/kvikio_cu13-26.4.0-cp311-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b261b23fda12b16bacef1a27cea4f1a2e676e0284f5430ea8bf38f4fb8a69c30", size = 626286, upload-time = "2026-04-09T11:24:47.952Z" }, +] + [[package]] name = "lazy-loader" version = "0.5" @@ -1563,6 +1593,24 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/8a/a1/8d812e53a5da1687abb10445275d41a8b13adb781bbf7196ddbcf8d88505/lazy_loader-0.5-py3-none-any.whl", hash = "sha256:ab0ea149e9c554d4ffeeb21105ac60bed7f3b4fd69b1d2360a4add51b170b005", size = 8044, upload-time = "2026-03-06T15:45:07.668Z" }, ] +[[package]] +name = "libkvikio-cu12" +version = "26.4.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5e/1a/237ad2d561e683eff68217c5b862d52f90ed4cb64d376681cb0f65a1f521/libkvikio_cu12-26.4.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:0c051a60322877785ce37f5b1229902a5af3099c2180dd20c68a27c8dc55b9cf", size = 2218685, upload-time = "2026-04-09T11:26:41.1Z" }, + { url = "https://files.pythonhosted.org/packages/c6/15/62e5947e72aeeafdcaf8ebaf8e352ec505d445634514fd939b56c44cc467/libkvikio_cu12-26.4.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:1334b5210d3b05d760085cd3575af96cfe3dc2717065e8532bdcc7c9b195233b", size = 2367209, upload-time = "2026-04-09T11:20:56.055Z" }, +] + +[[package]] +name = "libkvikio-cu13" +version = "26.4.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/74/a6/146d2101afbb768e6aed5cca77822706355873b4cc69db17d9b7e74ed415/libkvikio_cu13-26.4.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:cadac4668d426d11637211f04714f3df0e4ac37a4bde603575f57efeacc1f652", size = 2218592, upload-time = "2026-04-09T11:25:55.48Z" }, + { url = "https://files.pythonhosted.org/packages/ed/d6/612a5015a719705d682b27f0075233068bc81d9ea38ba4bd5f3e21296bd4/libkvikio_cu13-26.4.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:3e0fd921efc7407f8319b4accbd4dc2c1ae65a05853d4e11d68b61e4f3013140", size = 2367571, upload-time = "2026-04-09T11:20:34.086Z" }, +] + [[package]] name = "linkify-it-py" version = "2.1.0" @@ -1618,12 +1666,18 @@ docs = [ { name = "sphinxcontrib-mermaid" }, { name = "sphinxext-opengraph" }, ] -gpu = [ - { name = 
"cupy-cuda12x" }, +gds = [ + { name = "kvikio-cu13" }, ] -gpu-cuda13 = [ +gds-cuda12 = [ + { name = "kvikio-cu12" }, +] +gpu = [ { name = "cupy-cuda13x" }, ] +gpu-cuda12 = [ + { name = "cupy-cuda12x" }, +] [package.dev-dependencies] dev = [ @@ -1641,11 +1695,13 @@ dev = [ [package.metadata] requires-dist = [ { name = "basicpy" }, - { name = "cupy-cuda12x", marker = "extra == 'gpu'", specifier = ">=12.0.0" }, - { name = "cupy-cuda13x", marker = "extra == 'gpu-cuda13'", specifier = ">=13.0.0" }, + { name = "cupy-cuda12x", marker = "extra == 'gpu-cuda12'", specifier = ">=12.0.0" }, + { name = "cupy-cuda13x", marker = "extra == 'gpu'", specifier = ">=13.0.0" }, { name = "dask", specifier = ">=2024.10.0" }, { name = "dipy" }, { name = "hyperactive" }, + { name = "kvikio-cu12", marker = "extra == 'gds-cuda12'" }, + { name = "kvikio-cu13", marker = "extra == 'gds'" }, { name = "linkify-it-py", marker = "extra == 'docs'", specifier = ">=2.1" }, { name = "matplotlib" }, { name = "myst-parser", marker = "extra == 'docs'", specifier = ">=5.0" }, @@ -1654,7 +1710,7 @@ requires-dist = [ { name = "nibabel" }, { name = "numcodecs" }, { name = "numpy" }, - { name = "ome-zarr", specifier = ">=0.9.0" }, + { name = "ome-zarr", specifier = ">=0.16.0" }, { name = "pandas" }, { name = "pqdm" }, { name = "pydata-sphinx-theme", marker = "extra == 'docs'", specifier = ">=0.17" }, @@ -1676,7 +1732,7 @@ requires-dist = [ { name = "tqdm" }, { name = "zarr", specifier = ">=3.0.10" }, ] -provides-extras = ["gpu", "gpu-cuda13", "docs"] +provides-extras = ["gpu", "gpu-cuda12", "gds", "gds-cuda12", "docs"] [package.metadata.requires-dev] dev = [ diff --git a/workflows/reconst_3d/nextflow.config b/workflows/reconst_3d/nextflow.config index b60728f0..f7cafa5b 100644 --- a/workflows/reconst_3d/nextflow.config +++ b/workflows/reconst_3d/nextflow.config @@ -391,7 +391,16 @@ process { withName: "resample_mosaic_grid" { scratch = false - // Allow parallel mask creation on GPU + // Parallel resampling on 
GPU. Peak ~9 GB/fork (Gaussian intermediate at + // 2x tile size). With 2x A6000 (48 GB each), 6 forks = 3/GPU = ~27 GB, + // leaving headroom for fix_illumination running concurrently. + maxForks = params.use_gpu ? 6 : null + } + + withName: "fix_focal_curvature" { + // GPU path is light (uniform/gaussian filter on the volume + per-tile + // take_along_axis). Memory dominated by the volume; a few forks share + // the GPU comfortably. maxForks = params.use_gpu ? 4 : null } diff --git a/workflows/reconst_3d/soct_3d_reconst.nf b/workflows/reconst_3d/soct_3d_reconst.nf index 65a8f361..3dbac57b 100644 --- a/workflows/reconst_3d/soct_3d_reconst.nf +++ b/workflows/reconst_3d/soct_3d_reconst.nf @@ -145,8 +145,9 @@ process fix_focal_curvature { tuple val(slice_id), path("mosaic_grid_z${slice_id}_focal_fix.ome.zarr") script: + def gpu_flag = params.use_gpu ? "--use_gpu" : "--no-use_gpu" """ - linum_detect_focal_curvature.py ${mosaic_grid} "mosaic_grid_z${slice_id}_focal_fix.ome.zarr" + linum_detect_focal_curvature.py ${mosaic_grid} "mosaic_grid_z${slice_id}_focal_fix.ome.zarr" ${gpu_flag} """ stub: @@ -675,6 +676,7 @@ process stack { path("stacking_decisions.csv"), optional: true, emit: stacking_decisions script: + def gpu_flag = params.use_gpu ? " --use_gpu" : " --no-use_gpu" def options = Helpers.stackBlendingArgs(params) + Helpers.stackZMatchingArgs(params) + Helpers.stackPairwiseTransformArgs(params) + @@ -683,6 +685,7 @@ process stack { Helpers.stackCumulativeArgs(params) + Helpers.stackSmoothingArgs(params) + " --no_xy_shift" + // slices are already in common space + gpu_flag + Helpers.pyramidArgs(params) def annotated_args = Helpers.annotatedScreenshotArgs(params, slice_ids_str)