diff --git a/docs/GPU_ACCELERATION.md b/docs/GPU_ACCELERATION.md index c41d871f..d863bb64 100644 --- a/docs/GPU_ACCELERATION.md +++ b/docs/GPU_ACCELERATION.md @@ -33,8 +33,8 @@ backend selection is per-call (`backend="cpu" | "gpu" | "auto"`); the auto path nvidia-smi | grep "CUDA Version" # Install linumpy with GPU support (choose your CUDA version) -uv pip install 'linumpy[gpu]' # CUDA 12.x (default) -uv pip install 'linumpy[gpu-cuda13]' # CUDA 13.x +uv pip install 'linumpy[gpu]' # CUDA 13.x (default) +uv pip install 'linumpy[gpu-cuda12]' # CUDA 12.x # Verify GPU linum_gpu_info.py @@ -57,8 +57,8 @@ linum_diagnose_pipeline.py --benchmark | CUDA Version | CuPy Package | linumpy extra | |--------------|--------------|---------------| -| CUDA 12.x | `cupy-cuda12x` | `linumpy[gpu]` | -| CUDA 13.x | `cupy-cuda13x` | `linumpy[gpu-cuda13]` | +| CUDA 13.x | `cupy-cuda13x` | `linumpy[gpu]` (default) | +| CUDA 12.x | `cupy-cuda12x` | `linumpy[gpu-cuda12]` | --- @@ -232,4 +232,64 @@ from linumpy.gpu.fft_ops import fft2, ifft2, phase_correlation from linumpy.gpu.interpolation import resize, affine_transform from linumpy.gpu.morphology import binary_closing, gaussian_filter from linumpy.gpu.array_ops import normalize_percentile, clip_percentile +from linumpy.gpu.zarr_io import read_zarr_to_gpu ``` + +--- + +## Fast zarr → GPU loading + +For zarr arrays on local NVMe, `linumpy.gpu.zarr_io.read_zarr_to_gpu` is the recommended entry point. It dispatches to the fastest backend available at runtime: + +```python +from linumpy.gpu.zarr_io import read_zarr_to_gpu + +dev = read_zarr_to_gpu("/scratch_nvme/volume.zarr") +# dev is a cupy.ndarray +``` + +Backend implementations live in their own modules: + +- `linumpy.gpu.kvikio_zarr` — kvikio / GPUDirect Storage reader (uncompressed zarr v2/v3 only). + +Selection order (when `prefer="auto"`): + +1. **kvikio (GPUDirect Storage, native mode)** — chunks DMA'd directly from NVMe into GPU memory. Fastest. 
Requires `kvikio` installed, GDS native mode enabled (`/etc/cufile.json`: `allow_compat_mode=false`), and an uncompressed zarr. +2. **`zarr.config.enable_gpu()`** — host I/O with on-host decode, single H→D copy. Works for any zarr (compressed or not) and is the automatic fallback when GDS is unavailable, in compat mode, or the array is compressed. + +You can force a specific path with `prefer="kvikio"` or `prefer="zarr-gpu"`. The legacy `cupy.asarray(zarr.open_array(...)[:])` path is kept only as a reference baseline in `linum_benchmark_kvikio_zarr.py`. + +### Reference benchmark + +`scripts/linum_benchmark_kvikio_zarr.py` measures all three paths. On a 16 GiB float32 zarr v3 (256³ chunks) on local NVMe ext4 with an RTX A6000: + +| Path | Cold | Warm | +|---|---|---| +| kvikio (GDS native) | 8.2 GiB/s | **9.9 GiB/s** | +| `zarr.config.enable_gpu()` | 6.3 GiB/s | 7.1 GiB/s | +| `zarr → numpy → cupy.asarray` | 1.1 GiB/s | 2.8 GiB/s | + +In compat mode (GDS bounce-buffer), kvikio drops to ~4 GiB/s — slower than the `zarr-gpu` path. The auto selector detects this via `kvikio.defaults.compat_mode()` and prefers `zarr-gpu` in that case. + +### Installing the GDS path + +```bash +# CuPy + linumpy GPU support +uv pip install 'linumpy[gpu]'        # CUDA 13.x (default) +uv pip install 'linumpy[gpu-cuda12]' # CUDA 12.x + +# kvikio (optional, only needed for the GDS fast path) +uv pip install 'linumpy[gds]'        # CUDA 13.x (default) +uv pip install 'linumpy[gds-cuda12]' # CUDA 12.x +``` + +### Enabling native GDS + +Native GDS additionally requires: + +- A CUDA-aware filesystem on the source path (ext4 / xfs on local NVMe is fine; NFS, overlayfs, encrypted FS are not). +- `/etc/cufile.json`: `properties.allow_compat_mode = false`. +- IOMMU disabled or in passthrough (`amd_iommu=off iommu=off` on AMD; `intel_iommu=off` on Intel). +- nvidia-fs DKMS module loaded with matching ABI; verify `dmesg | grep nvidia_fs` shows no "no extended symbol version" warnings after driver/kernel updates. 
+ +If any of these are missing, kvikio silently falls back to a POSIX bounce-buffer (compat mode) and `read_zarr_to_gpu` will route around it. diff --git a/docs/LIBRARY_MODULES.md b/docs/LIBRARY_MODULES.md index 943bde24..7ac2e53c 100644 --- a/docs/LIBRARY_MODULES.md +++ b/docs/LIBRARY_MODULES.md @@ -40,15 +40,21 @@ OME-Zarr is the canonical on-disk format. Two writers are provided: Common entry points: ```python -from linumpy.io.zarr import read_omezarr, save_omezarr, OmeZarrWriter, AnalysisOmeZarrWriter +from linumpy.io.zarr import read_omezarr, read_omezarr_array, save_omezarr, OmeZarrWriter, AnalysisOmeZarrWriter from linumpy.io.test_data import get_data from linumpy.io import slice_config # slice_config.json reader/writer from linumpy.io.thorlabs import ThorImageOCT ``` -`read_omezarr(path, level=0)` returns a `(zarr.Array, voxel_size)` tuple. The -voxel size is ordered to match the array axes (Z, Y, X for 3D volumes; -Y, X for 2D mosaics). +`read_omezarr(path, level=0)` returns a `(zarr.Array, voxel_size)` tuple (lazy). +`read_omezarr_array(path, level=0, use_gpu=False)` is the high-level entry +point: it materialises the requested level into memory and returns +`(array, voxel_size)`. With ``use_gpu=False`` (default) the array is a +``numpy.ndarray``; with ``use_gpu=True`` it dispatches through +[`linumpy.gpu.zarr_io.read_zarr_to_gpu`](GPU_ACCELERATION.md) and returns a +``cupy.ndarray`` (kvikio/GPUDirect Storage when available, otherwise the +``zarr.config.enable_gpu`` fallback). Voxel size is ordered to match the array +axes (Z, Y, X for 3D volumes; Y, X for 2D mosaics). See [Mosaic Grid Format](MOSAIC_GRID_FORMAT.md) and [Slice Config Feature](SLICE_CONFIG_FEATURE.md) for format details. @@ -136,6 +142,9 @@ CuPy-backed versions of hot paths. Each public entry point either takes a * `gpu.registration`, `gpu.corrections` — GPU registration and correction passes used by `linum_estimate_transform.py`, `linum_normalize_intensities_per_slice.py`, etc. 
+* `gpu.zarr_io` — high-level `read_zarr_to_gpu` dispatcher: picks the + fastest backend available (kvikio/GDS native → `zarr.config.enable_gpu`). +* `gpu.kvikio_zarr` — kvikio / GPUDirect Storage backend for the dispatcher. See [GPU Acceleration](GPU_ACCELERATION.md) and [N4 GPU](N4_GPU.md). diff --git a/linumpy/geometry/interface.py b/linumpy/geometry/interface.py index 47996b59..9fe943eb 100644 --- a/linumpy/geometry/interface.py +++ b/linumpy/geometry/interface.py @@ -116,6 +116,7 @@ def find_tissue_interface( mask: np.ndarray | None = None, order: int = 1, detect_cutting_errors: bool = False, + use_gpu: bool = False, ) -> np.ndarray: """Detect the tissue interface. @@ -135,6 +136,9 @@ def find_tissue_interface( Gaussian filter order. detect_cutting_errors : bool If True, detect and correct cutting artefacts. + use_gpu : bool + If True, use the GPU implementation when CuPy is available. + Ignored when ``mask`` is provided (the mask path stays on CPU). Returns ------- @@ -142,6 +146,21 @@ def find_tissue_interface( Tissue interface depth """ + if use_gpu and mask is None: + from linumpy.gpu import GPU_AVAILABLE + + if GPU_AVAILABLE: + from linumpy.gpu.interface import find_tissue_interface_gpu + + return find_tissue_interface_gpu( + vol, + s_xy=s_xy, + s_z=s_z, + use_log=use_log, + order=order, + detect_cutting_errors=detect_cutting_errors, + ) + if use_log: vol_p = np.copy(vol) vol_p[vol > 0] = np.log(vol[vol > 0]) diff --git a/linumpy/gpu/__init__.py b/linumpy/gpu/__init__.py index 49fae783..499f8ead 100644 --- a/linumpy/gpu/__init__.py +++ b/linumpy/gpu/__init__.py @@ -165,6 +165,21 @@ def to_cpu(array: Any) -> Any: return array +def is_cupy_array(array: Any) -> bool: + """Return True iff ``array`` is a CuPy ``ndarray``. + + Importing CuPy is gated on availability so this stays cheap to call from + code paths that may be hit without CUDA. 
+ """ + if not GPU_AVAILABLE: + return False + try: + import cupy as cp + except ImportError: + return False + return isinstance(array, cp.ndarray) + + def gpu_info() -> Any: """ Get information about GPU availability and configuration. diff --git a/linumpy/gpu/interface.py b/linumpy/gpu/interface.py new file mode 100644 index 00000000..c717e6b7 --- /dev/null +++ b/linumpy/gpu/interface.py @@ -0,0 +1,50 @@ +"""GPU implementation of tissue interface detection. + +Mirrors the CPU path in :func:`linumpy.geometry.interface.find_tissue_interface` +but routes the heavy filters through ``cupyx.scipy.ndimage`` to avoid +host-device round trips when the caller already holds GPU data or when +the volume is large enough that the transfer cost is amortised. +""" + +from typing import Any + +import numpy as np + +from . import to_cpu + + +def find_tissue_interface_gpu( + vol: Any, + s_xy: int = 15, + s_z: int = 2, + use_log: bool = True, + order: int = 1, + detect_cutting_errors: bool = False, +) -> np.ndarray: + """GPU equivalent of ``find_tissue_interface`` (no mask path). + + Always returns a NumPy array of integer depths so callers do not need + to be aware of the device. 
+ """ + import cupy as cp + from cupyx.scipy.ndimage import gaussian_filter1d, uniform_filter + + vol_gpu = cp.asarray(vol) + if use_log: + vol_p = vol_gpu.astype(cp.float32, copy=True) + positive = vol_gpu > 0 + vol_p[positive] = cp.log(vol_gpu[positive]) + else: + vol_p = vol_gpu + + vol_p = uniform_filter(vol_p, (s_xy, s_xy, 0)) + vol_g = gaussian_filter1d(vol_p, s_z, axis=2, order=order) + z0 = cp.ceil(vol_g.argmax(axis=2) + s_z * 0.5).astype(cp.int64) + + if detect_cutting_errors: + vol_p = gaussian_filter1d(vol_p, s_z, axis=2, order=0) + z0_p = cp.abs(vol_p).argmax(axis=2) + mask_max = z0_p < z0 + z0 = cp.where(mask_max, z0_p, z0) + + return to_cpu(z0) diff --git a/linumpy/gpu/interpolation.py b/linumpy/gpu/interpolation.py index 728367c0..97da205d 100644 --- a/linumpy/gpu/interpolation.py +++ b/linumpy/gpu/interpolation.py @@ -9,7 +9,7 @@ import numpy as np -from . import GPU_AVAILABLE, to_cpu +from . import GPU_AVAILABLE, is_cupy_array, to_cpu def affine_transform(image: Any, matrix: Any, output_shape: Any = None, order: Any = 1, use_gpu: Any = True) -> Any: @@ -44,15 +44,18 @@ def affine_transform(image: Any, matrix: Any, output_shape: Any = None, order: A def _affine_transform_gpu(image: Any, matrix: Any, output_shape: Any, order: Any) -> Any: - """GPU implementation of affine transform.""" + """GPU implementation of affine transform. 
Device-preserving.""" import cupy as cp from cupyx.scipy.ndimage import affine_transform as cp_affine + input_was_gpu = is_cupy_array(image) img_gpu = cp.asarray(image.astype(np.float32)) matrix_gpu = cp.asarray(matrix.astype(np.float32)) result = cp_affine(img_gpu, matrix_gpu, output_shape=output_shape, order=order) + if input_was_gpu: + return result return to_cpu(result) @@ -90,15 +93,18 @@ def map_coordinates(image: Any, coordinates: Any, order: Any = 1, use_gpu: Any = def _map_coordinates_gpu(image: Any, coordinates: Any, order: Any) -> Any: - """GPU implementation of map_coordinates.""" + """GPU implementation of map_coordinates. Device-preserving.""" import cupy as cp from cupyx.scipy.ndimage import map_coordinates as cp_map + input_was_gpu = is_cupy_array(image) img_gpu = cp.asarray(image.astype(np.float32)) coords_gpu = cp.asarray(coordinates.astype(np.float32)) result = cp_map(img_gpu, coords_gpu, order=order) + if input_was_gpu: + return result return to_cpu(result) @@ -138,11 +144,17 @@ def resize(image: Any, output_shape: Any, order: Any = 1, anti_aliasing: Any = T def _resize_gpu(image: Any, output_shape: Any, order: Any, anti_aliasing: Any) -> Any: - """GPU implementation of resize using zoom.""" + """GPU implementation of resize using zoom. + + Device-preserving: returns a CuPy array if *image* was already on device, + a NumPy array if *image* was on host. This avoids forcing a D->H copy when + callers want to keep tiles resident for downstream GPU compute / writes. + """ import cupy as cp from cupyx.scipy.ndimage import gaussian_filter as cp_gaussian from cupyx.scipy.ndimage import zoom as cp_zoom + input_was_gpu = is_cupy_array(image) img_gpu = cp.asarray(image if image.dtype == np.float32 else image.astype(np.float32)) # Scale factors: input/output for Gaussian sigma, output/input for zoom. 
@@ -158,6 +170,8 @@ def _resize_gpu(image: Any, output_shape: Any, order: Any, anti_aliasing: Any) - result = cp_zoom(img_gpu, zoom_factors, order=order) + if input_was_gpu: + return result return to_cpu(result) diff --git a/linumpy/gpu/kvikio_zarr.py b/linumpy/gpu/kvikio_zarr.py new file mode 100644 index 00000000..93a65b49 --- /dev/null +++ b/linumpy/gpu/kvikio_zarr.py @@ -0,0 +1,228 @@ +"""kvikio / GPUDirect Storage reader for uncompressed zarr arrays. + +This module implements the :func:`read_zarr_via_kvikio` backend used by +:mod:`linumpy.gpu.zarr_io`. It reads zarr v2 *or* zarr v3 chunks directly +into GPU memory using ``kvikio.CuFile``. The version is auto-detected from +the on-disk metadata file (``zarr.json`` for v3, ``.zarray`` for v2). + +Most callers should use :func:`linumpy.gpu.zarr_io.read_zarr_to_gpu`, which +dispatches to this backend only when GDS native mode is available and the +array is uncompressed; otherwise it falls back to ``zarr.config.enable_gpu``. + +Supported on-disk formats +------------------------- + +* zarr v3: ``codecs=[{"name": "bytes"}]`` only (or empty) — raw little/big- + endian bytes. Any compression codec (blosc, gzip, zstd, ...) is rejected + because GDS reads bytes verbatim and on-device decompression would require + nvCOMP. +* zarr v2: ``compressor=None``, ``filters=None``, ``order='C'``. + +Notes +----- +* Requires ``kvikio`` and ``cupy``. Both are imported lazily so the rest of + linumpy is unaffected if they are not installed. +* For the GDS fast path the source filesystem must support GDS natively + (ext4 on local NVMe, IOMMU disabled or in passthrough, and + ``properties.use_compat_mode=false`` in ``/etc/cufile.json``). Otherwise + kvikio falls back to a posix bounce-buffer path with no speed-up. 
+""" + +from __future__ import annotations + +import json +import sys +from itertools import product +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import numpy as np + +if TYPE_CHECKING: + from collections.abc import Iterable, Iterator + + +def _require_kvikio() -> tuple[Any, Any]: + """Import kvikio + cupy lazily and return (kvikio, cupy).""" + try: + import cupy + import kvikio + except ImportError as exc: # pragma: no cover - hardware-dependent + raise RuntimeError( + "kvikio + cupy are required for the GDS prototype. Install with:\n" + " pip install kvikio-cu13 cupy-cuda13x # or matching your CUDA" + ) from exc + return kvikio, cupy + + +def _parse_v3_dtype(dt: Any) -> np.dtype: + """Map a zarr v3 ``data_type`` field to a numpy dtype.""" + if isinstance(dt, str): + return np.dtype(dt) + raise NotImplementedError(f"unsupported v3 data_type: {dt!r}") + + +def _v3_chunk_path(array_path: Path, idx: tuple[int, ...], encoding: dict) -> Path: + """Build the on-disk chunk path for a zarr v3 array.""" + name = encoding.get("name", "default") + sep = encoding.get("configuration", {}).get("separator", "/") + parts = sep.join(str(i) for i in idx) + if name == "default": + return array_path / "c" / parts if sep == "/" else array_path / f"c{sep}{parts}" + if name == "v2": + return array_path / parts + raise NotImplementedError(f"unsupported chunk_key_encoding: {name!r}") + + +def _v2_chunk_path(array_path: Path, idx: tuple[int, ...], dim_separator: str) -> Path: + return array_path / dim_separator.join(str(i) for i in idx) + + +class _ArraySpec: + """Resolved, format-agnostic view of a zarr array on disk.""" + + __slots__ = ("_v2_dim_sep", "_v3_encoding", "chunks", "dtype", "fill_value", "format", "path", "shape") + + def __init__( + self, + *, + path: Path, + shape: tuple[int, ...], + chunks: tuple[int, ...], + dtype: np.dtype, + fill_value: Any, + format: int, + v3_encoding: dict | None = None, + v2_dim_sep: str | None = None, + ) -> None: + self.path 
= path + self.shape = shape + self.chunks = chunks + self.dtype = dtype + self.fill_value = fill_value + self.format = format + self._v3_encoding = v3_encoding + self._v2_dim_sep = v2_dim_sep + + def chunk_path(self, idx: tuple[int, ...]) -> Path: + if self.format == 3: + assert self._v3_encoding is not None + return _v3_chunk_path(self.path, idx, self._v3_encoding) + assert self._v2_dim_sep is not None + return _v2_chunk_path(self.path, idx, self._v2_dim_sep) + + +def _load_array_spec(array_path: Path) -> _ArraySpec: + """Inspect ``array_path`` and return a normalized spec for v2 or v3.""" + v3_meta = array_path / "zarr.json" + v2_meta = array_path / ".zarray" + + if v3_meta.exists(): + meta = json.loads(v3_meta.read_text()) + if meta.get("zarr_format") != 3: + raise ValueError(f"{array_path}: zarr.json has zarr_format != 3") + if meta.get("node_type") != "array": + raise ValueError(f"{array_path}: zarr.json is not an array node") + codecs = meta.get("codecs", []) + non_bytes = [c for c in codecs if c.get("name") != "bytes"] + if non_bytes: + raise NotImplementedError( + f"{array_path}: codecs={[c.get('name') for c in codecs]!r}; this prototype " + "requires raw bytes only (no compression). On-device decompression needs nvCOMP." 
+ ) + host_endian = "big" if sys.byteorder == "big" else "little" + for c in codecs: + endian = c.get("configuration", {}).get("endian", "little") + if endian != host_endian: + raise NotImplementedError(f"{array_path}: endian={endian!r} differs from host") + chunk_grid = meta.get("chunk_grid", {}) + if chunk_grid.get("name") != "regular": + raise NotImplementedError(f"{array_path}: chunk_grid={chunk_grid.get('name')!r}") + return _ArraySpec( + path=array_path, + shape=tuple(meta["shape"]), + chunks=tuple(chunk_grid["configuration"]["chunk_shape"]), + dtype=_parse_v3_dtype(meta["data_type"]), + fill_value=meta.get("fill_value", 0), + format=3, + v3_encoding=meta.get("chunk_key_encoding", {"name": "default", "configuration": {"separator": "/"}}), + ) + + if v2_meta.exists(): + meta = json.loads(v2_meta.read_text()) + if meta.get("zarr_format") != 2: + raise ValueError(f"{array_path}: .zarray has zarr_format != 2") + if meta.get("compressor") is not None: + raise NotImplementedError( + f"{array_path}: compressor={meta['compressor']!r}; this prototype " + "requires uncompressed chunks. On-device decompression needs nvCOMP." 
+ ) + if meta.get("order", "C") != "C": + raise NotImplementedError(f"{array_path}: order={meta['order']!r} unsupported") + if meta.get("filters"): + raise NotImplementedError(f"{array_path}: filters unsupported in prototype") + return _ArraySpec( + path=array_path, + shape=tuple(meta["shape"]), + chunks=tuple(meta["chunks"]), + dtype=np.dtype(meta["dtype"]), + fill_value=meta.get("fill_value", 0), + format=2, + v2_dim_sep=meta.get("dimension_separator", "."), + ) + + raise FileNotFoundError(f"{array_path}: no zarr.json (v3) or .zarray (v2) found") + + +def _iter_chunk_indices(shape: Iterable[int], chunks: Iterable[int]) -> Iterator[tuple[int, ...]]: + n = [(s + c - 1) // c for s, c in zip(shape, chunks, strict=True)] + yield from product(*[range(k) for k in n]) + + +def read_zarr_via_kvikio(array_path: str | Path) -> Any: + """Load a full uncompressed zarr (v2 or v3) array into a CuPy array via GDS. + + Parameters + ---------- + array_path + Path to the zarr array directory (containing ``zarr.json`` for v3 or + ``.zarray`` for v2). + + Returns + ------- + cupy.ndarray + Device-resident array of shape and dtype matching the zarr metadata. 
+ """ + kvikio, cupy = _require_kvikio() + spec = _load_array_spec(Path(array_path)) + + out = cupy.full(spec.shape, spec.fill_value, dtype=spec.dtype) + chunk_nbytes_full = int(np.prod(spec.chunks) * spec.dtype.itemsize) + scratch = cupy.empty(spec.chunks, dtype=spec.dtype) + + for idx in _iter_chunk_indices(spec.shape, spec.chunks): + cf_path = spec.chunk_path(idx) + if not cf_path.exists(): + continue # zarr fill-value semantics + + slices: list[slice] = [] + edge_shape: list[int] = [] + for k, c, s in zip(idx, spec.chunks, spec.shape, strict=True): + start = k * c + stop = min(start + c, s) + slices.append(slice(start, stop)) + edge_shape.append(stop - start) + edge_shape_t = tuple(edge_shape) + + # kvikio.CuFile.pread requires a contiguous device buffer; a slice + # into ``out`` is generally not contiguous, so we read into a + # chunk-shaped scratch and copy the valid region into ``out``. + with kvikio.CuFile(str(cf_path), "r") as f: + f.pread(scratch, chunk_nbytes_full).get() + if edge_shape_t == spec.chunks: + out[tuple(slices)] = scratch + else: + sub = tuple(slice(0, e) for e in edge_shape_t) + out[tuple(slices)] = scratch[sub] + + return out diff --git a/linumpy/gpu/zarr_io.py b/linumpy/gpu/zarr_io.py new file mode 100644 index 00000000..43b2e97e --- /dev/null +++ b/linumpy/gpu/zarr_io.py @@ -0,0 +1,179 @@ +"""High-level zarr → GPU loading with automatic backend selection. + +Public entry points: + +* :func:`read_zarr_to_gpu` — load an entire zarr array onto the GPU using the + fastest available path (kvikio / GDS, falling back to ``zarr.config.enable_gpu``). +* :func:`gpu_zarr_context` — context manager that flips zarr into GPU mode for + its duration, so subsequent ``zarr.open_array(...)`` calls return arrays whose + slicing materialises directly into ``cupy.ndarray``. Use this for tile-by-tile + / per-slab access patterns where loading the whole volume at once is wasteful. + +Selection order (when ``prefer='auto'``): + +1. 
**kvikio (GPUDirect Storage, native mode)** — chunks DMA'd directly from + NVMe into GPU memory. Requires ``kvikio`` installed, GDS in native mode, + and an uncompressed zarr v2/v3. +2. **zarr.config.enable_gpu()** — host I/O with on-host decode then a single + H→D copy. Works for any zarr (compressed or not). The fallback when GDS + is unavailable, in compat mode, or the array is compressed. + +Backend implementations live in their own modules: + +* :mod:`linumpy.gpu.kvikio_zarr` — kvikio / GDS reader. + +Reference numbers on a 16 GiB float32 zarr v3 (RTX A6000, ext4, GDS native) +warm cache: kvikio ~9.9 GiB/s, zarr-gpu ~7.1 GiB/s. +""" + +from __future__ import annotations + +from contextlib import contextmanager +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal + +if TYPE_CHECKING: + from collections.abc import Iterator + +Backend = Literal["auto", "kvikio", "zarr-gpu"] + + +def _kvikio_native_mode_available() -> bool: + """Return True iff kvikio is importable and not stuck in compat mode. + + kvikio always succeeds at import; the relevant question is whether + ``cufile.json`` (or env vars) allow native GDS. ``compat_mode`` returns a + ``CompatMode`` enum: ``OFF`` means native is forced/used, ``ON`` means + compat-only, ``AUTO`` means kvikio picks per-mount. 
+ """ + try: + import kvikio # noqa: F401 + from kvikio.defaults import compat_mode + except ImportError: + return False + try: + mode = compat_mode() + except Exception: # pragma: no cover - older kvikio variants + return False + name = getattr(mode, "name", str(mode)).upper() + return name in {"OFF", "AUTO"} + + +def _array_is_kvikio_compatible(array_path: Path) -> bool: + """Return True iff the on-disk array meets kvikio's raw-bytes constraints.""" + from linumpy.gpu.kvikio_zarr import _load_array_spec + + try: + _load_array_spec(array_path) + except (NotImplementedError, FileNotFoundError, ValueError): + return False + return True + + +def read_zarr_via_zarr_gpu(array_path: str | Path) -> Any: + """Load a zarr array onto the GPU using ``zarr.config.enable_gpu()``. + + Host I/O with on-host decode, then a single H→D copy. Works for any zarr + array (including compressed) and is the recommended fallback when GDS is + unavailable or stuck in compat mode. + + Parameters + ---------- + array_path + Path to the zarr array directory. + + Returns + ------- + cupy.ndarray + Device-resident array. + """ + try: + import cupy + import zarr + except ImportError as exc: # pragma: no cover - hardware-dependent + raise RuntimeError("cupy + zarr are required for the zarr-gpu fallback path.") from exc + + with zarr.config.enable_gpu(): + z = zarr.open_array(str(array_path), mode="r") + dev = z[:] + cupy.cuda.Stream.null.synchronize() + return dev + + +def read_zarr_to_gpu(array_path: str | Path, *, prefer: Backend = "auto") -> Any: + """Load a zarr array onto the GPU using the fastest available path. + + Selection order (when ``prefer='auto'``): + + 1. kvikio / GDS — only if kvikio is in native or auto mode AND the array + is uncompressed v2/v3. + 2. ``zarr.config.enable_gpu()`` — works for any zarr. + + Parameters + ---------- + array_path + Path to the zarr array directory. + prefer + ``'auto'`` (default), ``'kvikio'``, or ``'zarr-gpu'``. 
Forcing a path + will raise if that path is unavailable for this array. + + Returns + ------- + cupy.ndarray + Device-resident array of shape and dtype matching the zarr metadata. + """ + path = Path(array_path) + + if prefer == "kvikio": + from linumpy.gpu.kvikio_zarr import read_zarr_via_kvikio + + return read_zarr_via_kvikio(path) + if prefer == "zarr-gpu": + return read_zarr_via_zarr_gpu(path) + if prefer != "auto": + raise ValueError(f"unknown prefer={prefer!r}; expected 'auto', 'kvikio', or 'zarr-gpu'") + + if _kvikio_native_mode_available() and _array_is_kvikio_compatible(path): + from linumpy.gpu.kvikio_zarr import read_zarr_via_kvikio + + try: + return read_zarr_via_kvikio(path) + except (RuntimeError, OSError, NotImplementedError): + pass + + return read_zarr_via_zarr_gpu(path) + + +@contextmanager +def gpu_zarr_context() -> Iterator[None]: + """Context manager that puts zarr into GPU mode for arbitrary slice reads. + + Inside this context, any subsequent ``zarr.open_array(...)`` returns an + array whose slicing produces ``cupy.ndarray`` results — chunks are decoded + on host then transferred to device on each ``vol[slice]`` operation. This + is the right pattern for tile-by-tile or per-slab work where loading the + full volume at once is wasteful. + + Outside this context, zarr falls back to its normal numpy-backed mode. + + Examples + -------- + >>> from linumpy.gpu.zarr_io import gpu_zarr_context + >>> from linumpy.io.zarr import read_omezarr + >>> with gpu_zarr_context(): + ... vol, _ = read_omezarr(path, level=0) + ... for tile_region in regions: + ... tile_gpu = vol[tile_region] # already cupy + + Raises + ------ + RuntimeError + If zarr is unavailable. 
+ """ + try: + import zarr + except ImportError as exc: # pragma: no cover - zarr is a hard dep + raise RuntimeError("zarr is required for gpu_zarr_context().") from exc + + with zarr.config.enable_gpu(): + yield diff --git a/linumpy/intensity/bias_field.py b/linumpy/intensity/bias_field.py index 62cdf815..16cf95b2 100644 --- a/linumpy/intensity/bias_field.py +++ b/linumpy/intensity/bias_field.py @@ -180,7 +180,9 @@ def compute_tissue_mask( use_gpu : bool If True, run the dominant 3-D ``gaussian_filter`` on GPU via CuPy (Z-chunked for memory safety). Falls back to CPU silently - if CuPy is unavailable. Otsu and morphology stay on CPU. + if CuPy is unavailable. Otsu and morphology stay on CPU. When + *vol* is already a ``cupy.ndarray`` the GPU path is used + regardless and slabs are read with no host round-trip. Returns ------- @@ -191,7 +193,11 @@ def compute_tissue_mask( from skimage.filters import threshold_otsu from skimage.morphology import disk - if use_gpu: + from linumpy.gpu import is_cupy_array + + # Cupy input always goes through the GPU path; the CPU fallback uses + # numpy/scipy ops that don't accept cupy arrays. + if use_gpu or is_cupy_array(vol): try: return _compute_tissue_mask_gpu( vol, @@ -202,7 +208,9 @@ def compute_tissue_mask( z_closing_sections=z_closing_sections, ) except ImportError: - pass # CuPy missing -- fall back to CPU below. + if is_cupy_array(vol): + raise # cupy installed but cupyx missing — surface clearly + # CuPy missing -- fall back to CPU below. # Anisotropic 3-D smoothing: stronger in XY, light in Z to preserve # oblique tissue boundaries without per-Z Otsu noise. diff --git a/linumpy/intensity/normalization.py b/linumpy/intensity/normalization.py index f66bfebc..ba30fcd7 100644 --- a/linumpy/intensity/normalization.py +++ b/linumpy/intensity/normalization.py @@ -6,6 +6,8 @@ based on agarose background detection. 
""" +from typing import Any + import numpy as np @@ -289,25 +291,38 @@ def apply_histogram_matching( use_gpu : bool If True, run the per-chunk matching loop on GPU via CuPy. Falls back to CPU silently if CuPy is unavailable. The volume itself is moved to - GPU one chunk at a time, so memory usage stays bounded. + GPU one chunk at a time, so memory usage stays bounded. When *vol* is + already a ``cupy.ndarray`` the GPU path is used regardless and slabs + are read with no host round-trip. Returns ------- np.ndarray Histogram-matched volume. """ - flat_all = vol.ravel() - ref_bins, ref_cdf, tissue_count = _build_tissue_cdf(flat_all, n_bins, tissue_threshold) + from linumpy.gpu import is_cupy_array + + cupy_input = is_cupy_array(vol) + + if cupy_input: + # Build the reference CDF on device so we don't D→H the whole volume. + ref_bins, ref_cdf, tissue_count = _build_tissue_cdf_gpu(vol, n_bins, tissue_threshold) + else: + flat_all = vol.ravel() + ref_bins, ref_cdf, tissue_count = _build_tissue_cdf(flat_all, n_bins, tissue_threshold) if tissue_count < 500: - return vol + from linumpy.gpu import to_cpu + + return to_cpu(vol) bounds = _chunk_boundaries(vol.shape[0], n_serial_slices) - if use_gpu: + if cupy_input or use_gpu: try: return _apply_histogram_matching_gpu(vol, bounds, ref_bins, ref_cdf, n_bins, tissue_threshold) except ImportError: - pass + if cupy_input: + raise # cupy is installed but cupyx missing — surface clearly out = np.empty_like(vol) for s, e in bounds: @@ -317,6 +332,26 @@ def apply_histogram_matching( return out +def _build_tissue_cdf_gpu(vol: Any, n_bins: int, tissue_threshold: float) -> tuple[np.ndarray, np.ndarray, int]: + """GPU equivalent of :func:`_build_tissue_cdf` for a CuPy-resident volume. + + Computes the histogram on device and returns small host-side arrays + so callers can keep the rest of their state on CPU. 
+ """ + import cupy as cp + + lo = tissue_threshold + max(1e-6, tissue_threshold * 1e-6) + lo = min(lo, 1.0) + hist = cp.histogram(vol.ravel(), bins=n_bins, range=(lo, 1.0))[0] + edges = cp.linspace(lo, 1.0, n_bins + 1) + bin_centers = 0.5 * (edges[:-1] + edges[1:]) + total = int(hist.sum().item()) + cdf = cp.cumsum(hist).astype(cp.float64) + if cdf[-1] > 0: + cdf /= cdf[-1] + return cp.asnumpy(bin_centers), cp.asnumpy(cdf), total + + def _apply_histogram_matching_gpu( vol: np.ndarray, bounds: list[tuple[int, int]], @@ -330,7 +365,9 @@ def _apply_histogram_matching_gpu( Each chunk is moved to GPU, has its tissue CDF computed, an ``n_bins``-sized LUT built, and the per-voxel mapping applied. Result is moved back to CPU per chunk so the host array fills - incrementally without holding the whole volume on GPU. + incrementally without holding the whole volume on GPU. Accepts + either a host (numpy) volume or a device (cupy) volume; in the + cupy case ``cp.asarray`` is a no-op so no extra H→D copy occurs. """ import cupy as cp @@ -340,7 +377,10 @@ def _apply_histogram_matching_gpu( lo = tissue_threshold + max(1e-6, tissue_threshold * 1e-6) lo = min(lo, 1.0) - out = np.empty_like(vol) + # Always allocate the output on the host: the next pipeline stage + # (z-profile smoothing, N4) operates on numpy. 
+ out = np.empty(vol.shape, dtype=np.float32) + cupy_input = isinstance(vol, cp.ndarray) for s, e in bounds: chunk_g = cp.asarray(vol[s:e], dtype=cp.float32) flat = chunk_g.ravel() @@ -348,7 +388,7 @@ def _apply_histogram_matching_gpu( hist = cp.histogram(flat, bins=n_bins, range=(lo, 1.0))[0] tissue_count = int(hist.sum().item()) if tissue_count < 500: - out[s:e] = vol[s:e] + out[s:e] = cp.asnumpy(vol[s:e]) if cupy_input else vol[s:e] continue edges = cp.linspace(lo, 1.0, n_bins + 1, dtype=cp.float32) @@ -403,6 +443,12 @@ def apply_zprofile_smoothing( """ from scipy.ndimage import gaussian_filter1d + from linumpy.gpu import to_cpu + + # Pure-CPU op; pull cupy inputs to host so scipy/numpy can run unchanged. + vol = to_cpu(vol) + mask = to_cpu(mask) + if sigma <= 0: return vol n_z = vol.shape[0] diff --git a/linumpy/io/zarr.py b/linumpy/io/zarr.py index 21015a65..da391567 100644 --- a/linumpy/io/zarr.py +++ b/linumpy/io/zarr.py @@ -18,9 +18,8 @@ from ome_zarr.format import CurrentFormat from ome_zarr.io import parse_url from ome_zarr.reader import Multiscales, Reader -from ome_zarr.scale import Scaler +from ome_zarr.scale import Methods from ome_zarr.writer import write_image, write_multiscales_metadata -from skimage.transform import resize configure_dask() @@ -41,80 +40,6 @@ def create_tempstore(dir: str | None = None, suffix: str | None = None) -> zarr. return zarr_store -class CustomScaler(Scaler): - """ - Custom ome_zarr.scale.Scaler class for handling downscaling up to 3D. - - Only `resize_image` method is implemented. Interpolation is ALWAYS done - using 1-st order (linear) interpolation. - """ - - def resize_image(self, image: np.ndarray | da.Array) -> np.ndarray | da.Array: - """ - Resize a numpy array OR a dask array to a smaller array (not pyramid). - - This method is the only method from the Scaler class called from - `ome_zarr.writer.write_image`. 
- """ - if isinstance(image, da.Array): - - def _resize(image: Any, out_shape: tuple, **kwargs: Any) -> da.Array: # type: ignore[override] - return da_resize(image, out_shape, **kwargs) - - else: - _resize = resize - - # downsample in X, Y, and Z. - new_shape = list(image.shape) - new_shape[-1] = image.shape[-1] // self.downscale - new_shape[-2] = image.shape[-2] // self.downscale - if len(new_shape) > 2: - new_shape[-3] = image.shape[-3] // self.downscale - out_shape = tuple(new_shape) - - dtype = image.dtype - if np.iscomplexobj(image): - image = _resize( - image.real.astype(np.float64), - out_shape, - order=1, - mode="reflect", - anti_aliasing=False, - ) + 1j * _resize( - image.imag.astype(np.float64), - out_shape, - order=1, - mode="reflect", - anti_aliasing=False, - ) - else: - image = _resize( - image.astype(np.float64), - out_shape, - order=1, - mode="reflect", - anti_aliasing=False, - ) - return image.astype(dtype) - - def linear(self, base: np.ndarray) -> list: - """Downsample using :func:`skimage.transform.resize` with linear interpolation.""" - pyramid: list[np.ndarray | da.Array] = [base] - max_axes_resize = min(len(base.shape), 3) - level = self.max_layer - while level > 0 and np.all(np.asarray(pyramid[-1].shape[:-max_axes_resize]) >= self.downscale): - pyramid.append(self.resize_image(pyramid[-1])) - level -= 1 - return pyramid - - def _by_plane(self, base: np.ndarray, func: object) -> np.ndarray: - # This method is called by base class when interpolation methods (e.g. nearest) - # are called directly. Because `write_image` never call these methods, we don't - # need to implement it here. We raise an error to make sure the CustomScaler class - # is not used for this purpose. - raise NotImplementedError("_by_plane method not implemented for CustomScaler") - - def create_transformation_dict(nlevels: int, voxel_size: Sequence, ndims: int = 3) -> list: """Create a list of coordinate transformation dicts for OME-Zarr pyramid levels. 
@@ -217,6 +142,7 @@ def save_omezarr( chunks: tuple | Sequence = (128, 128, 128), n_levels: int = 5, overwrite: bool = True, + shards: tuple | Sequence | None = None, ) -> zarr.Group: """Save array to disk in OME-NGFF zarr format. @@ -231,27 +157,49 @@ def save_omezarr( :type chunks: tuple of n `int`, with n the number of dimensions. :param chunks: Chunk size on disk. :type n_levels: int - :param n_levels: Number of levels in Gaussian pyramid. + :param n_levels: Number of pyramid levels (downsamples by 2 along all spatial + axes including Z). :type overwrite: bool :param overwrite: Overwrite `store_path` if it already exists. + :type shards: tuple of n `int` or None + :param shards: If provided, group multiple chunks together into a single + shard on disk (zarr v3 native sharding). Must be a multiple of + ``chunks`` along every dimension. Sharding reduces the number of files + on disk dramatically for large pyramids while keeping per-chunk random + access; useful for stacked volumes whose chunk size is far smaller than + the per-axis extent. ``None`` disables sharding. :type zarr_group: zarr.hierarchy.group :return zarr_group: Resulting zarr group saved to disk. """ n_levels = validate_n_levels(n_levels, data.shape) - # pyramidal decomposition (ome_zarr.scale.Scaler) keywords - pyramid_kw = {"max_layer": int(n_levels), "method": "linear", "downscale": 2} - - ome_zarr_version = version("ome-zarr") - metadata = {"method": "ome_zarr.scale.Scaler", "version": ome_zarr_version, "args": pyramid_kw} - - # # axes and coordinate transformations + # axes and coordinate transformations (c, z, y, x order) ndims = len(data.shape) axes = generate_axes_dict(ndims) coordinate_transformations = create_transformation_dict(n_levels + 1, voxel_size=voxel_size, ndims=ndims) - # create directory for zarr storage + # ome-zarr's default ``scale_factors`` (e.g. ``(2, 4, 8, 16)``) applies + # downsampling only to spatial axes *except z*. 
linumpy datasets are + # acquired with isotropic-ish voxel sizes, so we want true 3D downsampling + # at every level. Build the per-level dict explicitly. + spatial_axes = [a["name"] for a in axes if a.get("type") == "space"] + scale_factors: list[dict[str, int]] = [dict.fromkeys(spatial_axes, 2**i) for i in range(1, n_levels + 1)] + + # storage_options: one dict per dataset (level0 + n_levels). Each level + # gets the same chunk shape; sharding (when requested) likewise applies + # to every level. + storage_options: list[dict[str, Any]] = [] + for _ in range(n_levels + 1): + opts: dict[str, Any] = {"chunks": tuple(chunks)} + if shards is not None: + opts["shards"] = tuple(shards) + storage_options.append(opts) + + pyramid_kw = {"max_layer": int(n_levels), "method": Methods.RESIZE.value, "downscale": 2} + ome_zarr_version = version("ome-zarr") + metadata = {"method": "ome_zarr.writer.write_image", "version": ome_zarr_version, "args": pyramid_kw} + create_directory(store_path, overwrite) _loc = parse_url(store_path, mode="w") assert _loc is not None @@ -261,18 +209,52 @@ def save_omezarr( write_image( data, zarr_group, + scale_factors=scale_factors, + method=Methods.RESIZE, axes=axes, - scaler=CustomScaler(max_layer=int(n_levels), method="linear", downscale=2), - storage_options={"chunks": chunks}, + storage_options=storage_options, coordinate_transformations=coordinate_transformations, compute=True, metadata=metadata, ) - # return zarr group containing saved data return zarr_group +def resolve_omezarr_level_path(zarr_path: Path, level: int = 0) -> Path: + """Return the on-disk path of the array for *level* of an OME-Zarr group. + + Resolves the multiscale dataset path advertised by the OME-Zarr metadata + (``zarr_path / multiscale.datasets[level]``) without opening the array. + This is the backend-agnostic way to locate the actual chunked store on + disk — independent of any particular ``zarr.storage`` implementation. 
+ + Parameters + ---------- + zarr_path + Path to the OME-Zarr group. + level + Pyramid level (0 = full resolution). + + Returns + ------- + Path + On-disk path of the level's zarr array. + """ + _zarr_loc = parse_url(zarr_path) + assert _zarr_loc is not None + reader = Reader(_zarr_loc) + nodes = list(reader()) + image_node = nodes[0] + + multiscale = None + for spec in image_node.specs: + if isinstance(spec, Multiscales): + multiscale = spec + assert multiscale is not None, "No Multiscales spec found in zarr file" + return Path(zarr_path) / multiscale.datasets[level] + + def read_omezarr(zarr_path: Path, level: int = 0) -> tuple: """Read OME-Zarr image at *zarr_path* and return the array and voxel size. @@ -288,18 +270,12 @@ def read_omezarr(zarr_path: Path, level: int = 0) -> tuple: :type res: tuple (3,) :return res: Voxel size of zarr array. """ - # read the image data _zarr_loc = parse_url(zarr_path) assert _zarr_loc is not None reader = Reader(_zarr_loc) - # nodes may include images, labels etc nodes = list(reader()) - - # first node will be the image pixel data image_node = nodes[0] - # By default omezarr will return dask array (vol = image_node.data[level]). - # Here we load a zarr array directly and let the user convert to dask if needed. multiscale = None for spec in image_node.specs: if isinstance(spec, Multiscales): @@ -317,6 +293,55 @@ def read_omezarr(zarr_path: Path, level: int = 0) -> tuple: return vol, scale +def read_omezarr_array( + zarr_path: Path, + level: int = 0, + *, + use_gpu: bool = False, +) -> tuple[Any, list[float]]: + """Read an OME-Zarr image and materialise it into a single in-memory array. + + Unlike :func:`read_omezarr` (which returns a lazy ``zarr.Array``), this helper + returns a fully-loaded array sized to fit in host or device memory, plus the + voxel size for *level*. 
+ + When ``use_gpu=True`` the underlying multiscale array is loaded via + :func:`linumpy.gpu.zarr_io.read_zarr_to_gpu`, which selects the fastest path + available at runtime (kvikio / GPUDirect Storage when native mode is on and + the array is uncompressed; otherwise ``zarr.config.enable_gpu``). The + returned array is a ``cupy.ndarray``. + + When ``use_gpu=False`` (default) the array is returned as a contiguous + ``numpy.ndarray`` — keeps existing CPU-only pipelines working unchanged. + + Parameters + ---------- + zarr_path + Path to the OME-Zarr group. + level + Pyramid level to load (0 = full resolution). + use_gpu + If True and CuPy is available, return a device-resident ``cupy.ndarray`` + loaded through the GPU dispatcher. + + Returns + ------- + array : numpy.ndarray or cupy.ndarray + Fully-materialised volume. + scale : list of float + Voxel size for the requested pyramid level. + """ + vol, scale = read_omezarr(zarr_path, level=level) + if use_gpu: + # Resolve the on-disk path of the actual array (level subdirectory) so + # the GPU dispatcher can hit kvikio's raw-bytes fast path when possible. 
+ from linumpy.gpu.zarr_io import read_zarr_to_gpu + + array_path = resolve_omezarr_level_path(zarr_path, level=level) + return read_zarr_to_gpu(array_path), scale + return np.asarray(vol[:]), scale + + class OmeZarrWriter: """Write OME-Zarr files to disk in a pyramidal format, chunk by chunk.""" @@ -468,10 +493,18 @@ def finalize(self, res: list | tuple, n_levels: int = 5) -> None: for p, t in zip(paths, transformations, strict=False): datasets.append({"path": p, "coordinateTransformations": t}) - pyramid_kw = {"max_layer": n_levels, "method": "linear", "downscale": self.downscale_factor} + pyramid_kw = { + "max_layer": n_levels, + "method": "resize", + "downscale": self.downscale_factor, + } ome_zarr_version = version("ome-zarr") - metadata = {"method": "ome_zarr.scale.Scaler", "version": ome_zarr_version, "args": pyramid_kw} + metadata = { + "method": "linumpy.io.zarr.OmeZarrWriter._downsample_pyramid_on_disk", + "version": ome_zarr_version, + "args": pyramid_kw, + } write_multiscales_metadata(self.root, datasets, axes=self.axes, metadata=metadata) diff --git a/linumpy/mosaic/stacking.py b/linumpy/mosaic/stacking.py index e357bc71..63036ba3 100644 --- a/linumpy/mosaic/stacking.py +++ b/linumpy/mosaic/stacking.py @@ -96,7 +96,12 @@ def enforce_z_consistency( def find_z_overlap( - fixed_vol: np.ndarray, moving_vol: np.ndarray, slicing_interval_mm: float, search_range_mm: float, resolution_um: float + fixed_vol: np.ndarray, + moving_vol: np.ndarray, + slicing_interval_mm: float, + search_range_mm: float, + resolution_um: float, + use_gpu: bool = False, ) -> tuple[int, float]: """Find optimal Z-overlap between consecutive slices using cross-correlation. @@ -115,6 +120,8 @@ def find_z_overlap( Search range around expected position in mm. resolution_um : float Z resolution in microns per voxel. + use_gpu : bool + If True, run the NCC sweep on the GPU when CuPy is available. 
Returns ------- @@ -138,17 +145,29 @@ def find_z_overlap( y_slice = slice(margin, h - margin) x_slice = slice(margin, w - margin) + xp: Any = np + if use_gpu: + from linumpy.gpu import GPU_AVAILABLE + + if GPU_AVAILABLE: + import cupy as cp + + xp = cp + + fixed_xp = xp.asarray(fixed_vol[-max_overlap:, y_slice, x_slice].astype(np.float32)) + moving_xp = xp.asarray(moving_vol[:max_overlap, y_slice, x_slice].astype(np.float32)) + best_overlap = expected_overlap_vox - best_corr = -np.inf + best_corr = -float("inf") for overlap in range(min_overlap, max_overlap + 1): - fixed_region = fixed_vol[-overlap:, y_slice, x_slice] - moving_region = moving_vol[:overlap, y_slice, x_slice] + fixed_region = fixed_xp[-overlap:] + moving_region = moving_xp[:overlap] fixed_norm = (fixed_region - fixed_region.mean()) / (fixed_region.std() + 1e-8) moving_norm = (moving_region - moving_region.mean()) / (moving_region.std() + 1e-8) - corr = np.mean(fixed_norm * moving_norm) + corr = float((fixed_norm * moving_norm).mean()) if corr > best_corr: best_corr = corr best_overlap = overlap diff --git a/pyproject.toml b/pyproject.toml index f6b75bbf..a12c823c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ dependencies = [ "hyperactive", "basicpy", "zarr>=3.0.10", - "ome-zarr>=0.9.0", + "ome-zarr>=0.16.0", "napari", "napari-ome-zarr", "pynrrd", @@ -53,10 +53,16 @@ Repository = "https://github.com/linum-uqam/linumpy" [project.optional-dependencies] gpu = [ + "cupy-cuda13x>=13.0.0", +] +gpu-cuda12 = [ "cupy-cuda12x>=12.0.0", ] -gpu-cuda13 = [ - "cupy-cuda13x>=13.0.0", +gds = [ + "kvikio-cu13", +] +gds-cuda12 = [ + "kvikio-cu12", ] docs = [ "sphinx>=9.1", @@ -270,7 +276,7 @@ possibly-unresolved-reference = "error" # Optional GPU deps — not installed in the standard dev env. # Using replace-imports-with-any so downstream attribute/subscript access # on the imported module is typed as Any rather than causing cascading errors. 
-replace-imports-with-any = ["cupy.**", "cupyx.**", "numba.**"] +replace-imports-with-any = ["cupy.**", "cupyx.**", "numba.**", "kvikio.**"] # --- ty overrides --- diff --git a/scripts/linum_aip.py b/scripts/linum_aip.py index 23873d3c..b0d20e17 100644 --- a/scripts/linum_aip.py +++ b/scripts/linum_aip.py @@ -1,17 +1,22 @@ #!/usr/bin/env python3 -"""Compute the average intensity projection of a 3D zarr volume.""" +"""Compute the average intensity projection of a 3D zarr volume. + +Falls back to CPU if GPU is not available or --no-use_gpu is passed. +""" # Configure thread limits before numpy/scipy imports import linumpy.config.threads # noqa: F401 import argparse from pathlib import Path +from typing import Any import dask.array as da import numpy as np import zarr +from linumpy.gpu import GPU_AVAILABLE, is_cupy_array, print_gpu_info, to_cpu from linumpy.io.zarr import create_tempstore, read_omezarr, save_omezarr @@ -19,32 +24,25 @@ def _build_arg_parser() -> argparse.ArgumentParser: p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawTextHelpFormatter) p.add_argument("input_zarr", type=Path, help="Full path to the zarr volume.") p.add_argument("output_image", type=Path, default=None, help="Full path to the output zarr image") - + p.add_argument( + "--use_gpu", + default=True, + action=argparse.BooleanOptionalAction, + help="Use GPU acceleration if available. 
[%(default)s]", + ) + p.add_argument("--verbose", "-v", action="store_true", help="Print GPU information.") return p -def main() -> None: - """Run the average intensity projection script.""" - # Parse arguments - p = _build_arg_parser() - args = p.parse_args() - - # Parameters - input_file = Path(args.input_zarr) - output_file = Path(args.output_image) - - # Open the zarr volume - vol, resolution = read_omezarr(input_file, level=0) - - # Prepare the output +def _compute_aip(vol: Any, use_gpu: bool) -> tuple[zarr.Array, tuple]: + """Compute the per-tile mean and store it into a temporary 2D zarr array.""" shape = vol.shape[1:3] + tile_shape = vol.chunks zarr_store = create_tempstore(suffix=".zarr") _aip = zarr.open(zarr_store, mode="w", shape=shape, dtype=np.float32, chunks=vol.chunks[1:3]) assert isinstance(_aip, zarr.Array) aip = _aip - # Process every tile - tile_shape = vol.chunks nx = vol.shape[1] // tile_shape[1] ny = vol.shape[2] // tile_shape[2] for i in range(nx): @@ -53,8 +51,48 @@ def main() -> None: rmax = (i + 1) * tile_shape[1] cmin = j * tile_shape[2] cmax = (j + 1) * tile_shape[2] - tile = np.asarray(vol[:, rmin:rmax, cmin:cmax]).mean(axis=0) - aip[rmin:rmax, cmin:cmax] = tile + tile = vol[:, rmin:rmax, cmin:cmax] + if use_gpu: + import cupy as cp + + # Tile is already cupy when the reader is inside gpu_zarr_context; + # otherwise transfer it once. 
+ tile_gpu = tile if is_cupy_array(tile) else cp.asarray(np.asarray(tile)) + aip[rmin:rmax, cmin:cmax] = to_cpu(cp.mean(tile_gpu.astype(cp.float32), axis=0)) + del tile_gpu + else: + aip[rmin:rmax, cmin:cmax] = np.asarray(tile).mean(axis=0) + return aip, tile_shape + + +def main() -> None: + """Run the average intensity projection script.""" + p = _build_arg_parser() + args = p.parse_args() + + input_file = Path(args.input_zarr) + output_file = Path(args.output_image) + + use_gpu = args.use_gpu and GPU_AVAILABLE + + if args.verbose: + print_gpu_info() + if args.use_gpu and not GPU_AVAILABLE: + print("WARNING: GPU requested but not available, falling back to CPU") + elif use_gpu: + print("GPU: ENABLED") + else: + print("GPU: DISABLED (using CPU)") + + if use_gpu: + from linumpy.gpu.zarr_io import gpu_zarr_context + + with gpu_zarr_context(): + vol, resolution = read_omezarr(input_file, level=0) + aip, tile_shape = _compute_aip(vol, use_gpu=True) + else: + vol, resolution = read_omezarr(input_file, level=0) + aip, tile_shape = _compute_aip(vol, use_gpu=False) out_dask = da.from_zarr(aip) save_omezarr(out_dask, output_file, resolution[1:], tile_shape[1:]) diff --git a/scripts/linum_aip_png.py b/scripts/linum_aip_png.py index 23b56a2c..76356f3d 100755 --- a/scripts/linum_aip_png.py +++ b/scripts/linum_aip_png.py @@ -51,16 +51,19 @@ def compute_aip(vol: Any, use_gpu: bool = True) -> np.ndarray: cmin = j * tile_shape[2] cmax = (j + 1) * tile_shape[2] - tile = np.asarray(vol[:, rmin:rmax, cmin:cmax]) + tile = vol[:, rmin:rmax, cmin:cmax] if use_gpu: import cupy as cp - tile_gpu = cp.asarray(tile.astype(np.float32)) - aip[rmin:rmax, cmin:cmax] = to_cpu(cp.mean(tile_gpu, axis=0)) + # Slices may already be cupy when the read happens inside + # ``gpu_zarr_context`` (no extra H→D copy). Otherwise we + # transfer the host tile once. 
+                tile_gpu = tile if isinstance(tile, cp.ndarray) else cp.asarray(np.asarray(tile))
+                aip[rmin:rmax, cmin:cmax] = to_cpu(cp.mean(tile_gpu.astype(cp.float32), axis=0))
                 del tile_gpu
             else:
-                aip[rmin:rmax, cmin:cmax] = tile.mean(axis=0)
+                aip[rmin:rmax, cmin:cmax] = np.asarray(tile).mean(axis=0)
 
     if use_gpu:
         try:
@@ -127,8 +130,17 @@ def main() -> None:
     else:
         print("GPU: DISABLED (using CPU)")
 
-    vol, _ = read_omezarr(input_file, level=0)
-    aip = compute_aip(vol, use_gpu=use_gpu)
+    if use_gpu:
+        # Open the array inside the GPU context so each tile slice is read
+        # directly into device memory (no host round-trip per tile).
+        from linumpy.gpu.zarr_io import gpu_zarr_context
+
+        with gpu_zarr_context():
+            vol, _ = read_omezarr(input_file, level=0)
+            aip = compute_aip(vol, use_gpu=True)
+    else:
+        vol, _ = read_omezarr(input_file, level=0)
+        aip = compute_aip(vol, use_gpu=False)
 
     save_aip_png(aip, output_file)
 
diff --git a/scripts/linum_benchmark_kvikio_zarr.py b/scripts/linum_benchmark_kvikio_zarr.py
new file mode 100755
index 00000000..8147d6ff
--- /dev/null
+++ b/scripts/linum_benchmark_kvikio_zarr.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+r"""Benchmark zarr → GPU loading: kvikio (GDS) vs zarr+cupy vs zarr.enable_gpu.
+
+Four paths to get an uncompressed zarr v3 array onto the GPU are compared:
+
+    A) ``zarr.open(...)[:]`` → ``np.asarray`` → ``cupy.asarray`` (legacy path).
+    B) ``zarr.config.enable_gpu(); arr[:]`` returns a ``cupy.ndarray`` directly,
+       but the codec pipeline still decodes on the CPU before the H→D copy.
+    C) ``linumpy.gpu.kvikio_zarr.read_zarr_via_kvikio`` — kvikio CuFile / GDS.
+    D) ``linumpy.gpu.zarr_io.read_zarr_to_gpu`` — auto-dispatch (kvikio if GDS
+       native + uncompressed, else zarr-gpu).
+ +Reference numbers on a 16 GiB float32 zarr v3 (256³ chunks) on /scratch_nvme, +RTX A6000, ext4, GDS native mode (warm cache): + +* kvikio (GDS native): ~9.9 GiB/s +* zarr.config.enable_gpu: ~7.1 GiB/s +* zarr → numpy → cupy: ~2.8 GiB/s + +For production code, prefer :func:`linumpy.gpu.zarr_io.read_zarr_to_gpu` +which picks the best available path at runtime. + +Examples +-------- +Generate a 16 GiB random uncompressed zarr v3 on /scratch_nvme and bench:: + + linum_benchmark_kvikio_zarr.py \ + --generate /scratch_nvme/bench.zarr \ + --shape 2048 2048 1024 --chunks 256 256 128 --dtype float32 \ + --runs 3 +""" + +from __future__ import annotations + +import argparse +import contextlib +import sys +import time +from pathlib import Path + +import numpy as np + + +def _human_bytes(n: float) -> str: + units = ["B", "KiB", "MiB", "GiB", "TiB"] + i = 0 + while n >= 1024 and i < len(units) - 1: + n /= 1024 + i += 1 + return f"{n:.2f} {units[i]}" + + +def _generate_dataset(path: Path, shape: tuple[int, ...], chunks: tuple[int, ...], dtype: str) -> None: + import zarr + + print(f"[generate] shape={shape} chunks={chunks} dtype={dtype} path={path}") + if path.exists(): + raise SystemExit(f"refusing to overwrite existing {path}") + + # zarr v3 with raw bytes only (no compression) so the kvikio path works. 
+ arr = zarr.create_array( + store=str(path), + shape=shape, + chunks=chunks, + dtype=dtype, + compressors=None, + zarr_format=3, + fill_value=0, + ) + rng = np.random.default_rng(0) + z_step = chunks[0] + for z0 in range(0, shape[0], z_step): + z1 = min(z0 + z_step, shape[0]) + block = rng.standard_normal((z1 - z0, *shape[1:])).astype(dtype, copy=False) + arr[z0:z1] = block + print(f"[generate] done: {_human_bytes(arr.nbytes)} written") + + +def _bench_zarr_cupy(array_path: Path) -> tuple[float, int]: + import cupy + import zarr + + t0 = time.perf_counter() + z = zarr.open_array(str(array_path), mode="r") + host = np.asarray(z[:]) + dev = cupy.asarray(host) + cupy.cuda.Stream.null.synchronize() + t1 = time.perf_counter() + return t1 - t0, dev.nbytes + + +def _bench_zarr_gpu(array_path: Path) -> tuple[float, int]: + """zarr.config.enable_gpu(): host I/O, host decode, final buffer is CuPy.""" + import cupy + import zarr + + with zarr.config.enable_gpu(): + t0 = time.perf_counter() + z = zarr.open_array(str(array_path), mode="r") + dev = z[:] + cupy.cuda.Stream.null.synchronize() + t1 = time.perf_counter() + nbytes = int(np.prod(z.shape)) * np.dtype(z.dtype).itemsize + del dev # keep reference alive through synchronize + return t1 - t0, nbytes + + +def _bench_kvikio(array_path: Path) -> tuple[float, int]: + import cupy + + from linumpy.gpu.kvikio_zarr import read_zarr_via_kvikio + + t0 = time.perf_counter() + dev = read_zarr_via_kvikio(array_path) + cupy.cuda.Stream.null.synchronize() + t1 = time.perf_counter() + return t1 - t0, dev.nbytes + + +def _bench_auto(array_path: Path) -> tuple[float, int]: + import cupy + + from linumpy.gpu.zarr_io import read_zarr_to_gpu + + t0 = time.perf_counter() + dev = read_zarr_to_gpu(array_path) + cupy.cuda.Stream.null.synchronize() + t1 = time.perf_counter() + return t1 - t0, dev.nbytes + + +def _drop_caches() -> None: + """Best-effort page-cache drop (requires root). 
Silently skipped otherwise.""" + with contextlib.suppress(PermissionError, FileNotFoundError, OSError): + Path("/proc/sys/vm/drop_caches").write_text("3\n") + + +PATHS = { + "zarr+cupy": _bench_zarr_cupy, + "zarr-gpu": _bench_zarr_gpu, + "kvikio": _bench_kvikio, + "auto": _bench_auto, +} + + +def main() -> None: + """Run the benchmark from the command line.""" + p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + p.add_argument("array_path", type=Path, nargs="?", help="zarr array directory") + p.add_argument("--generate", type=Path, help="create a synthetic uncompressed zarr v3 at this path") + p.add_argument("--shape", type=int, nargs="+", default=[2048, 2048, 512]) + p.add_argument("--chunks", type=int, nargs="+", default=[256, 256, 128]) + p.add_argument("--dtype", default="float32") + p.add_argument("--runs", type=int, default=3) + p.add_argument("--skip-cache-drop", action="store_true", help="don't try to drop page cache between runs") + p.add_argument("--only", action="append", choices=list(PATHS), help="only benchmark these paths (repeatable)") + args = p.parse_args() + + if args.generate is not None: + _generate_dataset(args.generate, tuple(args.shape), tuple(args.chunks), args.dtype) + if args.array_path is None: + args.array_path = args.generate + + if args.array_path is None: + p.error("array_path is required (unless --generate also gives one)") + + selected = args.only or list(PATHS) + print(f"[bench] array={args.array_path} runs={args.runs}") + for name in selected: + fn = PATHS[name] + times: list[float] = [] + nbytes = 0 + for r in range(args.runs): + if not args.skip_cache_drop: + _drop_caches() + try: + dt, nbytes = fn(args.array_path) + except Exception as exc: + print(f" [{name}] run {r}: ERROR {exc!r}") + break + tput = nbytes / dt / (1024**3) + times.append(dt) + print(f" [{name}] run {r}: {dt:.3f}s {tput:.2f} GiB/s") + if times: + best = min(times) + print(f" [{name}] best: {best:.3f}s 
{nbytes / best / (1024**3):.2f} GiB/s") + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/linum_compensate_attenuation.py b/scripts/linum_compensate_attenuation.py index 94566964..cc075f27 100644 --- a/scripts/linum_compensate_attenuation.py +++ b/scripts/linum_compensate_attenuation.py @@ -1,9 +1,5 @@ #!/usr/bin/env python -""" -Compensate the tissue attenuation using a precomputed attenuation. - -bias field. -""" +"""Compensate the tissue attenuation using a precomputed attenuation bias field.""" # Configure thread limits before numpy/scipy imports import linumpy.config.threads # noqa: F401 diff --git a/scripts/linum_correct_bias_field.py b/scripts/linum_correct_bias_field.py index c217b8c7..46360137 100644 --- a/scripts/linum_correct_bias_field.py +++ b/scripts/linum_correct_bias_field.py @@ -29,7 +29,7 @@ n4_correct_per_section, ) from linumpy.intensity.normalization import apply_histogram_matching, apply_zprofile_smoothing -from linumpy.io.zarr import AnalysisOmeZarrWriter, read_omezarr +from linumpy.io.zarr import AnalysisOmeZarrWriter, read_omezarr_array logger = logging.getLogger(__name__) @@ -207,12 +207,9 @@ def main() -> None: n_processes = parse_processes_arg(args.n_processes) - # Load volume - vol_da, res = read_omezarr(args.in_image, level=0) - vol = np.asarray(vol_da).astype(np.float32) - logger.info("Loaded volume %s from %s", vol.shape, args.in_image) - - # Resolve GPU usage from --backend choice for non-N4 stages. + # Resolve GPU usage from --backend choice for non-N4 stages. We resolve + # this BEFORE reading so we can stream the volume directly into device + # memory through the GDS / zarr-gpu fast path when the GPU is in play. if args.backend == "gpu": use_gpu_pre = True elif args.backend == "auto": @@ -222,6 +219,11 @@ def main() -> None: else: use_gpu_pre = False + # Load volume — onto GPU directly when use_gpu_pre, else host. 
+ vol, res = read_omezarr_array(args.in_image, level=0, use_gpu=use_gpu_pre) + vol = vol.astype(np.float32) # works on both numpy and cupy arrays + logger.info("Loaded volume %s from %s (gpu=%s)", vol.shape, args.in_image, use_gpu_pre) + # Tissue mask (per serial section) mask = compute_tissue_mask( vol, @@ -260,7 +262,7 @@ def main() -> None: n4_kwargs = { "shrink_factor": args.shrink_factor, "n_iterations": args.n_iterations, - "voxel_size_mm": tuple(res), + "voxel_size_mm": (float(res[0]), float(res[1]), float(res[2])), "backend": args.backend, } diff --git a/scripts/linum_detect_focal_curvature.py b/scripts/linum_detect_focal_curvature.py index d09fd1c8..d9d8bff3 100644 --- a/scripts/linum_detect_focal_curvature.py +++ b/scripts/linum_detect_focal_curvature.py @@ -7,6 +7,7 @@ import argparse from pathlib import Path +from typing import Any import dask.array as da import numpy as np @@ -14,11 +15,10 @@ from basicpy import BaSiC from linumpy.geometry.interface import find_tissue_interface +from linumpy.gpu import GPU_AVAILABLE, to_cpu from linumpy.io.zarr import create_tempstore, read_omezarr, save_omezarr -# TODO: Replace by interpolation using deformation field -# TODO: optimize for full resolution data -# TODO: parallelize the correction +# TODO: Replace integer roll by sub-voxel interpolation using a deformation field def _build_arg_parser() -> argparse.ArgumentParser: @@ -36,6 +36,12 @@ def _build_arg_parser() -> argparse.ArgumentParser: "--sigma_z", type=float, default=2.0, help="Gaussian smoothing sigma in Z before interface detection [%(default)s]" ) p.add_argument("--use_log", action="store_true", help="Apply log transform before gradient detection") + p.add_argument( + "--use_gpu", + action=argparse.BooleanOptionalAction, + default=True, + help="Use GPU acceleration for the per-tile focal-plane shift if available [%(default)s].", + ) return p @@ -54,7 +60,13 @@ def main() -> None: dtype = vol.dtype data = np.moveaxis(np.asarray(vol), 0, -1) # Estimate 
the water-tissue interface - z0 = find_tissue_interface(np.abs(data), s_xy=args.sigma_xy, s_z=args.sigma_z, use_log=args.use_log) + z0 = find_tissue_interface( + np.abs(data), + s_xy=args.sigma_xy, + s_z=args.sigma_z, + use_log=args.use_log, + use_gpu=args.use_gpu, + ) # Extract the tile shape from the filename tile_shape = vol.chunks @@ -84,6 +96,20 @@ def main() -> None: # Apply the correction to a tile corr = ((flatfield - 1) * z0.mean()).astype(int) + use_gpu = args.use_gpu and GPU_AVAILABLE + if use_gpu: + import cupy as cp + + xp: Any = cp + else: + xp = np + + nz_full = vol.shape[0] + z_arange = xp.arange(nz_full)[:, None, None] + corr_xp = xp.asarray(corr) + # Per-(m, n) circular shift along Z, vectorised via take_along_axis. + # Equivalent to: tile[:, m, n] = roll(tile[:, m, n], -corr[m, n]). + temp_store = create_tempstore() _vol_corr = zarr.open(temp_store, mode="w", shape=vol.shape, dtype=dtype, chunks=tile_shape) assert isinstance(_vol_corr, zarr.Array) @@ -95,14 +121,11 @@ def main() -> None: rmax = (i + 1) * tile_shape[1] cmin = j * tile_shape[2] cmax = (j + 1) * tile_shape[2] - tile = np.asarray(vol[:, rmin:rmax, cmin:cmax]) - - # Apply the correction (shift the focal plane to flatten it) - for m in range(tile.shape[1]): - for n in range(tile.shape[2]): - tile[:, m, n] = np.roll(tile[:, m, n], -corr[m, n]) - - vol_corr[:, rmin:rmax, cmin:cmax] = tile + tile_np = np.asarray(vol[:, rmin:rmax, cmin:cmax]) + tile_xp = xp.asarray(tile_np) if use_gpu else tile_np + z_idx_tile = (z_arange + corr_xp[None, rmin:rmax, cmin:cmax]) % nz_full + tile_rolled = xp.take_along_axis(tile_xp, z_idx_tile, axis=0) + vol_corr[:, rmin:rmax, cmin:cmax] = to_cpu(tile_rolled) if use_gpu else tile_rolled # save to ome-zarr dask_arr = da.from_zarr(vol_corr) diff --git a/scripts/linum_generate_mosaic_aips.py b/scripts/linum_generate_mosaic_aips.py index 320cb455..c2513897 100755 --- a/scripts/linum_generate_mosaic_aips.py +++ b/scripts/linum_generate_mosaic_aips.py @@ -62,16 
+62,19 @@ def compute_aip(vol: Any, use_gpu: bool = True) -> np.ndarray: cmin = j * tile_shape[2] cmax = (j + 1) * tile_shape[2] - tile = np.asarray(vol[:, rmin:rmax, cmin:cmax]) + tile = vol[:, rmin:rmax, cmin:cmax] if use_gpu: import cupy as cp - tile_gpu = cp.asarray(tile.astype(np.float32)) - aip[rmin:rmax, cmin:cmax] = to_cpu(cp.mean(tile_gpu, axis=0)) + # Slices may already be cupy when the read happens inside + # ``gpu_zarr_context`` (no extra H→D copy). Otherwise we + # transfer the host tile once. + tile_gpu = tile if isinstance(tile, cp.ndarray) else cp.asarray(np.asarray(tile)) + aip[rmin:rmax, cmin:cmax] = to_cpu(cp.mean(tile_gpu.astype(cp.float32), axis=0)) del tile_gpu else: - aip[rmin:rmax, cmin:cmax] = tile.mean(axis=0) + aip[rmin:rmax, cmin:cmax] = np.asarray(tile).mean(axis=0) if use_gpu: try: @@ -166,12 +169,25 @@ def main() -> None: f"No mosaic grid files found in {input_dir}.\nExpected files matching 'mosaic_grid_3d_z*.ome.zarr'." ) - for mosaic_file in tqdm(mosaic_files, desc="Generating AIPs"): - slice_id = mosaic_file.name[len("mosaic_grid_3d_z") : -len(".ome.zarr")] - output_file = output_dir / f"aip_z{slice_id}.png" - vol, _ = read_omezarr(mosaic_file, level=args.level) - aip = compute_aip(vol, use_gpu=use_gpu) - save_aip_png(aip, output_file) + if use_gpu: + # Open each volume inside the GPU context so tile slices land on + # device memory directly (no host round-trip per tile). 
+ from linumpy.gpu.zarr_io import gpu_zarr_context + + for mosaic_file in tqdm(mosaic_files, desc="Generating AIPs"): + slice_id = mosaic_file.name[len("mosaic_grid_3d_z") : -len(".ome.zarr")] + output_file = output_dir / f"aip_z{slice_id}.png" + with gpu_zarr_context(): + vol, _ = read_omezarr(mosaic_file, level=args.level) + aip = compute_aip(vol, use_gpu=True) + save_aip_png(aip, output_file) + else: + for mosaic_file in tqdm(mosaic_files, desc="Generating AIPs"): + slice_id = mosaic_file.name[len("mosaic_grid_3d_z") : -len(".ome.zarr")] + output_file = output_dir / f"aip_z{slice_id}.png" + vol, _ = read_omezarr(mosaic_file, level=args.level) + aip = compute_aip(vol, use_gpu=False) + save_aip_png(aip, output_file) if __name__ == "__main__": diff --git a/scripts/linum_interpolate_missing_slice.py b/scripts/linum_interpolate_missing_slice.py index ecd9d7c9..15de1320 100644 --- a/scripts/linum_interpolate_missing_slice.py +++ b/scripts/linum_interpolate_missing_slice.py @@ -71,6 +71,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: " zmorph - z-aware morphing (respects serial-section geometry; recommended)\n" " average - Simple average of adjacent slices\n" " weighted - Weighted average with distance falloff\n" + "\n" "[default: %(default)s]", ) p.add_argument( @@ -80,6 +81,7 @@ def _build_arg_parser() -> argparse.ArgumentParser: help="Blending method for combining warped slices:\n" " linear - Equal 50/50 blend (may show edges)\n" " gaussian - Feathered blend using distance transform (recommended)\n" + "\n" "[default: %(default)s]", ) p.add_argument( diff --git a/scripts/linum_resample_mosaic_grid.py b/scripts/linum_resample_mosaic_grid.py index 329b0d41..f346266a 100644 --- a/scripts/linum_resample_mosaic_grid.py +++ b/scripts/linum_resample_mosaic_grid.py @@ -22,6 +22,7 @@ from linumpy.geometry.resampling import resolution_is_mm from linumpy.gpu import GPU_AVAILABLE, print_gpu_info from linumpy.gpu.interpolation import resize +from linumpy.gpu.zarr_io 
import gpu_zarr_context from linumpy.io import OmeZarrWriter, read_omezarr @@ -66,8 +67,12 @@ def rescale(image: Any, scale: float | Sequence[float], order: int = 1, use_gpu: def _read_tile(vol: Any, i: Any, j: Any, tile_shape: Any) -> Any: - """Read one tile from the input zarr array (I/O stage of the pipeline).""" - return np.asarray(vol[:, i * tile_shape[1] : (i + 1) * tile_shape[1], j * tile_shape[2] : (j + 1) * tile_shape[2]]) + """Read one tile from the input zarr array (I/O stage of the pipeline). + + Returns whatever array type the zarr backend yields: NumPy by default, or + CuPy when the volume was opened inside :func:`gpu_zarr_context`. + """ + return vol[:, i * tile_shape[1] : (i + 1) * tile_shape[1], j * tile_shape[2] : (j + 1) * tile_shape[2]] def _run_pipelined( @@ -90,15 +95,10 @@ def _run_pipelined( if not tile_iter: return - cp: Any = None - cupy_available = False - if use_gpu: - try: - import cupy as cp - - cupy_available = True - except Exception: - pass + # Note: do NOT periodically call cp.get_default_memory_pool().free_all_blocks(). + # All tiles are the same size, so the pool reuses GPU buffers across iterations. + # Flushing forces cudaMalloc on the next tile and stalls the GPU; CuPy already + # retries with free_all_blocks() automatically on OOM. 
with ThreadPoolExecutor(max_workers=1) as prefetch_executor: i0, j0 = tile_iter[0] @@ -116,9 +116,6 @@ def _run_pipelined( :, i * out_tile_shape[1] : (i + 1) * out_tile_shape[1], j * out_tile_shape[2] : (j + 1) * out_tile_shape[2] ] = resampled - if cupy_available and cp is not None and k % 10 == 9: - cp.get_default_memory_pool().free_all_blocks() - def main() -> None: """Run function.""" @@ -172,11 +169,24 @@ def main() -> None: out_shape = (out_tile_shape[0], nx * out_tile_shape[1], ny * out_tile_shape[2]) print(f" Output shape: {out_shape} ({total_tiles} tiles)") - out_zarr = OmeZarrWriter(args.out_mosaic, out_shape, out_tile_shape, dtype=vol.dtype, overwrite=True) - tile_iter = list(itertools.product(range(nx), range(ny))) - _run_pipelined(vol, out_zarr, tile_iter, tile_shape, out_tile_shape, scaling_factor, use_gpu) + if use_gpu: + # Open BOTH the input volume and the output writer inside the GPU zarr + # context. That way per-tile reads land directly on device and per-tile + # writes accept cupy arrays through zarr's GPU buffer prototype, so the + # resample loop is fully device-resident: + # read tile -> cupy gaussian + zoom -> write cupy slice + with gpu_zarr_context(): + vol_gpu, _ = read_omezarr(args.in_mosaic) + out_zarr = OmeZarrWriter(args.out_mosaic, out_shape, out_tile_shape, dtype=vol.dtype, overwrite=True) + _run_pipelined(vol_gpu, out_zarr, tile_iter, tile_shape, out_tile_shape, scaling_factor, use_gpu) + else: + out_zarr = OmeZarrWriter(args.out_mosaic, out_shape, out_tile_shape, dtype=vol.dtype, overwrite=True) + _run_pipelined(vol, out_zarr, tile_iter, tile_shape, out_tile_shape, scaling_factor, use_gpu) + + # Pyramid build reads the just-written level0 back via dask; run it outside + # the GPU context so dask uses the normal numpy buffer prototype. 
print("Building pyramid...") out_res = [target_res] * 3 out_zarr.finalize(out_res, args.n_levels) diff --git a/scripts/linum_stack_slices_motor.py b/scripts/linum_stack_slices_motor.py index b2884358..709a8633 100644 --- a/scripts/linum_stack_slices_motor.py +++ b/scripts/linum_stack_slices_motor.py @@ -288,6 +288,13 @@ def _build_arg_parser() -> argparse.ArgumentParser: "Default: none (use only automated transforms).", ) + p.add_argument( + "--use_gpu", + action=argparse.BooleanOptionalAction, + default=True, + help="Use GPU acceleration for the Z-overlap NCC sweep when available [%(default)s].", + ) + add_overwrite_arg(p) return p @@ -940,7 +947,9 @@ def main() -> None: else: # find_z_overlap expects resolution in µm for its internal calculation res_z_um = res_z_mm * 1000 - overlap, corr = find_z_overlap(prev_vol, vol, args.slicing_interval_mm, args.search_range_mm, res_z_um) + overlap, corr = find_z_overlap( + prev_vol, vol, args.slicing_interval_mm, args.search_range_mm, res_z_um, use_gpu=args.use_gpu + ) # Fall back to expected overlap when correlation is too low to trust if args.z_overlap_min_corr > 0 and corr < args.z_overlap_min_corr: interval_voxels = int(args.slicing_interval_mm / res_z_mm) diff --git a/shell_scripts/server_setup/nvfs_kernel7_patch.sh b/shell_scripts/server_setup/nvfs_kernel7_patch.sh new file mode 100755 index 00000000..e5226053 --- /dev/null +++ b/shell_scripts/server_setup/nvfs_kernel7_patch.sh @@ -0,0 +1,169 @@ +#!/bin/bash +# Patch nvidia-fs 2.28.4 to build against Linux kernel 7.0 (Ubuntu 26.04). +# +# REMOVAL TRIGGER +# --------------- +# Delete this script as soon as nvidia-fs-dkms ships an upstream version +# that builds cleanly on the running kernel. 
Quick check:
+#
+# sudo dkms add /usr/src/nvidia-fs-<version> # try the stock sources
+# sudo dkms build nvidia-fs/<version> # if this succeeds, drop the patch
+#
+# Three upstream API changes this patch works around:
+# 1) struct vm_area_struct: __vm_flags removed, vm_flags is now a const
+# field of an anonymous union (writes still go through vm_flags_set).
+# Fix: read via vma->vm_flags.
+# 2) struct blk_dma_iter::iter is now struct blk_map_iter (was
+# struct req_iterator). They share .iter (bvec_iter) and .bio in the
+# same order, so we add a configure probe HAVE_BLK_MAP_ITER and
+# typedef nvfs_req_iter_t accordingly. Helper signatures use the typedef.
+# 3) create_nv.symvers.sh only decompressed *.ko.xz; Ubuntu 26.04 ships
+# kernel modules as *.ko.zst, so nm fails and Module.symvers ends up
+# empty. Without CRCs the runtime symbol-version check on
+# nvidia_p2p_dma_unmap_pages fails and nvidia-fs falls back to the
+# no-symver path -> GDS stays in compat mode. Fix: add a *.ko.zst
+# branch that decompresses with zstd before nm runs.
+#
+# USAGE (server-side, as root)
+# ----------------------------
+# sudo bash nvfs_kernel7_patch.sh
+# sudo dkms remove nvidia-fs/2.28.4 --all 2>/dev/null || true
+# sudo dkms add /usr/src/nvidia-fs-2.28.4
+# sudo dkms build nvidia-fs/2.28.4
+# sudo dkms install nvidia-fs/2.28.4
+# sudo modprobe nvidia-fs
+# /usr/local/cuda/gds/tools/gdscheck -p
+#
+# The script is idempotent (creates *.orig backups, skips reapplying if the
+# probe is already present). Safe to re-run after kernel header upgrades.
+
+set -e
+SRC=${NVFS_SRC:-/usr/src/nvidia-fs-2.28.4}
+if [ ! -d "$SRC" ]; then
+ echo "ERROR: $SRC not found. Set NVFS_SRC=/usr/src/nvidia-fs-<version> if version differs."
>&2 + exit 2 +fi +cd "$SRC" + +# Idempotency: keep one .orig backup +for f in nvfs-mmap.c nvfs-dma.c configure create_nv.symvers.sh; do + [ -f "$f.orig" ] || cp -p "$f" "$f.orig" +done + +# --- nvfs-mmap.c: drop ACCESS_PRIVATE(__vm_flags) -------------------------- +# kernel 7.0 dropped the __vm_flags private name; vm_flags is the public +# (const) field name again. +python3 - <<'PY' +from pathlib import Path +p = Path("nvfs-mmap.c") +s = p.read_text() +old = "\tvm_flags = ACCESS_PRIVATE(vma, __vm_flags);" +new = "\tvm_flags = vma->vm_flags;" +assert old in s, "expected ACCESS_PRIVATE line not found" +p.write_text(s.replace(old, new, 1)) +print("nvfs-mmap.c: patched vm_flags read") +PY + +# --- configure: add HAVE_BLK_MAP_ITER probe -------------------------------- +python3 - <<'PY' +from pathlib import Path +p = Path("configure") +s = p.read_text() +marker = 'if compile_prog "Checking if blk_rq_dma_map_iter_start is present..."; then\n output_sym "HAVE_BLK_RQ_DMA_MAP_ITER_START"\nfi\n' +addition = ''' +cat > $TEST_C < +#include +#include +#include "test.h" + +int test (void) +{ + struct blk_map_iter iter; + (void)iter; + return 0; +} + +EOF +if compile_prog "Checking if struct blk_map_iter is present..."; then + output_sym "HAVE_BLK_MAP_ITER" +fi +''' +if "HAVE_BLK_MAP_ITER" in s: + print("configure: HAVE_BLK_MAP_ITER probe already present") +else: + assert marker in s, "anchor not found in configure" + s = s.replace(marker, marker + addition, 1) + p.write_text(s) + print("configure: added HAVE_BLK_MAP_ITER probe") +PY + +# --- nvfs-dma.c: typedef nvfs_req_iter_t and rewire V2 path ---------------- +python3 - <<'PY' +from pathlib import Path +p = Path("nvfs-dma.c") +s = p.read_text() + +# Insert typedef just after the V2 #ifdef +anchor = "// V2 ops implementations for kernels with iterator API\n#ifdef HAVE_BLK_RQ_DMA_MAP_ITER_START\n" +typedef_block = ''' +/* + * Kernel 6.16+ (e.g. 
7.0) replaced struct req_iterator inside struct + * blk_dma_iter with struct blk_map_iter. Both layouts share .iter + * (bvec_iter) and .bio in the same order, so the helpers below only + * use those two fields. + */ +#ifdef HAVE_BLK_MAP_ITER +typedef struct blk_map_iter nvfs_req_iter_t; +#else +typedef struct req_iterator nvfs_req_iter_t; +#endif + +''' +if "nvfs_req_iter_t" not in s: + assert anchor in s, "V2 #ifdef anchor not found" + s = s.replace(anchor, anchor + typedef_block, 1) + print("nvfs-dma.c: inserted typedef") + +# Replace `struct req_iterator` with the typedef *only* inside the V2 path +# (i.e. between the V2 #ifdef and the matching #endif). The V1 path lives +# above this block and still expects struct req_iterator. +v2_start = s.index("// V2 ops implementations for kernels with iterator API\n#ifdef HAVE_BLK_RQ_DMA_MAP_ITER_START\n") +# match closing #endif that terminates the V2 section: it's the last #endif in file +# but to be safe locate the next "#endif /* HAVE_BLK_RQ_DMA_MAP_ITER_START */" +# Try canonical first, then fall back to last #endif before EOF. 
+end_marker = "#endif /* HAVE_BLK_RQ_DMA_MAP_ITER_START */" +if end_marker in s: + v2_end = s.index(end_marker, v2_start) +else: + v2_end = s.rfind("#endif") + +before = s[:v2_start] +v2 = s[v2_start:v2_end] +after = s[v2_end:] + +new_v2 = v2.replace("struct req_iterator", "nvfs_req_iter_t") +if new_v2 != v2: + s = before + new_v2 + after + p.write_text(s) + print("nvfs-dma.c: rewired V2 path to nvfs_req_iter_t") +else: + print("nvfs-dma.c: no further substitution needed (already patched)") +PY + +# --- create_nv.symvers.sh: handle *.ko.zst (Ubuntu 26.04) ------------------ +python3 - <<'PY' +from pathlib import Path +p = Path("create_nv.symvers.sh") +s = p.read_text() +anchor = '''\tcase "$nvidia_mod" in\n\t\t*ko.xz)\n\t\t\t/bin/cp -fv $nvidia_mod .\n\t\t\tnvidia_mod=$(basename $nvidia_mod | sed -e "s/.xz//g")\n\t\t\txz -d ${nvidia_mod}.xz\n\t\t\t;;\n\tesac''' +addition = '''\tcase "$nvidia_mod" in\n\t\t*ko.xz)\n\t\t\t/bin/cp -fv $nvidia_mod .\n\t\t\tnvidia_mod=$(basename $nvidia_mod | sed -e "s/.xz//g")\n\t\t\txz -d ${nvidia_mod}.xz\n\t\t\t;;\n\t\t*ko.zst)\n\t\t\t/bin/cp -fv $nvidia_mod .\n\t\t\tnvidia_mod=$(basename $nvidia_mod | sed -e "s/.zst//g")\n\t\t\tzstd -d --rm ${nvidia_mod}.zst\n\t\t\t;;\n\tesac''' +if "*ko.zst" in s: + print("create_nv.symvers.sh: zst branch already present") +else: + assert anchor in s, "xz case anchor not found in create_nv.symvers.sh" + p.write_text(s.replace(anchor, addition, 1)) + print("create_nv.symvers.sh: added zst decompression branch") +PY + +echo "patch done" diff --git a/uv.lock b/uv.lock index b93b7436..4c96bff3 100644 --- a/uv.lock +++ b/uv.lock @@ -1551,6 +1551,36 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b5/91/53255615acd2a1eaca307ede3c90eb550bae9c94581f8c00081b6b1c8f44/kiwisolver-1.5.0-graalpy312-graalpy250_312_native-win_amd64.whl", hash = "sha256:1f1489f769582498610e015a8ef2d36f28f505ab3096d0e16b4858a9ec214f57", size = 75987, upload-time = "2026-03-09T13:15:39.65Z" }, ] +[[package]] +name = 
"kvikio-cu12" +version = "26.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cupy-cuda12x" }, + { name = "libkvikio-cu12" }, + { name = "numpy" }, + { name = "packaging" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/24/95abc4335e8db5b8e140dbe7bdaa9bd7c8620cc087b33ccf748f474c43b0/kvikio_cu12-26.4.0-cp311-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72cf92d7786b9cb34c60d616a25ef1c802ee465ebc20dd4b5ca2a365b409be16", size = 587150, upload-time = "2026-04-09T11:38:07.743Z" }, + { url = "https://files.pythonhosted.org/packages/1f/2e/08785cce15da72c9185285b8933d0a14a040f2350241beeb92c7a3dd8ee1/kvikio_cu12-26.4.0-cp311-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:58fa8aa9782f5e426c0568a05cbf232946e80f89efc6d5ef0eaec78f0771580a", size = 626286, upload-time = "2026-04-09T11:25:20.927Z" }, +] + +[[package]] +name = "kvikio-cu13" +version = "26.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cupy-cuda13x" }, + { name = "libkvikio-cu13" }, + { name = "numpy" }, + { name = "packaging" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/4c/ef90121df05681f697dcb5a7629b270406189b5c69ec3fb36b039d699252/kvikio_cu13-26.4.0-cp311-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f0f9ca47618d8c258456faf763e5c7a1c2dc6e94ba1cea7a7e87439c8aad9cbd", size = 587146, upload-time = "2026-04-09T11:37:19.209Z" }, + { url = "https://files.pythonhosted.org/packages/79/8c/350a0d40ae61875cd6bea3bc74cd6938d6f499bdaa92634e681d665dc3ca/kvikio_cu13-26.4.0-cp311-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b261b23fda12b16bacef1a27cea4f1a2e676e0284f5430ea8bf38f4fb8a69c30", size = 626286, upload-time = "2026-04-09T11:24:47.952Z" }, +] + [[package]] name = "lazy-loader" version = "0.5" @@ -1563,6 +1593,24 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/8a/a1/8d812e53a5da1687abb10445275d41a8b13adb781bbf7196ddbcf8d88505/lazy_loader-0.5-py3-none-any.whl", hash = "sha256:ab0ea149e9c554d4ffeeb21105ac60bed7f3b4fd69b1d2360a4add51b170b005", size = 8044, upload-time = "2026-03-06T15:45:07.668Z" }, ] +[[package]] +name = "libkvikio-cu12" +version = "26.4.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5e/1a/237ad2d561e683eff68217c5b862d52f90ed4cb64d376681cb0f65a1f521/libkvikio_cu12-26.4.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:0c051a60322877785ce37f5b1229902a5af3099c2180dd20c68a27c8dc55b9cf", size = 2218685, upload-time = "2026-04-09T11:26:41.1Z" }, + { url = "https://files.pythonhosted.org/packages/c6/15/62e5947e72aeeafdcaf8ebaf8e352ec505d445634514fd939b56c44cc467/libkvikio_cu12-26.4.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:1334b5210d3b05d760085cd3575af96cfe3dc2717065e8532bdcc7c9b195233b", size = 2367209, upload-time = "2026-04-09T11:20:56.055Z" }, +] + +[[package]] +name = "libkvikio-cu13" +version = "26.4.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/74/a6/146d2101afbb768e6aed5cca77822706355873b4cc69db17d9b7e74ed415/libkvikio_cu13-26.4.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:cadac4668d426d11637211f04714f3df0e4ac37a4bde603575f57efeacc1f652", size = 2218592, upload-time = "2026-04-09T11:25:55.48Z" }, + { url = "https://files.pythonhosted.org/packages/ed/d6/612a5015a719705d682b27f0075233068bc81d9ea38ba4bd5f3e21296bd4/libkvikio_cu13-26.4.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:3e0fd921efc7407f8319b4accbd4dc2c1ae65a05853d4e11d68b61e4f3013140", size = 2367571, upload-time = "2026-04-09T11:20:34.086Z" }, +] + [[package]] name = "linkify-it-py" version = "2.1.0" @@ -1618,12 +1666,18 @@ docs = [ { name = "sphinxcontrib-mermaid" }, { name = "sphinxext-opengraph" }, ] -gpu = [ - { name = 
"cupy-cuda12x" }, +gds = [ + { name = "kvikio-cu13" }, ] -gpu-cuda13 = [ +gds-cuda12 = [ + { name = "kvikio-cu12" }, +] +gpu = [ { name = "cupy-cuda13x" }, ] +gpu-cuda12 = [ + { name = "cupy-cuda12x" }, +] [package.dev-dependencies] dev = [ @@ -1641,11 +1695,13 @@ dev = [ [package.metadata] requires-dist = [ { name = "basicpy" }, - { name = "cupy-cuda12x", marker = "extra == 'gpu'", specifier = ">=12.0.0" }, - { name = "cupy-cuda13x", marker = "extra == 'gpu-cuda13'", specifier = ">=13.0.0" }, + { name = "cupy-cuda12x", marker = "extra == 'gpu-cuda12'", specifier = ">=12.0.0" }, + { name = "cupy-cuda13x", marker = "extra == 'gpu'", specifier = ">=13.0.0" }, { name = "dask", specifier = ">=2024.10.0" }, { name = "dipy" }, { name = "hyperactive" }, + { name = "kvikio-cu12", marker = "extra == 'gds-cuda12'" }, + { name = "kvikio-cu13", marker = "extra == 'gds'" }, { name = "linkify-it-py", marker = "extra == 'docs'", specifier = ">=2.1" }, { name = "matplotlib" }, { name = "myst-parser", marker = "extra == 'docs'", specifier = ">=5.0" }, @@ -1654,7 +1710,7 @@ requires-dist = [ { name = "nibabel" }, { name = "numcodecs" }, { name = "numpy" }, - { name = "ome-zarr", specifier = ">=0.9.0" }, + { name = "ome-zarr", specifier = ">=0.16.0" }, { name = "pandas" }, { name = "pqdm" }, { name = "pydata-sphinx-theme", marker = "extra == 'docs'", specifier = ">=0.17" }, @@ -1676,7 +1732,7 @@ requires-dist = [ { name = "tqdm" }, { name = "zarr", specifier = ">=3.0.10" }, ] -provides-extras = ["gpu", "gpu-cuda13", "docs"] +provides-extras = ["gpu", "gpu-cuda12", "gds", "gds-cuda12", "docs"] [package.metadata.requires-dev] dev = [ diff --git a/workflows/reconst_3d/nextflow.config b/workflows/reconst_3d/nextflow.config index b60728f0..f7cafa5b 100644 --- a/workflows/reconst_3d/nextflow.config +++ b/workflows/reconst_3d/nextflow.config @@ -391,7 +391,16 @@ process { withName: "resample_mosaic_grid" { scratch = false - // Allow parallel mask creation on GPU + // Parallel resampling on 
GPU. Peak ~9 GB/fork (Gaussian intermediate at + // 2x tile size). With 2x A6000 (48 GB each), 6 forks = 3/GPU = ~27 GB, + // leaving headroom for fix_illumination running concurrently. + maxForks = params.use_gpu ? 6 : null + } + + withName: "fix_focal_curvature" { + // GPU path is light (uniform/gaussian filter on the volume + per-tile + // take_along_axis). Memory dominated by the volume; a few forks share + // the GPU comfortably. maxForks = params.use_gpu ? 4 : null } diff --git a/workflows/reconst_3d/soct_3d_reconst.nf b/workflows/reconst_3d/soct_3d_reconst.nf index 65a8f361..3dbac57b 100644 --- a/workflows/reconst_3d/soct_3d_reconst.nf +++ b/workflows/reconst_3d/soct_3d_reconst.nf @@ -145,8 +145,9 @@ process fix_focal_curvature { tuple val(slice_id), path("mosaic_grid_z${slice_id}_focal_fix.ome.zarr") script: + def gpu_flag = params.use_gpu ? "--use_gpu" : "--no-use_gpu" """ - linum_detect_focal_curvature.py ${mosaic_grid} "mosaic_grid_z${slice_id}_focal_fix.ome.zarr" + linum_detect_focal_curvature.py ${mosaic_grid} "mosaic_grid_z${slice_id}_focal_fix.ome.zarr" ${gpu_flag} """ stub: @@ -675,6 +676,7 @@ process stack { path("stacking_decisions.csv"), optional: true, emit: stacking_decisions script: + def gpu_flag = params.use_gpu ? " --use_gpu" : " --no-use_gpu" def options = Helpers.stackBlendingArgs(params) + Helpers.stackZMatchingArgs(params) + Helpers.stackPairwiseTransformArgs(params) + @@ -683,6 +685,7 @@ process stack { Helpers.stackCumulativeArgs(params) + Helpers.stackSmoothingArgs(params) + " --no_xy_shift" + // slices are already in common space + gpu_flag + Helpers.pyramidArgs(params) def annotated_args = Helpers.annotatedScreenshotArgs(params, slice_ids_str)