From 14e62d17819bda7885dea8d4a9277efe37e1e392 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 10:52:44 -0400 Subject: [PATCH 01/10] feat: implement chunk-aligned indexing on ManifestArray MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Integer and slice indexers now subset a ManifestArray when they align with chunk boundaries (closes #51, supersedes #499). Misaligned selections raise SubChunkIndexingError — an IndexError subclass, not NotImplementedError, since splitting compressed chunks without loading the underlying data is a permanent constraint of a virtual array, not a missing feature. Previously slice misalignment silently no-op'd while ints raised NotImplementedError. --- docs/releases.md | 13 ++ virtualizarr/manifests/indexing.py | 154 ++++++++++++++---- .../tests/test_manifests/test_array.py | 133 ++++++--------- 3 files changed, 182 insertions(+), 118 deletions(-) diff --git a/docs/releases.md b/docs/releases.md index 0f1ce1589..d835e71f5 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -1,5 +1,18 @@ # Release notes +## Unreleased + +### New Features + +- `ManifestArray` now supports chunk-aligned integer and slice indexing along each axis, including integer indexing on length-1 chunked axes, multi-chunk slices, mixed integer + slice indexers, and selections that include a partial final chunk. Indexers that would split individual chunks raise a new `SubChunkIndexingError` (an `IndexError` subclass) — this is a permanent constraint of a virtual array, not a missing feature. Previously slice misalignment silently no-op'd while integer indexing unconditionally raised `NotImplementedError`. Closes [#51](https://github.com/zarr-developers/VirtualiZarr/issues/51), supersedes [#499](https://github.com/zarr-developers/VirtualiZarr/pull/499). + By [Tom Nicholas](https://github.com/TomNicholas). + +### Bug fixes + +### Documentation + +### Internal changes + ## v2.6.1 (3rd May 2026) Adds end-to-end support for inlined chunk references in `ChunkManifest` (read via Kerchunk parsers, write via Kerchunk and Icechunk writers), plus Zarr-Python 3.2.0 compatibility and several bug fixes. diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index 5d69e1f97..11b10f720 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -4,6 +4,8 @@ import numpy as np from virtualizarr.manifests.array_api import expand_dims +from virtualizarr.manifests.manifest import ChunkManifest +from virtualizarr.manifests.utils import copy_and_replace_metadata # indexer with only basic selectors, no new axes or ellipsis T_BasicIndexer_1d: TypeAlias = int | slice | np.ndarray @@ -14,6 +16,18 @@ T_Indexer: TypeAlias = T_Indexer_1d | tuple[T_Indexer_1d, ...] +class SubChunkIndexingError(IndexError): + """ + Raised when an indexer would split individual chunks of a ManifestArray. + + This is not a NotImplementedError: a ManifestArray only knows where each + chunk's bytes live, never their values, so any selection that crosses into + the interior of a compressed chunk would require loading the data — which + defeats the point of a virtual array. This is a permanent constraint, not + a missing feature. + """ + + if TYPE_CHECKING: from virtualizarr.manifests.array import ManifestArray @@ -138,56 +152,124 @@ def apply_indexer( def apply_selection( marr: "ManifestArray", indexer_without_newaxes: tuple[T_BasicIndexer_1d, ...] ) -> "ManifestArray": - """Applies indexes to subset along each dimension.""" + """ + Apply chunk-aligned subsetting along each dimension. + + Slices and integer indexers are only supported if they align with chunk boundaries, since + splitting individual chunks would require loading their bytes. See GitHub issue #51. + """ + from virtualizarr.manifests.array import ManifestArray # at this point there should be no ellipsis, no Nones, and one 1D indexer for each axis. assert len(indexer_without_newaxes) == marr.ndim - output_arr = marr - for axis, (length, indexer_1d) in enumerate( - zip(marr.shape, indexer_without_newaxes) + # validate types and reject anything we can never support per-axis + for indexer_1d in indexer_without_newaxes: + if isinstance(indexer_1d, np.ndarray): + raise NotImplementedError( + f"Unsupported indexer. So-called 'fancy indexing' via numpy arrays is not supported, but received {indexer_1d}" + ) + if not isinstance(indexer_1d, (int, slice)): + raise TypeError(f"Invalid indexer type: {indexer_1d}") + + new_shape: list[int] = [] + chunk_grid_slices: list[slice] = [] + for axis_length, chunk_size, indexer_1d in zip( + marr.shape, marr.chunks, indexer_without_newaxes, strict=True ): - output_arr = apply_selection_1d(output_arr, indexer_1d, length) + chunk_grid_slice, new_axis_length = _compute_chunk_aligned_selection_1d( + indexer_1d, axis_length=axis_length, chunk_size=chunk_size + ) + chunk_grid_slices.append(chunk_grid_slice) + new_shape.append(new_axis_length) - return output_arr + chunk_grid_slices_tuple = tuple(chunk_grid_slices) + # short-circuit if every axis selects the whole chunk grid (a no-op) + if all( + cgs == slice(0, dim, 1) + for cgs, dim in zip(chunk_grid_slices_tuple, marr.manifest.shape_chunk_grid) + ): + return marr + + new_manifest = _subset_manifest(marr.manifest, chunk_grid_slices_tuple) + new_metadata = copy_and_replace_metadata(marr.metadata, new_shape=new_shape) + return ManifestArray(metadata=new_metadata, chunkmanifest=new_manifest) -def apply_selection_1d( - marr: "ManifestArray", indexer_1d: T_BasicIndexer_1d, length: int -) -> "ManifestArray": - """ - Actually index the ManifestArray along 1 dimension. - Notice that none of these options actually do any indexing right now! +def _compute_chunk_aligned_selection_1d( + indexer_1d: int | slice, axis_length: int, chunk_size: int +) -> tuple[slice, int]: """ + Translate a 1D array-space indexer (int or slice) into a chunk-grid slice plus the new axis length. - if isinstance(indexer_1d, slice): - if slice_is_no_op(indexer_1d, axis_length=length): - pass - else: - NotImplementedError( - f"Unsupported indexer. Indexing within a ManifestArray using ints or slices is not yet supported (see GitHub issue #51), but received {indexer_1d}" + Raises SubChunkIndexingError if the selection would require splitting individual chunks. + """ + if isinstance(indexer_1d, int): + # Integer indexing is treated as a length-1 slice; we don't drop dimensions because that + # would require loading the underlying data for the array-API conformance reasons that + # ManifestArray operates on chunk references rather than values. + i = indexer_1d + if i < 0: + i += axis_length + if not (0 <= i < axis_length): + raise IndexError( + f"index {indexer_1d} is out of bounds for axis with size {axis_length}" ) - elif isinstance(indexer_1d, int): - # TODO cover possibility of indexing into a length-1 dimension (which just removes that dimension)? - raise NotImplementedError( - f"Unsupported indexer. Indexing within a ManifestArray using ints or slices is not yet supported (see GitHub issue #51), but received {indexer_1d}" - ) - elif isinstance(indexer_1d, np.ndarray): - raise NotImplementedError( - f"Unsupported indexer. So-called 'fancy indexing' via numpy arrays is not supported, but received {indexer_1d}" - ) + start, stop, step = i, i + 1, 1 else: - # should never get here - raise TypeError(f"Invalid indexer type: {indexer_1d}") + start, stop, step = indexer_1d.indices(axis_length) + + if step != 1: + raise SubChunkIndexingError( + f"step != 1 is not supported for chunk-aligned indexing, got step={step}" + ) - return marr + if start % chunk_size != 0: + raise SubChunkIndexingError( + f"Cannot index ManifestArray axis of length {axis_length} and chunk length " + f"{chunk_size} with {indexer_1d!r}: slice would split individual chunks, " + "which a ManifestArray cannot do without loading the underlying data." + ) + # The final chunk may legitimately be partial, so allow stop == axis_length even if + # axis_length % chunk_size != 0; otherwise stop must land on a chunk boundary. + if stop != axis_length and stop % chunk_size != 0: + raise SubChunkIndexingError( + f"Cannot index ManifestArray axis of length {axis_length} and chunk length " + f"{chunk_size} with {indexer_1d!r}: slice would split individual chunks, " + "which a ManifestArray cannot do without loading the underlying data." + ) -def slice_is_no_op(slice_indexer_1d: slice, axis_length: int) -> bool: - if slice_indexer_1d == slice(None): - return True - elif slice_indexer_1d == slice(0, axis_length, 1): - return True + chunk_start = start // chunk_size + # ceil-divide so that a partial final chunk is included when stop == axis_length + chunk_stop = -(-stop // chunk_size) + return slice(chunk_start, chunk_stop, 1), stop - start + + +def _subset_manifest( + manifest: ChunkManifest, chunk_grid_slices: tuple[slice, ...] +) -> ChunkManifest: + """Subset a ChunkManifest by slicing its underlying chunk-grid arrays.""" + new_paths = manifest._paths[chunk_grid_slices] + new_offsets = manifest._offsets[chunk_grid_slices] + new_lengths = manifest._lengths[chunk_grid_slices] + + if manifest._inlined: + starts = tuple(s.start for s in chunk_grid_slices) + stops = tuple(s.stop for s in chunk_grid_slices) + new_inlined = { + tuple(idx - start for idx, start in zip(coords, starts)): data + for coords, data in manifest._inlined.items() + if all(start <= idx < stop for idx, start, stop in zip(coords, starts, stops)) + } else: - return False + new_inlined = {} + + return ChunkManifest.from_arrays( + paths=new_paths, + offsets=new_offsets, + lengths=new_lengths, + inlined=new_inlined, + validate_paths=False, + ) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index ba46e0019..a7feb0e80 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -7,6 +7,7 @@ ZLIB_CODEC, ) from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.manifests.indexing import SubChunkIndexingError class TestInit: @@ -843,88 +844,30 @@ def test_raise_on_multiple_ellipses( [ # obvious no-ops ((2,), (1,), slice(0, 2), (2,), (1,)), - # reduces shape - pytest.param( - (1,), - (1,), - 0, - (1,), - (1,), - marks=pytest.mark.xfail( - reason="Chunk-aligned indexing not yet implemented" - ), - ), - # requires chunk-aligned selection - pytest.param( - (2,), - (1,), - 0, - (1,), - (1,), - marks=pytest.mark.xfail( - reason="Chunk-aligned indexing not yet implemented" - ), - ), - pytest.param( - (2,), - (1,), - 1, - (1,), - (1,), - marks=pytest.mark.xfail( - reason="Chunk-aligned indexing not yet implemented" - ), - ), - pytest.param( - (2,), - (1,), - (0, ...), - (1,), - (1,), - marks=pytest.mark.xfail( - reason="Chunk-aligned indexing not yet implemented" - ), - ), - pytest.param( - (2,), - (1,), - (..., 0), - (1,), - (1,), - marks=pytest.mark.xfail( - reason="Chunk-aligned indexing not yet implemented" - ), - ), - pytest.param( - (2,), - (1,), - slice(0, 1), - (1,), - (1,), - marks=pytest.mark.xfail( - reason="Chunk-aligned indexing not yet implemented" - ), - ), - pytest.param( - (2,), - (1,), - (..., slice(0, 1)), - (1,), - (1,), - marks=pytest.mark.xfail( - reason="Chunk-aligned indexing not yet implemented" - ), - ), - pytest.param( - (2,), - (1,), - (slice(0, 1), ...), - (1,), - (1,), - marks=pytest.mark.xfail( - reason="Chunk-aligned indexing not yet implemented" - ), - ), + # length-1 axis: integer indexing + ((1,), (1,), 0, (1,), (1,)), + # chunk-aligned integer indexing (treated as length-1 slice; does not drop the axis) + ((2,), (1,), 0, (1,), (1,)), + ((2,), (1,), 1, (1,), (1,)), + ((2,), (1,), (0, ...), (1,), (1,)), + ((2,), (1,), (..., 0), (1,), (1,)), + # chunk-aligned slicing + ((2,), (1,), slice(0, 1), (1,), (1,)), + ((2,), (1,), (..., slice(0, 1)), (1,), (1,)), + ((2,), (1,), (slice(0, 1), ...), (1,), (1,)), + # multi-chunk slices and slices over multi-chunk axes + ((8,), (2,), slice(0, 4), (4,), (2,)), + ((8,), (2,), slice(2, 6), (4,), (2,)), + ((8,), (2,), slice(6, 8), (2,), (2,)), + # multi-dim slicing + ((4, 4), (2, 2), (slice(0, 2), slice(0, 2)), (2, 2), (2, 2)), + ((4, 4), (2, 2), (slice(2, 4), slice(0, 4)), (2, 4), (2, 2)), + # mixed integer + slice indexing + ((2, 4), (1, 2), (0, slice(0, 2)), (1, 2), (1, 2)), + ((4, 4), (1, 2), (2, slice(0, 4)), (1, 4), (1, 2)), + # partial final chunk along an axis + ((5,), (2,), slice(0, 4), (4,), (2,)), + ((5,), (2,), slice(2, 5), (3,), (2,)), ], ) def test_chunk_selection_cases( @@ -935,6 +878,32 @@ def test_chunk_selection_cases( assert indexed.shape == out_shape assert indexed.chunks == out_chunks + @pytest.mark.parametrize( + "in_shape, in_chunks, indexer", + [ + # int that doesn't land on a chunk boundary + ((4,), (2,), 1), + ((4,), (2,), 3), + # slice start misaligned + ((4,), (2,), slice(1, 3)), + # slice stop misaligned (and stop != axis_length) + ((4,), (2,), slice(0, 3)), + ((4,), (2,), slice(0, 1)), + # step != 1 + ((4,), (2,), slice(0, 4, 2)), + # only one axis misaligned in a multi-dim selection + ((4, 4), (2, 2), (slice(0, 4), slice(0, 1))), + ], + ) + def test_misaligned_with_chunks( + self, manifest_array, in_shape, in_chunks, indexer + ): + marr = manifest_array(shape=in_shape, chunks=in_chunks) + with pytest.raises( + SubChunkIndexingError, match="split individual chunks|step != 1" + ): + marr[indexer] + def test_to_xarray(array_v3_metadata): chunks = (5, 10) From cb18b7ccd22d50fa51f29f8cbba14456933ed759 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 10:55:47 -0400 Subject: [PATCH 02/10] refactor: make SubChunkIndexingError a ValueError, not IndexError Sub-chunk indexing is impossible by design for a virtual array, not a missing-feature index error, so a ValueError subclass is a closer fit. --- docs/releases.md | 2 +- virtualizarr/manifests/indexing.py | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/docs/releases.md b/docs/releases.md index d835e71f5..23e6efb50 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -4,7 +4,7 @@ ### New Features -- `ManifestArray` now supports chunk-aligned integer and slice indexing along each axis, including integer indexing on length-1 chunked axes, multi-chunk slices, mixed integer + slice indexers, and selections that include a partial final chunk. Indexers that would split individual chunks raise a new `SubChunkIndexingError` (an `IndexError` subclass) — this is a permanent constraint of a virtual array, not a missing feature. Previously slice misalignment silently no-op'd while integer indexing unconditionally raised `NotImplementedError`. Closes [#51](https://github.com/zarr-developers/VirtualiZarr/issues/51), supersedes [#499](https://github.com/zarr-developers/VirtualiZarr/pull/499). +- `ManifestArray` now supports chunk-aligned integer and slice indexing along each axis, including integer indexing on length-1 chunked axes, multi-chunk slices, mixed integer + slice indexers, and selections that include a partial final chunk. Indexers that would split individual chunks raise a new `SubChunkIndexingError` (a `ValueError` subclass) — this is a permanent constraint of a virtual array, not a missing feature. Previously slice misalignment silently no-op'd while integer indexing unconditionally raised `NotImplementedError`. Closes [#51](https://github.com/zarr-developers/VirtualiZarr/issues/51), supersedes [#499](https://github.com/zarr-developers/VirtualiZarr/pull/499). By [Tom Nicholas](https://github.com/TomNicholas). ### Bug fixes diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index 11b10f720..f625ee7c0 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -16,15 +16,9 @@ T_Indexer: TypeAlias = T_Indexer_1d | tuple[T_Indexer_1d, ...] -class SubChunkIndexingError(IndexError): +class SubChunkIndexingError(ValueError): """ - Raised when an indexer would split individual chunks of a ManifestArray. - - This is not a NotImplementedError: a ManifestArray only knows where each - chunk's bytes live, never their values, so any selection that crosses into - the interior of a compressed chunk would require loading the data — which - defeats the point of a virtual array. This is a permanent constraint, not - a missing feature. + Raised when an indexer would split individual chunks of a compressed ManifestArray. """ From 3a3fcd73693c744fbdd1cdba4fbbf6bd6f8d7185 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 10:57:57 -0400 Subject: [PATCH 03/10] update __getitem__ docstring --- virtualizarr/manifests/array.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index de42c853a..2c7c6bed6 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -155,8 +155,6 @@ def __array_function__(self, func, types, args, kwargs) -> Any: return MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS[func](*args, **kwargs) - # Everything beyond here is basically just to make this array class wrappable by xarray # - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs) -> Any: """We have to define this in order to convince xarray that this class is a duckarray, even though we will never support ufuncs.""" if ufunc == np.isnan: @@ -229,11 +227,8 @@ def __getitem__( """ Perform numpy-style indexing on this ManifestArray. - Only supports limited indexing, because in general you cannot slice inside of a compressed chunk. - Mainly required because Xarray uses this instead of expand dims (by passing Nones) and often will index with a no-op. - - Could potentially support indexing with slices aligned along chunk boundaries, but currently does not. - + Only supports limited types of indexing, because in general you cannot slice inside of a compressed chunk. + Parameters ---------- key From dec8679824acffb3c95441b6e921678c24a470a1 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 11:31:40 -0400 Subject: [PATCH 04/10] refactor: collapse SubChunkIndexingError raises into one The step, start, and stop alignment checks all raise the same error for the same reason, so combine them into a single conditional with one message. --- virtualizarr/manifests/indexing.py | 24 ++++++++----------- .../tests/test_manifests/test_array.py | 4 +--- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index f625ee7c0..9d9e3aaca 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -205,6 +205,7 @@ def _compute_chunk_aligned_selection_1d( # ManifestArray operates on chunk references rather than values. i = indexer_1d if i < 0: + # Allow negative indexing - this makes it wrap around i += axis_length if not (0 <= i < axis_length): raise IndexError( @@ -214,21 +215,15 @@ def _compute_chunk_aligned_selection_1d( else: start, stop, step = indexer_1d.indices(axis_length) - if step != 1: - raise SubChunkIndexingError( - f"step != 1 is not supported for chunk-aligned indexing, got step={step}" - ) - - if start % chunk_size != 0: - raise SubChunkIndexingError( - f"Cannot index ManifestArray axis of length {axis_length} and chunk length " - f"{chunk_size} with {indexer_1d!r}: slice would split individual chunks, " - "which a ManifestArray cannot do without loading the underlying data." - ) - # The final chunk may legitimately be partial, so allow stop == axis_length even if - # axis_length % chunk_size != 0; otherwise stop must land on a chunk boundary. - if stop != axis_length and stop % chunk_size != 0: + # axis_length % chunk_size != 0; otherwise both endpoints must land on chunk boundaries. + # TODO step != 1 we can actually support for uncompressed arrays along the first axis, + # see https://github.com/zarr-developers/VirtualiZarr/issues/86 + if ( + step != 1 + or start % chunk_size != 0 + or (stop != axis_length and stop % chunk_size != 0) + ): raise SubChunkIndexingError( f"Cannot index ManifestArray axis of length {axis_length} and chunk length " f"{chunk_size} with {indexer_1d!r}: slice would split individual chunks, " @@ -252,6 +247,7 @@ def _subset_manifest( if manifest._inlined: starts = tuple(s.start for s in chunk_grid_slices) stops = tuple(s.stop for s in chunk_grid_slices) + # Keep only the inlined chunks that fall within the new domain, and reindex them if necessary new_inlined = { tuple(idx - start for idx, start in zip(coords, starts)): data for coords, data in manifest._inlined.items() diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index a7feb0e80..06aedda0f 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -899,9 +899,7 @@ def test_misaligned_with_chunks( self, manifest_array, in_shape, in_chunks, indexer ): marr = manifest_array(shape=in_shape, chunks=in_chunks) - with pytest.raises( - SubChunkIndexingError, match="split individual chunks|step != 1" - ): + with pytest.raises(SubChunkIndexingError, match="split individual chunks"): marr[indexer] From 04792164e4526417a21e025fb0fc2b22de995fa5 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 11:33:14 -0400 Subject: [PATCH 05/10] update __getitem__ docstring --- virtualizarr/manifests/array.py | 28 +++++++++++++++++-- virtualizarr/manifests/indexing.py | 4 ++- .../tests/test_manifests/test_array.py | 4 +-- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 2c7c6bed6..fcea660cd 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -225,13 +225,35 @@ def __getitem__( /, ) -> "ManifestArray": """ - Perform numpy-style indexing on this ManifestArray. + Index into this ManifestArray, returning a new ManifestArray view over a subset of chunks. + + Supports only chunk-aligned selections. A ManifestArray only stores references to where + each chunk's bytes live, never their decoded values, so any indexer that would split into + the interior of a chunk would require loading the underlying data — which defeats the + point of a virtual array. Selections that would do so raise ``SubChunkIndexingError`` + (a ``ValueError`` subclass); this is a permanent constraint, not a missing feature. + + Supported indexers (and tuples thereof): + + - ``Ellipsis`` and ``None`` — no-ops and new-axis insertion. + - ``slice`` with ``step == 1`` whose start and stop land on chunk boundaries + (``stop == axis_length`` is also allowed, so a partial final chunk can be selected). + - ``int`` — treated as a length-1 slice; does not drop the axis, since dropping a + dimension is only meaningful when values are available. + + Anything else — fancy indexing with arrays, misaligned slices, ``step != 1`` — + raises ``SubChunkIndexingError`` or ``NotImplementedError``. - Only supports limited types of indexing, because in general you cannot slice inside of a compressed chunk. - Parameters ---------- key + A basic indexer or tuple of basic indexers, one per array axis (with ``Ellipsis`` + and ``None`` allowed as per the array API). + + Returns + ------- + ManifestArray + A new array whose ``ChunkManifest`` references only the selected chunks. """ return index(self, key) diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index 9d9e3aaca..d2b388b94 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -251,7 +251,9 @@ def _subset_manifest( new_inlined = { tuple(idx - start for idx, start in zip(coords, starts)): data for coords, data in manifest._inlined.items() - if all(start <= idx < stop for idx, start, stop in zip(coords, starts, stops)) + if all( + start <= idx < stop for idx, start, stop in zip(coords, starts, stops) + ) } else: new_inlined = {} diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 06aedda0f..365c20ba3 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -895,9 +895,7 @@ def test_chunk_selection_cases( ((4, 4), (2, 2), (slice(0, 4), slice(0, 1))), ], ) - def test_misaligned_with_chunks( - self, manifest_array, in_shape, in_chunks, indexer - ): + def test_misaligned_with_chunks(self, manifest_array, in_shape, in_chunks, indexer): marr = manifest_array(shape=in_shape, chunks=in_chunks) with pytest.raises(SubChunkIndexingError, match="split individual chunks"): marr[indexer] From 68943e3fff87f08bcae8d8d509e1d39f03754f4a Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 11:40:05 -0400 Subject: [PATCH 06/10] docs: explain splitting a virtual dataset across Icechunk commits Cover the use case where ZarrParser produces a virtual dataset whose total chunk-ref count exceeds Icechunk's 50M-per-commit limit: slice along a chunk-aligned axis and write each slice with append_dim. Chunk-aligned indexing on ManifestArray (PR for #51) is what makes this cheap. --- docs/scaling.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/docs/scaling.md b/docs/scaling.md index 1ef04fe62..9ca7b940e 100644 --- a/docs/scaling.md +++ b/docs/scaling.md @@ -308,6 +308,37 @@ for i, batch in enumerate(file_batches): Notice this workflow could also be used for appending data only as it becomes available, e.g. by replacing the for loop with a cron job. +### Splitting a single large virtual dataset across commits + +A single Icechunk commit cannot include more than 50 million chunk references at once. +If a single source — typically a massive Zarr store opened via [`ZarrParser`][virtualizarr.parsers.ZarrParser] — produces a virtual dataset whose arrays together exceed that, you can't write it in one transaction even after all the references are already in memory. + +In that case you can slice the virtual dataset along an axis where the slicing falls on chunk boundaries (often `time`), and commit each slice with `append_dim`. Chunk-aligned slicing on a `ManifestArray` (and therefore on the variables of a virtual `xarray.Dataset`) only subsets the manifest, so this is cheap — no chunks are loaded. + +```python +import icechunk as ic + +# Parse the giant Zarr store once, producing a virtual dataset that exceeds +# 50M refs in total but whose `time` axis is chunked. +vds = vz.open_virtual_dataset(, parser=ZarrParser(), registry=registry) + +chunk_size_time = vds.chunksizes["time"] # must align the splits to chunk boundaries +step = chunk_size_time * N # pick N so that each slice has < 50M refs + +repo = ic.Repository.open() + +for i, start in enumerate(range(0, vds.sizes["time"], step)): + session = repo.writable_session("main") + slice_vds = vds.isel(time=slice(start, start + step)) + append_dim = "time" if i > 0 else None + slice_vds.vz.to_icechunk(session.store, append_dim=append_dim) + session.commit(f"wrote virtual references for time slice {i}") +``` + +If the slice boundaries don't align with chunk edges along that axis, the indexing call raises `SubChunkIndexingError`. + +(Remember you can also subset the Dataset to specific variables and commit those separately too if necessary.) + ### Retries Sometimes an [`open_virtual_dataset`][virtualizarr.open_virtual_dataset] call might fail for a transient reason, such as a failed HTTP response from a server. From cf52fb696f4cc797a44182400b89aa5f6652ba33 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 11:44:21 -0400 Subject: [PATCH 07/10] test: cover xarray .isel routing through ManifestArray indexing Exercises the workflow documented in docs/scaling.md: chunk-aligned .isel on a virtual Dataset subsets the underlying ChunkManifest, misaligned splits raise SubChunkIndexingError, and iterating chunk-aligned slices covers every original ref exactly once. Includes a note that selecting a single chunk requires a length-1 slice rather than an int (since ManifestArray preserves the indexed axis). --- virtualizarr/tests/test_xarray.py | 79 +++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index d3095a44c..8f63e7f86 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -18,6 +18,7 @@ open_virtual_mfdataset, ) from virtualizarr.manifests import ChunkManifest, ManifestArray +from virtualizarr.manifests.indexing import SubChunkIndexingError from virtualizarr.parsers import HDFParser from virtualizarr.tests import ( requires_dask, @@ -1006,3 +1007,81 @@ def test_to_xarray_nonscalar_no_dimension_names(array_v3_metadata): with pytest.raises(ValueError, match="without dimension names"): marr.to_virtual_variable() + + +class TestIsel: + # Verifies the workflow documented in docs/scaling.md under + # "Splitting a single large virtual dataset across commits": slicing a virtual + # xarray.Dataset with .isel along a chunk-aligned axis subsets the underlying + # ChunkManifest without touching the data, and misaligned splits raise. + + def _virtual_dataset(self, array_v3_metadata): + # shape=(8, 4), chunks=(2, 4): 4 chunks along "time", 1 chunk along "x" + metadata = array_v3_metadata( + shape=(8, 4), chunks=(2, 4), dimension_names=["time", "x"] + ) + manifest = ChunkManifest( + entries={ + "0.0": {"path": "/a.nc", "offset": 0, "length": 64}, + "1.0": {"path": "/a.nc", "offset": 100, "length": 64}, + "2.0": {"path": "/a.nc", "offset": 200, "length": 64}, + "3.0": {"path": "/a.nc", "offset": 300, "length": 64}, + } + ) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + return xr.Dataset({"foo": (["time", "x"], marr)}) + + def test_isel_along_chunk_boundary(self, array_v3_metadata): + vds = self._virtual_dataset(array_v3_metadata) + + sliced = vds.isel(time=slice(2, 6)) + + assert sliced.sizes == {"time": 4, "x": 4} + assert isinstance(sliced["foo"].data, ManifestArray) + assert sliced["foo"].data.shape == (4, 4) + assert sliced["foo"].data.chunks == (2, 4) + # only the two middle chunks should remain, re-indexed from 0 + assert sliced["foo"].data.manifest.dict() == { + "0.0": {"path": "file:///a.nc", "offset": 100, "length": 64}, + "1.0": {"path": "file:///a.nc", "offset": 200, "length": 64}, + } + + def test_isel_single_chunk_via_length_1_slice(self, array_v3_metadata): + # Selecting a single chunk requires a length-1 slice rather than an int: + # ManifestArray intentionally preserves the axis (since dropping it would + # require knowing values, not just references), and xarray's int-indexing + # path expects the axis to be dropped — so a literal `isel(time=2)` fails. + vds = self._virtual_dataset(array_v3_metadata) + + sliced = vds.isel(time=slice(2, 4)) + + assert sliced.sizes == {"time": 2, "x": 4} + assert isinstance(sliced["foo"].data, ManifestArray) + # slice(2, 4) on chunks=(2, 4) picks chunk index 1 (the second chunk) + assert sliced["foo"].data.manifest.dict() == { + "0.0": {"path": "file:///a.nc", "offset": 100, "length": 64}, + } + + def test_isel_misaligned_raises(self, array_v3_metadata): + vds = self._virtual_dataset(array_v3_metadata) + + with pytest.raises(SubChunkIndexingError, match="split individual chunks"): + vds.isel(time=slice(1, 5)) + + def test_isel_iterative_append_simulation(self, array_v3_metadata): + # Simulate the scaling.md recipe: walk a chunk-aligned step across "time" + # and confirm each slice yields a valid ManifestArray with the expected refs. + vds = self._virtual_dataset(array_v3_metadata) + step = 4 # two chunks of size 2 per slice + + seen_refs = [] + for start in range(0, vds.sizes["time"], step): + slice_vds = vds.isel(time=slice(start, start + step)) + assert isinstance(slice_vds["foo"].data, ManifestArray) + seen_refs.extend( + entry["offset"] + for entry in slice_vds["foo"].data.manifest.dict().values() + ) + + # every original chunk should have been visited exactly once + assert sorted(seen_refs) == [0, 100, 200, 300] From 79da4e037b5618c953e1d4649a081b846a08f952 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 12:23:24 -0400 Subject: [PATCH 08/10] feat: integer indexing on ManifestArray drops the indexed axis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Match numpy / array-API semantics: int indexers drop the indexed axis, slice indexers preserve it. Integer indexing remains legal only when chunk_size == 1 along that axis — picking one element of a larger chunk is still sub-chunk indexing and raises SubChunkIndexingError. Dropping happens by passing int (rather than length-1 slice) selectors into the manifest's chunk-grid arrays, then trimming shape/chunks/dimension_names to the kept axes. The all-int case collapses the manifest to 0D, so wrap the result of numpy indexing with np.asarray to keep ChunkManifest.from_arrays happy when numpy hands back a Python scalar instead of a 0D ndarray. End-to-end win: xarray.Dataset.isel(time=N) now routes through cleanly on a virtual dataset (when chunk_size == 1 along time), matching the workflow documented in docs/scaling.md. --- docs/releases.md | 2 +- virtualizarr/manifests/array.py | 6 +- virtualizarr/manifests/indexing.py | 112 ++++++++++++------ .../tests/test_manifests/test_array.py | 56 +++++++-- virtualizarr/tests/test_xarray.py | 42 ++++++- 5 files changed, 166 insertions(+), 52 deletions(-) diff --git a/docs/releases.md b/docs/releases.md index 23e6efb50..7b12814ff 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -4,7 +4,7 @@ ### New Features -- `ManifestArray` now supports chunk-aligned integer and slice indexing along each axis, including integer indexing on length-1 chunked axes, multi-chunk slices, mixed integer + slice indexers, and selections that include a partial final chunk. Indexers that would split individual chunks raise a new `SubChunkIndexingError` (a `ValueError` subclass) — this is a permanent constraint of a virtual array, not a missing feature. Previously slice misalignment silently no-op'd while integer indexing unconditionally raised `NotImplementedError`. Closes [#51](https://github.com/zarr-developers/VirtualiZarr/issues/51), supersedes [#499](https://github.com/zarr-developers/VirtualiZarr/pull/499). +- `ManifestArray` now supports chunk-aligned integer and slice indexing along each axis, including multi-chunk slices, mixed integer + slice indexers, and selections that include a partial final chunk. Integer indexers drop the indexed axis (numpy / array-API semantics) and are legal only when `chunk_size == 1` along that axis; slice indexers preserve the axis. This makes `xarray.Dataset.isel` work end-to-end on virtual datasets for any chunk-aligned selection. Indexers that would split individual chunks raise a new `SubChunkIndexingError` (a `ValueError` subclass) — a permanent constraint of a virtual array, not a missing feature. Previously slice misalignment silently no-op'd while integer indexing unconditionally raised `NotImplementedError`. Closes [#51](https://github.com/zarr-developers/VirtualiZarr/issues/51), supersedes [#499](https://github.com/zarr-developers/VirtualiZarr/pull/499). By [Tom Nicholas](https://github.com/TomNicholas). ### Bug fixes diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index fcea660cd..9e5814719 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -238,8 +238,10 @@ def __getitem__( - ``Ellipsis`` and ``None`` — no-ops and new-axis insertion. - ``slice`` with ``step == 1`` whose start and stop land on chunk boundaries (``stop == axis_length`` is also allowed, so a partial final chunk can be selected). - - ``int`` — treated as a length-1 slice; does not drop the axis, since dropping a - dimension is only meaningful when values are available. + Slice indexers preserve the axis. + - ``int`` — drops the indexed axis, following numpy / array-API semantics. Only legal + when ``chunk_size == 1`` along that axis; otherwise picking a single element would + require splitting a chunk. Anything else — fancy indexing with arrays, misaligned slices, ``step != 1`` — raises ``SubChunkIndexingError`` or ``NotImplementedError``. diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index d2b388b94..22538825e 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -151,6 +151,10 @@ def apply_selection( Slices and integer indexers are only supported if they align with chunk boundaries, since splitting individual chunks would require loading their bytes. See GitHub issue #51. + + Integer indexers drop the indexed axis (numpy / array-API semantics); slice indexers + preserve it. Integer indexing is only legal when ``chunk_size == 1`` along that axis, + since picking a single element of a larger chunk would require splitting it. """ from virtualizarr.manifests.array import ManifestArray @@ -167,42 +171,58 @@ def apply_selection( raise TypeError(f"Invalid indexer type: {indexer_1d}") new_shape: list[int] = [] - chunk_grid_slices: list[slice] = [] - for axis_length, chunk_size, indexer_1d in zip( - marr.shape, marr.chunks, indexer_without_newaxes, strict=True + new_chunks: list[int] = [] + chunk_grid_selectors: list[int | slice] = [] + kept_axes: list[int] = [] + for axis, (axis_length, chunk_size, indexer_1d) in enumerate( + zip(marr.shape, marr.chunks, indexer_without_newaxes, strict=True) ): - chunk_grid_slice, new_axis_length = _compute_chunk_aligned_selection_1d( + chunk_grid_selector, new_axis_length = _compute_chunk_aligned_selection_1d( indexer_1d, axis_length=axis_length, chunk_size=chunk_size ) - chunk_grid_slices.append(chunk_grid_slice) - new_shape.append(new_axis_length) + chunk_grid_selectors.append(chunk_grid_selector) + # int selectors drop the axis from the output array + if not isinstance(indexer_1d, int): + new_shape.append(new_axis_length) + new_chunks.append(chunk_size) + kept_axes.append(axis) - chunk_grid_slices_tuple = tuple(chunk_grid_slices) + chunk_grid_selectors_tuple = tuple(chunk_grid_selectors) - # short-circuit if every axis selects the whole chunk grid (a no-op) + # short-circuit if every axis selects the whole chunk grid via a slice (a no-op) if all( - cgs == slice(0, dim, 1) - for cgs, dim in zip(chunk_grid_slices_tuple, marr.manifest.shape_chunk_grid) + isinstance(cgs, slice) and cgs == slice(0, dim, 1) + for cgs, dim in zip(chunk_grid_selectors_tuple, marr.manifest.shape_chunk_grid) ): return marr - new_manifest = _subset_manifest(marr.manifest, chunk_grid_slices_tuple) - new_metadata = copy_and_replace_metadata(marr.metadata, new_shape=new_shape) + new_manifest = _subset_manifest(marr.manifest, chunk_grid_selectors_tuple) + old_dimension_names = marr.metadata.dimension_names + new_dimension_names: tuple[str | None, ...] | None | str + if old_dimension_names is None: + new_dimension_names = "default" # sentinel: leave as None + else: + new_dimension_names = tuple(old_dimension_names[a] for a in kept_axes) + new_metadata = copy_and_replace_metadata( + marr.metadata, + new_shape=new_shape, + new_chunks=new_chunks, + new_dimension_names=new_dimension_names, + ) return ManifestArray(metadata=new_metadata, chunkmanifest=new_manifest) def _compute_chunk_aligned_selection_1d( indexer_1d: int | slice, axis_length: int, chunk_size: int -) -> tuple[slice, int]: +) -> tuple[int | slice, int]: """ - Translate a 1D array-space indexer (int or slice) into a chunk-grid slice plus the new axis length. + Translate a 1D array-space indexer (int or slice) into a chunk-grid selector plus the + new axis length. The selector is an ``int`` for int indexers (so the chunk-grid axis + is dropped) and a ``slice`` for slice indexers (so the chunk-grid axis is preserved). Raises SubChunkIndexingError if the selection would require splitting individual chunks. """ if isinstance(indexer_1d, int): - # Integer indexing is treated as a length-1 slice; we don't drop dimensions because that - # would require loading the underlying data for the array-API conformance reasons that - # ManifestArray operates on chunk references rather than values. i = indexer_1d if i < 0: # Allow negative indexing - this makes it wrap around @@ -231,30 +251,56 @@ def _compute_chunk_aligned_selection_1d( ) chunk_start = start // chunk_size - # ceil-divide so that a partial final chunk is included when stop == axis_length + + if isinstance(indexer_1d, int): + # int indexer drops the array axis, so the chunk-grid axis is dropped too via an int selector + return chunk_start, 1 + + # slice indexer: ceil-divide stop so a partial final chunk is included when stop == axis_length chunk_stop = -(-stop // chunk_size) return slice(chunk_start, chunk_stop, 1), stop - start def _subset_manifest( - manifest: ChunkManifest, chunk_grid_slices: tuple[slice, ...] + manifest: ChunkManifest, chunk_grid_selectors: tuple[int | slice, ...] ) -> ChunkManifest: - """Subset a ChunkManifest by slicing its underlying chunk-grid arrays.""" - new_paths = manifest._paths[chunk_grid_slices] - new_offsets = manifest._offsets[chunk_grid_slices] - new_lengths = manifest._lengths[chunk_grid_slices] + """ + Subset a ChunkManifest by indexing its underlying chunk-grid arrays. Each entry of + ``chunk_grid_selectors`` is either an int (which drops the corresponding chunk-grid axis) + or a slice (which keeps it). + """ + # When every axis is int-indexed, numpy returns a 0D scalar (Python str for StringDType, + # numpy scalar for the numeric arrays). Wrap back into a 0D ndarray so from_arrays accepts it. + new_paths = np.asarray( + manifest._paths[chunk_grid_selectors], dtype=manifest._paths.dtype + ) + new_offsets = np.asarray( + manifest._offsets[chunk_grid_selectors], dtype=manifest._offsets.dtype + ) + new_lengths = np.asarray( + manifest._lengths[chunk_grid_selectors], dtype=manifest._lengths.dtype + ) if manifest._inlined: - starts = tuple(s.start for s in chunk_grid_slices) - stops = tuple(s.stop for s in chunk_grid_slices) - # Keep only the inlined chunks that fall within the new domain, and reindex them if necessary - new_inlined = { - tuple(idx - start for idx, start in zip(coords, starts)): data - for coords, data in manifest._inlined.items() - if all( - start <= idx < stop for idx, start, stop in zip(coords, starts, stops) - ) - } + # For each old chunk-grid key, keep it only if int-indexed axes match exactly and + # slice-indexed axes fall inside the slice. Re-map the surviving key by omitting + # int-indexed positions and offsetting slice-indexed positions by the slice start. + new_inlined: dict[tuple[int, ...], bytes] = {} + for coords, data in manifest._inlined.items(): + new_coord: list[int] = [] + keep = True + for coord, sel in zip(coords, chunk_grid_selectors): + if isinstance(sel, int): + if coord != sel: + keep = False + break + else: + if not (sel.start <= coord < sel.stop): + keep = False + break + new_coord.append(coord - sel.start) + if keep: + new_inlined[tuple(new_coord)] = data else: new_inlined = {} diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 365c20ba3..08187ec11 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -844,14 +844,18 @@ def test_raise_on_multiple_ellipses( [ # obvious no-ops ((2,), (1,), slice(0, 2), (2,), (1,)), - # length-1 axis: integer indexing - ((1,), (1,), 0, (1,), (1,)), - # chunk-aligned integer indexing (treated as length-1 slice; does not drop the axis) - ((2,), (1,), 0, (1,), (1,)), - ((2,), (1,), 1, (1,), (1,)), - ((2,), (1,), (0, ...), (1,), (1,)), - ((2,), (1,), (..., 0), (1,), (1,)), - # chunk-aligned slicing + # integer indexing drops the indexed axis (numpy / array-API semantics). + # Only legal when chunk_size == 1 along that axis: otherwise picking a + # single element would require splitting a chunk. + ((1,), (1,), 0, (), ()), + ((2,), (1,), 0, (), ()), + ((2,), (1,), 1, (), ()), + ((2,), (1,), (0, ...), (), ()), + ((2,), (1,), (..., 0), (), ()), + # multi-axis integer indexing drops every indexed axis + ((2, 2), (1, 1), (0, 0), (), ()), + ((3, 3), (1, 1), (1, 2), (), ()), + # chunk-aligned slicing preserves the axis ((2,), (1,), slice(0, 1), (1,), (1,)), ((2,), (1,), (..., slice(0, 1)), (1,), (1,)), ((2,), (1,), (slice(0, 1), ...), (1,), (1,)), @@ -862,9 +866,10 @@ def test_raise_on_multiple_ellipses( # multi-dim slicing ((4, 4), (2, 2), (slice(0, 2), slice(0, 2)), (2, 2), (2, 2)), ((4, 4), (2, 2), (slice(2, 4), slice(0, 4)), (2, 4), (2, 2)), - # mixed integer + slice indexing - ((2, 4), (1, 2), (0, slice(0, 2)), (1, 2), (1, 2)), - ((4, 4), (1, 2), (2, slice(0, 4)), (1, 4), (1, 2)), + # mixed integer + slice indexing — the integer-indexed axis drops, the + # slice-indexed axis stays. + ((2, 4), (1, 2), (0, slice(0, 2)), (2,), (2,)), + ((4, 4), (1, 2), (2, slice(0, 4)), (4,), (2,)), # partial final chunk along an axis ((5,), (2,), slice(0, 4), (4,), (2,)), ((5,), (2,), slice(2, 5), (3,), (2,)), @@ -878,11 +883,36 @@ def test_chunk_selection_cases( assert indexed.shape == out_shape assert indexed.chunks == out_chunks + def test_integer_indexing_subsets_manifest_to_one_chunk(self, array_v3_metadata): + # Make sure dropping the axis also drops the manifest's chunk-grid axis and + # keeps just the referenced chunk's bytes. + metadata = array_v3_metadata(shape=(4, 2), chunks=(1, 2)) + manifest = ChunkManifest( + entries={ + "0.0": {"path": "/a.nc", "offset": 0, "length": 16}, + "1.0": {"path": "/a.nc", "offset": 100, "length": 16}, + "2.0": {"path": "/a.nc", "offset": 200, "length": 16}, + "3.0": {"path": "/a.nc", "offset": 300, "length": 16}, + } + ) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + + result = marr[2, ...] + + assert result.shape == (2,) + assert result.chunks == (2,) + assert result.manifest.shape_chunk_grid == (1,) + assert result.manifest.dict() == { + "0": {"path": "file:///a.nc", "offset": 200, "length": 16}, + } + @pytest.mark.parametrize( "in_shape, in_chunks, indexer", [ - # int that doesn't land on a chunk boundary + # int on a multi-element chunk — only chunk_size == 1 permits int indexing + ((4,), (2,), 0), ((4,), (2,), 1), + ((4,), (2,), 2), ((4,), (2,), 3), # slice start misaligned ((4,), (2,), slice(1, 3)), @@ -893,6 +923,8 @@ def test_chunk_selection_cases( ((4,), (2,), slice(0, 4, 2)), # only one axis misaligned in a multi-dim selection ((4, 4), (2, 2), (slice(0, 4), slice(0, 1))), + # int on a chunk_size > 1 axis even when other axes are fine + ((4, 4), (2, 2), (0, slice(0, 4))), ], ) def test_misaligned_with_chunks(self, manifest_array, in_shape, in_chunks, indexer): diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 8f63e7f86..631b711b3 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -1047,10 +1047,8 @@ def test_isel_along_chunk_boundary(self, array_v3_metadata): } def test_isel_single_chunk_via_length_1_slice(self, array_v3_metadata): - # Selecting a single chunk requires a length-1 slice rather than an int: - # ManifestArray intentionally preserves the axis (since dropping it would - # require knowing values, not just references), and xarray's int-indexing - # path expects the axis to be dropped — so a literal `isel(time=2)` fails. + # A length-1 slice picks a single chunk while preserving the axis as length 1 + # (the array stays 2D). Useful when the caller wants to keep dimensions stable. vds = self._virtual_dataset(array_v3_metadata) sliced = vds.isel(time=slice(2, 4)) @@ -1062,6 +1060,42 @@ def test_isel_single_chunk_via_length_1_slice(self, array_v3_metadata): "0.0": {"path": "file:///a.nc", "offset": 100, "length": 64}, } + def test_isel_integer_drops_axis(self, array_v3_metadata): + # Integer .isel drops the indexed axis (numpy / array-API semantics). Only + # works when chunk_size == 1 along that axis; otherwise it's sub-chunk indexing. + # shape=(4, 4), chunks=(1, 4): 4 single-row chunks along "time". + metadata = array_v3_metadata( + shape=(4, 4), chunks=(1, 4), dimension_names=["time", "x"] + ) + manifest = ChunkManifest( + entries={ + "0.0": {"path": "/a.nc", "offset": 0, "length": 32}, + "1.0": {"path": "/a.nc", "offset": 100, "length": 32}, + "2.0": {"path": "/a.nc", "offset": 200, "length": 32}, + "3.0": {"path": "/a.nc", "offset": 300, "length": 32}, + } + ) + marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) + vds = xr.Dataset({"foo": (["time", "x"], marr)}) + + sliced = vds.isel(time=2) + + assert sliced["foo"].dims == ("x",) + assert sliced.sizes == {"x": 4} + assert isinstance(sliced["foo"].data, ManifestArray) + assert sliced["foo"].data.shape == (4,) + assert sliced["foo"].data.chunks == (4,) + assert sliced["foo"].data.manifest.dict() == { + "0": {"path": "file:///a.nc", "offset": 200, "length": 32}, + } + + def test_isel_integer_misaligned_raises(self, array_v3_metadata): + # chunk_size > 1 along the indexed axis — picking one element would split a chunk. + vds = self._virtual_dataset(array_v3_metadata) + + with pytest.raises(SubChunkIndexingError, match="split individual chunks"): + vds.isel(time=1) + def test_isel_misaligned_raises(self, array_v3_metadata): vds = self._virtual_dataset(array_v3_metadata) From f7626e594ffddbf87f81cbbc75786269d9ad99fd Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 12:29:28 -0400 Subject: [PATCH 09/10] docs: update outdated mentions of indexing limitations data_structures.md and faq.md still implied indexing on a ManifestArray was unimplemented. Reflect the chunk-aligned int/slice semantics that now work end-to-end, and flip the isel row in the kerchunk-comparison table. --- docs/data_structures.md | 5 +++-- docs/faq.md | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/data_structures.md b/docs/data_structures.md index dd1654b9b..94e0979a1 100644 --- a/docs/data_structures.md +++ b/docs/data_structures.md @@ -244,8 +244,9 @@ NotImplementedError: ManifestArrays can't be converted into numpy arrays or pand The whole point is to manipulate references to the data without actually loading any data. !!! note - You also cannot currently index into a `ManifestArray`, as arbitrary indexing would require loading data values to create the new array. - We could imagine supporting indexing without loading data when slicing only along chunk boundaries, but this has not yet been implemented (see [GH issue #51](https://github.com/zarr-developers/VirtualiZarr/issues/51)). + You can index into a `ManifestArray` as long as the selection aligns with chunk boundaries — slicing through the interior of a chunk would require loading the chunk's bytes, which a virtual array deliberately cannot do. + Chunk-aligned integer and slice indexing is supported, including mixed integer + slice indexers; integer indexers drop the indexed axis as in numpy. Misaligned selections raise `SubChunkIndexingError`. + Arbitrary fancy indexing (e.g. with a boolean mask or integer array) is not supported, since it would generally require loading data. ## Zarr Groups diff --git a/docs/faq.md b/docs/faq.md index 4bc9782ef..5d4e34767 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -174,7 +174,7 @@ Users of Kerchunk may find the following comparison table useful, which shows wh | Renaming dimensions | ❌ | `xarray.Dataset.rename_dims` | | Renaming manifest file paths | `kerchunk.utils.rename_target` | `vds.vz.rename_paths` | | Splitting uncompressed data into chunks | `kerchunk.utils.subchunk` | `xarray.Dataset.chunk` (❌ Not yet implemented - see [PR #199](https://github.com/zarr-developers/VirtualiZarr/pull/199)) -| Selecting specific chunks | ❌ | `xarray.Dataset.isel` (❌ Not yet implemented - see [issue #51](https://github.com/zarr-developers/VirtualiZarr/issues/51)) | +| Selecting specific chunks | ❌ | `xarray.Dataset.isel` (✅ chunk-aligned selections only) | **Parallelization** | | | | Parallelized generation of references | Wrapping kerchunk's opener inside `dask.delayed` | Wrapping `open_virtual_dataset` inside `dask.delayed` | Parallelized combining of references (tree-reduce) | `kerchunk.combine.auto_dask` | Wrapping `ManifestArray` objects within `dask.array.Array` objects inside `xarray.Dataset` to use dask's `concatenate` (⚠️ Untested, but also unnecessary) | From 34ff775f80c5c2e6834419677c11ced3c92ef9ec Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 12:49:02 -0400 Subject: [PATCH 10/10] fix: satisfy mypy in indexing.py Three narrow type fixes: collect the indexers into a list typed int | slice once np.ndarray is ruled out, broaden the dimension_names local to Any (zarr's tuple[str | None, ...] doesn't fit copy_and_replace_metadata's Iterable[str] hint but the runtime is fine), and cast np.asarray's dtype-erased return back to the manifest array types that from_arrays expects. --- virtualizarr/manifests/indexing.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index 22538825e..580808c43 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -1,5 +1,5 @@ from types import EllipsisType -from typing import TYPE_CHECKING, TypeAlias, cast +from typing import TYPE_CHECKING, Any, TypeAlias, cast import numpy as np @@ -162,6 +162,7 @@ def apply_selection( assert len(indexer_without_newaxes) == marr.ndim # validate types and reject anything we can never support per-axis + narrowed_indexers: list[int | slice] = [] for indexer_1d in indexer_without_newaxes: if isinstance(indexer_1d, np.ndarray): raise NotImplementedError( @@ -169,13 +170,14 @@ def apply_selection( ) if not isinstance(indexer_1d, (int, slice)): raise TypeError(f"Invalid indexer type: {indexer_1d}") + narrowed_indexers.append(indexer_1d) new_shape: list[int] = [] new_chunks: list[int] = [] chunk_grid_selectors: list[int | slice] = [] kept_axes: list[int] = [] for axis, (axis_length, chunk_size, indexer_1d) in enumerate( - zip(marr.shape, marr.chunks, indexer_without_newaxes, strict=True) + zip(marr.shape, marr.chunks, narrowed_indexers, strict=True) ): chunk_grid_selector, new_axis_length = _compute_chunk_aligned_selection_1d( indexer_1d, axis_length=axis_length, chunk_size=chunk_size @@ -198,7 +200,9 @@ def apply_selection( new_manifest = _subset_manifest(marr.manifest, chunk_grid_selectors_tuple) old_dimension_names = marr.metadata.dimension_names - new_dimension_names: tuple[str | None, ...] | None | str + # zarr's dimension_names is tuple[str | None, ...] but copy_and_replace_metadata's + # type hint says Iterable[str]; the runtime handles None entries fine, so cast through. + new_dimension_names: Any if old_dimension_names is None: new_dimension_names = "default" # sentinel: leave as None else: @@ -271,14 +275,22 @@ def _subset_manifest( """ # When every axis is int-indexed, numpy returns a 0D scalar (Python str for StringDType, # numpy scalar for the numeric arrays). Wrap back into a 0D ndarray so from_arrays accepts it. - new_paths = np.asarray( - manifest._paths[chunk_grid_selectors], dtype=manifest._paths.dtype + # np.asarray's return type erases the dtype param, so cast back to keep from_arrays happy. + new_paths = cast( + "np.ndarray[Any, np.dtypes.StringDType]", + np.asarray(manifest._paths[chunk_grid_selectors], dtype=manifest._paths.dtype), ) - new_offsets = np.asarray( - manifest._offsets[chunk_grid_selectors], dtype=manifest._offsets.dtype + new_offsets = cast( + "np.ndarray[Any, np.dtype[np.uint64]]", + np.asarray( + manifest._offsets[chunk_grid_selectors], dtype=manifest._offsets.dtype + ), ) - new_lengths = np.asarray( - manifest._lengths[chunk_grid_selectors], dtype=manifest._lengths.dtype + new_lengths = cast( + "np.ndarray[Any, np.dtype[np.uint64]]", + np.asarray( + manifest._lengths[chunk_grid_selectors], dtype=manifest._lengths.dtype + ), ) if manifest._inlined: