From b3c5dce4fb814be68b26677b4519a63a6a6a46f5 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 13:31:40 -0400 Subject: [PATCH 1/6] feat: axis-0 sub-chunk slicing on uncompressed ManifestArrays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When an array has only a BytesCodec (no compressors, no value transforms), a slice that lands within a single source chunk along axis 0 can be expressed as a rewritten byte offset and length on every surviving chunk reference — no data loaded. Picks up part of issue #86: now you can isel a single timestep from a multi-row chunk produced by, e.g., the netCDF3 parser without rechunking. The dispatch is deterministic: axis 0, uncompressed, slice contained in one source chunk, step == 1 → byte-shift path; everything else → existing chunk-aligned path, which raises SubChunkIndexingError for misaligned selections. step != 1 and sub-chunk indexing on other axes remain out of scope here and still raise. --- docs/data_structures.md | 1 + docs/releases.md | 2 + virtualizarr/manifests/array.py | 4 + virtualizarr/manifests/indexing.py | 110 +++++++++++++- .../tests/test_manifests/test_array.py | 136 ++++++++++++++++++ 5 files changed, 248 insertions(+), 5 deletions(-) diff --git a/docs/data_structures.md b/docs/data_structures.md index 94e0979a..f6ce228b 100644 --- a/docs/data_structures.md +++ b/docs/data_structures.md @@ -246,6 +246,7 @@ The whole point is to manipulate references to the data without actually loading !!! note You can index into a `ManifestArray` as long as the selection aligns with chunk boundaries — slicing through the interior of a chunk would require loading the chunk's bytes, which a virtual array deliberately cannot do. Chunk-aligned integer and slice indexing is supported, including mixed integer + slice indexers; integer indexers drop the indexed axis as in numpy. Misaligned selections raise `SubChunkIndexingError`. 
+ As a special case, an **uncompressed** array (only a `BytesCodec`, no other codecs) supports slicing _within_ a single source chunk along axis 0 — the result is a new chunk reference with a bumped byte offset and a smaller length, no data loaded. This is useful for picking out a row from a multi-row chunk produced by a parser like the netCDF3 one. Arbitrary fancy indexing (e.g. with a boolean mask or integer array) is not supported, since it would generally require loading data. ## Zarr Groups diff --git a/docs/releases.md b/docs/releases.md index 7b12814f..e137e753 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -6,6 +6,8 @@ - `ManifestArray` now supports chunk-aligned integer and slice indexing along each axis, including multi-chunk slices, mixed integer + slice indexers, and selections that include a partial final chunk. Integer indexers drop the indexed axis (numpy / array-API semantics) and are legal only when `chunk_size == 1` along that axis; slice indexers preserve the axis. This makes `xarray.Dataset.isel` work end-to-end on virtual datasets for any chunk-aligned selection. Indexers that would split individual chunks raise a new `SubChunkIndexingError` (a `ValueError` subclass) — a permanent constraint of a virtual array, not a missing feature. Previously slice misalignment silently no-op'd while integer indexing unconditionally raised `NotImplementedError`. Closes [#51](https://github.com/zarr-developers/VirtualiZarr/issues/51), supersedes [#499](https://github.com/zarr-developers/VirtualiZarr/pull/499). By [Tom Nicholas](https://github.com/TomNicholas). +- Slicing along axis 0 of an uncompressed `ManifestArray` (only the v3 `BytesCodec`, no other codecs) can now sub-divide a chunk — the result is a new reference into the same source file with a bumped byte offset and a smaller length. Useful for picking out a single timestep from a multi-row chunk produced by, e.g., a netCDF3 parser, without rechunking. 
Limited to slices that fit within one source chunk; multi-chunk-spanning sub-chunk slices, `step != 1`, and sub-chunk indexing on other axes still raise `SubChunkIndexingError`. Addresses part of [#86](https://github.com/zarr-developers/VirtualiZarr/issues/86). + By [Tom Nicholas](https://github.com/TomNicholas). ### Bug fixes diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 9e581471..571ba926 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -242,6 +242,10 @@ def __getitem__( - ``int`` — drops the indexed axis, following numpy / array-API semantics. Only legal when ``chunk_size == 1`` along that axis; otherwise picking a single element would require splitting a chunk. + - Slice along axis 0 of an **uncompressed** array (only the v3 ``BytesCodec``, no other + codecs) that fits entirely within one source chunk — handled by rewriting the chunk + reference's byte offset/length rather than splitting bytes. Useful for picking a + single timestep from a multi-row chunk on a parser like the netCDF3 one. Anything else — fancy indexing with arrays, misaligned slices, ``step != 1`` — raises ``SubChunkIndexingError`` or ``NotImplementedError``. 
diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index 580808c4..767f714a 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -2,6 +2,8 @@ from typing import TYPE_CHECKING, Any, TypeAlias, cast import numpy as np +from zarr.codecs import BytesCodec +from zarr.core.metadata.v3 import ArrayV3Metadata from virtualizarr.manifests.array_api import expand_dims from virtualizarr.manifests.manifest import ChunkManifest @@ -172,21 +174,51 @@ def apply_selection( raise TypeError(f"Invalid indexer type: {indexer_1d}") narrowed_indexers.append(indexer_1d) + uncompressed = _is_uncompressed_bytes_only(marr.metadata) + new_shape: list[int] = [] new_chunks: list[int] = [] chunk_grid_selectors: list[int | slice] = [] kept_axes: list[int] = [] + # At most one sub-chunk axis (axis 0). The byte adjustment is uniform across every + # surviving chunk, since chunks share layout. + sub_chunk_byte_adjust: tuple[int, int] | None = None for axis, (axis_length, chunk_size, indexer_1d) in enumerate( zip(marr.shape, marr.chunks, narrowed_indexers, strict=True) ): - chunk_grid_selector, new_axis_length = _compute_chunk_aligned_selection_1d( - indexer_1d, axis_length=axis_length, chunk_size=chunk_size - ) + chunk_grid_selector: int | slice + if ( + axis == 0 + and uncompressed + and _is_axis_0_sub_chunk_slice(indexer_1d, axis_length, chunk_size) + ): + # Sub-chunk axis-0 slicing on an uncompressed array: contained in a single + # source chunk, expressed as an integer byte-offset adjustment. 
+ assert isinstance(indexer_1d, slice) # narrowed by the helper above + start, stop, _ = indexer_1d.indices(axis_length) + chunk_index = start // chunk_size + chunk_grid_selector = slice(chunk_index, chunk_index + 1, 1) + new_axis_length = stop - start + new_chunks_for_axis = new_axis_length + stride_bytes = ( + int(np.prod(marr.chunks[1:])) * marr.dtype.itemsize + ) # bytes per row along axis 0 within one chunk + sub_chunk_byte_adjust = ( + (start - chunk_index * chunk_size) * stride_bytes, + new_axis_length * stride_bytes, + ) + else: + chunk_grid_selector, new_axis_length = _compute_chunk_aligned_selection_1d( + indexer_1d, axis_length=axis_length, chunk_size=chunk_size + ) + new_chunks_for_axis = chunk_size + chunk_grid_selectors.append(chunk_grid_selector) - # int selectors drop the axis from the output array + # int indexers drop the axis from the output array; slices preserve it (including the + # sub-chunk path above, which uses a length-1 chunk-grid slice selector). if not isinstance(indexer_1d, int): new_shape.append(new_axis_length) - new_chunks.append(chunk_size) + new_chunks.append(new_chunks_for_axis) kept_axes.append(axis) chunk_grid_selectors_tuple = tuple(chunk_grid_selectors) @@ -199,6 +231,8 @@ def apply_selection( return marr new_manifest = _subset_manifest(marr.manifest, chunk_grid_selectors_tuple) + if sub_chunk_byte_adjust is not None: + new_manifest = _shift_manifest_byte_ranges(new_manifest, *sub_chunk_byte_adjust) old_dimension_names = marr.metadata.dimension_names # zarr's dimension_names is tuple[str | None, ...] but copy_and_replace_metadata's # type hint says Iterable[str]; the runtime handles None entries fine, so cast through. @@ -323,3 +357,69 @@ def _subset_manifest( inlined=new_inlined, validate_paths=False, ) + + +def _is_uncompressed_bytes_only(metadata: ArrayV3Metadata) -> bool: + """ + True iff ``metadata.codecs`` is exactly one ``BytesCodec`` and nothing else. 
+ + BytesCodec only encodes endianness; with no other codecs, the chunk bytes on disk are + the raw little/big-endian element values in C-order. That lets us treat sub-chunk slices + along axis 0 as a byte-offset adjustment into the same chunk file. Any array-array codec + (value transform) or bytes-bytes codec (compressor / checksum) requires decoding the + whole chunk and is therefore not eligible. + """ + codecs = metadata.codecs + return len(codecs) == 1 and isinstance(codecs[0], BytesCodec) + + +def _is_axis_0_sub_chunk_slice( + indexer_1d: int | slice, axis_length: int, chunk_size: int +) -> bool: + """ + True iff this is a slice that should take the axis-0 sub-chunk path: step == 1, + non-empty, fits entirely within one source chunk along axis 0, and is NOT already + chunk-aligned (chunk-aligned slices go through the simpler aligned path). + """ + if not isinstance(indexer_1d, slice): + return False + start, stop, step = indexer_1d.indices(axis_length) + if step != 1 or start >= stop: + return False + # chunk-aligned slices are handled by _compute_chunk_aligned_selection_1d + aligned = start % chunk_size == 0 and ( + stop == axis_length or stop % chunk_size == 0 + ) + if aligned: + return False + # contained in a single source chunk? + return start // chunk_size == (stop - 1) // chunk_size + + +def _shift_manifest_byte_ranges( + manifest: ChunkManifest, offset_delta: int, new_length: int +) -> ChunkManifest: + """ + Return a new ``ChunkManifest`` whose virtual chunk references point to a uniform + sub-range of each original chunk: ``offset += offset_delta`` and ``length = new_length``. + + Used by the uncompressed-axis-0 sub-chunk path, where every surviving chunk shares the + same byte layout and therefore the same byte adjustment. 
+ """ + new_offsets = cast( + "np.ndarray[Any, np.dtype[np.uint64]]", + manifest._offsets + np.uint64(offset_delta), + ) + new_lengths = cast( + "np.ndarray[Any, np.dtype[np.uint64]]", + np.full_like(manifest._lengths, np.uint64(new_length)), + ) + # paths and any inlined-chunk dict carry through unchanged: inlined chunks aren't + # involved here (this path is only taken for uncompressed virtual references). + return ChunkManifest.from_arrays( + paths=manifest._paths, + offsets=new_offsets, + lengths=new_lengths, + inlined=dict(manifest._inlined), + validate_paths=False, + ) diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 08187ec1..15c809e2 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -933,6 +933,142 @@ def test_misaligned_with_chunks(self, manifest_array, in_shape, in_chunks, index marr[indexer] +class TestSubChunkSlicingUncompressed: + # For an uncompressed C-order array, sub-chunk slicing along axis 0 (the contiguous + # axis in memory layout) can be expressed purely as a byte-offset/length adjustment + # into the same source file. Issue #86. Scope: slice fully contained within one + # original chunk along axis 0. Anything else still raises SubChunkIndexingError. 
+ + BYTES_CODEC = {"name": "bytes", "configuration": {"endian": "little"}} + ZLIB_CODEC = {"name": "numcodecs.zlib", "configuration": {"level": 1}} + + def _uncompressed_marr( + self, + array_v3_metadata, + shape, + chunks, + codecs=None, + ): + # one 8-byte float per element, single source file, sequential chunk offsets + itemsize = 8 + bytes_per_chunk = int(np.prod(chunks)) * itemsize + metadata = array_v3_metadata( + shape=shape, + chunks=chunks, + data_type=np.dtype("float64"), + codecs=codecs or [self.BYTES_CODEC], + ) + chunk_grid_shape = tuple(-(-s // c) for s, c in zip(shape, chunks)) + entries: dict[str, dict] = {} + for idx in np.ndindex(*chunk_grid_shape): + key = ".".join(str(i) for i in idx) + entries[key] = { + "path": "/foo.nc", + "offset": 1000 + + int(np.ravel_multi_index(idx, chunk_grid_shape)) * bytes_per_chunk, + "length": bytes_per_chunk, + } + return ManifestArray(metadata=metadata, chunkmanifest=ChunkManifest(entries)) + + @pytest.mark.parametrize( + "shape, chunks, indexer, expected_shape, expected_chunks, expected_entries", + [ + # 1D, single chunk: marr[1:3] on shape=(8,) chunks=(4,) — sits in chunk 0. + # itemsize=8, stride along axis 0 = 8. Offset += 1*8, length = 2*8. + ( + (8,), + (4,), + np.s_[1:3], + (2,), + (2,), + {"0": {"offset": 1000 + 8, "length": 16}}, + ), + # 1D, second chunk: marr[5:7] -> chunk 1 base (1000+32), inner offset (5-4)*8. + ( + (8,), + (4,), + np.s_[5:7], + (2,), + (2,), + {"0": {"offset": 1000 + 32 + 8, "length": 16}}, + ), + # 2D, axis 0 sub-chunk, axis 1 covers the only chunk-col. + # shape=(8, 4) chunks=(4, 4): per-chunk = 4*4*8 = 128 bytes; axis-0 stride = 4*8 = 32. + ( + (8, 4), + (4, 4), + np.s_[1:3, :], + (2, 4), + (2, 4), + {"0.0": {"offset": 1000 + 32, "length": 64}}, + ), + # 2D, axis 0 sub-chunk + axis 1 chunk-aligned across multiple chunks. + # shape=(8, 6) chunks=(4, 3): per-chunk = 4*3*8 = 96 bytes; axis-0 stride = 3*8 = 24. 
+ ( + (8, 6), + (4, 3), + np.s_[1:3, :], + (2, 6), + (2, 3), + { + "0.0": {"offset": 1000 + 24, "length": 48}, + "0.1": {"offset": 1000 + 96 + 24, "length": 48}, + }, + ), + ], + ) + def test_axis_0_slice_within_single_chunk( + self, + array_v3_metadata, + shape, + chunks, + indexer, + expected_shape, + expected_chunks, + expected_entries, + ): + marr = self._uncompressed_marr(array_v3_metadata, shape=shape, chunks=chunks) + + sliced = marr[indexer] + + assert sliced.shape == expected_shape + assert sliced.chunks == expected_chunks + expected = { + k: {"path": "file:///foo.nc", **v} for k, v in expected_entries.items() + } + assert sliced.manifest.dict() == expected + + @pytest.mark.parametrize( + "indexer, reason", + [ + # Slice spans >1 source chunk along axis 0. + (np.s_[1:5, :], "slice crossing chunk boundary"), + # Integer indexing into a multi-row chunk: still chunk-aligned-only + # (not part of the chosen scope for #86). + (np.s_[1, :], "integer indexing into a multi-row chunk"), + # Sub-chunk along axis 1 — bytes are interleaved, can't byte-adjust. + (np.s_[:, 1:3], "sub-chunk slice on non-axis-0"), + ], + ) + def test_uncompressed_out_of_scope_cases_raise( + self, array_v3_metadata, indexer, reason + ): + marr = self._uncompressed_marr(array_v3_metadata, shape=(8, 6), chunks=(4, 3)) + with pytest.raises(SubChunkIndexingError, match="split individual chunks"): + marr[indexer] + + def test_sub_chunk_slice_on_compressed_array_raises(self, array_v3_metadata): + # Compressed array — even axis-0 sub-chunk slicing requires decoding bytes. 
+ marr = self._uncompressed_marr( + array_v3_metadata, + shape=(8, 4), + chunks=(4, 4), + codecs=[self.BYTES_CODEC, self.ZLIB_CODEC], + ) + with pytest.raises(SubChunkIndexingError, match="split individual chunks"): + marr[1:3, :] + + def test_to_xarray(array_v3_metadata): chunks = (5, 10) shape = (5, 20) From 39b1c6e71783591da8613698fde1857fba251fef Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 13:56:11 -0400 Subject: [PATCH 2/6] feat: generalize sub-chunk slicing to any largest-stride storage axis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the hardcoded axis-0 check with a detector that reads the codec stack and returns the eligible axis (or None). [BytesCodec] gives C-order with axis 0; [TransposeCodec(order=...), BytesCodec] gives whatever axis order[0] points to in the logical array — for the reversed-order F-order case that's the last axis. The byte stride is prod(chunks of the other axes) * itemsize; a product does not depend on the order of its factors, so the same formula works for any chosen axis. Also fix a latent no-op short-circuit: when the eligible axis has a single source chunk and every other axis selects its whole chunk grid, the sub-chunk selector slice(0, 1, 1) equals slice(0, dim, 1), so apply_selection returned the array unchanged and skipped the pending byte adjustment. Suppress the short-circuit when a sub-chunk adjustment is queued. --- docs/data_structures.md | 2 +- docs/releases.md | 2 +- virtualizarr/manifests/array.py | 10 +- virtualizarr/manifests/indexing.py | 72 ++++++++------ .../tests/test_manifests/test_array.py | 96 ++++++++++++++++--- 5 files changed, 138 insertions(+), 44 deletions(-) diff --git a/docs/data_structures.md b/docs/data_structures.md index f6ce228b..83e48455 100644 --- a/docs/data_structures.md +++ b/docs/data_structures.md @@ -246,7 +246,7 @@ The whole point is to manipulate references to the data without actually loading !!! 
note You can index into a `ManifestArray` as long as the selection aligns with chunk boundaries — slicing through the interior of a chunk would require loading the chunk's bytes, which a virtual array deliberately cannot do. Chunk-aligned integer and slice indexing is supported, including mixed integer + slice indexers; integer indexers drop the indexed axis as in numpy. Misaligned selections raise `SubChunkIndexingError`. - As a special case, an **uncompressed** array (only a `BytesCodec`, no other codecs) supports slicing _within_ a single source chunk along axis 0 — the result is a new chunk reference with a bumped byte offset and a smaller length, no data loaded. This is useful for picking out a row from a multi-row chunk produced by a parser like the netCDF3 one. + As a special case, an **uncompressed** array supports slicing _within_ a single source chunk along its largest-stride storage axis — the result is a new chunk reference with a bumped byte offset and a smaller length, no data loaded. This works for `[BytesCodec]` arrays (C-order; the axis is axis 0) and `[TransposeCodec(order=...), BytesCodec]` arrays (the axis is `order[0]` — e.g. the last axis for F-order). Useful for picking out a row from a multi-row chunk produced by a parser like the netCDF3 one. Arbitrary fancy indexing (e.g. with a boolean mask or integer array) is not supported, since it would generally require loading data. ## Zarr Groups diff --git a/docs/releases.md b/docs/releases.md index e137e753..569c55a5 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -6,7 +6,7 @@ - `ManifestArray` now supports chunk-aligned integer and slice indexing along each axis, including multi-chunk slices, mixed integer + slice indexers, and selections that include a partial final chunk. Integer indexers drop the indexed axis (numpy / array-API semantics) and are legal only when `chunk_size == 1` along that axis; slice indexers preserve the axis. 
This makes `xarray.Dataset.isel` work end-to-end on virtual datasets for any chunk-aligned selection. Indexers that would split individual chunks raise a new `SubChunkIndexingError` (a `ValueError` subclass) — a permanent constraint of a virtual array, not a missing feature. Previously slice misalignment silently no-op'd while integer indexing unconditionally raised `NotImplementedError`. Closes [#51](https://github.com/zarr-developers/VirtualiZarr/issues/51), supersedes [#499](https://github.com/zarr-developers/VirtualiZarr/pull/499). By [Tom Nicholas](https://github.com/TomNicholas). -- Slicing along axis 0 of an uncompressed `ManifestArray` (only the v3 `BytesCodec`, no other codecs) can now sub-divide a chunk — the result is a new reference into the same source file with a bumped byte offset and a smaller length. Useful for picking out a single timestep from a multi-row chunk produced by, e.g., a netCDF3 parser, without rechunking. Limited to slices that fit within one source chunk; multi-chunk-spanning sub-chunk slices, `step != 1`, and sub-chunk indexing on other axes still raise `SubChunkIndexingError`. Addresses part of [#86](https://github.com/zarr-developers/VirtualiZarr/issues/86). +- Slicing along the largest-stride storage axis of an uncompressed `ManifestArray` can now sub-divide a chunk — the result is a new reference into the same source file with a bumped byte offset and a smaller length. Eligible codec stacks are `[BytesCodec]` (C-order; the axis is axis 0) and `[TransposeCodec(order=...), BytesCodec]` (e.g. F-order with `order=(N-1, ..., 0)`, where the axis is `order[0]` — typically the last axis). Useful for picking a single timestep from a multi-row chunk produced by, e.g., the netCDF3 parser, without rechunking. Limited to slices that fit within one source chunk; multi-chunk-spanning sub-chunk slices, `step != 1`, and sub-chunk indexing on other axes still raise `SubChunkIndexingError`. 
Addresses part of [#86](https://github.com/zarr-developers/VirtualiZarr/issues/86). By [Tom Nicholas](https://github.com/TomNicholas). ### Bug fixes diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 571ba926..8de9ae5f 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -242,10 +242,12 @@ def __getitem__( - ``int`` — drops the indexed axis, following numpy / array-API semantics. Only legal when ``chunk_size == 1`` along that axis; otherwise picking a single element would require splitting a chunk. - - Slice along axis 0 of an **uncompressed** array (only the v3 ``BytesCodec``, no other - codecs) that fits entirely within one source chunk — handled by rewriting the chunk - reference's byte offset/length rather than splitting bytes. Useful for picking a - single timestep from a multi-row chunk on a parser like the netCDF3 one. + - Slice along the largest-stride storage axis of an **uncompressed** array that fits + entirely within one source chunk — handled by rewriting the chunk reference's byte + offset/length rather than splitting bytes. Useful for picking a single timestep from + a multi-row chunk on a parser like the netCDF3 one. The eligible-axis is axis 0 for + a plain ``[BytesCodec]`` array (C-order) or axis ``order[0]`` of a prepended + ``[TransposeCodec(order=...), BytesCodec]`` (e.g. the last axis for F-order). Anything else — fancy indexing with arrays, misaligned slices, ``step != 1`` — raises ``SubChunkIndexingError`` or ``NotImplementedError``. 
diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index 767f714a..7f478cc9 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -2,7 +2,7 @@ from typing import TYPE_CHECKING, Any, TypeAlias, cast import numpy as np -from zarr.codecs import BytesCodec +from zarr.codecs import BytesCodec, TransposeCodec from zarr.core.metadata.v3 import ArrayV3Metadata from virtualizarr.manifests.array_api import expand_dims @@ -174,35 +174,37 @@ def apply_selection( raise TypeError(f"Invalid indexer type: {indexer_1d}") narrowed_indexers.append(indexer_1d) - uncompressed = _is_uncompressed_bytes_only(marr.metadata) + sub_chunk_axis = _uncompressed_sub_chunk_axis(marr.metadata) new_shape: list[int] = [] new_chunks: list[int] = [] chunk_grid_selectors: list[int | slice] = [] kept_axes: list[int] = [] - # At most one sub-chunk axis (axis 0). The byte adjustment is uniform across every - # surviving chunk, since chunks share layout. + # At most one sub-chunk axis (whichever axis has the largest byte stride in storage). + # The byte adjustment is uniform across every surviving chunk, since chunks share layout. sub_chunk_byte_adjust: tuple[int, int] | None = None for axis, (axis_length, chunk_size, indexer_1d) in enumerate( zip(marr.shape, marr.chunks, narrowed_indexers, strict=True) ): chunk_grid_selector: int | slice - if ( - axis == 0 - and uncompressed - and _is_axis_0_sub_chunk_slice(indexer_1d, axis_length, chunk_size) + if axis == sub_chunk_axis and _is_sub_chunk_slice( + indexer_1d, axis_length, chunk_size ): - # Sub-chunk axis-0 slicing on an uncompressed array: contained in a single - # source chunk, expressed as an integer byte-offset adjustment. + # Sub-chunk slicing on an uncompressed array along its largest-stride axis: + # slice fits within one source chunk, expressed as a byte-offset adjustment. 
assert isinstance(indexer_1d, slice) # narrowed by the helper above start, stop, _ = indexer_1d.indices(axis_length) chunk_index = start // chunk_size chunk_grid_selector = slice(chunk_index, chunk_index + 1, 1) new_axis_length = stop - start new_chunks_for_axis = new_axis_length + # bytes per index step along this axis within one chunk = + # product of all chunk sizes for the other axes (regardless of their order in + # storage — multiplication is commutative) times itemsize. stride_bytes = ( - int(np.prod(marr.chunks[1:])) * marr.dtype.itemsize - ) # bytes per row along axis 0 within one chunk + int(np.prod([c for i, c in enumerate(marr.chunks) if i != axis])) + * marr.dtype.itemsize + ) sub_chunk_byte_adjust = ( (start - chunk_index * chunk_size) * stride_bytes, new_axis_length * stride_bytes, @@ -223,8 +225,10 @@ def apply_selection( chunk_grid_selectors_tuple = tuple(chunk_grid_selectors) - # short-circuit if every axis selects the whole chunk grid via a slice (a no-op) - if all( + # short-circuit if every axis selects the whole chunk grid via a slice (a no-op). + # A pending sub-chunk byte adjustment is real work even if its single source chunk + # happens to span the whole chunk grid along that axis, so don't short-circuit then. + if sub_chunk_byte_adjust is None and all( isinstance(cgs, slice) and cgs == slice(0, dim, 1) for cgs, dim in zip(chunk_grid_selectors_tuple, marr.manifest.shape_chunk_grid) ): @@ -359,27 +363,41 @@ def _subset_manifest( ) -def _is_uncompressed_bytes_only(metadata: ArrayV3Metadata) -> bool: +def _uncompressed_sub_chunk_axis(metadata: ArrayV3Metadata) -> int | None: """ - True iff ``metadata.codecs`` is exactly one ``BytesCodec`` and nothing else. - - BytesCodec only encodes endianness; with no other codecs, the chunk bytes on disk are - the raw little/big-endian element values in C-order. That lets us treat sub-chunk slices - along axis 0 as a byte-offset adjustment into the same chunk file. 
Any array-array codec - (value transform) or bytes-bytes codec (compressor / checksum) requires decoding the - whole chunk and is therefore not eligible. + Return the axis along which sub-chunk slicing is implementable for this array, or + ``None`` if the codec stack disqualifies it. + + Sub-chunk slicing rewrites an existing chunk reference's byte offset and length, + so it only works when chunk bytes are raw element values in a fixed memory order — + i.e., no compression, no value transforms, no checksums. The eligible codec stacks + are: + + - ``[BytesCodec]`` — C-order layout; the axis with the largest byte stride is axis 0. + - ``[TransposeCodec(order=perm), BytesCodec]`` — stored layout is the logical array + permuted by ``perm``; the axis with the largest byte stride in storage is logical + axis ``perm[0]``. For the F-order case ``perm = (n-1, n-2, ..., 0)`` this picks out + the last axis. """ codecs = metadata.codecs - return len(codecs) == 1 and isinstance(codecs[0], BytesCodec) + if len(codecs) == 1 and isinstance(codecs[0], BytesCodec): + return 0 + if ( + len(codecs) == 2 + and isinstance(codecs[0], TransposeCodec) + and isinstance(codecs[1], BytesCodec) + ): + return int(codecs[0].order[0]) + return None -def _is_axis_0_sub_chunk_slice( +def _is_sub_chunk_slice( indexer_1d: int | slice, axis_length: int, chunk_size: int ) -> bool: """ - True iff this is a slice that should take the axis-0 sub-chunk path: step == 1, - non-empty, fits entirely within one source chunk along axis 0, and is NOT already - chunk-aligned (chunk-aligned slices go through the simpler aligned path). + True iff this is a slice that should take the sub-chunk path: step == 1, non-empty, + fits entirely within one source chunk, and is NOT already chunk-aligned (chunk-aligned + slices go through the simpler aligned path). 
""" if not isinstance(indexer_1d, slice): return False diff --git a/virtualizarr/tests/test_manifests/test_array.py b/virtualizarr/tests/test_manifests/test_array.py index 15c809e2..5913cdbc 100644 --- a/virtualizarr/tests/test_manifests/test_array.py +++ b/virtualizarr/tests/test_manifests/test_array.py @@ -934,13 +934,20 @@ def test_misaligned_with_chunks(self, manifest_array, in_shape, in_chunks, index class TestSubChunkSlicingUncompressed: - # For an uncompressed C-order array, sub-chunk slicing along axis 0 (the contiguous - # axis in memory layout) can be expressed purely as a byte-offset/length adjustment - # into the same source file. Issue #86. Scope: slice fully contained within one - # original chunk along axis 0. Anything else still raises SubChunkIndexingError. + # For an uncompressed array, sub-chunk slicing along the axis with the largest byte + # stride in storage can be expressed purely as a byte-offset/length adjustment into + # the same source file. Issue #86. Scope: slice fully contained within one source + # chunk along that axis. The eligible-axis is axis 0 for the plain BytesCodec case + # (C-order) or axis ``order[0]`` when a TransposeCodec is prepended. BYTES_CODEC = {"name": "bytes", "configuration": {"endian": "little"}} ZLIB_CODEC = {"name": "numcodecs.zlib", "configuration": {"level": 1}} + # Reversed transpose makes the LAST logical axis the largest-stride axis in storage, + # so this is the F-order equivalent of [BytesCodec]. + F_ORDER_2D_CODECS = [ + {"name": "transpose", "configuration": {"order": [1, 0]}}, + BYTES_CODEC, + ] def _uncompressed_marr( self, @@ -973,8 +980,7 @@ def _uncompressed_marr( @pytest.mark.parametrize( "shape, chunks, indexer, expected_shape, expected_chunks, expected_entries", [ - # 1D, single chunk: marr[1:3] on shape=(8,) chunks=(4,) — sits in chunk 0. - # itemsize=8, stride along axis 0 = 8. Offset += 1*8, length = 2*8. 
+ # chunk size = 32 bytes, axis-0 stride = 8 bytes ( (8,), (4,), @@ -983,7 +989,6 @@ def _uncompressed_marr( (2,), {"0": {"offset": 1000 + 8, "length": 16}}, ), - # 1D, second chunk: marr[5:7] -> chunk 1 base (1000+32), inner offset (5-4)*8. ( (8,), (4,), @@ -992,8 +997,7 @@ def _uncompressed_marr( (2,), {"0": {"offset": 1000 + 32 + 8, "length": 16}}, ), - # 2D, axis 0 sub-chunk, axis 1 covers the only chunk-col. - # shape=(8, 4) chunks=(4, 4): per-chunk = 4*4*8 = 128 bytes; axis-0 stride = 4*8 = 32. + # chunk size = 128 bytes, axis-0 stride = 32 bytes ( (8, 4), (4, 4), @@ -1002,8 +1006,7 @@ def _uncompressed_marr( (2, 4), {"0.0": {"offset": 1000 + 32, "length": 64}}, ), - # 2D, axis 0 sub-chunk + axis 1 chunk-aligned across multiple chunks. - # shape=(8, 6) chunks=(4, 3): per-chunk = 4*3*8 = 96 bytes; axis-0 stride = 3*8 = 24. + # chunk size = 96 bytes, axis-0 stride = 24 bytes ( (8, 6), (4, 3), @@ -1068,6 +1071,77 @@ def test_sub_chunk_slice_on_compressed_array_raises(self, array_v3_metadata): with pytest.raises(SubChunkIndexingError, match="split individual chunks"): marr[1:3, :] + # F-order tests — TransposeCodec(order=(1, 0)) before BytesCodec means the + # largest-stride axis in storage is logical axis 1 (the LAST axis), so sub-chunk + # slicing applies there instead of axis 0. 
+ + @pytest.mark.parametrize( + "shape, chunks, indexer, expected_shape, expected_chunks, expected_entries", + [ + # chunk size = 128 bytes, axis-1 stride = 32 bytes + ( + (8, 4), + (4, 4), + np.s_[:, 1:3], + (8, 2), + (4, 2), + { + "0.0": {"offset": 1000 + 32, "length": 64}, + "1.0": {"offset": 1128 + 32, "length": 64}, + }, + ), + # chunk size = 96 bytes, axis-1 stride = 24 bytes + ( + (6, 8), + (3, 4), + np.s_[:, 5:7], + (6, 2), + (3, 2), + { + "0.0": {"offset": 1000 + 96 + 24, "length": 48}, + "1.0": {"offset": 1000 + 96 + 96 + 96 + 24, "length": 48}, + }, + ), + ], + ) + def test_f_order_sub_chunk_on_last_axis( + self, + array_v3_metadata, + shape, + chunks, + indexer, + expected_shape, + expected_chunks, + expected_entries, + ): + marr = self._uncompressed_marr( + array_v3_metadata, + shape=shape, + chunks=chunks, + codecs=self.F_ORDER_2D_CODECS, + ) + + sliced = marr[indexer] + + assert sliced.shape == expected_shape + assert sliced.chunks == expected_chunks + expected = { + k: {"path": "file:///foo.nc", **v} for k, v in expected_entries.items() + } + assert sliced.manifest.dict() == expected + + def test_f_order_sub_chunk_on_axis_0_raises(self, array_v3_metadata): + # In F-order, axis 0 is the fastest-changing-in-memory axis, so slicing it + # gives interleaved (non-contiguous) bytes. Out of scope. + marr = self._uncompressed_marr( + array_v3_metadata, + shape=(8, 4), + chunks=(4, 4), + codecs=self.F_ORDER_2D_CODECS, + ) + with pytest.raises(SubChunkIndexingError, match="split individual chunks"): + marr[1:3, :] + def test_to_xarray(array_v3_metadata): chunks = (5, 10) From db271415be76ad08342f78c35b4069347524b0d8 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 14:06:32 -0400 Subject: [PATCH 3/6] refactor: extract _compute_sub_chunk_axis_selection helper The sub-chunk branch in apply_selection had inline arithmetic mixed with the surrounding dispatch logic. 
Pull it out alongside the other _compute_* helpers so apply_selection's per-axis loop reads as a straight two-way choice between chunk-aligned and sub-chunk paths. --- virtualizarr/manifests/indexing.py | 62 +++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index 7f478cc9..fc859bb6 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -190,25 +190,19 @@ def apply_selection( if axis == sub_chunk_axis and _is_sub_chunk_slice( indexer_1d, axis_length, chunk_size ): - # Sub-chunk slicing on an uncompressed array along its largest-stride axis: - # slice fits within one source chunk, expressed as a byte-offset adjustment. assert isinstance(indexer_1d, slice) # narrowed by the helper above - start, stop, _ = indexer_1d.indices(axis_length) - chunk_index = start // chunk_size - chunk_grid_selector = slice(chunk_index, chunk_index + 1, 1) - new_axis_length = stop - start - new_chunks_for_axis = new_axis_length - # bytes per index step along this axis within one chunk = - # product of all chunk sizes for the other axes (regardless of their order in - # storage — multiplication is commutative) times itemsize. 
- stride_bytes = ( - int(np.prod([c for i, c in enumerate(marr.chunks) if i != axis])) - * marr.dtype.itemsize - ) - sub_chunk_byte_adjust = ( - (start - chunk_index * chunk_size) * stride_bytes, - new_axis_length * stride_bytes, + chunk_grid_selector, new_axis_length, sub_chunk_byte_adjust = ( + _compute_sub_chunk_axis_selection( + indexer_1d, + axis_length=axis_length, + chunk_size=chunk_size, + other_axis_chunks=tuple( + c for i, c in enumerate(marr.chunks) if i != axis + ), + itemsize=marr.dtype.itemsize, + ) ) + new_chunks_for_axis = new_axis_length else: chunk_grid_selector, new_axis_length = _compute_chunk_aligned_selection_1d( indexer_1d, axis_length=axis_length, chunk_size=chunk_size @@ -217,7 +211,7 @@ def apply_selection( chunk_grid_selectors.append(chunk_grid_selector) # int indexers drop the axis from the output array; slices preserve it (including the - # sub-chunk path above, which uses a length-1 chunk-grid slice selector). + # sub-chunk path, which uses a length-1 chunk-grid slice selector). if not isinstance(indexer_1d, int): new_shape.append(new_axis_length) new_chunks.append(new_chunks_for_axis) @@ -303,6 +297,38 @@ def _compute_chunk_aligned_selection_1d( return slice(chunk_start, chunk_stop, 1), stop - start +def _compute_sub_chunk_axis_selection( + indexer_1d: slice, + axis_length: int, + chunk_size: int, + other_axis_chunks: tuple[int, ...], + itemsize: int, +) -> tuple[slice, int, tuple[int, int]]: + """ + Translate a sub-chunk slice along the eligible (largest-stride) storage axis into a + chunk-grid selector, an output axis length, and a uniform byte adjustment + ``(offset_delta, new_chunk_byte_length)`` applied to every surviving chunk reference. + + Callers must have already confirmed that this slice is sub-chunk-eligible via + ``_is_sub_chunk_slice`` and that the array is uncompressed via + ``_uncompressed_sub_chunk_axis``. 
+ """ + start, stop, _ = indexer_1d.indices(axis_length) + chunk_index = start // chunk_size + new_axis_length = stop - start + # Bytes per index step along this axis within one chunk is the product of every + # *other* axis's chunk size, times itemsize. Order doesn't matter since the product + # is commutative. + stride_bytes = int(np.prod(other_axis_chunks)) * itemsize + inner_offset_bytes = (start - chunk_index * chunk_size) * stride_bytes + sub_chunk_byte_adjust = (inner_offset_bytes, new_axis_length * stride_bytes) + return ( + slice(chunk_index, chunk_index + 1, 1), + new_axis_length, + sub_chunk_byte_adjust, + ) + + def _subset_manifest( manifest: ChunkManifest, chunk_grid_selectors: tuple[int | slice, ...] ) -> ChunkManifest: From 14118c1b0ca72550d05e5d1e6f054a6c0b0b005d Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 14:11:00 -0400 Subject: [PATCH 4/6] refactor: narrow _is_sub_chunk_slice's return type with TypeGuard Removes the explicit isinstance assert at the call site: TypeGuard[slice] tells mypy that when the function returns True, its int|slice argument is in fact a slice. _compute_sub_chunk_axis_selection then accepts the narrowed value directly. Pure typing-level change, no runtime effect. 
--- virtualizarr/manifests/indexing.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/virtualizarr/manifests/indexing.py b/virtualizarr/manifests/indexing.py index fc859bb6..3c8255ba 100644 --- a/virtualizarr/manifests/indexing.py +++ b/virtualizarr/manifests/indexing.py @@ -1,5 +1,5 @@ from types import EllipsisType -from typing import TYPE_CHECKING, Any, TypeAlias, cast +from typing import TYPE_CHECKING, Any, TypeAlias, TypeGuard, cast import numpy as np from zarr.codecs import BytesCodec, TransposeCodec @@ -190,7 +190,6 @@ def apply_selection( if axis == sub_chunk_axis and _is_sub_chunk_slice( indexer_1d, axis_length, chunk_size ): - assert isinstance(indexer_1d, slice) # narrowed by the helper above chunk_grid_selector, new_axis_length, sub_chunk_byte_adjust = ( _compute_sub_chunk_axis_selection( indexer_1d, @@ -419,11 +418,14 @@ def _uncompressed_sub_chunk_axis(metadata: ArrayV3Metadata) -> int | None: def _is_sub_chunk_slice( indexer_1d: int | slice, axis_length: int, chunk_size: int -) -> bool: +) -> TypeGuard[slice]: """ True iff this is a slice that should take the sub-chunk path: step == 1, non-empty, fits entirely within one source chunk, and is NOT already chunk-aligned (chunk-aligned slices go through the simpler aligned path). + + Typed as ``TypeGuard[slice]`` so callers can pass the narrowed indexer straight into + helpers that take a ``slice``. """ if not isinstance(indexer_1d, slice): return False From ae5c12f014711c3f167a8d6bbbd600e22960dcd8 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 14:41:52 -0400 Subject: [PATCH 5/6] test: end-to-end netCDF3 sub-chunk slice through icechunk Reads a 2D netCDF3 file, .isel's within the single source chunk on axis 0, writes to an in-memory icechunk store, reads back via xr.open_zarr, and asserts byte-equality against a direct xr.open_dataset + .isel on the same file. 
Exercises the sub-chunk byte-adjustment path through the parser, accessor, and store layers in one go. --- virtualizarr/tests/test_integration.py | 50 +++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index 8e4d6bc8..c5f0d1c9 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -18,7 +18,7 @@ ManifestStore, ) from virtualizarr.manifests.utils import create_v3_array_metadata -from virtualizarr.parsers import HDFParser, ZarrParser +from virtualizarr.parsers import HDFParser, NetCDF3Parser, ZarrParser from virtualizarr.parsers.kerchunk.translator import manifestgroup_from_kerchunk_refs from virtualizarr.tests import ( has_fastparquet, @@ -26,6 +26,7 @@ has_kerchunk, requires_kerchunk, requires_network, + requires_scipy, requires_zarr_python, slow_test, ) @@ -551,3 +552,50 @@ def test_roundtrip_dataset_with_multiple_compressors(): ) as observed, ): xr.testing.assert_allclose(expected, observed) + + +@requires_scipy +@requires_zarr_python +@pytest.mark.skipif(not has_icechunk, reason="icechunk not available") +def test_subchunk_slice_netcdf3_through_icechunk_roundtrip(tmp_path, local_registry): + # End-to-end check of the uncompressed sub-chunk slicing path: parse a netCDF3 + # file (whose variables become single multi-row chunks), .isel within a chunk, + # write to icechunk, read back, and confirm bytes match a direct netCDF3 read + + # slice on the same file. + data = np.arange(32, dtype=np.float64).reshape(8, 4) + ds = xr.Dataset({"foo": (["time", "x"], data)}) + nc_path = tmp_path / "data.nc" + ds.to_netcdf(nc_path, format="NETCDF3_CLASSIC") + nc_url = f"file://{nc_path}" + + with open_virtual_dataset( + url=nc_url, parser=NetCDF3Parser(), registry=local_registry + ) as vds: + # netCDF3 puts all rows in one source chunk; this slice is sub-chunk on axis 0. 
+ sliced_vds = vds.isel(time=slice(1, 3)) + + storage = icechunk.Storage.new_in_memory() + config = icechunk.RepositoryConfig.default() + container = icechunk.VirtualChunkContainer( + url_prefix=PYTEST_TMP_DIRECTORY_URL_PREFIX, + store=icechunk.local_filesystem_store(PYTEST_TMP_DIRECTORY_URL_PREFIX), + ) + config.set_virtual_chunk_container(container) + repo = icechunk.Repository.create( + storage=storage, + config=config, + authorize_virtual_chunk_access={PYTEST_TMP_DIRECTORY_URL_PREFIX: None}, + ) + session = repo.writable_session("main") + sliced_vds.vz.to_icechunk(session.store) + session.commit("sub-chunk slice") + + read_session = repo.readonly_session("main") + with ( + xr.open_zarr( + read_session.store, zarr_format=3, consolidated=False + ) as roundtripped, + xr.open_dataset(nc_path) as direct, + ): + expected = direct.isel(time=slice(1, 3)) + xrt.assert_identical(roundtripped.load(), expected.load()) From 20d4e6584994d3dbf8d1caf667395ada045ec708 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 18 May 2026 14:49:09 -0400 Subject: [PATCH 6/6] test: tidy fixtures and decorators around the sub-chunk integration test Three small cleanups around test_integration.py: - Move the netcdf3_file fixture from test_parsers/conftest.py to the root conftest.py and turn it into a factory that accepts a Dataset, so both the existing parser test and the new integration test can share it. - Replace pytest.mark.skipif(not has_icechunk, ...) with the existing @requires_icechunk decorator from virtualizarr.tests. - Drop @requires_zarr_python and its definition: zarr is a hard project dependency, so the decorator is a no-op import-skip that misleadingly suggests zarr is optional. 
--- conftest.py | 14 ++++++++++++++ virtualizarr/tests/__init__.py | 1 - virtualizarr/tests/test_integration.py | 14 ++++++-------- virtualizarr/tests/test_parsers/conftest.py | 10 ---------- virtualizarr/tests/test_parsers/test_netcdf3.py | 2 +- 5 files changed, 21 insertions(+), 20 deletions(-) diff --git a/conftest.py b/conftest.py index 11026ed8..4e6d7694 100644 --- a/conftest.py +++ b/conftest.py @@ -80,6 +80,20 @@ def local_registry(): return ObjectStoreRegistry({"file://": LocalStore()}) +@pytest.fixture +def netcdf3_file(tmp_path: Path): + """Factory for writing a temporary netCDF3 file with a caller-supplied Dataset.""" + + def _make(ds: xr.Dataset | None = None, name: str = "file.nc") -> Path: + if ds is None: + ds = xr.Dataset({"foo": ("x", np.array([1, 2, 3]))}) + filepath = tmp_path / name + ds.to_netcdf(filepath, format="NETCDF3_CLASSIC") + return filepath + + return _make + + @pytest.fixture(params=["int8", "uint8", "float32"]) def zarr_array_fill_value(request): store = zarr.storage.MemoryStore() diff --git a/virtualizarr/tests/__init__.py b/virtualizarr/tests/__init__.py index 093ffdf5..16cf2fd6 100644 --- a/virtualizarr/tests/__init__.py +++ b/virtualizarr/tests/__init__.py @@ -38,7 +38,6 @@ def _importorskip( has_tifffile, requires_tifffile = _importorskip("tifffile") has_imagecodecs, requires_imagecodecs = _importorskip("imagecodecs") has_hdf5plugin, requires_hdf5plugin = _importorskip("hdf5plugin") -has_zarr_python, requires_zarr_python = _importorskip("zarr") has_dask, requires_dask = _importorskip("dask") has_obstore, requires_obstore = _importorskip("obstore") has_tiff, requires_tiff = _importorskip("virtual_tiff") diff --git a/virtualizarr/tests/test_integration.py b/virtualizarr/tests/test_integration.py index c5f0d1c9..9a7f3554 100644 --- a/virtualizarr/tests/test_integration.py +++ b/virtualizarr/tests/test_integration.py @@ -24,10 +24,10 @@ has_fastparquet, has_icechunk, has_kerchunk, + requires_icechunk, requires_kerchunk, 
requires_network, requires_scipy, - requires_zarr_python, slow_test, ) from virtualizarr.tests.utils import PYTEST_TMP_DIRECTORY_URL_PREFIX @@ -180,7 +180,6 @@ def roundtrip_as_in_memory_icechunk( ) -@requires_zarr_python @pytest.mark.parametrize( "roundtrip_func", [ @@ -555,17 +554,16 @@ def test_roundtrip_dataset_with_multiple_compressors(): @requires_scipy -@requires_zarr_python -@pytest.mark.skipif(not has_icechunk, reason="icechunk not available") -def test_subchunk_slice_netcdf3_through_icechunk_roundtrip(tmp_path, local_registry): +@requires_icechunk +def test_subchunk_slice_netcdf3_through_icechunk_roundtrip( + netcdf3_file, local_registry +): # End-to-end check of the uncompressed sub-chunk slicing path: parse a netCDF3 # file (whose variables become single multi-row chunks), .isel within a chunk, # write to icechunk, read back, and confirm bytes match a direct netCDF3 read + # slice on the same file. data = np.arange(32, dtype=np.float64).reshape(8, 4) - ds = xr.Dataset({"foo": (["time", "x"], data)}) - nc_path = tmp_path / "data.nc" - ds.to_netcdf(nc_path, format="NETCDF3_CLASSIC") + nc_path = netcdf3_file(xr.Dataset({"foo": (["time", "x"], data)})) nc_url = f"file://{nc_path}" with open_virtual_dataset( diff --git a/virtualizarr/tests/test_parsers/conftest.py b/virtualizarr/tests/test_parsers/conftest.py index 23d30a4f..d1fe22ef 100644 --- a/virtualizarr/tests/test_parsers/conftest.py +++ b/virtualizarr/tests/test_parsers/conftest.py @@ -366,16 +366,6 @@ def root_coordinates_hdf5_file(tmp_path: Path, np_uncompressed_int16) -> str: return filepath -@pytest.fixture -def netcdf3_file(tmp_path: Path) -> Path: - ds = xr.Dataset({"foo": ("x", np.array([1, 2, 3]))}) - - filepath = tmp_path / "file.nc" - ds.to_netcdf(filepath, format="NETCDF3_CLASSIC") - - return filepath - - @pytest.fixture def non_coord_dim(tmpdir): filepath = f"{tmpdir}/non_coord_dim.nc" diff --git a/virtualizarr/tests/test_parsers/test_netcdf3.py 
b/virtualizarr/tests/test_parsers/test_netcdf3.py index 87d24f60..bd4dc09a 100644 --- a/virtualizarr/tests/test_parsers/test_netcdf3.py +++ b/virtualizarr/tests/test_parsers/test_netcdf3.py @@ -10,7 +10,7 @@ @requires_scipy def test_read_netcdf3(netcdf3_file, array_v3_metadata, local_registry): - filepath = str(netcdf3_file) + filepath = str(netcdf3_file()) url = f"file://{filepath}" parser = NetCDF3Parser() with (