diff --git a/conftest.py b/conftest.py index 11026ed8..ee242e12 100644 --- a/conftest.py +++ b/conftest.py @@ -19,7 +19,6 @@ from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.manifests.manifest import join from virtualizarr.manifests.utils import create_v3_array_metadata -from virtualizarr.utils import ceildiv # Pytest configuration @@ -133,10 +132,9 @@ def _generate_chunk_entries( dict Mapping of chunk keys to entry dictionaries """ - chunk_grid_shape = tuple( - ceildiv(axis_length, chunk_length) - for axis_length, chunk_length in zip(shape, chunks) - ) + from zarr.experimental import ChunkGrid + + chunk_grid_shape = ChunkGrid.from_sizes(shape, chunks).grid_shape if chunk_grid_shape == (): return {"0": entry_generator((0,), (0,), itemsize)} diff --git a/pyproject.toml b/pyproject.toml index 834cab0d..fa5a5ce3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -134,6 +134,7 @@ dev = [ "s3fs", "lithops", "dask", + "tifffile>=2026.2.16,<2027" ] [project.urls] @@ -182,6 +183,11 @@ h5netcdf = ">=1.5.0,<2" [tool.pixi.feature.icechunk-dev.dependencies] rust = "*" +[tool.pixi.feature.rectilinear.pypi-dependencies] +zarr = { git = "https://github.com/zarr-developers/zarr-python.git", branch = "main" } +xarray = { git = "https://github.com/maxrjones/xarray.git", branch = "poc/unified-zarr-chunk-grid" } +virtual-tiff = { git = "https://github.com/virtual-zarr/virtual-tiff.git", branch = "poc/unified-chunk-grid" } + [tool.pixi.feature.minimum-versions.dependencies] xarray = "==2025.6.0" numpy = "==2.1.0" @@ -214,6 +220,7 @@ test-py314 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "ke minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "py314", "zarr", "minio"] minimum-versions = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr","minimum-versions", "py312"] upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev", "zarr", "py313"] +rectilinear = ["dev", "test", "tiff", "hdf5-lib", "rectilinear", "py313"] all = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "all_parsers", "all_writers", "py313"] docs = ["docs", "dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff","zarr", "py313"] diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 511e2972..676a53c7 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -1,9 +1,10 @@ import warnings -from typing import TYPE_CHECKING, Any, Callable, Union, cast +from typing import Any, Callable, Union, cast import numpy as np import xarray as xr from zarr.core.metadata.v3 import ArrayV3Metadata +from zarr.experimental import ChunkGrid import virtualizarr.manifests.utils as utils from virtualizarr.manifests.array_api import ( @@ -14,20 +15,6 @@ from virtualizarr.manifests.manifest import ChunkManifest from virtualizarr.manifests.utils import ChunkKeySeparator -# Type-check against the min-deps name (RegularChunkGrid) while keeping a -# runtime try/except so both zarr-python <=3.1.6 and >3.1.6 work. Version -# sniffing is unreliable under hatch-vcs when installed from a git source -# without fetched tags. -if TYPE_CHECKING: - from zarr.core.metadata.v3 import RegularChunkGrid as RegularChunkGridMetadata -else: - try: - from zarr.core.metadata.v3 import RegularChunkGridMetadata # zarr-python>3.1.6 - except ImportError: - from zarr.core.metadata.v3 import ( - RegularChunkGrid as RegularChunkGridMetadata, # zarr-python<=3.1.6 - ) - class ManifestArray: """ @@ -64,11 +51,6 @@ def __init__( # try unpacking the dict _metadata = ArrayV3Metadata(**metadata) - if not isinstance(_metadata.chunk_grid, RegularChunkGridMetadata): - raise NotImplementedError( - f"Only RegularChunkGrid is currently supported for chunk size, but got type {type(_metadata.chunk_grid)}" - ) - if isinstance(chunkmanifest, ChunkManifest): _chunkmanifest = chunkmanifest elif isinstance(chunkmanifest, dict): @@ -97,11 +79,22 @@ def metadata(self) -> ArrayV3Metadata: return self._metadata @property - def chunks(self) -> tuple[int, ...]: + def chunk_grid(self) -> ChunkGrid: + """Behavioral chunk grid bound to this array's shape.""" + return ChunkGrid.from_metadata(self._metadata) + + @property + def chunks(self) -> tuple[int, ...] | tuple[tuple[int, ...], ...]: """ Individual chunk size by number of elements. + + For regular grids, returns a tuple of ints (e.g., (30, 50)). + For rectilinear grids, returns a tuple of tuples (e.g., ((10, 20, 30), (50, 50))). """ - return self._metadata.chunks + grid = self.chunk_grid + if grid.is_regular: + return grid.chunk_shape + return grid.chunk_sizes @property def dtype(self) -> np.dtype: diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index e3ac8525..01942fe4 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -2,8 +2,7 @@ from typing import TYPE_CHECKING, Any, Callable, Union, cast import numpy as np - -from virtualizarr.utils import determine_chunk_grid_shape +from zarr.experimental import ChunkGrid from .manifest import ChunkManifest from .utils import ( @@ -102,8 +101,17 @@ def concatenate( [arr.manifest for arr in arrays], axis=axis ) + # For rectilinear grids, concatenate chunk edges along the concat axis + new_chunks = None + if not first_arr.chunk_grid.is_regular: + new_chunks = list(first_arr.chunks) + concat_edges: tuple[int, ...] = () + for arr in arrays: + concat_edges = concat_edges + arr.chunks[axis] # type: ignore[index] + new_chunks[axis] = concat_edges + new_metadata = copy_and_replace_metadata( - old_metadata=first_arr.metadata, new_shape=new_shape + old_metadata=first_arr.metadata, new_shape=new_shape, new_chunks=new_chunks ) return ManifestArray(chunkmanifest=concatenated_manifest, metadata=new_metadata) @@ -151,7 +159,11 @@ def stack( # chunk shape has changed because a length-1 axis has been inserted old_chunks = first_arr.chunks new_chunks = list(old_chunks) - new_chunks.insert(axis, 1) + # For rectilinear grids, each element is a sequence; insert a single-element tuple + if not first_arr.chunk_grid.is_regular: + new_chunks.insert(axis, (1,)) + else: + new_chunks.insert(axis, 1) new_metadata = copy_and_replace_metadata( old_metadata=first_arr.metadata, new_shape=new_shape, new_chunks=new_chunks @@ -187,22 +199,21 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra # new chunk_shape is old chunk_shape with singleton dimensions prepended # (chunk shape can never change by more than adding length-1 axes because each chunk represents a fixed number of array elements) - old_chunk_shape = x.chunks + # broadcast_to only applies to regular chunk grids + old_chunk_shape: tuple[int, ...] = x.chunks # type: ignore[assignment] new_chunk_shape = _prepend_singleton_dimensions( old_chunk_shape, ndim=len(new_shape) ) - # find new chunk grid shape by dividing new array shape by new chunk shape - new_chunk_grid_shape = determine_chunk_grid_shape(new_shape, new_chunk_shape) - - # do broadcasting of entries in manifest - broadcasted_manifest = _broadcast_manifest(x.manifest, shape=new_chunk_grid_shape) - new_metadata = copy_and_replace_metadata( old_metadata=x.metadata, new_shape=list(new_shape), new_chunks=list(new_chunk_shape), ) + new_chunk_grid_shape = ChunkGrid.from_metadata(new_metadata).grid_shape + + # do broadcasting of entries in manifest + broadcasted_manifest = _broadcast_manifest(x.manifest, shape=new_chunk_grid_shape) return ManifestArray(chunkmanifest=broadcasted_manifest, metadata=new_metadata) diff --git a/virtualizarr/manifests/utils.py b/virtualizarr/manifests/utils.py index 8e3fc874..6837c099 100644 --- a/virtualizarr/manifests/utils.py +++ b/virtualizarr/manifests/utils.py @@ -1,10 +1,12 @@ import functools import re import typing +from collections.abc import Sequence from typing import TYPE_CHECKING, Any, Dict, Iterable, Literal, Optional, Union import numpy as np from zarr import Array +from zarr.core.chunk_grids import _is_rectilinear_chunks from zarr.core.chunk_key_encodings import ChunkKeyEncodingLike from zarr.core.metadata.v3 import ( ArrayV3Metadata, @@ -230,7 +232,7 @@ def check_same_codecs(codecs: list[Any]) -> None: ) -def check_same_chunk_shapes(chunks_list: list[tuple[int, ...]]) -> None: +def check_same_chunk_shapes(chunks_list: list[Sequence]) -> None: """Check all the chunk shapes are the same""" first_chunks, *other_chunks_list = chunks_list @@ -273,11 +275,17 @@ def _remove_elements_at_positions( def check_no_partial_chunks_on_concat_axis( - shapes: list[tuple[int, ...]], chunks: list[tuple[int, ...]], axis: int + shapes: list[tuple[int, ...]], chunks: list, axis: int ): - """Check that there are no partial chunks along the concatenation axis""" - # loop over the arrays to be concatenated + """Check that there are no partial chunks along the concatenation axis. + + Only applies to regular chunk grids; rectilinear grids explicitly encode + variable chunk sizes so partial-chunk checks are not needed. + """ for i, (shape, chunk_shape) in enumerate(zip(shapes, chunks)): + # Rectilinear grids have sequences along each axis; skip the check + if _is_rectilinear_chunks(chunk_shape): + continue if shape[axis] % chunk_shape[axis] > 0: raise ValueError( "Cannot concatenate arrays with partial chunks because only regular chunk grids are currently supported. " @@ -347,7 +355,7 @@ def check_compatible_arrays( def copy_and_replace_metadata( old_metadata: ArrayV3Metadata, new_shape: list[int] | None = None, - new_chunks: list[int] | None = None, + new_chunks: list | None = None, new_dimension_names: Iterable[str] | None | Literal["default"] = "default", new_attributes: dict | None = None, ) -> ArrayV3Metadata: @@ -361,10 +369,19 @@ def copy_and_replace_metadata( if new_shape is not None: metadata_copy["shape"] = parse_shapelike(new_shape) # type: ignore[assignment] if new_chunks is not None: - metadata_copy["chunk_grid"] = { - "name": "regular", - "configuration": {"chunk_shape": tuple(new_chunks)}, - } + if _is_rectilinear_chunks(new_chunks): + metadata_copy["chunk_grid"] = { + "name": "rectilinear", + "configuration": { + "kind": "inline", + "chunk_shapes": [list(c) for c in new_chunks], + }, + } + else: + metadata_copy["chunk_grid"] = { + "name": "regular", + "configuration": {"chunk_shape": tuple(new_chunks)}, + } if new_dimension_names != "default": # need the option to use the literal string "default" as a sentinel value because None is a valid choice for zarr dimension_names metadata_copy["dimension_names"] = parse_dimension_names(new_dimension_names) diff --git a/virtualizarr/parsers/kerchunk/translator.py b/virtualizarr/parsers/kerchunk/translator.py index 6556e33d..f8f3f0bc 100644 --- a/virtualizarr/parsers/kerchunk/translator.py +++ b/virtualizarr/parsers/kerchunk/translator.py @@ -9,6 +9,7 @@ import ujson from zarr.core.common import JSON from zarr.core.metadata import ArrayV3Metadata +from zarr.experimental import ChunkGrid from virtualizarr.codecs import ( zarr_codec_config_to_v3, @@ -24,7 +25,6 @@ KerchunkArrRefs, KerchunkStoreRefs, ) -from virtualizarr.utils import determine_chunk_grid_shape def from_kerchunk_refs(decoded_arr_refs_zarray, zattrs) -> "ArrayV3Metadata": @@ -179,10 +179,7 @@ def manifestarray_from_kerchunk_refs( chunk_dict, metadata, zattrs = parse_array_refs(arr_refs) # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs if chunk_dict: - chunk_grid_shape = determine_chunk_grid_shape( - metadata.shape, - metadata.chunks, - ) + chunk_grid_shape = ChunkGrid.from_metadata(metadata).grid_shape manifest = manifest_from_kerchunk_chunk_dict( chunk_dict, fs_root=fs_root, shape=chunk_grid_shape ) @@ -191,10 +188,7 @@ def manifestarray_from_kerchunk_refs( # empty variables don't have physical chunks, but zarray shows that the variable # is at least 1D - shape = determine_chunk_grid_shape( - metadata.shape, - metadata.chunks, - ) + shape = ChunkGrid.from_metadata(metadata).grid_shape manifest = ChunkManifest(entries={}, shape=shape) marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) else: diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index b753a191..5676d10c 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -6,7 +6,7 @@ from collections.abc import Coroutine, Iterable from enum import Enum from pathlib import Path -from typing import TYPE_CHECKING, Any, TypeVar, cast +from typing import Any, TypeVar, cast import numpy as np import obstore @@ -14,6 +14,8 @@ from obspec_utils.registry import ObjectStoreRegistry from zarr.api.asynchronous import open_group as open_group_async from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata +from zarr.core.metadata.v3 import RegularChunkGridMetadata +from zarr.experimental import ChunkGrid from zarr.storage import ObjectStore from virtualizarr.manifests import ( @@ -26,21 +28,6 @@ validate_and_normalize_path_to_uri, ) from virtualizarr.manifests.utils import ChunkKeySeparator -from virtualizarr.utils import determine_chunk_grid_shape - -# Type-check against the min-deps name (RegularChunkGrid) while keeping a -# runtime try/except so both zarr-python <=3.1.6 and >3.1.6 work. Version -# sniffing is unreliable under hatch-vcs when installed from a git source -# without fetched tags. -if TYPE_CHECKING: - from zarr.core.metadata.v3 import RegularChunkGrid as RegularChunkGridMetadata -else: - try: - from zarr.core.metadata.v3 import RegularChunkGridMetadata # zarr-python>3.1.6 - except ImportError: - from zarr.core.metadata.v3 import ( - RegularChunkGrid as RegularChunkGridMetadata, # zarr-python<=3.1.6 - ) # obstore doesn't export a public base type for stores, so we use Any for now. ObstoreStore = Any @@ -282,7 +269,7 @@ async def construct_manifest_array( if not isinstance(array_v3_metadata.chunk_grid, RegularChunkGridMetadata): raise NotImplementedError( - f"Only RegularChunkGrid is supported, but array {zarr_array.path} " + f"Only RegularChunkGridMetadata is supported, but array {zarr_array.path} " f"uses {type(array_v3_metadata.chunk_grid).__name__}." ) @@ -396,12 +383,9 @@ async def build_chunk_manifest( missing, Zarr will return the fill_value for those regions when the array is read. """ - # For sharded arrays, chunk_grid.chunk_shape is the shard shape (not the inner - # chunk shape, which lives inside the ShardingCodec config). So this grid describes - # the number of shard files on disk, which is exactly what we want for the manifest. - chunk_grid_shape = determine_chunk_grid_shape( - metadata.shape, cast(RegularChunkGridMetadata, metadata.chunk_grid).chunk_shape - ) + # For sharded arrays, grid_shape reflects the number of shard files on disk, + # which is exactly what we want for the manifest. + chunk_grid_shape = ChunkGrid.from_metadata(metadata).grid_shape total_size = math.prod(chunk_grid_shape) # Handle scalar arrays diff --git a/virtualizarr/tests/conftest.py b/virtualizarr/tests/conftest.py index 55b08eee..f78bf178 100644 --- a/virtualizarr/tests/conftest.py +++ b/virtualizarr/tests/conftest.py @@ -2,6 +2,14 @@ import time import pytest +import zarr + + +@pytest.fixture(autouse=True) +def _enable_rectilinear_chunks(): + """Enable rectilinear chunks for all tests.""" + with zarr.config.set({"array.rectilinear_chunks": True}): + yield @pytest.fixture(scope="session") diff --git a/virtualizarr/tests/test_parsers/test_tiff.py b/virtualizarr/tests/test_parsers/test_tiff.py index cf0e110c..32b3bef2 100644 --- a/virtualizarr/tests/test_parsers/test_tiff.py +++ b/virtualizarr/tests/test_parsers/test_tiff.py @@ -1,10 +1,19 @@ +import numpy as np import pytest +import xarray as xr from obspec_utils.registry import ObjectStoreRegistry -from obstore.store import S3Store +from obstore.store import LocalStore, S3Store from xarray import Dataset, DataTree from virtualizarr import open_virtual_dataset, open_virtual_datatree -from virtualizarr.tests import requires_network, requires_tiff + +try: + from zarr.core.metadata.v3 import RectilinearChunkGrid # noqa: F401 + + has_rectilinear_chunk_grid_support = True +except ImportError: + has_rectilinear_chunk_grid_support = False +from virtualizarr.tests import requires_network, requires_tiff, requires_tifffile virtual_tiff = pytest.importorskip("virtual_tiff") @@ -40,3 +49,62 @@ def test_virtual_tiff_dataset() -> None: var = vds["0"].variable assert var.sizes == {"y": 10980, "x": 10980} assert var.dtype == " None: + """Test concatenating two virtual TIFF datasets with rectilinear chunk grids. + + Creates stripped TIFFs where image_height is not evenly divisible by rows_per_strip, + producing rectilinear chunks, then verifies they can be concatenated. + """ + import tifffile + + # Create two stripped TIFFs where image_height (100) is not evenly divisible + # by rows_per_strip (30), creating rectilinear chunks: [[30, 30, 30, 10], [50]] + shape = (100, 50) + rows_per_strip = 30 + + filepath1 = tmp_path / "test1.tif" + filepath2 = tmp_path / "test2.tif" + + tifffile.imwrite( + str(filepath1), np.ones(shape, dtype=np.uint8), rowsperstrip=rows_per_strip + ) + tifffile.imwrite( + str(filepath2), np.ones(shape, dtype=np.uint8) * 2, rowsperstrip=rows_per_strip + ) + + parser = virtual_tiff.VirtualTIFF(ifd=0) + registry = ObjectStoreRegistry({"file://": LocalStore()}) + + with ( + open_virtual_dataset( + url=f"file://{filepath1}", parser=parser, registry=registry + ) as vds1, + open_virtual_dataset( + url=f"file://{filepath2}", parser=parser, registry=registry + ) as vds2, + ): + # Verify both datasets have the expected shape + assert vds1["0"].sizes == {"y": 100, "x": 50} + assert vds2["0"].sizes == {"y": 100, "x": 50} + + # Verify both datasets have rectilinear chunk grids + assert isinstance( + vds1["0"].variable.data.metadata.chunk_grid, RectilinearChunkGrid + ) + assert isinstance( + vds2["0"].variable.data.metadata.chunk_grid, RectilinearChunkGrid + ) + + # Concatenate along a new dimension + combined = xr.concat([vds1, vds2], dim="time") + + assert isinstance(combined, Dataset) + assert combined["0"].sizes == {"time": 2, "y": 100, "x": 50} diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index 87af9094..10e9e736 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -80,22 +80,6 @@ def soft_import(name: str, reason: str, strict: Optional[bool] = True): return None -def ceildiv(a: int, b: int) -> int: - """ - Ceiling division operator for integers. - - See https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python - """ - return -(a // -b) - - -def determine_chunk_grid_shape( - shape: tuple[int, ...], chunks: tuple[int, ...] -) -> tuple[int, ...]: - """Calculate the shape of the chunk grid based on array shape and chunk size.""" - return tuple(ceildiv(length, chunksize) for length, chunksize in zip(shape, chunks)) - - def convert_v3_to_v2_metadata( v3_metadata: ArrayV3Metadata, fill_value: Any = None ) -> ArrayV2Metadata: