From 31f829564cd0310e1de3defa4435bdf07e8583e3 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Feb 2026 10:47:56 -0500 Subject: [PATCH 01/15] Remove check for RegularChunkGrid --- virtualizarr/manifests/array.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 967ea986..c70f3ee1 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -3,7 +3,7 @@ import numpy as np import xarray as xr -from zarr.core.metadata.v3 import ArrayV3Metadata, RegularChunkGrid +from zarr.core.metadata.v3 import ArrayV3Metadata import virtualizarr.manifests.utils as utils from virtualizarr.manifests.array_api import ( @@ -49,11 +49,6 @@ def __init__( # try unpacking the dict _metadata = ArrayV3Metadata(**metadata) - if not isinstance(_metadata.chunk_grid, RegularChunkGrid): - raise NotImplementedError( - f"Only RegularChunkGrid is currently supported for chunk size, but got type {type(_metadata.chunk_grid)}" - ) - if isinstance(chunkmanifest, ChunkManifest): _chunkmanifest = chunkmanifest elif isinstance(chunkmanifest, dict): From 58ce60291a30fea8010cdb995145042dd5a54aaf Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 11 Feb 2026 11:26:03 -0500 Subject: [PATCH 02/15] Temporarily unpin xarray --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 66b6ce67..0b7fbded 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ requires-python = ">=3.11" dynamic = ["version"] dependencies = [ - "xarray>=2025.03.0", + "xarray", "numpy>=2.0.0", "universal-pathlib", "numcodecs>=0.15.1", From d2dee3b5c24252afc15a86d1a568d012c17cf881 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 16 Feb 2026 21:41:17 -0500 Subject: [PATCH 03/15] Update 
envs --- pyproject.toml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 0b7fbded..47e56cef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ fits = [ "astropy", ] tiff = [ - "virtual-tiff", + "virtual-tiff @ git+https://github.com/virtual-zarr/virtual-tiff@rectilinear-chunks", ] kerchunk_parquet = [ "virtualizarr[remote]", @@ -128,6 +128,7 @@ dev = [ "s3fs", "lithops", "dask", + "tifffile>=2026.2.16,<2027" ] [project.urls] @@ -176,6 +177,10 @@ h5netcdf = ">=1.5.0,<2" [tool.pixi.feature.icechunk-dev.dependencies] rust = "*" +[tool.pixi.feature.rectilinear.pypi-dependencies] +zarr = { git = "https://github.com/jhamman/zarr-python.git", branch = "feature/rectilinear-chunk-grid" } +xarray = { git = "https://github.com/maxrjones/xarray.git", branch = "variable-chunking" } + [tool.pixi.feature.minimum-versions.dependencies] xarray = "==2025.6.0" numpy = "==2.0.0" @@ -206,6 +211,7 @@ test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "ke minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "tiff", "py312", "minio"] minimum-versions = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "tiff", "hdf5-lib", "minimum-versions"] upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev", "py313"] +rectilinear = ["dev", "test", "tiff", "hdf5-lib", "rectilinear", "py313"] all = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "all_parsers", "all_writers", "py313"] docs = ["docs", "dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"] From b2c97ea28207b4f1cf66dcb455f2433dc27a1479 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 16 Feb 2026 21:41:39 -0500 Subject: [PATCH 04/15] Add a test using 
VirtualTIFF --- virtualizarr/tests/test_parsers/test_tiff.py | 63 +++++++++++++++++++- 1 file changed, 61 insertions(+), 2 deletions(-) diff --git a/virtualizarr/tests/test_parsers/test_tiff.py b/virtualizarr/tests/test_parsers/test_tiff.py index cf0e110c..381bd196 100644 --- a/virtualizarr/tests/test_parsers/test_tiff.py +++ b/virtualizarr/tests/test_parsers/test_tiff.py @@ -1,10 +1,12 @@ +import numpy as np import pytest +import xarray as xr from obspec_utils.registry import ObjectStoreRegistry -from obstore.store import S3Store +from obstore.store import LocalStore, S3Store from xarray import Dataset, DataTree from virtualizarr import open_virtual_dataset, open_virtual_datatree -from virtualizarr.tests import requires_network, requires_tiff +from virtualizarr.tests import requires_network, requires_tiff, requires_tifffile virtual_tiff = pytest.importorskip("virtual_tiff") @@ -40,3 +42,60 @@ def test_virtual_tiff_dataset() -> None: var = vds["0"].variable assert var.sizes == {"y": 10980, "x": 10980} assert var.dtype == "<u2" + + +@requires_tiff +@requires_tifffile +def test_concat_rectilinear_tiff_datasets(tmp_path) -> None: + """Test concatenating two virtual TIFF datasets with rectilinear chunk grids. + + Creates stripped TIFFs where image_height is not evenly divisible by rows_per_strip, + producing rectilinear chunks, then verifies they can be concatenated. 
+ """ + import tifffile + + # Create two stripped TIFFs where image_height (100) is not evenly divisible + # by rows_per_strip (30), creating rectilinear chunks: [[30, 30, 30, 10], [50]] + shape = (100, 50) + rows_per_strip = 30 + + filepath1 = tmp_path / "test1.tif" + filepath2 = tmp_path / "test2.tif" + + tifffile.imwrite( + str(filepath1), np.ones(shape, dtype=np.uint8), rowsperstrip=rows_per_strip + ) + tifffile.imwrite( + str(filepath2), np.ones(shape, dtype=np.uint8) * 2, rowsperstrip=rows_per_strip + ) + + parser = virtual_tiff.VirtualTIFF(ifd=0) + registry = ObjectStoreRegistry({"file://": LocalStore()}) + + with ( + open_virtual_dataset( + url=f"file://{filepath1}", parser=parser, registry=registry + ) as vds1, + open_virtual_dataset( + url=f"file://{filepath2}", parser=parser, registry=registry + ) as vds2, + ): + # Verify both datasets have the expected shape + assert vds1["0"].sizes == {"y": 100, "x": 50} + assert vds2["0"].sizes == {"y": 100, "x": 50} + + # Verify both datasets have rectilinear chunk grids + from zarr.core.chunk_grids import RectilinearChunkGrid + + assert isinstance( + vds1["0"].variable.data.metadata.chunk_grid, RectilinearChunkGrid + ) + assert isinstance( + vds2["0"].variable.data.metadata.chunk_grid, RectilinearChunkGrid + ) + + # Concatenate along a new dimension + combined = xr.concat([vds1, vds2], dim="time") + + assert isinstance(combined, Dataset) + assert combined["0"].sizes == {"time": 2, "y": 100, "x": 50} From 17b1903190a295fe0fdfda5fef362f3436a83430 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 16 Feb 2026 21:55:55 -0500 Subject: [PATCH 05/15] Update array api --- virtualizarr/manifests/array.py | 16 ++++++++- virtualizarr/manifests/array_api.py | 7 +++- virtualizarr/manifests/utils.py | 30 +++++++++++++---- virtualizarr/vendor/zarr/core/chunk_grids.py | 35 ++++++++++++++++++++ 4 files changed, 79 insertions(+), 9 deletions(-) create mode 100644 
virtualizarr/vendor/zarr/core/chunk_grids.py diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index c70f3ee1..0f5259dc 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -5,6 +5,13 @@ import xarray as xr from zarr.core.metadata.v3 import ArrayV3Metadata +try: + from zarr.core.chunk_grids import RegularChunkGrid + + has_rectilinear_chunk_grid_support = True +except ImportError: + has_rectilinear_chunk_grid_support = False + import virtualizarr.manifests.utils as utils from virtualizarr.manifests.array_api import ( MANIFESTARRAY_HANDLED_ARRAY_FUNCTIONS, @@ -73,10 +80,17 @@ def metadata(self) -> ArrayV3Metadata: return self._metadata @property - def chunks(self) -> tuple[int, ...]: + def chunks(self) -> tuple[int, ...] | tuple[tuple[int, ...], ...]: """ Individual chunk size by number of elements. + + For RegularChunkGrid, returns a tuple of ints (e.g., (30, 50)). + For RectilinearChunkGrid, returns a tuple of tuples (e.g., ((30, 30, 30, 10), (50,))). 
""" + if has_rectilinear_chunk_grid_support and not isinstance( + self._metadata.chunk_grid, RegularChunkGrid + ): + return self._metadata.chunk_grid.chunk_shapes return self._metadata.chunks @property diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 238876d4..bb266803 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -3,6 +3,7 @@ import numpy as np from virtualizarr.utils import determine_chunk_grid_shape +from virtualizarr.vendor.zarr.core.chunk_grids import _is_nested_sequence from .manifest import ChunkManifest from .utils import ( @@ -188,7 +189,11 @@ def stack( # chunk shape has changed because a length-1 axis has been inserted old_chunks = first_arr.chunks new_chunks = list(old_chunks) - new_chunks.insert(axis, 1) + # For rectilinear grids, each element is a sequence; insert a single-element tuple + if _is_nested_sequence(old_chunks): + new_chunks.insert(axis, (1,)) + else: + new_chunks.insert(axis, 1) new_metadata = copy_and_replace_metadata( old_metadata=first_arr.metadata, new_shape=new_shape, new_chunks=new_chunks diff --git a/virtualizarr/manifests/utils.py b/virtualizarr/manifests/utils.py index 1ea9cddc..92f161d8 100644 --- a/virtualizarr/manifests/utils.py +++ b/virtualizarr/manifests/utils.py @@ -12,6 +12,7 @@ from zarr.dtype import parse_data_type from virtualizarr.codecs import convert_to_codec_pipeline, get_codecs +from virtualizarr.vendor.zarr.core.chunk_grids import _is_nested_sequence if TYPE_CHECKING: from .array import ManifestArray @@ -214,11 +215,17 @@ def _remove_element_at_position(t: tuple[int, ...], pos: int) -> tuple[int, ...] 
def check_no_partial_chunks_on_concat_axis( - shapes: list[tuple[int, ...]], chunks: list[tuple[int, ...]], axis: int + shapes: list[tuple[int, ...]], chunks: list, axis: int ): - """Check that there are no partial chunks along the concatenation axis""" - # loop over the arrays to be concatenated + """Check that there are no partial chunks along the concatenation axis. + + Only applies to regular chunk grids; rectilinear grids explicitly encode + variable chunk sizes so partial-chunk checks are not needed. + """ for i, (shape, chunk_shape) in enumerate(zip(shapes, chunks)): + # Rectilinear grids have sequences along each axis; skip the check + if _is_nested_sequence(chunk_shape): + continue if shape[axis] % chunk_shape[axis] > 0: raise ValueError( "Cannot concatenate arrays with partial chunks because only regular chunk grids are currently supported. " @@ -285,10 +292,19 @@ def copy_and_replace_metadata( if new_shape is not None: metadata_copy["shape"] = parse_shapelike(new_shape) # type: ignore[assignment] if new_chunks is not None: - metadata_copy["chunk_grid"] = { - "name": "regular", - "configuration": {"chunk_shape": tuple(new_chunks)}, - } + if _is_nested_sequence(new_chunks): + metadata_copy["chunk_grid"] = { + "name": "rectilinear", + "configuration": { + "chunk_shapes": [list(c) for c in new_chunks], + "kind": "inline", + }, + } + else: + metadata_copy["chunk_grid"] = { + "name": "regular", + "configuration": {"chunk_shape": tuple(new_chunks)}, + } if new_dimension_names != "default": # need the option to use the literal string "default" as a sentinel value because None is a valid choice for zarr dimension_names metadata_copy["dimension_names"] = parse_dimension_names(new_dimension_names) diff --git a/virtualizarr/vendor/zarr/core/chunk_grids.py b/virtualizarr/vendor/zarr/core/chunk_grids.py new file mode 100644 index 00000000..309ceb3a --- /dev/null +++ b/virtualizarr/vendor/zarr/core/chunk_grids.py @@ -0,0 +1,35 @@ +""" +Vendored utilities from 
zarr-python for chunk grid handling. + +See https://github.com/zarr-developers/zarr-python/pull/3534 +""" + +from typing import Any + +from zarr.core.chunk_grids import ChunkGrid + + +def _is_nested_sequence(chunks: Any) -> bool: + """ + Check if chunks is a nested sequence (tuple of tuples/lists). + + Returns True for inputs like [[10, 20], [5, 5]] or [(10, 20), (5, 5)]. + Returns False for flat sequences like (10, 10) or [10, 10]. + + Vendored from https://github.com/zarr-developers/zarr-python/pull/3534 + """ + if isinstance(chunks, str | int | ChunkGrid): + return False + + if not hasattr(chunks, "__iter__"): + return False + + try: + first_elem = next(iter(chunks), None) + if first_elem is None: + return False + return hasattr(first_elem, "__iter__") and not isinstance( + first_elem, str | bytes | int + ) + except (TypeError, StopIteration): + return False From 8de7e4d1912a127c325e48ce77077524647c6a42 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 16 Feb 2026 22:14:11 -0500 Subject: [PATCH 06/15] Update array api --- pyproject.toml | 3 ++- virtualizarr/manifests/array.py | 13 ++++++------- virtualizarr/manifests/array_api.py | 3 ++- virtualizarr/manifests/utils.py | 5 +++-- virtualizarr/tests/test_parsers/test_tiff.py | 17 +++++++++++------ 5 files changed, 24 insertions(+), 17 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 47e56cef..265a5e57 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,7 @@ fits = [ "astropy", ] tiff = [ - "virtual-tiff @ git+https://github.com/virtual-zarr/virtual-tiff@rectilinear-chunks", + "virtual-tiff", ] kerchunk_parquet = [ "virtualizarr[remote]", @@ -180,6 +180,7 @@ rust = "*" [tool.pixi.feature.rectilinear.pypi-dependencies] zarr = { git = "https://github.com/jhamman/zarr-python.git", branch = "feature/rectilinear-chunk-grid" } xarray = { git = "https://github.com/maxrjones/xarray.git", branch = "variable-chunking" } +virtual-tiff = { git = 
"https://github.com/virtual-zarr/virtual-tiff.git", branch = "rectilinear-chunks" } [tool.pixi.feature.minimum-versions.dependencies] xarray = "==2025.6.0" diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 0f5259dc..a902f888 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -3,14 +3,13 @@ import numpy as np import xarray as xr +from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.metadata.v3 import ArrayV3Metadata -try: - from zarr.core.chunk_grids import RegularChunkGrid - - has_rectilinear_chunk_grid_support = True -except ImportError: - has_rectilinear_chunk_grid_support = False +has_rectilinear_chunk_grid_support = hasattr( + __import__("zarr.core.chunk_grids", fromlist=["RectilinearChunkGrid"]), + "RectilinearChunkGrid", +) import virtualizarr.manifests.utils as utils from virtualizarr.manifests.array_api import ( @@ -90,7 +89,7 @@ def chunks(self) -> tuple[int, ...] | tuple[tuple[int, ...], ...]: if has_rectilinear_chunk_grid_support and not isinstance( self._metadata.chunk_grid, RegularChunkGrid ): - return self._metadata.chunk_grid.chunk_shapes + return self._metadata.chunk_grid.chunk_shapes # type: ignore[attr-defined] return self._metadata.chunks @property diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index bb266803..b19eac76 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -229,7 +229,8 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra # new chunk_shape is old chunk_shape with singleton dimensions prepended # (chunk shape can never change by more than adding length-1 axes because each chunk represents a fixed number of array elements) - old_chunk_shape = x.chunks + # broadcast_to only applies to regular chunk grids + old_chunk_shape: tuple[int, ...] 
= x.chunks # type: ignore[assignment] new_chunk_shape = _prepend_singleton_dimensions( old_chunk_shape, ndim=len(new_shape) ) diff --git a/virtualizarr/manifests/utils.py b/virtualizarr/manifests/utils.py index 92f161d8..2881f1c4 100644 --- a/virtualizarr/manifests/utils.py +++ b/virtualizarr/manifests/utils.py @@ -1,4 +1,5 @@ import re +from collections.abc import Sequence from typing import TYPE_CHECKING, Any, Dict, Iterable, Literal, Optional, Union import numpy as np @@ -178,7 +179,7 @@ def check_same_codecs(codecs: list[Any]) -> None: ) -def check_same_chunk_shapes(chunks_list: list[tuple[int, ...]]) -> None: +def check_same_chunk_shapes(chunks_list: list[Sequence]) -> None: """Check all the chunk shapes are the same""" first_chunks, *other_chunks_list = chunks_list @@ -278,7 +279,7 @@ def check_compatible_arrays( def copy_and_replace_metadata( old_metadata: ArrayV3Metadata, new_shape: list[int] | None = None, - new_chunks: list[int] | None = None, + new_chunks: list | None = None, new_dimension_names: Iterable[str] | None | Literal["default"] = "default", new_attributes: dict | None = None, ) -> ArrayV3Metadata: diff --git a/virtualizarr/tests/test_parsers/test_tiff.py b/virtualizarr/tests/test_parsers/test_tiff.py index 381bd196..e0fb3a77 100644 --- a/virtualizarr/tests/test_parsers/test_tiff.py +++ b/virtualizarr/tests/test_parsers/test_tiff.py @@ -6,6 +6,7 @@ from xarray import Dataset, DataTree from virtualizarr import open_virtual_dataset, open_virtual_datatree +from virtualizarr.manifests.array import has_rectilinear_chunk_grid_support from virtualizarr.tests import requires_network, requires_tiff, requires_tifffile virtual_tiff = pytest.importorskip("virtual_tiff") @@ -46,6 +47,10 @@ def test_virtual_tiff_dataset() -> None: @requires_tiff @requires_tifffile +@pytest.mark.skipif( + not has_rectilinear_chunk_grid_support, + reason="requires zarr with RectilinearChunkGrid support", +) def test_concat_rectilinear_tiff_datasets(tmp_path) -> None: """Test 
concatenating two virtual TIFF datasets with rectilinear chunk grids. @@ -84,14 +89,14 @@ def test_concat_rectilinear_tiff_datasets(tmp_path) -> None: assert vds1["0"].sizes == {"y": 100, "x": 50} assert vds2["0"].sizes == {"y": 100, "x": 50} - # Verify both datasets have rectilinear chunk grids - from zarr.core.chunk_grids import RectilinearChunkGrid + # Verify both datasets have rectilinear (non-regular) chunk grids + from zarr.core.chunk_grids import RegularChunkGrid - assert isinstance( - vds1["0"].variable.data.metadata.chunk_grid, RectilinearChunkGrid + assert not isinstance( + vds1["0"].variable.data.metadata.chunk_grid, RegularChunkGrid ) - assert isinstance( - vds2["0"].variable.data.metadata.chunk_grid, RectilinearChunkGrid + assert not isinstance( + vds2["0"].variable.data.metadata.chunk_grid, RegularChunkGrid ) # Concatenate along a new dimension From bb7dc7e1fa0b5ad0e4fb867654071ce3c269d698 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 16 Feb 2026 22:18:00 -0500 Subject: [PATCH 07/15] Repin xarray --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 265a5e57..dd3e6d84 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ requires-python = ">=3.11" dynamic = ["version"] dependencies = [ - "xarray", + "xarray>=2025.03.0", "numpy>=2.0.0", "universal-pathlib", "numcodecs>=0.15.1", From 3fe140ba396d5b41617161965e4789e0f1550ed3 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 9 Mar 2026 19:43:06 -0400 Subject: [PATCH 08/15] Use unified chunk grid POC --- pyproject.toml | 6 +++--- virtualizarr/manifests/array.py | 17 ++++++++--------- virtualizarr/manifests/utils.py | 1 - virtualizarr/tests/test_parsers/test_tiff.py | 10 ++-------- 4 files changed, 13 insertions(+), 21 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dd3e6d84..9270b5a5 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -178,9 +178,9 @@ h5netcdf = ">=1.5.0,<2" rust = "*" [tool.pixi.feature.rectilinear.pypi-dependencies] -zarr = { git = "https://github.com/jhamman/zarr-python.git", branch = "feature/rectilinear-chunk-grid" } -xarray = { git = "https://github.com/maxrjones/xarray.git", branch = "variable-chunking" } -virtual-tiff = { git = "https://github.com/virtual-zarr/virtual-tiff.git", branch = "rectilinear-chunks" } +zarr = { git = "https://github.com/maxrjones/zarr-python.git", branch = "poc/unified-chunk-grid" } +xarray = { git = "https://github.com/maxrjones/xarray.git", branch = "poc/unified-zarr-chunk-grid" } +virtual-tiff = { git = "https://github.com/virtual-zarr/virtual-tiff.git", branch = "poc/unified-chunk-grid" } [tool.pixi.feature.minimum-versions.dependencies] xarray = "==2025.6.0" diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index a902f888..27cb5e71 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -3,13 +3,10 @@ import numpy as np import xarray as xr -from zarr.core.chunk_grids import RegularChunkGrid +from zarr.core.chunk_grids import ChunkGrid from zarr.core.metadata.v3 import ArrayV3Metadata -has_rectilinear_chunk_grid_support = hasattr( - __import__("zarr.core.chunk_grids", fromlist=["RectilinearChunkGrid"]), - "RectilinearChunkGrid", -) +has_rectilinear_chunk_grid_support = hasattr(ChunkGrid, "is_regular") import virtualizarr.manifests.utils as utils from virtualizarr.manifests.array_api import ( @@ -86,10 +83,12 @@ def chunks(self) -> tuple[int, ...] | tuple[tuple[int, ...], ...]: For RegularChunkGrid, returns a tuple of ints (e.g., (30, 50)). For RectilinearChunkGrid, returns a tuple of tuples (e.g., ((30, 30, 30, 10), (50,))). 
""" - if has_rectilinear_chunk_grid_support and not isinstance( - self._metadata.chunk_grid, RegularChunkGrid - ): - return self._metadata.chunk_grid.chunk_shapes # type: ignore[attr-defined] + chunk_grid = self._metadata.chunk_grid + if has_rectilinear_chunk_grid_support and not chunk_grid.is_regular: + return tuple( + d.edges + for d in chunk_grid.dimensions # type: ignore[union-attr] + ) return self._metadata.chunks @property diff --git a/virtualizarr/manifests/utils.py b/virtualizarr/manifests/utils.py index 2881f1c4..96bc1d4a 100644 --- a/virtualizarr/manifests/utils.py +++ b/virtualizarr/manifests/utils.py @@ -298,7 +298,6 @@ def copy_and_replace_metadata( "name": "rectilinear", "configuration": { "chunk_shapes": [list(c) for c in new_chunks], - "kind": "inline", }, } else: diff --git a/virtualizarr/tests/test_parsers/test_tiff.py b/virtualizarr/tests/test_parsers/test_tiff.py index e0fb3a77..4838acf1 100644 --- a/virtualizarr/tests/test_parsers/test_tiff.py +++ b/virtualizarr/tests/test_parsers/test_tiff.py @@ -90,14 +90,8 @@ def test_concat_rectilinear_tiff_datasets(tmp_path) -> None: assert vds2["0"].sizes == {"y": 100, "x": 50} # Verify both datasets have rectilinear (non-regular) chunk grids - from zarr.core.chunk_grids import RegularChunkGrid - - assert not isinstance( - vds1["0"].variable.data.metadata.chunk_grid, RegularChunkGrid - ) - assert not isinstance( - vds2["0"].variable.data.metadata.chunk_grid, RegularChunkGrid - ) + assert not vds1["0"].variable.data.metadata.chunk_grid.is_regular + assert not vds2["0"].variable.data.metadata.chunk_grid.is_regular # Concatenate along a new dimension combined = xr.concat([vds1, vds2], dim="time") From 10710d920ef696479ba6da1d1d68758a7b1ec959 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 17:44:34 -0400 Subject: [PATCH 09/15] Fastforward --- virtualizarr/manifests/array.py | 9 ++--- virtualizarr/manifests/array_api.py | 4 +-- 
virtualizarr/manifests/utils.py | 7 ++-- virtualizarr/vendor/zarr/core/chunk_grids.py | 35 -------------------- 4 files changed, 9 insertions(+), 46 deletions(-) delete mode 100644 virtualizarr/vendor/zarr/core/chunk_grids.py diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 67cca2d8..f9ccb613 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -80,15 +80,12 @@ def chunks(self) -> tuple[int, ...] | tuple[tuple[int, ...], ...]: """ Individual chunk size by number of elements. - For RegularChunkGrid, returns a tuple of ints (e.g., (30, 50)). - For RectilinearChunkGrid, returns a tuple of tuples (e.g., ((30, 30, 30, 10), (50,))). + For regular grids, returns a tuple of ints (e.g., (30, 50)). + For rectilinear grids, returns a tuple of tuples (e.g., ((30, 30, 30, 10), (50,))). """ chunk_grid = self._metadata.chunk_grid if has_rectilinear_chunk_grid_support and not chunk_grid.is_regular: - return tuple( - d.edges - for d in chunk_grid.dimensions # type: ignore[union-attr] - ) + return chunk_grid.chunk_sizes return self._metadata.chunks @property diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 2bc1a4f2..08d30b3a 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -1,9 +1,9 @@ from typing import TYPE_CHECKING, Any, Callable, Union, cast import numpy as np +from zarr.core.chunk_grids import _is_rectilinear_chunks from virtualizarr.utils import determine_chunk_grid_shape -from virtualizarr.vendor.zarr.core.chunk_grids import _is_nested_sequence from .manifest import ChunkManifest from .utils import ( @@ -192,7 +192,7 @@ def stack( old_chunks = first_arr.chunks new_chunks = list(old_chunks) # For rectilinear grids, each element is a sequence; insert a single-element tuple - if _is_nested_sequence(old_chunks): + if _is_rectilinear_chunks(old_chunks): new_chunks.insert(axis, (1,)) else: new_chunks.insert(axis, 1) diff --git 
a/virtualizarr/manifests/utils.py b/virtualizarr/manifests/utils.py index 1eb5cc3d..a366672f 100644 --- a/virtualizarr/manifests/utils.py +++ b/virtualizarr/manifests/utils.py @@ -6,6 +6,7 @@ import numpy as np from zarr import Array +from zarr.core.chunk_grids import _is_rectilinear_chunks from zarr.core.chunk_key_encodings import ChunkKeyEncodingLike from zarr.core.metadata.v3 import ( ArrayV3Metadata, @@ -15,7 +16,6 @@ from zarr.dtype import parse_data_type from virtualizarr.codecs import convert_to_codec_pipeline, get_codecs -from virtualizarr.vendor.zarr.core.chunk_grids import _is_nested_sequence if TYPE_CHECKING: from .array import ManifestArray @@ -278,7 +278,7 @@ def check_no_partial_chunks_on_concat_axis( """ for i, (shape, chunk_shape) in enumerate(zip(shapes, chunks)): # Rectilinear grids have sequences along each axis; skip the check - if _is_nested_sequence(chunk_shape): + if _is_rectilinear_chunks(chunk_shape): continue if shape[axis] % chunk_shape[axis] > 0: raise ValueError( @@ -346,10 +346,11 @@ def copy_and_replace_metadata( if new_shape is not None: metadata_copy["shape"] = parse_shapelike(new_shape) # type: ignore[assignment] if new_chunks is not None: - if _is_nested_sequence(new_chunks): + if _is_rectilinear_chunks(new_chunks): metadata_copy["chunk_grid"] = { "name": "rectilinear", "configuration": { + "kind": "inline", "chunk_shapes": [list(c) for c in new_chunks], }, } diff --git a/virtualizarr/vendor/zarr/core/chunk_grids.py b/virtualizarr/vendor/zarr/core/chunk_grids.py deleted file mode 100644 index 309ceb3a..00000000 --- a/virtualizarr/vendor/zarr/core/chunk_grids.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -Vendored utilities from zarr-python for chunk grid handling. - -See https://github.com/zarr-developers/zarr-python/pull/3534 -""" - -from typing import Any - -from zarr.core.chunk_grids import ChunkGrid - - -def _is_nested_sequence(chunks: Any) -> bool: - """ - Check if chunks is a nested sequence (tuple of tuples/lists). 
- - Returns True for inputs like [[10, 20], [5, 5]] or [(10, 20), (5, 5)]. - Returns False for flat sequences like (10, 10) or [10, 10]. - - Vendored from https://github.com/zarr-developers/zarr-python/pull/3534 - """ - if isinstance(chunks, str | int | ChunkGrid): - return False - - if not hasattr(chunks, "__iter__"): - return False - - try: - first_elem = next(iter(chunks), None) - if first_elem is None: - return False - return hasattr(first_elem, "__iter__") and not isinstance( - first_elem, str | bytes | int - ) - except (TypeError, StopIteration): - return False From f0be6917f7a6a4ee1e4b43c8f1c9562fc3c3d0e2 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Sat, 14 Mar 2026 19:23:19 -0400 Subject: [PATCH 10/15] Enable --- virtualizarr/tests/conftest.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/virtualizarr/tests/conftest.py b/virtualizarr/tests/conftest.py index a73de6e3..a2c95287 100644 --- a/virtualizarr/tests/conftest.py +++ b/virtualizarr/tests/conftest.py @@ -2,6 +2,14 @@ import time import pytest +import zarr + + +@pytest.fixture(autouse=True) +def _enable_rectilinear_chunks(): + """Enable rectilinear chunks for all tests.""" + with zarr.config.set({"array.rectilinear_chunks": True}): + yield @pytest.fixture(scope="session") From 228b4e26ee99ae7ac4eb7f01d792e075f82e2f4e Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 19:55:15 -0400 Subject: [PATCH 11/15] Fast-forward --- virtualizarr/manifests/array.py | 20 ++++++++++++++------ virtualizarr/tests/test_parsers/test_tiff.py | 18 ++++++++++++++---- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index f9ccb613..6bcab0e5 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -3,10 +3,14 @@ import numpy as np import xarray as xr -from zarr.core.chunk_grids import ChunkGrid from 
zarr.core.metadata.v3 import ArrayV3Metadata -has_rectilinear_chunk_grid_support = hasattr(ChunkGrid, "is_regular") +try: + from zarr.core.metadata.v3 import RectilinearChunkGrid, RegularChunkGrid + + _has_chunk_grid_dtos = True +except ImportError: + _has_chunk_grid_dtos = False import virtualizarr.manifests.utils as utils from virtualizarr.manifests.array_api import ( @@ -81,11 +85,15 @@ def chunks(self) -> tuple[int, ...] | tuple[tuple[int, ...], ...]: Individual chunk size by number of elements. For regular grids, returns a tuple of ints (e.g., (30, 50)). - For rectilinear grids, returns a tuple of tuples (e.g., ((30, 30, 30, 10), (50,))). + For rectilinear grids, returns a tuple of tuples (e.g., ((10, 20, 30), (50, 50))). """ - chunk_grid = self._metadata.chunk_grid - if has_rectilinear_chunk_grid_support and not chunk_grid.is_regular: - return chunk_grid.chunk_sizes + if _has_chunk_grid_dtos: + chunk_grid = self._metadata.chunk_grid + if isinstance(chunk_grid, RectilinearChunkGrid): + return chunk_grid.chunk_shapes + elif isinstance(chunk_grid, RegularChunkGrid): + return chunk_grid.chunk_shape + raise TypeError(f"Unknown chunk grid type: {type(chunk_grid)}") return self._metadata.chunks @property diff --git a/virtualizarr/tests/test_parsers/test_tiff.py b/virtualizarr/tests/test_parsers/test_tiff.py index 4838acf1..32b3bef2 100644 --- a/virtualizarr/tests/test_parsers/test_tiff.py +++ b/virtualizarr/tests/test_parsers/test_tiff.py @@ -6,7 +6,13 @@ from xarray import Dataset, DataTree from virtualizarr import open_virtual_dataset, open_virtual_datatree -from virtualizarr.manifests.array import has_rectilinear_chunk_grid_support + +try: + from zarr.core.metadata.v3 import RectilinearChunkGrid # noqa: F401 + + has_rectilinear_chunk_grid_support = True +except ImportError: + has_rectilinear_chunk_grid_support = False from virtualizarr.tests import requires_network, requires_tiff, requires_tifffile virtual_tiff = pytest.importorskip("virtual_tiff") @@ -89,9 
+95,13 @@ def test_concat_rectilinear_tiff_datasets(tmp_path) -> None: assert vds1["0"].sizes == {"y": 100, "x": 50} assert vds2["0"].sizes == {"y": 100, "x": 50} - # Verify both datasets have rectilinear (non-regular) chunk grids - assert not vds1["0"].variable.data.metadata.chunk_grid.is_regular - assert not vds2["0"].variable.data.metadata.chunk_grid.is_regular + # Verify both datasets have rectilinear chunk grids + assert isinstance( + vds1["0"].variable.data.metadata.chunk_grid, RectilinearChunkGrid + ) + assert isinstance( + vds2["0"].variable.data.metadata.chunk_grid, RectilinearChunkGrid + ) # Concatenate along a new dimension combined = xr.concat([vds1, vds2], dim="time") From c47eefedcb7e17f27bb4bd04938a05a19afa2ab5 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Fri, 20 Mar 2026 20:27:48 -0400 Subject: [PATCH 12/15] Fastforward --- conftest.py | 8 +++---- virtualizarr/manifests/array.py | 25 +++++++++------------ virtualizarr/manifests/array_api.py | 20 +++++++---------- virtualizarr/parsers/kerchunk/translator.py | 7 ++---- virtualizarr/parsers/zarr.py | 3 ++- virtualizarr/utils.py | 16 ------------- 6 files changed, 25 insertions(+), 54 deletions(-) diff --git a/conftest.py b/conftest.py index 185d9b24..63f51984 100644 --- a/conftest.py +++ b/conftest.py @@ -19,7 +19,6 @@ from virtualizarr.manifests import ChunkManifest, ManifestArray from virtualizarr.manifests.manifest import join from virtualizarr.manifests.utils import create_v3_array_metadata -from virtualizarr.utils import ceildiv # Pytest configuration @@ -126,10 +125,9 @@ def _generate_chunk_entries( dict Mapping of chunk keys to entry dictionaries """ - chunk_grid_shape = tuple( - ceildiv(axis_length, chunk_length) - for axis_length, chunk_length in zip(shape, chunks) - ) + from zarr.experimental import ChunkGrid + + chunk_grid_shape = ChunkGrid.from_regular(shape, chunks).shape if chunk_grid_shape == (): return {"0": entry_generator((0,), (0,), 
itemsize)} diff --git a/virtualizarr/manifests/array.py b/virtualizarr/manifests/array.py index 6bcab0e5..9a2d9fc5 100644 --- a/virtualizarr/manifests/array.py +++ b/virtualizarr/manifests/array.py @@ -4,13 +4,7 @@ import numpy as np import xarray as xr from zarr.core.metadata.v3 import ArrayV3Metadata - -try: - from zarr.core.metadata.v3 import RectilinearChunkGrid, RegularChunkGrid - - _has_chunk_grid_dtos = True -except ImportError: - _has_chunk_grid_dtos = False +from zarr.experimental import ChunkGrid import virtualizarr.manifests.utils as utils from virtualizarr.manifests.array_api import ( @@ -79,6 +73,11 @@ def manifest(self) -> ChunkManifest: def metadata(self) -> ArrayV3Metadata: return self._metadata + @property + def chunk_grid(self) -> ChunkGrid: + """Behavioral chunk grid bound to this array's shape.""" + return ChunkGrid.from_metadata(self._metadata) + @property def chunks(self) -> tuple[int, ...] | tuple[tuple[int, ...], ...]: """ @@ -87,14 +86,10 @@ def chunks(self) -> tuple[int, ...] | tuple[tuple[int, ...], ...]: For regular grids, returns a tuple of ints (e.g., (30, 50)). For rectilinear grids, returns a tuple of tuples (e.g., ((10, 20, 30), (50, 50))). 
""" - if _has_chunk_grid_dtos: - chunk_grid = self._metadata.chunk_grid - if isinstance(chunk_grid, RectilinearChunkGrid): - return chunk_grid.chunk_shapes - elif isinstance(chunk_grid, RegularChunkGrid): - return chunk_grid.chunk_shape - raise TypeError(f"Unknown chunk grid type: {type(chunk_grid)}") - return self._metadata.chunks + grid = self.chunk_grid + if grid.is_regular: + return grid.chunk_shape + return grid.chunk_sizes @property def dtype(self) -> np.dtype: diff --git a/virtualizarr/manifests/array_api.py b/virtualizarr/manifests/array_api.py index 08d30b3a..bb18d161 100644 --- a/virtualizarr/manifests/array_api.py +++ b/virtualizarr/manifests/array_api.py @@ -1,9 +1,7 @@ from typing import TYPE_CHECKING, Any, Callable, Union, cast import numpy as np -from zarr.core.chunk_grids import _is_rectilinear_chunks - -from virtualizarr.utils import determine_chunk_grid_shape +from zarr.experimental import ChunkGrid from .manifest import ChunkManifest from .utils import ( @@ -192,7 +190,7 @@ def stack( old_chunks = first_arr.chunks new_chunks = list(old_chunks) # For rectilinear grids, each element is a sequence; insert a single-element tuple - if _is_rectilinear_chunks(old_chunks): + if not first_arr.chunk_grid.is_regular: new_chunks.insert(axis, (1,)) else: new_chunks.insert(axis, 1) @@ -237,8 +235,12 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra old_chunk_shape, ndim=len(new_shape) ) - # find new chunk grid shape by dividing new array shape by new chunk shape - new_chunk_grid_shape = determine_chunk_grid_shape(new_shape, new_chunk_shape) + new_metadata = copy_and_replace_metadata( + old_metadata=x.metadata, + new_shape=list(new_shape), + new_chunks=list(new_chunk_shape), + ) + new_chunk_grid_shape = ChunkGrid.from_metadata(new_metadata).shape # do broadcasting of entries in manifest broadcasted_paths = cast( # `np.broadcast_to` apparently is type hinted as if the output could have Any dtype @@ -264,12 +266,6 @@ def 
broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra validate_paths=False, ) - new_metadata = copy_and_replace_metadata( - old_metadata=x.metadata, - new_shape=list(new_shape), - new_chunks=list(new_chunk_shape), - ) - return ManifestArray(chunkmanifest=broadcasted_manifest, metadata=new_metadata) diff --git a/virtualizarr/parsers/kerchunk/translator.py b/virtualizarr/parsers/kerchunk/translator.py index bb813c5e..f8fbe265 100644 --- a/virtualizarr/parsers/kerchunk/translator.py +++ b/virtualizarr/parsers/kerchunk/translator.py @@ -8,6 +8,7 @@ import ujson from zarr.core.common import JSON from zarr.core.metadata import ArrayV3Metadata +from zarr.experimental import ChunkGrid from virtualizarr.codecs import ( zarr_codec_config_to_v3, @@ -23,7 +24,6 @@ KerchunkArrRefs, KerchunkStoreRefs, ) -from virtualizarr.utils import determine_chunk_grid_shape def from_kerchunk_refs(decoded_arr_refs_zarray, zattrs) -> "ArrayV3Metadata": @@ -184,10 +184,7 @@ def manifestarray_from_kerchunk_refs( # empty variables don't have physical chunks, but zarray shows that the variable # is at least 1D - shape = determine_chunk_grid_shape( - metadata.shape, - metadata.chunks, - ) + shape = ChunkGrid.from_metadata(metadata).shape manifest = ChunkManifest(entries={}, shape=shape) marr = ManifestArray(metadata=metadata, chunkmanifest=manifest) else: diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index 41785559..c036e44a 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -14,6 +14,7 @@ from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.group import GroupMetadata from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata +from zarr.experimental import ChunkGrid from zarr.storage import ObjectStore from virtualizarr.manifests import ( @@ -383,7 +384,7 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan strategy = get_strategy(zarr_array) 
strategy.validate(zarr_array) - chunk_grid_shape = zarr_array._chunk_grid_shape + chunk_grid_shape = ChunkGrid.from_metadata(zarr_array.metadata).shape if zarr_array.shape == (): chunk_map = await strategy.get_chunk_mapping(zarr_array, path) diff --git a/virtualizarr/utils.py b/virtualizarr/utils.py index 87af9094..10e9e736 100644 --- a/virtualizarr/utils.py +++ b/virtualizarr/utils.py @@ -80,22 +80,6 @@ def soft_import(name: str, reason: str, strict: Optional[bool] = True): return None -def ceildiv(a: int, b: int) -> int: - """ - Ceiling division operator for integers. - - See https://stackoverflow.com/questions/14822184/is-there-a-ceiling-equivalent-of-operator-in-python - """ - return -(a // -b) - - -def determine_chunk_grid_shape( - shape: tuple[int, ...], chunks: tuple[int, ...] -) -> tuple[int, ...]: - """Calculate the shape of the chunk grid based on array shape and chunk size.""" - return tuple(ceildiv(length, chunksize) for length, chunksize in zip(shape, chunks)) - - def convert_v3_to_v2_metadata( v3_metadata: ArrayV3Metadata, fill_value: Any = None ) -> ArrayV2Metadata: From b8594c0a5864f633020514149124f1ccbaeb7e12 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Thu, 2 Apr 2026 12:27:59 -0400 Subject: [PATCH 13/15] Fastforward --- virtualizarr/parsers/zarr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index d8e83903..8a8eb798 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -14,10 +14,10 @@ import zarr from obspec_utils.registry import ObjectStoreRegistry from zarr.api.asynchronous import open_group as open_group_async -from zarr.core.chunk_grids import RegularChunkGrid from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding from zarr.core.group import GroupMetadata from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata +from zarr.core.metadata.v3 import RegularChunkGrid from 
zarr.experimental import ChunkGrid from zarr.storage import ObjectStore From 78ca8565e2fff451caa0a9bce8ebd9b67ba48d60 Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Wed, 8 Apr 2026 16:53:42 -0400 Subject: [PATCH 14/15] update source --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index de8a6bf8..7f1fb8bc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -189,7 +189,7 @@ h5netcdf = ">=1.5.0,<2" rust = "*" [tool.pixi.feature.rectilinear.pypi-dependencies] -zarr = { git = "https://github.com/maxrjones/zarr-python.git", branch = "poc/unified-chunk-grid" } +zarr = { git = "https://github.com/zarr-developers/zarr-python.git", branch = "main" } xarray = { git = "https://github.com/maxrjones/xarray.git", branch = "poc/unified-zarr-chunk-grid" } virtual-tiff = { git = "https://github.com/virtual-zarr/virtual-tiff.git", branch = "poc/unified-chunk-grid" } From 0369656621c2da846f5b5f1e090cbb6aed64901e Mon Sep 17 00:00:00 2001 From: Max Jones <14077947+maxrjones@users.noreply.github.com> Date: Mon, 20 Apr 2026 12:18:53 -0400 Subject: [PATCH 15/15] Fix bad merge --- conftest.py | 2 +- virtualizarr/parsers/kerchunk/translator.py | 5 +- virtualizarr/parsers/zarr.py | 506 +------------------- 3 files changed, 23 insertions(+), 490 deletions(-) diff --git a/conftest.py b/conftest.py index 3f38813b..ee242e12 100644 --- a/conftest.py +++ b/conftest.py @@ -134,7 +134,7 @@ def _generate_chunk_entries( """ from zarr.experimental import ChunkGrid - chunk_grid_shape = ChunkGrid.from_regular(shape, chunks).grid_shape + chunk_grid_shape = ChunkGrid.from_sizes(shape, chunks).grid_shape if chunk_grid_shape == (): return {"0": entry_generator((0,), (0,), itemsize)} diff --git a/virtualizarr/parsers/kerchunk/translator.py b/virtualizarr/parsers/kerchunk/translator.py index 8313e061..79936ad4 100644 --- a/virtualizarr/parsers/kerchunk/translator.py +++ 
b/virtualizarr/parsers/kerchunk/translator.py @@ -178,10 +178,7 @@ def manifestarray_from_kerchunk_refs( chunk_dict, metadata, zattrs = parse_array_refs(arr_refs) # we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs if chunk_dict: - chunk_grid_shape = determine_chunk_grid_shape( - metadata.shape, - metadata.chunks, - ) + chunk_grid_shape = ChunkGrid.from_metadata(metadata).grid_shape manifest = manifest_from_kerchunk_chunk_dict( chunk_dict, fs_root=fs_root, shape=chunk_grid_shape ) diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index 8a8eb798..5676d10c 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -3,21 +3,18 @@ import asyncio import concurrent.futures import math -from abc import ABC, abstractmethod from collections.abc import Coroutine, Iterable from enum import Enum from pathlib import Path -from typing import TYPE_CHECKING, Any, TypeVar, cast +from typing import Any, TypeVar, cast import numpy as np import obstore import zarr from obspec_utils.registry import ObjectStoreRegistry from zarr.api.asynchronous import open_group as open_group_async -from zarr.core.chunk_key_encodings import DefaultChunkKeyEncoding -from zarr.core.group import GroupMetadata from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata -from zarr.core.metadata.v3 import RegularChunkGrid +from zarr.core.metadata.v3 import RegularChunkGridMetadata from zarr.experimental import ChunkGrid from zarr.storage import ObjectStore @@ -32,479 +29,10 @@ ) from virtualizarr.manifests.utils import ChunkKeySeparator -if TYPE_CHECKING: - import pyarrow as pa - # obstore doesn't export a public base type for stores, so we use Any for now. ObstoreStore = Any T = TypeVar("T") -ZarrArrayType = zarr.AsyncArray | zarr.Array - - -def join_url(base: str, key: str) -> str: - """Join a base URL (like s3://bucket/store.zarr) with an object key. 
- - Ensures we don't accidentally produce double slashes (after the scheme) - and that the returned string is scheme-friendly. - """ - if not base: - return key - # strip trailing slash from base and leading slash from key to avoid '//' in middle - return base.rstrip("/") + "/" + key.lstrip("/") - - -def _get_array_name(zarr_array: ZarrArrayType) -> str: - """Extract and normalize the array name.""" - name = getattr(zarr_array, "name", "") or "" - return name.lstrip("/") - - -def _normalize_chunk_keys(chunk_keys: list[str], prefix: str) -> list[str]: - """ - Normalize chunk keys to dot-separated coordinates. - - Strips the prefix from each key and replaces '/' with '.' for coordinate notation. - """ - chunk_coords = [ - k[len(prefix) :] if prefix and k.startswith(prefix) else k for k in chunk_keys - ] - return [coord.replace("/", ".") for coord in chunk_coords] - - -async def _handle_scalar_array( - zarr_array: ZarrArrayType, path: str, scalar_key: str -) -> dict[str, dict[str, Any]]: - """ - Handle scalar arrays (shape == ()). - - Parameters - ---------- - zarr_array - The scalar Zarr array. - path - Base path for constructing chunk paths. - scalar_key - The storage key for the scalar value (e.g., "0" for V2, "c" for V3). - - Returns - ------- - dict - Mapping with a single entry for the scalar chunk. - """ - size = await zarr_array.store.getsize(scalar_key) - actual_path = join_url(path, scalar_key) - return { - "0" if scalar_key == "0" else "c": { - "path": actual_path, - "offset": 0, - "length": size, - } - } - - -async def _build_chunk_mapping( - zarr_array: ZarrArrayType, path: str, prefix: str -) -> tuple["pa.Array", "pa.Array", "pa.Array"] | None: - """ - Build chunk mapping by listing the object store with obstore. - - Uses obstore's list_async with Arrow output to get chunk paths and sizes - in a single Rust-level call, avoiding per-chunk getsize calls. - - Parameters - ---------- - zarr_array - The Zarr array. 
- path - Base path for constructing chunk paths. - prefix - Prefix to list and strip from chunk keys. - - Returns - ------- - Tuple of (normalized_keys, full_paths, sizes) as PyArrow arrays, or None if no chunks found. - """ - import pyarrow as pa # type: ignore[import-untyped,import-not-found] - import pyarrow.compute as pc # type: ignore[import-untyped,import-not-found] - - path_batches = [] - size_batches = [] - stream = cast(ObjectStore, zarr_array.store).store.list_async( - prefix=prefix, return_arrow=True - ) - async for batch in stream: - pa_path_col = pa.array(batch.column("path")) - not_metadata = pc.invert( - pc.or_( - pc.match_substring(pa_path_col, pattern="/."), - pc.starts_with(pa_path_col, "."), - ) - ) - - filtered_paths = pa_path_col.filter(not_metadata) - filtered_sizes = pa.array(batch.column("size")).filter(not_metadata) - path_batches.append(filtered_paths) - size_batches.append(filtered_sizes) - - if not path_batches: - return None - - all_paths = pa.concat_arrays(path_batches) - all_sizes = pa.concat_arrays(size_batches) - - if len(all_paths) == 0: - return None - stripped_keys = pc.utf8_replace_slice( - all_paths, start=0, stop=len(prefix), replacement="" - ) - - # construct full paths - full_paths = pc.binary_join_element_wise( - pa.scalar(path.rstrip("/")), all_paths, "/" - ) - - return stripped_keys, full_paths, all_sizes - - -class ZarrVersionStrategy(ABC): - """Abstract base class for handling version-specific Zarr operations.""" - - @abstractmethod - async def get_chunk_mapping( - self, zarr_array: ZarrArrayType, path: str - ) -> dict[str, dict[str, Any]]: - """Get mapping of chunk coordinates to storage locations.""" - ... - - @abstractmethod - def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata: - """Get V3 metadata for the array (converting if necessary).""" - ... - - @abstractmethod - def get_prefix(self, zarr_array: ZarrArrayType) -> str: - """Get the storage prefix for chunk listing.""" - ... 
- - @abstractmethod - def _get_separator(self, zarr_array: ZarrArrayType) -> str: ... - - @abstractmethod - def validate(self, zarr_array: ZarrArrayType) -> None: - """Validate that the array can be virtualized.""" - - -class ZarrV2Strategy(ZarrVersionStrategy): - """Strategy for handling Zarr V2 arrays.""" - - async def get_chunk_mapping( - self, zarr_array: ZarrArrayType, path: str - ) -> dict[str, dict[str, Any]]: - """Create a mapping of chunk coordinates to their storage locations for V2 arrays.""" - name = _get_array_name(zarr_array) - prefix = f"{name}/" if name else "" - - # Handle scalar arrays - if zarr_array.shape == (): - scalar_key = f"{prefix}0" - return await _handle_scalar_array(zarr_array, path, scalar_key) - - return await _build_chunk_mapping(zarr_array, path, prefix) # type: ignore[return-value] - - def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata: - """Convert V2 metadata to V3 format.""" - - try: - from zarr.core.dtype import parse_dtype - from zarr.metadata.migrate_v3 import _convert_array_metadata - except (ImportError, AttributeError): - raise ImportError( - f"Zarr-Python>=3.1.3 is required for parsing Zarr V2 into Zarr V3. " - f"Found Zarr version '{zarr.__version__}'" - ) - - v2_metadata = zarr_array.metadata - assert isinstance(v2_metadata, ArrayV2Metadata) - - if v2_metadata.fill_value is None: - v2_dict = v2_metadata.to_dict() - v2_dtype = parse_dtype(cast(Any, v2_dict["dtype"]), zarr_format=2) - fill_value = v2_dtype.default_scalar() - v2_dict["fill_value"] = v2_dtype.to_json_scalar(fill_value, zarr_format=2) - temp_v2 = ArrayV2Metadata.from_dict(v2_dict) - v3_metadata = _convert_array_metadata(temp_v2) - else: - # Normal conversion; allow other errors to propagate. 
- v3_metadata = _convert_array_metadata(v2_metadata) - - # Set dimension names from attributes or generate defaults - if v3_metadata.dimension_names is None: - v3_dict = v3_metadata.to_dict() - dim_names = None - if hasattr(v2_metadata, "attributes") and v2_metadata.attributes: - dim_names = v2_metadata.attributes.get("_ARRAY_DIMENSIONS") - - if dim_names: - v3_dict["dimension_names"] = dim_names - else: - array_name = zarr_array.name.lstrip("/") if zarr_array.name else "array" - v3_dict["dimension_names"] = [ - f"{array_name}_dim_{i}" for i in range(len(zarr_array.shape)) - ] - v3_metadata = ArrayV3Metadata.from_dict(v3_dict) - - v3_dict = v3_metadata.to_dict() - - # Replace V2ChunkKeyEncoding with V3 DefaultChunkKeyEncoding - # The automatic conversion preserves V2's encoding, causing zarr to use V2-style - # paths (array/0) instead of V3-style (array/c/0). This ensures V3 semantics. - if ( - "attributes" in v3_dict - and isinstance(v3_dict["attributes"], dict) - and "_ARRAY_DIMENSIONS" in v3_dict["attributes"] - ): - del v3_dict["attributes"]["_ARRAY_DIMENSIONS"] - v3_metadata = ArrayV3Metadata.from_dict(v3_dict) - v3_dict["chunk_key_encoding"] = {"name": "default", "separator": "."} - v3_metadata = ArrayV3Metadata.from_dict(v3_dict) - - return v3_metadata - - def get_prefix(self, zarr_array: ZarrArrayType) -> str: - name = _get_array_name(zarr_array) - return f"{name}/" if name else "" - - def _get_separator(self, zarr_array: ZarrArrayType) -> str: - from typing import cast - - return cast(ArrayV2Metadata, zarr_array.metadata).dimension_separator - - def validate(self, zarr_array: ZarrArrayType) -> None: - pass # no restrictions for V2 - - -class ZarrV3Strategy(ZarrVersionStrategy): - """Strategy for handling Zarr V3 arrays.""" - - async def get_chunk_mapping( - self, zarr_array: ZarrArrayType, path: str - ) -> dict[str, dict[str, Any]]: - """Create a mapping of chunk coordinates to their storage locations for V3 arrays.""" - # Check for sharding - not yet 
supported - from zarr.codecs import ShardingCodec - - # Type narrowing: V3 strategy only handles V3 arrays with V3 metadata - metadata = zarr_array.metadata - if not isinstance(metadata, ArrayV3Metadata): - raise TypeError( - f"Expected ArrayV3Metadata in V3 strategy, got {type(metadata)}" - ) - - if any(isinstance(codec, ShardingCodec) for codec in metadata.codecs): - raise NotImplementedError( - "Zarr V3 arrays with sharding are not yet supported. " - "Sharding stores multiple chunks in a single storage object with non-zero offsets, " - "which VirtualiZarr does not currently handle. " - "Reading sharded arrays without proper offset handling would result in corrupted data." - ) - - name = _get_array_name(zarr_array) - - # Handle scalar arrays - if zarr_array.shape == (): - scalar_key = f"{name}/c" if name else "c" - return await _handle_scalar_array(zarr_array, path, scalar_key) - - # List chunk keys under the c/ subdirectory - prefix = f"{name}/c/" if name else "c/" - return await _build_chunk_mapping(zarr_array, path, prefix) # type: ignore[return-value] - - def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata: - """Return V3 metadata as-is (no conversion needed).""" - return zarr_array.metadata # type: ignore[return-value] - - def get_prefix(self, zarr_array: ZarrArrayType) -> str: - name = _get_array_name(zarr_array) - return f"{name}/c/" if name else "c/" - - def _get_separator(self, zarr_array: ZarrArrayType) -> str: - from typing import cast - - metadata = cast(ArrayV3Metadata, zarr_array.metadata) - return cast(DefaultChunkKeyEncoding, metadata.chunk_key_encoding).separator - - def validate(self, zarr_array: ZarrArrayType) -> None: - from zarr.codecs import ShardingCodec - - if not isinstance(zarr_array.metadata, ArrayV3Metadata): - return - if any( - isinstance(codec, ShardingCodec) for codec in zarr_array.metadata.codecs - ): - raise NotImplementedError( - "Zarr V3 arrays with sharding are not yet supported. 
" - "Sharding stores multiple chunks in a single storage object with non-zero offsets, " - "which VirtualiZarr does not currently handle. " - "Reading sharded arrays without proper offset handling would result in corrupted data." - ) - - -def get_strategy(zarr_array: ZarrArrayType) -> ZarrVersionStrategy: - """ - Factory function to get the appropriate strategy for a Zarr array. - - Parameters - ---------- - zarr_array - The Zarr array to get a strategy for. - - Returns - ------- - ZarrVersionStrategy - The appropriate strategy instance for the array's Zarr format version. - - Raises - ------ - NotImplementedError - If the Zarr format version is not supported. - """ - zarr_format = zarr_array.metadata.zarr_format - if zarr_format == 2: - return ZarrV2Strategy() - elif zarr_format == 3: - return ZarrV3Strategy() - else: - raise NotImplementedError(f"Zarr format {zarr_format} is not supported") - - -async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkManifest: - """Build a ChunkManifest from chunk coordinate mappings. - - Note: Chunk keys are discovered by listing what's actually in storage rather than - generating all possible keys from the chunk grid. Zarr allows chunks to be missing - (sparse arrays), and VirtualiZarr manifests preserve this sparsity. When chunks are - missing, Zarr will return the fill_value for those regions when the array is read. 
- """ - import pyarrow as pa # type: ignore[import-untyped,import-not-found] - import pyarrow.compute as pc # type: ignore[import-untyped,import-not-found] - - strategy = get_strategy(zarr_array) - strategy.validate(zarr_array) - chunk_grid_shape = ChunkGrid.from_metadata(zarr_array.metadata).grid_shape - - if zarr_array.shape == (): - chunk_map = await strategy.get_chunk_mapping(zarr_array, path) - if not chunk_map: - return ChunkManifest(chunk_map, shape=chunk_grid_shape) - entry = next(iter(chunk_map.values())) - return ChunkManifest._from_arrow( - paths=pa.array([entry["path"]], type=pa.string()), - offsets=pa.array([entry["offset"]], type=pa.uint64()), - lengths=pa.array([entry["length"]], type=pa.uint64()), - shape=chunk_grid_shape, - ) - - prefix = strategy.get_prefix(zarr_array) - - result = await _build_chunk_mapping(zarr_array, path, prefix) - - if result is None: - return ChunkManifest({}, shape=chunk_grid_shape) - - stripped_keys, full_paths, all_lengths = result - - total_size = zarr_array.nchunks - separator = strategy._get_separator(zarr_array) - split_keys = pc.split_pattern(stripped_keys, pattern=separator) - coords = [ - pc.cast(pc.list_element(split_keys, dim), pa.int64()).to_numpy() - for dim in range(zarr_array.ndim) - ] - flat_positions = pa.array(np.ravel_multi_index(coords, chunk_grid_shape)) - - # scatter listed chunks into a dense flat array (nulls = missing chunks) - updates = pa.table( - {"idx": flat_positions, "path": full_paths, "length": all_lengths} - ) - dense = ( - pa.table({"idx": pa.array(np.arange(total_size, dtype=np.int64))}) - .join(updates, "idx", join_type="left outer") - .sort_by("idx") - ) - - return ChunkManifest._from_arrow( - paths=dense["path"].combine_chunks(), - offsets=pa.repeat(pa.scalar(0, type=pa.uint64()), total_size), - lengths=dense["length"].combine_chunks(), - shape=chunk_grid_shape, - ) - - -def get_metadata(zarr_array: ZarrArrayType) -> ArrayV3Metadata: - """ - Get V3 metadata for an array, converting 
from V2 if necessary. - - Parameters - ---------- - zarr_array - The Zarr array to get metadata for. - - Returns - ------- - ArrayV3Metadata - V3 metadata for the array. - """ - strategy = get_strategy(zarr_array) - return strategy.get_metadata(zarr_array) - - -async def _construct_manifest_array( - zarr_array: zarr.AsyncArray[Any], path: str -) -> ManifestArray: - """Construct a ManifestArray from a zarr array.""" - array_metadata = get_metadata(zarr_array) - chunk_manifest = await build_chunk_manifest(zarr_array, path) - return ManifestArray(metadata=array_metadata, chunkmanifest=chunk_manifest) - - -async def _construct_manifest_group( - path: str, - store: zarr.storage.ObjectStore, - *, - skip_variables: str | Iterable[str] | None = None, - group: str | None = None, -) -> ManifestGroup: - """Construct a ManifestGroup from a zarr group.""" - zarr_group = await open_group_async(store=store, path=group, mode="r") - - zarr_array_keys = [key async for key in zarr_group.array_keys()] - _skip_variables = [] if skip_variables is None else list(skip_variables) - - zarr_arrays = await asyncio.gather( - *[ - zarr_group.getitem(var) - for var in zarr_array_keys - if var not in _skip_variables - ] - ) - - manifest_arrays = await asyncio.gather( - *[_construct_manifest_array(array, path) for array in zarr_arrays] # type: ignore[arg-type] - ) - - manifest_dict = { - array.basename: result for array, result in zip(zarr_arrays, manifest_arrays) - } - - manifest_group = ManifestGroup(manifest_dict, attributes=zarr_group.attrs) - manifest_group._metadata = GroupMetadata( - attributes=dict(zarr_group.attrs) if zarr_group.attrs is not None else {}, - zarr_format=3, - consolidated_metadata=None, - ) - - return manifest_group def _run_async(coro: Coroutine[Any, Any, T]) -> T: @@ -562,6 +90,18 @@ def chunks_dir_prefix(self) -> str: return "c/" +def join_url(base: str, key: str) -> str: + """Join a base URL (like s3://bucket/store.zarr) with an object key. 
+ + Ensures we don't accidentally produce double slashes (after the scheme) + and that the returned string is scheme-friendly. + """ + if not base: + return key + # strip trailing slash from base and leading slash from key to avoid '//' in middle + return base.rstrip("/") + "/" + key.lstrip("/") + + class ZarrParser: """ Parser for creating virtual references to existing Zarr stores. @@ -727,9 +267,9 @@ async def construct_manifest_array( """Construct a ManifestArray from a zarr array.""" array_v3_metadata = metadata_as_v3(zarr_array.metadata) - if not isinstance(array_v3_metadata.chunk_grid, RegularChunkGrid): + if not isinstance(array_v3_metadata.chunk_grid, RegularChunkGridMetadata): raise NotImplementedError( - f"Only RegularChunkGrid is supported, but array {zarr_array.path} " + f"Only RegularChunkGridMetadata is supported, but array {zarr_array.path} " f"uses {type(array_v3_metadata.chunk_grid).__name__}." ) @@ -743,7 +283,7 @@ async def construct_manifest_array( ) obs_store = cast(ObjectStore, zarr_array.store).store - chunk_manifest = await _build_chunk_manifest_from_store( + chunk_manifest = await build_chunk_manifest( obs_store=obs_store, array_path=zarr_array.path, store_base_uri=path, @@ -809,7 +349,7 @@ def _convert_v2_to_v3_dict(metadata: ArrayV2Metadata) -> dict: return v3_dict -async def _build_chunk_manifest_from_store( +async def build_chunk_manifest( obs_store: ObstoreStore, array_path: str, store_base_uri: str, @@ -843,13 +383,9 @@ async def _build_chunk_manifest_from_store( missing, Zarr will return the fill_value for those regions when the array is read. """ - # For sharded arrays, chunk_grid.chunk_shape is the shard shape (not the inner - # chunk shape, which lives inside the ShardingCodec config). So this grid describes - # the number of shard files on disk, which is exactly what we want for the manifest. 
- chunk_shape = cast(RegularChunkGrid, metadata.chunk_grid).chunk_shape - chunk_grid_shape = tuple( - math.ceil(s / c) for s, c in zip(metadata.shape, chunk_shape) - ) + # For sharded arrays, grid_shape reflects the number of shard files on disk, + # which is exactly what we want for the manifest. + chunk_grid_shape = ChunkGrid.from_metadata(metadata).grid_shape total_size = math.prod(chunk_grid_shape) # Handle scalar arrays