Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.manifests.manifest import join
from virtualizarr.manifests.utils import create_v3_array_metadata
from virtualizarr.utils import ceildiv


# Pytest configuration
Expand Down Expand Up @@ -133,10 +132,9 @@ def _generate_chunk_entries(
dict
Mapping of chunk keys to entry dictionaries
"""
chunk_grid_shape = tuple(
ceildiv(axis_length, chunk_length)
for axis_length, chunk_length in zip(shape, chunks)
)
from zarr.experimental import ChunkGrid

chunk_grid_shape = ChunkGrid.from_sizes(shape, chunks).grid_shape

if chunk_grid_shape == ():
return {"0": entry_generator((0,), (0,), itemsize)}
Expand Down
7 changes: 7 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ dev = [
"s3fs",
"lithops",
"dask",
"tifffile>=2026.2.16,<2027"
]

[project.urls]
Expand Down Expand Up @@ -182,6 +183,11 @@ h5netcdf = ">=1.5.0,<2"
[tool.pixi.feature.icechunk-dev.dependencies]
rust = "*"

[tool.pixi.feature.rectilinear.pypi-dependencies]
zarr = { git = "https://github.com/zarr-developers/zarr-python.git", branch = "main" }
xarray = { git = "https://github.com/maxrjones/xarray.git", branch = "poc/unified-zarr-chunk-grid" }
virtual-tiff = { git = "https://github.com/virtual-zarr/virtual-tiff.git", branch = "poc/unified-chunk-grid" }

[tool.pixi.feature.minimum-versions.dependencies]
xarray = "==2025.6.0"
numpy = "==2.1.0"
Expand Down Expand Up @@ -214,6 +220,7 @@ test-py314 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "ke
minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "py314", "zarr", "minio"]
minimum-versions = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr","minimum-versions", "py312"]
upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev", "zarr", "py313"]
rectilinear = ["dev", "test", "tiff", "hdf5-lib", "rectilinear", "py313"]
all = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "all_parsers", "all_writers", "py313"]
docs = ["docs", "dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff","zarr", "py313"]

Expand Down
37 changes: 15 additions & 22 deletions virtualizarr/manifests/array.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import warnings
from typing import TYPE_CHECKING, Any, Callable, Union, cast
from typing import Any, Callable, Union, cast

import numpy as np
import xarray as xr
from zarr.core.metadata.v3 import ArrayV3Metadata
from zarr.experimental import ChunkGrid

import virtualizarr.manifests.utils as utils
from virtualizarr.manifests.array_api import (
Expand All @@ -14,20 +15,6 @@
from virtualizarr.manifests.manifest import ChunkManifest
from virtualizarr.manifests.utils import ChunkKeySeparator

# Type-check against the min-deps name (RegularChunkGrid) while keeping a
# runtime try/except so both zarr-python <=3.1.6 and >3.1.6 work. Version
# sniffing is unreliable under hatch-vcs when installed from a git source
# without fetched tags.
if TYPE_CHECKING:
from zarr.core.metadata.v3 import RegularChunkGrid as RegularChunkGridMetadata
else:
try:
from zarr.core.metadata.v3 import RegularChunkGridMetadata # zarr-python>3.1.6
except ImportError:
from zarr.core.metadata.v3 import (
RegularChunkGrid as RegularChunkGridMetadata, # zarr-python<=3.1.6
)


class ManifestArray:
"""
Expand Down Expand Up @@ -64,11 +51,6 @@ def __init__(
# try unpacking the dict
_metadata = ArrayV3Metadata(**metadata)

if not isinstance(_metadata.chunk_grid, RegularChunkGridMetadata):
raise NotImplementedError(
f"Only RegularChunkGrid is currently supported for chunk size, but got type {type(_metadata.chunk_grid)}"
)

if isinstance(chunkmanifest, ChunkManifest):
_chunkmanifest = chunkmanifest
elif isinstance(chunkmanifest, dict):
Expand Down Expand Up @@ -97,11 +79,22 @@ def metadata(self) -> ArrayV3Metadata:
return self._metadata

@property
def chunks(self) -> tuple[int, ...]:
def chunk_grid(self) -> ChunkGrid:
"""Behavioral chunk grid bound to this array's shape."""
return ChunkGrid.from_metadata(self._metadata)

@property
def chunks(self) -> tuple[int, ...] | tuple[tuple[int, ...], ...]:
"""
Individual chunk size by number of elements.

For regular grids, returns a tuple of ints (e.g., (30, 50)).
For rectilinear grids, returns a tuple of tuples (e.g., ((10, 20, 30), (50, 50))).
"""
return self._metadata.chunks
grid = self.chunk_grid
if grid.is_regular:
return grid.chunk_shape
return grid.chunk_sizes

@property
def dtype(self) -> np.dtype:
Expand Down
33 changes: 22 additions & 11 deletions virtualizarr/manifests/array_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
from typing import TYPE_CHECKING, Any, Callable, Union, cast

import numpy as np

from virtualizarr.utils import determine_chunk_grid_shape
from zarr.experimental import ChunkGrid

from .manifest import ChunkManifest
from .utils import (
Expand Down Expand Up @@ -102,8 +101,17 @@ def concatenate(
[arr.manifest for arr in arrays], axis=axis
)

# For rectilinear grids, concatenate chunk edges along the concat axis
new_chunks = None
if not first_arr.chunk_grid.is_regular:
new_chunks = list(first_arr.chunks)
concat_edges: tuple[int, ...] = ()
for arr in arrays:
concat_edges = concat_edges + arr.chunks[axis] # type: ignore[index]
new_chunks[axis] = concat_edges

new_metadata = copy_and_replace_metadata(
old_metadata=first_arr.metadata, new_shape=new_shape
old_metadata=first_arr.metadata, new_shape=new_shape, new_chunks=new_chunks
)

return ManifestArray(chunkmanifest=concatenated_manifest, metadata=new_metadata)
Expand Down Expand Up @@ -151,7 +159,11 @@ def stack(
# chunk shape has changed because a length-1 axis has been inserted
old_chunks = first_arr.chunks
new_chunks = list(old_chunks)
new_chunks.insert(axis, 1)
# For rectilinear grids, each element is a sequence; insert a single-element tuple
if not first_arr.chunk_grid.is_regular:
new_chunks.insert(axis, (1,))
else:
new_chunks.insert(axis, 1)

new_metadata = copy_and_replace_metadata(
old_metadata=first_arr.metadata, new_shape=new_shape, new_chunks=new_chunks
Expand Down Expand Up @@ -187,22 +199,21 @@ def broadcast_to(x: "ManifestArray", /, shape: tuple[int, ...]) -> "ManifestArra

# new chunk_shape is old chunk_shape with singleton dimensions prepended
# (chunk shape can never change by more than adding length-1 axes because each chunk represents a fixed number of array elements)
old_chunk_shape = x.chunks
# broadcast_to only applies to regular chunk grids
old_chunk_shape: tuple[int, ...] = x.chunks # type: ignore[assignment]
new_chunk_shape = _prepend_singleton_dimensions(
old_chunk_shape, ndim=len(new_shape)
)

# find new chunk grid shape by dividing new array shape by new chunk shape
new_chunk_grid_shape = determine_chunk_grid_shape(new_shape, new_chunk_shape)

# do broadcasting of entries in manifest
broadcasted_manifest = _broadcast_manifest(x.manifest, shape=new_chunk_grid_shape)

new_metadata = copy_and_replace_metadata(
old_metadata=x.metadata,
new_shape=list(new_shape),
new_chunks=list(new_chunk_shape),
)
new_chunk_grid_shape = ChunkGrid.from_metadata(new_metadata).grid_shape

# do broadcasting of entries in manifest
broadcasted_manifest = _broadcast_manifest(x.manifest, shape=new_chunk_grid_shape)

return ManifestArray(chunkmanifest=broadcasted_manifest, metadata=new_metadata)

Expand Down
35 changes: 26 additions & 9 deletions virtualizarr/manifests/utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
import functools
import re
import typing
from collections.abc import Sequence
from typing import TYPE_CHECKING, Any, Dict, Iterable, Literal, Optional, Union

import numpy as np
from zarr import Array
from zarr.core.chunk_grids import _is_rectilinear_chunks
from zarr.core.chunk_key_encodings import ChunkKeyEncodingLike
from zarr.core.metadata.v3 import (
ArrayV3Metadata,
Expand Down Expand Up @@ -230,7 +232,7 @@ def check_same_codecs(codecs: list[Any]) -> None:
)


def check_same_chunk_shapes(chunks_list: list[tuple[int, ...]]) -> None:
def check_same_chunk_shapes(chunks_list: list[Sequence]) -> None:
"""Check all the chunk shapes are the same"""

first_chunks, *other_chunks_list = chunks_list
Expand Down Expand Up @@ -273,11 +275,17 @@ def _remove_elements_at_positions(


def check_no_partial_chunks_on_concat_axis(
shapes: list[tuple[int, ...]], chunks: list[tuple[int, ...]], axis: int
shapes: list[tuple[int, ...]], chunks: list, axis: int
):
"""Check that there are no partial chunks along the concatenation axis"""
# loop over the arrays to be concatenated
"""Check that there are no partial chunks along the concatenation axis.

Only applies to regular chunk grids; rectilinear grids explicitly encode
variable chunk sizes so partial-chunk checks are not needed.
"""
for i, (shape, chunk_shape) in enumerate(zip(shapes, chunks)):
# Rectilinear grids have sequences along each axis; skip the check
if _is_rectilinear_chunks(chunk_shape):
continue
if shape[axis] % chunk_shape[axis] > 0:
raise ValueError(
"Cannot concatenate arrays with partial chunks because only regular chunk grids are currently supported. "
Expand Down Expand Up @@ -347,7 +355,7 @@ def check_compatible_arrays(
def copy_and_replace_metadata(
old_metadata: ArrayV3Metadata,
new_shape: list[int] | None = None,
new_chunks: list[int] | None = None,
new_chunks: list | None = None,
new_dimension_names: Iterable[str] | None | Literal["default"] = "default",
new_attributes: dict | None = None,
) -> ArrayV3Metadata:
Expand All @@ -361,10 +369,19 @@ def copy_and_replace_metadata(
if new_shape is not None:
metadata_copy["shape"] = parse_shapelike(new_shape) # type: ignore[assignment]
if new_chunks is not None:
metadata_copy["chunk_grid"] = {
"name": "regular",
"configuration": {"chunk_shape": tuple(new_chunks)},
}
if _is_rectilinear_chunks(new_chunks):
metadata_copy["chunk_grid"] = {
"name": "rectilinear",
"configuration": {
"kind": "inline",
"chunk_shapes": [list(c) for c in new_chunks],
},
}
else:
metadata_copy["chunk_grid"] = {
"name": "regular",
"configuration": {"chunk_shape": tuple(new_chunks)},
}
if new_dimension_names != "default":
# need the option to use the literal string "default" as a sentinel value because None is a valid choice for zarr dimension_names
metadata_copy["dimension_names"] = parse_dimension_names(new_dimension_names)
Expand Down
12 changes: 3 additions & 9 deletions virtualizarr/parsers/kerchunk/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import ujson
from zarr.core.common import JSON
from zarr.core.metadata import ArrayV3Metadata
from zarr.experimental import ChunkGrid

from virtualizarr.codecs import (
zarr_codec_config_to_v3,
Expand All @@ -24,7 +25,6 @@
KerchunkArrRefs,
KerchunkStoreRefs,
)
from virtualizarr.utils import determine_chunk_grid_shape


def from_kerchunk_refs(decoded_arr_refs_zarray, zattrs) -> "ArrayV3Metadata":
Expand Down Expand Up @@ -179,10 +179,7 @@ def manifestarray_from_kerchunk_refs(
chunk_dict, metadata, zattrs = parse_array_refs(arr_refs)
# we want to remove the _ARRAY_DIMENSIONS from the final variables' .attrs
if chunk_dict:
chunk_grid_shape = determine_chunk_grid_shape(
metadata.shape,
metadata.chunks,
)
chunk_grid_shape = ChunkGrid.from_metadata(metadata).grid_shape
manifest = manifest_from_kerchunk_chunk_dict(
chunk_dict, fs_root=fs_root, shape=chunk_grid_shape
)
Expand All @@ -191,10 +188,7 @@ def manifestarray_from_kerchunk_refs(
# empty variables don't have physical chunks, but zarray shows that the variable
# is at least 1D

shape = determine_chunk_grid_shape(
metadata.shape,
metadata.chunks,
)
shape = ChunkGrid.from_metadata(metadata).grid_shape
manifest = ChunkManifest(entries={}, shape=shape)
marr = ManifestArray(metadata=metadata, chunkmanifest=manifest)
else:
Expand Down
30 changes: 7 additions & 23 deletions virtualizarr/parsers/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@
from collections.abc import Coroutine, Iterable
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Any, TypeVar, cast
from typing import Any, TypeVar, cast

import numpy as np
import obstore
import zarr
from obspec_utils.registry import ObjectStoreRegistry
from zarr.api.asynchronous import open_group as open_group_async
from zarr.core.metadata import ArrayV2Metadata, ArrayV3Metadata
from zarr.core.metadata.v3 import RegularChunkGridMetadata
from zarr.experimental import ChunkGrid
from zarr.storage import ObjectStore

from virtualizarr.manifests import (
Expand All @@ -26,21 +28,6 @@
validate_and_normalize_path_to_uri,
)
from virtualizarr.manifests.utils import ChunkKeySeparator
from virtualizarr.utils import determine_chunk_grid_shape

# Type-check against the min-deps name (RegularChunkGrid) while keeping a
# runtime try/except so both zarr-python <=3.1.6 and >3.1.6 work. Version
# sniffing is unreliable under hatch-vcs when installed from a git source
# without fetched tags.
if TYPE_CHECKING:
from zarr.core.metadata.v3 import RegularChunkGrid as RegularChunkGridMetadata
else:
try:
from zarr.core.metadata.v3 import RegularChunkGridMetadata # zarr-python>3.1.6
except ImportError:
from zarr.core.metadata.v3 import (
RegularChunkGrid as RegularChunkGridMetadata, # zarr-python<=3.1.6
)

# obstore doesn't export a public base type for stores, so we use Any for now.
ObstoreStore = Any
Expand Down Expand Up @@ -282,7 +269,7 @@ async def construct_manifest_array(

if not isinstance(array_v3_metadata.chunk_grid, RegularChunkGridMetadata):
raise NotImplementedError(
f"Only RegularChunkGrid is supported, but array {zarr_array.path} "
f"Only RegularChunkGridMetadata is supported, but array {zarr_array.path} "
f"uses {type(array_v3_metadata.chunk_grid).__name__}."
)

Expand Down Expand Up @@ -396,12 +383,9 @@ async def build_chunk_manifest(
missing, Zarr will return the fill_value for those regions when the array is read.
"""

# For sharded arrays, chunk_grid.chunk_shape is the shard shape (not the inner
# chunk shape, which lives inside the ShardingCodec config). So this grid describes
# the number of shard files on disk, which is exactly what we want for the manifest.
chunk_grid_shape = determine_chunk_grid_shape(
metadata.shape, cast(RegularChunkGridMetadata, metadata.chunk_grid).chunk_shape
)
# For sharded arrays, grid_shape reflects the number of shard files on disk,
# which is exactly what we want for the manifest.
chunk_grid_shape = ChunkGrid.from_metadata(metadata).grid_shape
total_size = math.prod(chunk_grid_shape)

# Handle scalar arrays
Expand Down
8 changes: 8 additions & 0 deletions virtualizarr/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@
import time

import pytest
import zarr


@pytest.fixture(autouse=True)
def _enable_rectilinear_chunks():
    """Run every test with zarr's ``array.rectilinear_chunks`` option set to True.

    The fixture is ``autouse``, so tests pick it up without requesting it.
    The override is applied through ``zarr.config.set`` used as a context
    manager, and the test body executes while that ``with`` block is open.
    """
    rectilinear_override = {"array.rectilinear_chunks": True}
    with zarr.config.set(rectilinear_override):
        # Hand control to the test while the override is in effect.
        yield


@pytest.fixture(scope="session")
Expand Down
Loading
Loading