diff --git a/docs/user-guide/experimental.md b/docs/user-guide/experimental.md index eaa53a4622..48f9e39c66 100644 --- a/docs/user-guide/experimental.md +++ b/docs/user-guide/experimental.md @@ -19,7 +19,7 @@ Because the `CacheStore` uses an ordinary Zarr `Store` object as the caching lay > **Note:** The CacheStore is a wrapper store that maintains compatibility with the full > `zarr.abc.store.Store` API while adding transparent caching functionality. -## Basic Usage +### Basic Usage Creating a CacheStore requires both a source store and a cache store. The cache store can be any Store implementation, providing flexibility in cache persistence: @@ -51,7 +51,7 @@ zarr_array[:] = np.random.random((100, 100)) The dual-store architecture allows you to use different store types for source and cache, such as a remote store for source data and a local store for persistent caching. -## Performance Benefits +### Performance Benefits The CacheStore provides significant performance improvements for repeated data access: @@ -79,7 +79,7 @@ print(f"Speedup is {speedup}") Cache effectiveness is particularly pronounced with repeated access to the same data chunks. -## Cache Configuration +### Cache Configuration The CacheStore can be configured with several parameters: @@ -137,7 +137,7 @@ cache = CacheStore( ) ``` -## Cache Statistics +### Cache Statistics The CacheStore provides statistics to monitor cache performance and state: @@ -159,7 +159,7 @@ print(info['cache_set_data']) The `cache_info()` method returns a dictionary with detailed information about the cache state. -## Cache Management +### Cache Management The CacheStore provides methods for manual cache management: @@ -177,7 +177,7 @@ assert info['current_size'] == 0 The `clear_cache()` method is an async method that clears both the cache store (if it supports the `clear` method) and all internal tracking data. -## Best Practices +### Best Practices 1. **Choose appropriate cache store**: Use MemoryStore for fast temporary caching or LocalStore for persistent caching 2. **Size the cache appropriately**: Set `max_size` based on available storage and expected data access patterns @@ -186,12 +186,12 @@ The `clear_cache()` method is an async method that clears both the cache store 5. **Consider data locality**: Group related data accesses together to improve cache efficiency 6. **Set appropriate expiration**: Use `max_age_seconds` for time-sensitive data or "infinity" for static data -## Working with Different Store Types +### Working with Different Store Types The CacheStore can wrap any store that implements the `zarr.abc.store.Store` interface and use any store type for the cache backend: -### Local Store with Memory Cache +#### Local Store with Memory Cache ```python exec="true" session="experimental-memory-cache" source="above" from zarr.storage import LocalStore, MemoryStore @@ -208,7 +208,7 @@ cached_store = CacheStore( ) ``` -### Memory Store with Persistent Cache +#### Memory Store with Persistent Cache ```python exec="true" session="experimental-local-cache" source="above" from tempfile import mkdtemp @@ -228,7 +228,7 @@ cached_store = CacheStore( The dual-store architecture provides flexibility in choosing the best combination of source and cache stores for your specific use case. 
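+
+For example, the source can be a remote store while the cache persists locally. The
+following sketch is illustrative and is not executed: it assumes an accessible S3 URL,
+and the `CacheStore` import path should match wherever the experimental class lives in
+your installation:
+
+```python
+from zarr.storage import FsspecStore, LocalStore
+
+# Hypothetical import path for the experimental CacheStore; adjust as needed.
+from zarr.experimental.cache_store import CacheStore
+
+# Remote source store (illustrative URL)
+source_store = FsspecStore.from_url("s3://example-bucket/data.zarr", read_only=True)
+
+# Persistent local cache that survives between sessions
+cache_store = LocalStore("/tmp/zarr-cache")
+
+cached_store = CacheStore(
+    store=source_store,
+    cache_store=cache_store,
+    max_size=256 * 1024 * 1024,   # 256 MB cache limit
+    max_age_seconds="infinity",   # static source data: cached chunks never expire
+)
+```
+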
-## Examples from Real Usage
+### Examples from Real Usage
 
 Here's a complete example demonstrating cache effectiveness:
 
@@ -273,3 +273,40 @@ print(f"Cache contains {info['cached_keys']} keys with {info['current_size']} bytes")
 
 This example shows how the CacheStore can significantly reduce access times for repeated
 data reads, particularly important when working with remote data sources. The dual-store
 architecture allows for flexible cache persistence and management.
+
+## Lazy indexing
+
+Experimental support for lazy indexing can be found in an array class defined in `zarr.experimental.lazy_indexing`.
+
+```python exec="true" session="lazy-indexing-intro" source="above" result="ansi"
+import zarr
+import numpy as np
+from zarr.experimental.lazy_indexing import Array, merge
+
+store = {}
+np_data = np.arange(100)
+zarr.create_array(store, data=np_data, chunks=(10,), fill_value=0, write_data=True)
+
+lazy_array = Array.open(store)
+print(lazy_array)
+#
+
+slice_a = slice(0, 10)
+slice_b = slice(10, None)
+
+subregion_a = lazy_array[slice_a]
+print(subregion_a)
+#
+assert np.array_equal(np.array(subregion_a), np_data[slice_a])
+
+subregion_b = lazy_array[slice_b]
+print(subregion_b)
+#
+assert np.array_equal(np.array(subregion_b), np_data[slice_b])
+
+merged = merge([subregion_a, subregion_b])
+assert merged == lazy_array
+assert np.array_equal(np.array(merged), np_data)
+```
+
+The base `zarr.Array` class returns a NumPy array when you index it. But when indexing an instance of `zarr.experimental.lazy_indexing.Array`, you get another lazy array. This is possible because the lazy indexing array keeps track of its coordinates relative to the chunk grid.
\ No newline at end of file
diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py
index 00536a1ec0..0e68288c68 100644
--- a/src/zarr/core/array.py
+++ b/src/zarr/core/array.py
@@ -1337,13 +1337,7 @@ async def example():
         result = asyncio.run(example())
         ```
         """
-        if self.shards is None:
-            chunks_per_shard = 1
-        else:
-            chunks_per_shard = product(
-                tuple(a // b for a, b in zip(self.shards, self.chunks, strict=True))
-            )
-        return (await self._nshards_initialized()) * chunks_per_shard
+        return await _nchunks_initialized(self)
 
     async def _nshards_initialized(self) -> int:
         """
@@ -1381,10 +1375,10 @@ async def example():
         result = asyncio.run(example())
         ```
         """
-        return len(await _shards_initialized(self))
+        return await _nshards_initialized(self)
 
     async def nbytes_stored(self) -> int:
-        return await self.store_path.store.getsize_prefix(self.store_path.path)
+        return await _nbytes_stored(self.store_path)
 
     def _iter_chunk_coords(
         self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None
@@ -1549,49 +1543,16 @@ async def _get_selection(
         out: NDBuffer | None = None,
         fields: Fields | None = None,
     ) -> NDArrayLikeOrScalar:
-        # check fields are sensible
-        out_dtype = check_fields(fields, self.dtype)
-
-        # setup output buffer
-        if out is not None:
-            if isinstance(out, NDBuffer):
-                out_buffer = out
-            else:
-                raise TypeError(f"out argument needs to be an NDBuffer. Got {type(out)!r}")
-            if out_buffer.shape != indexer.shape:
-                raise ValueError(
-                    f"shape of out argument doesn't match. 
Expected {indexer.shape}, got {out.shape}" - ) - else: - out_buffer = prototype.nd_buffer.empty( - shape=indexer.shape, - dtype=out_dtype, - order=self.order, - ) - if product(indexer.shape) > 0: - # need to use the order from the metadata for v2 - _config = self._config - if self.metadata.zarr_format == 2: - _config = replace(_config, order=self.order) - - # reading chunks and decoding them - await self.codec_pipeline.read( - [ - ( - self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype), - chunk_selection, - out_selection, - is_complete_chunk, - ) - for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer - ], - out_buffer, - drop_axes=indexer.drop_axes, - ) - if isinstance(indexer, BasicIndexer) and indexer.shape == (): - return out_buffer.as_scalar() - return out_buffer.as_ndarray_like() + return await _get_selection( + self.store_path, + self.metadata, + self.codec_pipeline, + self._config, + indexer, + prototype=prototype, + out=out, + fields=fields, + ) async def getitem( self, @@ -1636,14 +1597,14 @@ async def example(): value = asyncio.run(example()) ``` """ - if prototype is None: - prototype = default_buffer_prototype() - indexer = BasicIndexer( + return await _getitem( + self.store_path, + self.metadata, + self.codec_pipeline, + self._config, selection, - shape=self.metadata.shape, - chunk_grid=self.metadata.chunk_grid, + prototype=prototype, ) - return await self._get_selection(indexer, prototype=prototype) async def get_orthogonal_selection( self, @@ -1653,11 +1614,15 @@ async def get_orthogonal_selection( fields: Fields | None = None, prototype: BufferPrototype | None = None, ) -> NDArrayLikeOrScalar: - if prototype is None: - prototype = default_buffer_prototype() - indexer = OrthogonalIndexer(selection, self.shape, self.metadata.chunk_grid) - return await self._get_selection( - indexer=indexer, out=out, fields=fields, prototype=prototype + return await _get_orthogonal_selection( + self.store_path, + self.metadata, + self.codec_pipeline, + self._config, + selection, + out=out, + fields=fields, + prototype=prototype, ) async def get_mask_selection( @@ -1668,11 +1633,15 @@ async def get_mask_selection( fields: Fields | None = None, prototype: BufferPrototype | None = None, ) -> NDArrayLikeOrScalar: - if prototype is None: - prototype = default_buffer_prototype() - indexer = MaskIndexer(mask, self.shape, self.metadata.chunk_grid) - return await self._get_selection( - indexer=indexer, out=out, fields=fields, prototype=prototype + return await _get_mask_selection( + self.store_path, + self.metadata, + self.codec_pipeline, + self._config, + mask, + out=out, + fields=fields, + prototype=prototype, ) async def get_coordinate_selection( @@ -1683,18 +1652,17 @@ async def get_coordinate_selection( fields: Fields | None = None, prototype: BufferPrototype | None = None, ) -> NDArrayLikeOrScalar: - if prototype is None: - prototype = default_buffer_prototype() - indexer = CoordinateIndexer(selection, self.shape, self.metadata.chunk_grid) - out_array = await self._get_selection( - indexer=indexer, out=out, fields=fields, prototype=prototype + return await _get_coordinate_selection( + self.store_path, + self.metadata, + self.codec_pipeline, + self._config, + selection, + out=out, + fields=fields, + prototype=prototype, ) - if hasattr(out_array, "shape"): - # restore shape - out_array = np.array(out_array).reshape(indexer.sel_shape) - return out_array - async def _save_metadata(self, 
metadata: ArrayMetadata, ensure_parents: bool = False) -> None: """ Asynchronously save the array metadata. @@ -1709,56 +1677,15 @@ async def _set_selection( prototype: BufferPrototype, fields: Fields | None = None, ) -> None: - # check fields are sensible - check_fields(fields, self.dtype) - fields = check_no_multi_fields(fields) - - # check value shape - if np.isscalar(value): - array_like = prototype.buffer.create_zero_length().as_array_like() - if isinstance(array_like, np._typing._SupportsArrayFunc): - # TODO: need to handle array types that don't support __array_function__ - # like PyTorch and JAX - array_like_ = cast("np._typing._SupportsArrayFunc", array_like) - value = np.asanyarray(value, dtype=self.dtype, like=array_like_) - else: - if not hasattr(value, "shape"): - value = np.asarray(value, self.dtype) - # assert ( - # value.shape == indexer.shape - # ), f"shape of value doesn't match indexer shape. Expected {indexer.shape}, got {value.shape}" - if not hasattr(value, "dtype") or value.dtype.name != self.dtype.name: - if hasattr(value, "astype"): - # Handle things that are already NDArrayLike more efficiently - value = value.astype(dtype=self.dtype, order="A") - else: - value = np.array(value, dtype=self.dtype, order="A") - value = cast("NDArrayLike", value) - - # We accept any ndarray like object from the user and convert it - # to an NDBuffer (or subclass). From this point onwards, we only pass - # Buffer and NDBuffer between components. - value_buffer = prototype.nd_buffer.from_ndarray_like(value) - - # need to use the order from the metadata for v2 - _config = self._config - if self.metadata.zarr_format == 2: - _config = replace(_config, order=self.metadata.order) - - # merging with existing data and encoding chunks - await self.codec_pipeline.write( - [ - ( - self.store_path / self.metadata.encode_chunk_key(chunk_coords), - self.metadata.get_chunk_spec(chunk_coords, _config, prototype), - chunk_selection, - out_selection, - is_complete_chunk, - ) - for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer - ], - value_buffer, - drop_axes=indexer.drop_axes, + return await _set_selection( + self.store_path, + self.metadata, + self.codec_pipeline, + self._config, + indexer, + value, + prototype=prototype, + fields=fields, ) async def setitem( @@ -1800,14 +1727,15 @@ async def setitem( - This method is asynchronous and should be awaited. - Supports basic indexing, where the selection is contiguous and does not involve advanced indexing. """ - if prototype is None: - prototype = default_buffer_prototype() - indexer = BasicIndexer( + return await _setitem( + self.store_path, + self.metadata, + self.codec_pipeline, + self._config, selection, - shape=self.metadata.shape, - chunk_grid=self.metadata.chunk_grid, + value, + prototype=prototype, ) - return await self._set_selection(indexer, value, prototype=prototype) @property def oindex(self) -> AsyncOIndex[T_ArrayMetadata]: @@ -1849,32 +1777,7 @@ async def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) ----- - This method is asynchronous and should be awaited. 
""" - new_shape = parse_shapelike(new_shape) - assert len(new_shape) == len(self.metadata.shape) - new_metadata = self.metadata.update_shape(new_shape) - - if delete_outside_chunks: - # Remove all chunks outside of the new shape - old_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(self.metadata.shape)) - new_chunk_coords = set(self.metadata.chunk_grid.all_chunk_coords(new_shape)) - - async def _delete_key(key: str) -> None: - await (self.store_path / key).delete() - - await concurrent_map( - [ - (self.metadata.encode_chunk_key(chunk_coords),) - for chunk_coords in old_chunk_coords.difference(new_chunk_coords) - ], - _delete_key, - zarr_config.get("async.concurrency"), - ) - - # Write new metadata - await self._save_metadata(new_metadata) - - # Update metadata (in place) - object.__setattr__(self, "metadata", new_metadata) + return await _resize(self, new_shape, delete_outside_chunks) async def append(self, data: npt.ArrayLike, axis: int = 0) -> tuple[int, ...]: """Append `data` to `axis`. @@ -1895,40 +1798,7 @@ async def append(self, data: npt.ArrayLike, axis: int = 0) -> tuple[int, ...]: The size of all dimensions other than `axis` must match between this array and `data`. """ - # ensure data is array-like - if not hasattr(data, "shape"): - data = np.asanyarray(data) - - self_shape_preserved = tuple(s for i, s in enumerate(self.shape) if i != axis) - data_shape_preserved = tuple(s for i, s in enumerate(data.shape) if i != axis) - if self_shape_preserved != data_shape_preserved: - raise ValueError( - f"shape of data to append is not compatible with the array. " - f"The shape of the data is ({data_shape_preserved})" - f"and the shape of the array is ({self_shape_preserved})." - "All dimensions must match except for the dimension being " - "appended." - ) - # remember old shape - old_shape = self.shape - - # determine new shape - new_shape = tuple( - self.shape[i] if i != axis else self.shape[i] + data.shape[i] - for i in range(len(self.shape)) - ) - - # resize - await self.resize(new_shape) - - # store data - append_selection = tuple( - slice(None) if i != axis else slice(old_shape[i], new_shape[i]) - for i in range(len(self.shape)) - ) - await self.setitem(append_selection, data) - - return new_shape + return await _append(self, data, axis) async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self: """ @@ -1956,11 +1826,7 @@ async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self: - The updated attributes will be merged with existing attributes, and any conflicts will be overwritten by the new values. """ - self.metadata.attributes.update(new_attributes) - - # Write new metadata - await self._save_metadata(self.metadata) - + await _update_attributes(self, new_attributes) return self def __repr__(self) -> str: @@ -2017,10 +1883,7 @@ async def info_complete(self) -> Any: ------- [zarr.AsyncArray.info][] - A property giving just the statically known information about an array. 
""" - return self._info( - await self._nshards_initialized(), - await self.store_path.store.getsize_prefix(self.store_path.path), - ) + return await _info_complete(self) def _info( self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None @@ -5518,3 +5381,685 @@ def _iter_chunk_regions( return _iter_regions( array.shape, array.chunks, origin=origin, selection_shape=selection_shape, trim_excess=True ) + + +async def _nchunks_initialized( + array: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], +) -> int: + """ + Calculate the number of chunks that have been initialized in storage. + + This value is calculated as the product of the number of initialized shards and the number + of chunks per shard. For arrays that do not use sharding, the number of chunks per shard is + effectively 1, and in that case the number of chunks initialized is the same as the number + of stored objects associated with an array. + + Parameters + ---------- + array : AsyncArray + The array to inspect. + + Returns + ------- + nchunks_initialized : int + The number of chunks that have been initialized. + """ + if array.shards is None: + chunks_per_shard = 1 + else: + chunks_per_shard = product( + tuple(a // b for a, b in zip(array.shards, array.chunks, strict=True)) + ) + return (await _nshards_initialized(array)) * chunks_per_shard + + +async def _nshards_initialized( + array: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], +) -> int: + """ + Calculate the number of shards that have been initialized in storage. + + This is the number of shards that have been persisted to the storage backend. + + Parameters + ---------- + array : AsyncArray + The array to inspect. + + Returns + ------- + nshards_initialized : int + The number of shards that have been initialized. + """ + return len(await _shards_initialized(array)) + + +async def _nbytes_stored( + store_path: StorePath, +) -> int: + """ + Calculate the number of bytes stored for an array. + + Parameters + ---------- + store_path : StorePath + The store path of the array. + + Returns + ------- + nbytes_stored : int + The number of bytes stored. + """ + return await store_path.store.getsize_prefix(store_path.path) + + +async def _get_selection( + store_path: StorePath, + metadata: ArrayMetadata, + codec_pipeline: CodecPipeline, + config: ArrayConfig, + indexer: Indexer, + *, + prototype: BufferPrototype, + out: NDBuffer | None = None, + fields: Fields | None = None, +) -> NDArrayLikeOrScalar: + """ + Get a selection from an array. + + Parameters + ---------- + store_path : StorePath + The store path of the array. + metadata : ArrayMetadata + The array metadata. + codec_pipeline : CodecPipeline + The codec pipeline for encoding/decoding. + config : ArrayConfig + The array configuration. + indexer : Indexer + The indexer specifying the selection. + prototype : BufferPrototype + A buffer prototype to use for the retrieved data. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. 
+ """ + # Get dtype from metadata + if metadata.zarr_format == 2: + zdtype = metadata.dtype + else: + zdtype = metadata.data_type + dtype = zdtype.to_native_dtype() + + # Determine memory order + if metadata.zarr_format == 2: + order = metadata.order + else: + order = config.order + + # check fields are sensible + out_dtype = check_fields(fields, dtype) + + # setup output buffer + if out is not None: + if isinstance(out, NDBuffer): + out_buffer = out + else: + raise TypeError(f"out argument needs to be an NDBuffer. Got {type(out)!r}") + if out_buffer.shape != indexer.shape: + raise ValueError( + f"shape of out argument doesn't match. Expected {indexer.shape}, got {out.shape}" + ) + else: + out_buffer = prototype.nd_buffer.empty( + shape=indexer.shape, + dtype=out_dtype, + order=order, + ) + if product(indexer.shape) > 0: + # need to use the order from the metadata for v2 + _config = config + if metadata.zarr_format == 2: + _config = replace(_config, order=order) + + # reading chunks and decoding them + await codec_pipeline.read( + [ + ( + store_path / metadata.encode_chunk_key(chunk_coords), + metadata.get_chunk_spec(chunk_coords, _config, prototype=prototype), + chunk_selection, + out_selection, + is_complete_chunk, + ) + for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer + ], + out_buffer, + drop_axes=indexer.drop_axes, + ) + if isinstance(indexer, BasicIndexer) and indexer.shape == (): + return out_buffer.as_scalar() + return out_buffer.as_ndarray_like() + + +async def _getitem( + store_path: StorePath, + metadata: ArrayMetadata, + codec_pipeline: CodecPipeline, + config: ArrayConfig, + selection: BasicSelection, + *, + prototype: BufferPrototype | None = None, +) -> NDArrayLikeOrScalar: + """ + Retrieve a subset of the array's data based on the provided selection. + + Parameters + ---------- + store_path : StorePath + The store path of the array. + metadata : ArrayMetadata + The array metadata. + codec_pipeline : CodecPipeline + The codec pipeline for encoding/decoding. + config : ArrayConfig + The array configuration. + selection : BasicSelection + A selection object specifying the subset of data to retrieve. + prototype : BufferPrototype, optional + A buffer prototype to use for the retrieved data (default is None). + + Returns + ------- + NDArrayLikeOrScalar + The retrieved subset of the array's data. + """ + if prototype is None: + prototype = default_buffer_prototype() + indexer = BasicIndexer( + selection, + shape=metadata.shape, + chunk_grid=metadata.chunk_grid, + ) + return await _get_selection( + store_path, metadata, codec_pipeline, config, indexer, prototype=prototype + ) + + +async def _get_orthogonal_selection( + store_path: StorePath, + metadata: ArrayMetadata, + codec_pipeline: CodecPipeline, + config: ArrayConfig, + selection: OrthogonalSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, +) -> NDArrayLikeOrScalar: + """ + Get an orthogonal selection from the array. + + Parameters + ---------- + store_path : StorePath + The store path of the array. + metadata : ArrayMetadata + The array metadata. + codec_pipeline : CodecPipeline + The codec pipeline for encoding/decoding. + config : ArrayConfig + The array configuration. + selection : OrthogonalSelection + The orthogonal selection specification. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. 
+ prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + if prototype is None: + prototype = default_buffer_prototype() + indexer = OrthogonalIndexer(selection, metadata.shape, metadata.chunk_grid) + return await _get_selection( + store_path, + metadata, + codec_pipeline, + config, + indexer=indexer, + out=out, + fields=fields, + prototype=prototype, + ) + + +async def _get_mask_selection( + store_path: StorePath, + metadata: ArrayMetadata, + codec_pipeline: CodecPipeline, + config: ArrayConfig, + mask: MaskSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, +) -> NDArrayLikeOrScalar: + """ + Get a mask selection from the array. + + Parameters + ---------- + store_path : StorePath + The store path of the array. + metadata : ArrayMetadata + The array metadata. + codec_pipeline : CodecPipeline + The codec pipeline for encoding/decoding. + config : ArrayConfig + The array configuration. + mask : MaskSelection + The boolean mask specifying the selection. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + if prototype is None: + prototype = default_buffer_prototype() + indexer = MaskIndexer(mask, metadata.shape, metadata.chunk_grid) + return await _get_selection( + store_path, + metadata, + codec_pipeline, + config, + indexer=indexer, + out=out, + fields=fields, + prototype=prototype, + ) + + +async def _get_coordinate_selection( + store_path: StorePath, + metadata: ArrayMetadata, + codec_pipeline: CodecPipeline, + config: ArrayConfig, + selection: CoordinateSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, +) -> NDArrayLikeOrScalar: + """ + Get a coordinate selection from the array. + + Parameters + ---------- + store_path : StorePath + The store path of the array. + metadata : ArrayMetadata + The array metadata. + codec_pipeline : CodecPipeline + The codec pipeline for encoding/decoding. + config : ArrayConfig + The array configuration. + selection : CoordinateSelection + The coordinate selection specification. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + if prototype is None: + prototype = default_buffer_prototype() + indexer = CoordinateIndexer(selection, metadata.shape, metadata.chunk_grid) + out_array = await _get_selection( + store_path, + metadata, + codec_pipeline, + config, + indexer=indexer, + out=out, + fields=fields, + prototype=prototype, + ) + + if hasattr(out_array, "shape"): + # restore shape + out_array = np.array(out_array).reshape(indexer.sel_shape) + return out_array + + +async def _set_selection( + store_path: StorePath, + metadata: ArrayMetadata, + codec_pipeline: CodecPipeline, + config: ArrayConfig, + indexer: Indexer, + value: npt.ArrayLike, + *, + prototype: BufferPrototype, + fields: Fields | None = None, +) -> None: + """ + Set a selection in an array. 
+ + Parameters + ---------- + store_path : StorePath + The store path of the array. + metadata : ArrayMetadata + The array metadata. + codec_pipeline : CodecPipeline + The codec pipeline for encoding/decoding. + config : ArrayConfig + The array configuration. + indexer : Indexer + The indexer specifying the selection. + value : npt.ArrayLike + The values to write. + prototype : BufferPrototype + A buffer prototype to use. + fields : Fields | None, optional + Fields to select from structured arrays. + """ + # Get dtype from metadata + if metadata.zarr_format == 2: + zdtype = metadata.dtype + else: + zdtype = metadata.data_type + dtype = zdtype.to_native_dtype() + + # check fields are sensible + check_fields(fields, dtype) + fields = check_no_multi_fields(fields) + + # check value shape + if np.isscalar(value): + array_like = prototype.buffer.create_zero_length().as_array_like() + if isinstance(array_like, np._typing._SupportsArrayFunc): + # TODO: need to handle array types that don't support __array_function__ + # like PyTorch and JAX + array_like_ = cast("np._typing._SupportsArrayFunc", array_like) + value = np.asanyarray(value, dtype=dtype, like=array_like_) + else: + if not hasattr(value, "shape"): + value = np.asarray(value, dtype) + # assert ( + # value.shape == indexer.shape + # ), f"shape of value doesn't match indexer shape. Expected {indexer.shape}, got {value.shape}" + if not hasattr(value, "dtype") or value.dtype.name != dtype.name: + if hasattr(value, "astype"): + # Handle things that are already NDArrayLike more efficiently + value = value.astype(dtype=dtype, order="A") + else: + value = np.array(value, dtype=dtype, order="A") + value = cast("NDArrayLike", value) + + # We accept any ndarray like object from the user and convert it + # to an NDBuffer (or subclass). From this point onwards, we only pass + # Buffer and NDBuffer between components. + value_buffer = prototype.nd_buffer.from_ndarray_like(value) + + # Determine memory order + if metadata.zarr_format == 2: + order = metadata.order + else: + order = config.order + + # need to use the order from the metadata for v2 + _config = config + if metadata.zarr_format == 2: + _config = replace(_config, order=order) + + # merging with existing data and encoding chunks + await codec_pipeline.write( + [ + ( + store_path / metadata.encode_chunk_key(chunk_coords), + metadata.get_chunk_spec(chunk_coords, _config, prototype), + chunk_selection, + out_selection, + is_complete_chunk, + ) + for chunk_coords, chunk_selection, out_selection, is_complete_chunk in indexer + ], + value_buffer, + drop_axes=indexer.drop_axes, + ) + + +async def _setitem( + store_path: StorePath, + metadata: ArrayMetadata, + codec_pipeline: CodecPipeline, + config: ArrayConfig, + selection: BasicSelection, + value: npt.ArrayLike, + prototype: BufferPrototype | None = None, +) -> None: + """ + Set values in the array using basic indexing. + + Parameters + ---------- + store_path : StorePath + The store path of the array. + metadata : ArrayMetadata + The array metadata. + codec_pipeline : CodecPipeline + The codec pipeline for encoding/decoding. + config : ArrayConfig + The array configuration. + selection : BasicSelection + The selection defining the region of the array to set. + value : npt.ArrayLike + The values to be written into the selected region of the array. + prototype : BufferPrototype or None, optional + A prototype buffer that defines the structure and properties of the array chunks being modified. + If None, the default buffer prototype is used. 
+ """ + if prototype is None: + prototype = default_buffer_prototype() + indexer = BasicIndexer( + selection, + shape=metadata.shape, + chunk_grid=metadata.chunk_grid, + ) + return await _set_selection( + store_path, metadata, codec_pipeline, config, indexer, value, prototype=prototype + ) + + +async def _resize( + array: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], + new_shape: ShapeLike, + delete_outside_chunks: bool = True, +) -> None: + """ + Resize an array to a new shape. + + Parameters + ---------- + array : AsyncArray + The array to resize. + new_shape : ShapeLike + The desired new shape of the array. + delete_outside_chunks : bool, optional + If True (default), chunks that fall outside the new shape will be deleted. + If False, the data in those chunks will be preserved. + """ + new_shape = parse_shapelike(new_shape) + assert len(new_shape) == len(array.metadata.shape) + new_metadata = array.metadata.update_shape(new_shape) + + if delete_outside_chunks: + # Remove all chunks outside of the new shape + old_chunk_coords = set(array.metadata.chunk_grid.all_chunk_coords(array.metadata.shape)) + new_chunk_coords = set(array.metadata.chunk_grid.all_chunk_coords(new_shape)) + + async def _delete_key(key: str) -> None: + await (array.store_path / key).delete() + + await concurrent_map( + [ + (array.metadata.encode_chunk_key(chunk_coords),) + for chunk_coords in old_chunk_coords.difference(new_chunk_coords) + ], + _delete_key, + zarr_config.get("async.concurrency"), + ) + + # Write new metadata + await save_metadata(array.store_path, new_metadata) + + # Update metadata (in place) + object.__setattr__(array, "metadata", new_metadata) + + +async def _append( + array: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], + data: npt.ArrayLike, + axis: int = 0, +) -> tuple[int, ...]: + """ + Append data to an array along the specified axis. + + Parameters + ---------- + array : AsyncArray + The array to append to. + data : npt.ArrayLike + Data to be appended. + axis : int + Axis along which to append. + + Returns + ------- + new_shape : tuple[int, ...] + The new shape of the array after appending. + + Notes + ----- + The size of all dimensions other than `axis` must match between the + array and `data`. + """ + # ensure data is array-like + if not hasattr(data, "shape"): + data = np.asanyarray(data) + + self_shape_preserved = tuple(s for i, s in enumerate(array.shape) if i != axis) + data_shape_preserved = tuple(s for i, s in enumerate(data.shape) if i != axis) + if self_shape_preserved != data_shape_preserved: + raise ValueError( + f"shape of data to append is not compatible with the array. " + f"The shape of the data is ({data_shape_preserved})" + f"and the shape of the array is ({self_shape_preserved})." + "All dimensions must match except for the dimension being " + "appended." 
+ ) + # remember old shape + old_shape = array.shape + + # determine new shape + new_shape = tuple( + array.shape[i] if i != axis else array.shape[i] + data.shape[i] + for i in range(len(array.shape)) + ) + + # resize + await _resize(array, new_shape) + + # store data + append_selection = tuple( + slice(None) if i != axis else slice(old_shape[i], new_shape[i]) + for i in range(len(array.shape)) + ) + await _setitem( + array.store_path, + array.metadata, + array.codec_pipeline, + array._config, + append_selection, + data, + ) + + return new_shape + + +async def _update_attributes( + array: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], + new_attributes: dict[str, JSON], +) -> AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata]: + """ + Update the array's attributes. + + Parameters + ---------- + array : AsyncArray + The array whose attributes to update. + new_attributes : dict[str, JSON] + A dictionary of new attributes to update or add to the array. + + Returns + ------- + AsyncArray + The array with the updated attributes. + """ + array.metadata.attributes.update(new_attributes) + + # Write new metadata + await save_metadata(array.store_path, array.metadata) + + return array + + +async def _info_complete( + array: AsyncArray[ArrayV2Metadata] | AsyncArray[ArrayV3Metadata], +) -> Any: + """ + Return all the information for an array, including dynamic information like storage size. + + Parameters + ---------- + array : AsyncArray + The array to get info for. + + Returns + ------- + ArrayInfo + Complete information about the array including: + - The count of chunks initialized + - The sum of the bytes written + """ + return array._info( + await _nshards_initialized(array), + await array.store_path.store.getsize_prefix(array.store_path.path), + ) diff --git a/src/zarr/experimental/array.py b/src/zarr/experimental/array.py new file mode 100644 index 0000000000..4f5bda7302 --- /dev/null +++ b/src/zarr/experimental/array.py @@ -0,0 +1,955 @@ +from __future__ import annotations + +from itertools import starmap +from typing import TYPE_CHECKING, Any + +import numpy as np + +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec +from zarr.abc.numcodec import Numcodec +from zarr.core._info import ArrayInfo +from zarr.core.array import ( + _append, + _get_coordinate_selection, + _get_mask_selection, + _get_orthogonal_selection, + _getitem, + _info_complete, + _iter_chunk_coords, + _iter_chunk_regions, + _iter_shard_coords, + _iter_shard_keys, + _iter_shard_regions, + _nbytes_stored, + _nchunks_initialized, + _nshards_initialized, + _resize, + _setitem, + _update_attributes, + create_codec_pipeline, + get_array_metadata, + parse_array_metadata, +) +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config +from zarr.core.buffer import ( + BufferPrototype, + NDArrayLikeOrScalar, + NDBuffer, +) +from zarr.core.common import ( + JSON, + MemoryOrder, + ShapeLike, + ZarrFormat, + ceildiv, + product, +) +from zarr.core.indexing import ( + BasicSelection, + CoordinateSelection, + Fields, + MaskSelection, + OrthogonalSelection, +) +from zarr.core.metadata import ( + ArrayMetadata, + ArrayMetadataDict, + ArrayV2Metadata, + ArrayV3Metadata, +) +from zarr.core.sync import sync +from zarr.storage._common import StorePath, make_store_path + +if TYPE_CHECKING: + from collections.abc import Iterator, Sequence + from typing import Self + + import numpy.typing as npt + + from zarr.abc.codec import CodecPipeline + from zarr.abc.store import Store + from 
zarr.storage import StoreLike
+
+
+class Array:
+    """
+    A unified Zarr array class with both synchronous and asynchronous methods.
+
+    This class combines the functionality of AsyncArray and Array into a single class.
+    For each operation, there is both a synchronous method (e.g., `getitem`) and an
+    asynchronous method (e.g., `getitem_async`).
+
+    Parameters
+    ----------
+    store_path : StorePath
+        The path to the Zarr store.
+    metadata : ArrayV2Metadata | ArrayV3Metadata
+        The metadata of the array.
+    codec_pipeline : CodecPipeline, optional
+        The codec pipeline used for encoding and decoding chunks; if None, one is
+        created from the metadata.
+    config : ArrayConfigLike, optional
+        The runtime configuration of the array, by default None.
+
+    Attributes
+    ----------
+    metadata : ArrayV2Metadata | ArrayV3Metadata
+        The metadata of the array.
+    store_path : StorePath
+        The path to the Zarr store.
+    codec_pipeline : CodecPipeline
+        The codec pipeline used for encoding and decoding chunks.
+    config : ArrayConfig
+        The runtime configuration of the array.
+    """
+
+    metadata: ArrayV2Metadata | ArrayV3Metadata
+    store_path: StorePath
+    codec_pipeline: CodecPipeline
+    config: ArrayConfig
+
+    def __init__(
+        self,
+        store_path: StorePath,
+        metadata: ArrayMetadata | ArrayMetadataDict,
+        *,
+        codec_pipeline: CodecPipeline | None = None,
+        config: ArrayConfigLike | None = None,
+    ) -> None:
+        metadata_parsed = parse_array_metadata(metadata)
+        config_parsed = parse_array_config(config)
+
+        if codec_pipeline is None:
+            codec_pipeline = create_codec_pipeline(metadata=metadata_parsed, store=store_path.store)
+
+        self.metadata = metadata_parsed
+        self.store_path = store_path
+        self.config = config_parsed
+        self.codec_pipeline = codec_pipeline
+
+    # -------------------------------------------------------------------------
+    # Class methods: open
+    # -------------------------------------------------------------------------
+
+    @classmethod
+    async def open_async(
+        cls,
+        store: StoreLike,
+        *,
+        config: ArrayConfigLike | None = None,
+        codec_pipeline: CodecPipeline | None = None,
+        zarr_format: ZarrFormat | None = 3,
+    ) -> Array:
+        """
+        Async method to open an existing Zarr array from a given store.
+
+        Parameters
+        ----------
+        store : StoreLike
+            The store containing the Zarr array.
+        config : ArrayConfigLike | None, optional
+            The runtime configuration of the array.
+        codec_pipeline : CodecPipeline | None, optional
+            A pre-built codec pipeline; if None, one is created from the metadata.
+        zarr_format : ZarrFormat | None, optional
+            The Zarr format version (default is 3).
+
+        Returns
+        -------
+        Array
+            The opened Zarr array.
+        """
+        store_path = await make_store_path(store)
+        metadata_dict = await get_array_metadata(store_path, zarr_format=zarr_format)
+        return cls(
+            store_path=store_path,
+            metadata=metadata_dict,
+            codec_pipeline=codec_pipeline,
+            config=config,
+        )
+
+    @classmethod
+    def open(
+        cls,
+        store: StoreLike,
+        *,
+        config: ArrayConfigLike | None = None,
+        codec_pipeline: CodecPipeline | None = None,
+        zarr_format: ZarrFormat | None = 3,
+    ) -> Array:
+        """
+        Open an existing Zarr array from a given store.
+
+        Parameters
+        ----------
+        store : StoreLike
+            The store containing the Zarr array.
+        config : ArrayConfigLike | None, optional
+            The runtime configuration of the array.
+        codec_pipeline : CodecPipeline | None, optional
+            A pre-built codec pipeline; if None, one is created from the metadata.
+        zarr_format : ZarrFormat | None, optional
+            The Zarr format version (default is 3).
+
+        Returns
+        -------
+        Array
+            The opened Zarr array.
+        """
+        return sync(
+            cls.open_async(
+                store, config=config, codec_pipeline=codec_pipeline, zarr_format=zarr_format
+            )
+        )
+
+    # -------------------------------------------------------------------------
+    # Properties (all synchronous, derived from metadata/store_path)
+    # -------------------------------------------------------------------------
+
+    @property
+    def store(self) -> Store:
+        """The store containing the array data."""
+        return self.store_path.store
+
+    @property
+    def ndim(self) -> int:
+        """Returns the number of dimensions in the Array."""
+        return len(self.metadata.shape)
+
+    @property
+    def shape(self) -> tuple[int, ...]:
+        """Returns the shape of the Array."""
+        return self.metadata.shape
+
+    @property
+    def chunks(self) -> tuple[int, ...]:
+        """Returns the chunk shape of the Array."""
+        return self.metadata.chunks
+
+    @property
+    def shards(self) -> tuple[int, ...] | None:
+        """Returns the shard shape of the Array, or None if sharding is not used."""
+        return self.metadata.shards
+
+    @property
+    def size(self) -> int:
+        """Returns the total number of elements in the array."""
+        return np.prod(self.metadata.shape).item()
+
+    @property
+    def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...]:
+        """Filters applied to each chunk before serialization."""
+        if self.metadata.zarr_format == 2:
+            filters = self.metadata.filters
+            if filters is None:
+                return ()
+            return filters
+        return tuple(
+            codec for codec in self.metadata.inner_codecs if isinstance(codec, ArrayArrayCodec)
+        )
+
+    @property
+    def serializer(self) -> ArrayBytesCodec | None:
+        """Array-to-bytes codec for serializing chunks."""
+        if self.metadata.zarr_format == 2:
+            return None
+        return next(
+            codec for codec in self.metadata.inner_codecs if isinstance(codec, ArrayBytesCodec)
+        )
+
+    @property
+    def compressors(self) -> tuple[Numcodec, ...] 
| tuple[BytesBytesCodec, ...]: + """Compressors applied to each chunk after serialization.""" + if self.metadata.zarr_format == 2: + if self.metadata.compressor is not None: + return (self.metadata.compressor,) + return () + return tuple( + codec for codec in self.metadata.inner_codecs if isinstance(codec, BytesBytesCodec) + ) + + @property + def _zdtype(self) -> Any: + """The zarr-specific representation of the array data type.""" + if self.metadata.zarr_format == 2: + return self.metadata.dtype + else: + return self.metadata.data_type + + @property + def dtype(self) -> np.dtype[Any]: + """Returns the data type of the array.""" + return self._zdtype.to_native_dtype() + + @property + def order(self) -> MemoryOrder: + """Returns the memory order of the array.""" + if self.metadata.zarr_format == 2: + return self.metadata.order + else: + return self.config.order + + @property + def attrs(self) -> dict[str, JSON]: + """Returns the attributes of the array.""" + return self.metadata.attributes + + @property + def read_only(self) -> bool: + """Returns True if the array is read-only.""" + return self.store_path.read_only + + @property + def path(self) -> str: + """Storage path.""" + return self.store_path.path + + @property + def name(self) -> str: + """Array name following h5py convention.""" + name = self.path + if not name.startswith("/"): + name = "/" + name + return name + + @property + def basename(self) -> str: + """Final component of name.""" + return self.name.split("/")[-1] + + @property + def cdata_shape(self) -> tuple[int, ...]: + """The shape of the chunk grid for this array.""" + return self._chunk_grid_shape + + @property + def _chunk_grid_shape(self) -> tuple[int, ...]: + """The shape of the chunk grid for this array.""" + return tuple(starmap(ceildiv, zip(self.shape, self.chunks, strict=True))) + + @property + def _shard_grid_shape(self) -> tuple[int, ...]: + """The shape of the shard grid for this array.""" + if self.shards is None: + shard_shape = self.chunks + else: + shard_shape = self.shards + return tuple(starmap(ceildiv, zip(self.shape, shard_shape, strict=True))) + + @property + def nchunks(self) -> int: + """The number of chunks in this array.""" + return product(self._chunk_grid_shape) + + @property + def _nshards(self) -> int: + """The number of shards in this array.""" + return product(self._shard_grid_shape) + + @property + def nbytes(self) -> int: + """The total number of bytes that would be stored if all chunks were initialized.""" + return self.size * self.dtype.itemsize + + @property + def info(self) -> ArrayInfo: + """Return the statically known information for an array.""" + return self._info() + + def _info( + self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None + ) -> ArrayInfo: + return ArrayInfo( + _zarr_format=self.metadata.zarr_format, + _data_type=self._zdtype, + _fill_value=self.metadata.fill_value, + _shape=self.shape, + _order=self.order, + _shard_shape=self.shards, + _chunk_shape=self.chunks, + _read_only=self.read_only, + _compressors=self.compressors, + _filters=self.filters, + _serializer=self.serializer, + _store_type=type(self.store_path.store).__name__, + _count_bytes=self.nbytes, + _count_bytes_stored=count_bytes_stored, + _count_chunks_initialized=count_chunks_initialized, + ) + + # ------------------------------------------------------------------------- + # Iteration methods (synchronous) + # ------------------------------------------------------------------------- + + def _iter_chunk_coords( + self, *, 
origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[tuple[int, ...]]: + """Iterate over chunk coordinates in chunk grid space.""" + return _iter_chunk_coords(array=self, origin=origin, selection_shape=selection_shape) + + def _iter_shard_coords( + self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[tuple[int, ...]]: + """Iterate over shard coordinates in shard grid space.""" + return _iter_shard_coords(array=self, origin=origin, selection_shape=selection_shape) + + def _iter_shard_keys( + self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[str]: + """Iterate over the keys of stored objects supporting this array.""" + return _iter_shard_keys(array=self, origin=origin, selection_shape=selection_shape) + + def _iter_chunk_regions( + self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[tuple[slice, ...]]: + """Iterate over chunk regions in array index space.""" + return _iter_chunk_regions(array=self, origin=origin, selection_shape=selection_shape) + + def _iter_shard_regions( + self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[tuple[slice, ...]]: + """Iterate over shard regions in array index space.""" + return _iter_shard_regions(array=self, origin=origin, selection_shape=selection_shape) + + # ------------------------------------------------------------------------- + # nchunks_initialized: async and sync + # ------------------------------------------------------------------------- + + async def nchunks_initialized_async(self) -> int: + """ + Asynchronously calculate the number of chunks that have been initialized. + + Returns + ------- + int + The number of chunks that have been initialized. + """ + return await _nchunks_initialized(self) + + def nchunks_initialized(self) -> int: + """ + Calculate the number of chunks that have been initialized. + + Returns + ------- + int + The number of chunks that have been initialized. + """ + return sync(self.nchunks_initialized_async()) + + # ------------------------------------------------------------------------- + # _nshards_initialized: async and sync + # ------------------------------------------------------------------------- + + async def _nshards_initialized_async(self) -> int: + """ + Asynchronously calculate the number of shards that have been initialized. + + Returns + ------- + int + The number of shards that have been initialized. + """ + return await _nshards_initialized(self) + + def _nshards_initialized(self) -> int: + """ + Calculate the number of shards that have been initialized. + + Returns + ------- + int + The number of shards that have been initialized. + """ + return sync(self._nshards_initialized_async()) + + # ------------------------------------------------------------------------- + # nbytes_stored: async and sync + # ------------------------------------------------------------------------- + + async def nbytes_stored_async(self) -> int: + """ + Asynchronously calculate the number of bytes stored for this array. + + Returns + ------- + int + The number of bytes stored. + """ + return await _nbytes_stored(self.store_path) + + def nbytes_stored(self) -> int: + """ + Calculate the number of bytes stored for this array. + + Returns + ------- + int + The number of bytes stored. 
+ """ + return sync(self.nbytes_stored_async()) + + # ------------------------------------------------------------------------- + # getitem: async and sync + # ------------------------------------------------------------------------- + + async def getitem_async( + self, + selection: BasicSelection, + *, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Asynchronously retrieve a subset of the array's data based on the provided selection. + + Parameters + ---------- + selection : BasicSelection + A selection object specifying the subset of data to retrieve. + prototype : BufferPrototype, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The retrieved subset of the array's data. + """ + return await _getitem( + self.store_path, + self.metadata, + self.codec_pipeline, + self.config, + selection, + prototype=prototype, + ) + + def getitem( + self, + selection: BasicSelection, + *, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Retrieve a subset of the array's data based on the provided selection. + + Parameters + ---------- + selection : BasicSelection + A selection object specifying the subset of data to retrieve. + prototype : BufferPrototype, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The retrieved subset of the array's data. + """ + return sync(self.getitem_async(selection, prototype=prototype)) + + def __getitem__(self, selection: BasicSelection) -> NDArrayLikeOrScalar: + """Retrieve data using indexing syntax.""" + return self.getitem(selection) + + # ------------------------------------------------------------------------- + # setitem: async and sync + # ------------------------------------------------------------------------- + + async def setitem_async( + self, + selection: BasicSelection, + value: npt.ArrayLike, + prototype: BufferPrototype | None = None, + ) -> None: + """ + Asynchronously set values in the array using basic indexing. + + Parameters + ---------- + selection : BasicSelection + The selection defining the region of the array to set. + value : npt.ArrayLike + The values to be written into the selected region. + prototype : BufferPrototype, optional + A buffer prototype to use. + """ + return await _setitem( + self.store_path, + self.metadata, + self.codec_pipeline, + self.config, + selection, + value, + prototype=prototype, + ) + + def setitem( + self, + selection: BasicSelection, + value: npt.ArrayLike, + prototype: BufferPrototype | None = None, + ) -> None: + """ + Set values in the array using basic indexing. + + Parameters + ---------- + selection : BasicSelection + The selection defining the region of the array to set. + value : npt.ArrayLike + The values to be written into the selected region. + prototype : BufferPrototype, optional + A buffer prototype to use. 
+ """ + sync(self.setitem_async(selection, value, prototype=prototype)) + + def __setitem__(self, selection: BasicSelection, value: npt.ArrayLike) -> None: + """Set data using indexing syntax.""" + self.setitem(selection, value) + + # ------------------------------------------------------------------------- + # get_orthogonal_selection: async and sync + # ------------------------------------------------------------------------- + + async def get_orthogonal_selection_async( + self, + selection: OrthogonalSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Asynchronously get an orthogonal selection from the array. + + Parameters + ---------- + selection : OrthogonalSelection + The orthogonal selection specification. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + return await _get_orthogonal_selection( + self.store_path, + self.metadata, + self.codec_pipeline, + self.config, + selection, + out=out, + fields=fields, + prototype=prototype, + ) + + def get_orthogonal_selection( + self, + selection: OrthogonalSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Get an orthogonal selection from the array. + + Parameters + ---------- + selection : OrthogonalSelection + The orthogonal selection specification. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + return sync( + self.get_orthogonal_selection_async( + selection, out=out, fields=fields, prototype=prototype + ) + ) + + # ------------------------------------------------------------------------- + # get_mask_selection: async and sync + # ------------------------------------------------------------------------- + + async def get_mask_selection_async( + self, + mask: MaskSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Asynchronously get a mask selection from the array. + + Parameters + ---------- + mask : MaskSelection + The boolean mask specifying the selection. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + return await _get_mask_selection( + self.store_path, + self.metadata, + self.codec_pipeline, + self.config, + mask, + out=out, + fields=fields, + prototype=prototype, + ) + + def get_mask_selection( + self, + mask: MaskSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Get a mask selection from the array. + + Parameters + ---------- + mask : MaskSelection + The boolean mask specifying the selection. 
+ out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + return sync( + self.get_mask_selection_async(mask, out=out, fields=fields, prototype=prototype) + ) + + # ------------------------------------------------------------------------- + # get_coordinate_selection: async and sync + # ------------------------------------------------------------------------- + + async def get_coordinate_selection_async( + self, + selection: CoordinateSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Asynchronously get a coordinate selection from the array. + + Parameters + ---------- + selection : CoordinateSelection + The coordinate selection specification. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + return await _get_coordinate_selection( + self.store_path, + self.metadata, + self.codec_pipeline, + self.config, + selection, + out=out, + fields=fields, + prototype=prototype, + ) + + def get_coordinate_selection( + self, + selection: CoordinateSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Get a coordinate selection from the array. + + Parameters + ---------- + selection : CoordinateSelection + The coordinate selection specification. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + return sync( + self.get_coordinate_selection_async( + selection, out=out, fields=fields, prototype=prototype + ) + ) + + # ------------------------------------------------------------------------- + # resize: async and sync + # ------------------------------------------------------------------------- + + async def resize_async(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None: + """ + Asynchronously resize the array to a new shape. + + Parameters + ---------- + new_shape : ShapeLike + The desired new shape of the array. + delete_outside_chunks : bool, optional + If True (default), chunks that fall outside the new shape will be deleted. + """ + return await _resize(self, new_shape, delete_outside_chunks) + + def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None: + """ + Resize the array to a new shape. + + Parameters + ---------- + new_shape : ShapeLike + The desired new shape of the array. + delete_outside_chunks : bool, optional + If True (default), chunks that fall outside the new shape will be deleted. 
+ """ + sync(self.resize_async(new_shape, delete_outside_chunks)) + + # ------------------------------------------------------------------------- + # append: async and sync + # ------------------------------------------------------------------------- + + async def append_async(self, data: npt.ArrayLike, axis: int = 0) -> tuple[int, ...]: + """ + Asynchronously append data to the array along the specified axis. + + Parameters + ---------- + data : npt.ArrayLike + Data to be appended. + axis : int + Axis along which to append. + + Returns + ------- + tuple[int, ...] + The new shape of the array after appending. + """ + return await _append(self, data, axis) + + def append(self, data: npt.ArrayLike, axis: int = 0) -> tuple[int, ...]: + """ + Append data to the array along the specified axis. + + Parameters + ---------- + data : npt.ArrayLike + Data to be appended. + axis : int + Axis along which to append. + + Returns + ------- + tuple[int, ...] + The new shape of the array after appending. + """ + return sync(self.append_async(data, axis)) + + # ------------------------------------------------------------------------- + # update_attributes: async and sync + # ------------------------------------------------------------------------- + + async def update_attributes_async(self, new_attributes: dict[str, JSON]) -> Self: + """ + Asynchronously update the array's attributes. + + Parameters + ---------- + new_attributes : dict[str, JSON] + A dictionary of new attributes to update or add. + + Returns + ------- + Array + The array with the updated attributes. + """ + await _update_attributes(self, new_attributes) + return self + + def update_attributes(self, new_attributes: dict[str, JSON]) -> Self: + """ + Update the array's attributes. + + Parameters + ---------- + new_attributes : dict[str, JSON] + A dictionary of new attributes to update or add. + + Returns + ------- + Array + The array with the updated attributes. + """ + return sync(self.update_attributes_async(new_attributes)) + + # ------------------------------------------------------------------------- + # info_complete: async and sync + # ------------------------------------------------------------------------- + + async def info_complete_async(self) -> ArrayInfo: + """ + Asynchronously return all the information for an array, including dynamic information. + + Returns + ------- + ArrayInfo + Complete information about the array including chunks initialized and bytes stored. + """ + return await _info_complete(self) + + def info_complete(self) -> ArrayInfo: + """ + Return all the information for an array, including dynamic information. + + Returns + ------- + ArrayInfo + Complete information about the array including chunks initialized and bytes stored. + """ + return sync(self.info_complete_async()) + + # ------------------------------------------------------------------------- + # __repr__ + # ------------------------------------------------------------------------- + + def __repr__(self) -> str: + return f"" diff --git a/src/zarr/experimental/lazy_indexing.py b/src/zarr/experimental/lazy_indexing.py new file mode 100644 index 0000000000..434fb87420 --- /dev/null +++ b/src/zarr/experimental/lazy_indexing.py @@ -0,0 +1,2578 @@ +"""An experimental array that supports lazy indexing by explicitly tracking the +domain of the array. + +This module implements TensorStore-inspired lazy indexing for Zarr arrays. 
+Key concepts: + +- **IndexDomain**: Represents a rectangular region in index space with inclusive + lower bounds and exclusive upper bounds. Unlike NumPy, domains preserve non-zero + origins when slicing. + +- **Lazy Indexing**: When you index an Array, instead of loading data, you get + a new Array with a narrowed domain. Data is only loaded when you call `resolve()`. + +- **Non-zero Origins**: Arrays can have domains that don't start at zero. + For example, an array with domain [10, 20) has indices 10, 11, ..., 19. +""" + +from __future__ import annotations + +from dataclasses import dataclass +from itertools import starmap +from typing import TYPE_CHECKING, Any + +import numpy as np + +from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec +from zarr.abc.numcodec import Numcodec +from zarr.core._info import ArrayInfo +from zarr.core.array import ( + _append, + _get_coordinate_selection, + _get_mask_selection, + _get_orthogonal_selection, + _getitem, + _info_complete, + _iter_chunk_coords, + _iter_chunk_regions, + _iter_shard_coords, + _iter_shard_keys, + _iter_shard_regions, + _nbytes_stored, + _nchunks_initialized, + _nshards_initialized, + _resize, + _setitem, + _update_attributes, + create_codec_pipeline, + get_array_metadata, + parse_array_metadata, +) +from zarr.core.array_spec import ArrayConfig, ArrayConfigLike, parse_array_config +from zarr.core.buffer import ( + BufferPrototype, + NDArrayLikeOrScalar, + NDBuffer, +) +from zarr.core.common import ( + JSON, + MemoryOrder, + ShapeLike, + ZarrFormat, + ceildiv, + product, +) +from zarr.core.indexing import ( + BasicSelection, + CoordinateSelection, + Fields, + MaskSelection, + OrthogonalSelection, +) +from zarr.core.metadata import ( + ArrayMetadata, + ArrayMetadataDict, + ArrayV2Metadata, + ArrayV3Metadata, +) +from zarr.core.sync import sync +from zarr.storage._common import StorePath, make_store_path + +if TYPE_CHECKING: + from collections.abc import Iterator, Sequence + from typing import Self + + import numpy.typing as npt + + from zarr.abc.codec import CodecPipeline + from zarr.abc.store import Store + from zarr.storage import StoreLike + + +@dataclass(frozen=True) +class IndexDomain: + """ + Represents a rectangular region in index space. + + An IndexDomain defines the valid indices for an array, with inclusive lower + bounds and exclusive upper bounds for each dimension. This is inspired by + TensorStore's IndexDomain concept. + + Unlike NumPy arrays which always have origins at zero, IndexDomain supports + non-zero origins. For example, after slicing arr[5:10], the resulting array + has a domain with origin 5 and shape 5, meaning valid indices are 5, 6, 7, 8, 9. + + Parameters + ---------- + inclusive_min : tuple[int, ...] + The inclusive lower bounds for each dimension (the first valid index). + exclusive_max : tuple[int, ...] + The exclusive upper bounds for each dimension (one past the last valid index). + + Examples + -------- + >>> domain = IndexDomain(inclusive_min=(0, 0), exclusive_max=(10, 20)) + >>> domain.shape + (10, 20) + >>> domain.origin + (0, 0) + + >>> # After slicing [5:8, 10:15] + >>> sliced = IndexDomain(inclusive_min=(5, 10), exclusive_max=(8, 15)) + >>> sliced.shape + (3, 5) + >>> sliced.origin + (5, 10) + """ + + inclusive_min: tuple[int, ...] + exclusive_max: tuple[int, ...] + + def __post_init__(self) -> None: + if len(self.inclusive_min) != len(self.exclusive_max): + raise ValueError( + f"inclusive_min and exclusive_max must have the same length. 
" + f"Got {len(self.inclusive_min)} and {len(self.exclusive_max)}." + ) + for i, (lo, hi) in enumerate(zip(self.inclusive_min, self.exclusive_max, strict=True)): + if lo > hi: + raise ValueError( + f"inclusive_min must be <= exclusive_max for all dimensions. " + f"Dimension {i}: {lo} > {hi}" + ) + + @classmethod + def from_shape(cls, shape: tuple[int, ...]) -> IndexDomain: + """Create a domain with origin at zero and the given shape.""" + return cls( + inclusive_min=(0,) * len(shape), + exclusive_max=shape, + ) + + @property + def ndim(self) -> int: + """Number of dimensions.""" + return len(self.inclusive_min) + + @property + def origin(self) -> tuple[int, ...]: + """The origin (inclusive lower bounds) of the domain.""" + return self.inclusive_min + + @property + def shape(self) -> tuple[int, ...]: + """The shape of the domain (exclusive_max - inclusive_min).""" + return tuple(hi - lo for lo, hi in zip(self.inclusive_min, self.exclusive_max, strict=True)) + + def contains(self, index: tuple[int, ...]) -> bool: + """Check if an index is within this domain.""" + if len(index) != self.ndim: + return False + return all( + lo <= idx < hi + for lo, hi, idx in zip(self.inclusive_min, self.exclusive_max, index, strict=True) + ) + + def contains_domain(self, other: IndexDomain) -> bool: + """Check if another domain is entirely contained within this domain.""" + if other.ndim != self.ndim: + return False + return all( + self_lo <= other_lo and other_hi <= self_hi + for self_lo, self_hi, other_lo, other_hi in zip( + self.inclusive_min, + self.exclusive_max, + other.inclusive_min, + other.exclusive_max, + strict=True, + ) + ) + + def __repr__(self) -> str: + ranges = ", ".join( + f"[{lo}, {hi})" for lo, hi in zip(self.inclusive_min, self.exclusive_max, strict=True) + ) + return f"IndexDomain({ranges})" + + def intersect(self, other: IndexDomain) -> IndexDomain | None: + """ + Compute the intersection of this domain with another. + + Returns None if the domains do not overlap. + + Parameters + ---------- + other : IndexDomain + The other domain to intersect with. + + Returns + ------- + IndexDomain | None + The intersection domain, or None if they don't overlap. + """ + if other.ndim != self.ndim: + raise ValueError( + f"Cannot intersect domains with different ranks: {self.ndim} vs {other.ndim}" + ) + new_min = tuple( + max(a, b) for a, b in zip(self.inclusive_min, other.inclusive_min, strict=True) + ) + new_max = tuple( + min(a, b) for a, b in zip(self.exclusive_max, other.exclusive_max, strict=True) + ) + # Check if intersection is empty + if any(lo >= hi for lo, hi in zip(new_min, new_max, strict=True)): + return None + return IndexDomain(inclusive_min=new_min, exclusive_max=new_max) + + def translate(self, offset: tuple[int, ...]) -> IndexDomain: + """ + Translate (shift) this domain by an offset. + + Returns a new IndexDomain with bounds shifted by the given offset. + This is useful for converting between coordinate systems. + + Parameters + ---------- + offset : tuple[int, ...] + The offset to add to each dimension's bounds. Positive values + shift the domain in the positive direction. + + Returns + ------- + IndexDomain + A new domain with translated bounds. 
+ + Examples + -------- + >>> domain = IndexDomain(inclusive_min=(10, 20), exclusive_max=(30, 40)) + >>> domain.translate((-10, -20)) + IndexDomain([0, 10), [0, 20)) + + >>> # Useful for converting domain coordinates to output coordinates + >>> intersection = domain.intersect(other_domain) + >>> output_domain = intersection.translate((-domain.inclusive_min[0], ...)) + """ + if len(offset) != self.ndim: + raise ValueError( + f"Offset must have same length as domain dimensions. " + f"Domain has {self.ndim} dimensions, offset has {len(offset)}." + ) + new_min = tuple(lo + off for lo, off in zip(self.inclusive_min, offset, strict=True)) + new_max = tuple(hi + off for hi, off in zip(self.exclusive_max, offset, strict=True)) + return IndexDomain(inclusive_min=new_min, exclusive_max=new_max) + + +@dataclass(frozen=True) +class ChunkLayout: + """ + Describes the chunk grid for an array. + + A ChunkLayout defines how an array is partitioned into chunks. It consists of: + - grid_origin: The coordinate where the chunk grid starts (where chunk (0,0,...) begins) + - chunk_shape: The size of each chunk + + Key insight: Each chunk is conceptually a sub-array with its own domain. The chunk + at coordinates (i, j, ...) has domain: + [grid_origin[d] + i * chunk_shape[d], grid_origin[d] + (i+1) * chunk_shape[d]) + for each dimension d. + + This means a chunked array can be thought of as a concatenation of chunk sub-arrays, + each with its own domain. + + Parameters + ---------- + grid_origin : tuple[int, ...] + The origin of the chunk grid (where chunk boundaries start). + chunk_shape : tuple[int, ...] + The shape of each chunk. + + Examples + -------- + >>> layout = ChunkLayout(grid_origin=(0, 0), chunk_shape=(10, 10)) + >>> layout.chunk_domain((0, 0)) + IndexDomain([0, 10), [0, 10)) + >>> layout.chunk_domain((1, 2)) + IndexDomain([10, 20), [20, 30)) + + >>> # With non-zero origin + >>> layout = ChunkLayout(grid_origin=(5, 5), chunk_shape=(10, 10)) + >>> layout.chunk_domain((0, 0)) + IndexDomain([5, 15), [5, 15)) + >>> layout.is_aligned((5, 5)) + True + >>> layout.is_aligned((7, 5)) + False + """ + + grid_origin: tuple[int, ...] + chunk_shape: tuple[int, ...] + + def __post_init__(self) -> None: + if len(self.grid_origin) != len(self.chunk_shape): + raise ValueError( + f"grid_origin and chunk_shape must have the same length. " + f"Got {len(self.grid_origin)} and {len(self.chunk_shape)}." + ) + if any(s <= 0 for s in self.chunk_shape): + raise ValueError( + f"chunk_shape must be positive in all dimensions. Got {self.chunk_shape}" + ) + + @classmethod + def from_chunk_shape(cls, chunk_shape: tuple[int, ...]) -> ChunkLayout: + """Create a ChunkLayout with grid origin at zero.""" + return cls(grid_origin=(0,) * len(chunk_shape), chunk_shape=chunk_shape) + + @property + def ndim(self) -> int: + """Number of dimensions.""" + return len(self.grid_origin) + + def is_aligned(self, coord: tuple[int, ...]) -> bool: + """ + Check if a coordinate lies on a chunk boundary. + + A coordinate is chunk-aligned if (coord[d] - grid_origin[d]) is divisible + by chunk_shape[d] for all dimensions d. + + Parameters + ---------- + coord : tuple[int, ...] + The coordinate to check. + + Returns + ------- + bool + True if the coordinate is on a chunk boundary in all dimensions. 
+ """ + if len(coord) != self.ndim: + raise ValueError(f"Expected {self.ndim} dimensions, got {len(coord)}") + return all( + (c - o) % s == 0 + for c, o, s in zip(coord, self.grid_origin, self.chunk_shape, strict=True) + ) + + def chunk_coords_for_point(self, point: tuple[int, ...]) -> tuple[int, ...]: + """ + Get the chunk coordinates containing a given point. + + Parameters + ---------- + point : tuple[int, ...] + A point in the array's coordinate space. + + Returns + ------- + tuple[int, ...] + The chunk coordinates (i, j, ...) of the chunk containing this point. + """ + if len(point) != self.ndim: + raise ValueError(f"Expected {self.ndim} dimensions, got {len(point)}") + # Use floor division to handle negative coordinates correctly + return tuple( + (p - o) // s if (p - o) >= 0 else -ceildiv(o - p, s) + for p, o, s in zip(point, self.grid_origin, self.chunk_shape, strict=True) + ) + + def chunk_domain(self, chunk_coords: tuple[int, ...]) -> IndexDomain: + """ + Get the domain of a specific chunk. + + Each chunk is a sub-array with its own domain. This returns that domain. + + Parameters + ---------- + chunk_coords : tuple[int, ...] + The chunk coordinates (e.g., (0, 0) for the first chunk). + + Returns + ------- + IndexDomain + The domain of the chunk. + """ + if len(chunk_coords) != self.ndim: + raise ValueError(f"Expected {self.ndim} dimensions, got {len(chunk_coords)}") + inclusive_min = tuple( + o + c * s + for o, c, s in zip(self.grid_origin, chunk_coords, self.chunk_shape, strict=True) + ) + exclusive_max = tuple( + o + (c + 1) * s + for o, c, s in zip(self.grid_origin, chunk_coords, self.chunk_shape, strict=True) + ) + return IndexDomain(inclusive_min=inclusive_min, exclusive_max=exclusive_max) + + def iter_chunk_coords(self, domain: IndexDomain) -> Iterator[tuple[int, ...]]: + """ + Iterate over all chunk coordinates that overlap with a domain. + + Parameters + ---------- + domain : IndexDomain + The domain to find overlapping chunks for. + + Yields + ------ + tuple[int, ...] + Chunk coordinates for each chunk that overlaps with the domain. + """ + if domain.ndim != self.ndim: + raise ValueError(f"Domain has {domain.ndim} dims, layout has {self.ndim} dims") + + # Find the range of chunk coordinates + start_coords = self.chunk_coords_for_point(domain.inclusive_min) + # For exclusive_max, we need the chunk containing (exclusive_max - 1) + # But if exclusive_max is on a boundary, the last chunk is the previous one + end_coords = tuple( + self.chunk_coords_for_point(tuple(m - 1 for m in domain.exclusive_max))[d] + 1 + for d in range(self.ndim) + ) + + def iter_coords( + starts: tuple[int, ...], ends: tuple[int, ...] + ) -> Iterator[tuple[int, ...]]: + if not starts: + yield () + return + for c in range(starts[0], ends[0]): + for rest in iter_coords(starts[1:], ends[1:]): + yield (c,) + rest + + yield from iter_coords(start_coords, end_coords) + + def iter_chunk_domains( + self, domain: IndexDomain + ) -> Iterator[tuple[tuple[int, ...], IndexDomain]]: + """ + Iterate over chunks that overlap with a domain, yielding their domains. + + This embodies the insight that chunks are sub-arrays with their own domains. + For each chunk overlapping the given domain, yields the chunk coordinates + and the intersection of the chunk's domain with the given domain. + + Parameters + ---------- + domain : IndexDomain + The domain to find overlapping chunks for. + + Yields + ------ + tuple[tuple[int, ...], IndexDomain] + Pairs of (chunk_coords, chunk_intersection_domain). 
+ """ + for chunk_coords in self.iter_chunk_coords(domain): + chunk_dom = self.chunk_domain(chunk_coords) + intersection = chunk_dom.intersect(domain) + if intersection is not None: + yield chunk_coords, intersection + + def aligned_domain(self, domain: IndexDomain) -> IndexDomain: + """ + Return the largest chunk-aligned subdomain contained within the given domain. + + This rounds the lower bounds up and upper bounds down to chunk boundaries. + + Parameters + ---------- + domain : IndexDomain + The domain to align. + + Returns + ------- + IndexDomain + The largest aligned subdomain. May have zero size in some dimensions + if the domain is smaller than a chunk. + """ + if domain.ndim != self.ndim: + raise ValueError(f"Domain has {domain.ndim} dims, layout has {self.ndim} dims") + + # Round lower bounds UP to next chunk boundary + aligned_min = tuple( + o + ceildiv(lo - o, s) * s + for lo, o, s in zip( + domain.inclusive_min, self.grid_origin, self.chunk_shape, strict=True + ) + ) + # Round upper bounds DOWN to previous chunk boundary + aligned_max = tuple( + o + ((hi - o) // s) * s + for hi, o, s in zip( + domain.exclusive_max, self.grid_origin, self.chunk_shape, strict=True + ) + ) + # Ensure we don't create an invalid domain (max < min) + aligned_max = tuple(max(lo, hi) for lo, hi in zip(aligned_min, aligned_max, strict=True)) + return IndexDomain(inclusive_min=aligned_min, exclusive_max=aligned_max) + + def __repr__(self) -> str: + return f"ChunkLayout(grid_origin={self.grid_origin}, chunk_shape={self.chunk_shape})" + + +@dataclass(frozen=True) +class StorageSource: + """ + A source backed by Zarr storage. + + This encapsulates all the information needed to read data from a Zarr array + stored on disk or in memory. It includes the store path, metadata, codec + pipeline, and the index transform that maps domain coordinates to storage + coordinates. + + Parameters + ---------- + store_path : StorePath + The path to the Zarr store. + metadata : ArrayV2Metadata | ArrayV3Metadata + The metadata of the array. + codec_pipeline : CodecPipeline + The codec pipeline used for encoding and decoding chunks. + config : ArrayConfig + The runtime configuration of the array. + index_transform : tuple[int, ...] + The offset to subtract from domain coordinates to get storage coordinates. + """ + + store_path: StorePath + metadata: ArrayV2Metadata | ArrayV3Metadata + codec_pipeline: Any # CodecPipeline - avoid forward reference issues + config: ArrayConfig + index_transform: tuple[int, ...] + + @property + def storage_shape(self) -> tuple[int, ...]: + """The shape of the underlying storage.""" + return self.metadata.shape + + @property + def chunks(self) -> tuple[int, ...]: + """The chunk shape.""" + return self.metadata.chunks + + @property + def dtype(self) -> np.dtype[Any]: + """The data type.""" + return ( + self.metadata.data_type.to_native_dtype() + if hasattr(self.metadata, "data_type") + else self.metadata.dtype.to_native_dtype() + ) + + @property + def fill_value(self) -> Any: + """The fill value.""" + return self.metadata.fill_value + + +def _get_storage_identity(arr: Array) -> tuple[Any, tuple[int, ...]] | None: + """ + Get the storage identity for an Array if it's backed by a single storage source. + + Returns (store_path, index_transform) if the array has a single StorageSource, + or None if it has multiple sources or Array sources. 
+ """ + if len(arr._sources) == 1 and isinstance(arr._sources[0], StorageSource): + source = arr._sources[0] + return (source.store_path, source.index_transform) + return None + + +def _try_merge_to_single_source( + arrays: list[Array], + domain: IndexDomain, +) -> StorageSource | None: + """ + Try to merge multiple Arrays into a single StorageSource. + + This succeeds when all input arrays: + 1. Are backed by the same storage (same store_path) + 2. Have the same index_transform (same coordinate mapping) + 3. Their combined domains fully cover the target domain (no gaps) + + In this case, we can represent the concatenation as a single StorageSource, + since the storage already contains all the data we need. + + Returns the merged StorageSource, or None if merging isn't possible. + """ + if not arrays: + return None + + # Check if all arrays share the same storage identity + first_identity = _get_storage_identity(arrays[0]) + if first_identity is None: + return None + + for arr in arrays[1:]: + identity = _get_storage_identity(arr) + if identity != first_identity: + return None + + # All arrays share the same storage identity. + # Now check if the source domains fully cover the target domain. + # We need to verify there are no gaps that would require fill_value. + + # For simplicity, check if the union of input domains equals the target domain. + # This is a conservative check - we only merge when domains are exactly covering. + + # Compute the bounding box of all input domains + ndim = domain.ndim + input_min = tuple(min(arr.domain.inclusive_min[d] for arr in arrays) for d in range(ndim)) + input_max = tuple(max(arr.domain.exclusive_max[d] for arr in arrays) for d in range(ndim)) + input_bbox = IndexDomain(inclusive_min=input_min, exclusive_max=input_max) + + # For the merge to be valid without gaps, we need to check that the input arrays + # completely cover their bounding box. This is complex to check in general. + # + # A simple conservative approach: only merge if there's a single input array + # or if the input arrays' total volume equals the bounding box volume. + # This works for non-overlapping, gap-free cases like arr[:10] and arr[10:]. + + total_input_volume = sum( + int( + np.prod( + [arr.domain.exclusive_max[d] - arr.domain.inclusive_min[d] for d in range(ndim)] + ) + ) + for arr in arrays + ) + bbox_volume = int( + np.prod([input_bbox.exclusive_max[d] - input_bbox.inclusive_min[d] for d in range(ndim)]) + ) + + # If total volume < bbox volume, there are gaps -> can't merge + # If total volume > bbox volume, there are overlaps -> we can still merge + # (overlaps are fine, we just read from one source) + # If total volume == bbox volume, perfect coverage -> can merge + if total_input_volume < bbox_volume: + return None + + # All arrays share the same storage and fully cover the domain + first_source = arrays[0]._sources[0] + assert isinstance(first_source, StorageSource) + return first_source + + +@dataclass(frozen=True) +class ChunkCoordSlice: + """ + Identifies a slice within a specific chunk. + + Parameters + ---------- + chunk_coords : tuple[int, ...] + The coordinates of the chunk in the chunk grid. + selection : tuple[slice, ...] + The slice within the chunk to read (in chunk-local coordinates, starting at 0). + """ + + chunk_coords: tuple[int, ...] + selection: tuple[slice, ...] + + +def get_chunk_projections( + storage_shape: tuple[int, ...], + chunk_shape: tuple[int, ...], + domain: IndexDomain, + index_transform: tuple[int, ...] 
| None = None, +) -> Iterator[tuple[tuple[slice, ...], ChunkCoordSlice]]: + """ + Compute chunk projections for resolving data from a domain. + + This function maps domain coordinates to storage coordinates and determines + which chunks need to be read and how to assemble them into an output array. + + The mapping from domain to storage coordinates is: + storage_coord = domain_coord - index_transform + + Parameters + ---------- + storage_shape : tuple[int, ...] + The shape of the underlying storage array. + chunk_shape : tuple[int, ...] + The shape of each chunk. + domain : IndexDomain + The domain to resolve. + index_transform : tuple[int, ...] | None + The offset to subtract from domain coordinates to get storage coordinates. + If None, defaults to (0, 0, ...), meaning domain coordinates equal storage + coordinates. + + Yields + ------ + tuple[tuple[slice, ...], ChunkCoordSlice] + For each chunk that overlaps with the domain (after translation to storage + coordinates), yields a tuple of: + - output_selection: where to place the data in the output array (the key) + - chunk_info: ChunkCoordSlice with chunk coords and slice within chunk (the value) + + Examples + -------- + >>> # Storage is shape (100,) with chunks of size 10 + >>> # Domain is [25, 75), with default offset (0,) so storage coords are [25, 75) + >>> storage_shape = (100,) + >>> chunk_shape = (10,) + >>> domain = IndexDomain(inclusive_min=(25,), exclusive_max=(75,)) + >>> projs = list(get_chunk_projections(storage_shape, chunk_shape, domain)) + >>> projs[0] # First chunk: output_selection, chunk_info + ((slice(0, 5),), ChunkCoordSlice(chunk_coords=(2,), selection=(slice(5, 10),))) + + >>> # With index_transform=(10,), domain 10 maps to storage 0 + >>> domain = IndexDomain(inclusive_min=(10,), exclusive_max=(20,)) + >>> list(get_chunk_projections((10,), (5,), domain, index_transform=(10,))) + [((slice(0, 5),), ChunkCoordSlice(chunk_coords=(0,), selection=(slice(0, 5),))), + ((slice(5, 10),), ChunkCoordSlice(chunk_coords=(1,), selection=(slice(0, 5),)))] + """ + ndim = len(storage_shape) + if len(chunk_shape) != ndim or domain.ndim != ndim: + raise ValueError( + f"Dimension mismatch: storage_shape has {ndim} dims, " + f"chunk_shape has {len(chunk_shape)} dims, domain has {domain.ndim} dims" + ) + + if index_transform is None: + index_transform = (0,) * ndim + + if len(index_transform) != ndim: + raise ValueError(f"index_transform has {len(index_transform)} dims, expected {ndim}") + + # Translate domain to storage coordinates + # storage_coord = domain_coord - index_transform + neg_transform = tuple(-x for x in index_transform) + storage_domain = domain.translate(neg_transform) + + # Intersect with valid storage bounds [0, storage_dim) + # This gives us the range of storage coordinates we can actually read + storage_bounds = IndexDomain.from_shape(storage_shape) + valid_storage = storage_domain.intersect(storage_bounds) + + # Check if there's any valid intersection + if valid_storage is None: + return # No chunks to read + + # Compute the range of chunk coordinates that overlap with the valid storage region + chunk_start = tuple( + lo // c for lo, c in zip(valid_storage.inclusive_min, chunk_shape, strict=True) + ) + chunk_end = tuple( + ceildiv(hi, c) for hi, c in zip(valid_storage.exclusive_max, chunk_shape, strict=True) + ) + + # Iterate over all chunks in the range + def iter_chunk_coords_range( + starts: tuple[int, ...], ends: tuple[int, ...] 
+ ) -> Iterator[tuple[int, ...]]: + """Iterate over all chunk coordinates in the given range.""" + if not starts: + yield () + return + for coord in range(starts[0], ends[0]): + for rest in iter_chunk_coords_range(starts[1:], ends[1:]): + yield (coord,) + rest + + for chunk_coords in iter_chunk_coords_range(chunk_start, chunk_end): + # Compute the storage region covered by this chunk + chunk_storage_start = tuple(c * cs for c, cs in zip(chunk_coords, chunk_shape, strict=True)) + chunk_storage_end = tuple( + min((c + 1) * cs, dim) + for c, cs, dim in zip(chunk_coords, chunk_shape, storage_shape, strict=True) + ) + chunk_domain = IndexDomain( + inclusive_min=chunk_storage_start, exclusive_max=chunk_storage_end + ) + + # Intersect chunk region with valid storage region + intersection = chunk_domain.intersect(valid_storage) + + # Skip if no intersection (shouldn't happen given our chunk range, but be safe) + if intersection is None: + continue + + # Compute chunk_selection: slice within the chunk (chunk-local coordinates) + chunk_local = intersection.translate(tuple(-x for x in chunk_storage_start)) + chunk_selection = tuple( + slice(chunk_local.inclusive_min[d], chunk_local.exclusive_max[d]) for d in range(ndim) + ) + + # Compute output_selection: where to place in output array + # Output array index 0 corresponds to domain.origin, which maps to storage_domain.origin + # So output_index = storage_index - storage_domain.origin + output_local = intersection.translate(tuple(-x for x in storage_domain.inclusive_min)) + output_selection = tuple( + slice(output_local.inclusive_min[d], output_local.exclusive_max[d]) for d in range(ndim) + ) + + yield ( + output_selection, + ChunkCoordSlice(chunk_coords=chunk_coords, selection=chunk_selection), + ) + + +class Array: + """ + A Zarr array class that supports lazy indexing with explicit domain tracking. + + This class extends standard Zarr array functionality with TensorStore-inspired + lazy indexing. When you index an array using `__getitem__`, instead of loading + data immediately, you get a new Array with a narrowed domain. Data is only + loaded when you explicitly call `resolve()`. + + An Array can be backed by: + - A single storage source (when opened from a store) + - Multiple sources (when created via merge) + - Other Arrays as sources (enabling nested composition) + + Key concepts: + - **Domain**: Each array has an IndexDomain that defines its valid index range. + The domain has an origin (inclusive lower bounds) and a shape. + - **Lazy Indexing**: `arr[5:10]` returns a new Array with domain [5, 10), not data. + - **Data Resolution**: Call `resolve()` to actually load the data as a numpy array. + - **Non-zero Origins**: Arrays can have domains that don't start at zero. + - **Merging**: `merge([a, b])` returns an Array combining multiple sources. + + Examples + -------- + >>> arr = Array.open("path/to/array") + >>> arr.domain + IndexDomain([0, 100)) + + >>> # Lazy indexing - returns a new Array, not data + >>> sliced = arr[20:30] + >>> sliced.domain + IndexDomain([20, 30)) + + >>> # Actually load the data + >>> data = sliced.resolve() + >>> data.shape + (10,) + + >>> # Merging returns the same type + >>> combined = merge([arr[0:30], arr[70:100]]) + >>> isinstance(combined, Array) + True + """ + + _domain: IndexDomain + _sources: tuple[StorageSource | Array, ...] 
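+    # dtype and fill_value are cached on the instance because multi-source
+    # arrays have no single metadata object to read them from.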
+ _dtype: np.dtype[Any] + _fill_value: Any + + # For storage-backed arrays, keep references to these for compatibility + # These are None for multi-source arrays + _metadata: ArrayV2Metadata | ArrayV3Metadata | None + _store_path: StorePath | None + _codec_pipeline: Any | None # CodecPipeline + _config: ArrayConfig | None + + def __init__( + self, + store_path: StorePath, + metadata: ArrayMetadata | ArrayMetadataDict, + *, + domain: IndexDomain | None = None, + index_transform: tuple[int, ...] | None = None, + codec_pipeline: CodecPipeline | None = None, + config: ArrayConfigLike | None = None, + ) -> None: + """Create an Array from storage (single source).""" + metadata_parsed = parse_array_metadata(metadata) + config_parsed = parse_array_config(config) + + if codec_pipeline is None: + codec_pipeline = create_codec_pipeline(metadata=metadata_parsed, store=store_path.store) + + # Default domain is origin at zero with shape from metadata + if domain is None: + domain = IndexDomain.from_shape(metadata_parsed.shape) + + # Default storage transform offset is zero (domain coords = storage coords) + if index_transform is None: + index_transform = (0,) * domain.ndim + + # Create a single storage source + source = StorageSource( + store_path=store_path, + metadata=metadata_parsed, + codec_pipeline=codec_pipeline, + config=config_parsed, + index_transform=index_transform, + ) + + self._domain = domain + self._sources = (source,) + self._dtype = ( + metadata_parsed.data_type.to_native_dtype() + if hasattr(metadata_parsed, "data_type") + else metadata_parsed.dtype.to_native_dtype() + ) + self._fill_value = metadata_parsed.fill_value + + # Keep references for backward compatibility + self._metadata = metadata_parsed + self._store_path = store_path + self._codec_pipeline = codec_pipeline + self._config = config_parsed + + @classmethod + def _from_sources( + cls, + sources: Sequence[StorageSource | Array], + *, + domain: IndexDomain, + dtype: np.dtype[Any], + fill_value: Any, + ) -> Array: + """Create an Array from multiple sources (internal constructor).""" + arr = object.__new__(cls) + arr._domain = domain + arr._sources = tuple(sources) + arr._dtype = dtype + arr._fill_value = fill_value + + # For single StorageSource, preserve the storage references for compatibility + if len(sources) == 1 and isinstance(sources[0], StorageSource): + source = sources[0] + arr._metadata = source.metadata + arr._store_path = source.store_path + arr._codec_pipeline = source.codec_pipeline + arr._config = source.config + else: + # Multi-source arrays don't have single storage references + arr._metadata = None + arr._store_path = None + arr._codec_pipeline = None + arr._config = None + + return arr + + # ------------------------------------------------------------------------- + # Class methods: open + # ------------------------------------------------------------------------- + + @classmethod + async def open_async( + cls, + store: StoreLike, + *, + domain: IndexDomain | None = None, + config: ArrayConfigLike | None = None, + codec_pipeline: CodecPipeline | None = None, + zarr_format: ZarrFormat | None = 3, + ) -> Array: + """ + Async method to open an existing Zarr array from a given store. + + Parameters + ---------- + store : StoreLike + The store containing the Zarr array. + domain : IndexDomain | None, optional + The domain for this array view. If None, defaults to a domain with + origin at zero and shape from the array metadata. + zarr_format : ZarrFormat | None, optional + The Zarr format version (default is 3). 
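+        config : ArrayConfigLike | None, optional
+            Runtime configuration for the array.
+        codec_pipeline : CodecPipeline | None, optional
+            The codec pipeline to use for decoding chunks. If None, one is
+            created from the array metadata.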
+ + Returns + ------- + Array + The opened Zarr array. + """ + store_path = await make_store_path(store) + metadata_dict = await get_array_metadata(store_path, zarr_format=zarr_format) + return cls( + store_path=store_path, + metadata=metadata_dict, + domain=domain, + codec_pipeline=codec_pipeline, + config=config, + ) + + @classmethod + def open( + cls, + store: StoreLike, + *, + domain: IndexDomain | None = None, + config: ArrayConfigLike | None = None, + codec_pipeline: CodecPipeline | None = None, + zarr_format: ZarrFormat | None = 3, + ) -> Array: + """ + Open an existing Zarr array from a given store. + + Parameters + ---------- + store : StoreLike + The store containing the Zarr array. + domain : IndexDomain | None, optional + The domain for this array view. If None, defaults to a domain with + origin at zero and shape from the array metadata. + zarr_format : ZarrFormat | None, optional + The Zarr format version (default is 3). + + Returns + ------- + Array + The opened Zarr array. + """ + return sync( + cls.open_async( + store, + config=config, + codec_pipeline=codec_pipeline, + domain=domain, + zarr_format=zarr_format, + ) + ) + + # ------------------------------------------------------------------------- + # Properties (all synchronous, derived from internal state) + # ------------------------------------------------------------------------- + + @property + def domain(self) -> IndexDomain: + """The domain defining valid indices for this array view.""" + return self._domain + + @property + def sources(self) -> tuple[StorageSource | Array, ...]: + """The sources backing this array.""" + return self._sources + + @property + def store(self) -> Store | None: + """The store containing the array data, or None for multi-source arrays.""" + if self._store_path is not None: + return self._store_path.store + return None + + @property + def store_path(self) -> StorePath | None: + """The store path, or None for multi-source arrays.""" + return self._store_path + + @property + def metadata(self) -> ArrayV2Metadata | ArrayV3Metadata | None: + """The metadata, or None for multi-source arrays.""" + return self._metadata + + @property + def codec_pipeline(self) -> Any | None: + """The codec pipeline, or None for multi-source arrays.""" + return self._codec_pipeline + + @property + def config(self) -> ArrayConfig | None: + """The config, or None for multi-source arrays.""" + return self._config + + @property + def origin(self) -> tuple[int, ...]: + """The origin (inclusive lower bounds) of this array's domain.""" + return self._domain.origin + + @property + def ndim(self) -> int: + """Returns the number of dimensions in the Array.""" + return self._domain.ndim + + @property + def shape(self) -> tuple[int, ...]: + """Returns the shape of the Array (from the domain, not metadata).""" + return self._domain.shape + + @property + def dtype(self) -> np.dtype[Any]: + """Returns the data type of the array.""" + return self._dtype + + @property + def fill_value(self) -> Any: + """Returns the fill value of the array.""" + return self._fill_value + + @property + def index_transform(self) -> tuple[int, ...] | None: + """The index transform for single-source storage arrays, or None.""" + if len(self._sources) == 1 and isinstance(self._sources[0], StorageSource): + return self._sources[0].index_transform + return None + + @property + def chunks(self) -> tuple[int, ...] 
| None: + """Returns the chunk shape of the Array, or None for multi-source arrays.""" + if self._metadata is not None: + return self._metadata.chunks + return None + + @property + def chunk_layout(self) -> ChunkLayout | None: + """ + Returns the chunk layout describing how this array is partitioned. + + For multi-source arrays, returns None as there is no single chunk layout. + + The chunk layout defines the chunk grid in domain coordinates. It accounts + for the index_transform, so the grid_origin is expressed in the array's + current coordinate system. + + Each chunk can be thought of as a sub-array with its own domain. Use + `chunk_layout.chunk_domain(coords)` to get the domain of a specific chunk, + or `chunk_layout.iter_chunk_domains(domain)` to iterate over chunks + overlapping a region. + + Returns + ------- + ChunkLayout | None + The chunk layout with grid_origin in domain coordinates, or None. + + Examples + -------- + >>> arr = Array.open("path/to/array") # shape (100,), chunks (10,) + >>> arr.chunk_layout + ChunkLayout(grid_origin=(0,), chunk_shape=(10,)) + >>> arr.chunk_layout.is_aligned((0,)) + True + >>> arr.chunk_layout.is_aligned((5,)) + False + + >>> # After slicing, layout is in the sliced domain's coordinates + >>> sliced = arr[25:75] + >>> sliced.chunk_layout.is_aligned((30,)) # 30 is a chunk boundary + True + """ + if self.index_transform is not None and self.chunks is not None: + return ChunkLayout(grid_origin=self.index_transform, chunk_shape=self.chunks) + return None + + @property + def shards(self) -> tuple[int, ...] | None: + """Returns the shard shape of the Array, or None if sharding is not used.""" + if self._metadata is not None: + return self._metadata.shards + return None + + @property + def size(self) -> int: + """Returns the total number of elements in the array.""" + return product(self.shape) + + @property + def filters(self) -> tuple[Numcodec, ...] | tuple[ArrayArrayCodec, ...] | None: + """Filters applied to each chunk before serialization.""" + if self._metadata is None: + return None + if self._metadata.zarr_format == 2: + filters = self._metadata.filters + if filters is None: + return () + return filters + return tuple( + codec for codec in self._metadata.inner_codecs if isinstance(codec, ArrayArrayCodec) + ) + + @property + def serializer(self) -> ArrayBytesCodec | None: + """Array-to-bytes codec for serializing chunks.""" + if self._metadata is None: + return None + if self._metadata.zarr_format == 2: + return None + return next( + codec for codec in self._metadata.inner_codecs if isinstance(codec, ArrayBytesCodec) + ) + + @property + def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...] 
| None: + """Compressors applied to each chunk after serialization.""" + if self._metadata is None: + return None + if self._metadata.zarr_format == 2: + if self._metadata.compressor is not None: + return (self._metadata.compressor,) + return () + return tuple( + codec for codec in self._metadata.inner_codecs if isinstance(codec, BytesBytesCodec) + ) + + @property + def _zdtype(self) -> Any: + """The zarr-specific representation of the array data type.""" + if self._metadata is None: + return None + if self._metadata.zarr_format == 2: + return self._metadata.dtype + else: + return self._metadata.data_type + + @property + def order(self) -> MemoryOrder | None: + """Returns the memory order of the array.""" + if self._metadata is None or self._config is None: + return None + if self._metadata.zarr_format == 2: + return self._metadata.order + else: + return self._config.order + + @property + def attrs(self) -> dict[str, JSON] | None: + """Returns the attributes of the array.""" + if self._metadata is None: + return None + return self._metadata.attributes + + @property + def read_only(self) -> bool | None: + """Returns True if the array is read-only, or None for multi-source arrays.""" + if self._store_path is not None: + return self._store_path.read_only + return None + + @property + def path(self) -> str | None: + """Storage path, or None for multi-source arrays.""" + if self._store_path is not None: + return self._store_path.path + return None + + @property + def name(self) -> str | None: + """Array name following h5py convention, or None for multi-source arrays.""" + if self.path is None: + return None + name = self.path + if not name.startswith("/"): + name = "/" + name + return name + + @property + def basename(self) -> str | None: + """Final component of name, or None for multi-source arrays.""" + if self.name is None: + return None + return self.name.split("/")[-1] + + @property + def cdata_shape(self) -> tuple[int, ...] | None: + """The shape of the chunk grid for this array.""" + return self._chunk_grid_shape + + @property + def _chunk_grid_shape(self) -> tuple[int, ...] | None: + """The shape of the chunk grid for this array.""" + if self.chunks is None: + return None + return tuple(starmap(ceildiv, zip(self.shape, self.chunks, strict=True))) + + @property + def _shard_grid_shape(self) -> tuple[int, ...] 
| None: + """The shape of the shard grid for this array.""" + if self.chunks is None: + return None + if self.shards is None: + shard_shape = self.chunks + else: + shard_shape = self.shards + return tuple(starmap(ceildiv, zip(self.shape, shard_shape, strict=True))) + + @property + def nchunks(self) -> int | None: + """The number of chunks in this array.""" + if self._chunk_grid_shape is None: + return None + return product(self._chunk_grid_shape) + + @property + def _nshards(self) -> int | None: + """The number of shards in this array.""" + if self._shard_grid_shape is None: + return None + return product(self._shard_grid_shape) + + @property + def nbytes(self) -> int: + """The total number of bytes that would be stored if all chunks were initialized.""" + return self.size * self._dtype.itemsize + + @property + def info(self) -> ArrayInfo | None: + """Return the statically known information for an array, or None for multi-source.""" + if self._metadata is None: + return None + return self._info() + + def _info( + self, count_chunks_initialized: int | None = None, count_bytes_stored: int | None = None + ) -> ArrayInfo | None: + if self._metadata is None or self._store_path is None: + return None + return ArrayInfo( + _zarr_format=self._metadata.zarr_format, + _data_type=self._zdtype, + _fill_value=self._fill_value, + _shape=self.shape, + _order=self.order, + _shard_shape=self.shards, + _chunk_shape=self.chunks, + _read_only=self.read_only, + _compressors=self.compressors, + _filters=self.filters, + _serializer=self.serializer, + _store_type=type(self._store_path.store).__name__, + _count_bytes=self.nbytes, + _count_bytes_stored=count_bytes_stored, + _count_chunks_initialized=count_chunks_initialized, + ) + + # ------------------------------------------------------------------------- + # Iteration methods (synchronous) + # ------------------------------------------------------------------------- + + def _iter_chunk_coords( + self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[tuple[int, ...]]: + """Iterate over chunk coordinates in chunk grid space.""" + return _iter_chunk_coords(array=self, origin=origin, selection_shape=selection_shape) + + def _iter_shard_coords( + self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[tuple[int, ...]]: + """Iterate over shard coordinates in shard grid space.""" + return _iter_shard_coords(array=self, origin=origin, selection_shape=selection_shape) + + def _iter_shard_keys( + self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[str]: + """Iterate over the keys of stored objects supporting this array.""" + return _iter_shard_keys(array=self, origin=origin, selection_shape=selection_shape) + + def _iter_chunk_regions( + self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[tuple[slice, ...]]: + """Iterate over chunk regions in array index space.""" + return _iter_chunk_regions(array=self, origin=origin, selection_shape=selection_shape) + + def _iter_shard_regions( + self, *, origin: Sequence[int] | None = None, selection_shape: Sequence[int] | None = None + ) -> Iterator[tuple[slice, ...]]: + """Iterate over shard regions in array index space.""" + return _iter_shard_regions(array=self, origin=origin, selection_shape=selection_shape) + + # ------------------------------------------------------------------------- + # nchunks_initialized: async and 
sync + # ------------------------------------------------------------------------- + + async def nchunks_initialized_async(self) -> int: + """ + Asynchronously calculate the number of chunks that have been initialized. + + Returns + ------- + int + The number of chunks that have been initialized. + """ + return await _nchunks_initialized(self) + + def nchunks_initialized(self) -> int: + """ + Calculate the number of chunks that have been initialized. + + Returns + ------- + int + The number of chunks that have been initialized. + """ + return sync(self.nchunks_initialized_async()) + + # ------------------------------------------------------------------------- + # _nshards_initialized: async and sync + # ------------------------------------------------------------------------- + + async def _nshards_initialized_async(self) -> int: + """ + Asynchronously calculate the number of shards that have been initialized. + + Returns + ------- + int + The number of shards that have been initialized. + """ + return await _nshards_initialized(self) + + def _nshards_initialized(self) -> int: + """ + Calculate the number of shards that have been initialized. + + Returns + ------- + int + The number of shards that have been initialized. + """ + return sync(self._nshards_initialized_async()) + + # ------------------------------------------------------------------------- + # nbytes_stored: async and sync + # ------------------------------------------------------------------------- + + async def nbytes_stored_async(self) -> int: + """ + Asynchronously calculate the number of bytes stored for this array. + + Returns + ------- + int + The number of bytes stored. + """ + return await _nbytes_stored(self.store_path) + + def nbytes_stored(self) -> int: + """ + Calculate the number of bytes stored for this array. + + Returns + ------- + int + The number of bytes stored. + """ + return sync(self.nbytes_stored_async()) + + # ------------------------------------------------------------------------- + # getitem: async and sync + # ------------------------------------------------------------------------- + + async def getitem_async( + self, + selection: BasicSelection, + *, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Asynchronously retrieve a subset of the array's data based on the provided selection. + + Parameters + ---------- + selection : BasicSelection + A selection object specifying the subset of data to retrieve. + prototype : BufferPrototype, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The retrieved subset of the array's data. + """ + return await _getitem( + self.store_path, + self.metadata, + self.codec_pipeline, + self.config, + selection, + prototype=prototype, + ) + + def getitem( + self, + selection: BasicSelection, + *, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Retrieve a subset of the array's data based on the provided selection. + + Parameters + ---------- + selection : BasicSelection + A selection object specifying the subset of data to retrieve. + prototype : BufferPrototype, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The retrieved subset of the array's data. + """ + return sync(self.getitem_async(selection, prototype=prototype)) + + def __getitem__(self, selection: BasicSelection) -> Self: + """ + Lazy indexing: returns a new Array with a narrowed domain. 
+ + Unlike standard Zarr arrays which load data immediately, this method + returns a new Array view with an updated domain. No I/O is performed. + To actually load data, call `resolve()` on the result. + + This follows TensorStore's design where: + - Indexing operations create virtual views without loading data + - Indices are ABSOLUTE coordinates in the domain's index space + - Negative indices refer to actual negative coordinates, NOT "from the end" + + This is different from NumPy where arr[-1] means "last element". Here, + arr[-1] means "coordinate -1" which is only valid if -1 is within the + array's domain. + + Parameters + ---------- + selection : BasicSelection + The selection (int, slice, or tuple of ints/slices). These are + absolute coordinates in the domain's index space. + + Returns + ------- + Array + A new Array with a narrowed domain. + + Examples + -------- + >>> arr = Array.open("path/to/array") # shape (100,), domain [0, 100) + >>> arr.domain + IndexDomain([0, 100)) + + >>> sliced = arr[20:30] # No data loaded! + >>> sliced.domain + IndexDomain([20, 30)) + + >>> # To get element at coordinate 25: + >>> arr[25].domain + IndexDomain([25, 26)) + + >>> # With a shifted domain: + >>> shifted = arr.with_domain(IndexDomain((-50,), (50,))) + >>> shifted[-10:10].domain # Coordinates -10 to 10 + IndexDomain([-10, 10)) + + >>> data = sliced.resolve() # Now data is loaded + """ + new_domain = self._apply_selection_to_domain(selection) + return self._with_domain(new_domain) + + def _normalize_selection(self, selection: BasicSelection) -> tuple[int | slice, ...]: + """Normalize a selection to a tuple of ints/slices.""" + if not isinstance(selection, tuple): + selection = (selection,) + + # Handle ellipsis + result: list[int | slice] = [] + ellipsis_seen = False + for sel in selection: + if sel is Ellipsis: + if ellipsis_seen: + raise IndexError("an index can only have a single ellipsis ('...')") + ellipsis_seen = True + # Insert enough slices to fill remaining dimensions + num_missing = self.ndim - (len(selection) - 1) + result.extend([slice(None)] * num_missing) + else: + result.append(sel) # type: ignore[arg-type] + + # Pad with full slices if needed + while len(result) < self.ndim: + result.append(slice(None)) + + if len(result) > self.ndim: + raise IndexError( + f"too many indices for array: array has {self.ndim} dimensions, " + f"but {len(result)} were indexed" + ) + + return tuple(result) + + def _apply_selection_to_domain(self, selection: BasicSelection) -> IndexDomain: + """ + Apply a selection to the current domain and return a new domain. + + Following TensorStore's design: + - Indices are ABSOLUTE coordinates in the domain's index space + - Negative indices refer to actual negative coordinates, NOT "from the end" + - This is different from NumPy where -1 means "last element" + + For example, with domain [10, 20): + - arr[15] selects coordinate 15 (valid) + - arr[5] selects coordinate 5 (out of bounds - before domain start) + - arr[-1] selects coordinate -1 (out of bounds - before domain start) + + This matches TensorStore's behavior where the index space has meaning + independent of array bounds. 
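+
+        Slices are clamped to the domain instead of raising, mirroring NumPy's
+        slice semantics. With domain [10, 20):
+        - arr[12:15] yields the domain [12, 15)
+        - arr[12:30] is clamped to [12, 20)
+        - arr[:] keeps the full domain [10, 20)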
+ """ + normalized = self._normalize_selection(selection) + + new_inclusive_min: list[int] = [] + new_exclusive_max: list[int] = [] + + for dim_idx, (sel, dim_lo, dim_hi) in enumerate( + zip(normalized, self.domain.inclusive_min, self.domain.exclusive_max, strict=True) + ): + if isinstance(sel, int): + # In TensorStore style, the index IS the coordinate - no translation + # Negative indices mean negative coordinates, not "from the end" + abs_idx = sel + + # Bounds check against domain + if abs_idx < dim_lo or abs_idx >= dim_hi: + raise IndexError( + f"index {sel} is out of bounds for dimension {dim_idx} " + f"with domain [{dim_lo}, {dim_hi})" + ) + + # Integer indexing gives a length-1 slice in lazy indexing + # (dimension is NOT dropped, unlike NumPy) + new_inclusive_min.append(abs_idx) + new_exclusive_max.append(abs_idx + 1) + + else: + # sel is a slice + # Slice bounds are also absolute coordinates + start, stop, step = sel.start, sel.stop, sel.step + + if step is not None and step != 1: + raise IndexError( + "lazy indexing only supports step=1 slices. " + f"Got step={step}. Use resolve() first for strided access." + ) + + # Handle None/default values - None means "to the edge of domain" + if start is None: + abs_start = dim_lo + else: + abs_start = start # Absolute coordinate + + if stop is None: + abs_stop = dim_hi + else: + abs_stop = stop # Absolute coordinate + + # Clamp to domain bounds (like NumPy slice behavior - no error for OOB) + abs_start = max(abs_start, dim_lo) + abs_stop = min(abs_stop, dim_hi) + abs_stop = max(abs_stop, abs_start) # Ensure stop >= start + + new_inclusive_min.append(abs_start) + new_exclusive_max.append(abs_stop) + + return IndexDomain( + inclusive_min=tuple(new_inclusive_min), + exclusive_max=tuple(new_exclusive_max), + ) + + def _with_domain( + self, + new_domain: IndexDomain, + index_transform: tuple[int, ...] | None = None, + ) -> Self: + """Create a new Array with a different domain (internal helper). + + Parameters + ---------- + new_domain : IndexDomain + The new domain. + index_transform : tuple[int, ...] | None + The new storage transform offset. Only used for single-source storage arrays. + If None, preserves the current offset. 
+ """ + # For single storage source, we can update the index_transform + if len(self._sources) == 1 and isinstance(self._sources[0], StorageSource): + source = self._sources[0] + if index_transform is None: + index_transform = source.index_transform + new_source = StorageSource( + store_path=source.store_path, + metadata=source.metadata, + codec_pipeline=source.codec_pipeline, + config=source.config, + index_transform=index_transform, + ) + return self.__class__._from_sources( + sources=[new_source], + domain=new_domain, + dtype=self._dtype, + fill_value=self._fill_value, + ) + else: + # For multi-source arrays, just narrow the domain + # Filter sources to only include those that overlap with new domain + new_sources: list[StorageSource | Array] = [] + for source in self._sources: + if isinstance(source, StorageSource): + # Keep the source as-is, resolve will handle the intersection + new_sources.append(source) + else: + # It's an Array - slice it to the new domain + intersection = source.domain.intersect(new_domain) + if intersection is not None: + slices = tuple( + slice( + max(new_domain.inclusive_min[d], source.domain.inclusive_min[d]), + min(new_domain.exclusive_max[d], source.domain.exclusive_max[d]), + ) + for d in range(self.ndim) + ) + new_sources.append(source[slices]) + + return self.__class__._from_sources( + sources=new_sources if new_sources else list(self._sources), + domain=new_domain, + dtype=self._dtype, + fill_value=self._fill_value, + ) + + def with_domain(self, new_domain: IndexDomain) -> Self: + """ + Create a new Array view with a different domain. + + This allows creating views with arbitrary domains, including non-zero + origins or even domains that extend beyond the underlying storage bounds. + When resolving data from regions outside storage bounds, the array's + fill_value is used (this is standard Zarr behavior for uninitialized chunks). + + The new domain's origin will map to storage coordinate 0. This means + domain coordinate `new_domain.origin[i]` will read from storage coordinate 0 + in dimension i. + + Parameters + ---------- + new_domain : IndexDomain + The new domain for the array view. + + Returns + ------- + Array + A new Array with the specified domain. + + Examples + -------- + >>> arr = Array.open("path/to/array") # shape (10,), fill_value=0 + >>> arr.domain + IndexDomain([0, 10)) + + >>> # Create a view with a shifted origin - domain 10 maps to storage 0 + >>> shifted = arr.with_domain(IndexDomain(inclusive_min=(10,), exclusive_max=(20,))) + >>> shifted.origin + (10,) + >>> shifted.shape + (10,) + >>> shifted[15].resolve() # domain 15 -> storage 5, returns data[5] + + >>> # Create a view with negative origin - domain -5 maps to storage 0 + >>> neg = arr.with_domain(IndexDomain(inclusive_min=(-5,), exclusive_max=(5,))) + >>> neg.origin + (-5,) + >>> neg.shape + (10,) + >>> neg[-3].resolve() # domain -3 -> storage 2, returns data[2] + """ + if new_domain.ndim != self.ndim: + raise ValueError( + f"New domain must have same number of dimensions as array. " + f"Array has {self.ndim} dimensions, new domain has {new_domain.ndim}." + ) + # Set storage transform offset to the new domain's origin + # so that domain.origin maps to storage coordinate 0 + return self._with_domain(new_domain, index_transform=new_domain.origin) + + async def resolve_async( + self, + *, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Asynchronously resolve (materialize) this array view by loading the data. 
+ + This is the method that actually performs I/O and loads the data from + storage. The returned numpy array will have shape equal to this array's + domain shape. + + For single-source arrays, domain coordinates are translated to storage + coordinates using the index_transform. For multi-source arrays, data is + assembled from all sources that overlap with the domain. + + If the domain extends beyond storage bounds or has gaps between sources, + those regions are filled with the array's fill_value. + + Parameters + ---------- + prototype : BufferPrototype, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The data as a numpy array (or scalar if the domain has size 1 in all dims). + """ + # Create output array filled with fill_value + output = np.full(self.shape, self._fill_value, dtype=self._dtype) + + # Precompute the negative of our domain origin for translating to output coords + neg_origin = tuple(-x for x in self._domain.inclusive_min) + + for source in self._sources: + if isinstance(source, StorageSource): + # Resolve from storage source + await self._resolve_storage_source(source, output, neg_origin, prototype) + else: + # source is an Array - resolve recursively + await self._resolve_array_source(source, output, neg_origin) + + return output + + async def _resolve_storage_source( + self, + source: StorageSource, + output: np.ndarray[Any, Any], + neg_origin: tuple[int, ...], + prototype: BufferPrototype | None, + ) -> None: + """Resolve data from a storage source into the output array.""" + # Compute the domain of this source in our coordinate system + # The source has an index_transform that maps domain coords to storage coords + # storage_coord = domain_coord - index_transform + # So domain_coord = storage_coord + index_transform + source_domain = IndexDomain.from_shape(source.storage_shape).translate( + source.index_transform + ) + + # Find intersection with our domain + intersection = source_domain.intersect(self._domain) + if intersection is None: + return + + # Get chunk projections for the intersection + projections = list( + get_chunk_projections( + storage_shape=source.storage_shape, + chunk_shape=source.chunks, + domain=intersection, + index_transform=source.index_transform, + ) + ) + + if not projections: + return + + for output_selection_in_intersection, chunk_info in projections: + # Compute storage selection from chunk coords and chunk selection + storage_selection = tuple( + slice( + coord * chunk_size + sel.start, + coord * chunk_size + sel.stop, + ) + for coord, chunk_size, sel in zip( + chunk_info.chunk_coords, source.chunks, chunk_info.selection, strict=True + ) + ) + + data = await _getitem( + source.store_path, + source.metadata, + source.codec_pipeline, + source.config, + storage_selection, + prototype=prototype, + ) + + # The output_selection_in_intersection is relative to intersection.origin + # We need to translate it to be relative to our domain's origin + # First, get the absolute coordinates of this chunk region + intersection_neg_origin = tuple(-x for x in intersection.inclusive_min) + abs_start = tuple( + sel.start - off + for sel, off in zip( + output_selection_in_intersection, intersection_neg_origin, strict=True + ) + ) + abs_end = tuple( + sel.stop - off + for sel, off in zip( + output_selection_in_intersection, intersection_neg_origin, strict=True + ) + ) + + # Now translate to our output coordinates + output_selection = tuple( + slice(start + neg_off, end + neg_off) + for start, end, 
neg_off in zip(abs_start, abs_end, neg_origin, strict=True) + ) + + output[output_selection] = data + + async def _resolve_array_source( + self, + source: Array, + output: np.ndarray[Any, Any], + neg_origin: tuple[int, ...], + ) -> None: + """Resolve data from an Array source into the output array.""" + # Find intersection of source's domain with our domain + intersection = source.domain.intersect(self._domain) + if intersection is None: + return + + # Resolve the source array's data + data = await source.resolve_async() + + # Translate intersection to output coordinates (origin at 0) + output_region = intersection.translate(neg_origin) + output_slices = tuple( + slice(output_region.inclusive_min[d], output_region.exclusive_max[d]) + for d in range(self.ndim) + ) + + # Translate intersection to data coordinates (relative to source's origin) + source_neg_origin = tuple(-x for x in source.domain.inclusive_min) + data_region = intersection.translate(source_neg_origin) + data_slices = tuple( + slice(data_region.inclusive_min[d], data_region.exclusive_max[d]) + for d in range(self.ndim) + ) + + output[output_slices] = data[data_slices] + + def resolve( + self, + *, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Resolve (materialize) this array view by loading the data. + + This is the method that actually performs I/O and loads the data from + storage. The returned numpy array will have shape equal to this array's + domain shape. + + Parameters + ---------- + prototype : BufferPrototype, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The data as a numpy array (or scalar if the domain has size 1 in all dims). + + Examples + -------- + >>> arr = Array.open("path/to/array") + >>> sliced = arr[10:20] # Lazy - no I/O yet + >>> data = sliced.resolve() # Now I/O happens + >>> data.shape + (10,) + """ + return sync(self.resolve_async(prototype=prototype)) + + def _domain_to_selection(self) -> tuple[slice, ...]: + """Convert the current domain to a selection tuple for the underlying storage.""" + return tuple( + slice(lo, hi) + for lo, hi in zip(self.domain.inclusive_min, self.domain.exclusive_max, strict=True) + ) + + # ------------------------------------------------------------------------- + # setitem: async and sync + # ------------------------------------------------------------------------- + + async def setitem_async( + self, + selection: BasicSelection, + value: npt.ArrayLike, + prototype: BufferPrototype | None = None, + ) -> None: + """ + Asynchronously set values in the array using basic indexing. + + Parameters + ---------- + selection : BasicSelection + The selection defining the region of the array to set. + value : npt.ArrayLike + The values to be written into the selected region. + prototype : BufferPrototype, optional + A buffer prototype to use. + """ + return await _setitem( + self.store_path, + self.metadata, + self.codec_pipeline, + self.config, + selection, + value, + prototype=prototype, + ) + + def setitem( + self, + selection: BasicSelection, + value: npt.ArrayLike, + prototype: BufferPrototype | None = None, + ) -> None: + """ + Set values in the array using basic indexing. + + Parameters + ---------- + selection : BasicSelection + The selection defining the region of the array to set. + value : npt.ArrayLike + The values to be written into the selected region. + prototype : BufferPrototype, optional + A buffer prototype to use. 
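+
+        Examples
+        --------
+        Illustrative sketch (assumes a 1-D ``i4`` array opened via ``Array.open``):
+
+        >>> arr.setitem(slice(0, 10), np.arange(10, dtype="i4"))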
+ """ + sync(self.setitem_async(selection, value, prototype=prototype)) + + def __setitem__(self, selection: BasicSelection, value: npt.ArrayLike) -> None: + """Set data using indexing syntax.""" + self.setitem(selection, value) + + # ------------------------------------------------------------------------- + # get_orthogonal_selection: async and sync + # ------------------------------------------------------------------------- + + async def get_orthogonal_selection_async( + self, + selection: OrthogonalSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Asynchronously get an orthogonal selection from the array. + + Parameters + ---------- + selection : OrthogonalSelection + The orthogonal selection specification. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + return await _get_orthogonal_selection( + self.store_path, + self.metadata, + self.codec_pipeline, + self.config, + selection, + out=out, + fields=fields, + prototype=prototype, + ) + + def get_orthogonal_selection( + self, + selection: OrthogonalSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Get an orthogonal selection from the array. + + Parameters + ---------- + selection : OrthogonalSelection + The orthogonal selection specification. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + return sync( + self.get_orthogonal_selection_async( + selection, out=out, fields=fields, prototype=prototype + ) + ) + + # ------------------------------------------------------------------------- + # get_mask_selection: async and sync + # ------------------------------------------------------------------------- + + async def get_mask_selection_async( + self, + mask: MaskSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Asynchronously get a mask selection from the array. + + Parameters + ---------- + mask : MaskSelection + The boolean mask specifying the selection. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + return await _get_mask_selection( + self.store_path, + self.metadata, + self.codec_pipeline, + self.config, + mask, + out=out, + fields=fields, + prototype=prototype, + ) + + def get_mask_selection( + self, + mask: MaskSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Get a mask selection from the array. + + Parameters + ---------- + mask : MaskSelection + The boolean mask specifying the selection. 
+ out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + return sync( + self.get_mask_selection_async(mask, out=out, fields=fields, prototype=prototype) + ) + + # ------------------------------------------------------------------------- + # get_coordinate_selection: async and sync + # ------------------------------------------------------------------------- + + async def get_coordinate_selection_async( + self, + selection: CoordinateSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Asynchronously get a coordinate selection from the array. + + Parameters + ---------- + selection : CoordinateSelection + The coordinate selection specification. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + return await _get_coordinate_selection( + self.store_path, + self.metadata, + self.codec_pipeline, + self.config, + selection, + out=out, + fields=fields, + prototype=prototype, + ) + + def get_coordinate_selection( + self, + selection: CoordinateSelection, + *, + out: NDBuffer | None = None, + fields: Fields | None = None, + prototype: BufferPrototype | None = None, + ) -> NDArrayLikeOrScalar: + """ + Get a coordinate selection from the array. + + Parameters + ---------- + selection : CoordinateSelection + The coordinate selection specification. + out : NDBuffer | None, optional + An output buffer to write the data to. + fields : Fields | None, optional + Fields to select from structured arrays. + prototype : BufferPrototype | None, optional + A buffer prototype to use for the retrieved data. + + Returns + ------- + NDArrayLikeOrScalar + The selected data. + """ + return sync( + self.get_coordinate_selection_async( + selection, out=out, fields=fields, prototype=prototype + ) + ) + + # ------------------------------------------------------------------------- + # resize: async and sync + # ------------------------------------------------------------------------- + + async def resize_async(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None: + """ + Asynchronously resize the array to a new shape. + + Parameters + ---------- + new_shape : ShapeLike + The desired new shape of the array. + delete_outside_chunks : bool, optional + If True (default), chunks that fall outside the new shape will be deleted. + """ + return await _resize(self, new_shape, delete_outside_chunks) + + def resize(self, new_shape: ShapeLike, delete_outside_chunks: bool = True) -> None: + """ + Resize the array to a new shape. + + Parameters + ---------- + new_shape : ShapeLike + The desired new shape of the array. + delete_outside_chunks : bool, optional + If True (default), chunks that fall outside the new shape will be deleted. 
+ """ + sync(self.resize_async(new_shape, delete_outside_chunks)) + + # ------------------------------------------------------------------------- + # append: async and sync + # ------------------------------------------------------------------------- + + async def append_async(self, data: npt.ArrayLike, axis: int = 0) -> tuple[int, ...]: + """ + Asynchronously append data to the array along the specified axis. + + Parameters + ---------- + data : npt.ArrayLike + Data to be appended. + axis : int + Axis along which to append. + + Returns + ------- + tuple[int, ...] + The new shape of the array after appending. + """ + return await _append(self, data, axis) + + def append(self, data: npt.ArrayLike, axis: int = 0) -> tuple[int, ...]: + """ + Append data to the array along the specified axis. + + Parameters + ---------- + data : npt.ArrayLike + Data to be appended. + axis : int + Axis along which to append. + + Returns + ------- + tuple[int, ...] + The new shape of the array after appending. + """ + return sync(self.append_async(data, axis)) + + # ------------------------------------------------------------------------- + # update_attributes: async and sync + # ------------------------------------------------------------------------- + + async def update_attributes_async(self, new_attributes: dict[str, JSON]) -> Self: + """ + Asynchronously update the array's attributes. + + Parameters + ---------- + new_attributes : dict[str, JSON] + A dictionary of new attributes to update or add. + + Returns + ------- + Array + The array with the updated attributes. + """ + await _update_attributes(self, new_attributes) + return self + + def update_attributes(self, new_attributes: dict[str, JSON]) -> Self: + """ + Update the array's attributes. + + Parameters + ---------- + new_attributes : dict[str, JSON] + A dictionary of new attributes to update or add. + + Returns + ------- + Array + The array with the updated attributes. + """ + return sync(self.update_attributes_async(new_attributes)) + + # ------------------------------------------------------------------------- + # info_complete: async and sync + # ------------------------------------------------------------------------- + + async def info_complete_async(self) -> ArrayInfo: + """ + Asynchronously return all the information for an array, including dynamic information. + + Returns + ------- + ArrayInfo + Complete information about the array including chunks initialized and bytes stored. + """ + return await _info_complete(self) + + def info_complete(self) -> ArrayInfo: + """ + Return all the information for an array, including dynamic information. + + Returns + ------- + ArrayInfo + Complete information about the array including chunks initialized and bytes stored. + """ + return sync(self.info_complete_async()) + + # ------------------------------------------------------------------------- + # __repr__ + # ------------------------------------------------------------------------- + + def __repr__(self) -> str: + if self._store_path is not None: + return f"" + else: + return f"" + + def __eq__(self, other: object) -> bool: + """ + Check equality between two Arrays. + + Two Arrays are equal if they have the same domain, dtype, fill_value, + and equivalent sources. For single-source arrays backed by the same + storage with the same index_transform, they are considered equal. 
+ """ + if not isinstance(other, Array): + return NotImplemented + + # Basic properties must match + if self._domain != other._domain: + return False + if self._dtype != other._dtype: + return False + if self._fill_value != other._fill_value: + return False + + # Compare sources + if len(self._sources) != len(other._sources): + return False + + for s1, s2 in zip(self._sources, other._sources, strict=True): + if type(s1) is not type(s2): + return False + # StorageSource is a frozen dataclass, so equality works + # Array uses recursive equality check + if s1 != s2: + return False + + return True + + def __array__( + self, dtype: np.dtype[Any] | None = None, copy: bool | None = None + ) -> np.ndarray[Any, Any]: + """ + Convert the array to a numpy array by resolving it. + + This allows using `np.array(arr)` or `np.asarray(arr)` to get the data. + + Parameters + ---------- + dtype : np.dtype, optional + The desired dtype for the output array. + copy : bool, optional + Whether to copy the data. + + Returns + ------- + np.ndarray + The resolved data as a numpy array. + """ + result = self.resolve() + if isinstance(result, np.ndarray): + if dtype is not None and result.dtype != dtype: + result = result.astype(dtype, copy=copy if copy is not None else True) + elif copy: + result = result.copy() + return result + # Scalar case + arr = np.asarray(result, dtype=dtype) + if copy: + arr = arr.copy() + return arr + + +# ----------------------------------------------------------------------------- +# merge: Combine multiple arrays by domain +# ----------------------------------------------------------------------------- + + +def merge( + arrays: Sequence[Array], + *, + fill_value: Any = None, + domain: IndexDomain | None = None, +) -> Array: + """ + Merge arrays by their domains. + + This is the inverse of slicing. While slicing takes an array and returns + a view with a smaller domain, merge takes multiple arrays and returns + a view with a larger domain (the union/bounding box of all input domains). + + Unlike numpy.concatenate which requires arrays to be adjacent along one axis, + this function allows arrays to have arbitrary non-overlapping (or overlapping) + domains. Gaps between arrays are filled with fill_value. + + Parameters + ---------- + arrays : Sequence[Array] + The arrays to merge. All must have the same dtype and number + of dimensions. Must all be Array instances. + fill_value : Any, optional + The fill value for regions not covered by any input array. + Defaults to the first array's fill_value. + domain : IndexDomain, optional + The domain of the result. If not provided, uses the bounding box + of all input arrays' domains. + + Returns + ------- + Array + A lazy merge that can be resolved or further sliced. 
+ + Examples + -------- + >>> # Slice and reassemble + >>> arr = Array.open("path/to/array") # domain [0, 100) + >>> left = arr[0:40] + >>> right = arr[60:100] + >>> combined = merge([left, right]) + >>> combined.domain + IndexDomain([0, 100)) + >>> data = combined.resolve() + >>> data[40:60] # Filled with fill_value + + >>> # Verify merge inverts slicing + >>> arr = Array.open("path/to/array") + >>> chunks = [arr[i:i+10] for i in range(0, 100, 10)] + >>> reassembled = merge(chunks) + >>> np.array_equal(arr.resolve(), reassembled.resolve()) + True + + >>> # Works with overlapping domains (last wins) + >>> a = arr[0:60] + >>> b = arr[40:100] # Overlaps with a in [40, 60) + >>> combined = merge([a, b]) # b's data used in overlap region + """ + if not arrays: + raise ValueError("merge requires at least one array") + + arrays_list = list(arrays) + first = arrays_list[0] + ndim = first.domain.ndim + + # Validate all arrays have same ndim and dtype + for i, arr in enumerate(arrays_list): + if arr.domain.ndim != ndim: + raise ValueError( + f"All arrays must have the same number of dimensions. " + f"Array 0 has {ndim} dims, array {i} has {arr.domain.ndim} dims." + ) + if arr.dtype != first.dtype: + raise ValueError( + f"All arrays must have the same dtype. " + f"Array 0 has dtype {first.dtype}, array {i} has dtype {arr.dtype}." + ) + + # Determine fill_value + if fill_value is None: + fill_value = first.fill_value + + # Compute domain as bounding box if not provided + if domain is None: + inclusive_min = tuple( + min(arr.domain.inclusive_min[d] for arr in arrays_list) for d in range(ndim) + ) + exclusive_max = tuple( + max(arr.domain.exclusive_max[d] for arr in arrays_list) for d in range(ndim) + ) + domain = IndexDomain(inclusive_min=inclusive_min, exclusive_max=exclusive_max) + + # Create an Array with the input arrays as sources + # We need to convert ArrayLike to Array - for now we only support Array inputs + sources: list[StorageSource | Array] = [] + for arr in arrays_list: + if isinstance(arr, Array): + sources.append(arr) + else: + raise TypeError(f"merge currently only supports Array inputs, got {type(arr).__name__}") + + # Try to merge sources if they all come from the same storage + merged_source = _try_merge_to_single_source(arrays_list, domain) + if merged_source is not None: + # All arrays share the same storage - use the merged source + return Array._from_sources( + sources=[merged_source], + domain=domain, + dtype=first.dtype, + fill_value=fill_value, + ) + + return Array._from_sources( + sources=sources, + domain=domain, + dtype=first.dtype, + fill_value=fill_value, + ) diff --git a/tests/test_experimental/test_lazy_indexing.py b/tests/test_experimental/test_lazy_indexing.py new file mode 100644 index 0000000000..b474736587 --- /dev/null +++ b/tests/test_experimental/test_lazy_indexing.py @@ -0,0 +1,1255 @@ +""" +Tests for lazy indexing with TensorStore-inspired domain tracking. + +Key difference from NumPy: indices are ABSOLUTE coordinates in the domain's +index space, not relative offsets. Negative indices mean negative coordinates, +not "counting from the end". 
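+
+For example, for an array whose domain is [10, 20), ``arr[15]`` selects
+coordinate 15 (which maps to storage index 5), and ``arr[5]`` raises IndexError.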
+""" + +import numpy as np +import pytest + +import zarr +from zarr.experimental.lazy_indexing import ( + Array, + ChunkLayout, + IndexDomain, + get_chunk_projections, + merge, +) +from zarr.storage import MemoryStore + + +class TestIndexDomain: + """Tests for the IndexDomain class.""" + + def test_from_shape(self) -> None: + """Test creating a domain from a shape.""" + domain = IndexDomain.from_shape((10, 20, 30)) + assert domain.inclusive_min == (0, 0, 0) + assert domain.exclusive_max == (10, 20, 30) + assert domain.shape == (10, 20, 30) + assert domain.origin == (0, 0, 0) + assert domain.ndim == 3 + + def test_non_zero_origin(self) -> None: + """Test a domain with non-zero origin.""" + domain = IndexDomain(inclusive_min=(5, 10), exclusive_max=(15, 30)) + assert domain.origin == (5, 10) + assert domain.shape == (10, 20) + assert domain.ndim == 2 + + def test_contains(self) -> None: + """Test the contains method.""" + domain = IndexDomain(inclusive_min=(5, 10), exclusive_max=(15, 30)) + + # Inside the domain + assert domain.contains((5, 10)) + assert domain.contains((10, 20)) + assert domain.contains((14, 29)) + + # Outside the domain + assert not domain.contains((4, 10)) # x too low + assert not domain.contains((15, 10)) # x at exclusive max + assert not domain.contains((5, 30)) # y at exclusive max + assert not domain.contains((5, 9)) # y too low + + def test_contains_domain(self) -> None: + """Test that one domain contains another.""" + outer = IndexDomain(inclusive_min=(0, 0), exclusive_max=(100, 100)) + inner = IndexDomain(inclusive_min=(10, 20), exclusive_max=(50, 60)) + + assert outer.contains_domain(inner) + assert not inner.contains_domain(outer) + + # Partially overlapping + partial = IndexDomain(inclusive_min=(50, 50), exclusive_max=(150, 150)) + assert not outer.contains_domain(partial) + + def test_invalid_domain(self) -> None: + """Test that invalid domains raise errors.""" + # min > max + with pytest.raises(ValueError, match="inclusive_min must be <= exclusive_max"): + IndexDomain(inclusive_min=(10,), exclusive_max=(5,)) + + # Mismatched lengths + with pytest.raises(ValueError, match="same length"): + IndexDomain(inclusive_min=(0, 0), exclusive_max=(10,)) + + def test_repr(self) -> None: + """Test string representation.""" + domain = IndexDomain(inclusive_min=(5, 10), exclusive_max=(15, 30)) + assert repr(domain) == "IndexDomain([5, 15), [10, 30))" + + def test_translate_basic(self) -> None: + """Test basic translation of a domain.""" + domain = IndexDomain(inclusive_min=(10, 20), exclusive_max=(30, 40)) + translated = domain.translate((-10, -20)) + + assert translated.inclusive_min == (0, 0) + assert translated.exclusive_max == (20, 20) + assert translated.shape == domain.shape # Shape unchanged + + def test_translate_positive_offset(self) -> None: + """Test translation with positive offset.""" + domain = IndexDomain(inclusive_min=(0, 0), exclusive_max=(10, 20)) + translated = domain.translate((5, 10)) + + assert translated.inclusive_min == (5, 10) + assert translated.exclusive_max == (15, 30) + + def test_translate_to_negative_coords(self) -> None: + """Test translation that results in negative coordinates.""" + domain = IndexDomain(inclusive_min=(5, 10), exclusive_max=(15, 20)) + translated = domain.translate((-10, -15)) + + assert translated.inclusive_min == (-5, -5) + assert translated.exclusive_max == (5, 5) + + def test_translate_preserves_shape(self) -> None: + """Test that translation preserves shape.""" + domain = IndexDomain(inclusive_min=(100, 200), 
exclusive_max=(150, 300)) + original_shape = domain.shape + + for offset in [(-100, -200), (50, 100), (-50, 50)]: + translated = domain.translate(offset) + assert translated.shape == original_shape + + def test_translate_wrong_ndim_raises(self) -> None: + """Test that translate raises for mismatched dimensions.""" + domain = IndexDomain(inclusive_min=(0, 0), exclusive_max=(10, 10)) + + with pytest.raises(ValueError, match="same length"): + domain.translate((5,)) + + with pytest.raises(ValueError, match="same length"): + domain.translate((5, 5, 5)) + + def test_translate_identity(self) -> None: + """Test that translating by zero offset is identity.""" + domain = IndexDomain(inclusive_min=(5, 10), exclusive_max=(15, 30)) + translated = domain.translate((0, 0)) + + assert translated == domain + + def test_translate_1d(self) -> None: + """Test translation in 1D.""" + domain = IndexDomain(inclusive_min=(50,), exclusive_max=(100,)) + translated = domain.translate((-50,)) + + assert translated.inclusive_min == (0,) + assert translated.exclusive_max == (50,) + + +class TestGetChunkProjections: + """Tests for the get_chunk_projections function.""" + + def test_single_chunk_full_domain(self) -> None: + """Test projection for domain covering exactly one chunk.""" + storage_shape = (10,) + chunk_shape = (10,) + domain = IndexDomain.from_shape((10,)) + + projections = list(get_chunk_projections(storage_shape, chunk_shape, domain)) + + assert len(projections) == 1 + output_sel, chunk_info = projections[0] + assert chunk_info.chunk_coords == (0,) + assert chunk_info.selection == (slice(0, 10),) + assert output_sel == (slice(0, 10),) + + def test_multiple_chunks(self) -> None: + """Test projection spanning multiple chunks.""" + storage_shape = (100,) + chunk_shape = (10,) + domain = IndexDomain(inclusive_min=(25,), exclusive_max=(75,)) + + projections = list(get_chunk_projections(storage_shape, chunk_shape, domain)) + + # Domain [25, 75) covers chunks 2, 3, 4, 5, 6, 7: + # Chunk 2 (20-30): selection [25-30) -> chunk_sel [5, 10), output_sel [0, 5) + # Chunk 3 (30-40): full chunk -> chunk_sel [0, 10), output_sel [5, 15) + # Chunk 4 (40-50): full chunk -> chunk_sel [0, 10), output_sel [15, 25) + # Chunk 5 (50-60): full chunk -> chunk_sel [0, 10), output_sel [25, 35) + # Chunk 6 (60-70): full chunk -> chunk_sel [0, 10), output_sel [35, 45) + # Chunk 7 (70-80): selection [70-75) -> chunk_sel [0, 5), output_sel [45, 50) + assert len(projections) == 6 # chunks 2, 3, 4, 5, 6, 7 + + # Check first chunk + output_sel, chunk_info = projections[0] + assert chunk_info.chunk_coords == (2,) + assert chunk_info.selection == (slice(5, 10),) + assert output_sel == (slice(0, 5),) + + # Check last chunk + output_sel, chunk_info = projections[-1] + assert chunk_info.chunk_coords == (7,) + assert chunk_info.selection == (slice(0, 5),) + assert output_sel == (slice(45, 50),) + + def test_with_index_transform(self) -> None: + """Test projection with non-zero storage transform offset.""" + storage_shape = (10,) + chunk_shape = (5,) + # Domain [10, 20) with offset 10 maps to storage [0, 10) + domain = IndexDomain(inclusive_min=(10,), exclusive_max=(20,)) + offset = (10,) + + projections = list( + get_chunk_projections(storage_shape, chunk_shape, domain, index_transform=offset) + ) + + assert len(projections) == 2 + # Chunk 0 (storage 0-5) maps to domain [10, 15) + output_sel, chunk_info = projections[0] + assert chunk_info.chunk_coords == (0,) + assert chunk_info.selection == (slice(0, 5),) + assert output_sel == (slice(0, 5),) + # 
Chunk 1 (storage 5-10) maps to domain [15, 20) + output_sel, chunk_info = projections[1] + assert chunk_info.chunk_coords == (1,) + assert chunk_info.selection == (slice(0, 5),) + assert output_sel == (slice(5, 10),) + + def test_domain_outside_storage_bounds(self) -> None: + """Test projection when domain extends beyond storage.""" + storage_shape = (10,) + chunk_shape = (5,) + # Domain [5, 15) with no offset - storage only has [0, 10) + domain = IndexDomain(inclusive_min=(5,), exclusive_max=(15,)) + + projections = list(get_chunk_projections(storage_shape, chunk_shape, domain)) + + # Only storage [5, 10) is valid + assert len(projections) == 1 + output_sel, chunk_info = projections[0] + assert chunk_info.chunk_coords == (1,) + assert chunk_info.selection == (slice(0, 5),) + # Domain [5, 15) -> output indices [0, 10), but only [0, 5) has data + assert output_sel == (slice(0, 5),) + + def test_domain_completely_outside_storage(self) -> None: + """Test projection when domain is entirely outside storage bounds.""" + storage_shape = (10,) + chunk_shape = (5,) + domain = IndexDomain(inclusive_min=(20,), exclusive_max=(30,)) + + projections = list(get_chunk_projections(storage_shape, chunk_shape, domain)) + + # No intersection with storage + assert len(projections) == 0 + + def test_multidimensional(self) -> None: + """Test projection for multi-dimensional arrays.""" + storage_shape = (20, 30) + chunk_shape = (10, 10) + domain = IndexDomain(inclusive_min=(5, 15), exclusive_max=(15, 25)) + + projections = list(get_chunk_projections(storage_shape, chunk_shape, domain)) + + # Domain [5, 15) x [15, 25) covers: + # Dim 0: chunks 0 and 1 (0-10 and 10-20) + # Dim 1: chunks 1 and 2 (10-20 and 20-30) + # So 2x2 = 4 chunk combinations + assert len(projections) == 4 + + # Check first chunk (0, 1) + output_sel, chunk_info = projections[0] + assert chunk_info.chunk_coords == (0, 1) + assert chunk_info.selection == (slice(5, 10), slice(5, 10)) + assert output_sel == (slice(0, 5), slice(0, 5)) + + def test_negative_domain_with_offset(self) -> None: + """Test projection with negative domain coordinates.""" + storage_shape = (10,) + chunk_shape = (5,) + # Domain [-5, 5) with offset -5 maps to storage [0, 10) + domain = IndexDomain(inclusive_min=(-5,), exclusive_max=(5,)) + offset = (-5,) + + projections = list( + get_chunk_projections(storage_shape, chunk_shape, domain, index_transform=offset) + ) + + assert len(projections) == 2 + _, chunk_info0 = projections[0] + _, chunk_info1 = projections[1] + assert chunk_info0.chunk_coords == (0,) + assert chunk_info1.chunk_coords == (1,) + + +class TestChunkLayout: + """Tests for the ChunkLayout class.""" + + def test_from_chunk_shape(self) -> None: + """Test creating a layout with zero origin.""" + layout = ChunkLayout.from_chunk_shape((10, 20)) + assert layout.grid_origin == (0, 0) + assert layout.chunk_shape == (10, 20) + assert layout.ndim == 2 + + def test_is_aligned(self) -> None: + """Test chunk alignment checking.""" + layout = ChunkLayout(grid_origin=(0, 0), chunk_shape=(10, 10)) + + # On chunk boundaries + assert layout.is_aligned((0, 0)) + assert layout.is_aligned((10, 0)) + assert layout.is_aligned((0, 10)) + assert layout.is_aligned((10, 10)) + assert layout.is_aligned((100, 200)) + + # Not on chunk boundaries + assert not layout.is_aligned((5, 0)) + assert not layout.is_aligned((0, 5)) + assert not layout.is_aligned((5, 5)) + assert not layout.is_aligned((15, 25)) + + def test_is_aligned_nonzero_origin(self) -> None: + """Test alignment with non-zero grid 
origin.""" + layout = ChunkLayout(grid_origin=(5, 5), chunk_shape=(10, 10)) + + # Aligned relative to origin at (5, 5) + assert layout.is_aligned((5, 5)) + assert layout.is_aligned((15, 5)) + assert layout.is_aligned((5, 15)) + assert layout.is_aligned((15, 15)) + + # Not aligned + assert not layout.is_aligned((0, 0)) # Would be aligned if origin was 0 + assert not layout.is_aligned((10, 10)) + assert not layout.is_aligned((7, 5)) + + def test_chunk_domain(self) -> None: + """Test getting the domain of a specific chunk.""" + layout = ChunkLayout(grid_origin=(0, 0), chunk_shape=(10, 10)) + + # First chunk + dom = layout.chunk_domain((0, 0)) + assert dom.inclusive_min == (0, 0) + assert dom.exclusive_max == (10, 10) + + # Another chunk + dom = layout.chunk_domain((2, 3)) + assert dom.inclusive_min == (20, 30) + assert dom.exclusive_max == (30, 40) + + def test_chunk_domain_nonzero_origin(self) -> None: + """Test chunk domain with non-zero grid origin.""" + layout = ChunkLayout(grid_origin=(5, 5), chunk_shape=(10, 10)) + + # First chunk starts at grid origin + dom = layout.chunk_domain((0, 0)) + assert dom.inclusive_min == (5, 5) + assert dom.exclusive_max == (15, 15) + + # Second chunk in each dimension + dom = layout.chunk_domain((1, 1)) + assert dom.inclusive_min == (15, 15) + assert dom.exclusive_max == (25, 25) + + def test_chunk_coords_for_point(self) -> None: + """Test finding which chunk contains a point.""" + layout = ChunkLayout(grid_origin=(0, 0), chunk_shape=(10, 10)) + + assert layout.chunk_coords_for_point((0, 0)) == (0, 0) + assert layout.chunk_coords_for_point((5, 5)) == (0, 0) + assert layout.chunk_coords_for_point((9, 9)) == (0, 0) + assert layout.chunk_coords_for_point((10, 10)) == (1, 1) + assert layout.chunk_coords_for_point((25, 35)) == (2, 3) + + def test_chunk_coords_for_point_nonzero_origin(self) -> None: + """Test chunk coords with non-zero grid origin.""" + layout = ChunkLayout(grid_origin=(5, 5), chunk_shape=(10, 10)) + + # Point at grid origin is in chunk (0, 0) + assert layout.chunk_coords_for_point((5, 5)) == (0, 0) + assert layout.chunk_coords_for_point((14, 14)) == (0, 0) + assert layout.chunk_coords_for_point((15, 15)) == (1, 1) + + # Point before grid origin is in chunk (-1, -1) + assert layout.chunk_coords_for_point((0, 0)) == (-1, -1) + assert layout.chunk_coords_for_point((4, 4)) == (-1, -1) + + def test_iter_chunk_coords(self) -> None: + """Test iterating over chunks overlapping a domain.""" + layout = ChunkLayout(grid_origin=(0, 0), chunk_shape=(10, 10)) + domain = IndexDomain(inclusive_min=(5, 15), exclusive_max=(25, 35)) + + coords = list(layout.iter_chunk_coords(domain)) + + # Domain [5, 25) x [15, 35) overlaps: + # Dim 0: chunks 0, 1, 2 (0-10, 10-20, 20-30) + # Dim 1: chunks 1, 2, 3 (10-20, 20-30, 30-40) + expected = [ + (0, 1), + (0, 2), + (0, 3), + (1, 1), + (1, 2), + (1, 3), + (2, 1), + (2, 2), + (2, 3), + ] + assert coords == expected + + def test_iter_chunk_domains(self) -> None: + """Test iterating over chunk domains overlapping a region.""" + layout = ChunkLayout(grid_origin=(0,), chunk_shape=(10,)) + domain = IndexDomain(inclusive_min=(15,), exclusive_max=(35,)) + + results = list(layout.iter_chunk_domains(domain)) + + # Domain [15, 35) overlaps chunks 1, 2, 3 + assert len(results) == 3 + + # Chunk 1: [10, 20) intersected with [15, 35) = [15, 20) + coords, dom = results[0] + assert coords == (1,) + assert dom.inclusive_min == (15,) + assert dom.exclusive_max == (20,) + + # Chunk 2: [20, 30) fully contained + coords, dom = results[1] + assert 
coords == (2,) + assert dom.inclusive_min == (20,) + assert dom.exclusive_max == (30,) + + # Chunk 3: [30, 40) intersected with [15, 35) = [30, 35) + coords, dom = results[2] + assert coords == (3,) + assert dom.inclusive_min == (30,) + assert dom.exclusive_max == (35,) + + def test_aligned_domain(self) -> None: + """Test finding the largest aligned subdomain.""" + layout = ChunkLayout(grid_origin=(0,), chunk_shape=(10,)) + + # Domain that's already aligned + domain = IndexDomain(inclusive_min=(10,), exclusive_max=(30,)) + aligned = layout.aligned_domain(domain) + assert aligned.inclusive_min == (10,) + assert aligned.exclusive_max == (30,) + + # Domain that needs rounding + domain = IndexDomain(inclusive_min=(5,), exclusive_max=(35,)) + aligned = layout.aligned_domain(domain) + assert aligned.inclusive_min == (10,) # Rounded up from 5 + assert aligned.exclusive_max == (30,) # Rounded down from 35 + + # Domain smaller than a chunk + domain = IndexDomain(inclusive_min=(12,), exclusive_max=(18,)) + aligned = layout.aligned_domain(domain) + assert aligned.inclusive_min == (20,) # Rounded up + assert aligned.exclusive_max == (20,) # Empty (rounded down < rounded up) + + def test_aligned_domain_nonzero_origin(self) -> None: + """Test aligned_domain with non-zero grid origin.""" + layout = ChunkLayout(grid_origin=(5,), chunk_shape=(10,)) + + # Domain [7, 28) -> aligned to [15, 25) (boundaries at 5, 15, 25, 35...) + domain = IndexDomain(inclusive_min=(7,), exclusive_max=(28,)) + aligned = layout.aligned_domain(domain) + assert aligned.inclusive_min == (15,) + assert aligned.exclusive_max == (25,) + + def test_invalid_chunk_shape(self) -> None: + """Test that zero or negative chunk shapes raise errors.""" + with pytest.raises(ValueError, match="positive"): + ChunkLayout(grid_origin=(0,), chunk_shape=(0,)) + + with pytest.raises(ValueError, match="positive"): + ChunkLayout(grid_origin=(0,), chunk_shape=(-5,)) + + def test_mismatched_dimensions(self) -> None: + """Test that mismatched dimensions raise errors.""" + with pytest.raises(ValueError, match="same length"): + ChunkLayout(grid_origin=(0, 0), chunk_shape=(10,)) + + def test_repr(self) -> None: + """Test string representation.""" + layout = ChunkLayout(grid_origin=(5, 10), chunk_shape=(10, 20)) + assert repr(layout) == "ChunkLayout(grid_origin=(5, 10), chunk_shape=(10, 20))" + + +class TestArrayChunkLayout: + """Tests for chunk_layout property on Array.""" + + @pytest.fixture + def base_array(self) -> Array: + """Create a base array for testing.""" + store = MemoryStore() + zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4", fill_value=0) + arr = Array.open(store) + arr.setitem(slice(None), np.arange(100, dtype="i4")) + return arr + + def test_chunk_layout_basic(self, base_array: Array) -> None: + """Test chunk_layout on a basic array.""" + layout = base_array.chunk_layout + assert layout.grid_origin == (0,) + assert layout.chunk_shape == (10,) + + def test_chunk_layout_is_aligned(self, base_array: Array) -> None: + """Test using chunk_layout to check alignment.""" + layout = base_array.chunk_layout + assert layout.is_aligned((0,)) + assert layout.is_aligned((10,)) + assert layout.is_aligned((50,)) + assert not layout.is_aligned((5,)) + assert not layout.is_aligned((25,)) + + def test_chunk_layout_after_slice(self, base_array: Array) -> None: + """Test that chunk_layout reflects the sliced domain's coordinate system.""" + # Slice the array - domain becomes [25, 75) + sliced = base_array[25:75] + + # The chunk layout should still 
have the same grid boundaries + # but expressed in the original coordinate system (since slicing + # preserves index_transform) + layout = sliced.chunk_layout + assert layout.chunk_shape == (10,) + assert layout.grid_origin == (0,) # Slicing preserves index_transform=(0,) + + # Check alignment in the sliced domain's coordinates + assert layout.is_aligned((30,)) # 30 is a chunk boundary + assert layout.is_aligned((40,)) + assert not layout.is_aligned((25,)) # 25 is not a chunk boundary + assert not layout.is_aligned((35,)) + + def test_chunk_layout_after_with_domain(self, base_array: Array) -> None: + """Test chunk_layout after with_domain shifts the grid.""" + # with_domain sets index_transform = domain.origin + # So domain coordinate 10 maps to storage coordinate 0 + new_domain = IndexDomain(inclusive_min=(10,), exclusive_max=(20,)) + view = base_array.with_domain(new_domain) + + layout = view.chunk_layout + assert layout.chunk_shape == (10,) + # Grid origin is at index_transform = (10,) + # So chunk boundaries are at 10, 20, 30, ... + assert layout.grid_origin == (10,) + + assert layout.is_aligned((10,)) # Domain origin is aligned + assert layout.is_aligned((20,)) + assert not layout.is_aligned((15,)) + + def test_chunk_layout_iter_chunk_domains(self, base_array: Array) -> None: + """Test using chunk_layout to iterate over chunks.""" + layout = base_array.chunk_layout + + # Get chunks overlapping [25, 55) + domain = IndexDomain(inclusive_min=(25,), exclusive_max=(55,)) + chunks = list(layout.iter_chunk_domains(domain)) + + # Should overlap chunks 2, 3, 4, 5 (covering 20-60) + assert len(chunks) == 4 + + # First chunk: coords (2,), intersection [25, 30) + coords, dom = chunks[0] + assert coords == (2,) + assert dom == IndexDomain(inclusive_min=(25,), exclusive_max=(30,)) + + # Last chunk: coords (5,), intersection [50, 55) + coords, dom = chunks[-1] + assert coords == (5,) + assert dom == IndexDomain(inclusive_min=(50,), exclusive_max=(55,)) + + def test_chunk_layout_aligned_domain(self, base_array: Array) -> None: + """Test finding aligned subdomain.""" + layout = base_array.chunk_layout + + # Find aligned subdomain of [25, 75) + domain = IndexDomain(inclusive_min=(25,), exclusive_max=(75,)) + aligned = layout.aligned_domain(domain) + + # Should round to [30, 70) + assert aligned.inclusive_min == (30,) + assert aligned.exclusive_max == (70,) + + +class TestArrayDomain: + """Tests for Array with domain tracking.""" + + @pytest.fixture + def base_array(self) -> Array: + """Create a base array for testing.""" + store = MemoryStore() + zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4", fill_value=0) + arr = Array.open(store) + # Fill with test data + arr.setitem(slice(None), np.arange(100, dtype="i4")) + return arr + + @pytest.fixture + def multidim_array(self) -> Array: + """Create a multi-dimensional array for testing.""" + store = MemoryStore() + zarr.create_array(store, shape=(10, 20, 30), chunks=(5, 10, 15), dtype="i4", fill_value=0) + arr = Array.open(store) + data = np.arange(10 * 20 * 30, dtype="i4").reshape((10, 20, 30)) + arr.setitem(slice(None), data) + return arr + + def test_array_domain(self, base_array: Array) -> None: + """Test that slicing an array changes the origin and domain of the array.""" + # Check initial domain + assert base_array.origin == (0,) + assert base_array.domain == IndexDomain.from_shape((100,)) + assert base_array.shape == (100,) + + # Slice the array using absolute coordinates + sliced = base_array[20:40] + + # Check that we got a new Array, 
not data + assert isinstance(sliced, Array) + + # Check that the domain reflects the slice + assert sliced.origin == (20,) + assert sliced.domain.inclusive_min == (20,) + assert sliced.domain.exclusive_max == (40,) + assert sliced.shape == (20,) + + def test_chained_slicing(self, base_array: Array) -> None: + """Test that chained slicing works correctly with absolute coordinates.""" + # First slice: [20:60) -> domain [20, 60) + first = base_array[20:60] + assert first.origin == (20,) + assert first.shape == (40,) + + # Second slice: [30:40) in absolute coordinates + # (these coordinates are within the domain [20, 60)) + second = first[30:40] + assert second.origin == (30,) + assert second.shape == (10,) + + # Verify we can resolve to the correct data + data = second.resolve() + expected = np.arange(30, 40, dtype="i4") + np.testing.assert_array_equal(data, expected) + + def test_multidim_slicing(self, multidim_array: Array) -> None: + """Test slicing in multiple dimensions.""" + # Slice in all dimensions using absolute coordinates + sliced = multidim_array[2:8, 5:15, 10:25] + + assert sliced.origin == (2, 5, 10) + assert sliced.shape == (6, 10, 15) + assert sliced.domain.inclusive_min == (2, 5, 10) + assert sliced.domain.exclusive_max == (8, 15, 25) + + def test_partial_slicing(self, multidim_array: Array) -> None: + """Test slicing only some dimensions.""" + # Slice only first dimension + sliced = multidim_array[3:7] + + assert sliced.origin == (3, 0, 0) + assert sliced.shape == (4, 20, 30) + + def test_ellipsis_slicing(self, multidim_array: Array) -> None: + """Test slicing with ellipsis.""" + # Ellipsis at the end + sliced = multidim_array[3:7, ...] + assert sliced.origin == (3, 0, 0) + assert sliced.shape == (4, 20, 30) + + # Ellipsis at the start + sliced = multidim_array[..., 10:20] + assert sliced.origin == (0, 0, 10) + assert sliced.shape == (10, 20, 10) + + +class TestWithDomain: + """Tests for the with_domain() method.""" + + @pytest.fixture + def base_array(self) -> Array: + """Create a base array for testing.""" + store = MemoryStore() + zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4", fill_value=0) + arr = Array.open(store) + arr.setitem(slice(None), np.arange(100, dtype="i4")) + return arr + + def test_with_domain(self, base_array: Array) -> None: + """Test that with_domain remaps domain coordinates to storage coordinates. + + with_domain() creates a view where domain.origin maps to storage coordinate 0. + This follows TensorStore's IndexTransform semantic. + """ + # Create new domain [10, 20) - this will MAP to storage [0, 10) + new_domain = IndexDomain(inclusive_min=(10,), exclusive_max=(20,)) + + # Use with_domain to create a new view + view = base_array.with_domain(new_domain) + + # Check properties + assert view.origin == (10,) + assert view.shape == (10,) + assert view.domain == new_domain + + # Resolve and check data + # Domain [10, 20) maps to storage [0, 10), so we get storage[0:10] + data = view.resolve() + expected = np.arange(0, 10, dtype="i4") + np.testing.assert_array_equal(data, expected) + + def test_with_domain_beyond_bounds(self, base_array: Array) -> None: + """Test that with_domain allows domains beyond storage bounds. + + When domain extends beyond storage (after coordinate remapping), + out-of-bounds regions are filled with fill_value. 
+ """ + # Create a domain [90, 110) which maps to storage [0, 20) + # But storage only has 100 elements, so storage [0, 20) is valid + # This test should use a domain that goes beyond the remapped storage bounds + # Let's use domain [0, 110) which maps to storage [0, 110) - last 10 are OOB + extended_domain = IndexDomain(inclusive_min=(0,), exclusive_max=(110,)) + view = base_array.with_domain(extended_domain) + + assert view.origin == (0,) + assert view.shape == (110,) + + # Domain [0, 110) maps to storage [0, 110) + # Storage only has [0, 100), so last 10 values are fill_value (0) + data = view.resolve() + expected = np.concatenate([np.arange(0, 100, dtype="i4"), np.zeros(10, dtype="i4")]) + np.testing.assert_array_equal(data, expected) + + def test_with_domain_negative_origin(self, base_array: Array) -> None: + """Test with_domain with negative origin. + + Domain with negative origin maps negative coords to storage coordinates. + Domain.origin maps to storage 0, so domain -5 maps to storage 0. + """ + # Create a domain with negative origin [-5, 5) + # This maps to storage [0, 10) + neg_domain = IndexDomain(inclusive_min=(-5,), exclusive_max=(5,)) + view = base_array.with_domain(neg_domain) + + assert view.origin == (-5,) + assert view.shape == (10,) + + # Domain [-5, 5) maps to storage [0, 10) + # So we get storage[0:10] = [0, 1, 2, ..., 9] + data = view.resolve() + expected = np.arange(0, 10, dtype="i4") + np.testing.assert_array_equal(data, expected) + + def test_with_domain_wrong_ndim(self, base_array: Array) -> None: + """Test that with_domain raises error for wrong number of dimensions.""" + wrong_ndim = IndexDomain(inclusive_min=(0, 0), exclusive_max=(10, 10)) + + with pytest.raises(ValueError, match="same number of dimensions"): + base_array.with_domain(wrong_ndim) + + def test_with_domain_preserves_store(self, base_array: Array) -> None: + """Test that with_domain preserves the store reference.""" + new_domain = IndexDomain(inclusive_min=(50,), exclusive_max=(60,)) + view = base_array.with_domain(new_domain) + + # Should share the same store + assert view.store is base_array.store + assert view.store_path == base_array.store_path + + +class TestAbsoluteIndexing: + """Tests for TensorStore-style absolute coordinate indexing. + + Key insight: indices are ABSOLUTE coordinates in the domain, not offsets. + Negative indices mean negative coordinates, not "from the end". 
+ """ + + @pytest.fixture + def standard_array(self) -> Array: + """Create a standard array with domain [0, 10).""" + store = MemoryStore() + zarr.create_array(store, shape=(10,), chunks=(5,), dtype="i4", fill_value=0) + arr = Array.open(store) + arr.setitem(slice(None), np.arange(10, dtype="i4")) + return arr + + @pytest.fixture + def shifted_array(self) -> Array: + """Create an array with domain [10, 20).""" + store = MemoryStore() + zarr.create_array(store, shape=(10,), chunks=(5,), dtype="i4", fill_value=0) + arr = Array.open(store) + arr.setitem(slice(None), np.arange(10, dtype="i4")) + # Shift domain to [10, 20) + return arr.with_domain(IndexDomain(inclusive_min=(10,), exclusive_max=(20,))) + + @pytest.fixture + def negative_domain_array(self) -> Array: + """Create an array with domain [-5, 5).""" + store = MemoryStore() + zarr.create_array(store, shape=(10,), chunks=(5,), dtype="i4", fill_value=-1) + arr = Array.open(store) + arr.setitem(slice(None), np.arange(10, dtype="i4")) + # Shift domain to [-5, 5) + return arr.with_domain(IndexDomain(inclusive_min=(-5,), exclusive_max=(5,))) + + def test_absolute_integer_index(self, standard_array: Array) -> None: + """Test that integer indices are absolute coordinates.""" + # arr[5] means coordinate 5, not "5th element" + sliced = standard_array[5] + assert sliced.origin == (5,) + assert sliced.shape == (1,) + + data = sliced.resolve() + assert data == 5 + + def test_absolute_index_shifted_domain(self, shifted_array: Array) -> None: + """Test absolute indexing with a shifted domain.""" + # Domain is [10, 20), so arr[15] selects coordinate 15 + sliced = shifted_array[15] + assert sliced.origin == (15,) + assert sliced.shape == (1,) + + # Coordinate 15 maps to storage index 5 (since domain starts at 10) + data = sliced.resolve() + assert data == 5 + + def test_index_below_domain_raises(self, shifted_array: Array) -> None: + """Test that indexing below domain raises error.""" + # Domain is [10, 20), so coordinate 5 is out of bounds + with pytest.raises(IndexError, match="out of bounds"): + shifted_array[5] + + def test_negative_index_is_coordinate(self, negative_domain_array: Array) -> None: + """Test that negative indices are actual coordinates, not 'from end'.""" + # Domain is [-5, 5), so arr[-3] means coordinate -3 + sliced = negative_domain_array[-3] + assert sliced.origin == (-3,) + assert sliced.shape == (1,) + + # Coordinate -3 maps to storage index 2 (since domain starts at -5) + data = sliced.resolve() + assert data == 2 + + def test_negative_index_out_of_bounds(self, standard_array: Array) -> None: + """Test that negative indices outside domain raise errors.""" + # Domain is [0, 10), so -1 is out of bounds (it's not "last element") + with pytest.raises(IndexError, match="out of bounds"): + standard_array[-1] + + def test_absolute_slice(self, shifted_array: Array) -> None: + """Test that slice bounds are absolute coordinates.""" + # Domain is [10, 20), slice [12:18) + sliced = shifted_array[12:18] + assert sliced.origin == (12,) + assert sliced.shape == (6,) + + data = sliced.resolve() + # Coordinates 12-17 map to storage indices 2-7 + expected = np.arange(2, 8, dtype="i4") + np.testing.assert_array_equal(data, expected) + + def test_slice_with_negative_coordinates(self, negative_domain_array: Array) -> None: + """Test slicing with negative coordinate bounds.""" + # Domain is [-5, 5), slice [-3:2) + sliced = negative_domain_array[-3:2] + assert sliced.origin == (-3,) + assert sliced.shape == (5,) + + data = sliced.resolve() + # 
Coordinates -3 to 1 map to storage indices 2-6 + expected = np.arange(2, 7, dtype="i4") + np.testing.assert_array_equal(data, expected) + + def test_slice_clamps_to_domain(self, standard_array: Array) -> None: + """Test that slices clamp to domain bounds (no error for OOB slices).""" + # Domain is [0, 10), slice [5:100) clamps to [5:10) + sliced = standard_array[5:100] + assert sliced.origin == (5,) + assert sliced.shape == (5,) + + def test_slice_before_domain_clamps(self, shifted_array: Array) -> None: + """Test that slices starting before domain clamp correctly.""" + # Domain is [10, 20), slice [0:15) clamps to [10:15) + sliced = shifted_array[0:15] + assert sliced.origin == (10,) + assert sliced.shape == (5,) + + def test_chained_absolute_indexing(self, standard_array: Array) -> None: + """Test chaining with absolute coordinates.""" + # First slice: [2:8) -> domain [2, 8) + first = standard_array[2:8] + assert first.origin == (2,) + assert first.shape == (6,) + + # Second slice: [4:6) - these are absolute coordinates within [2, 8) + second = first[4:6] + assert second.origin == (4,) + assert second.shape == (2,) + + data = second.resolve() + expected = np.arange(4, 6, dtype="i4") + np.testing.assert_array_equal(data, expected) + + +class TestResolve: + """Tests for the resolve() method that materializes data.""" + + @pytest.fixture + def filled_array(self) -> Array: + """Create an array filled with sequential data.""" + store = MemoryStore() + zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4", fill_value=0) + arr = Array.open(store) + arr.setitem(slice(None), np.arange(100, dtype="i4")) + return arr + + def test_resolve_full_array(self, filled_array: Array) -> None: + """Test resolving the full array.""" + data = filled_array.resolve() + expected = np.arange(100, dtype="i4") + np.testing.assert_array_equal(data, expected) + + def test_resolve_sliced_array(self, filled_array: Array) -> None: + """Test resolving a sliced array.""" + sliced = filled_array[25:75] + data = sliced.resolve() + expected = np.arange(25, 75, dtype="i4") + np.testing.assert_array_equal(data, expected) + + def test_resolve_chain_slices(self, filled_array: Array) -> None: + """Test resolving after chaining multiple slices.""" + # Chain multiple slices with absolute coordinates + result = filled_array[10:90][30:70][40:60] + data = result.resolve() + expected = np.arange(40, 60, dtype="i4") + np.testing.assert_array_equal(data, expected) + + +class TestIntegerIndexing: + """Tests for integer (single element) indexing.""" + + @pytest.fixture + def array_1d(self) -> Array: + """Create a 1D array.""" + store = MemoryStore() + zarr.create_array(store, shape=(10,), chunks=(5,), dtype="i4", fill_value=0) + arr = Array.open(store) + arr.setitem(slice(None), np.arange(10, dtype="i4")) + return arr + + @pytest.fixture + def array_3d(self) -> Array: + """Create a 3D array.""" + store = MemoryStore() + zarr.create_array(store, shape=(5, 6, 7), chunks=(2, 3, 4), dtype="i4", fill_value=0) + arr = Array.open(store) + data = np.arange(5 * 6 * 7, dtype="i4").reshape((5, 6, 7)) + arr.setitem(slice(None), data) + return arr + + def test_integer_index_preserves_dimension(self, array_1d: Array) -> None: + """Test that integer indexing preserves the dimension (unlike NumPy).""" + # In lazy indexing, arr[5] should give a length-1 array, not drop the dimension + sliced = array_1d[5] + assert sliced.ndim == 1 + assert sliced.shape == (1,) + assert sliced.origin == (5,) + + def test_integer_index_3d(self, array_3d: Array) -> 
None: + """Test integer indexing in 3D.""" + # Single integer should give a length-1 slice in that dimension + sliced = array_3d[2] + assert sliced.shape == (1, 6, 7) + assert sliced.origin == (2, 0, 0) + + def test_mixed_integer_slice(self, array_3d: Array) -> None: + """Test mixing integer and slice indexing.""" + sliced = array_3d[2, 1:4, 3] + assert sliced.shape == (1, 3, 1) + assert sliced.origin == (2, 1, 3) + + +class TestEdgeCases: + """Test edge cases and error handling.""" + + @pytest.fixture + def array_1d(self) -> Array: + """Create a 1D array.""" + store = MemoryStore() + zarr.create_array(store, shape=(10,), chunks=(5,), dtype="i4", fill_value=0) + return Array.open(store) + + def test_empty_slice(self, array_1d: Array) -> None: + """Test that an empty slice creates a zero-size array.""" + sliced = array_1d[5:5] + assert sliced.shape == (0,) + assert sliced.origin == (5,) + + def test_slice_step_not_one_raises(self, array_1d: Array) -> None: + """Test that step != 1 raises an error.""" + with pytest.raises(IndexError, match="only supports step=1"): + array_1d[::2] + + def test_too_many_indices(self, array_1d: Array) -> None: + """Test that too many indices raises an error.""" + with pytest.raises(IndexError, match="too many indices"): + array_1d[1, 2] + + def test_slice_clamps_to_bounds(self, array_1d: Array) -> None: + """Test that slices clamp to array bounds (like NumPy).""" + # Slice extends beyond bounds + sliced = array_1d[5:100] + assert sliced.shape == (5,) # Clamped to (5, 10) + assert sliced.origin == (5,) + assert sliced.domain.exclusive_max == (10,) + + def test_open_with_custom_domain(self) -> None: + """Test opening an array with a custom domain.""" + store = MemoryStore() + zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4", fill_value=0) + + # Open with a custom domain + custom_domain = IndexDomain(inclusive_min=(25,), exclusive_max=(75,)) + arr = Array.open(store, domain=custom_domain) + + assert arr.domain == custom_domain + assert arr.origin == (25,) + assert arr.shape == (50,) + + +class TestMerge: + """Tests for merge with unified Array type.""" + + @pytest.fixture + def base_array(self) -> Array: + """Create a base array for testing.""" + store = MemoryStore() + zarr.create_array(store, shape=(100,), chunks=(10,), dtype="i4", fill_value=-1) + arr = Array.open(store) + arr.setitem(slice(None), np.arange(100, dtype="i4")) + return arr + + def test_merge_basic(self, base_array: Array) -> None: + """Test basic concatenation of two slices.""" + left = base_array[0:30] + right = base_array[70:100] + + combined = merge([left, right]) + + assert combined.domain == IndexDomain(inclusive_min=(0,), exclusive_max=(100,)) + assert combined.shape == (100,) + assert combined.dtype == base_array.dtype + + def test_merge_resolve(self, base_array: Array) -> None: + """Test that merge resolves correctly with gaps filled.""" + left = base_array[0:30] + right = base_array[70:100] + + combined = merge([left, right]) + data = combined.resolve() + + # Check that we got the expected data + np.testing.assert_array_equal(data[0:30], np.arange(0, 30, dtype="i4")) + np.testing.assert_array_equal(data[70:100], np.arange(70, 100, dtype="i4")) + # Gap should be filled with fill_value (-1) + np.testing.assert_array_equal(data[30:70], np.full(40, -1, dtype="i4")) + + def test_merge_inverts_slicing(self, base_array: Array) -> None: + """Test that merge is the inverse of slicing.""" + # Slice into chunks + chunks = [base_array[i : i + 10] for i in range(0, 100, 10)] + + # 
Reassemble + reassembled = merge(chunks) + + # Should be identical to original + np.testing.assert_array_equal(reassembled.resolve(), base_array.resolve()) + + def test_merge_overlapping_last_wins(self, base_array: Array) -> None: + """Test that overlapping regions use last-write-wins.""" + # Create two overlapping slices + a = base_array[0:60] + b = base_array[40:100] + + # b comes after a, so b's data should win in [40, 60) + combined = merge([a, b]) + data = combined.resolve() + + # All data should match original since both come from same source + np.testing.assert_array_equal(data, base_array.resolve()) + + def test_merge_with_explicit_domain(self, base_array: Array) -> None: + """Test merge with explicitly specified domain.""" + left = base_array[10:30] + right = base_array[70:90] + + # Specify a smaller domain than bounding box + explicit_domain = IndexDomain(inclusive_min=(20,), exclusive_max=(80,)) + combined = merge([left, right], domain=explicit_domain) + + assert combined.domain == explicit_domain + assert combined.shape == (60,) + + data = combined.resolve() + # left contributes [20, 30), right contributes [70, 80) + np.testing.assert_array_equal(data[0:10], np.arange(20, 30, dtype="i4")) + np.testing.assert_array_equal(data[50:60], np.arange(70, 80, dtype="i4")) + # Gap is fill_value + np.testing.assert_array_equal(data[10:50], np.full(40, -1, dtype="i4")) + + def test_merge_custom_fill_value(self, base_array: Array) -> None: + """Test merge with custom fill value.""" + left = base_array[0:30] + right = base_array[70:100] + + combined = merge([left, right], fill_value=999) + data = combined.resolve() + + # Gap should be filled with custom fill_value + np.testing.assert_array_equal(data[30:70], np.full(40, 999, dtype="i4")) + + def test_merge_preserves_dtype(self, base_array: Array) -> None: + """Test that merge preserves dtype.""" + left = base_array[0:50] + right = base_array[50:100] + + combined = merge([left, right]) + assert combined.dtype == base_array.dtype + + data = combined.resolve() + assert data.dtype == base_array.dtype + + def test_merge_single_array(self, base_array: Array) -> None: + """Test merge with a single array.""" + sliced = base_array[25:75] + combined = merge([sliced]) + + assert combined.domain == sliced.domain + np.testing.assert_array_equal(combined.resolve(), sliced.resolve()) + + def test_merge_empty_raises(self) -> None: + """Test that merge with no arrays raises.""" + with pytest.raises(ValueError, match="at least one array"): + merge([]) + + def test_merge_mismatched_ndim_raises(self, base_array: Array) -> None: + """Test that merge with mismatched dimensions raises.""" + store = MemoryStore() + zarr.create_array(store, shape=(10, 10), chunks=(5, 5), dtype="i4", fill_value=0) + arr_2d = Array.open(store) + + with pytest.raises(ValueError, match="same number of dimensions"): + merge([base_array[0:10], arr_2d[0:5, 0:5]]) + + def test_merge_mismatched_dtype_raises(self) -> None: + """Test that merge with mismatched dtypes raises.""" + store1 = MemoryStore() + zarr.create_array(store1, shape=(10,), chunks=(5,), dtype="i4", fill_value=0) + arr1 = Array.open(store1) + + store2 = MemoryStore() + zarr.create_array(store2, shape=(10,), chunks=(5,), dtype="f8", fill_value=0) + arr2 = Array.open(store2) + + with pytest.raises(ValueError, match="same dtype"): + merge([arr1, arr2]) + + def test_merge_2d(self) -> None: + """Test merge with 2D arrays.""" + store = MemoryStore() + zarr.create_array(store, shape=(20, 20), chunks=(10, 10), dtype="i4", 
fill_value=-1) + arr = Array.open(store) + arr.setitem(slice(None), np.arange(400, dtype="i4").reshape(20, 20)) + + # Slice into quadrants + top_left = arr[0:10, 0:10] + top_right = arr[0:10, 10:20] + bottom_left = arr[10:20, 0:10] + bottom_right = arr[10:20, 10:20] + + # Reassemble + combined = merge([top_left, top_right, bottom_left, bottom_right]) + + assert combined.domain == arr.domain + np.testing.assert_array_equal(combined.resolve(), arr.resolve()) + + def test_merge_2d_with_gap(self) -> None: + """Test 2D merge with a gap.""" + store = MemoryStore() + zarr.create_array(store, shape=(20, 20), chunks=(10, 10), dtype="i4", fill_value=-1) + arr = Array.open(store) + arr.setitem(slice(None), np.arange(400, dtype="i4").reshape(20, 20)) + + # Only top-left and bottom-right + top_left = arr[0:10, 0:10] + bottom_right = arr[10:20, 10:20] + + combined = merge([top_left, bottom_right]) + + assert combined.domain == IndexDomain(inclusive_min=(0, 0), exclusive_max=(20, 20)) + + data = combined.resolve() + # Top-left should have data + np.testing.assert_array_equal(data[0:10, 0:10], arr.resolve()[0:10, 0:10]) + # Bottom-right should have data + np.testing.assert_array_equal(data[10:20, 10:20], arr.resolve()[10:20, 10:20]) + # Gaps should be fill_value + np.testing.assert_array_equal(data[0:10, 10:20], np.full((10, 10), -1, dtype="i4")) + np.testing.assert_array_equal(data[10:20, 0:10], np.full((10, 10), -1, dtype="i4")) + + def test_merge_repr(self, base_array: Array) -> None: + """Test concatenated Array string representation.""" + combined = merge([base_array[0:30], base_array[70:100]]) + repr_str = repr(combined) + assert "Array" in repr_str + assert "sources=2" in repr_str + + def test_merge_returns_array(self, base_array: Array) -> None: + """Test that merge returns an Array.""" + combined = merge([base_array[0:50], base_array[50:100]]) + assert isinstance(combined, Array) + + def test_merge_nested(self, base_array: Array) -> None: + """Test that concatenated Arrays can be nested.""" + # Create two concatenated Arrays + left_concat = merge([base_array[0:20], base_array[20:40]]) + right_concat = merge([base_array[60:80], base_array[80:100]]) + + # Concat them together + combined = merge([left_concat, right_concat]) + + assert combined.domain == IndexDomain(inclusive_min=(0,), exclusive_max=(100,)) + + data = combined.resolve() + np.testing.assert_array_equal(data[0:40], np.arange(0, 40, dtype="i4")) + np.testing.assert_array_equal(data[60:100], np.arange(60, 100, dtype="i4")) + # Gap filled + np.testing.assert_array_equal(data[40:60], np.full(20, -1, dtype="i4")) + + def test_merge_slicing(self, base_array: Array) -> None: + """Test slicing a concatenated Array.""" + combined = merge([base_array[0:30], base_array[70:100]]) + + # Slice the combined array + sliced = combined[20:80] + + assert sliced.domain == IndexDomain(inclusive_min=(20,), exclusive_max=(80,)) + assert sliced.shape == (60,) + + data = sliced.resolve() + np.testing.assert_array_equal(data[0:10], np.arange(20, 30, dtype="i4")) + np.testing.assert_array_equal(data[50:60], np.arange(70, 80, dtype="i4")) + np.testing.assert_array_equal(data[10:50], np.full(40, -1, dtype="i4")) + + def test_merge_from_chunk_layout(self, base_array: Array) -> None: + """Test reassembling an array from its chunks using chunk_layout.""" + layout = base_array.chunk_layout + + # Get each chunk as a slice + chunks = [] + for _chunk_coords, chunk_domain in layout.iter_chunk_domains(base_array.domain): + chunk_slice = 
base_array[chunk_domain.inclusive_min[0] : chunk_domain.exclusive_max[0]] + chunks.append(chunk_slice) + + # Reassemble + reassembled = merge(chunks) + + np.testing.assert_array_equal(reassembled.resolve(), base_array.resolve())