diff --git a/Cargo.lock b/Cargo.lock index f1cd5d9..6c0b3c3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5247,6 +5247,7 @@ dependencies = [ "arrow-array", "arrow-buffer", "arrow-schema", + "async-trait", "bytes", "dlpark", "half", diff --git a/Cargo.toml b/Cargo.toml index 97081e7..14f3569 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,7 @@ abi3-py311 = ["pyo3/abi3-py311"] arrow-array = "59" arrow-buffer = "59" arrow-schema = "59" +async-trait = "0.1" bytes = "1.12.0" dlpark = { git = "https://github.com/kylebarron/dlpark", rev = "31c6f49c064e634326c97172d39a00acecd854b6", features = [ "pyo3", diff --git a/dev-docs/specs/2026-06-25-read-only-array-design.md b/dev-docs/specs/2026-06-25-read-only-array-design.md new file mode 100644 index 0000000..149e780 --- /dev/null +++ b/dev-docs/specs/2026-06-25-read-only-array-design.md @@ -0,0 +1,131 @@ +# Read-only array via a `ReadOnly` storage adapter + +**Date:** 2026-06-25 +**Status:** Approved, ready for implementation plan + +## Problem + +`PyArray` wraps `Array` — the maximal +zarrs storage trait — and all write methods (`store_chunk`, `store_encoded_chunk`, +`erase_chunk`, `erase_metadata`, `compact_chunk`) live directly on it. + +We want a `read_only()` method that returns an array which **raises at runtime** if +the user attempts to mutate it. The motivation is guarding against *accidental* +mutation of an array to which the user actually has write access. + +zarrs' built-in `Array::readable()` downgrades to `Array`, +a strictly less-capable *type* that `PyArray::new` will not accept (compile error: +`expected trait ReadableWritableListableStorageTraits, found trait +ReadableStorageTraits`). A compile-time-distinct read-only type is also the wrong +model for our future needs: stores like read-only S3 (via `ObjectStore`) or future +Python-protocol-backed stores only know they are read-only **at runtime**, not at +compile time. + +## Decision + +Introduce a `ReadOnly` storage **adapter** that wraps a readable + listable store +and **satisfies the maximal `ReadableWritableListableStorageTraits`** by implementing +its write methods as runtime errors. This keeps the entire existing stack (one +`PyArray` type over one storage trait) unchanged — no new pyclass, no type surgery — +while making writes fail at runtime instead of compile time. + +This works because zarrs provides a blanket impl: any `T: Readable + Writable + +Listable + 'static` automatically implements `ReadableWritableListableStorageTraits` +(`zarrs_storage/src/storage_sync.rs:254`). So an adapter that *delegates* reads and +lists and *errors* on writes transparently fills the maximal-trait slot. + +Rejected alternative — a separate `ReadOnlyArray` pyclass over +`Array`: gives compile-time guarantees but (a) requires +duplicating/factoring all read + metadata methods into shared macros for a second +type, and (b) cannot represent stores whose read-only-ness is only known at runtime, +which is the more important real-world case. + +## Components + +### 1. `ReadOnly` — sync adapter + +New file: `src/storage/read_only.rs`. + +```rust +pub struct ReadOnly(Arc); + +impl ReadOnly { + pub fn new(inner: Arc) -> Self { Self(inner) } +} +``` + +Trait impls (over `T: ?Sized + ReadableListableStorageTraits` as appropriate): + +- `ReadableStorageTraits` — delegate every method to `self.0` + (`get_partial_many`, `size_key`, `supports_get_partial`). +- `ListableStorageTraits` — delegate every method to `self.0` + (`list`, `list_prefix`, `list_dir`, `size_prefix`). +- `WritableStorageTraits` — **every** method returns + `Err(StorageError::ReadOnly)`: + - `set` → `Err(StorageError::ReadOnly)` + - `set_partial_many` → `Err(StorageError::ReadOnly)` + - `erase` → `Err(StorageError::ReadOnly)` + - `erase_prefix` → `Err(StorageError::ReadOnly)` + - `supports_set_partial` → `false` + +`StorageError::ReadOnly` already exists with the message "a write operation was +attempted on a read only store" (`zarrs_storage/src/lib.rs:170`). It flows through +the existing `ZarristaError` conversion and surfaces as a Python exception. + +The blanket impl then yields `ReadableWritableListableStorageTraits` for free. + +### 2. `AsyncReadOnly` — async adapter + +Same file (or `src/storage/read_only.rs` shared). Mirrors `ReadOnly` against the +async traits (`AsyncReadableStorageTraits`, `AsyncListableStorageTraits`, +`AsyncWritableStorageTraits`), whose method set is identical with `async fn`. Write +methods return `Err(StorageError::ReadOnly)`. The async blanket impl +(`storage_async.rs`) yields `AsyncReadableWritableListableStorageTraits`. + +### 3. `PyArray::read_only` (sync) + +Replace the current non-compiling body in `src/array/sync.rs`: + +```rust +fn read_only(&self) -> Self { + let inner = self.inner.storage().readable_listable(); // RWL -> RL + let storage = Arc::new(ReadOnly::new(inner)); // RL -> faked RWL + Self::new(Arc::new(self.inner.with_storage(storage))) +} +``` + +- `Array::storage()` (`zarrs/src/array.rs:710`) → `Arc`. +- `.readable_listable()` (`storage_sync.rs:248`) downgrades to `Arc`. +- `Array::with_storage(storage)` (`zarrs/src/array.rs:420`) rebuilds the `Array` + with the same metadata over the new storage. + +### 4. `PyAsyncArray::read_only` (async) + +Same shape in `src/array/async.rs`, using `AsyncReadOnly` and the async +`readable_listable()`. + +## Data flow + +- Reads (`__getitem__`, `retrieve_array_subset`, `retrieve_chunk`, + `retrieve_encoded_chunk`) and metadata accessors: pass through the adapter + untouched. +- Writes (`store_chunk`, `store_encoded_chunk`, `erase_chunk`, `erase_metadata`, + `compact_chunk`): hit the adapter's erroring write methods and raise + `StorageError::ReadOnly` → Python exception. + +## Out of scope (YAGNI for this pass) + +- A `read_only` boolean / introspection property on the array. Future direction: + expose `array.store`, and put `is_read_only` on that store object instead. +- Wrapping genuinely read-only stores at the `PySyncStorage`/`PyAsyncStorage` + boundary (S3, Python-protocol stores). The `ReadOnly` adapter built here is the + reusable mechanism that will enable it; the boundary wiring is a separate change. + +## Testing + +- `read_only()` returns an array that still reads correctly (round-trip a subset + read equals the source array's read). +- Each write method on a read-only array raises (Python-level assertion that the + expected exception type is raised) for both sync and async. +- A normal (non-read-only) array still writes successfully — no regression. +``` \ No newline at end of file diff --git a/python/zarrista/_array.pyi b/python/zarrista/_array.pyi index 25257a4..01f2af2 100644 --- a/python/zarrista/_array.pyi +++ b/python/zarrista/_array.pyi @@ -33,6 +33,16 @@ class Array: @staticmethod def open(store: FilesystemStore | MemoryStore, path: str = "/") -> Array: """Open the array stored at `path` in `store`.""" + @staticmethod + def from_metadata( + metadata: ArrayMetadataV3, + store: FilesystemStore | MemoryStore, + path: str = "/", + ) -> Array: + """Use the provided metadata to open a new array at `path` in `store`. + + This does **not** write the metadata to the store. + """ @property def attrs(self) -> dict[str, JSONValue]: """The array's user attributes as a dict.""" @@ -129,6 +139,12 @@ class Array: """ def erase_metadata(self) -> None: """Delete the array's metadata from the store.""" + def read_only(self) -> Array: + """Return a read-only view of this array. + + Reads behave identically, but any write (`store_chunk`, `erase_chunk`, + `erase_metadata`, ...) raises at runtime. + """ @property def shape(self) -> list[int]: """The array shape.""" @@ -147,6 +163,16 @@ class AsyncArray: `store` may be an obstore `ObjectStore` or an icechunk `Session`. """ + @staticmethod + def from_metadata( + metadata: ArrayMetadataV3, + store: AsyncStore, + path: str = "/", + ) -> AsyncArray: + """Use the provided metadata to open a new array at `path` in `store`. + + This does **not** write the metadata to the store. + """ @property def attrs(self) -> dict[str, JSONValue]: """The array's user attributes as a dict.""" @@ -243,6 +269,12 @@ class AsyncArray: """ async def erase_metadata(self) -> None: """Delete the array's metadata from the store.""" + def read_only(self) -> AsyncArray: + """Return a read-only view of this array. + + Reads behave identically, but any write (`store_chunk`, `erase_chunk`, + `erase_metadata`, ...) raises at runtime. + """ @property def shape(self) -> list[int]: """The array shape.""" diff --git a/src/array/async.rs b/src/array/async.rs index ba418f2..74976b9 100644 --- a/src/array/async.rs +++ b/src/array/async.rs @@ -8,9 +8,10 @@ use crate::array::PyChunkIndices; use crate::array_bytes::PyArrayBytes; use crate::codec::PyCodecOptions; use crate::decoded_array::DecodedArray; -use crate::error::ZarristaError; +use crate::error::{ZarristaError, ZarristaResult}; +use crate::metadata::PyArrayMetadata; use crate::node::PyNodePath; -use crate::storage::PyAsyncStorage; +use crate::storage::{AsyncReadOnlyStorageAdapter, PyAsyncStorage}; use pyo3::prelude::*; use pyo3_async_runtimes::tokio::future_into_py; use pyo3_bytes::PyBytes; @@ -56,6 +57,24 @@ impl PyAsyncArray { ) } + /// Use the provided metadata to open a new array at `path` in `store`. + /// + /// This does **not** write to the store, use `store_metadata` to write metadata to storage. + #[staticmethod] + #[pyo3( + signature = (metadata, store, path = PyNodePath::root()), + text_signature = "(metadata, store, path='/')" + )] + fn from_metadata( + metadata: PyArrayMetadata, + store: PyAsyncStorage, + path: PyNodePath, + ) -> ZarristaResult { + let inner = + Array::new_with_metadata(store.into_inner(), path.as_str(), metadata.into_inner())?; + Ok(Self::new(Arc::new(inner))) + } + /// Open the array stored at `path` in `store`. #[staticmethod] #[pyo3( @@ -112,6 +131,13 @@ impl PyAsyncArray { }) } + /// Return a read-only view of this array; writes raise at runtime. + fn read_only(&self) -> Self { + let read_list_storage = self.inner.storage().readable_listable(); + let storage = Arc::new(AsyncReadOnlyStorageAdapter::new(read_list_storage)); + Self::new(Arc::new(self.inner.with_storage(storage))) + } + fn erase_metadata<'py>(&self, py: Python<'py>) -> PyResult> { let inner = self.inner.clone(); future_into_py(py, async move { diff --git a/src/array/sync.rs b/src/array/sync.rs index 38bcc96..be20628 100644 --- a/src/array/sync.rs +++ b/src/array/sync.rs @@ -9,8 +9,9 @@ use crate::array_bytes::PyArrayBytes; use crate::codec::PyCodecOptions; use crate::decoded_array::DecodedArray; use crate::error::ZarristaResult; +use crate::metadata::PyArrayMetadata; use crate::node::PyNodePath; -use crate::storage::PySyncStorage; +use crate::storage::{PySyncStorage, ReadOnlyStorageAdapter}; use pyo3::prelude::*; use pyo3_bytes::PyBytes; use zarrs::array::Array; @@ -50,6 +51,24 @@ impl PyArray { ) } + /// Use the provided metadata to open a new array at `path` in `store`. + /// + /// This does **not** write to the store, use `store_metadata` to write metadata to storage. + #[staticmethod] + #[pyo3( + signature = (metadata, store, path = PyNodePath::root()), + text_signature = "(metadata, store, path='/')" + )] + fn from_metadata( + metadata: PyArrayMetadata, + store: PySyncStorage, + path: PyNodePath, + ) -> ZarristaResult { + let inner = + Array::new_with_metadata(store.into_inner(), path.as_str(), metadata.into_inner())?; + Ok(Self::new(Arc::new(inner))) + } + /// Open the array stored at `path` in `store`. #[staticmethod] #[pyo3( @@ -85,6 +104,12 @@ impl PyArray { Ok(()) } + fn read_only(&self) -> Self { + let read_list_storage = self.inner.storage().readable_listable(); + let storage = Arc::new(ReadOnlyStorageAdapter::new(read_list_storage)); + Self::new(Arc::new(self.inner.with_storage(storage))) + } + /// Read a region of the array, using numpy-style basic indexing. /// /// Returns one of the decoded result classes (`Tensor`, `VariableArray`, diff --git a/src/storage/async.rs b/src/storage/async.rs index 25d3b90..01a59c9 100644 --- a/src/storage/async.rs +++ b/src/storage/async.rs @@ -1,12 +1,19 @@ use std::sync::Arc; +use async_trait::async_trait; use object_store::ObjectStore; use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::prelude::*; use pyo3::pybacked::PyBackedStr; use pyo3_bytes::PyBytes; use pyo3_object_store::AnyObjectStore; +use zarrs::storage::byte_range::ByteRangeIterator; use zarrs::storage::AsyncReadableWritableListableStorageTraits; +use zarrs::storage::{ + AsyncListableStorageTraits, AsyncMaybeBytesIterator, AsyncReadableListableStorageTraits, + AsyncReadableStorageTraits, AsyncWritableStorageTraits, Bytes, MaybeBytes, OffsetBytesIterator, + StorageError, StoreKey, StoreKeys, StoreKeysPrefixes, StorePrefix, +}; use zarrs_icechunk::AsyncIcechunkStore; use zarrs_object_store::AsyncObjectStore; @@ -123,3 +130,81 @@ impl FromPyObject<'_, '_> for PyAsyncIcechunkStore { Ok(Self(AsyncIcechunkStore::new(session))) } } + +/// An async storage adapter that reads and lists transparently but rejects all writes at runtime. +pub struct AsyncReadOnlyStorageAdapter(Arc); + +impl AsyncReadOnlyStorageAdapter { + pub fn new(inner: Arc) -> Self { + Self(inner) + } +} + +#[async_trait] +impl AsyncReadableStorageTraits for AsyncReadOnlyStorageAdapter { + async fn get(&self, key: &StoreKey) -> Result { + self.0.get(key).await + } + + async fn get_partial_many<'a>( + &'a self, + key: &StoreKey, + byte_ranges: ByteRangeIterator<'a>, + ) -> Result, StorageError> { + self.0.get_partial_many(key, byte_ranges).await + } + + async fn size_key(&self, key: &StoreKey) -> Result, StorageError> { + self.0.size_key(key).await + } + + fn supports_get_partial(&self) -> bool { + self.0.supports_get_partial() + } +} + +#[async_trait] +impl AsyncListableStorageTraits for AsyncReadOnlyStorageAdapter { + async fn list(&self) -> Result { + self.0.list().await + } + + async fn list_prefix(&self, prefix: &StorePrefix) -> Result { + self.0.list_prefix(prefix).await + } + + async fn list_dir(&self, prefix: &StorePrefix) -> Result { + self.0.list_dir(prefix).await + } + + async fn size_prefix(&self, prefix: &StorePrefix) -> Result { + self.0.size_prefix(prefix).await + } +} + +#[async_trait] +impl AsyncWritableStorageTraits for AsyncReadOnlyStorageAdapter { + async fn set(&self, _key: &StoreKey, _value: Bytes) -> Result<(), StorageError> { + Err(StorageError::ReadOnly) + } + + async fn set_partial_many<'a>( + &'a self, + _key: &StoreKey, + _offset_values: OffsetBytesIterator<'a>, + ) -> Result<(), StorageError> { + Err(StorageError::ReadOnly) + } + + async fn erase(&self, _key: &StoreKey) -> Result<(), StorageError> { + Err(StorageError::ReadOnly) + } + + async fn erase_prefix(&self, _prefix: &StorePrefix) -> Result<(), StorageError> { + Err(StorageError::ReadOnly) + } + + fn supports_set_partial(&self) -> bool { + false + } +} diff --git a/src/storage/mod.rs b/src/storage/mod.rs index ad51b68..abefccb 100644 --- a/src/storage/mod.rs +++ b/src/storage/mod.rs @@ -1,6 +1,6 @@ mod r#async; mod sync; -pub use r#async::PyAsyncStorage; +pub use r#async::{AsyncReadOnlyStorageAdapter, PyAsyncStorage}; pub(crate) use sync::PySyncStorage; -pub use sync::{PyFilesystemStore, PyMemoryStore}; +pub use sync::{PyFilesystemStore, PyMemoryStore, ReadOnlyStorageAdapter}; diff --git a/src/storage/sync.rs b/src/storage/sync.rs index 6d6afe3..1fec4d6 100644 --- a/src/storage/sync.rs +++ b/src/storage/sync.rs @@ -4,9 +4,13 @@ use pyo3::prelude::*; use std::path::PathBuf; use std::sync::Arc; use zarrs::filesystem::FilesystemStore; +use zarrs::storage::byte_range::{ByteRange, ByteRangeIterator}; use zarrs::storage::store::MemoryStore; -use zarrs::storage::ReadableWritableListableStorageTraits; - +use zarrs::storage::{ + Bytes, ListableStorageTraits, MaybeBytes, MaybeBytesIterator, OffsetBytesIterator, + ReadableListableStorageTraits, ReadableStorageTraits, ReadableWritableListableStorageTraits, + StorageError, StoreKey, StoreKeys, StoreKeysPrefixes, StorePrefix, WritableStorageTraits, +}; /// A zarrista sync store object adapted to the maximal `zarrs` storage trait. pub struct PySyncStorage(Arc); @@ -79,3 +83,86 @@ impl PyMemoryStore { "MemoryStore()".to_string() } } + +/// A storage adapter that reads and lists transparently but rejects all writes at runtime. +pub struct ReadOnlyStorageAdapter(Arc); + +impl ReadOnlyStorageAdapter { + pub fn new(inner: Arc) -> Self { + Self(inner) + } +} + +impl ReadableStorageTraits for ReadOnlyStorageAdapter { + fn get(&self, key: &StoreKey) -> Result { + self.0.get(key) + } + + fn get_partial_many<'a>( + &'a self, + key: &StoreKey, + byte_ranges: ByteRangeIterator<'a>, + ) -> Result, StorageError> { + self.0.get_partial_many(key, byte_ranges) + } + + fn get_partial( + &self, + key: &StoreKey, + byte_range: ByteRange, + ) -> Result { + self.0.get_partial(key, byte_range) + } + + fn size_key(&self, key: &StoreKey) -> Result, StorageError> { + self.0.size_key(key) + } + + fn supports_get_partial(&self) -> bool { + self.0.supports_get_partial() + } +} + +impl ListableStorageTraits for ReadOnlyStorageAdapter { + fn list(&self) -> Result { + self.0.list() + } + + fn list_prefix(&self, prefix: &StorePrefix) -> Result { + self.0.list_prefix(prefix) + } + + fn list_dir(&self, prefix: &StorePrefix) -> Result { + self.0.list_dir(prefix) + } + + fn size_prefix(&self, prefix: &StorePrefix) -> Result { + self.0.size_prefix(prefix) + } +} + +impl WritableStorageTraits for ReadOnlyStorageAdapter { + fn set(&self, _key: &StoreKey, _value: Bytes) -> Result<(), StorageError> { + Err(StorageError::ReadOnly) + } + + fn set_partial_many( + &self, + _key: &StoreKey, + _offset_values: OffsetBytesIterator, + ) -> Result<(), StorageError> { + Err(StorageError::ReadOnly) + } + + fn erase(&self, _key: &StoreKey) -> Result<(), StorageError> { + Err(StorageError::ReadOnly) + } + + fn erase_prefix(&self, _prefix: &StorePrefix) -> Result<(), StorageError> { + Err(StorageError::ReadOnly) + } + + fn supports_set_partial(&self) -> bool { + false + } +} diff --git a/tests/array/test_read_only.py b/tests/array/test_read_only.py new file mode 100644 index 0000000..3073522 --- /dev/null +++ b/tests/array/test_read_only.py @@ -0,0 +1,96 @@ +"""`Array.read_only()` returns an array that reads normally but raises on writes.""" + +import numpy as np +import pytest + +from zarrista import ( + Array, + ArrayBuilder, + ArrayBytes, + ChunkGrid, + DataType, + FillValue, + MemoryStore, +) +from zarrista.exceptions import ZarristaError + + +def _writable_array() -> Array: + """A 4x4 int8 array (single 4x4 chunk, fill 0) created in a MemoryStore.""" + return ArrayBuilder( + ChunkGrid.regular([4, 4], [4, 4]), + DataType.from_string("int8"), + FillValue(b"\x00"), + ).create(MemoryStore(), "/a") + + +def test_read_only_still_reads(): + array = _writable_array() + array.store_chunk([0, 0], ArrayBytes(np.arange(16, dtype="int8").tobytes())) + + ro = array.read_only() + + np.testing.assert_array_equal( + ro.retrieve_chunk([0, 0]).to_numpy(), + np.arange(16, dtype="int8").reshape(4, 4), + ) + assert ro.shape == [4, 4] + + +def test_read_only_store_chunk_raises(): + # The underlying StorageError::ReadOnly may be wrapped by the Array layer + # (e.g. into an ArrayError); assert the read-only failure surfaces, not the + # exact wrapper class. + ro = _writable_array().read_only() + with pytest.raises(ZarristaError, match="read only"): + ro.store_chunk([0, 0], ArrayBytes(np.zeros(16, dtype="int8").tobytes())) + + +def test_read_only_erase_metadata_raises(): + # `ArrayBuilder.create` already stored metadata, so erasing it is a real write. + ro = _writable_array().read_only() + with pytest.raises(ZarristaError, match="read only"): + ro.erase_metadata() + + +def test_writable_array_still_writes(): + """The original array (and any non-read-only array) writes without error.""" + array = _writable_array() + array.store_chunk([0, 0], ArrayBytes(np.ones(16, dtype="int8").tobytes())) + np.testing.assert_array_equal( + array.retrieve_chunk([0, 0]).to_numpy(), + np.ones((4, 4), dtype="int8"), + ) + + +# --- async --------------------------------------------------------------- + +import zarr # noqa: E402 +from obstore.store import LocalStore # noqa: E402 + +from zarrista import AsyncArray # noqa: E402 + + +async def _async_writable_array(tmp_path) -> AsyncArray: + """A 4x4 int8 array written with zarr-python, opened async over obstore.""" + path = tmp_path / "a.zarr" + z = zarr.create_array(store=str(path), shape=(4, 4), chunks=(4, 4), dtype="int8") + z[:] = np.arange(16, dtype="int8").reshape(4, 4) + return await AsyncArray.open_async(LocalStore(str(path))) + + +async def test_async_read_only_still_reads(tmp_path): + arr = await _async_writable_array(tmp_path) + ro = arr.read_only() + chunk = await ro.retrieve_chunk([0, 0]) + np.testing.assert_array_equal( + chunk.to_numpy(), + np.arange(16, dtype="int8").reshape(4, 4), + ) + + +async def test_async_read_only_store_chunk_raises(tmp_path): + arr = await _async_writable_array(tmp_path) + ro = arr.read_only() + with pytest.raises(ZarristaError, match="read only"): + await ro.store_chunk([0, 0], ArrayBytes(np.zeros(16, dtype="int8").tobytes()))