From 3630ed9fd1a69ba80280e794af5310bd49682146 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 21 May 2025 13:14:38 -0400 Subject: [PATCH] Remove Arrow from list protocol --- docs/api/arrow.md | 3 - mkdocs.yml | 3 - src/obspec/_list.py | 152 ++++---------------------------------------- src/obspec/arrow.py | 55 ---------------- 4 files changed, 14 insertions(+), 199 deletions(-) delete mode 100644 docs/api/arrow.md delete mode 100644 src/obspec/arrow.py diff --git a/docs/api/arrow.md b/docs/api/arrow.md deleted file mode 100644 index 1535ee5..0000000 --- a/docs/api/arrow.md +++ /dev/null @@ -1,3 +0,0 @@ -# Arrow - -::: obspec.arrow diff --git a/mkdocs.yml b/mkdocs.yml index 7ac3ae8..076265a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -41,7 +41,6 @@ nav: - api/put.md - api/rename.md - api/attributes.md - - api/arrow.md - CHANGELOG.md watch: @@ -121,8 +120,6 @@ plugins: signature_crossrefs: true inventories: - - https://arrow.apache.org/docs/objects.inv - - https://docs.pola.rs/api/python/stable/objects.inv - https://docs.python.org/3/objects.inv - https://filesystem-spec.readthedocs.io/en/latest/objects.inv diff --git a/src/obspec/_list.py b/src/obspec/_list.py index 061aab9..08c913a 100644 --- a/src/obspec/_list.py +++ b/src/obspec/_list.py @@ -1,32 +1,15 @@ from __future__ import annotations -from collections.abc import Sequence -from typing import ( - Generic, - Literal, - Protocol, - Self, - TypedDict, - TypeVar, - overload, -) - -from ._meta import ObjectMeta -from .arrow import ArrowArrayExportable, ArrowStreamExportable - -ListChunkType_co = TypeVar( - "ListChunkType_co", - Sequence[ObjectMeta], - ArrowArrayExportable, - ArrowStreamExportable, - covariant=True, -) -"""The data structure used for holding list results. - -By default, listing APIs return a `list` of [`ObjectMeta`][obspec.ObjectMeta]. However -for improved performance when listing large buckets, you can pass `return_arrow=True`. -Then an Arrow `RecordBatch` will be returned instead. -""" +from typing import TYPE_CHECKING, Generic, Protocol, Self, TypedDict, TypeVar + +if TYPE_CHECKING: + from collections.abc import Sequence + + from ._meta import ObjectMeta + + +ListChunkType_co = TypeVar("ListChunkType_co", covariant=True) +"""The data structure used for holding list results.""" class ListResult(TypedDict, Generic[ListChunkType_co]): @@ -85,32 +68,13 @@ async def __anext__(self) -> ListChunkType_co: class List(Protocol): - @overload - def list( - self, - prefix: str | None = None, - *, - offset: str | None = None, - chunk_size: int = 50, - return_arrow: Literal[True], - ) -> ListIterator[ArrowArrayExportable]: ... - @overload - def list( - self, - prefix: str | None = None, - *, - offset: str | None = None, - chunk_size: int = 50, - return_arrow: Literal[False] = False, - ) -> ListIterator[Sequence[ObjectMeta]]: ... def list( self, prefix: str | None = None, *, offset: str | None = None, chunk_size: int = 50, - return_arrow: bool = False, - ) -> ListIterator[ArrowArrayExportable] | ListIterator[Sequence[ObjectMeta]]: + ) -> ListIterator[Sequence[ObjectMeta]]: """List all the objects with the given prefix. Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of @@ -136,28 +100,6 @@ def list( break ``` - Return large list results as [Arrow](https://arrow.apache.org/). This is most - useful with large list operations. In this case you may want to increase the - `chunk_size` parameter. - - ```py - stream = obs.list(store, chunk_size=1000, return_arrow=True) - # Stream is now an iterable/async iterable of `RecordBatch`es - for batch in stream: - print(batch.num_rows) # 100 - - # If desired, convert to a pyarrow RecordBatch (zero-copy) with - # `pyarrow.record_batch(batch)` - break - ``` - - Collect all list results into a single Arrow `RecordBatch`. - - ```py - stream = obs.list(store, return_arrow=True) - batch = stream.collect() - ``` - !!! note The order of returned [`ObjectMeta`][obspec.ObjectMeta] is not guaranteed @@ -171,10 +113,6 @@ def list( chunk_size: The number of items to collect per chunk in the returned (async) iterator. All chunks except for the last one will have this many items. This is ignored in [`collect`][obspec.ListIterator.collect]. - return_arrow: If `True`, return each batch of list items as an Arrow - `RecordBatch`, not as a list of Python `dict`s. Arrow removes - serialization overhead between Rust and Python and so this can be - significantly faster for large list operations. Defaults to `False`. Returns: A ListStream, which you can iterate through to access list results. @@ -184,32 +122,13 @@ def list( class ListAsync(Protocol): - @overload def list_async( self, prefix: str | None = None, *, offset: str | None = None, chunk_size: int = 50, - return_arrow: Literal[True], - ) -> ListStream[ArrowArrayExportable]: ... - @overload - def list_async( - self, - prefix: str | None = None, - *, - offset: str | None = None, - chunk_size: int = 50, - return_arrow: Literal[False] = False, - ) -> ListStream[Sequence[ObjectMeta]]: ... - def list_async( - self, - prefix: str | None = None, - *, - offset: str | None = None, - chunk_size: int = 50, - return_arrow: bool = False, - ) -> ListStream[ArrowArrayExportable] | ListStream[Sequence[ObjectMeta]]: + ) -> ListStream[Sequence[ObjectMeta]]: """List all the objects with the given prefix. Note that this method itself is **not async**. It's a synchronous method but @@ -243,10 +162,6 @@ def list_async( (async) iterator. All chunks except for the last one will have this many items. This is ignored in [`collect_async`][obspec.ListStream.collect_async]. - return_arrow: If `True`, return each batch of list items as an Arrow - `RecordBatch`, not as a list of Python `dict`s. Arrow removes - serialization overhead between Rust and Python and so this can be - significantly faster for large list operations. Defaults to `False`. Returns: A ListStream, which you can iterate through to access list results. @@ -256,26 +171,10 @@ def list_async( class ListWithDelimiter(Protocol): - @overload def list_with_delimiter( self, prefix: str | None = None, - *, - return_arrow: Literal[True], - ) -> ListResult[ArrowStreamExportable]: ... - @overload - def list_with_delimiter( - self, - prefix: str | None = None, - *, - return_arrow: Literal[False] = False, - ) -> ListResult[Sequence[ObjectMeta]]: ... - def list_with_delimiter( - self, - prefix: str | None = None, - *, - return_arrow: bool = False, - ) -> ListResult[ArrowStreamExportable] | ListResult[Sequence[ObjectMeta]]: + ) -> ListResult[Sequence[ObjectMeta]]: """List objects with the given prefix and an implementation specific delimiter. @@ -294,13 +193,6 @@ def list_with_delimiter( Args: prefix: The prefix within ObjectStore to use for listing. Defaults to None. - Keyword Args: - return_arrow: If `True`, return list results as an Arrow - `Table`, not as a list of Python `dict`s. Arrow removes serialization - overhead between Rust and Python and so this can be significantly faster - for large list operations. Defaults to `False`. - - Returns: ListResult @@ -309,26 +201,10 @@ def list_with_delimiter( class ListWithDelimiterAsync(Protocol): - @overload - async def list_with_delimiter_async( - self, - prefix: str | None = None, - *, - return_arrow: Literal[True], - ) -> ListResult[ArrowStreamExportable]: ... - @overload async def list_with_delimiter_async( self, prefix: str | None = None, - *, - return_arrow: Literal[False] = False, - ) -> ListResult[Sequence[ObjectMeta]]: ... - async def list_with_delimiter_async( - self, - prefix: str | None = None, - *, - return_arrow: bool = False, - ) -> ListResult[ArrowStreamExportable] | ListResult[Sequence[ObjectMeta]]: + ) -> ListResult[Sequence[ObjectMeta]]: """Call `list_with_delimiter` asynchronously. Refer to the documentation for diff --git a/src/obspec/arrow.py b/src/obspec/arrow.py deleted file mode 100644 index 3cdf2e2..0000000 --- a/src/obspec/arrow.py +++ /dev/null @@ -1,55 +0,0 @@ -"""Arrow protocol type hints for use in [list][obspec.List] calls.""" - -from __future__ import annotations - -from typing import Protocol - - -class ArrowArrayExportable(Protocol): - """An object with an `__arrow_c_array__` method. - - Supported objects include: - - - arro3 `Array` or `RecordBatch` objects. - - pyarrow `Array` or `RecordBatch` objects - - Such an object implements the [Arrow C Data Interface - interface](https://arrow.apache.org/docs/format/CDataInterface.html) via the - [Arrow PyCapsule - Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). - This allows for zero-copy Arrow data interchange across libraries. - """ - - def __arrow_c_array__( - self, - requested_schema: object | None = None, - ) -> tuple[object, object]: - """Return Arrow C data interface PyCapsules for the object.""" - ... - - -class ArrowStreamExportable(Protocol): - """An object with an `__arrow_c_stream__` method. - - Supported objects include: - - - arro3 `Table`, `RecordBatchReader`, `ChunkedArray`, or `ArrayReader` objects. - - Polars `Series` or `DataFrame` objects (polars v1.2 or higher) - - pyarrow `RecordBatchReader`, `Table`, or `ChunkedArray` objects (pyarrow v14 or - higher) - - pandas `DataFrame`s (pandas v2.2 or higher) - - ibis `Table` objects. - - For an up to date list of supported objects, see [this - issue](https://github.com/apache/arrow/issues/39195#issuecomment-2245718008). - - Such an object implements the [Arrow C Stream - interface](https://arrow.apache.org/docs/format/CStreamInterface.html) via the - [Arrow PyCapsule - Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). - This allows for zero-copy Arrow data interchange across libraries. - """ - - def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: - """Return an Arrow C stream interface PyCapsule for the object.""" - ...