From 3630ed9fd1a69ba80280e794af5310bd49682146 Mon Sep 17 00:00:00 2001
From: Kyle Barron <kyle@developmentseed.org>
Date: Wed, 21 May 2025 13:14:38 -0400
Subject: [PATCH] Remove Arrow from list protocol

---
 docs/api/arrow.md   |   3 -
 mkdocs.yml          |   3 -
 src/obspec/_list.py | 152 ++++----------------------------------------
 src/obspec/arrow.py |  55 ----------------
 4 files changed, 14 insertions(+), 199 deletions(-)
 delete mode 100644 docs/api/arrow.md
 delete mode 100644 src/obspec/arrow.py

diff --git a/docs/api/arrow.md b/docs/api/arrow.md
deleted file mode 100644
index 1535ee5..0000000
--- a/docs/api/arrow.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Arrow
-
-::: obspec.arrow
diff --git a/mkdocs.yml b/mkdocs.yml
index 7ac3ae8..076265a 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -41,7 +41,6 @@ nav:
       - api/put.md
       - api/rename.md
       - api/attributes.md
-      - api/arrow.md
   - CHANGELOG.md
 
 watch:
@@ -121,8 +120,6 @@ plugins:
             signature_crossrefs: true
 
           inventories:
-            - https://arrow.apache.org/docs/objects.inv
-            - https://docs.pola.rs/api/python/stable/objects.inv
             - https://docs.python.org/3/objects.inv
             - https://filesystem-spec.readthedocs.io/en/latest/objects.inv
 
diff --git a/src/obspec/_list.py b/src/obspec/_list.py
index 061aab9..08c913a 100644
--- a/src/obspec/_list.py
+++ b/src/obspec/_list.py
@@ -1,32 +1,15 @@
 from __future__ import annotations
 
-from collections.abc import Sequence
-from typing import (
-    Generic,
-    Literal,
-    Protocol,
-    Self,
-    TypedDict,
-    TypeVar,
-    overload,
-)
-
-from ._meta import ObjectMeta
-from .arrow import ArrowArrayExportable, ArrowStreamExportable
-
-ListChunkType_co = TypeVar(
-    "ListChunkType_co",
-    Sequence[ObjectMeta],
-    ArrowArrayExportable,
-    ArrowStreamExportable,
-    covariant=True,
-)
-"""The data structure used for holding list results.
-
-By default, listing APIs return a `list` of [`ObjectMeta`][obspec.ObjectMeta]. However
-for improved performance when listing large buckets, you can pass `return_arrow=True`.
-Then an Arrow `RecordBatch` will be returned instead.
-"""
+from typing import TYPE_CHECKING, Generic, Protocol, Self, TypedDict, TypeVar
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from ._meta import ObjectMeta
+
+
+ListChunkType_co = TypeVar("ListChunkType_co", covariant=True)
+"""The data structure used for holding list results."""
 
 
 class ListResult(TypedDict, Generic[ListChunkType_co]):
@@ -85,32 +68,13 @@ async def __anext__(self) -> ListChunkType_co:
 
 
 class List(Protocol):
-    @overload
-    def list(
-        self,
-        prefix: str | None = None,
-        *,
-        offset: str | None = None,
-        chunk_size: int = 50,
-        return_arrow: Literal[True],
-    ) -> ListIterator[ArrowArrayExportable]: ...
-    @overload
-    def list(
-        self,
-        prefix: str | None = None,
-        *,
-        offset: str | None = None,
-        chunk_size: int = 50,
-        return_arrow: Literal[False] = False,
-    ) -> ListIterator[Sequence[ObjectMeta]]: ...
     def list(
         self,
         prefix: str | None = None,
         *,
         offset: str | None = None,
         chunk_size: int = 50,
-        return_arrow: bool = False,
-    ) -> ListIterator[ArrowArrayExportable] | ListIterator[Sequence[ObjectMeta]]:
+    ) -> ListIterator[Sequence[ObjectMeta]]:
         """List all the objects with the given prefix.
 
         Prefixes are evaluated on a path segment basis, i.e. `foo/bar/` is a prefix of
@@ -136,28 +100,6 @@ def list(
             break
         ```
 
-        Return large list results as [Arrow](https://arrow.apache.org/). This is most
-        useful with large list operations. In this case you may want to increase the
-        `chunk_size` parameter.
-
-        ```py
-        stream = obs.list(store, chunk_size=1000, return_arrow=True)
-        # Stream is now an iterable/async iterable of `RecordBatch`es
-        for batch in stream:
-            print(batch.num_rows) # 100
-
-            # If desired, convert to a pyarrow RecordBatch (zero-copy) with
-            # `pyarrow.record_batch(batch)`
-            break
-        ```
-
-        Collect all list results into a single Arrow `RecordBatch`.
-
-        ```py
-        stream = obs.list(store, return_arrow=True)
-        batch = stream.collect()
-        ```
-
         !!! note
             The order of returned [`ObjectMeta`][obspec.ObjectMeta] is not
             guaranteed
@@ -171,10 +113,6 @@ def list(
             chunk_size: The number of items to collect per chunk in the returned
                 (async) iterator. All chunks except for the last one will have this many
                 items. This is ignored in [`collect`][obspec.ListIterator.collect].
-            return_arrow: If `True`, return each batch of list items as an Arrow
-                `RecordBatch`, not as a list of Python `dict`s. Arrow removes
-                serialization overhead between Rust and Python and so this can be
-                significantly faster for large list operations. Defaults to `False`.
 
         Returns:
             A ListStream, which you can iterate through to access list results.
@@ -184,32 +122,13 @@ def list(
 
 
 class ListAsync(Protocol):
-    @overload
     def list_async(
         self,
         prefix: str | None = None,
         *,
         offset: str | None = None,
         chunk_size: int = 50,
-        return_arrow: Literal[True],
-    ) -> ListStream[ArrowArrayExportable]: ...
-    @overload
-    def list_async(
-        self,
-        prefix: str | None = None,
-        *,
-        offset: str | None = None,
-        chunk_size: int = 50,
-        return_arrow: Literal[False] = False,
-    ) -> ListStream[Sequence[ObjectMeta]]: ...
-    def list_async(
-        self,
-        prefix: str | None = None,
-        *,
-        offset: str | None = None,
-        chunk_size: int = 50,
-        return_arrow: bool = False,
-    ) -> ListStream[ArrowArrayExportable] | ListStream[Sequence[ObjectMeta]]:
+    ) -> ListStream[Sequence[ObjectMeta]]:
         """List all the objects with the given prefix.
 
         Note that this method itself is **not async**. It's a synchronous method but
@@ -243,10 +162,6 @@ def list_async(
                 (async) iterator. All chunks except for the last one will have this many
                 items. This is ignored in
                 [`collect_async`][obspec.ListStream.collect_async].
-            return_arrow: If `True`, return each batch of list items as an Arrow
-                `RecordBatch`, not as a list of Python `dict`s. Arrow removes
-                serialization overhead between Rust and Python and so this can be
-                significantly faster for large list operations. Defaults to `False`.
 
         Returns:
             A ListStream, which you can iterate through to access list results.
@@ -256,26 +171,10 @@ def list_async(
 
 
 class ListWithDelimiter(Protocol):
-    @overload
     def list_with_delimiter(
         self,
         prefix: str | None = None,
-        *,
-        return_arrow: Literal[True],
-    ) -> ListResult[ArrowStreamExportable]: ...
-    @overload
-    def list_with_delimiter(
-        self,
-        prefix: str | None = None,
-        *,
-        return_arrow: Literal[False] = False,
-    ) -> ListResult[Sequence[ObjectMeta]]: ...
-    def list_with_delimiter(
-        self,
-        prefix: str | None = None,
-        *,
-        return_arrow: bool = False,
-    ) -> ListResult[ArrowStreamExportable] | ListResult[Sequence[ObjectMeta]]:
+    ) -> ListResult[Sequence[ObjectMeta]]:
         """List objects with the given prefix and an implementation specific
         delimiter.
 
@@ -294,13 +193,6 @@ def list_with_delimiter(
         Args:
             prefix: The prefix within ObjectStore to use for listing. Defaults to None.
 
-        Keyword Args:
-            return_arrow: If `True`, return list results as an Arrow
-                `Table`, not as a list of Python `dict`s. Arrow removes serialization
-                overhead between Rust and Python and so this can be significantly faster
-                for large list operations. Defaults to `False`.
-
-
         Returns:
             ListResult
 
@@ -309,26 +201,10 @@ def list_with_delimiter(
 
 
 class ListWithDelimiterAsync(Protocol):
-    @overload
-    async def list_with_delimiter_async(
-        self,
-        prefix: str | None = None,
-        *,
-        return_arrow: Literal[True],
-    ) -> ListResult[ArrowStreamExportable]: ...
-    @overload
     async def list_with_delimiter_async(
         self,
         prefix: str | None = None,
-        *,
-        return_arrow: Literal[False] = False,
-    ) -> ListResult[Sequence[ObjectMeta]]: ...
-    async def list_with_delimiter_async(
-        self,
-        prefix: str | None = None,
-        *,
-        return_arrow: bool = False,
-    ) -> ListResult[ArrowStreamExportable] | ListResult[Sequence[ObjectMeta]]:
+    ) -> ListResult[Sequence[ObjectMeta]]:
         """Call `list_with_delimiter` asynchronously.
 
         Refer to the documentation for
diff --git a/src/obspec/arrow.py b/src/obspec/arrow.py
deleted file mode 100644
index 3cdf2e2..0000000
--- a/src/obspec/arrow.py
+++ /dev/null
@@ -1,55 +0,0 @@
-"""Arrow protocol type hints for use in [list][obspec.List] calls."""
-
-from __future__ import annotations
-
-from typing import Protocol
-
-
-class ArrowArrayExportable(Protocol):
-    """An object with an `__arrow_c_array__` method.
-
-    Supported objects include:
-
-    - arro3 `Array` or `RecordBatch` objects.
-    - pyarrow `Array` or `RecordBatch` objects
-
-    Such an object implements the [Arrow C Data Interface
-    interface](https://arrow.apache.org/docs/format/CDataInterface.html) via the
-    [Arrow PyCapsule
-    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
-    This allows for zero-copy Arrow data interchange across libraries.
-    """
-
-    def __arrow_c_array__(
-        self,
-        requested_schema: object | None = None,
-    ) -> tuple[object, object]:
-        """Return Arrow C data interface PyCapsules for the object."""
-        ...
-
-
-class ArrowStreamExportable(Protocol):
-    """An object with an `__arrow_c_stream__` method.
-
-    Supported objects include:
-
-    - arro3 `Table`, `RecordBatchReader`, `ChunkedArray`, or `ArrayReader` objects.
-    - Polars `Series` or `DataFrame` objects (polars v1.2 or higher)
-    - pyarrow `RecordBatchReader`, `Table`, or `ChunkedArray` objects (pyarrow v14 or
-        higher)
-    - pandas `DataFrame`s  (pandas v2.2 or higher)
-    - ibis `Table` objects.
-
-    For an up to date list of supported objects, see [this
-    issue](https://github.com/apache/arrow/issues/39195#issuecomment-2245718008).
-
-    Such an object implements the [Arrow C Stream
-    interface](https://arrow.apache.org/docs/format/CStreamInterface.html) via the
-    [Arrow PyCapsule
-    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
-    This allows for zero-copy Arrow data interchange across libraries.
-    """
-
-    def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
-        """Return an Arrow C stream interface PyCapsule for the object."""
-        ...