From 3ac9275d0cfc1bcd8e14d795135199e86b7c073b Mon Sep 17 00:00:00 2001 From: virgesmith Date: Sun, 14 Jun 2026 12:59:01 +0100 Subject: [PATCH] Release 0.3.0: nth/interleave fixes, add chunk_by - nth is now 0-based, consistent with Rust's Iterator::nth and Python indexing (BREAKING: drop the +1 in callers; nth(0) now valid) - interleave yields the remainder of the longer iterable once the shorter is exhausted, matching Rust (BREAKING: previously truncated) - add chunk_by: lazy, order-preserving grouping of consecutive runs (itertools.groupby / Rust chunk_by semantics; works on infinite iterators) - document groupby/value_counts as eager (sorts input); note equivalence to pandas default groupby; fix nth docstring - bump version to 0.3.0; refresh apidoc, README and release notes Co-Authored-By: Claude Opus 4.8 --- .pre-commit-config.yaml | 8 ++-- README.md | 5 ++- doc/apidoc.md | 61 ++++++++++++++++++++------- pyproject.toml | 2 +- relnotes.md | 15 +++++++ src/itrx/itr.py | 69 ++++++++++++++++++++++++------- src/test/test_collection.py | 6 ++- src/test/test_combine_split.py | 10 ++--- src/test/test_transform_filter.py | 13 ++++++ uv.lock | 2 +- 10 files changed, 148 insertions(+), 43 deletions(-) create mode 100644 relnotes.md diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a53ed53..1887c5a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,6 +10,8 @@ repos: - id: ruff-check args: [--fix] - id: ruff-format -# doesn't exist (yet) -# - repo: https://github.com/astral-sh/ty-pre-commit - + - repo: https://github.com/astral-sh/ty-pre-commit + # ty version. + rev: v0.0.49 + hooks: + - id: ty diff --git a/README.md b/README.md index ca1c0a1..e00b79c 100644 --- a/README.md +++ b/README.md @@ -88,13 +88,14 @@ Note: Most `Itr` methods are **lazy transformations**, meaning they return a new `Itr` instance without immediately processing any data. 
This allows for arbitrary chaining and efficient memory usage, as items are only processed as they are requested. In most cases, `Itr` simply acts as a convenient wrapper around `itertools`, enabling this left-to-right chaining syntax. -- **Combining and splitting:** `partition`, `copy`, `batched`, `pairwise`, `rolling`, `chain`, `cycle`, `repeat`, `product`, `inspect`, `intersperse`, `interleave`, `value_counts` -- **Transformation and filtering:** `accumulated`, `filter`, `map`, `starmap`, `map_while`, `flatten`, `flat_map`, `skip_while`, `take_while`, `groupby` +- **Combining and splitting:** `partition`, `copy`, `batched`, `pairwise`, `rolling`, `chain`, `cycle`, `repeat`, `product`, `inspect`, `intersperse`, `interleave`, `chunk_by` +- **Transformation and filtering:** `accumulated`, `filter`, `map`, `starmap`, `map_while`, `flatten`, `flat_map`, `skip_while`, `take_while` However, some methods are **eager consumers**. These methods iterate over and consume the underlying data, returning concrete values, collections, or aggregates. Examples include: * **Collection methods:** `collect`, `last`, `next`, `next_chunk`, `nth`, `position` * **Aggregation methods:** `count`, `reduce`, `max`, `min`, `all`, `any`, `consume`, `find`, `fold` +* **Sorting/grouping:** `groupby` and `value_counts` sort the entire input up front, so they consume the whole iterator immediately and must not be used on infinite sources. Use the lazy `chunk_by` to group consecutive runs without sorting. ### Important Considerations diff --git a/doc/apidoc.md b/doc/apidoc.md index 223e296..027daba 100644 --- a/doc/apidoc.md +++ b/doc/apidoc.md @@ -1,4 +1,4 @@ -# `Itr` v0.2.2 class documentation +# `Itr` v0.3.0 class documentation A generic iterator adaptor class inspired by Rust's Iterator trait, providing a composable API for functional-style iteration and transformation over Python iterables. 
## Public methods @@ -96,6 +96,25 @@ Returns: +### `chunk_by` + + +Group *consecutive* elements that share the same key, lazily. Unlike `groupby`, the input is not sorted, so +only adjacent runs are grouped (mirroring `itertools.groupby` and Rust's `chunk_by`). This preserves order and +works on infinite iterators. + +Args: + grouper (Callable[[T], U]): The key function applied to each element. + +Returns: + Itr[tuple[U, tuple[T, ...]]]: An iterator over (key, group) pairs, where each group is a tuple of the + consecutive elements sharing that key. + +Example: + >>> Itr([1, 1, 2, 3, 3, 1]).chunk_by(lambda x: x).map(lambda kv: kv[0]).collect() + (1, 2, 3, 1) + + ### `collect` Collect all remaining items from the iterator into a sequence (tuple by default). @@ -231,6 +250,14 @@ Sort and then group an iterable by the supplied key function. Note the following - The iterable is pre-sorted because itertools.groupby only works correctly on sorted sequences - The resulting groupby objects are realised into tuples +Because the input is sorted, this method is **eager**: it consumes and materialises the whole iterator +immediately (so it must not be used on an infinite iterator), the output is ordered by key, and the keys must be +mutually orderable. For lazy, order-preserving grouping of consecutive runs, see `chunk_by`. + +Semantically this is equivalent to pandas' default `groupby` (i.e. `sort=True`): all items sharing a key are +collected into a single group regardless of their position, and groups are emitted in sorted-key order. It has +no `sort=False` (appearance-order) option, requires mutually-orderable keys, and does not drop `None` keys. + Returns: Itr[tuple[U, tuple[T,...]]]: An iterator over the keys and tuples of values @@ -259,20 +286,20 @@ Example: ### `interleave` -Interleaves elements from this iterator with elements from another iterator. -Stops when either iterator is exhausted. 
+Interleaves elements from this iterator with elements from another iterable, yielding alternately from each. +When one iterable is exhausted, the remaining elements of the other are yielded in order. Args: - other (Itr[U]): Another iterator to interleave with. + other (Iterable[U]): Another iterable to interleave with. Returns: Itr[T | U]: A new iterator yielding elements alternately from self and other. Example: - itr1 = Itr([1, 3, 5]) - itr2 = Itr([2, 4, 6]) - result = itr1.interleave(itr2) - list(result) # [1, 2, 3, 4, 5, 6] + >>> Itr([1, 3, 5]).interleave([2, 4, 6]).collect() + (1, 2, 3, 4, 5, 6) + >>> Itr([1, 3, 5, 7]).interleave([2, 4]).collect() + (1, 2, 3, 4, 5, 7) ### `intersperse` @@ -295,6 +322,9 @@ Return the last item from the iterator. Do not use on an open-ended Iterable Returns: T: The last item. +Raises: + ValueError: If the iterator is empty. + ### `map` @@ -340,7 +370,7 @@ Returns: Return the maximum element from the iterator, optionally using a key function. Args: - key (Callable[[T], object] | None, optional): A function to extract a comparison key from each element. Defaults to None. + key (Callable[[T], Any] | None, optional): A function to extract a comparison key from each element. Defaults to None. Returns: T: The maximum element in the iterator. @@ -355,7 +385,7 @@ Raises: Return the minimum element from the iterator, optionally using a key function. Args: - key (Callable[[T], object] | None, optional): A function to extract a comparison key from each element. Defaults to None. + key (Callable[[T], Any] | None, optional): A function to extract a comparison key from each element. Defaults to None. Returns: T: The minimum element in the iterator. @@ -387,17 +417,20 @@ Returns: ### `nth` -Return the n-th item (1-based) from the iterator, or None if out of range. +Return the n-th item (0-based) from the iterator, consuming the preceding items. 
+ +This matches Rust's ``Iterator::nth`` and Python's 0-based indexing conventions: ``nth(0)`` returns the first +item, ``nth(1)`` the second, and so on. Args: - n (int): The index (1-based) of the item to return. + n (int): The index (0-based) of the item to return. Returns: T: The n-th item. Raises: - StopIteration: if the iterator is exhausted. - ValueError: if n < 1 + StopIteration: if the iterator has fewer than n + 1 items. + ValueError: if n < 0 diff --git a/pyproject.toml b/pyproject.toml index ef32a64..74e7768 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "itrx" -version = "0.2.3" +version = "0.3.0" description = "A chainable iterator adapter" readme = "README.md" authors = [ diff --git a/relnotes.md b/relnotes.md new file mode 100644 index 0000000..110827d --- /dev/null +++ b/relnotes.md @@ -0,0 +1,15 @@ +## 0.3.0 + +### Breaking changes + +- `nth` is now **0-based**, consistent with Rust's `Iterator::nth` and Python's indexing conventions: `nth(0)` returns the first item (previously this raised `ValueError` and `nth(1)` returned the first item). Update callers by dropping the `+ 1`. +- `interleave` now yields the remaining elements of the longer iterable once the shorter one is exhausted, matching `interleave` from Rust's `itertools` crate (previously it stopped at the shorter input, silently dropping the tail). + +### New features + +- `chunk_by`: lazily group *consecutive* elements sharing a key (the semantics of Python's `itertools.groupby` / `chunk_by` from Rust's `itertools` crate). Unlike `groupby` it does not sort, so it preserves order and works on infinite iterators. + +### Documentation + +- Clarified that `groupby` (and `value_counts`, which builds on it) is **eager**: it sorts the entire input up front, so it reorders output, requires mutually-orderable keys, and must not be used on infinite sources. Corrected the lazy/eager categorisation in the README.
+- Corrected the `nth` docstring, which previously claimed it returned `None` when out of range (it raises `StopIteration`). diff --git a/src/itrx/itr.py b/src/itrx/itr.py index e61319c..ce3367d 100644 --- a/src/itrx/itr.py +++ b/src/itrx/itr.py @@ -257,12 +257,41 @@ def for_each(self, func: Callable[[T], None]) -> None: for item in self._it: func(item) + def chunk_by[U](self, grouper: Callable[[T], U]) -> "Itr[tuple[U, tuple[T, ...]]]": + """ + Group *consecutive* elements that share the same key, lazily. Unlike `groupby`, the input is not sorted, so + only adjacent runs are grouped (mirroring `itertools.groupby` and Rust's `chunk_by`). This preserves order and + works on infinite iterators. + + Args: + grouper (Callable[[T], U]): The key function applied to each element. + + Returns: + Itr[tuple[U, tuple[T, ...]]]: An iterator over (key, group) pairs, where each group is a tuple of the + consecutive elements sharing that key. + + Example: + >>> Itr([1, 1, 2, 3, 3, 1]).chunk_by(lambda x: x).map(lambda kv: kv[0]).collect() + (1, 2, 3, 1) + """ + key_fn = cast("Callable[[T], Any]", grouper) + groups = ((k, tuple(v)) for k, v in itertools.groupby(self._it, key=key_fn)) + return cast("Itr[tuple[U, tuple[T, ...]]]", Itr(groups)) + def groupby[U](self, grouper: Callable[[T], U]) -> "Itr[tuple[U, tuple[T,...]]]": """ Sort and then group an iterable by the supplied key function. Note the following differences from itertools: - The iterable is pre-sorted because itertools.groupby only works correctly on sorted sequences - The resulting groupby objects are realised into tuples + Because the input is sorted, this method is **eager**: it consumes and materialises the whole iterator + immediately (so it must not be used on an infinite iterator), the output is ordered by key, and the keys must be + mutually orderable. For lazy, order-preserving grouping of consecutive runs, see `chunk_by`. + + Semantically this is equivalent to pandas' default `groupby` (i.e. 
`sort=True`): all items sharing a key are + collected into a single group regardless of their position, and groups are emitted in sorted-key order. It has + no `sort=False` (appearance-order) option, requires mutually-orderable keys, and does not drop `None` keys. + Returns: Itr[tuple[U, tuple[T,...]]]: An iterator over the keys and tuples of values @@ -322,23 +351,30 @@ def intersperser(item: U) -> Generator[T | U, None, None]: def interleave[U](self, other: Iterable[U]) -> "Itr[T | U]": """ - Interleaves elements from this iterator with elements from another iterator. - Stops when either iterator is exhausted. + Interleaves elements from this iterator with elements from another iterable, yielding alternately from each. + When one iterable is exhausted, the remaining elements of the other are yielded in order. Args: - other (Itr[U]): Another iterator to interleave with. + other (Iterable[U]): Another iterable to interleave with. Returns: Itr[T | U]: A new iterator yielding elements alternately from self and other. Example: - itr1 = Itr([1, 3, 5]) - itr2 = Itr([2, 4, 6]) - result = itr1.interleave(itr2) - list(result) # [1, 2, 3, 4, 5, 6] + >>> Itr([1, 3, 5]).interleave([2, 4, 6]).collect() + (1, 2, 3, 4, 5, 6) + >>> Itr([1, 3, 5, 7]).interleave([2, 4]).collect() + (1, 2, 3, 4, 5, 7) """ + _sentinel = object() + + def interleaver() -> Generator[T | U, None, None]: + for pair in itertools.zip_longest(self._it, other, fillvalue=_sentinel): + for item in pair: + if item is not _sentinel: + yield cast("T | U", item) - return cast("Itr[T | U]", Itr(self.zip(other).flatten())) + return cast("Itr[T | U]", Itr(interleaver())) def last(self) -> T: """Return the last item from the iterator. Do not use on an open-ended Iterable @@ -442,22 +478,25 @@ def next_chunk(self, n: int) -> tuple[T, ...]: return self.take(n).collect() def nth(self, n: int) -> T: - """Return the n-th item (1-based) from the iterator, or None if out of range. 
+ """Return the n-th item (0-based) from the iterator, consuming the preceding items. + + This matches Rust's ``Iterator::nth`` and Python's 0-based indexing conventions: ``nth(0)`` returns the first + item, ``nth(1)`` the second, and so on. Args: - n (int): The index (1-based) of the item to return. + n (int): The index (0-based) of the item to return. Returns: T: The n-th item. Raises: - StopIteration: if the iterator is exhausted. - ValueError: if n < 1 + StopIteration: if the iterator has fewer than n + 1 items. + ValueError: if n < 0 """ - if n < 1: - raise ValueError(f"nth index must be >= 1, got {n}") - return self.skip(n - 1).next() + if n < 0: + raise ValueError(f"nth index must be >= 0, got {n}") + return self.skip(n).next() def pairwise(self) -> "Itr[tuple[T, T]]": """Returns an iterator that yields consecutive pairs of elements from the iterable. diff --git a/src/test/test_collection.py b/src/test/test_collection.py index a65e13d..f655a5a 100644 --- a/src/test/test_collection.py +++ b/src/test/test_collection.py @@ -52,8 +52,10 @@ def test_next_chunk_overrun() -> None: def test_nth() -> None: it = Itr([10, 20, 30, 40]) with pytest.raises(ValueError): - it.nth(0) - assert it.nth(3) == 30 + it.nth(-1) + assert it.nth(0) == 10 + # consumes preceding items, so this advances from the current position + assert it.nth(2) == 40 with pytest.raises(StopIteration): it.nth(10) diff --git a/src/test/test_combine_split.py b/src/test/test_combine_split.py index 29e8683..1febe25 100644 --- a/src/test/test_combine_split.py +++ b/src/test/test_combine_split.py @@ -65,29 +65,29 @@ def test_interleave_first_longer() -> None: it1 = Itr([1, 3, 5, 7]) it2 = Itr([2, 4]) result = it1.interleave(it2) - # Stops when either iterator is exhausted - assert result.collect() == (1, 2, 3, 4) + # When one iterable is exhausted, the remainder of the other is yielded in order + assert result.collect() == (1, 2, 3, 4, 5, 7) def test_interleave_second_longer() -> None: it1 = Itr([1, 3]) it2 
= Itr([2, 4, 6, 8]) result = it1.interleave(it2) - assert result.collect() == (1, 2, 3, 4) + assert result.collect() == (1, 2, 3, 4, 6, 8) def test_interleave_empty_first() -> None: it1: Itr[int] = Itr([]) it2 = Itr([2, 4, 6]) result = it1.interleave(it2) - assert result.collect() == () + assert result.collect() == (2, 4, 6) def test_interleave_empty_second() -> None: it1 = Itr([1, 3, 5]) it2: Itr[int] = Itr([]) result = it1.interleave(it2) - assert result.collect() == () + assert result.collect() == (1, 3, 5) def test_interleave_both_empty() -> None: diff --git a/src/test/test_transform_filter.py b/src/test/test_transform_filter.py index be3e8f7..5eab93d 100644 --- a/src/test/test_transform_filter.py +++ b/src/test/test_transform_filter.py @@ -1,3 +1,4 @@ +import itertools from collections import defaultdict from operator import mul @@ -193,3 +194,15 @@ def test_groupby_string() -> None: assert tuple(d.keys()) == (5, 6) assert d[5] == ("apple",) assert d[6] == ("banana", "carrot") + + +def test_chunk_by() -> None: + # consecutive runs only, order preserved, no sorting (unlike groupby) + it = Itr([1, 1, 2, 3, 3, 1]).chunk_by(lambda x: x) + assert it.collect() == ((1, (1, 1)), (2, (2,)), (3, (3, 3)), (1, (1,))) + + +def test_chunk_by_lazy_on_infinite() -> None: + # chunk_by is lazy, so it works on unbounded iterators + counts = Itr(itertools.count()).chunk_by(lambda n: n // 2).take(3).collect() + assert counts == ((0, (0, 1)), (1, (2, 3)), (2, (4, 5))) diff --git a/uv.lock b/uv.lock index 7f50d7b..e227335 100644 --- a/uv.lock +++ b/uv.lock @@ -323,7 +323,7 @@ wheels = [ [[package]] name = "itrx" -version = "0.2.3" +version = "0.3.0" source = { editable = "." } [package.optional-dependencies]