diff --git a/Cargo.toml b/Cargo.toml index 6eeec14e..f94e1c2b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["rust", "python"] resolver = "2" [workspace.package] -version = "0.1.25" +version = "0.1.26" edition = "2024" authors = ["zTgx "] license = "Apache-2.0" diff --git a/examples/batch_indexing/README.md b/examples/batch_indexing/README.md new file mode 100644 index 00000000..41e87fae --- /dev/null +++ b/examples/batch_indexing/README.md @@ -0,0 +1,28 @@ +# Batch Indexing Example + +Demonstrates indexing multiple documents at once using: +- `from_paths` -- explicit list of file paths +- `from_dir` -- all supported files in a directory +- `from_bytes` -- raw in-memory content + +Also shows cross-document querying with `with_doc_ids`. + +## Setup + +```bash +pip install vectorless +``` + +## Run + +```bash +python main.py +``` + +## Environment Variables + +| Variable | Description | Default | +|------------------------|----------------------|-----------| +| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | +| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | +| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/batch_indexing/main.py b/examples/batch_indexing/main.py new file mode 100644 index 00000000..7d6d03cb --- /dev/null +++ b/examples/batch_indexing/main.py @@ -0,0 +1,183 @@ +""" +Batch indexing example -- demonstrates indexing multiple documents at once +using from_paths, from_dir, and from_bytes. 
+ +Usage: + pip install vectorless + python main.py +""" + +import asyncio +import os + +from vectorless import ( + Engine, + IndexContext, + IndexOptions, + QueryContext, + VectorlessError, +) + +# --- Configuration --- +API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") +MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") +ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) +WORKSPACE = "./workspace" + +# Sample documents for demonstration +DOCS = { + "alpha.md": """\ +# Alpha Report + +## Summary + +Alpha is a distributed key-value store designed for low-latency reads. +It uses a log-structured merge tree for storage. + +## Architecture + +Write requests go through a write-ahead log, then are buffered in memory. +When the buffer is full, it is flushed to disk as an immutable SSTable. +""", + "beta.md": """\ +# Beta Report + +## Summary + +Beta is a stream processing engine that consumes events from Kafka topics +and applies real-time transformations using a DAG-based execution model. + +## Performance + +Beta processes up to 2 million events per second per node on commodity hardware. +""", + "gamma.md": """\ +# Gamma Report + +## Summary + +Gamma is a feature store that bridges the gap between offline feature +computation and online serving. Features are computed in Spark and served +via a low-latency gRPC endpoint. + +## Integration + +Gamma integrates with Alpha for feature metadata storage and Beta for +real-time feature updates. 
+""", +} + + +def write_sample_docs(base_dir: str) -> list[str]: + """Write sample markdown files and return their paths.""" + paths = [] + for name, content in DOCS.items(): + path = os.path.join(base_dir, name) + with open(path, "w") as f: + f.write(content) + paths.append(path) + return paths + + +async def main() -> None: + engine = Engine( + workspace=WORKSPACE, + api_key=API_KEY, + model=MODEL, + endpoint=ENDPOINT, + ) + + # Create a temp directory with sample documents + docs_dir = "./batch_docs" + os.makedirs(docs_dir, exist_ok=True) + paths = write_sample_docs(docs_dir) + + # ---- 1. Index multiple files at once via from_paths ---- + print("=" * 50) + print(" from_paths -- index a list of files") + print("=" * 50) + + ctx = IndexContext.from_paths(paths) + result = await engine.index(ctx) + + print(f" Indexed {len(result.items)} document(s)") + for item in result.items: + print(f" - {item.name} ({item.doc_id[:8]}...)") + if result.has_failures(): + for f in result.failed: + print(f" ! Failed: {f.source} -- {f.error}") + print() + + doc_ids = [item.doc_id for item in result.items] + + # ---- 2. Query across all batch-indexed documents ---- + print("=" * 50) + print(" Query across multiple documents") + print("=" * 50) + + answer = await engine.query( + QueryContext( + "Which system processes the most events per second?" + ).with_doc_ids(doc_ids) + ) + for item in answer.items: + print(f" [{item.doc_id[:8]}...] score={item.score:.2f}") + print(f" {item.content[:200]}...") + print() + + # ---- 3. 
Index a directory via from_dir ---- + print("=" * 50) + print(" from_dir -- index all supported files in a directory") + print("=" * 50) + + # Clear first so we see fresh results + await engine.clear() + + ctx = IndexContext.from_dir(docs_dir).with_options( + IndexOptions(generate_summaries=True, generate_description=True) + ) + result = await engine.index(ctx) + + print(f" Indexed {len(result.items)} document(s)") + for item in result.items: + desc = item.description[:80] if item.description else "N/A" + print(f" - {item.name}: {desc}...") + print() + + # ---- 4. Index from raw bytes via from_bytes ---- + print("=" * 50) + print(" from_bytes -- index in-memory content") + print("=" * 50) + + md_bytes = b"""# Delta Notes + +## Key Points + +- Delta uses CRDTs for conflict-free replication. +- Writes are locally committed then asynchronously propagated. +- Read repair ensures eventual consistency across all replicas. +""" + + ctx = IndexContext.from_bytes(md_bytes, "markdown").with_name("delta") + result = await engine.index(ctx) + + print(f" Indexed: {result.doc_id}") + print() + + # ---- Cleanup ---- + print("=" * 50) + print(" Cleanup") + print("=" * 50) + + removed = await engine.clear() + print(f" Removed {removed} document(s)") + + # Remove temp files + for p in paths: + os.remove(p) + os.rmdir(docs_dir) + print(f" Cleaned up {docs_dir}/") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/document_management/README.md b/examples/document_management/README.md new file mode 100644 index 00000000..e41148e0 --- /dev/null +++ b/examples/document_management/README.md @@ -0,0 +1,28 @@ +# Document Management Example + +Demonstrates CRUD operations on indexed documents: + +- `engine.list()` -- list all documents +- `engine.exists(doc_id)` -- check if a document exists +- `engine.remove(doc_id)` -- remove a single document +- `engine.clear()` -- remove all documents + +## Setup + +```bash +pip install vectorless +``` + +## Run + +```bash 
+python main.py +``` + +## Environment Variables + +| Variable | Description | Default | +|------------------------|----------------------|-----------| +| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | +| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | +| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/document_management/main.py b/examples/document_management/main.py new file mode 100644 index 00000000..f5d72360 --- /dev/null +++ b/examples/document_management/main.py @@ -0,0 +1,135 @@ +""" +Document management example -- demonstrates CRUD operations on indexed documents: +list, exists, remove, and clear. + +Usage: + pip install vectorless + python main.py +""" + +import asyncio +import os + +from vectorless import ( + Engine, + IndexContext, + QueryContext, + VectorlessError, +) + +# --- Configuration --- +API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") +MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") +ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) +WORKSPACE = "./workspace" + +# Sample documents +SAMPLE_A = """\ +# Project Alpha + +## Overview + +Project Alpha is a next-generation database engine written in Rust. +It supports ACID transactions and serializable isolation. + +## Features + +- MVCC concurrency control +- B-tree and LSM storage engines +- Query planner with cost-based optimization +""" + +SAMPLE_B = """\ +# Project Beta + +## Overview + +Project Beta is a web framework for building real-time applications. +It uses WebSocket-based communication and server-side rendering. 
+ +## Features + +- Hot module reloading +- Built-in authentication middleware +- Automatic code splitting +""" + + +async def main() -> None: + engine = Engine( + workspace=WORKSPACE, + api_key=API_KEY, + model=MODEL, + endpoint=ENDPOINT, + ) + + # ---- Index two documents ---- + print("Indexing two documents...") + + result_a = await engine.index( + IndexContext.from_content(SAMPLE_A, "markdown").with_name("alpha") + ) + doc_id_a = result_a.doc_id + print(f" A: {doc_id_a}") + + result_b = await engine.index( + IndexContext.from_content(SAMPLE_B, "markdown").with_name("beta") + ) + doc_id_b = result_b.doc_id + print(f" B: {doc_id_b}") + print() + + # ---- list() -- show all indexed documents ---- + print("--- list() ---") + docs = await engine.list() + for doc in docs: + pages = f", pages={doc.page_count}" if doc.page_count else "" + lines = f", lines={doc.line_count}" if doc.line_count else "" + print(f" {doc.name} id={doc.id[:8]}... format={doc.format}{pages}{lines}") + print(f" Total: {len(docs)} document(s)\n") + + # ---- exists() -- check if a document is indexed ---- + print("--- exists() ---") + for did, label in [(doc_id_a, "A"), (doc_id_b, "B"), ("nonexistent-id", "?")]: + found = await engine.exists(did) + print(f" {label}: exists={found}") + print() + + # ---- Query a specific document ---- + print("--- query(doc_id_a) ---") + answer = await engine.query( + QueryContext("What storage engines does Alpha support?").with_doc_id(doc_id_a) + ) + item = answer.single() + if item: + print(f" Score: {item.score:.2f}") + print(f" Answer: {item.content[:200]}...\n") + + # ---- remove() -- delete a single document ---- + print("--- remove(doc_id_a) ---") + removed = await engine.remove(doc_id_a) + print(f" Removed A: {removed}") + + # Verify it's gone + exists_a = await engine.exists(doc_id_a) + print(f" exists(A) after removal: {exists_a}") + print() + + # ---- list() again -- only B should remain ---- + print("--- list() after removal ---") + docs = await 
engine.list() + for doc in docs: + print(f" {doc.name} id={doc.id[:8]}...") + print(f" Total: {len(docs)} document(s)\n") + + # ---- clear() -- remove all remaining documents ---- + print("--- clear() ---") + cleared = await engine.clear() + print(f" Cleared {cleared} document(s)") + + docs = await engine.list() + print(f" Remaining: {len(docs)} document(s)") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/error_handling/README.md b/examples/error_handling/README.md new file mode 100644 index 00000000..2424d618 --- /dev/null +++ b/examples/error_handling/README.md @@ -0,0 +1,33 @@ +# Error Handling Example + +Demonstrates how to catch and inspect `VectorlessError` exceptions: + +- Invalid format strings +- Invalid indexing modes +- Querying non-existent documents +- Batch indexing with partial failures +- Engine creation with invalid credentials + +The `VectorlessError` exception provides: +- `kind` -- error category (`"config"`, `"not_found"`, `"parse"`, `"llm"`, etc.) +- `message` -- human-readable error description + +## Setup + +```bash +pip install vectorless +``` + +## Run + +```bash +python main.py +``` + +## Environment Variables + +| Variable | Description | Default | +|------------------------|----------------------|-----------| +| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | +| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | +| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/error_handling/main.py b/examples/error_handling/main.py new file mode 100644 index 00000000..993814a6 --- /dev/null +++ b/examples/error_handling/main.py @@ -0,0 +1,111 @@ +""" +Error handling example -- demonstrates catching and inspecting VectorlessError. 
+ +Usage: + pip install vectorless + python main.py +""" + +import asyncio +import os + +from vectorless import ( + Engine, + IndexContext, + IndexOptions, + QueryContext, + VectorlessError, +) + +# --- Configuration --- +API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") +MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") +ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) +WORKSPACE = "./workspace" + + +async def main() -> None: + engine = Engine( + workspace=WORKSPACE, + api_key=API_KEY, + model=MODEL, + endpoint=ENDPOINT, + ) + + # ---- 1. Invalid format ---- + print("--- Invalid format in from_bytes ---") + try: + ctx = IndexContext.from_bytes(b"hello", "xml") + except VectorlessError as e: + print(f" Caught VectorlessError:") + print(f" kind: {e.kind}") + print(f" message: {e.message}") + print(f" repr: {repr(e)}") + print() + + # ---- 2. Invalid indexing mode ---- + print("--- Invalid indexing mode ---") + try: + opts = IndexOptions(mode="bad_mode") + except VectorlessError as e: + print(f" Caught VectorlessError:") + print(f" kind: {e.kind}") + print(f" message: {e.message}") + print() + + # ---- 3. Query a non-existent document ---- + print("--- Query non-existent document ---") + try: + await engine.query( + QueryContext("What is this?").with_doc_id("does-not-exist") + ) + except VectorlessError as e: + print(f" Caught VectorlessError:") + print(f" kind: {e.kind}") + print(f" message: {e.message}") + print() + + # ---- 4. 
Index with partial failure in batch ---- + print("--- Batch indexing with mixed results ---") + good = IndexContext.from_content("# Real Doc\n\nThis is valid content.", "markdown") + + result = await engine.index(good.with_name("good_doc")) + if result.has_failures(): + for f in result.failed: + print(f" Failed: {f.source} -- {f.error}") + else: + print(f" Success: {result.doc_id}") + + # Inspect individual items + for item in result.items: + print(f" Item: {item.name} ({item.format})") + if item.metrics: + m = item.metrics + print(f" Total time: {m.total_time_ms} ms, LLM calls: {m.llm_calls}") + print() + + # ---- 5. Engine creation with bad credentials ---- + print("--- Engine with invalid credentials ---") + try: + bad_engine = Engine( + workspace=WORKSPACE + "_bad", + api_key="sk-invalid-key-12345", + model="gpt-4o", + ) + # Try to use it -- the error will surface on the first LLM call + await bad_engine.index( + IndexContext.from_content("# Test\n", "markdown").with_name("fail_test") + ) + except VectorlessError as e: + print(f" Caught VectorlessError:") + print(f" kind: {e.kind}") + print(f" message: {e.message[:120]}...") + print() + + # ---- Cleanup ---- + await engine.clear() + print("Done.") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/index_metrics/README.md b/examples/index_metrics/README.md new file mode 100644 index 00000000..78bdd552 --- /dev/null +++ b/examples/index_metrics/README.md @@ -0,0 +1,42 @@ +# IndexMetrics Example + +Demonstrates how to inspect detailed indexing pipeline metrics via `IndexMetrics`. 
+ +`IndexMetrics` is attached to each `IndexItem` and provides: + +| Field | Description | +|------------------------|----------------------------------------------| +| `total_time_ms` | Total indexing time | +| `parse_time_ms` | Document parsing stage duration | +| `build_time_ms` | Tree building stage duration | +| `enhance_time_ms` | Summary/enhancement stage duration | +| `nodes_processed` | Number of tree nodes processed | +| `summaries_generated` | Successfully generated summaries | +| `summaries_failed` | Failed summary generations | +| `llm_calls` | Total LLM API calls made | +| `total_tokens_generated` | Total tokens produced by the LLM | +| `topics_indexed` | Topics added to the reasoning index | +| `keywords_indexed` | Keywords added to the reasoning index | + +This example compares documents indexed with and without summaries enabled +to show how `IndexOptions` affect pipeline stages and LLM usage. + +## Setup + +```bash +pip install vectorless +``` + +## Run + +```bash +python main.py +``` + +## Environment Variables + +| Variable | Description | Default | +|------------------------|----------------------|-----------| +| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | +| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | +| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/index_metrics/main.py b/examples/index_metrics/main.py new file mode 100644 index 00000000..3bff91cb --- /dev/null +++ b/examples/index_metrics/main.py @@ -0,0 +1,236 @@ +""" +IndexMetrics example -- demonstrates inspecting detailed indexing pipeline metrics. + +IndexMetrics exposes timing, node processing, LLM usage, and reasoning index +statistics for each indexed document. This example compares two documents with +different IndexOptions to show how options affect the pipeline. 
+ +Usage: + pip install vectorless + python main.py +""" + +import asyncio +import os + +from vectorless import ( + Engine, + IndexContext, + IndexItem, + IndexMetrics, + IndexOptions, + VectorlessError, +) + +# --- Configuration --- +API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") +MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") +ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) +WORKSPACE = "./workspace" + +# --- Sample documents with varying complexity --- +SIMPLE_DOC = """\ +# Quick Note + +This is a short note about caching strategies. +Redis is commonly used as an in-memory cache. +""" + +COMPLEX_DOC = """\ +# Distributed Systems Design Guide + +## Consensus + +Raft is a consensus algorithm designed to be easy to understand. +It elects a leader via randomized timeouts and replicates log entries +to a majority of followers before committing them. + +## Replication + +State machine replication ensures that all replicas execute the same +commands in the same order. Primary-backup replication is simpler but +provides lower availability during leader failover. + +## Partitioning + +Consistent hashing distributes keys across nodes with minimal +remapping when the cluster size changes. Virtual nodes improve balance +when the key space is small. + +## Failure Detection + +Phi accrual failure detection treats failure as a continuous suspicion +level rather than a binary alive/dead state. This reduces false +positives during transient network issues. 
+""" + + +def print_pipeline_breakdown(m: IndexMetrics) -> None: + """Print a breakdown of pipeline stages and their percentages.""" + total = m.total_time_ms + if total == 0: + print(" (no timing data)") + return + + parse_pct = m.parse_time_ms / total * 100 + build_pct = m.build_time_ms / total * 100 + enhance_pct = m.enhance_time_ms / total * 100 + other_pct = max(0, 100 - parse_pct - build_pct - enhance_pct) + + print(f" Parse: {m.parse_time_ms:>5} ms ({parse_pct:5.1f}%)") + print(f" Build: {m.build_time_ms:>5} ms ({build_pct:5.1f}%)") + print(f" Enhance: {m.enhance_time_ms:>5} ms ({enhance_pct:5.1f}%)") + print(f" Other: {total - m.parse_time_ms - m.build_time_ms - m.enhance_time_ms:>5} ms ({other_pct:5.1f}%)") + + +def print_llm_stats(m: IndexMetrics) -> None: + """Print LLM utilization statistics.""" + print(f" LLM calls: {m.llm_calls}") + print(f" Tokens generated: {m.total_tokens_generated}") + if m.llm_calls > 0: + avg_tokens = m.total_tokens_generated / m.llm_calls + print(f" Avg tokens/call: {avg_tokens:.0f}") + + +def print_summary_stats(m: IndexMetrics) -> None: + """Print summary generation success/failure.""" + total = m.summaries_generated + m.summaries_failed + print(f" Summaries ok: {m.summaries_generated}") + print(f" Summaries failed: {m.summaries_failed}") + if total > 0: + success_rate = m.summaries_generated / total * 100 + print(f" Success rate: {success_rate:.1f}%") + + +def print_reasoning_index(m: IndexMetrics) -> None: + """Print reasoning index statistics.""" + print(f" Nodes processed: {m.nodes_processed}") + print(f" Topics indexed: {m.topics_indexed}") + print(f" Keywords indexed: {m.keywords_indexed}") + + +def print_full_report(item: IndexItem) -> None: + """Print a full metrics report for an indexed item.""" + m = item.metrics + print(f" Document: {item.name} ({item.format})") + if m is None: + print(" (no metrics)") + return + + print(f" Total time: {m.total_time_ms} ms") + print(f" repr: {repr(m)}") + + print() + print(" 
Pipeline stages:") + print_pipeline_breakdown(m) + + print() + print(" LLM usage:") + print_llm_stats(m) + + print() + print(" Summary generation:") + print_summary_stats(m) + + print() + print(" Reasoning index:") + print_reasoning_index(m) + + +async def main() -> None: + engine = Engine( + workspace=WORKSPACE, + api_key=API_KEY, + model=MODEL, + endpoint=ENDPOINT, + ) + + # ================================================================ + # 1. Index a simple document WITHOUT summaries + # ================================================================ + print("=" * 55) + print(" Run 1: Simple doc, summaries OFF") + print("=" * 55) + + opts_no_summary = IndexOptions( + generate_summaries=False, + generate_description=False, + ) + result = await engine.index( + IndexContext.from_content(SIMPLE_DOC, "markdown") + .with_name("simple_no_summary") + .with_options(opts_no_summary) + ) + item = result.items[0] + print_full_report(item) + doc_id_1 = item.doc_id + print() + + # ================================================================ + # 2. Index the same simple document WITH summaries + # ================================================================ + print("=" * 55) + print(" Run 2: Simple doc, summaries ON") + print("=" * 55) + + opts_with_summary = IndexOptions( + generate_summaries=True, + generate_description=True, + ) + result = await engine.index( + IndexContext.from_content(SIMPLE_DOC, "markdown") + .with_name("simple_with_summary") + .with_options(opts_with_summary) + ) + item = result.items[0] + print_full_report(item) + doc_id_2 = item.doc_id + print() + + # ================================================================ + # 3. 
Compare: summaries OFF vs ON for the simple doc + # ================================================================ + m_off = (await engine.list())[0] # first indexed + # Find the second document's metrics via a fresh index + # (We already have both items above; let's compare directly) + + # ================================================================ + # 4. Index a complex document WITH summaries + # ================================================================ + print("=" * 55) + print(" Run 3: Complex doc, summaries ON") + print("=" * 55) + + result = await engine.index( + IndexContext.from_content(COMPLEX_DOC, "markdown") + .with_name("complex_with_summary") + .with_options(opts_with_summary) + ) + item = result.items[0] + print_full_report(item) + doc_id_3 = item.doc_id + print() + + # ================================================================ + # 5. Summary table + # ================================================================ + print("=" * 55) + print(" Comparison table") + print("=" * 55) + + docs = await engine.list() + for doc in docs: + print(f" {doc.name:<30} id={doc.id[:8]}...") + if doc.description: + print(f" description: {doc.description[:80]}") + + # ================================================================ + # Cleanup + # ================================================================ + print() + cleared = await engine.clear() + print(f"Cleaned up {cleared} document(s).") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/pdf_indexing/README.md b/examples/pdf_indexing/README.md new file mode 100644 index 00000000..cfee9a95 --- /dev/null +++ b/examples/pdf_indexing/README.md @@ -0,0 +1,27 @@ +# PDF Indexing Example + +Demonstrates indexing a PDF file, inspecting indexing metrics, and querying. 
+ +## Setup + +```bash +pip install vectorless +``` + +## Run + +```bash +# Use the sample PDF from the repository +python main.py + +# Or specify your own PDF file +python main.py /path/to/document.pdf +``` + +## Environment Variables + +| Variable | Description | Default | +|------------------------|----------------------|-----------| +| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | +| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | +| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/pdf_indexing/main.py b/examples/pdf_indexing/main.py new file mode 100644 index 00000000..e79b6db5 --- /dev/null +++ b/examples/pdf_indexing/main.py @@ -0,0 +1,126 @@ +""" +PDF indexing example -- demonstrates indexing PDF files and inspecting metrics. + +Usage: + pip install vectorless + python main.py [path/to/file.pdf] + +If no path is given, uses the sample PDF in the repository. +""" + +import asyncio +import os +import sys + +from vectorless import ( + Engine, + IndexContext, + IndexItem, + IndexMetrics, + IndexOptions, + QueryContext, + VectorlessError, +) + +# --- Configuration --- +API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") +MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") +ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) +WORKSPACE = "./workspace" + +# Resolve the sample PDF path relative to the repo root +SAMPLE_PDF = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "samples", + "Docker_Cheat_Sheet.pdf", +) + + +def print_separator(title: str) -> None: + print(f"\n{'=' * 40}") + print(f" {title}") + print(f"{'=' * 40}") + + +def print_metrics(item: IndexItem) -> None: + """Pretty-print indexing metrics for a single item.""" + m: IndexMetrics | None = item.metrics + if m is None: + print(" (no metrics available)") + return + + print(f" Total time: {m.total_time_ms:>6} ms") + print(f" Parse time: {m.parse_time_ms:>6} ms") + print(f" Build time: {m.build_time_ms:>6} ms") + 
print(f" Enhance time: {m.enhance_time_ms:>6} ms") + print(f" Nodes processed: {m.nodes_processed:>6}") + print(f" Summaries ok: {m.summaries_generated:>6}") + print(f" Summaries failed: {m.summaries_failed:>6}") + print(f" LLM calls: {m.llm_calls:>6}") + print(f" Tokens generated: {m.total_tokens_generated:>6}") + print(f" Topics indexed: {m.topics_indexed:>6}") + print(f" Keywords indexed: {m.keywords_indexed:>6}") + + +async def main() -> None: + pdf_path = sys.argv[1] if len(sys.argv) > 1 else SAMPLE_PDF + + if not os.path.isfile(pdf_path): + print(f"Error: file not found: {pdf_path}") + sys.exit(1) + + engine = Engine( + workspace=WORKSPACE, + api_key=API_KEY, + model=MODEL, + endpoint=ENDPOINT, + ) + + # ---- Index with description + summaries enabled ---- + print_separator("Indexing PDF") + + options = IndexOptions(generate_summaries=True, generate_description=True) + ctx = IndexContext.from_path(pdf_path).with_options(options) + + try: + result = await engine.index(ctx) + except VectorlessError as e: + print(f"Indexing failed: [{e.kind}] {e.message}") + return + + if result.has_failures(): + for f in result.failed: + print(f" Failed: {f.source} -- {f.error}") + return + + doc_id = result.doc_id + print(f" doc_id: {doc_id}") + + for item in result.items: + print(f"\n Item: {item.name} ({item.format})") + if item.page_count is not None: + print(f" Pages: {item.page_count}") + if item.description: + print(f" Description: {item.description[:120]}...") + print_metrics(item) + + # ---- Query the PDF ---- + print_separator("Query") + + answer = await engine.query( + QueryContext("What is this document about?").with_doc_id(doc_id) + ) + item = answer.single() + if item: + print(f" Score: {item.score:.2f}") + print(f" Nodes: {item.node_ids}") + print(f" Content: {item.content[:300]}...") + + # ---- Cleanup ---- + print_separator("Cleanup") + removed = await engine.clear() + print(f" Removed {removed} document(s)") + + +if __name__ == "__main__": + 
asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml index 23bffc7d..4951e3e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "vectorless" -version = "0.1.4" +version = "0.1.5" description = "Hierarchical document intelligence without vectors" readme = "README.md" requires-python = ">=3.9" diff --git a/python/src/lib.rs b/python/src/lib.rs index a8b32439..a5da45bb 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -14,6 +14,7 @@ use ::vectorless::client::{ IndexMode, IndexOptions, IndexResult, QueryContext, QueryResult, QueryResultItem, }; use ::vectorless::error::Error as RustError; +use ::vectorless::metrics::IndexMetrics; // ============================================================ // Error Types @@ -477,6 +478,95 @@ impl PyQueryResult { } } +// ============================================================ +// IndexMetrics +// ============================================================ + +/// Indexing pipeline metrics. +#[pyclass(name = "IndexMetrics")] +pub struct PyIndexMetrics { + inner: IndexMetrics, +} + +#[pymethods] +impl PyIndexMetrics { + /// Total indexing time (ms). + #[getter] + fn total_time_ms(&self) -> u64 { + self.inner.total_time_ms() + } + + /// Parse stage duration (ms). + #[getter] + fn parse_time_ms(&self) -> u64 { + self.inner.parse_time_ms + } + + /// Build stage duration (ms). + #[getter] + fn build_time_ms(&self) -> u64 { + self.inner.build_time_ms + } + + /// Enhance (summary) stage duration (ms). + #[getter] + fn enhance_time_ms(&self) -> u64 { + self.inner.enhance_time_ms + } + + /// Number of nodes processed. + #[getter] + fn nodes_processed(&self) -> usize { + self.inner.nodes_processed + } + + /// Number of summaries successfully generated. + #[getter] + fn summaries_generated(&self) -> usize { + self.inner.summaries_generated + } + + /// Number of summaries that failed to generate. 
+ #[getter] + fn summaries_failed(&self) -> usize { + self.inner.summaries_failed + } + + /// Number of LLM calls made. + #[getter] + fn llm_calls(&self) -> usize { + self.inner.llm_calls + } + + /// Total tokens generated by LLM. + #[getter] + fn total_tokens_generated(&self) -> usize { + self.inner.total_tokens_generated + } + + /// Number of topics in reasoning index. + #[getter] + fn topics_indexed(&self) -> usize { + self.inner.topics_indexed + } + + /// Number of keywords in reasoning index. + #[getter] + fn keywords_indexed(&self) -> usize { + self.inner.keywords_indexed + } + + fn __repr__(&self) -> String { + format!( + "IndexMetrics(total={}ms, summaries={}, failed={}, llm_calls={})", + self.inner.total_time_ms(), + self.inner.summaries_generated, + self.inner.summaries_failed, + self.inner.llm_calls, + ) + } +} + // ============================================================ // IndexItem / IndexResult // ============================================================ @@ -514,6 +604,12 @@ impl PyIndexItem { self.inner.page_count } + /// Indexing pipeline metrics (timing, LLM usage, etc.). + #[getter] + fn metrics(&self) -> Option { + self.inner.metrics.as_ref().map(|m| PyIndexMetrics { inner: m.clone() }) + } + fn __repr__(&self) -> String { format!( "IndexItem(doc_id='{}', name='{}')", @@ -1076,6 +1172,7 @@ fn _vectorless(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/rust/examples/index_pdf.rs b/rust/examples/index_pdf.rs index c7840e14..b370b39d 100644 --- a/rust/examples/index_pdf.rs +++ b/rust/examples/index_pdf.rs @@ -36,9 +36,19 @@ async fn main() -> vectorless::Result<()> { println!("=== Indexing PDF: {} ===\n", pdf_path); - // Build engine with LLM configuration from environment or defaults. 
- let api_key = std::env::var("LLM_API_KEY") - .unwrap_or_else(|_| "sk-or-v1-...".to_string()); + // LLM configuration is required — set these environment variables: + // LLM_API_KEY — your API key (required) + // LLM_MODEL — model name (default: google/gemini-3-flash-preview) + // LLM_ENDPOINT — API endpoint (default: http://localhost:4000/api/v1) + let api_key = match std::env::var("LLM_API_KEY") { + Ok(key) => key, + Err(_) => { + eprintln!("Error: LLM_API_KEY environment variable is required."); + eprintln!("Set it before running:"); + eprintln!(" LLM_API_KEY=sk-xxx cargo run --example index_pdf -- "); + std::process::exit(1); + } + }; let model = std::env::var("LLM_MODEL") .unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string()); let endpoint = std::env::var("LLM_ENDPOINT") @@ -83,6 +93,7 @@ async fn main() -> vectorless::Result<()> { println!(" enhance: {}ms", metrics.enhance_time_ms); println!(" nodes: {}", metrics.nodes_processed); println!(" summaries: {}", metrics.summaries_generated); + println!(" failed: {}", metrics.summaries_failed); println!(" llm calls: {}", metrics.llm_calls); println!(" tokens: {}", metrics.total_tokens_generated); println!(" topics: {}", metrics.topics_indexed); diff --git a/rust/src/index/parse/toc/assigner.rs b/rust/src/index/parse/toc/assigner.rs index beff3021..52d50403 100644 --- a/rust/src/index/parse/toc/assigner.rs +++ b/rust/src/index/parse/toc/assigner.rs @@ -4,7 +4,7 @@ //! Page assigner - assigns physical page numbers to TOC entries. 
 use std::collections::HashMap;
-use futures::future::join_all;
+use futures::stream::{self, StreamExt};
 use tracing::{debug, info};

 use crate::config::LlmConfig;
@@ -175,7 +175,10 @@ impl PageAssigner {
             })
             .collect();

-        let verified_offsets = join_all(futures).await;
+        let verified_offsets: Vec<_> = stream::iter(futures)
+            .buffer_unordered(5)
+            .collect()
+            .await;

         // Calculate the mode (most common offset)
         let successful: Vec<_> = verified_offsets
@@ -265,29 +268,32 @@ Reply in JSON format:
         Ok(result.page)
     }

-    /// Assign pages using LLM for each entry (concurrently).
+    /// Assign pages using LLM for each entry (with bounded concurrency).
     async fn assign_with_llm(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
         info!("Assigning pages using LLM positioning");

         let client = self.client.clone();
         let pages_owned = pages.to_vec();
+        let total = entries.len();

-        // Launch all entry searches concurrently
-        let futures: Vec<_> = entries
-            .iter()
-            .map(|entry| {
-                let title = entry.title.clone();
-                let client = client.clone();
-                let pages = pages_owned.clone();
+        // Launch entry searches with bounded concurrency to avoid rate limiting
+        let futures: Vec<_> = entries.iter().map(|entry| {
+            let title = entry.title.clone();
+            let client = client.clone();
+            let pages = pages_owned.clone();

-                async move {
-                    let groups = Self::group_pages_owned(&pages, 5);
-                    Self::locate_title_in_groups_static(&client, &title, &groups).await
-                }
-            })
-            .collect();
+            async move {
+                let groups = Self::group_pages_owned(&pages, 5);
+                Self::locate_title_in_groups_static(&client, &title, &groups).await
+            }
+        }).collect();
+
+        let results: Vec<_> = stream::iter(futures)
+            .buffered(5)
+            .collect()
+            .await;

-        let results = join_all(futures).await;
+        info!("Assigned pages for {}/{} entries", results.len(), total);

         // Write results back
         for (entry, result) in entries.iter_mut().zip(results.into_iter()) {
diff --git a/rust/src/index/parse/toc/processor.rs 
b/rust/src/index/parse/toc/processor.rs index 9ed2c95b..8e5f59b0 100644 --- a/rust/src/index/parse/toc/processor.rs +++ b/rust/src/index/parse/toc/processor.rs @@ -7,7 +7,7 @@ //! degradation: if one mode fails verification, it falls back to a lower-quality //! but more reliable mode. -use futures::future::join_all; +use futures::stream::{self, StreamExt}; use tracing::{debug, info, warn}; use crate::error::Result; @@ -505,7 +505,10 @@ impl TocProcessor { }) .collect(); - let extraction_results = join_all(oversized_futures).await; + let extraction_results: Vec<_> = stream::iter(oversized_futures) + .buffer_unordered(3) + .collect() + .await; // Build a lookup from index → refined sub-entries let mut refined_map = std::collections::HashMap::new(); diff --git a/rust/src/index/parse/toc/repairer.rs b/rust/src/index/parse/toc/repairer.rs index 51931674..13c19877 100644 --- a/rust/src/index/parse/toc/repairer.rs +++ b/rust/src/index/parse/toc/repairer.rs @@ -3,7 +3,7 @@ //! Index repairer - fixes incorrect TOC entry page assignments. -use futures::future::join_all; +use futures::stream::{self, StreamExt}; use tracing::{debug, info}; use crate::config::LlmConfig; @@ -63,7 +63,7 @@ impl IndexRepairer { Self::new(RepairerConfig::default()) } - /// Repair incorrect entries concurrently. + /// Repair incorrect entries with bounded concurrency. pub async fn repair( &self, entries: &mut [TocEntry], @@ -107,7 +107,10 @@ impl IndexRepairer { }) .collect(); - let results = join_all(tasks).await; + let results: Vec<_> = stream::iter(tasks) + .buffer_unordered(5) + .collect() + .await; // Apply repairs let mut repaired_count = 0; diff --git a/rust/src/index/parse/toc/structure_extractor.rs b/rust/src/index/parse/toc/structure_extractor.rs index 17511b36..be2486d9 100644 --- a/rust/src/index/parse/toc/structure_extractor.rs +++ b/rust/src/index/parse/toc/structure_extractor.rs @@ -7,6 +7,7 @@ //! module uses LLM to analyse page content and extract the document's //! 
hierarchical structure directly. +use futures::stream::{self, StreamExt}; use tracing::{debug, info, warn}; use crate::config::LlmConfig; @@ -40,6 +41,7 @@ impl Default for StructureExtractorConfig { } /// A group of consecutive pages with their combined text. +#[derive(Clone)] struct PageGroup { /// Combined text with page markers: `\n...\n`. text: String, @@ -77,42 +79,102 @@ impl StructureExtractor { } /// Extract hierarchical structure from all pages. + /// + /// The first page group is processed alone (initial structure), then all + /// remaining groups are processed in parallel, each using the initial + /// entries as context. Results are merged and deduplicated. pub async fn extract(&self, pages: &[PdfPage]) -> Result> { if pages.is_empty() { return Ok(Vec::new()); } let groups = self.group_pages(pages); + let page_count = pages.len(); info!( "Extracting structure from {} pages in {} groups", - pages.len(), + page_count, groups.len() ); - let mut all_entries = Vec::new(); - let page_count = pages.len(); + // Phase 1: Generate initial structure from first group + let initial_entries = self.generate_initial(&groups[0]).await?; + debug!( + "Initial group (pages {}-{}): extracted {} entries", + groups[0].start_page, + groups[0].end_page, + initial_entries.len() + ); - for (i, group) in groups.iter().enumerate() { - let group_entries = if i == 0 { - self.generate_initial(group).await? - } else { - self.generate_continuation(group, &all_entries).await? - }; + if groups.len() == 1 { + return Ok(Self::finalize_entries(initial_entries, page_count)); + } - debug!( - "Group {}/{} (pages {}-{}): extracted {} entries", - i + 1, - groups.len(), - group.start_page, - group.end_page, - group_entries.len() - ); + // Phase 2: Process remaining groups in parallel (bounded concurrency) + // Each continuation group uses the initial entries as shared context. 
+ let client = self.client.clone(); + let initial_entries_ref = &initial_entries; - all_entries.extend(group_entries); + let continuation_futures: Vec<_> = groups[1..] + .iter() + .map(|group| { + let group = group.clone(); + let client = client.clone(); + let initial = initial_entries_ref.to_vec(); + + async move { + let result = Self::generate_continuation_with_client( + &client, &group, &initial, + ) + .await; + (group.start_page, group.end_page, result) + } + }) + .collect(); + + let continuation_results: Vec<_> = stream::iter(continuation_futures) + .buffer_unordered(5) + .collect() + .await; + + // Phase 3: Merge initial + continuation entries + let mut all_entries = initial_entries; + for (start, end, result) in continuation_results { + match result { + Ok(entries) => { + debug!( + "Continuation group (pages {}-{}): extracted {} entries", + start, + end, + entries.len() + ); + all_entries.extend(entries); + } + Err(e) => { + warn!( + "Continuation group (pages {}-{}) failed: {}", + start, end, e + ); + } + } } - // Truncate physical_page values that exceed document length - for entry in &mut all_entries { + // Phase 4: Sort by page number, deduplicate, truncate + all_entries.sort_by(|a, b| { + a.physical_page + .unwrap_or(0) + .cmp(&b.physical_page.unwrap_or(0)) + }); + all_entries.dedup_by(|a, b| { + a.title.trim() == b.title.trim() + && a.physical_page == b.physical_page + }); + + Ok(Self::finalize_entries(all_entries, page_count)) + } + + /// Truncate out-of-range page numbers and log stats. + fn finalize_entries(mut entries: Vec, page_count: usize) -> Vec { + for entry in &mut entries { if let Some(p) = entry.physical_page { if p > page_count { warn!( @@ -123,9 +185,8 @@ impl StructureExtractor { } } } - - info!("Structure extraction complete: {} entries", all_entries.len()); - Ok(all_entries) + info!("Structure extraction complete: {} entries", entries.len()); + entries } /// Group pages by estimated token count. 
@@ -267,6 +328,63 @@ If no new structural elements are found, return: []"#, }) .collect()) } + + /// Static version of continuation generation for parallel use. + /// + /// Uses an owned `LlmClient` reference instead of `&self`. + async fn generate_continuation_with_client( + client: &LlmClient, + group: &PageGroup, + previous: &[TocEntry], + ) -> Result> { + let system = STRUCTURE_EXTRACTION_SYSTEM_PROMPT; + + let prev_summary = previous + .iter() + .rev() + .take(10) + .rev() + .map(|e| { + format!( + " {{\"title\": \"{}\", \"level\": {}, \"physical_page\": {}}}", + e.title, + e.level, + e.physical_page.unwrap_or(0) + ) + }) + .collect::>() + .join(",\n"); + + let user = format!( + r#"Previously extracted structure: +[ +{} +] + +Continue extracting structure from these pages: +{} + +Return ONLY the NEW entries (do not repeat previous ones): +[ + {{"title": "...", "level": N, "physical_page": M}}, + ... +] + +If no new structural elements are found, return: []"#, + prev_summary, group.text + ); + + let sections: Vec = client.complete_json(system, &user).await?; + + Ok(sections + .into_iter() + .map(|s| { + TocEntry::new(s.title, s.level) + .with_physical_page(s.physical_page) + .with_confidence(0.7) + }) + .collect()) + } } /// Format pages into tagged text for LLM consumption. diff --git a/rust/src/index/parse/toc/verifier.rs b/rust/src/index/parse/toc/verifier.rs index 09b28059..fd944386 100644 --- a/rust/src/index/parse/toc/verifier.rs +++ b/rust/src/index/parse/toc/verifier.rs @@ -3,7 +3,7 @@ //! Index verifier - verifies TOC entry page assignments. -use futures::future::join_all; +use futures::stream::{self, StreamExt}; use rand::seq::SliceRandom; use tracing::{debug, info}; @@ -65,7 +65,7 @@ impl IndexVerifier { /// Verify TOC entries against PDF pages. /// - /// All sample entries are verified concurrently via LLM calls. + /// Sample entries are verified via LLM calls with bounded concurrency. 
pub async fn verify( &self, entries: &[TocEntry], @@ -77,36 +77,36 @@ impl IndexVerifier { let sample = self.select_sample(entries); - // Launch all verification checks concurrently + // Launch verification checks with bounded concurrency let client = self.client.clone(); - let futures: Vec<_> = sample - .iter() - .map(|(index, entry)| { - let index = *index; - let title = entry.title.clone(); - let physical_page = entry.physical_page; - let client = client.clone(); - let pages = pages.to_vec(); - - async move { - match physical_page { - Some(page) => { - let result = - Self::verify_entry_with_client(&client, &title, page, &pages).await; - (index, title, page, result) - } - None => ( - index, - title, - 0, - Ok(Err(ErrorType::PageOutOfRange)), - ), + let futures: Vec<_> = sample.iter().map(|(index, entry)| { + let index = *index; + let title = entry.title.clone(); + let physical_page = entry.physical_page; + let client = client.clone(); + let pages = pages.to_vec(); + + async move { + match physical_page { + Some(page) => { + let result = + Self::verify_entry_with_client(&client, &title, page, &pages).await; + (index, title, page, result) } + None => ( + index, + title, + 0, + Ok(Err(ErrorType::PageOutOfRange)), + ), } - }) - .collect(); + } + }).collect(); - let results = join_all(futures).await; + let results: Vec<_> = stream::iter(futures) + .buffer_unordered(5) + .collect() + .await; // Aggregate results let total = results.len(); diff --git a/rust/src/index/stages/enhance.rs b/rust/src/index/stages/enhance.rs index 5550de45..a79b5fb3 100644 --- a/rust/src/index/stages/enhance.rs +++ b/rust/src/index/stages/enhance.rs @@ -313,6 +313,9 @@ impl IndexStage for EnhanceStage { let duration = start.elapsed().as_millis() as u64; ctx.metrics.record_enhance(duration); + if failed > 0 { + ctx.metrics.add_summaries_failed(failed); + } info!( "Generated {} summaries ({} shortcut, {} failed, {} skipped no content, {} skipped tokens) in {}ms", diff --git a/rust/src/lib.rs 
b/rust/src/lib.rs index 92e6a4cc..689d331f 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -54,7 +54,7 @@ pub mod graph; mod index; mod llm; mod memo; -mod metrics; +pub mod metrics; mod retrieval; mod storage; mod throttle; @@ -62,8 +62,9 @@ mod utils; // Client API pub use client::{ - BuildError, ClientError, DocumentFormat, DocumentInfo, Engine, EngineBuilder, IndexContext, - IndexItem, IndexMode, IndexOptions, IndexResult, QueryContext, QueryResult, + BuildError, ClientError, DocumentFormat, DocumentInfo, Engine, EngineBuilder, FailedItem, + IndexContext, IndexItem, IndexMode, IndexOptions, IndexResult, QueryContext, QueryResult, + QueryResultItem, }; // Error types diff --git a/rust/src/metrics/index.rs b/rust/src/metrics/index.rs index 58054661..4432e32f 100644 --- a/rust/src/metrics/index.rs +++ b/rust/src/metrics/index.rs @@ -64,6 +64,10 @@ pub struct IndexMetrics { #[serde(default)] pub summaries_generated: usize, + /// Number of summaries that failed to generate (LLM error, rate limit, etc.). + #[serde(default)] + pub summaries_failed: usize, + /// Number of nodes skipped (thinning). #[serde(default)] pub nodes_skipped: usize, @@ -141,6 +145,11 @@ impl IndexMetrics { self.summaries_generated += 1; } + /// Add to summaries failed count. + pub fn add_summaries_failed(&mut self, count: usize) { + self.summaries_failed += count; + } + /// Increment nodes skipped. pub fn increment_nodes_skipped(&mut self) { self.nodes_skipped += 1;