diff --git a/Cargo.toml b/Cargo.toml index 6eeec14e..f94e1c2b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["rust", "python"] resolver = "2" [workspace.package] -version = "0.1.25" +version = "0.1.26" edition = "2024" authors = ["zTgx "] license = "Apache-2.0" diff --git a/examples/batch_indexing/README.md b/examples/batch_indexing/README.md new file mode 100644 index 00000000..41e87fae --- /dev/null +++ b/examples/batch_indexing/README.md @@ -0,0 +1,28 @@ +# Batch Indexing Example + +Demonstrates indexing multiple documents at once using: +- `from_paths` -- explicit list of file paths +- `from_dir` -- all supported files in a directory +- `from_bytes` -- raw in-memory content + +Also shows cross-document querying with `with_doc_ids`. + +## Setup + +```bash +pip install vectorless +``` + +## Run + +```bash +python main.py +``` + +## Environment Variables + +| Variable | Description | Default | +|------------------------|----------------------|-----------| +| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | +| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | +| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/batch_indexing/main.py b/examples/batch_indexing/main.py new file mode 100644 index 00000000..7d6d03cb --- /dev/null +++ b/examples/batch_indexing/main.py @@ -0,0 +1,183 @@ +""" +Batch indexing example -- demonstrates indexing multiple documents at once +using from_paths, from_dir, and from_bytes. 
+ +Usage: + pip install vectorless + python main.py +""" + +import asyncio +import os + +from vectorless import ( + Engine, + IndexContext, + IndexOptions, + QueryContext, + VectorlessError, +) + +# --- Configuration --- +API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") +MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") +ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) +WORKSPACE = "./workspace" + +# Sample documents for demonstration +DOCS = { + "alpha.md": """\ +# Alpha Report + +## Summary + +Alpha is a distributed key-value store designed for low-latency reads. +It uses a log-structured merge tree for storage. + +## Architecture + +Write requests go through a write-ahead log, then are buffered in memory. +When the buffer is full, it is flushed to disk as an immutable SSTable. +""", + "beta.md": """\ +# Beta Report + +## Summary + +Beta is a stream processing engine that consumes events from Kafka topics +and applies real-time transformations using a DAG-based execution model. + +## Performance + +Beta processes up to 2 million events per second per node on commodity hardware. +""", + "gamma.md": """\ +# Gamma Report + +## Summary + +Gamma is a feature store that bridges the gap between offline feature +computation and online serving. Features are computed in Spark and served +via a low-latency gRPC endpoint. + +## Integration + +Gamma integrates with Alpha for feature metadata storage and Beta for +real-time feature updates. 
+""", +} + + +def write_sample_docs(base_dir: str) -> list[str]: + """Write sample markdown files and return their paths.""" + paths = [] + for name, content in DOCS.items(): + path = os.path.join(base_dir, name) + with open(path, "w") as f: + f.write(content) + paths.append(path) + return paths + + +async def main() -> None: + engine = Engine( + workspace=WORKSPACE, + api_key=API_KEY, + model=MODEL, + endpoint=ENDPOINT, + ) + + # Create a temp directory with sample documents + docs_dir = "./batch_docs" + os.makedirs(docs_dir, exist_ok=True) + paths = write_sample_docs(docs_dir) + + # ---- 1. Index multiple files at once via from_paths ---- + print("=" * 50) + print(" from_paths -- index a list of files") + print("=" * 50) + + ctx = IndexContext.from_paths(paths) + result = await engine.index(ctx) + + print(f" Indexed {len(result.items)} document(s)") + for item in result.items: + print(f" - {item.name} ({item.doc_id[:8]}...)") + if result.has_failures(): + for f in result.failed: + print(f" ! Failed: {f.source} -- {f.error}") + print() + + doc_ids = [item.doc_id for item in result.items] + + # ---- 2. Query across all batch-indexed documents ---- + print("=" * 50) + print(" Query across multiple documents") + print("=" * 50) + + answer = await engine.query( + QueryContext( + "Which system processes the most events per second?" + ).with_doc_ids(doc_ids) + ) + for item in answer.items: + print(f" [{item.doc_id[:8]}...] score={item.score:.2f}") + print(f" {item.content[:200]}...") + print() + + # ---- 3. 
Index a directory via from_dir ---- + print("=" * 50) + print(" from_dir -- index all supported files in a directory") + print("=" * 50) + + # Clear first so we see fresh results + await engine.clear() + + ctx = IndexContext.from_dir(docs_dir).with_options( + IndexOptions(generate_summaries=True, generate_description=True) + ) + result = await engine.index(ctx) + + print(f" Indexed {len(result.items)} document(s)") + for item in result.items: + desc = item.description[:80] if item.description else "N/A" + print(f" - {item.name}: {desc}...") + print() + + # ---- 4. Index from raw bytes via from_bytes ---- + print("=" * 50) + print(" from_bytes -- index in-memory content") + print("=" * 50) + + md_bytes = b"""# Delta Notes + +## Key Points + +- Delta uses CRDTs for conflict-free replication. +- Writes are locally committed then asynchronously propagated. +- Read repair ensures eventual consistency across all replicas. +""" + + ctx = IndexContext.from_bytes(md_bytes, "markdown").with_name("delta") + result = await engine.index(ctx) + + print(f" Indexed: {result.doc_id}") + print() + + # ---- Cleanup ---- + print("=" * 50) + print(" Cleanup") + print("=" * 50) + + removed = await engine.clear() + print(f" Removed {removed} document(s)") + + # Remove temp files + for p in paths: + os.remove(p) + os.rmdir(docs_dir) + print(f" Cleaned up {docs_dir}/") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/document_management/README.md b/examples/document_management/README.md new file mode 100644 index 00000000..e41148e0 --- /dev/null +++ b/examples/document_management/README.md @@ -0,0 +1,28 @@ +# Document Management Example + +Demonstrates CRUD operations on indexed documents: + +- `engine.list()` -- list all documents +- `engine.exists(doc_id)` -- check if a document exists +- `engine.remove(doc_id)` -- remove a single document +- `engine.clear()` -- remove all documents + +## Setup + +```bash +pip install vectorless +``` + +## Run + +```bash 
+python main.py +``` + +## Environment Variables + +| Variable | Description | Default | +|------------------------|----------------------|-----------| +| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | +| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | +| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/document_management/main.py b/examples/document_management/main.py new file mode 100644 index 00000000..f5d72360 --- /dev/null +++ b/examples/document_management/main.py @@ -0,0 +1,135 @@ +""" +Document management example -- demonstrates CRUD operations on indexed documents: +list, exists, remove, and clear. + +Usage: + pip install vectorless + python main.py +""" + +import asyncio +import os + +from vectorless import ( + Engine, + IndexContext, + QueryContext, + VectorlessError, +) + +# --- Configuration --- +API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") +MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") +ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) +WORKSPACE = "./workspace" + +# Sample documents +SAMPLE_A = """\ +# Project Alpha + +## Overview + +Project Alpha is a next-generation database engine written in Rust. +It supports ACID transactions and serializable isolation. + +## Features + +- MVCC concurrency control +- B-tree and LSM storage engines +- Query planner with cost-based optimization +""" + +SAMPLE_B = """\ +# Project Beta + +## Overview + +Project Beta is a web framework for building real-time applications. +It uses WebSocket-based communication and server-side rendering. 
+ +## Features + +- Hot module reloading +- Built-in authentication middleware +- Automatic code splitting +""" + + +async def main() -> None: + engine = Engine( + workspace=WORKSPACE, + api_key=API_KEY, + model=MODEL, + endpoint=ENDPOINT, + ) + + # ---- Index two documents ---- + print("Indexing two documents...") + + result_a = await engine.index( + IndexContext.from_content(SAMPLE_A, "markdown").with_name("alpha") + ) + doc_id_a = result_a.doc_id + print(f" A: {doc_id_a}") + + result_b = await engine.index( + IndexContext.from_content(SAMPLE_B, "markdown").with_name("beta") + ) + doc_id_b = result_b.doc_id + print(f" B: {doc_id_b}") + print() + + # ---- list() -- show all indexed documents ---- + print("--- list() ---") + docs = await engine.list() + for doc in docs: + pages = f", pages={doc.page_count}" if doc.page_count else "" + lines = f", lines={doc.line_count}" if doc.line_count else "" + print(f" {doc.name} id={doc.id[:8]}... format={doc.format}{pages}{lines}") + print(f" Total: {len(docs)} document(s)\n") + + # ---- exists() -- check if a document is indexed ---- + print("--- exists() ---") + for did, label in [(doc_id_a, "A"), (doc_id_b, "B"), ("nonexistent-id", "?")]: + found = await engine.exists(did) + print(f" {label}: exists={found}") + print() + + # ---- Query a specific document ---- + print("--- query(doc_id_a) ---") + answer = await engine.query( + QueryContext("What storage engines does Alpha support?").with_doc_id(doc_id_a) + ) + item = answer.single() + if item: + print(f" Score: {item.score:.2f}") + print(f" Answer: {item.content[:200]}...\n") + + # ---- remove() -- delete a single document ---- + print("--- remove(doc_id_a) ---") + removed = await engine.remove(doc_id_a) + print(f" Removed A: {removed}") + + # Verify it's gone + exists_a = await engine.exists(doc_id_a) + print(f" exists(A) after removal: {exists_a}") + print() + + # ---- list() again -- only B should remain ---- + print("--- list() after removal ---") + docs = await 
engine.list() + for doc in docs: + print(f" {doc.name} id={doc.id[:8]}...") + print(f" Total: {len(docs)} document(s)\n") + + # ---- clear() -- remove all remaining documents ---- + print("--- clear() ---") + cleared = await engine.clear() + print(f" Cleared {cleared} document(s)") + + docs = await engine.list() + print(f" Remaining: {len(docs)} document(s)") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/error_handling/README.md b/examples/error_handling/README.md new file mode 100644 index 00000000..2424d618 --- /dev/null +++ b/examples/error_handling/README.md @@ -0,0 +1,33 @@ +# Error Handling Example + +Demonstrates how to catch and inspect `VectorlessError` exceptions: + +- Invalid format strings +- Invalid indexing modes +- Querying non-existent documents +- Batch indexing with partial failures +- Engine creation with invalid credentials + +The `VectorlessError` exception provides: +- `kind` -- error category (`"config"`, `"not_found"`, `"parse"`, `"llm"`, etc.) +- `message` -- human-readable error description + +## Setup + +```bash +pip install vectorless +``` + +## Run + +```bash +python main.py +``` + +## Environment Variables + +| Variable | Description | Default | +|------------------------|----------------------|-----------| +| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | +| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | +| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/error_handling/main.py b/examples/error_handling/main.py new file mode 100644 index 00000000..993814a6 --- /dev/null +++ b/examples/error_handling/main.py @@ -0,0 +1,111 @@ +""" +Error handling example -- demonstrates catching and inspecting VectorlessError. 
+ +Usage: + pip install vectorless + python main.py +""" + +import asyncio +import os + +from vectorless import ( + Engine, + IndexContext, + IndexOptions, + QueryContext, + VectorlessError, +) + +# --- Configuration --- +API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") +MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") +ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) +WORKSPACE = "./workspace" + + +async def main() -> None: + engine = Engine( + workspace=WORKSPACE, + api_key=API_KEY, + model=MODEL, + endpoint=ENDPOINT, + ) + + # ---- 1. Invalid format ---- + print("--- Invalid format in from_bytes ---") + try: + ctx = IndexContext.from_bytes(b"hello", "xml") + except VectorlessError as e: + print(f" Caught VectorlessError:") + print(f" kind: {e.kind}") + print(f" message: {e.message}") + print(f" repr: {repr(e)}") + print() + + # ---- 2. Invalid indexing mode ---- + print("--- Invalid indexing mode ---") + try: + opts = IndexOptions(mode="bad_mode") + except VectorlessError as e: + print(f" Caught VectorlessError:") + print(f" kind: {e.kind}") + print(f" message: {e.message}") + print() + + # ---- 3. Query a non-existent document ---- + print("--- Query non-existent document ---") + try: + await engine.query( + QueryContext("What is this?").with_doc_id("does-not-exist") + ) + except VectorlessError as e: + print(f" Caught VectorlessError:") + print(f" kind: {e.kind}") + print(f" message: {e.message}") + print() + + # ---- 4. 
Index with partial failure in batch ---- + print("--- Batch indexing with mixed results ---") + good = IndexContext.from_content("# Real Doc\n\nThis is valid content.", "markdown") + + result = await engine.index(good.with_name("good_doc")) + if result.has_failures(): + for f in result.failed: + print(f" Failed: {f.source} -- {f.error}") + else: + print(f" Success: {result.doc_id}") + + # Inspect individual items + for item in result.items: + print(f" Item: {item.name} ({item.format})") + if item.metrics: + m = item.metrics + print(f" Total time: {m.total_time_ms} ms, LLM calls: {m.llm_calls}") + print() + + # ---- 5. Engine creation with bad credentials ---- + print("--- Engine with invalid credentials ---") + try: + bad_engine = Engine( + workspace=WORKSPACE + "_bad", + api_key="sk-invalid-key-12345", + model="gpt-4o", + ) + # Try to use it -- the error will surface on the first LLM call + await bad_engine.index( + IndexContext.from_content("# Test\n", "markdown").with_name("fail_test") + ) + except VectorlessError as e: + print(f" Caught VectorlessError:") + print(f" kind: {e.kind}") + print(f" message: {e.message[:120]}...") + print() + + # ---- Cleanup ---- + await engine.clear() + print("Done.") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/index_metrics/README.md b/examples/index_metrics/README.md new file mode 100644 index 00000000..78bdd552 --- /dev/null +++ b/examples/index_metrics/README.md @@ -0,0 +1,42 @@ +# IndexMetrics Example + +Demonstrates how to inspect detailed indexing pipeline metrics via `IndexMetrics`. 
+ +`IndexMetrics` is attached to each `IndexItem` and provides: + +| Field | Description | +|------------------------|----------------------------------------------| +| `total_time_ms` | Total indexing time | +| `parse_time_ms` | Document parsing stage duration | +| `build_time_ms` | Tree building stage duration | +| `enhance_time_ms` | Summary/enhancement stage duration | +| `nodes_processed` | Number of tree nodes processed | +| `summaries_generated` | Successfully generated summaries | +| `summaries_failed` | Failed summary generations | +| `llm_calls` | Total LLM API calls made | +| `total_tokens_generated` | Total tokens produced by the LLM | +| `topics_indexed` | Topics added to the reasoning index | +| `keywords_indexed` | Keywords added to the reasoning index | + +This example compares documents indexed with and without summaries enabled +to show how `IndexOptions` affect pipeline stages and LLM usage. + +## Setup + +```bash +pip install vectorless +``` + +## Run + +```bash +python main.py +``` + +## Environment Variables + +| Variable | Description | Default | +|------------------------|----------------------|-----------| +| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | +| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | +| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/index_metrics/main.py b/examples/index_metrics/main.py new file mode 100644 index 00000000..3bff91cb --- /dev/null +++ b/examples/index_metrics/main.py @@ -0,0 +1,236 @@ +""" +IndexMetrics example -- demonstrates inspecting detailed indexing pipeline metrics. + +IndexMetrics exposes timing, node processing, LLM usage, and reasoning index +statistics for each indexed document. This example compares two documents with +different IndexOptions to show how options affect the pipeline. 
+ +Usage: + pip install vectorless + python main.py +""" + +import asyncio +import os + +from vectorless import ( + Engine, + IndexContext, + IndexItem, + IndexMetrics, + IndexOptions, + VectorlessError, +) + +# --- Configuration --- +API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") +MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") +ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) +WORKSPACE = "./workspace" + +# --- Sample documents with varying complexity --- +SIMPLE_DOC = """\ +# Quick Note + +This is a short note about caching strategies. +Redis is commonly used as an in-memory cache. +""" + +COMPLEX_DOC = """\ +# Distributed Systems Design Guide + +## Consensus + +Raft is a consensus algorithm designed to be easy to understand. +It elects a leader via randomized timeouts and replicates log entries +to a majority of followers before committing them. + +## Replication + +State machine replication ensures that all replicas execute the same +commands in the same order. Primary-backup replication is simpler but +provides lower availability during leader failover. + +## Partitioning + +Consistent hashing distributes keys across nodes with minimal +remapping when the cluster size changes. Virtual nodes improve balance +when the key space is small. + +## Failure Detection + +Phi accrual failure detection treats failure as a continuous suspicion +level rather than a binary alive/dead state. This reduces false +positives during transient network issues. 
+""" + + +def print_pipeline_breakdown(m: IndexMetrics) -> None: + """Print a breakdown of pipeline stages and their percentages.""" + total = m.total_time_ms + if total == 0: + print(" (no timing data)") + return + + parse_pct = m.parse_time_ms / total * 100 + build_pct = m.build_time_ms / total * 100 + enhance_pct = m.enhance_time_ms / total * 100 + other_pct = max(0, 100 - parse_pct - build_pct - enhance_pct) + + print(f" Parse: {m.parse_time_ms:>5} ms ({parse_pct:5.1f}%)") + print(f" Build: {m.build_time_ms:>5} ms ({build_pct:5.1f}%)") + print(f" Enhance: {m.enhance_time_ms:>5} ms ({enhance_pct:5.1f}%)") + print(f" Other: {total - m.parse_time_ms - m.build_time_ms - m.enhance_time_ms:>5} ms ({other_pct:5.1f}%)") + + +def print_llm_stats(m: IndexMetrics) -> None: + """Print LLM utilization statistics.""" + print(f" LLM calls: {m.llm_calls}") + print(f" Tokens generated: {m.total_tokens_generated}") + if m.llm_calls > 0: + avg_tokens = m.total_tokens_generated / m.llm_calls + print(f" Avg tokens/call: {avg_tokens:.0f}") + + +def print_summary_stats(m: IndexMetrics) -> None: + """Print summary generation success/failure.""" + total = m.summaries_generated + m.summaries_failed + print(f" Summaries ok: {m.summaries_generated}") + print(f" Summaries failed: {m.summaries_failed}") + if total > 0: + success_rate = m.summaries_generated / total * 100 + print(f" Success rate: {success_rate:.1f}%") + + +def print_reasoning_index(m: IndexMetrics) -> None: + """Print reasoning index statistics.""" + print(f" Nodes processed: {m.nodes_processed}") + print(f" Topics indexed: {m.topics_indexed}") + print(f" Keywords indexed: {m.keywords_indexed}") + + +def print_full_report(item: IndexItem) -> None: + """Print a full metrics report for an indexed item.""" + m = item.metrics + print(f" Document: {item.name} ({item.format})") + if m is None: + print(" (no metrics)") + return + + print(f" Total time: {m.total_time_ms} ms") + print(f" repr: {repr(m)}") + + print() + print(" 
Pipeline stages:") + print_pipeline_breakdown(m) + + print() + print(" LLM usage:") + print_llm_stats(m) + + print() + print(" Summary generation:") + print_summary_stats(m) + + print() + print(" Reasoning index:") + print_reasoning_index(m) + + +async def main() -> None: + engine = Engine( + workspace=WORKSPACE, + api_key=API_KEY, + model=MODEL, + endpoint=ENDPOINT, + ) + + # ================================================================ + # 1. Index a simple document WITHOUT summaries + # ================================================================ + print("=" * 55) + print(" Run 1: Simple doc, summaries OFF") + print("=" * 55) + + opts_no_summary = IndexOptions( + generate_summaries=False, + generate_description=False, + ) + result = await engine.index( + IndexContext.from_content(SIMPLE_DOC, "markdown") + .with_name("simple_no_summary") + .with_options(opts_no_summary) + ) + item = result.items[0] + print_full_report(item) + doc_id_1 = item.doc_id + print() + + # ================================================================ + # 2. Index the same simple document WITH summaries + # ================================================================ + print("=" * 55) + print(" Run 2: Simple doc, summaries ON") + print("=" * 55) + + opts_with_summary = IndexOptions( + generate_summaries=True, + generate_description=True, + ) + result = await engine.index( + IndexContext.from_content(SIMPLE_DOC, "markdown") + .with_name("simple_with_summary") + .with_options(opts_with_summary) + ) + item = result.items[0] + print_full_report(item) + doc_id_2 = item.doc_id + print() + + # ================================================================ + # 3. 
Compare: summaries OFF vs ON for the simple doc + # ================================================================ + m_off = (await engine.list())[0] # first indexed + # Find the second document's metrics via a fresh index + # (We already have both items above; let's compare directly) + + # ================================================================ + # 4. Index a complex document WITH summaries + # ================================================================ + print("=" * 55) + print(" Run 3: Complex doc, summaries ON") + print("=" * 55) + + result = await engine.index( + IndexContext.from_content(COMPLEX_DOC, "markdown") + .with_name("complex_with_summary") + .with_options(opts_with_summary) + ) + item = result.items[0] + print_full_report(item) + doc_id_3 = item.doc_id + print() + + # ================================================================ + # 5. Summary table + # ================================================================ + print("=" * 55) + print(" Comparison table") + print("=" * 55) + + docs = await engine.list() + for doc in docs: + print(f" {doc.name:<30} id={doc.id[:8]}...") + if doc.description: + print(f" description: {doc.description[:80]}") + + # ================================================================ + # Cleanup + # ================================================================ + print() + cleared = await engine.clear() + print(f"Cleaned up {cleared} document(s).") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/examples/pdf_indexing/README.md b/examples/pdf_indexing/README.md new file mode 100644 index 00000000..cfee9a95 --- /dev/null +++ b/examples/pdf_indexing/README.md @@ -0,0 +1,27 @@ +# PDF Indexing Example + +Demonstrates indexing a PDF file, inspecting indexing metrics, and querying. 
+ +## Setup + +```bash +pip install vectorless +``` + +## Run + +```bash +# Use the sample PDF from the repository +python main.py + +# Or specify your own PDF file +python main.py /path/to/document.pdf +``` + +## Environment Variables + +| Variable | Description | Default | +|------------------------|----------------------|-----------| +| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | +| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | +| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/pdf_indexing/main.py b/examples/pdf_indexing/main.py new file mode 100644 index 00000000..e79b6db5 --- /dev/null +++ b/examples/pdf_indexing/main.py @@ -0,0 +1,126 @@ +""" +PDF indexing example -- demonstrates indexing PDF files and inspecting metrics. + +Usage: + pip install vectorless + python main.py [path/to/file.pdf] + +If no path is given, uses the sample PDF in the repository. +""" + +import asyncio +import os +import sys + +from vectorless import ( + Engine, + IndexContext, + IndexItem, + IndexMetrics, + IndexOptions, + QueryContext, + VectorlessError, +) + +# --- Configuration --- +API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") +MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") +ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) +WORKSPACE = "./workspace" + +# Resolve the sample PDF path relative to the repo root +SAMPLE_PDF = os.path.join( + os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), + "samples", + "Docker_Cheat_Sheet.pdf", +) + + +def print_separator(title: str) -> None: + print(f"\n{'=' * 40}") + print(f" {title}") + print(f"{'=' * 40}") + + +def print_metrics(item: IndexItem) -> None: + """Pretty-print indexing metrics for a single item.""" + m: IndexMetrics | None = item.metrics + if m is None: + print(" (no metrics available)") + return + + print(f" Total time: {m.total_time_ms:>6} ms") + print(f" Parse time: {m.parse_time_ms:>6} ms") + print(f" Build time: {m.build_time_ms:>6} ms") + 
print(f" Enhance time: {m.enhance_time_ms:>6} ms") + print(f" Nodes processed: {m.nodes_processed:>6}") + print(f" Summaries ok: {m.summaries_generated:>6}") + print(f" Summaries failed: {m.summaries_failed:>6}") + print(f" LLM calls: {m.llm_calls:>6}") + print(f" Tokens generated: {m.total_tokens_generated:>6}") + print(f" Topics indexed: {m.topics_indexed:>6}") + print(f" Keywords indexed: {m.keywords_indexed:>6}") + + +async def main() -> None: + pdf_path = sys.argv[1] if len(sys.argv) > 1 else SAMPLE_PDF + + if not os.path.isfile(pdf_path): + print(f"Error: file not found: {pdf_path}") + sys.exit(1) + + engine = Engine( + workspace=WORKSPACE, + api_key=API_KEY, + model=MODEL, + endpoint=ENDPOINT, + ) + + # ---- Index with description + summaries enabled ---- + print_separator("Indexing PDF") + + options = IndexOptions(generate_summaries=True, generate_description=True) + ctx = IndexContext.from_path(pdf_path).with_options(options) + + try: + result = await engine.index(ctx) + except VectorlessError as e: + print(f"Indexing failed: [{e.kind}] {e.message}") + return + + if result.has_failures(): + for f in result.failed: + print(f" Failed: {f.source} -- {f.error}") + return + + doc_id = result.doc_id + print(f" doc_id: {doc_id}") + + for item in result.items: + print(f"\n Item: {item.name} ({item.format})") + if item.page_count is not None: + print(f" Pages: {item.page_count}") + if item.description: + print(f" Description: {item.description[:120]}...") + print_metrics(item) + + # ---- Query the PDF ---- + print_separator("Query") + + answer = await engine.query( + QueryContext("What is this document about?").with_doc_id(doc_id) + ) + item = answer.single() + if item: + print(f" Score: {item.score:.2f}") + print(f" Nodes: {item.node_ids}") + print(f" Content: {item.content[:300]}...") + + # ---- Cleanup ---- + print_separator("Cleanup") + removed = await engine.clear() + print(f" Removed {removed} document(s)") + + +if __name__ == "__main__": + 
asyncio.run(main()) diff --git a/pyproject.toml b/pyproject.toml index 23bffc7d..4951e3e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "vectorless" -version = "0.1.4" +version = "0.1.5" description = "Hierarchical document intelligence without vectors" readme = "README.md" requires-python = ">=3.9" diff --git a/python/src/lib.rs b/python/src/lib.rs index a8b32439..a5da45bb 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -14,6 +14,7 @@ use ::vectorless::client::{ IndexMode, IndexOptions, IndexResult, QueryContext, QueryResult, QueryResultItem, }; use ::vectorless::error::Error as RustError; +use ::vectorless::metrics::IndexMetrics; // ============================================================ // Error Types @@ -477,6 +478,95 @@ impl PyQueryResult { } } +// ============================================================ +// IndexMetrics +// ============================================================ + +/// Indexing pipeline metrics. +#[pyclass(name = "IndexMetrics")] +pub struct PyIndexMetrics { + inner: IndexMetrics, +} + +#[pymethods] +impl PyIndexMetrics { + /// Total indexing time (ms). + #[getter] + fn total_time_ms(&self) -> u64 { + self.inner.total_time_ms() + } + + /// Parse stage duration (ms). + #[getter] + fn parse_time_ms(&self) -> u64 { + self.inner.parse_time_ms + } + + /// Build stage duration (ms). + #[getter] + fn build_time_ms(&self) -> u64 { + self.inner.build_time_ms + } + + /// Enhance (summary) stage duration (ms). + #[getter] + fn enhance_time_ms(&self) -> u64 { + self.inner.enhance_time_ms + } + + /// Number of nodes processed. + #[getter] + fn nodes_processed(&self) -> usize { + self.inner.nodes_processed + } + + /// Number of summaries successfully generated. + #[getter] + fn summaries_generated(&self) -> usize { + self.inner.summaries_generated + } + + /// Number of summaries that failed to generate. 
+ #[getter] + fn summaries_failed(&self) -> usize { + self.inner.summaries_failed + } + + /// Number of LLM calls made. + #[getter] + fn llm_calls(&self) -> usize { + self.inner.llm_calls + } + + /// Total tokens generated by LLM. + #[getter] + fn total_tokens_generated(&self) -> usize { + self.inner.total_tokens_generated + } + + /// Number of topics in reasoning index. + #[getter] + fn topics_indexed(&self) -> usize { + self.inner.topics_indexed + } + + /// Number of keywords in reasoning index. + #[getter] + fn keywords_indexed(&self) -> usize { + self.inner.keywords_indexed + } + + fn __repr__(&self) -> String { + format!( + "IndexMetrics(total={}ms, summaries={}, failed={}, llm_calls={})", + self.inner.total_time_ms(), + self.inner.summaries_generated, + self.inner.summaries_failed, + self.inner.llm_calls, + ) + } +} + // ============================================================ // IndexItem / IndexResult // ============================================================ @@ -514,6 +604,12 @@ impl PyIndexItem { self.inner.page_count } + /// Indexing pipeline metrics (timing, LLM usage, etc.). + #[getter] + fn metrics(&self) -> Option { + self.inner.metrics.as_ref().map(|m| PyIndexMetrics { inner: m.clone() }) + } + fn __repr__(&self) -> String { format!( "IndexItem(doc_id='{}', name='{}')", @@ -1076,6 +1172,7 @@ fn _vectorless(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/rust/examples/index_pdf.rs b/rust/examples/index_pdf.rs index c7840e14..b370b39d 100644 --- a/rust/examples/index_pdf.rs +++ b/rust/examples/index_pdf.rs @@ -36,9 +36,19 @@ async fn main() -> vectorless::Result<()> { println!("=== Indexing PDF: {} ===\n", pdf_path); - // Build engine with LLM configuration from environment or defaults. 
- let api_key = std::env::var("LLM_API_KEY") - .unwrap_or_else(|_| "sk-or-v1-...".to_string()); + // LLM configuration is required — set these environment variables: + // LLM_API_KEY — your API key (required) + // LLM_MODEL — model name (default: google/gemini-3-flash-preview) + // LLM_ENDPOINT — API endpoint (default: http://localhost:4000/api/v1) + let api_key = match std::env::var("LLM_API_KEY") { + Ok(key) => key, + Err(_) => { + eprintln!("Error: LLM_API_KEY environment variable is required."); + eprintln!("Set it before running:"); + eprintln!(" LLM_API_KEY=sk-xxx cargo run --example index_pdf -- "); + std::process::exit(1); + } + }; let model = std::env::var("LLM_MODEL") .unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string()); let endpoint = std::env::var("LLM_ENDPOINT") @@ -83,6 +93,7 @@ async fn main() -> vectorless::Result<()> { println!(" enhance: {}ms", metrics.enhance_time_ms); println!(" nodes: {}", metrics.nodes_processed); println!(" summaries: {}", metrics.summaries_generated); + println!(" failed: {}", metrics.summaries_failed); println!(" llm calls: {}", metrics.llm_calls); println!(" tokens: {}", metrics.total_tokens_generated); println!(" topics: {}", metrics.topics_indexed); diff --git a/rust/src/index/parse/toc/assigner.rs b/rust/src/index/parse/toc/assigner.rs index beff3021..52d50403 100644 --- a/rust/src/index/parse/toc/assigner.rs +++ b/rust/src/index/parse/toc/assigner.rs @@ -4,7 +4,7 @@ //! Page assigner - assigns physical page numbers to TOC entries. 
 use std::collections::HashMap;
-use futures::future::join_all;
+use futures::stream::{self, StreamExt};
 use tracing::{debug, info};

 use crate::config::LlmConfig;
@@ -175,7 +175,10 @@ impl PageAssigner {
             })
             .collect();

-        let verified_offsets = join_all(futures).await;
+        let verified_offsets: Vec<_> = stream::iter(futures)
+            .buffer_unordered(5)
+            .collect()
+            .await;

         // Calculate the mode (most common offset)
         let successful: Vec<_> = verified_offsets
@@ -265,29 +268,32 @@ Reply in JSON format:
         Ok(result.page)
     }

-    /// Assign pages using LLM for each entry (concurrently).
+    /// Assign pages using LLM for each entry (with bounded concurrency).
     async fn assign_with_llm(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
         info!("Assigning pages using LLM positioning");

         let client = self.client.clone();
         let pages_owned = pages.to_vec();
+        let total = entries.len();

-        // Launch all entry searches concurrently
-        let futures: Vec<_> = entries
-            .iter()
-            .map(|entry| {
-                let title = entry.title.clone();
-                let client = client.clone();
-                let pages = pages_owned.clone();
+        // Launch entry searches with bounded concurrency to avoid rate limiting
+        let futures: Vec<_> = entries.iter().map(|entry| {
+            let title = entry.title.clone();
+            let client = client.clone();
+            let pages = pages_owned.clone();

-                async move {
-                    let groups = Self::group_pages_owned(&pages, 5);
-                    Self::locate_title_in_groups_static(&client, &title, &groups).await
-                }
-            })
-            .collect();
+            async move {
+                let groups = Self::group_pages_owned(&pages, 5);
+                Self::locate_title_in_groups_static(&client, &title, &groups).await
+            }
+        }).collect();
+
+        let results: Vec<_> = stream::iter(futures)
+            .buffered(5)
+            .collect()
+            .await;

-        let results = join_all(futures).await;
+        info!("Assigned pages for {}/{} entries", results.len(), total);

         // Write results back
         for (entry, result) in entries.iter_mut().zip(results.into_iter()) {
diff --git a/rust/src/index/parse/toc/processor.rs 
b/rust/src/index/parse/toc/processor.rs index 9ed2c95b..8e5f59b0 100644 --- a/rust/src/index/parse/toc/processor.rs +++ b/rust/src/index/parse/toc/processor.rs @@ -7,7 +7,7 @@ //! degradation: if one mode fails verification, it falls back to a lower-quality //! but more reliable mode. -use futures::future::join_all; +use futures::stream::{self, StreamExt}; use tracing::{debug, info, warn}; use crate::error::Result; @@ -505,7 +505,10 @@ impl TocProcessor { }) .collect(); - let extraction_results = join_all(oversized_futures).await; + let extraction_results: Vec<_> = stream::iter(oversized_futures) + .buffer_unordered(3) + .collect() + .await; // Build a lookup from index → refined sub-entries let mut refined_map = std::collections::HashMap::new(); diff --git a/rust/src/index/parse/toc/repairer.rs b/rust/src/index/parse/toc/repairer.rs index 51931674..13c19877 100644 --- a/rust/src/index/parse/toc/repairer.rs +++ b/rust/src/index/parse/toc/repairer.rs @@ -3,7 +3,7 @@ //! Index repairer - fixes incorrect TOC entry page assignments. -use futures::future::join_all; +use futures::stream::{self, StreamExt}; use tracing::{debug, info}; use crate::config::LlmConfig; @@ -63,7 +63,7 @@ impl IndexRepairer { Self::new(RepairerConfig::default()) } - /// Repair incorrect entries concurrently. + /// Repair incorrect entries with bounded concurrency. pub async fn repair( &self, entries: &mut [TocEntry], @@ -107,7 +107,10 @@ impl IndexRepairer { }) .collect(); - let results = join_all(tasks).await; + let results: Vec<_> = stream::iter(tasks) + .buffer_unordered(5) + .collect() + .await; // Apply repairs let mut repaired_count = 0; diff --git a/rust/src/index/parse/toc/structure_extractor.rs b/rust/src/index/parse/toc/structure_extractor.rs index 17511b36..be2486d9 100644 --- a/rust/src/index/parse/toc/structure_extractor.rs +++ b/rust/src/index/parse/toc/structure_extractor.rs @@ -7,6 +7,7 @@ //! module uses LLM to analyse page content and extract the document's //! 
hierarchical structure directly. +use futures::stream::{self, StreamExt}; use tracing::{debug, info, warn}; use crate::config::LlmConfig; @@ -40,6 +41,7 @@ impl Default for StructureExtractorConfig { } /// A group of consecutive pages with their combined text. +#[derive(Clone)] struct PageGroup { /// Combined text with page markers: `\n...\n`. text: String, @@ -77,42 +79,102 @@ impl StructureExtractor { } /// Extract hierarchical structure from all pages. + /// + /// The first page group is processed alone (initial structure), then all + /// remaining groups are processed in parallel, each using the initial + /// entries as context. Results are merged and deduplicated. pub async fn extract(&self, pages: &[PdfPage]) -> Result> { if pages.is_empty() { return Ok(Vec::new()); } let groups = self.group_pages(pages); + let page_count = pages.len(); info!( "Extracting structure from {} pages in {} groups", - pages.len(), + page_count, groups.len() ); - let mut all_entries = Vec::new(); - let page_count = pages.len(); + // Phase 1: Generate initial structure from first group + let initial_entries = self.generate_initial(&groups[0]).await?; + debug!( + "Initial group (pages {}-{}): extracted {} entries", + groups[0].start_page, + groups[0].end_page, + initial_entries.len() + ); - for (i, group) in groups.iter().enumerate() { - let group_entries = if i == 0 { - self.generate_initial(group).await? - } else { - self.generate_continuation(group, &all_entries).await? - }; + if groups.len() == 1 { + return Ok(Self::finalize_entries(initial_entries, page_count)); + } - debug!( - "Group {}/{} (pages {}-{}): extracted {} entries", - i + 1, - groups.len(), - group.start_page, - group.end_page, - group_entries.len() - ); + // Phase 2: Process remaining groups in parallel (bounded concurrency) + // Each continuation group uses the initial entries as shared context. 
+ let client = self.client.clone(); + let initial_entries_ref = &initial_entries; - all_entries.extend(group_entries); + let continuation_futures: Vec<_> = groups[1..] + .iter() + .map(|group| { + let group = group.clone(); + let client = client.clone(); + let initial = initial_entries_ref.to_vec(); + + async move { + let result = Self::generate_continuation_with_client( + &client, &group, &initial, + ) + .await; + (group.start_page, group.end_page, result) + } + }) + .collect(); + + let continuation_results: Vec<_> = stream::iter(continuation_futures) + .buffer_unordered(5) + .collect() + .await; + + // Phase 3: Merge initial + continuation entries + let mut all_entries = initial_entries; + for (start, end, result) in continuation_results { + match result { + Ok(entries) => { + debug!( + "Continuation group (pages {}-{}): extracted {} entries", + start, + end, + entries.len() + ); + all_entries.extend(entries); + } + Err(e) => { + warn!( + "Continuation group (pages {}-{}) failed: {}", + start, end, e + ); + } + } } - // Truncate physical_page values that exceed document length - for entry in &mut all_entries { + // Phase 4: Sort by page number, deduplicate, truncate + all_entries.sort_by(|a, b| { + a.physical_page + .unwrap_or(0) + .cmp(&b.physical_page.unwrap_or(0)) + }); + all_entries.dedup_by(|a, b| { + a.title.trim() == b.title.trim() + && a.physical_page == b.physical_page + }); + + Ok(Self::finalize_entries(all_entries, page_count)) + } + + /// Truncate out-of-range page numbers and log stats. + fn finalize_entries(mut entries: Vec, page_count: usize) -> Vec { + for entry in &mut entries { if let Some(p) = entry.physical_page { if p > page_count { warn!( @@ -123,9 +185,8 @@ impl StructureExtractor { } } } - - info!("Structure extraction complete: {} entries", all_entries.len()); - Ok(all_entries) + info!("Structure extraction complete: {} entries", entries.len()); + entries } /// Group pages by estimated token count. 
@@ -267,6 +328,63 @@ If no new structural elements are found, return: []"#, }) .collect()) } + + /// Static version of continuation generation for parallel use. + /// + /// Uses an owned `LlmClient` reference instead of `&self`. + async fn generate_continuation_with_client( + client: &LlmClient, + group: &PageGroup, + previous: &[TocEntry], + ) -> Result> { + let system = STRUCTURE_EXTRACTION_SYSTEM_PROMPT; + + let prev_summary = previous + .iter() + .rev() + .take(10) + .rev() + .map(|e| { + format!( + " {{\"title\": \"{}\", \"level\": {}, \"physical_page\": {}}}", + e.title, + e.level, + e.physical_page.unwrap_or(0) + ) + }) + .collect::>() + .join(",\n"); + + let user = format!( + r#"Previously extracted structure: +[ +{} +] + +Continue extracting structure from these pages: +{} + +Return ONLY the NEW entries (do not repeat previous ones): +[ + {{"title": "...", "level": N, "physical_page": M}}, + ... +] + +If no new structural elements are found, return: []"#, + prev_summary, group.text + ); + + let sections: Vec = client.complete_json(system, &user).await?; + + Ok(sections + .into_iter() + .map(|s| { + TocEntry::new(s.title, s.level) + .with_physical_page(s.physical_page) + .with_confidence(0.7) + }) + .collect()) + } } /// Format pages into tagged text for LLM consumption. diff --git a/rust/src/index/parse/toc/verifier.rs b/rust/src/index/parse/toc/verifier.rs index 09b28059..fd944386 100644 --- a/rust/src/index/parse/toc/verifier.rs +++ b/rust/src/index/parse/toc/verifier.rs @@ -3,7 +3,7 @@ //! Index verifier - verifies TOC entry page assignments. -use futures::future::join_all; +use futures::stream::{self, StreamExt}; use rand::seq::SliceRandom; use tracing::{debug, info}; @@ -65,7 +65,7 @@ impl IndexVerifier { /// Verify TOC entries against PDF pages. /// - /// All sample entries are verified concurrently via LLM calls. + /// Sample entries are verified via LLM calls with bounded concurrency. 
pub async fn verify( &self, entries: &[TocEntry], @@ -77,36 +77,36 @@ impl IndexVerifier { let sample = self.select_sample(entries); - // Launch all verification checks concurrently + // Launch verification checks with bounded concurrency let client = self.client.clone(); - let futures: Vec<_> = sample - .iter() - .map(|(index, entry)| { - let index = *index; - let title = entry.title.clone(); - let physical_page = entry.physical_page; - let client = client.clone(); - let pages = pages.to_vec(); - - async move { - match physical_page { - Some(page) => { - let result = - Self::verify_entry_with_client(&client, &title, page, &pages).await; - (index, title, page, result) - } - None => ( - index, - title, - 0, - Ok(Err(ErrorType::PageOutOfRange)), - ), + let futures: Vec<_> = sample.iter().map(|(index, entry)| { + let index = *index; + let title = entry.title.clone(); + let physical_page = entry.physical_page; + let client = client.clone(); + let pages = pages.to_vec(); + + async move { + match physical_page { + Some(page) => { + let result = + Self::verify_entry_with_client(&client, &title, page, &pages).await; + (index, title, page, result) } + None => ( + index, + title, + 0, + Ok(Err(ErrorType::PageOutOfRange)), + ), } - }) - .collect(); + } + }).collect(); - let results = join_all(futures).await; + let results: Vec<_> = stream::iter(futures) + .buffer_unordered(5) + .collect() + .await; // Aggregate results let total = results.len(); diff --git a/rust/src/index/stages/enhance.rs b/rust/src/index/stages/enhance.rs index 5550de45..a79b5fb3 100644 --- a/rust/src/index/stages/enhance.rs +++ b/rust/src/index/stages/enhance.rs @@ -313,6 +313,9 @@ impl IndexStage for EnhanceStage { let duration = start.elapsed().as_millis() as u64; ctx.metrics.record_enhance(duration); + if failed > 0 { + ctx.metrics.add_summaries_failed(failed); + } info!( "Generated {} summaries ({} shortcut, {} failed, {} skipped no content, {} skipped tokens) in {}ms", diff --git a/rust/src/lib.rs 
b/rust/src/lib.rs index 92e6a4cc..689d331f 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -54,7 +54,7 @@ pub mod graph; mod index; mod llm; mod memo; -mod metrics; +pub mod metrics; mod retrieval; mod storage; mod throttle; @@ -62,8 +62,9 @@ mod utils; // Client API pub use client::{ - BuildError, ClientError, DocumentFormat, DocumentInfo, Engine, EngineBuilder, IndexContext, - IndexItem, IndexMode, IndexOptions, IndexResult, QueryContext, QueryResult, + BuildError, ClientError, DocumentFormat, DocumentInfo, Engine, EngineBuilder, FailedItem, + IndexContext, IndexItem, IndexMode, IndexOptions, IndexResult, QueryContext, QueryResult, + QueryResultItem, }; // Error types diff --git a/rust/src/metrics/index.rs b/rust/src/metrics/index.rs index 58054661..4432e32f 100644 --- a/rust/src/metrics/index.rs +++ b/rust/src/metrics/index.rs @@ -64,6 +64,10 @@ pub struct IndexMetrics { #[serde(default)] pub summaries_generated: usize, + /// Number of summaries that failed to generate (LLM error, rate limit, etc.). + #[serde(default)] + pub summaries_failed: usize, + /// Number of nodes skipped (thinning). #[serde(default)] pub nodes_skipped: usize, @@ -141,6 +145,11 @@ impl IndexMetrics { self.summaries_generated += 1; } + /// Add to summaries failed count. + pub fn add_summaries_failed(&mut self, count: usize) { + self.summaries_failed += count; + } + /// Increment nodes skipped. pub fn increment_nodes_skipped(&mut self) { self.nodes_skipped += 1;