diff --git a/docs/docs/architecture.mdx b/docs/docs/architecture.mdx new file mode 100644 index 00000000..d98fa7e6 --- /dev/null +++ b/docs/docs/architecture.mdx @@ -0,0 +1,88 @@ +--- +sidebar_position: 3 +--- + +# Architecture + +Vectorless transforms documents into hierarchical semantic trees and uses LLM-powered reasoning to navigate them. This page describes the end-to-end pipeline. + +## High-Level Flow + +```text +┌──────────────┐ ┌──────────────┐ ┌──────────────┐ +│ Document │────▶│ Index │────▶│ Storage │ +│ (PDF/MD) │ │ Pipeline │ │ (Disk) │ +└──────────────┘ └──────────────┘ └──────┬───────┘ + │ + ┌──────────────┐ ┌──────▼───────┐ + │ Result │◀────│ Retrieval │ + │ (Answer) │ │ Pipeline │ + └──────────────┘ └──────────────┘ +``` + +## Index Pipeline + +The indexing pipeline processes documents through ordered stages: + +| Stage | Priority | Description | +|-------|----------|-------------| +| **Parse** | 10 | Parse document into raw nodes (Markdown headings, PDF pages) | +| **Build** | 20 | Construct arena-based tree with thinning and content merge | +| **Validate** | 22 | Tree integrity checks | +| **Split** | 25 | Split oversized leaf nodes (>4000 tokens) | +| **Enhance** | 30 | Generate LLM summaries (Full, Selective, or Lazy strategy) | +| **Enrich** | 40 | Calculate metadata, page ranges, resolve cross-references | +| **Reasoning Index** | 45 | Build keyword-to-node mappings, synonym expansion, summary shortcuts | +| **Optimize** | 60 | Final tree optimization | + +Each stage is independently configurable. The pipeline supports incremental re-indexing via content fingerprinting. + +## Tree Structure + +Each node in the tree contains: + +```text +TreeNode +├── title — Section heading +├── content — Raw text (leaf nodes) +├── summary — LLM-generated summary +├── structure — Hierarchical index (e.g., "1.2.3") +├── depth — Tree depth (root = 0) +├── references[] — Resolved cross-references ("see Section 2.1" → NodeId) +├── token_count — Estimated token count +└── page_range — Start/end page (PDF) +``` + +## Retrieval Pipeline + +The retrieval pipeline consists of four phases: + +1. **Analyze** — Detect query complexity, extract keywords, decompose complex queries +2. **Plan** — Select retrieval strategy and search algorithm +3. **Search** — Execute tree traversal with Pilot guidance +4. **Evaluate** — Score, deduplicate, and aggregate results + +### Pilot + +The Pilot is the core intelligence component. It provides LLM-guided navigation at key decision points: + +- **Fork points** — When multiple children exist, Pilot evaluates which path to follow +- **Backtracking** — When a path yields insufficient results, Pilot suggests alternatives +- **Binary pruning** — Quick relevance filter for nodes with many children + +### Search Algorithms + +| Algorithm | Description | Use Case | +|-----------|-------------|----------| +| **Beam Search** | Explores multiple paths with backtracking | General purpose (recommended) | +| **MCTS** | Monte Carlo Tree Search with UCT selection | Complex multi-hop queries | +| **Pure Pilot** | Greedy single-path, Pilot at every level | High-accuracy, higher token cost | +| **ToC Navigator** | Table-of-contents based location | Broad queries ("what is this about?") | + +## Cross-Document Graph + +When multiple documents are indexed, Vectorless automatically builds a relationship graph based on shared keywords and Jaccard similarity. This graph enables cross-document retrieval with score boosting. 
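+
+The sketch below shows how an edge weight can be derived from two documents' keyword profiles. The 60/40 blend and the thresholds mirror the defaults documented on the Cross-Document Graph feature page; the `edge_weight` helper itself and its normalization of the shared-keyword count are illustrative, not the engine's actual code.
+
+```python
+from typing import Optional
+
+def edge_weight(kw_a: set, kw_b: set,
+                min_shared: int = 3, min_jaccard: float = 0.1) -> Optional[float]:
+    """Edge weight between two documents' keyword sets, or None for no edge."""
+    shared = kw_a & kw_b
+    union = kw_a | kw_b
+    jaccard = len(shared) / len(union) if union else 0.0
+    if len(shared) < min_shared or jaccard < min_jaccard:
+        return None  # below thresholds: the documents stay unconnected
+    # Blend Jaccard similarity (60%) with the shared-keyword count (40%);
+    # the cap of 10 shared keywords is purely illustrative.
+    shared_score = min(len(shared) / 10.0, 1.0)
+    return 0.6 * jaccard + 0.4 * shared_score
+```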
+ +## Zero Infrastructure + +The entire system requires only an LLM API key. No vector database, no embedding models, no additional infrastructure. Trees and metadata are persisted to the local filesystem in the workspace directory. diff --git a/docs/docs/examples/batch-indexing.mdx b/docs/docs/examples/batch-indexing.mdx new file mode 100644 index 00000000..ea7b23db --- /dev/null +++ b/docs/docs/examples/batch-indexing.mdx @@ -0,0 +1,94 @@ +--- +sidebar_position: 3 +--- + +# Batch Indexing + +Index multiple documents efficiently with progress tracking and error handling. + +## Python + +```python +import asyncio +from vectorless import Engine, IndexContext, IndexOptions + +async def main(): + engine = Engine( + workspace="./workspace", + api_key="sk-...", + model="gpt-4o", + ) + + # Index a directory of documents + result = await engine.index( + IndexContext.from_dir("./documents/") + ) + + print(f"Indexed {len(result.items)} documents") + print(f"Failures: {len(result.failed)}") + + for item in result.items: + print(f" ✓ {item.name} ({item.format}) → {item.doc_id}") + if item.metrics: + m = item.metrics + print(f" Nodes: {m.nodes_processed}, " + f"Summaries: {m.summaries_generated}, " + f"Time: {m.total_time_ms}ms") + + for fail in result.failed: + print(f" ✗ {fail.source}: {fail.error}") + + # List all indexed documents + docs = await engine.list() + print(f"\nTotal indexed: {len(docs)} documents") + +asyncio.run(main()) +``` + +## Rust + +```rust +use vectorless::client::{Engine, EngineBuilder, IndexContext}; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + let engine = EngineBuilder::new() + .with_workspace("./workspace") + .with_key("sk-...") + .with_model("gpt-4o") + .build() + .await?; + + // Index a directory + let result = engine.index(IndexContext::from_dir("./documents/")).await?; + + println!("Indexed {} documents", result.items.len()); + println!("Failures: {}", result.failed.len()); + + for item in &result.items { + println!(" ✓ {} ({:?}) → {}", item.name, item.format, item.doc_id); + } + + // List all documents + let docs = engine.list().await?; + println!("Total indexed: {} documents", docs.len()); + + Ok(()) +} +``` + +## Error Handling + +Each item in the result is either successful or failed. Failures don't prevent other documents from being indexed: + +```python +result = await engine.index(IndexContext.from_paths(mixed_paths)) + +# Successful items +for item in result.items: + process(item) + +# Failed items — handle gracefully +for fail in result.failed: + print(f"Failed: {fail.source} — {fail.error}") +``` diff --git a/docs/docs/examples/multi-document.mdx b/docs/docs/examples/multi-document.mdx new file mode 100644 index 00000000..6b6c404b --- /dev/null +++ b/docs/docs/examples/multi-document.mdx @@ -0,0 +1,88 @@ +--- +sidebar_position: 2 +--- + +# Multi-Document Retrieval + +Query across multiple indexed documents using the cross-document strategy with graph-based score boosting. 
+ +## Python + +```python +import asyncio +from vectorless import ( + Engine, IndexContext, QueryContext, + IndexOptions, StrategyPreference +) + +async def main(): + engine = Engine( + workspace="./workspace", + api_key="sk-...", + model="gpt-4o", + ) + + # Index multiple documents + docs = ["./report-q1.pdf", "./report-q2.pdf", "./report-q3.pdf"] + doc_ids = [] + + for path in docs: + result = await engine.index(IndexContext.from_path(path)) + doc_ids.append(result.doc_id) + print(f"Indexed: {path} → {result.doc_id}") + + # Check the cross-document graph + graph = await engine.get_graph() + if graph: + print(f"\nGraph: {graph.node_count()} docs, {graph.edge_count()} edges") + for doc_id in doc_ids: + neighbors = graph.get_neighbors(doc_id) + for edge in neighbors: + print(f" {doc_id[:8]}... → {edge.target_doc_id[:8]}... ({edge.weight:.2f})") + + # Query across all documents + result = await engine.query( + QueryContext("Compare quarterly revenue trends") + .with_doc_ids(doc_ids) + .with_strategy(StrategyPreference.CROSS_DOCUMENT) + ) + + for item in result.items: + print(f"\n[{item.doc_id[:8]}...] Score: {item.score:.2f}") + print(item.content[:300]) + + # Or query entire workspace + result = await engine.query( + QueryContext("What documents discuss risk factors?") + .with_workspace() + ) + + print(f"\nFound in {len(result.items)} document(s)") + + # Cleanup + for doc_id in doc_ids: + await engine.remove(doc_id) + +asyncio.run(main()) +``` + +## Key Concepts + +### Document Graph + +After indexing, documents are connected in a graph based on shared keywords. The graph enables: + +- **Score boosting** — High-confidence results in one document boost neighbor documents +- **Relationship discovery** — Automatically find related documents +- **Cross-referencing** — Results from connected documents are surfaced together + +### Merge Strategies + +The cross-document strategy supports multiple merge modes: + +| Strategy | Description | +|----------|-------------| +| **TopK** | Return top-K results across all documents | +| **BestPerDocument** | Best result from each document | +| **WeightedByRelevance** | Weight by each document's best score | +| **GraphBoosted** | Use graph connections to boost scores | diff --git a/docs/docs/examples/quick-query.mdx b/docs/docs/examples/quick-query.mdx new file mode 100644 index 00000000..fd172a5f --- /dev/null +++ b/docs/docs/examples/quick-query.mdx @@ -0,0 +1,88 @@ +--- +sidebar_position: 1 +--- + +# Quick Query Example + +This example demonstrates the basic index-and-query workflow with both Python and Rust. + +## Python + +```python +import asyncio +from vectorless import Engine, IndexContext, QueryContext, StrategyPreference + +async def main(): + # 1. Create engine + engine = Engine( + workspace="./data", + api_key="sk-...", + model="gpt-4o", + ) + + # 2. Index a document + result = await engine.index(IndexContext.from_path("./report.pdf")) + doc_id = result.doc_id + print(f"Indexed document: {doc_id}") + + # 3. Simple keyword query + answer = await engine.query( + QueryContext("revenue") + .with_doc_id(doc_id) + .with_strategy(StrategyPreference.KEYWORD) + ) + print(f"Keyword result: {answer.single().content[:200]}") + + # 4. Complex reasoning query + answer = await engine.query( + QueryContext("What are the main factors affecting performance?") + .with_doc_id(doc_id) + .with_strategy(StrategyPreference.HYBRID) + ) + print(f"Score: {answer.single().score:.2f}") + print(f"Hybrid result: {answer.single().content[:200]}") + + # 5. 
Cleanup + await engine.remove(doc_id) + +asyncio.run(main()) +``` + +## Rust + +```rust +use vectorless::client::{Engine, EngineBuilder, IndexContext, QueryContext}; +use vectorless::StrategyPreference; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + // 1. Create engine + let engine = EngineBuilder::new() + .with_workspace("./data") + .with_key("sk-...") + .with_model("gpt-4o") + .build() + .await?; + + // 2. Index a document + let result = engine.index(IndexContext::from_path("./report.pdf")).await?; + let doc_id = result.doc_id().unwrap().to_string(); + println!("Indexed document: {}", doc_id); + + // 3. Query with hybrid strategy + let answer = engine.query( + QueryContext::new("What are the main factors affecting performance?") + .with_doc_id(&doc_id) + ).await?; + + if let Some(item) = answer.single() { + println!("Score: {:.2}", item.score); + println!("{}", item.content); + } + + // 4. Cleanup + engine.remove(&doc_id).await?; + + Ok(()) +} +``` diff --git a/docs/docs/features/cross-document-graph.mdx b/docs/docs/features/cross-document-graph.mdx new file mode 100644 index 00000000..e87c7d29 --- /dev/null +++ b/docs/docs/features/cross-document-graph.mdx @@ -0,0 +1,104 @@ +--- +sidebar_position: 3 +--- + +# Cross-Document Graph + +When multiple documents are indexed in a workspace, Vectorless automatically builds a relationship graph connecting documents by shared concepts. + +## Overview + +The cross-document graph represents documents as nodes and their relationships as weighted edges. Edge weights are computed from: + +- **Jaccard similarity** (60%) — Ratio of shared keywords to total unique keywords +- **Shared keyword count** (40%) — Absolute number of overlapping keywords + +```text +Document A ←── 0.72 ──→ Document B + │ │ + └── 0.45 ──→ Document C ←┘ +``` + +## How It Works + +### Graph Building + +After each indexing operation, the graph is automatically rebuilt: + +1. Extract keyword profiles from each document's reasoning index +2. Compute pairwise Jaccard similarity +3. Create edges for document pairs exceeding the similarity threshold +4. Store the graph in the workspace + +### Graph-Aware Retrieval + +When using the cross-document strategy, the graph boosts scores for connected documents: + +1. Search each document independently +2. Identify high-confidence results (score > 0.5) +3. For each high-confidence result, boost neighbor documents' scores +4. Re-rank the merged result set + +```python +from vectorless import Engine, QueryContext, StrategyPreference + +engine = Engine(workspace="./data", api_key="sk-...", model="gpt-4o") + +# Query across all documents with graph boosting +result = await engine.query( + QueryContext("Compare the approaches").with_strategy( + StrategyPreference.CROSS_DOCUMENT + ) +) +``` + +## Accessing the Graph + +### Python + +```python +graph = await engine.get_graph() + +if graph: + print(f"Documents: {graph.node_count()}") + print(f"Relationships: {graph.edge_count()}") + + # Get neighbors for a specific document + neighbors = graph.get_neighbors(doc_id) + for edge in neighbors: + print(f" → {edge.target_doc_id} (weight: {edge.weight:.2f})") +``` + +### Rust + +```rust +if let Some(graph) = engine.get_graph().await? 
{
+    println!("Documents: {}", graph.node_count());
+    println!("Edges: {}", graph.edge_count());
+
+    for edge in graph.get_neighbors(&doc_id) {
+        println!("→ {} (weight: {:.2})", edge.target_doc_id, edge.weight);
+    }
+}
+```
+
+## Graph Node Information
+
+Each document node contains:
+
+- **doc_id** — Document identifier
+- **title** — Root node title
+- **format** — Document format (markdown, pdf)
+- **node_count** — Number of tree nodes
+- **top_keywords** — Top weighted keywords from the reasoning index
+
+## Configuration
+
+The graph is built automatically with default thresholds. Advanced configuration is available in Rust:
+
+```text
+min_shared_keywords: 3   — Minimum shared keywords to create an edge
+min_keyword_jaccard: 0.1 — Minimum Jaccard similarity threshold
+max_keywords_per_doc: 50 — Max keywords extracted per document
+max_edges_per_node: 10   — Max edges per document node
+```
diff --git a/docs/docs/features/pdf-support.mdx b/docs/docs/features/pdf-support.mdx
new file mode 100644
index 00000000..96a683a2
--- /dev/null
+++ b/docs/docs/features/pdf-support.mdx
@@ -0,0 +1,61 @@
+---
+sidebar_position: 4
+---
+
+# PDF Support
+
+Vectorless supports PDF documents with full page-level tracking and hierarchical structure extraction.
+
+## Basic Usage
+
+```python
+from vectorless import Engine, IndexContext, QueryContext
+
+engine = Engine(workspace="./data", api_key="sk-...", model="gpt-4o")
+
+# Index a PDF
+result = await engine.index(IndexContext.from_path("./report.pdf"))
+doc_id = result.doc_id
+
+# Query the PDF
+answer = await engine.query(
+    QueryContext("What is discussed on page 5?").with_doc_id(doc_id)
+)
+print(answer.single().content)
+```
+
+## Page-Level Tracking
+
+Each tree node records the page range it spans:
+
+- **Leaf nodes** — Store the exact page(s) their content comes from
+- **Branch nodes** — Page range is propagated from children (min start, max end)
+
+This enables:
+
+- **Page-scoped queries** — Filter results by page range
+- **Context display** — Show which pages contributed to an answer
+- **Citation** — Reference specific pages in results
+
+## PDF Structure Extraction
+
+The parser extracts structure from PDFs by analyzing:
+
+1. **Font size and weight** — Larger/bold text indicates headings
+2. **Text position** — Top-of-page text often indicates section titles
+3. **Spacing** — Paragraph breaks signal content boundaries
+
+Extracted sections are organized into a hierarchical tree, just like Markdown documents.
+
+## Limitations
+
+- **Scanned PDFs** — OCR is not built-in. Scanned/image-based PDFs require pre-processing
+- **Complex layouts** — Multi-column layouts may not be perfectly structured
+- **Tables** — Table content is extracted as text but loses cell structure
+- **Images** — Image content is not analyzed
+
+## Best Practices
+
+- Use text-based PDFs for best results (not scanned documents)
+- Larger documents (>50 pages) may take longer to index due to LLM summary generation
+- Use incremental indexing when re-indexing updated PDFs to avoid redundant processing
diff --git a/docs/docs/features/summary-strategies.mdx b/docs/docs/features/summary-strategies.mdx
new file mode 100644
index 00000000..a2bde66e
--- /dev/null
+++ b/docs/docs/features/summary-strategies.mdx
@@ -0,0 +1,65 @@
+---
+sidebar_position: 1
+---
+
+# Summary Strategies
+
+Summaries are critical for retrieval quality. The Pilot uses summaries to evaluate candidate nodes during tree navigation.
Without summaries, the Pilot can only use node titles for decision-making, which significantly reduces accuracy. + +## Available Strategies + +### Full (Default) + +Generates summaries for every node in the tree. Branch nodes get navigation-oriented summaries ("what does this section cover"), while leaf nodes get content-oriented summaries ("what does this section say"). + +```rust +use vectorless::index::summary::SummaryStrategy; + +let strategy = SummaryStrategy::full(); +``` + +**Trade-off**: Highest token cost during indexing, but best retrieval quality. Recommended for production use. + +### Selective + +Only generates summaries for branch nodes (non-leaves) that exceed a token threshold. Useful when indexing large document sets on a budget. + +```rust +let strategy = SummaryStrategy::selective(100, true); +// min_tokens ↑ ↑ branch_only +``` + +- `min_tokens` — Minimum content tokens to generate a summary (default: 100) +- `branch_only` — Only generate for non-leaf nodes (default: true) + +**Trade-off**: Lower indexing cost, but leaf nodes lack summaries. The Pilot falls back to title-only evaluation at leaf level. + +### Lazy + +Defers summary generation to query time. Summaries are generated on-demand when a node is first accessed during retrieval. + +```rust +let strategy = SummaryStrategy::lazy(true); +// ↑ persist to disk +``` + +**Trade-off**: Zero indexing cost for summaries, but adds latency to the first query that touches each node. Subsequent queries benefit from cached summaries. + +## Choosing a Strategy + +| Scenario | Recommended Strategy | +|----------|---------------------| +| Production, accuracy matters | **Full** | +| Large document set, budget-constrained | **Selective** (min_tokens=100) | +| One-time queries, minimal indexing time | **Lazy** | +| Batch indexing with later queries | **Full** (index once, query many times) | + +## How Summaries Are Used + +During retrieval, the Pilot builds context for each candidate node: + +1. **Title** — Always available, highest priority signal +2. **Summary** — Used for semantic evaluation at fork points +3. **Content** — Used for BM25 scoring and final result + +When a node has no summary, the Pilot's decision quality degrades. This is why **Full** is the default — it ensures the Pilot always has summaries to work with. diff --git a/docs/docs/features/synonym-expansion.mdx b/docs/docs/features/synonym-expansion.mdx new file mode 100644 index 00000000..527bc70e --- /dev/null +++ b/docs/docs/features/synonym-expansion.mdx @@ -0,0 +1,71 @@ +--- +sidebar_position: 2 +--- + +# Synonym Expansion + +When users query with different wording than the document, keyword-based retrieval can miss relevant content. Synonym expansion addresses this by generating alternative search terms during indexing. + +## The Problem + +A document might use "revenue" throughout, but a user queries for "income" or "earnings". Without synonym expansion, the keyword strategy would miss these connections entirely. + +## How It Works + +During the reasoning index stage, the system: + +1. Selects the top-N keywords by frequency (capped at 20-100 keywords) +2. For each keyword, calls the LLM to generate up to 5 synonyms or related terms +3. 
Adds synonym entries to the topic index with **0.6x weight** (lower than direct keyword matches) + +```text +Keyword: "revenue" (weight: 1.0) + └── Synonym: "income" (weight: 0.6) + └── Synonym: "earnings" (weight: 0.6) + └── Synonym: "turnover" (weight: 0.6) +``` + +The lower weight ensures that direct keyword matches are always preferred over synonym matches, while still surfacing relevant content that would otherwise be missed. + +## Enabling Synonym Expansion + +### Python + +```python +from vectorless import IndexOptions + +# Enabled by default +opts = IndexOptions(enable_synonym_expansion=True) + +# Disable for faster indexing (at the cost of recall) +opts = IndexOptions(enable_synonym_expansion=False) +``` + +### Rust + +The synonym expansion is controlled via `ReasoningIndexConfig`: + +```rust +use vectorless::document::ReasoningIndexConfig; + +let config = ReasoningIndexConfig::default() + .with_synonym_expansion(true); +``` + +## Cost Impact + +Synonym expansion adds LLM calls during indexing (one per top keyword). For a typical document: + +- **Additional LLM calls**: 20-100 (depending on keyword count) +- **Additional tokens**: ~500-2000 (short prompt + response per keyword) +- **Indexing time increase**: 10-30 seconds + +The cost is paid once at index time. At query time, synonyms are already in the topic index, so there is **zero additional cost** during retrieval. + +## When to Disable + +Consider disabling synonym expansion when: + +- Your documents use highly domain-specific terminology with no common synonyms +- You need the fastest possible indexing time +- Your queries always use the same terminology as the documents diff --git a/docs/docs/getting-started.mdx b/docs/docs/getting-started.mdx new file mode 100644 index 00000000..60c27ea0 --- /dev/null +++ b/docs/docs/getting-started.mdx @@ -0,0 +1,103 @@ +--- +sidebar_position: 2 +--- + +# Getting Started + +## Prerequisites + +- Python 3.9+ or Rust 1.75+ +- An LLM API key (OpenAI, or any OpenAI-compatible endpoint) + +## Python SDK + +### Installation + +```bash +pip install vectorless +``` + +### Index and Query + +```python +import asyncio +from vectorless import Engine, IndexContext, QueryContext + +async def main(): + # Create an engine + engine = Engine( + workspace="./data", + api_key="sk-...", + model="gpt-4o", + ) + + # Index a document + result = await engine.index(IndexContext.from_path("./report.pdf")) + doc_id = result.doc_id + print(f"Indexed: {doc_id}") + + # Query the document + answer = await engine.query( + QueryContext("What is the total revenue?").with_doc_id(doc_id) + ) + print(answer.single().content) + +asyncio.run(main()) +``` + +### Using a Custom Endpoint + +```python +engine = Engine( + workspace="./data", + api_key="sk-...", + model="gpt-4o", + endpoint="https://api.your-provider.com/v1", +) +``` + +## Rust Crate + +### Installation + +Add to your `Cargo.toml`: + +```toml +[dependencies] +vectorless = "0.1" +``` + +### Index and Query + +```rust +use vectorless::{EngineBuilder, IndexContext, QueryContext}; + +#[tokio::main] +async fn main() -> vectorless::Result<()> { + let engine = EngineBuilder::new() + .with_workspace("./data") + .with_key("sk-...") + .with_model("gpt-4o") + .build() + .await?; + + let result = engine.index(IndexContext::from_path("./report.pdf")).await?; + let doc_id = result.doc_id().unwrap(); + + let result = engine.query( + QueryContext::new("What is the total revenue?").with_doc_id(doc_id) + ).await?; + + if let Some(item) = result.single() { + println!("{}", 
item.content); + } + + Ok(()) +} +``` + +## Next Steps + +- [Architecture](/docs/architecture) — Understand the indexing and retrieval pipeline +- [Indexing Overview](/docs/indexing/overview) — Learn about each pipeline stage +- [Retrieval Strategies](/docs/retrieval/strategies) — Choose the right strategy for your use case diff --git a/docs/docs/indexing/configuration.mdx b/docs/docs/indexing/configuration.mdx new file mode 100644 index 00000000..907d0b23 --- /dev/null +++ b/docs/docs/indexing/configuration.mdx @@ -0,0 +1,88 @@ +--- +sidebar_position: 2 +--- + +# Configuration + +This page covers the configurable options for the indexing pipeline. + +## IndexOptions (Python) + +```python +from vectorless import IndexOptions + +opts = IndexOptions( + mode="default", # "default", "force", "incremental" + generate_summaries=True, # Generate LLM summaries + generate_description=False, # Generate document description + include_text=True, # Include node text in tree + generate_ids=True, # Generate node IDs + enable_synonym_expansion=True, # Expand keywords with LLM synonyms +) +``` + +### Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `mode` | `str` | `"default"` | Indexing mode: `"default"`, `"force"`, or `"incremental"` | +| `generate_summaries` | `bool` | `True` | Generate LLM summaries for tree nodes | +| `generate_description` | `bool` | `False` | Generate a document-level description | +| `include_text` | `bool` | `True` | Store node text content in the tree | +| `generate_ids` | `bool` | `True` | Generate unique node IDs | +| `enable_synonym_expansion` | `bool` | `True` | Expand indexed keywords with LLM-generated synonyms | + +## PipelineOptions (Rust) + +```rust +use vectorless::index::{PipelineOptions, SummaryStrategy}; + +let options = PipelineOptions::default() + .with_summary_strategy(SummaryStrategy::full()); +``` + +## Summary Strategy + +Three strategies control how summaries are generated: + +| Strategy | Description | Token Cost | +|----------|-------------|------------| +| **Full** | Summarize every node | High | +| **Selective** | Only branch nodes above token threshold | Medium | +| **Lazy** | Generate on-demand at query time | Deferred | + +The default is **Full** since summaries are critical for Pilot navigation quality. + +```rust +use vectorless::index::summary::SummaryStrategy; + +// Full (default) +let strategy = SummaryStrategy::full(); + +// Selective — only branch nodes with >= 100 tokens +let strategy = SummaryStrategy::selective(100, true); + +// Lazy — generate at query time +let strategy = SummaryStrategy::lazy(true); +``` + +## Reasoning Index Config + +The reasoning index is configured via `ReasoningIndexConfig`: + +| Field | Default | Description | +|-------|---------|-------------| +| `enabled` | `true` | Enable reasoning index construction | +| `enable_synonym_expansion` | `true` | Expand keywords with LLM synonyms | +| `max_keyword_entries` | `5000` | Max keyword-to-node mappings | +| `max_topic_entries` | `20` | Max topic entries per keyword | +| `min_keyword_length` | `2` | Minimum keyword length to index | +| `build_summary_shortcut` | `true` | Build summary shortcut for overview queries | + +## Split Config + +| Field | Default | Description | +|-------|---------|-------------| +| `max_tokens_per_node` | `4000` | Token limit before splitting a leaf node | + +Lower values produce finer-grained nodes but increase tree size. 
The default of 4000 tokens balances retrieval precision with tree compactness. diff --git a/docs/docs/indexing/incremental.mdx b/docs/docs/indexing/incremental.mdx new file mode 100644 index 00000000..d50ae3b3 --- /dev/null +++ b/docs/docs/indexing/incremental.mdx @@ -0,0 +1,59 @@ +--- +sidebar_position: 3 +--- + +# Incremental Indexing + +When documents change, re-indexing from scratch can be wasteful. Vectorless supports incremental indexing to avoid redundant LLM calls and processing. + +## Content Fingerprinting + +Every indexed document stores a content fingerprint (hash). When incremental mode is enabled: + +1. Compute the fingerprint of the new document content +2. Compare against the stored fingerprint +3. If identical, skip reprocessing entirely +4. If changed, reprocess only the affected parts + +## Usage + +### Python + +```python +from vectorless import IndexContext, IndexOptions + +# Only re-index if content changed +ctx = IndexContext.from_path("./report.pdf").with_options( + IndexOptions(mode="incremental") +) +result = await engine.index(ctx) +``` + +### Rust + +```rust +use vectorless::client::{IndexContext, IndexMode}; + +let ctx = IndexContext::from_path("./report.pdf") + .with_mode(IndexMode::Incremental); +let result = engine.index(ctx).await?; +``` + +## Indexing Modes + +| Mode | Behavior | +|------|----------| +| `default` | Skip if already indexed (by document ID) | +| `incremental` | Re-index only if content fingerprint changed | +| `force` | Always re-index, overwriting existing data | + +## What Gets Reused + +When a document is incrementally re-indexed: + +- **Summaries** — Reused for nodes whose content hasn't changed +- **Reasoning index** — Keyword mappings and synonym expansions are preserved +- **Cross-references** — Re-resolved against the updated tree +- **Metadata** — Page ranges, token counts recalculated + +This can reduce indexing time by 60-80% for documents with minor edits. diff --git a/docs/docs/indexing/overview.mdx b/docs/docs/indexing/overview.mdx new file mode 100644 index 00000000..f50d01a1 --- /dev/null +++ b/docs/docs/indexing/overview.mdx @@ -0,0 +1,98 @@ +--- +sidebar_position: 1 +--- + +# Indexing Overview + +The indexing pipeline transforms documents into searchable hierarchical trees. This page describes each stage and how they work together. 
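+
+Conceptually, the pipeline is a flat list of stages executed in ascending priority order, each stage consuming the tree produced by the one before it. Here is a minimal sketch of that control flow; the stage names and priorities mirror the sections below, while the `run_pipeline` helper and the `stage_impls` mapping are illustrative, not the crate's actual API.
+
+```python
+# Stages and their priorities, as described in the sections below.
+STAGES = [
+    ("parse", 10), ("build", 20), ("validate", 22), ("split", 25),
+    ("enhance", 30), ("enrich", 40), ("reasoning_index", 45), ("optimize", 60),
+]
+
+def run_pipeline(tree, stage_impls):
+    """Run every stage in priority order; each stage maps tree -> tree."""
+    for name, _priority in sorted(STAGES, key=lambda s: s[1]):
+        tree = stage_impls[name](tree)
+    return tree
+```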
+ +## Pipeline Stages + +### Parse (Priority 10) + +Parses raw documents into a list of `RawNode` structures: + +- **Markdown** — Splits by headings (`#`, `##`, `###`), preserving hierarchy +- **PDF** — Extracts text per page, groups into sections by layout analysis + +### Build (Priority 20) + +Constructs an arena-based `DocumentTree` from raw nodes: + +- Creates parent-child relationships based on heading levels +- Applies **thinning** — merges single-child chains to reduce tree depth +- Merges content from nodes that don't add structural value + +### Validate (Priority 22) + +Checks tree integrity: + +- No orphaned nodes +- Consistent depth values +- Valid parent-child relationships + +### Split (Priority 25) + +Splits oversized leaf nodes that exceed the token threshold (default: 4000 tokens): + +- Finds paragraph or heading boundaries for clean splits +- Preserves semantic coherence within each split + +### Enhance (Priority 30) + +Generates LLM summaries for tree nodes: + +- **Full** — Summarize every node (default) +- **Selective** — Only branch nodes above a token threshold +- **Lazy** — Generate summaries on-demand at query time + +Branch nodes get navigation-oriented summaries ("what does this section cover"), while leaf nodes get content-oriented summaries ("what does this section say"). + +### Enrich (Priority 40) + +Adds metadata to the tree: + +- **Page ranges** — Propagates page boundaries from leaves to parents +- **Token statistics** — Calculates total tokens and node counts +- **Cross-reference resolution** — Parses "see Section 2.1", "Appendix G" references and resolves them to actual `NodeId`s in the tree +- **Document description** — Generates a description from the root summary + +### Reasoning Index (Priority 45) + +Builds a pre-computed index for fast retrieval: + +- **Topic paths** — Maps keywords to nodes with weights (title: 2.0x, summary: 1.5x, content: 1.0x) +- **Synonym expansion** — Expands top keywords with LLM-generated synonyms (0.6x weight) +- **Summary shortcut** — Pre-computed document overview for "what is this about" queries +- **Section map** — Depth-1 section titles for fast ToC lookup + +### Optimize (Priority 60) + +Final tree optimization: + +- Removes redundant metadata +- Compacts tree structure for efficient storage + +## Pipeline Options + +```python +from vectorless import IndexOptions + +# Default options (synonym expansion enabled) +opts = IndexOptions() + +# Force re-indexing +opts = IndexOptions(mode="force") + +# Disable summaries for speed +opts = IndexOptions(generate_summaries=False) +``` + +## Incremental Indexing + +When indexing with `mode="incremental"`, the pipeline: + +1. Computes a content fingerprint (hash) of the input +2. Compares against the previously stored fingerprint +3. Skips reprocessing if the content hasn't changed +4. Reuses existing summaries and reasoning index data for unchanged nodes diff --git a/docs/docs/intro.mdx b/docs/docs/intro.mdx index b82d43a2..2e65e88a 100644 --- a/docs/docs/intro.mdx +++ b/docs/docs/intro.mdx @@ -11,8 +11,8 @@ It transforms documents into hierarchical semantic trees and uses LLMs to naviga ## How It Works 1. **Parse** — Documents (Markdown, PDF) are parsed into hierarchical semantic trees, preserving structure and relationships between sections. -2. **Index** — Trees are stored with metadata, keywords, and optional summaries. Incremental indexing skips unchanged files via content fingerprinting. -3. **Query** — An LLM navigates the tree to find the most relevant sections. 
No embeddings, no similarity search — just structural reasoning. +2. **Index** — Trees are stored with metadata, keywords, and summaries. The pipeline resolves cross-references ("see Section 2.1") and expands keywords with LLM-generated synonyms for improved recall. Incremental indexing skips unchanged files via content fingerprinting. +3. **Query** — An LLM navigates the tree to find the most relevant sections. Multiple search algorithms (Beam Search, MCTS, Greedy) are available, and the Pilot component provides LLM-guided navigation at key decision points. ## Quick Start @@ -77,7 +77,10 @@ async fn main() -> vectorless::Result<()> { - **Hierarchical Semantic Trees** — Preserves document structure, not flat chunks - **LLM-Powered Retrieval** — Structural reasoning over the tree, not vector similarity +- **Cross-Reference Navigation** — Automatically resolves "see Section 2.1", "Appendix G" references and follows them during retrieval +- **Synonym Expansion** — LLM-generated synonyms for indexed keywords improve recall for differently-worded queries +- **Multi-Algorithm Search** — Beam Search, MCTS, Greedy, and ToC Navigator with LLM Pilot guidance +- **Cross-Document Graph** — Automatic relationship discovery between documents via shared keywords - **Incremental Indexing** — Content fingerprinting skips unchanged files -- **Cross-Document Graph** — Automatic relationship discovery between documents - **Multi-Format** — Markdown and PDF support - **Zero Infrastructure** — No vector DB, no embedding models, just an LLM API key diff --git a/docs/docs/retrieval/cross-references.mdx b/docs/docs/retrieval/cross-references.mdx new file mode 100644 index 00000000..6ec4b7d2 --- /dev/null +++ b/docs/docs/retrieval/cross-references.mdx @@ -0,0 +1,71 @@ +--- +sidebar_position: 4 +--- + +# Cross-Reference Navigation + +Documents often contain internal references like "see Section 2.1" or "refer to Appendix G". Vectorless automatically extracts and resolves these references, enabling the retrieval engine to follow them during search. + +## How It Works + +### Extraction + +During the enrich stage, the `ReferenceExtractor` scans node content for reference patterns: + +| Pattern | Example | Reference Type | +|---------|---------|---------------| +| `Section X.Y` | "see Section 2.1" | Section | +| `Chapter X` | "Chapter 3" | Section | +| `Appendix X` | "Appendix G" | Appendix | +| `Table X.Y` | "Table 5.3" | Table | +| `Figure X.Y` | "Figure 2.1" | Figure | +| `Equation X.Y` | "Equation 2.3" | Equation | +| `Page X` | "see page 42" | Page | + +### Resolution + +Extracted references are resolved to actual `NodeId`s in the tree: + +- **Section references** — Matched by structure index (e.g., "2.1" → node with structure "2.1") +- **Appendix references** — Matched by title ("Appendix G") +- **Table/Figure references** — Matched by title substring +- **Page references** — Matched via the page index + +Resolved references are stored on the node with a confidence score. + +### Search Integration + +During retrieval, when the search algorithm expands a node's children, it also includes resolved reference targets: + +```text +Node: "Results Overview" +├── Child: "Performance Metrics" +├── Child: "Comparison" +└── Reference → "Appendix A: Raw Data" ← followed during search +``` + +This means the search engine can jump from a section that references an appendix directly to that appendix, even though it's not a direct child in the tree. 
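+
+A rough sketch of the extraction step, using a subset of the patterns from the table above (the `extract_references` helper and its return shape are illustrative, not the engine's actual API):
+
+```python
+import re
+
+REF_PATTERNS = [
+    (re.compile(r"\bSection\s+(\d+(?:\.\d+)*)", re.IGNORECASE), "section"),
+    (re.compile(r"\bChapter\s+(\d+)", re.IGNORECASE), "section"),
+    (re.compile(r"\bAppendix\s+([A-Z])\b"), "appendix"),
+    (re.compile(r"\bTable\s+(\d+(?:\.\d+)*)", re.IGNORECASE), "table"),
+    (re.compile(r"\bFigure\s+(\d+(?:\.\d+)*)", re.IGNORECASE), "figure"),
+    (re.compile(r"\bpage\s+(\d+)", re.IGNORECASE), "page"),
+]
+
+def extract_references(content):
+    """Yield (ref_type, target, offset) for each reference found in a node."""
+    for pattern, ref_type in REF_PATTERNS:
+        for m in pattern.finditer(content):
+            yield ref_type, m.group(1), m.start()
+
+# list(extract_references("see Section 2.1 and Appendix G"))
+# -> [("section", "2.1", 4), ("appendix", "G", 20)]
+```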
+ +## Supported Reference Types + +- `Section` — Matches section numbers like "2.1", "3.2.1" +- `Appendix` — Matches lettered appendices like "A", "G" +- `Table` — Matches table numbers like "5.3" +- `Figure` — Matches figure numbers like "2.1" +- `Equation` — Matches equation numbers +- `Page` — Matches page numbers + +## Example + +Given this document structure: + +```text +1. Introduction +2. Results + 2.1. Performance + 2.2. Analysis (content: "see Appendix A for raw data") +3. Conclusion +Appendix A: Raw Data +``` + +When the search reaches "2.2 Analysis", it will also have "Appendix A" as a candidate thanks to the resolved cross-reference. If the query asks about "raw data", the search can jump directly to the appendix. diff --git a/docs/docs/retrieval/overview.mdx b/docs/docs/retrieval/overview.mdx new file mode 100644 index 00000000..7e381604 --- /dev/null +++ b/docs/docs/retrieval/overview.mdx @@ -0,0 +1,51 @@ +--- +sidebar_position: 1 +--- + +# Retrieval Overview + +The retrieval pipeline transforms a user query into relevant document content by navigating the hierarchical tree structure with LLM-guided reasoning. + +## Pipeline Phases + +```text +Query ──▶ Analyze ──▶ Plan ──▶ Search ──▶ Evaluate ──▶ Result + │ │ │ │ + ▼ ▼ ▼ ▼ + Keywords Strategy Algorithm Score & + Complexity Selection Execution Dedup +``` + +### Analyze + +- Extract keywords from the query +- Detect query complexity (simple keyword match vs. multi-hop reasoning) +- Decompose complex queries into sub-queries when needed + +### Plan + +- Select the retrieval strategy (Keyword, LLM, Hybrid, Cross-Document) +- Select the search algorithm (Beam Search, MCTS, Pure Pilot) +- Configure beam width, depth limits, and iteration budgets + +### Search + +- Use the ToC Navigator to locate relevant subtrees +- Execute the selected search algorithm from located subtrees +- The Pilot provides LLM guidance at key decision points + +### Evaluate + +- Score and rank candidate nodes +- Deduplicate overlapping results +- Aggregate content within the token budget + +## Quick Selection Guide + +| Use Case | Strategy | Algorithm | +|----------|----------|-----------| +| Simple keyword lookup | Keyword | Beam Search | +| Complex question requiring reasoning | Hybrid | Beam Search | +| Multi-hop reasoning across sections | LLM | MCTS | +| Multiple documents | Cross-Document | Beam Search | +| Fast overview of document | Keyword | ToC Navigator | diff --git a/docs/docs/retrieval/search-algorithms.mdx b/docs/docs/retrieval/search-algorithms.mdx new file mode 100644 index 00000000..370c762c --- /dev/null +++ b/docs/docs/retrieval/search-algorithms.mdx @@ -0,0 +1,79 @@ +--- +sidebar_position: 3 +--- + +# Search Algorithms + +Search algorithms determine how the tree is traversed during retrieval. Each algorithm has different trade-offs between accuracy, speed, and token cost. + +## Algorithm Overview + +| Algorithm | Paths Explored | Backtracking | Token Cost | Use Case | +|-----------|---------------|-------------|------------|----------| +| **Beam Search** | Top-K (beam width) | Yes | Medium | General purpose | +| **MCTS** | Statistical sampling | Yes | High | Complex multi-hop | +| **Pure Pilot** | Single best path | No | High | High-accuracy single-path | +| **ToC Navigator** | ToC-guided | No | Low | Broad overview queries | + +## Beam Search + +Explores multiple paths simultaneously, keeping the top-K candidates at each level. Supports backtracking when paths yield low scores. 
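+
+A minimal sketch of the traversal loop, assuming `tree.children` and a `query_score` helper (the real engine blends Pilot guidance with NodeScorer at a 0.7/0.3 weighting and manages the fallback stack more carefully):
+
+```python
+def beam_search(tree, query_score, root, beam_width=3, max_depth=10):
+    """Keep the top-K scoring paths at each level of the tree."""
+    beam = [(query_score(root), [root])]
+    fallback = []  # truncated paths, kept for backtracking
+    for _ in range(max_depth):
+        candidates = []
+        for _score, path in beam:
+            for child in tree.children(path[-1]):
+                candidates.append((query_score(child), path + [child]))
+        if not candidates:
+            break
+        candidates.sort(key=lambda c: c[0], reverse=True)
+        beam = candidates[:beam_width]
+        fallback.extend(candidates[beam_width:])  # revisited if the beam stalls
+    return beam, fallback
+```
+
+In the diagram below, A and B stay in the beam while C is pruned and pushed onto the fallback stack: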
+ +```text +Root +├── A (score: 0.9) ──▶ explore children +├── B (score: 0.7) ──▶ explore children +└── C (score: 0.3) ──▶ discard (below beam width) +``` + +- **Beam width** controls how many paths are kept (default: 3) +- **Fallback stack** preserves truncated paths for backtracking +- Pilot weight: 0.7 (blended with NodeScorer at 0.3) + +This is the **recommended algorithm** for most use cases. + +## MCTS (Monte Carlo Tree Search) + +Uses Upper Confidence Bound for Trees (UCT) to balance exploration vs. exploitation. Runs multiple simulations to statistically identify the best path. + +```text + Root (100 visits) + / \ + A (60v) B (40v) + / \ \ + C (35v) D (25v) E (40v) +``` + +- Best for complex queries requiring multi-hop reasoning +- Pilot provides priors for UCT selection +- Higher iteration count improves accuracy + +## Pure Pilot + +Greedy single-path search where the Pilot picks the best child at each level. The most token-expensive approach since it makes an LLM call at every tree level. + +- Pilot weight: 1.0 (no algorithm fallback) +- Best for queries where the correct path is unambiguous +- Fast for shallow trees, expensive for deep ones + +## ToC Navigator + +Uses the document's table of contents to locate relevant top-level sections before running a search algorithm. Works in two modes: + +- **Keyword mode** — Matches query keywords against ToC entries +- **LLM mode** — Sends the ToC to the LLM for semantic matching + +The ToC Navigator is typically the first step in the search pipeline, narrowing the search space before running Beam Search or MCTS within the located subtree. + +## Configuration + +```python +from vectorless import QueryContext + +ctx = ( + QueryContext("complex multi-hop question") + .with_doc_id(doc_id) + .with_depth_limit(10) # Max tree traversal depth + .with_max_tokens(4000) # Max tokens in result +) +``` diff --git a/docs/docs/retrieval/strategies.mdx b/docs/docs/retrieval/strategies.mdx new file mode 100644 index 00000000..e60cd3e1 --- /dev/null +++ b/docs/docs/retrieval/strategies.mdx @@ -0,0 +1,81 @@ +--- +sidebar_position: 2 +--- + +# Retrieval Strategies + +Vectorless provides five retrieval strategies, each designed for different query types and accuracy/speed trade-offs. + +## Strategy Overview + +| Strategy | LLM Calls | Speed | Accuracy | Best For | +|----------|-----------|-------|----------|----------| +| **Keyword** | 0 (search) | Fastest | Good | Simple keyword matches | +| **LLM** | High | Slowest | Best | Complex reasoning queries | +| **Hybrid** | Medium | Medium | High | General purpose (recommended) | +| **Cross-Document** | Varies | Medium | High | Multi-document queries | +| **Page Range** | 0 (search) | Fast | Good | PDF page-scoped queries | + +## Keyword Strategy + +Fast TF-IDF/BM25 matching against the pre-computed reasoning index. No LLM calls during search. + +```python +from vectorless import QueryContext, StrategyPreference + +ctx = QueryContext("revenue").with_doc_id(doc_id).with_strategy( + StrategyPreference.KEYWORD +) +``` + +Use when: +- The query contains exact terms from the document +- Speed is the priority +- You want zero additional LLM token cost + +## LLM Strategy + +LLM-powered tree navigation with full contextual understanding. The LLM sees the table of contents, node summaries, and makes navigation decisions at each level. 
+ +```python +ctx = QueryContext("Explain the relationship between architecture and performance").with_doc_id(doc_id).with_strategy( + StrategyPreference.LLM +) +``` + +Use when: +- The query requires multi-hop reasoning +- Synonyms or paraphrases are likely +- Accuracy is more important than speed + +## Hybrid Strategy (Recommended) + +Two-phase retrieval: BM25 pre-filter followed by LLM refinement. Combines the speed of keyword matching with the accuracy of LLM reasoning. + +```python +ctx = QueryContext("What are the growth trends?").with_doc_id(doc_id).with_strategy( + StrategyPreference.HYBRID +) +``` + +The recommended default for most queries. Fast pre-filtering reduces the number of nodes sent to the LLM, keeping token costs manageable while maintaining high accuracy. + +## Cross-Document Strategy + +Searches across multiple indexed documents and aggregates results. Uses the cross-document relationship graph for score boosting. + +```python +ctx = QueryContext("Compare the architectures").with_strategy( + StrategyPreference.CROSS_DOCUMENT +) +``` + +When a high-confidence result is found in one document, neighbor documents in the graph receive a score boost, surfacing related content across the workspace. + +## Auto Selection + +When using `StrategyPreference.AUTO` (default), the engine analyzes query complexity and selects the appropriate strategy: + +- Simple keyword queries → Keyword strategy +- Complex reasoning queries → Hybrid strategy +- Multi-document scope → Cross-Document strategy diff --git a/docs/docs/sdk/python.mdx b/docs/docs/sdk/python.mdx new file mode 100644 index 00000000..a4306d68 --- /dev/null +++ b/docs/docs/sdk/python.mdx @@ -0,0 +1,175 @@ +--- +sidebar_position: 1 +--- + +# Python SDK + +The Python SDK provides an async API built on PyO3 for high-performance integration with Python applications. + +## Installation + +```bash +pip install vectorless +``` + +## Engine + +The `Engine` is the main entry point. It requires an LLM API key and model name. 
+ +```python +from vectorless import Engine + +engine = Engine( + workspace="./data", # Local directory for indexed data + api_key="sk-...", # LLM API key + model="gpt-4o", # LLM model name + endpoint=None, # Optional: custom API endpoint +) +``` + +## Indexing + +### From a File + +```python +from vectorless import IndexContext, IndexOptions + +result = await engine.index(IndexContext.from_path("./report.pdf")) +print(result.doc_id) # Document ID for querying +``` + +### From Multiple Files + +```python +result = await engine.index(IndexContext.from_paths(["./a.pdf", "./b.md"])) +``` + +### From a Directory + +```python +result = await engine.index(IndexContext.from_dir("./documents/")) +``` + +### From Text Content + +```python +result = await engine.index( + IndexContext.from_content("# Title\n\nContent...", "markdown").with_name("my-doc") +) +``` + +### With Options + +```python +result = await engine.index( + IndexContext.from_path("./report.pdf").with_options( + IndexOptions( + mode="force", + generate_summaries=True, + enable_synonym_expansion=True, + ) + ) +) +``` + +## Querying + +### Single Document + +```python +from vectorless import QueryContext, StrategyPreference + +answer = await engine.query( + QueryContext("What is the total revenue?") + .with_doc_id(doc_id) + .with_strategy(StrategyPreference.HYBRID) +) + +if answer.single(): + print(answer.single().content) + print(f"Score: {answer.single().score}") +``` + +### Multiple Documents + +```python +answer = await engine.query( + QueryContext("Compare the approaches") + .with_doc_ids(["doc-1", "doc-2"]) +) +``` + +### Workspace Query + +```python +answer = await engine.query( + QueryContext("What documents discuss performance?") + .with_workspace() +) +``` + +### Query Options + +```python +answer = await engine.query( + QueryContext("Explain the architecture") + .with_doc_id(doc_id) + .with_max_tokens(4000) # Max tokens in result + .with_include_reasoning(True) # Include reasoning chain + .with_depth_limit(10) # Max traversal depth + .with_strategy(StrategyPreference.LLM) +) +``` + +## Document Management + +```python +# List all indexed documents +docs = await engine.list() +for doc in docs: + print(f"{doc.id}: {doc.name} ({doc.format})") + +# Check if a document exists +exists = await engine.exists(doc_id) + +# Remove a document +removed = await engine.remove(doc_id) + +# Remove all documents +count = await engine.clear() +``` + +## Document Graph + +```python +graph = await engine.get_graph() +if graph: + print(f"Nodes: {graph.node_count()}, Edges: {graph.edge_count()}") + neighbors = graph.get_neighbors(doc_id) + for edge in neighbors: + print(f" → {edge.target_doc_id} (weight: {edge.weight:.2f})") +``` + +## API Reference + +### IndexOptions + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `mode` | `str` | `"default"` | Indexing mode | +| `generate_summaries` | `bool` | `True` | Generate LLM summaries | +| `generate_description` | `bool` | `False` | Generate document description | +| `include_text` | `bool` | `True` | Include node text | +| `generate_ids` | `bool` | `True` | Generate node IDs | +| `enable_synonym_expansion` | `bool` | `True` | LLM synonym expansion | + +### StrategyPreference + +| Constant | Description | +|----------|-------------| +| `StrategyPreference.AUTO` | Auto-select based on query complexity | +| `StrategyPreference.KEYWORD` | Fast keyword matching | +| `StrategyPreference.LLM` | LLM-guided navigation | +| `StrategyPreference.HYBRID` | BM25 + LLM 
refinement | +| `StrategyPreference.CROSS_DOCUMENT` | Multi-document retrieval | +| `StrategyPreference.PAGE_RANGE` | Page-scoped retrieval | diff --git a/docs/docs/sdk/rust.mdx b/docs/docs/sdk/rust.mdx new file mode 100644 index 00000000..768368b1 --- /dev/null +++ b/docs/docs/sdk/rust.mdx @@ -0,0 +1,127 @@ +--- +sidebar_position: 2 +--- + +# Rust Crate + +The Rust crate provides the core engine with full control over the indexing and retrieval pipeline. + +## Installation + +```toml +[dependencies] +vectorless = "0.1" +``` + +## Engine + +```rust +use vectorless::client::{Engine, EngineBuilder}; + +let engine = EngineBuilder::new() + .with_workspace("./data") + .with_key("sk-...") + .with_model("gpt-4o") + .with_endpoint("https://api.openai.com/v1") // optional + .build() + .await?; +``` + +## Indexing + +```rust +use vectorless::client::{IndexContext, IndexOptions, IndexMode}; + +// From a file +let result = engine.index(IndexContext::from_path("./report.pdf")).await?; +let doc_id = result.doc_id().unwrap(); + +// From content +let result = engine.index( + IndexContext::from_content("# Title\n\nContent...", DocumentFormat::Markdown) +).await?; + +// With options +let opts = IndexOptions::new() + .with_mode(IndexMode::Force); +let result = engine.index( + IndexContext::from_path("./report.pdf").with_options(opts) +).await?; +``` + +## Querying + +```rust +use vectorless::client::QueryContext; +use vectorless::StrategyPreference; + +let result = engine.query( + QueryContext::new("What is the total revenue?") + .with_doc_id(doc_id) + .with_strategy(StrategyPreference::ForceHybrid) + .with_max_tokens(4000) + .with_include_reasoning(true) +).await?; + +if let Some(item) = result.single() { + println!("Score: {:.2}", item.score); + println!("Content: {}", item.content); +} +``` + +## Document Management + +```rust +// List documents +for doc in engine.list().await? { + println!("{}: {} ({})", doc.id, doc.name, doc.format); +} + +// Check existence +let exists = engine.exists(&doc_id).await?; + +// Remove +let removed = engine.remove(&doc_id).await?; + +// Clear all +let count = engine.clear().await?; +``` + +## Document Graph + +```rust +if let Some(graph) = engine.get_graph().await? { + println!("Nodes: {}", graph.node_count()); + println!("Edges: {}", graph.edge_count()); + + for edge in graph.get_neighbors(&doc_id) { + println!("→ {} (weight: {:.2})", edge.target_doc_id, edge.weight); + } +} +``` + +## Advanced Configuration + +### Summary Strategy + +```rust +use vectorless::index::summary::SummaryStrategy; + +// Full summaries (default) +let strategy = SummaryStrategy::full(); + +// Selective — only branch nodes with >= 100 tokens +let strategy = SummaryStrategy::selective(100, true); + +// Lazy — generate at query time +let strategy = SummaryStrategy::lazy(true); +``` + +### Pipeline Options + +```rust +use vectorless::index::PipelineOptions; + +let options = PipelineOptions::default(); +// Pass to indexer via the engine builder or context +``` diff --git a/docs/sidebars.ts b/docs/sidebars.ts index 28971397..e3ddf067 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -1,33 +1,57 @@ import type {SidebarsConfig} from '@docusaurus/plugin-content-docs'; -// This runs in Node.js - Don't use client-side code here (browser APIs, JSX...) - -/** - * Creating a sidebar enables you to: - - create an ordered group of docs - - render a sidebar for each doc of that group - - provide next/previous navigation - - The sidebars can be generated from the filesystem, or explicitly defined here. 
-
- Create as many sidebars as you want.
- */
 const sidebars: SidebarsConfig = {
-  // By default, Docusaurus generates a sidebar from the docs folder structure
-  tutorialSidebar: [{type: 'autogenerated', dirName: '.'}],
-
-  // But you can create a sidebar manually
-  /*
   tutorialSidebar: [
     'intro',
-    'hello',
+    'getting-started',
+    'architecture',
+    {
+      type: 'category',
+      label: 'Indexing',
+      items: [
+        'indexing/overview',
+        'indexing/configuration',
+        'indexing/incremental',
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Retrieval',
+      items: [
+        'retrieval/overview',
+        'retrieval/strategies',
+        'retrieval/search-algorithms',
+        'retrieval/cross-references',
+      ],
+    },
+    {
+      type: 'category',
+      label: 'Features',
+      items: [
+        'features/summary-strategies',
+        'features/synonym-expansion',
+        'features/cross-document-graph',
+        'features/pdf-support',
+      ],
+    },
+    {
+      type: 'category',
+      label: 'SDK',
+      items: [
+        'sdk/python',
+        'sdk/rust',
+      ],
+    },
     {
       type: 'category',
-      label: 'Tutorial',
-      items: ['tutorial-basics/create-a-document'],
+      label: 'Examples',
+      items: [
+        'examples/quick-query',
+        'examples/multi-document',
+        'examples/batch-indexing',
+      ],
     },
   ],
-  */
 };

 export default sidebars;
diff --git a/python/src/lib.rs b/python/src/lib.rs
index 4a743842..45ac87fb 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -107,6 +107,9 @@ fn parse_format(format: &str) -> PyResult {
 /// generate_description: Whether to generate document description. Default: False.
 /// include_text: Whether to include node text in the tree. Default: True.
 /// generate_ids: Whether to generate node IDs. Default: True.
+/// enable_synonym_expansion: Whether to expand keywords with LLM-generated
+/// synonyms during indexing. Improves recall for differently-worded queries.
+/// Default: True.
 #[pyclass(name = "IndexOptions", skip_from_py_object)]
 #[derive(Clone)]
 pub struct PyIndexOptions {
@@ -116,13 +119,14 @@ pub struct PyIndexOptions {
 #[pymethods]
 impl PyIndexOptions {
     #[new]
-    #[pyo3(signature = (mode="default", generate_summaries=true, generate_description=false, include_text=true, generate_ids=true))]
+    #[pyo3(signature = (mode="default", generate_summaries=true, generate_description=false, include_text=true, generate_ids=true, enable_synonym_expansion=true))]
     fn new(
         mode: &str,
         generate_summaries: bool,
         generate_description: bool,
         include_text: bool,
         generate_ids: bool,
+        enable_synonym_expansion: bool,
     ) -> PyResult {
         let mut opts = IndexOptions::new();
         match mode {
@@ -140,12 +144,13 @@ impl PyIndexOptions {
         opts.generate_description = generate_description;
         opts.include_text = include_text;
         opts.generate_ids = generate_ids;
+        opts.enable_synonym_expansion = enable_synonym_expansion;
         Ok(Self { inner: opts })
     }

     fn __repr__(&self) -> String {
         format!(
-            "IndexOptions(mode='{}', generate_summaries={}, generate_description={}, include_text={}, generate_ids={})",
+            "IndexOptions(mode='{}', generate_summaries={}, generate_description={}, include_text={}, generate_ids={}, enable_synonym_expansion={})",
             match self.inner.mode {
                 IndexMode::Default => "default",
                 IndexMode::Force => "force",
@@ -155,6 +160,7 @@ impl PyIndexOptions {
             self.inner.generate_description,
             self.inner.include_text,
             self.inner.generate_ids,
+            self.inner.enable_synonym_expansion,
         )
     }
 }
diff --git a/rust/src/client/indexer.rs b/rust/src/client/indexer.rs
index 2764aaa7..490e8a78 100644
--- a/rust/src/client/indexer.rs
+++ b/rust/src/client/indexer.rs
@@ -28,7 +28,7 @@ use uuid::Uuid;
 use crate::error::{Error, Result};
 use crate::index::parse::DocumentFormat;
-use crate::index::{IndexInput, IndexMode, PipelineExecutor, PipelineOptions, SummaryStrategy};
+use crate::index::{IndexInput, IndexMode, PipelineExecutor, PipelineOptions, ReasoningIndexConfig, SummaryStrategy};
 use crate::llm::LlmClient;
 use crate::storage::{DocumentMeta, PersistedDocument};
@@ -285,6 +285,10 @@ impl IndexerClient {
                 SummaryStrategy::none()
             },
             generate_description: options.generate_description,
+            reasoning_index: ReasoningIndexConfig {
+                enable_synonym_expansion: options.enable_synonym_expansion,
+                ..ReasoningIndexConfig::default()
+            },
             existing_tree,
             ..Default::default()
         }
diff --git a/rust/src/client/types.rs b/rust/src/client/types.rs
index 079e5cfe..4ab82590 100644
--- a/rust/src/client/types.rs
+++ b/rust/src/client/types.rs
@@ -206,6 +206,11 @@ pub struct IndexOptions {
     /// Whether to generate document description.
     pub generate_description: bool,
+
+    /// Whether to expand keywords with LLM-generated synonyms
+    /// during reasoning index construction. Improves recall for
+    /// queries that use different wording than the document.
+    pub enable_synonym_expansion: bool,
 }

 impl Default for IndexOptions {
@@ -216,6 +221,7 @@ impl Default for IndexOptions {
             include_text: true,
             generate_ids: true,
             generate_description: false,
+            enable_synonym_expansion: true,
         }
     }
 }
diff --git a/rust/src/document/reasoning.rs b/rust/src/document/reasoning.rs
index 6146763e..f673ce58 100644
--- a/rust/src/document/reasoning.rs
+++ b/rust/src/document/reasoning.rs
@@ -265,6 +265,11 @@ pub struct ReasoningIndexConfig {
     pub min_keyword_length: usize,
     /// Whether to build the summary shortcut.
     pub build_summary_shortcut: bool,
+    /// Whether to expand keywords with LLM-generated synonyms.
+ /// When enabled, the indexing stage calls the LLM to generate + /// synonym terms for each keyword, improving recall for queries + /// that use different wording than the document. + pub enable_synonym_expansion: bool, } impl Default for ReasoningIndexConfig { @@ -276,6 +281,7 @@ max_keyword_entries: 5000, min_keyword_length: 2, build_summary_shortcut: true, + enable_synonym_expansion: true, } } } @@ -305,6 +311,12 @@ impl ReasoningIndexConfig { self.build_summary_shortcut = build; self } + + /// Enable or disable synonym expansion. + pub fn with_synonym_expansion(mut self, enable: bool) -> Self { + self.enable_synonym_expansion = enable; + self + } } #[cfg(test)] diff --git a/rust/src/document/tree.rs b/rust/src/document/tree.rs index 24dacb26..e0ca6a59 100644 --- a/rust/src/document/tree.rs +++ b/rust/src/document/tree.rs @@ -359,6 +359,26 @@ impl DocumentTree { self.children_iter(id).collect() } + /// Get the children of a node plus any resolved cross-reference targets. + /// + /// In addition to direct children, this collects `NodeId`s pointed to by + /// resolved references (`node.references[i].target_node`) on the given node. + /// Duplicate node IDs (e.g. a reference that happens to be a child) are + /// de-duplicated so the caller never sees the same node twice. + pub fn children_with_refs(&self, id: NodeId) -> Vec<NodeId> { + let mut result: Vec<NodeId> = self.children_iter(id).collect(); + if let Some(node) = self.get(id) { + for r#ref in &node.references { + if let Some(target) = r#ref.target_node { + if !result.contains(&target) { + result.push(target); + } + } + } + } + result + } + /// Get the parent of a node. /// /// Returns None if the node is the root or doesn't have a parent. @@ -602,6 +622,13 @@ impl DocumentTree { } } + /// Set the references for a node. + pub fn set_references(&mut self, id: NodeId, references: Vec<NodeReference>) { + if let Some(node) = self.get_mut(id) { + node.references = references; + } + } + /// Export the tree structure to JSON format.
pub fn to_structure_json(&self, doc_name: &str) -> DocumentStructure { let structure = self.build_structure_nodes(self.root_id); @@ -775,3 +802,88 @@ impl Default for DocumentTree { Self::new("Root", "") } } + +#[cfg(test)] +mod tests { + use super::*; + use crate::document::reference::{NodeReference, RefType}; + + #[test] + fn test_children_with_refs_no_references() { + let mut tree = DocumentTree::new("Root", "root content"); + let child1 = tree.add_child(tree.root(), "Section 1", "content 1"); + let child2 = tree.add_child(tree.root(), "Section 2", "content 2"); + + let children = tree.children_with_refs(tree.root()); + assert_eq!(children.len(), 2); + assert!(children.contains(&child1)); + assert!(children.contains(&child2)); + } + + #[test] + fn test_children_with_refs_includes_resolved_references() { + let mut tree = DocumentTree::new("Root", "root content"); + let section1 = tree.add_child(tree.root(), "Section 1", "content 1"); + let section2 = tree.add_child(tree.root(), "Section 2", "content 2"); + let appendix = tree.add_child(tree.root(), "Appendix A", "appendix content"); + + // Add a resolved reference from Section 1 to Appendix A + let refs = vec![NodeReference::resolved( + "see Appendix A".to_string(), + "A".to_string(), + RefType::Appendix, + 10, + appendix, + 0.9, + )]; + tree.set_references(section1, refs); + + // section1's children_with_refs should include appendix as a reference target + let children = tree.children_with_refs(section1); + // section1 has no direct children, but has a resolved reference to appendix + assert_eq!(children.len(), 1); + assert!(children.contains(&appendix)); + } + + #[test] + fn test_children_with_refs_deduplicates() { + let mut tree = DocumentTree::new("Root", "root content"); + let child = tree.add_child(tree.root(), "Section 1", "content 1"); + + // Add a reference that points to the same node as an existing child + let refs = vec![NodeReference::resolved( + "see Section 1".to_string(), + "1".to_string(), + RefType::Section, + 5, + child, + 0.8, + )]; + tree.set_references(tree.root(), refs); + + let children = tree.children_with_refs(tree.root()); + // Should not duplicate + assert_eq!(children.len(), 1); + assert!(children.contains(&child)); + } + + #[test] + fn test_children_with_refs_unresolved_ignored() { + let mut tree = DocumentTree::new("Root", "root content"); + let child = tree.add_child(tree.root(), "Section 1", "content 1"); + + // Add an unresolved reference (target_node = None) + let refs = vec![NodeReference::new( + "see Section 5".to_string(), + "5".to_string(), + RefType::Section, + 5, + )]; + tree.set_references(tree.root(), refs); + + let children = tree.children_with_refs(tree.root()); + // Unresolved reference should not be included + assert_eq!(children.len(), 1); + assert!(children.contains(&child)); + } +} diff --git a/rust/src/index/config.rs b/rust/src/index/config.rs index d43d7900..edb20c2e 100644 --- a/rust/src/index/config.rs +++ b/rust/src/index/config.rs @@ -158,7 +158,7 @@ impl Default for SplitConfig { fn default() -> Self { Self { enabled: true, - max_tokens_per_node: 8000, + max_tokens_per_node: 4000, pattern_split: true, } } diff --git a/rust/src/index/mod.rs b/rust/src/index/mod.rs index 269ad362..512f93c2 100644 --- a/rust/src/index/mod.rs +++ b/rust/src/index/mod.rs @@ -64,6 +64,7 @@ pub use pipeline::{IndexInput, IndexMetrics, PipelineExecutor, PipelineResult}; // Re-export config types pub use config::{IndexMode, PipelineOptions, ThinningConfig}; +pub use crate::document::ReasoningIndexConfig; 
// Re-export summary pub use summary::SummaryStrategy; diff --git a/rust/src/index/stages/enrich.rs b/rust/src/index/stages/enrich.rs index ff758ddd..29fe55ab 100644 --- a/rust/src/index/stages/enrich.rs +++ b/rust/src/index/stages/enrich.rs @@ -7,7 +7,7 @@ use super::async_trait; use std::time::Instant; use tracing::info; -use crate::document::{DocumentTree, NodeId, TocView}; +use crate::document::{DocumentTree, NodeId, RefType, ReferenceExtractor, TocView}; use crate::error::Result; use super::{AccessPattern, IndexStage, StageResult}; @@ -93,6 +93,46 @@ impl EnrichStage { } } } + + /// Extract and resolve in-document cross-references for all nodes. + /// + /// Parses content for patterns like "see Section 2.1", "Appendix G", etc. + /// and resolves them to actual `NodeId`s in the tree using the retrieval + /// index for fast lookup. + fn resolve_references(tree: &mut DocumentTree) -> usize { + let retrieval_index = tree.build_retrieval_index(); + let node_ids: Vec<NodeId> = tree.traverse().into_iter().collect(); + let mut total_resolved = 0; + + for node_id in node_ids { + let content = tree.get(node_id).map(|n| n.content.clone()).unwrap_or_default(); + if content.is_empty() { + continue; + } + + // Quick check: skip nodes without any reference-like patterns + let content_lower = content.to_lowercase(); + let has_ref_pattern = content_lower.contains("section") + || content_lower.contains("appendix") + || content_lower.contains("table") + || content_lower.contains("figure") + || content_lower.contains("page") + || content_lower.contains("equation"); + + if !has_ref_pattern { + continue; + } + + let refs = ReferenceExtractor::extract_and_resolve(&content, tree, &retrieval_index); + let resolved = refs.iter().filter(|r| r.is_resolved()).count(); + if resolved > 0 { + total_resolved += resolved; + } + tree.set_references(node_id, refs); + } + + total_resolved + } } impl Default for EnrichStage { @@ -142,7 +182,13 @@ impl IndexStage for EnrichStage { let (total_tokens, node_count) = Self::calculate_token_stats(tree); info!("Total tokens: {}, nodes: {}", total_tokens, node_count); - // 4. Generate document description + // 4. Extract and resolve cross-references + let resolved_refs = Self::resolve_references(tree); + if resolved_refs > 0 { + info!("Resolved {} cross-references", resolved_refs); + } + + // 5.
Generate document description self.generate_description(ctx); let duration = start.elapsed().as_millis() as u64; @@ -158,7 +204,42 @@ stage_result .metadata .insert("node_count".to_string(), serde_json::json!(node_count)); + stage_result + .metadata + .insert("resolved_references".to_string(), serde_json::json!(resolved_refs)); Ok(stage_result) } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_resolve_references_section_ref() { + let mut tree = DocumentTree::new("Root", "root content"); + let s1 = tree.add_child(tree.root(), "Introduction", "Introduction text."); + tree.set_structure(s1, "1"); + let s2 = tree.add_child(tree.root(), "Details", "For details, see Section 1 for more info"); + tree.set_structure(s2, "2"); + + let resolved = EnrichStage::resolve_references(&mut tree); + assert_eq!(resolved, 1); + + // Verify the reference was stored on s2 and resolved to s1 + let refs = tree.get(s2).unwrap().references.clone(); + assert_eq!(refs.len(), 1); + assert_eq!(refs[0].ref_type, RefType::Section); + assert_eq!(refs[0].target_node, Some(s1)); + } + + #[test] + fn test_resolve_references_no_refs() { + let mut tree = DocumentTree::new("Root", "root content"); + tree.add_child(tree.root(), "Section 1", "No references here."); + + let resolved = EnrichStage::resolve_references(&mut tree); + assert_eq!(resolved, 0); + } +} diff --git a/rust/src/index/stages/reasoning.rs b/rust/src/index/stages/reasoning.rs index 0b02fcc5..0a7d1711 100644 --- a/rust/src/index/stages/reasoning.rs +++ b/rust/src/index/stages/reasoning.rs @@ -15,6 +15,7 @@ use crate::document::{ TopicEntry, }; use crate::error::Result; +use crate::llm::LlmClient; use crate::retrieval::scoring::extract_keywords; use super::async_trait; @@ -165,6 +166,76 @@ impl ReasoningIndexStage { section_map } + /// Expand keywords with LLM-generated synonyms. + /// + /// For the top-ranked keywords in `topic_paths` (up to `max_keywords`, ranked + /// by entry count), ask the LLM for synonymous search terms. Synonym entries + /// inherit the same node mappings but with a reduced weight (0.6x) to reflect + /// the indirect match. + async fn expand_synonyms( + topic_paths: &mut std::collections::HashMap<String, Vec<TopicEntry>>, + llm_client: &LlmClient, + max_keywords: usize, + ) -> usize { + use std::collections::HashSet; + + let existing_keys: HashSet<String> = topic_paths.keys().cloned().collect(); + // Pick top keywords by entry count for synonym expansion + let mut ranked: Vec<(String, usize)> = topic_paths + .iter() + .map(|(k, v)| (k.clone(), v.len())) + .collect(); + ranked.sort_by(|a, b| b.1.cmp(&a.1)); + ranked.truncate(max_keywords); + + let mut synonym_count = 0; + + for (keyword, _) in &ranked { + let prompt = format!( + "List up to 5 synonyms or related search terms for \"{}\". \ + Return only the terms separated by commas, no numbering, no explanation.", + keyword + ); + + match llm_client + .complete( + "You are a thesaurus assistant.
Return only comma-separated synonyms.", + &prompt, + ) + .await + { + Ok(response) => { + let synonyms: Vec<String> = response + .to_lowercase() + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty() && s.len() >= 2 && !existing_keys.contains(s)) + .collect(); + + if let Some(entries) = topic_paths.get(keyword) { + let source_entries = entries.clone(); + for syn in synonyms { + let synonym_entries: Vec<TopicEntry> = source_entries + .iter() + .map(|e| TopicEntry { + node_id: e.node_id, + weight: e.weight * 0.6, + depth: e.depth, + }) + .collect(); + topic_paths.insert(syn, synonym_entries); + synonym_count += 1; + } + } + } + Err(e) => { + tracing::warn!("Synonym expansion failed for '{}': {}", keyword, e); + } + } + } + + synonym_count + } + /// Build summary shortcut from root and depth-1 nodes. fn build_summary_shortcut(tree: &crate::document::DocumentTree) -> Option { let root = tree.root(); @@ -257,13 +328,28 @@ impl IndexStage for ReasoningIndexStage { info!("Building reasoning index..."); // 1. Build topic-to-path mapping - let (topic_paths, keyword_count) = Self::build_topic_paths(tree, config); + let (mut topic_paths, keyword_count) = Self::build_topic_paths(tree, config); let topic_count: usize = topic_paths.values().map(|v| v.len()).sum(); info!( "Built topic paths: {} keywords, {} topic entries", keyword_count, topic_count ); + // 1b. Optional: expand keywords with LLM-generated synonyms + let synonym_count = if config.enable_synonym_expansion { + if let Some(ref llm_client) = ctx.llm_client { + let max_kw = (keyword_count / 4).max(20).min(100); + let count = Self::expand_synonyms(&mut topic_paths, llm_client, max_kw).await; + info!("Expanded {} synonym keywords", count); + count + } else { + info!("Synonym expansion enabled but no LLM client available"); + 0 + } + } else { + 0 + }; + // 2.
Build section map let section_map = Self::build_section_map(tree); info!("Built section map with {} entries", section_map.len()); @@ -301,11 +387,12 @@ impl IndexStage for ReasoningIndexStage { .record_reasoning_index(duration, topic_count, keyword_count); info!( - "Reasoning index built in {}ms ({} keywords, {} topic entries, {} sections)", + "Reasoning index built in {}ms ({} keywords, {} topic entries, {} sections, {} synonyms)", duration, keyword_count, topic_count, reasoning_index.section_count(), + synonym_count, ); ctx.reasoning_index = Some(reasoning_index); @@ -319,6 +406,9 @@ impl IndexStage for ReasoningIndexStage { stage_result .metadata .insert("topics_indexed".to_string(), serde_json::json!(topic_count)); + stage_result + .metadata + .insert("synonyms_expanded".to_string(), serde_json::json!(synonym_count)); Ok(stage_result) } diff --git a/rust/src/index/summary/strategy.rs b/rust/src/index/summary/strategy.rs index 1e48b229..2753be91 100644 --- a/rust/src/index/summary/strategy.rs +++ b/rust/src/index/summary/strategy.rs @@ -75,9 +75,7 @@ pub enum SummaryStrategy { impl Default for SummaryStrategy { fn default() -> Self { - Self::Selective { - min_tokens: 100, - branch_only: true, + Self::Full { config: SummaryStrategyConfig::default(), } } diff --git a/rust/src/retrieval/search/beam.rs b/rust/src/retrieval/search/beam.rs index 649ef1ee..6181e1c7 100644 --- a/rust/src/retrieval/search/beam.rs +++ b/rust/src/retrieval/search/beam.rs @@ -256,8 +256,8 @@ impl BeamSearch { // Fallback stack holds viable paths truncated from the beam let mut fallback_stack: Vec = Vec::new(); - // Initialize with start_node's children - let start_children = tree.children(start_node); + // Initialize with start_node's children (includes resolved cross-references) + let start_children = tree.children_with_refs(start_node); debug!("Start node has {} children", start_children.len()); let initial_candidates = score_candidates_detailed( @@ -373,8 +373,8 @@ impl BeamSearch { continue; } - // Expand this path - let children = tree.children(leaf_id); + // Expand this path (includes resolved cross-references) + let children = tree.children_with_refs(leaf_id); let scored_children = score_candidates_detailed( tree, @@ -457,7 +457,7 @@ impl BeamSearch { // Fallback: if no results found, add best candidates regardless of score if result.paths.is_empty() && config.min_score > 0.0 { debug!("No results above min_score, adding best candidates as fallback"); - let all_children = tree.children(start_node); + let all_children = tree.children_with_refs(start_node); let fallback = score_candidates( tree, &all_children, diff --git a/rust/src/retrieval/search/greedy.rs b/rust/src/retrieval/search/greedy.rs index f644e986..506509b6 100644 --- a/rust/src/retrieval/search/greedy.rs +++ b/rust/src/retrieval/search/greedy.rs @@ -56,7 +56,7 @@ impl PurePilotSearch { for iteration in 0..config.max_iterations { result.iterations = iteration + 1; - let children = tree.children(current_node); + let children = tree.children_with_refs(current_node); if children.is_empty() { current_path.leaf = Some(current_node); diff --git a/rust/src/retrieval/search/mcts.rs b/rust/src/retrieval/search/mcts.rs index 3470af73..9f17cccc 100644 --- a/rust/src/retrieval/search/mcts.rs +++ b/rust/src/retrieval/search/mcts.rs @@ -89,7 +89,7 @@ impl MctsSearch { cache: &PilotDecisionCache, visited: &HashSet, ) -> Option<(NodeId, f32)> { - let children = tree.children(node_id); + let children = tree.children_with_refs(node_id); if children.is_empty() { 
return None; } @@ -160,7 +160,7 @@ impl MctsSearch { count += 1; while depth < max_depth { - let children = tree.children(current); + let children = tree.children_with_refs(current); if children.is_empty() { break; } @@ -312,7 +312,7 @@ impl MctsSearch { top_k: usize, result: &mut SearchResult, ) { - let root_children = tree.children(root); + let root_children = tree.children_with_refs(root); let mut scored_children: Vec<_> = root_children .iter() .filter_map(|&child_id| { diff --git a/rust/src/retrieval/search/toc_navigator.rs b/rust/src/retrieval/search/toc_navigator.rs index 95e0cf2a..badf6444 100644 --- a/rust/src/retrieval/search/toc_navigator.rs +++ b/rust/src/retrieval/search/toc_navigator.rs @@ -434,7 +434,7 @@ fn collect_tree_entries( entries.push((node.title.clone(), summary)); node_ids.push(node_id); - for child_id in tree.children(node_id) { + for child_id in tree.children_with_refs(node_id) { collect_tree_entries(tree, child_id, entries, node_ids, depth + 1, max_depth); } } diff --git a/rust/src/retrieval/stages/search.rs b/rust/src/retrieval/stages/search.rs index a15410c3..bdc0d9c7 100644 --- a/rust/src/retrieval/stages/search.rs +++ b/rust/src/retrieval/stages/search.rs @@ -27,7 +27,8 @@ use crate::retrieval::search::{ SearchTree, ToCNavigator, }; use crate::retrieval::strategy::{ - HybridConfig, HybridStrategy, KeywordStrategy, LlmStrategy, RetrievalStrategy, + CrossDocumentConfig, CrossDocumentStrategy, DocumentEntry, HybridConfig, HybridStrategy, + KeywordStrategy, LlmStrategy, RetrievalStrategy, }; use crate::retrieval::types::{ NavigationDecision, ReasoningCandidate, ReasoningStep, StageName, StrategyPreference, @@ -152,14 +153,35 @@ impl SearchStage { Arc::new(self.keyword_strategy.clone()) } } - StrategyPreference::ForceCrossDocument | StrategyPreference::ForcePageRange => { + StrategyPreference::ForceCrossDocument => { + // Build a CrossDocumentStrategy with graph-based boosting + let inner: Box<dyn RetrievalStrategy> = + Box::new(self.keyword_strategy.clone()); + + let cross_doc = + CrossDocumentStrategy::new(inner).with_config(CrossDocumentConfig::default()); + + // Attach graph for GraphBoosted merge if available. + // Multi-document trees are collected at the orchestrator level. + let cross_doc = if let Some(ref graph) = ctx.document_graph { + cross_doc.with_graph(graph.clone()) + } else { + cross_doc + }; + + info!( + "Using CrossDocument strategy (graph={})", + ctx.document_graph.is_some() + ); + Arc::new(cross_doc) + } + StrategyPreference::ForcePageRange => { if let Some(ref strategy) = self.hybrid_strategy { - info!("Using Hybrid strategy as fallback for {:?})", preference); + info!("Using Hybrid strategy as fallback for ForcePageRange"); strategy.clone() } else { warn!( - "{:?} requires special configuration, falling back to Keyword", - preference + "ForcePageRange requires special configuration, falling back to Keyword" ); Arc::new(self.keyword_strategy.clone()) } diff --git a/rust/src/retrieval/strategy/mod.rs b/rust/src/retrieval/strategy/mod.rs index 44bdf880..19f9ac38 100644 --- a/rust/src/retrieval/strategy/mod.rs +++ b/rust/src/retrieval/strategy/mod.rs @@ -18,6 +18,7 @@ mod llm; mod page_range; mod r#trait; +pub use cross_document::{CrossDocumentConfig, CrossDocumentStrategy, DocumentEntry}; pub use hybrid::{HybridConfig, HybridStrategy}; pub use keyword::KeywordStrategy; pub use llm::LlmStrategy;
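The Python binding in this diff surfaces the new `enable_synonym_expansion` flag on `IndexOptions`. A minimal usage sketch follows: the `IndexOptions` keyword argument mirrors the pyo3 signature above (where the flag defaults to `False`), while passing the options into `engine.index(...)` via an `options=` keyword is an assumption made for illustration and may not match the actual call signature.

```python
import asyncio
from vectorless import Engine, IndexContext, IndexOptions

async def main():
    engine = Engine(workspace="./workspace", api_key="sk-...", model="gpt-4o")

    # The Python constructor defaults enable_synonym_expansion to False;
    # opt in so the reasoning-index stage expands keywords with
    # LLM-generated synonyms (stored at 0.6x the source keyword's weight).
    opts = IndexOptions(enable_synonym_expansion=True)
    print(opts)  # repr now ends with enable_synonym_expansion=True

    # Hypothetical wiring: the `options=` parameter name is an assumption,
    # not something confirmed by this diff.
    result = await engine.index(
        IndexContext.from_dir("./documents/"),
        options=opts,
    )
    print(f"Indexed {len(result.items)} documents")

asyncio.run(main())
```

Note the default mismatch this example works around: the Rust `IndexOptions` and `ReasoningIndexConfig` in this diff default the flag to `true`, while the Python constructor defaults it to `false`, so Python callers must enable it explicitly.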