diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..c7f5c576 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,56 @@ +name: Release + +on: + push: + tags: + - 'v*' + +env: + CARGO_TERM_COLOR: always + +jobs: + # Publish Rust crate to crates.io + publish-crates: + name: Publish to crates.io + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - name: Publish vectorless crate + run: cargo publish -p vectorless + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} + + # Publish Python package to PyPI + publish-pypi: + name: Publish to PyPI + runs-on: ubuntu-latest + permissions: + id-token: write # Trusted Publishers + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: pyo3/maturin-action@v1 + with: + command: publish + args: --skip-existing + maturin-version: latest + + # Create GitHub Release + github-release: + name: GitHub Release + runs-on: ubuntu-latest + needs: [publish-crates, publish-pypi] + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + - name: Extract version from tag + id: version + run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + tag_name: ${{ github.ref_name }} + name: Release ${{ steps.version.outputs.VERSION }} + generate_release_notes: true diff --git a/Cargo.toml b/Cargo.toml index ef9c22b5..1a626bab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["rust", "python"] resolver = "2" [workspace.package] -version = "0.1.27" +version = "0.1.28" edition = "2024" authors = ["zTgx "] license = "Apache-2.0" diff --git a/README.md b/README.md index 2dd54431..d74ea257 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,6 @@ from vectorless import Engine, IndexContext async def main(): # Create engine — api_key and model are required engine = Engine( - 
workspace="./data", api_key="sk-...", model="gpt-4o", ) @@ -63,7 +62,6 @@ use vectorless::client::{EngineBuilder, IndexContext, QueryContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { let engine = EngineBuilder::new() - .with_workspace("./data") .with_key("sk-...") .with_model("gpt-4o") .build() @@ -75,7 +73,7 @@ async fn main() -> vectorless::Result<()> { // Query let result = engine.query( - QueryContext::new("What is the total revenue?").with_doc_id(doc_id) + QueryContext::new("What is the total revenue?").with_doc_ids(vec![doc_id.to_string()]) ).await?; println!("Answer: {}", result.content); diff --git a/docs/blog/2026-04-12-welcome/index.mdx b/docs/blog/2026-04-12-welcome/index.mdx index e9de1b1d..686655cd 100644 --- a/docs/blog/2026-04-12-welcome/index.mdx +++ b/docs/blog/2026-04-12-welcome/index.mdx @@ -29,7 +29,6 @@ from vectorless import Engine, IndexContext async def main(): engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", ) @@ -53,7 +52,6 @@ use vectorless::{EngineBuilder, IndexContext, QueryContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { let engine = EngineBuilder::new() - .with_workspace("./data") .with_key("sk-...") .with_model("gpt-4o") .build() @@ -63,7 +61,7 @@ async fn main() -> vectorless::Result<()> { let doc_id = result.doc_id().unwrap(); let result = engine.query( - QueryContext::new("What is the total revenue?").with_doc_id(doc_id) + QueryContext::new("What is the total revenue?").with_doc_ids(vec![doc_id.to_string()]) ).await?; println!("{}", result.content); diff --git a/docs/docs/examples/batch-indexing.mdx b/docs/docs/examples/batch-indexing.mdx index ea7b23db..847e738b 100644 --- a/docs/docs/examples/batch-indexing.mdx +++ b/docs/docs/examples/batch-indexing.mdx @@ -14,7 +14,6 @@ from vectorless import Engine, IndexContext, IndexOptions async def main(): engine = Engine( - workspace="./workspace", api_key="sk-...", model="gpt-4o", ) @@ -53,7 +52,6 @@ use 
vectorless::client::{Engine, EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { let engine = EngineBuilder::new() - .with_workspace("./workspace") .with_key("sk-...") .with_model("gpt-4o") .build() diff --git a/docs/docs/examples/multi-document.mdx b/docs/docs/examples/multi-document.mdx index 6b6c404b..11ddc458 100644 --- a/docs/docs/examples/multi-document.mdx +++ b/docs/docs/examples/multi-document.mdx @@ -12,12 +12,11 @@ Query across multiple indexed documents using the cross-document strategy with g import asyncio from vectorless import ( Engine, IndexContext, QueryContext, - IndexOptions, StrategyPreference + IndexOptions, ) async def main(): engine = Engine( - workspace="./workspace", api_key="sk-...", model="gpt-4o", ) @@ -44,7 +43,6 @@ async def main(): result = await engine.query( QueryContext("Compare quarterly revenue trends") .with_doc_ids(doc_ids) - .with_strategy(StrategyPreference.CROSS_DOCUMENT) ) for item in result.items: @@ -54,7 +52,6 @@ async def main(): # Or query entire workspace result = await engine.query( QueryContext("What documents discuss risk factors?") - .with_workspace() ) print(f"\nFound in {len(result.items)} document(s)") diff --git a/docs/docs/examples/quick-query.mdx b/docs/docs/examples/quick-query.mdx index fd172a5f..07f66390 100644 --- a/docs/docs/examples/quick-query.mdx +++ b/docs/docs/examples/quick-query.mdx @@ -10,12 +10,11 @@ This example demonstrates the basic index-and-query workflow with both Python an ```python import asyncio -from vectorless import Engine, IndexContext, QueryContext, StrategyPreference +from vectorless import Engine, IndexContext, QueryContext async def main(): # 1. Create engine engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", ) @@ -28,16 +27,14 @@ async def main(): # 3. 
Simple keyword query answer = await engine.query( QueryContext("revenue") - .with_doc_id(doc_id) - .with_strategy(StrategyPreference.KEYWORD) + .with_doc_ids([doc_id]) ) print(f"Keyword result: {answer.single().content[:200]}") # 4. Complex reasoning query answer = await engine.query( QueryContext("What are the main factors affecting performance?") - .with_doc_id(doc_id) - .with_strategy(StrategyPreference.HYBRID) + .with_doc_ids([doc_id]) ) print(f"Score: {answer.single().score:.2f}") print(f"Hybrid result: {answer.single().content[:200]}") @@ -58,7 +55,6 @@ use vectorless::StrategyPreference; async fn main() -> vectorless::Result<()> { // 1. Create engine let engine = EngineBuilder::new() - .with_workspace("./data") .with_key("sk-...") .with_model("gpt-4o") .build() @@ -72,7 +68,7 @@ async fn main() -> vectorless::Result<()> { // 3. Query with hybrid strategy let answer = engine.query( QueryContext::new("What are the main factors affecting performance?") - .with_doc_id(&doc_id) + .with_doc_ids(vec![doc_id.clone()]) ).await?; if let Some(item) = answer.single() { diff --git a/docs/docs/features/cross-document-graph.mdx b/docs/docs/features/cross-document-graph.mdx index e87c7d29..1ac22fbe 100644 --- a/docs/docs/features/cross-document-graph.mdx +++ b/docs/docs/features/cross-document-graph.mdx @@ -40,15 +40,13 @@ When using the cross-document strategy, the graph boosts scores for connected do 4. 
Re-rank the merged result set ```python -from vectorless import Engine, QueryContext, StrategyPreference +from vectorless import Engine, QueryContext -engine = Engine(workspace="./data", api_key="sk-...", model="gpt-4o") +engine = Engine(api_key="sk-...", model="gpt-4o") # Query across all documents with graph boosting result = await engine.query( - QueryContext("Compare the approaches").with_strategy( - StrategyPreference.CROSS_DOCUMENT - ) + QueryContext("Compare the approaches") ) ``` diff --git a/docs/docs/features/pdf-support.mdx b/docs/docs/features/pdf-support.mdx index 96a683a2..48682f22 100644 --- a/docs/docs/features/pdf-support.mdx +++ b/docs/docs/features/pdf-support.mdx @@ -11,7 +11,7 @@ Vectorless supports PDF documents with full page-level tracking and hierarchical ```python from vectorless import Engine, IndexContext -engine = Engine(workspace="./data", api_key="sk-...", model="gpt-4o") +engine = Engine(api_key="sk-...", model="gpt-4o") # Index a PDF result = await engine.index(IndexContext.from_path("./report.pdf")) @@ -19,7 +19,7 @@ doc_id = result.doc_id # Query the PDF answer = await engine.query( - QueryContext("What is discussed on page 5?").with_doc_id(doc_id) + QueryContext("What is discussed on page 5?").with_doc_ids([doc_id]) ) print(answer.single().content) ``` diff --git a/docs/docs/getting-started.mdx b/docs/docs/getting-started.mdx index 60c27ea0..14f541a6 100644 --- a/docs/docs/getting-started.mdx +++ b/docs/docs/getting-started.mdx @@ -26,7 +26,6 @@ from vectorless import Engine, IndexContext, QueryContext async def main(): # Create an engine engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", ) @@ -38,7 +37,7 @@ async def main(): # Query the document answer = await engine.query( - QueryContext("What is the total revenue?").with_doc_id(doc_id) + QueryContext("What is the total revenue?").with_doc_ids([doc_id]) ) print(answer.single().content) @@ -49,7 +48,6 @@ asyncio.run(main()) ```python engine = Engine( - 
workspace="./data", api_key="sk-...", model="gpt-4o", endpoint="https://api.your-provider.com/v1", @@ -75,7 +73,6 @@ use vectorless::{EngineBuilder, IndexContext, QueryContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { let engine = EngineBuilder::new() - .with_workspace("./data") .with_key("sk-...") .with_model("gpt-4o") .build() @@ -85,7 +82,7 @@ async fn main() -> vectorless::Result<()> { let doc_id = result.doc_id().unwrap(); let result = engine.query( - QueryContext::new("What is the total revenue?").with_doc_id(doc_id) + QueryContext::new("What is the total revenue?").with_doc_ids(vec![doc_id.to_string()]) ).await?; if let Some(item) = result.single() { diff --git a/docs/docs/intro.mdx b/docs/docs/intro.mdx index 2e65e88a..88fa23fb 100644 --- a/docs/docs/intro.mdx +++ b/docs/docs/intro.mdx @@ -28,7 +28,6 @@ from vectorless import Engine, IndexContext async def main(): engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", ) @@ -55,7 +54,6 @@ use vectorless::{EngineBuilder, IndexContext, QueryContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { let engine = EngineBuilder::new() - .with_workspace("./data") .with_key("sk-...") .with_model("gpt-4o") .build() @@ -65,7 +63,7 @@ async fn main() -> vectorless::Result<()> { let doc_id = result.doc_id().unwrap(); let result = engine.query( - QueryContext::new("What is the total revenue?").with_doc_id(doc_id) + QueryContext::new("What is the total revenue?").with_doc_ids(vec![doc_id.to_string()]) ).await?; println!("{}", result.content); diff --git a/docs/docs/retrieval/search-algorithms.mdx b/docs/docs/retrieval/search-algorithms.mdx index 370c762c..a0f45242 100644 --- a/docs/docs/retrieval/search-algorithms.mdx +++ b/docs/docs/retrieval/search-algorithms.mdx @@ -72,7 +72,7 @@ from vectorless import QueryContext ctx = ( QueryContext("complex multi-hop question") - .with_doc_id(doc_id) + .with_doc_ids([doc_id]) .with_depth_limit(10) # Max tree traversal depth 
.with_max_tokens(4000) # Max tokens in result ) diff --git a/docs/docs/retrieval/strategies.mdx b/docs/docs/retrieval/strategies.mdx index e60cd3e1..e718220a 100644 --- a/docs/docs/retrieval/strategies.mdx +++ b/docs/docs/retrieval/strategies.mdx @@ -21,11 +21,9 @@ Vectorless provides five retrieval strategies, each designed for different query Fast TF-IDF/BM25 matching against the pre-computed reasoning index. No LLM calls during search. ```python -from vectorless import QueryContext, StrategyPreference +from vectorless import QueryContext -ctx = QueryContext("revenue").with_doc_id(doc_id).with_strategy( - StrategyPreference.KEYWORD -) +ctx = QueryContext("revenue").with_doc_ids([doc_id]) ``` Use when: @@ -38,9 +36,7 @@ Use when: LLM-powered tree navigation with full contextual understanding. The LLM sees the table of contents, node summaries, and makes navigation decisions at each level. ```python -ctx = QueryContext("Explain the relationship between architecture and performance").with_doc_id(doc_id).with_strategy( - StrategyPreference.LLM -) +ctx = QueryContext("Explain the relationship between architecture and performance").with_doc_ids([doc_id]) ``` Use when: @@ -53,9 +49,7 @@ Use when: Two-phase retrieval: BM25 pre-filter followed by LLM refinement. Combines the speed of keyword matching with the accuracy of LLM reasoning. ```python -ctx = QueryContext("What are the growth trends?").with_doc_id(doc_id).with_strategy( - StrategyPreference.HYBRID -) +ctx = QueryContext("What are the growth trends?").with_doc_ids([doc_id]) ``` The recommended default for most queries. Fast pre-filtering reduces the number of nodes sent to the LLM, keeping token costs manageable while maintaining high accuracy. @@ -65,16 +59,14 @@ The recommended default for most queries. Fast pre-filtering reduces the number Searches across multiple indexed documents and aggregates results. Uses the cross-document relationship graph for score boosting. 
```python -ctx = QueryContext("Compare the architectures").with_strategy( - StrategyPreference.CROSS_DOCUMENT -) +ctx = QueryContext("Compare the architectures") ``` When a high-confidence result is found in one document, neighbor documents in the graph receive a score boost, surfacing related content across the workspace. ## Auto Selection -When using `StrategyPreference.AUTO` (default), the engine analyzes query complexity and selects the appropriate strategy: +By default, the engine analyzes query complexity and automatically selects the appropriate strategy: - Simple keyword queries → Keyword strategy - Complex reasoning queries → Hybrid strategy diff --git a/docs/docs/sdk/python.mdx b/docs/docs/sdk/python.mdx index a4306d68..bc3ae0ba 100644 --- a/docs/docs/sdk/python.mdx +++ b/docs/docs/sdk/python.mdx @@ -20,7 +20,6 @@ The `Engine` is the main entry point. It requires an LLM API key and model name. from vectorless import Engine engine = Engine( - workspace="./data", # Local directory for indexed data api_key="sk-...", # LLM API key model="gpt-4o", # LLM model name endpoint=None, # Optional: custom API endpoint @@ -77,12 +76,11 @@ result = await engine.index( ### Single Document ```python -from vectorless import QueryContext, StrategyPreference +from vectorless import QueryContext answer = await engine.query( QueryContext("What is the total revenue?") - .with_doc_id(doc_id) - .with_strategy(StrategyPreference.HYBRID) + .with_doc_ids([doc_id]) ) if answer.single(): @@ -104,7 +102,6 @@ answer = await engine.query( ```python answer = await engine.query( QueryContext("What documents discuss performance?") - .with_workspace() ) ``` @@ -113,11 +110,10 @@ answer = await engine.query( ```python answer = await engine.query( QueryContext("Explain the architecture") - .with_doc_id(doc_id) + .with_doc_ids([doc_id]) .with_max_tokens(4000) # Max tokens in result .with_include_reasoning(True) # Include reasoning chain .with_depth_limit(10) # Max traversal depth - 
.with_strategy(StrategyPreference.LLM) ) ``` @@ -162,14 +158,3 @@ if graph: | `include_text` | `bool` | `True` | Include node text | | `generate_ids` | `bool` | `True` | Generate node IDs | | `enable_synonym_expansion` | `bool` | `True` | LLM synonym expansion | - -### StrategyPreference - -| Constant | Description | -|----------|-------------| -| `StrategyPreference.AUTO` | Auto-select based on query complexity | -| `StrategyPreference.KEYWORD` | Fast keyword matching | -| `StrategyPreference.LLM` | LLM-guided navigation | -| `StrategyPreference.HYBRID` | BM25 + LLM refinement | -| `StrategyPreference.CROSS_DOCUMENT` | Multi-document retrieval | -| `StrategyPreference.PAGE_RANGE` | Page-scoped retrieval | diff --git a/docs/docs/sdk/rust.mdx b/docs/docs/sdk/rust.mdx index 768368b1..2136cafa 100644 --- a/docs/docs/sdk/rust.mdx +++ b/docs/docs/sdk/rust.mdx @@ -19,7 +19,6 @@ vectorless = "0.1" use vectorless::client::{Engine, EngineBuilder}; let engine = EngineBuilder::new() - .with_workspace("./data") .with_key("sk-...") .with_model("gpt-4o") .with_endpoint("https://api.openai.com/v1") // optional @@ -57,7 +56,7 @@ use vectorless::StrategyPreference; let result = engine.query( QueryContext::new("What is the total revenue?") - .with_doc_id(doc_id) + .with_doc_ids(vec![doc_id.to_string()]) .with_strategy(StrategyPreference::ForceHybrid) .with_max_tokens(4000) .with_include_reasoning(true) diff --git a/docs/src/pages/index.tsx b/docs/src/pages/index.tsx index 932e1f0f..3ecc47c7 100644 --- a/docs/src/pages/index.tsx +++ b/docs/src/pages/index.tsx @@ -49,7 +49,6 @@ from vectorless import Engine, IndexContext async def main(): engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", ) diff --git a/examples/batch_indexing/main.py b/examples/batch_indexing/main.py index 7d6d03cb..c68b3626 100644 --- a/examples/batch_indexing/main.py +++ b/examples/batch_indexing/main.py @@ -22,8 +22,6 @@ API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") MODEL = 
os.environ.get("VECTORLESS_MODEL", "gpt-4o") ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -WORKSPACE = "./workspace" - # Sample documents for demonstration DOCS = { "alpha.md": """\ @@ -81,7 +79,6 @@ def write_sample_docs(base_dir: str) -> list[str]: async def main() -> None: engine = Engine( - workspace=WORKSPACE, api_key=API_KEY, model=MODEL, endpoint=ENDPOINT, diff --git a/examples/document_management/main.py b/examples/document_management/main.py index f5d72360..5d206a89 100644 --- a/examples/document_management/main.py +++ b/examples/document_management/main.py @@ -21,8 +21,6 @@ API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -WORKSPACE = "./workspace" - # Sample documents SAMPLE_A = """\ # Project Alpha @@ -57,7 +55,6 @@ async def main() -> None: engine = Engine( - workspace=WORKSPACE, api_key=API_KEY, model=MODEL, endpoint=ENDPOINT, @@ -98,7 +95,7 @@ async def main() -> None: # ---- Query a specific document ---- print("--- query(doc_id_a) ---") answer = await engine.query( - QueryContext("What storage engines does Alpha support?").with_doc_id(doc_id_a) + QueryContext("What storage engines does Alpha support?").with_doc_ids([doc_id_a]) ) item = answer.single() if item: diff --git a/examples/error_handling/main.py b/examples/error_handling/main.py index 993814a6..22099e3d 100644 --- a/examples/error_handling/main.py +++ b/examples/error_handling/main.py @@ -21,12 +21,9 @@ API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -WORKSPACE = "./workspace" - async def main() -> None: engine = Engine( - workspace=WORKSPACE, api_key=API_KEY, model=MODEL, endpoint=ENDPOINT, @@ -57,7 +54,7 @@ async def main() -> None: print("--- Query non-existent document ---") try: await engine.query( - QueryContext("What is 
this?").with_doc_id("does-not-exist") + QueryContext("What is this?").with_doc_ids(["does-not-exist"]) ) except VectorlessError as e: print(f" Caught VectorlessError:") @@ -88,7 +85,6 @@ async def main() -> None: print("--- Engine with invalid credentials ---") try: bad_engine = Engine( - workspace=WORKSPACE + "_bad", api_key="sk-invalid-key-12345", model="gpt-4o", ) diff --git a/examples/index_directory/main.py b/examples/index_directory/main.py index f2446215..08b1c3bd 100644 --- a/examples/index_directory/main.py +++ b/examples/index_directory/main.py @@ -34,7 +34,6 @@ async def main(): endpoint = os.environ.get("LLM_ENDPOINT", "http://localhost:4000/api/v1") engine = Engine( - workspace="./workspace_directory_example", api_key=api_key, model=model, endpoint=endpoint, diff --git a/examples/index_metrics/main.py b/examples/index_metrics/main.py index 3bff91cb..bfea4cf0 100644 --- a/examples/index_metrics/main.py +++ b/examples/index_metrics/main.py @@ -26,8 +26,6 @@ API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -WORKSPACE = "./workspace" - # --- Sample documents with varying complexity --- SIMPLE_DOC = """\ # Quick Note @@ -139,7 +137,6 @@ def print_full_report(item: IndexItem) -> None: async def main() -> None: engine = Engine( - workspace=WORKSPACE, api_key=API_KEY, model=MODEL, endpoint=ENDPOINT, diff --git a/examples/indexing/main.py b/examples/indexing/main.py index fd507fdd..f2adce3b 100644 --- a/examples/indexing/main.py +++ b/examples/indexing/main.py @@ -16,13 +16,11 @@ # Replace with your own credentials API_KEY = "sk-..." MODEL = "gpt-4o" -WORKSPACE = "./workspace" async def main(): # --- 1. Create engine --- engine = Engine( - workspace=WORKSPACE, api_key=API_KEY, model=MODEL, ) @@ -94,7 +92,7 @@ async def main(): # --- 5. 
Query --- print("--- Query ---") answer = await engine.query( - QueryContext("What was the total revenue?").with_doc_id(file_doc_id) + QueryContext("What was the total revenue?").with_doc_ids([file_doc_id]) ) item = answer.single() if item: diff --git a/examples/pdf_indexing/main.py b/examples/pdf_indexing/main.py index e79b6db5..c1e36727 100644 --- a/examples/pdf_indexing/main.py +++ b/examples/pdf_indexing/main.py @@ -26,8 +26,6 @@ API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -WORKSPACE = "./workspace" - # Resolve the sample PDF path relative to the repo root SAMPLE_PDF = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), @@ -70,7 +68,6 @@ async def main() -> None: sys.exit(1) engine = Engine( - workspace=WORKSPACE, api_key=API_KEY, model=MODEL, endpoint=ENDPOINT, @@ -108,7 +105,7 @@ async def main() -> None: print_separator("Query") answer = await engine.query( - QueryContext("What is this document about?").with_doc_id(doc_id) + QueryContext("What is this document about?").with_doc_ids([doc_id]) ) item = answer.single() if item: diff --git a/pyproject.toml b/pyproject.toml index 8bc47032..f752a6ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,8 +4,8 @@ build-backend = "maturin" [project] name = "vectorless" -version = "0.1.6" -description = "Hierarchical document intelligence without vectors" +version = "0.1.7" +description = "Reasoning-native document intelligence engine for AI" readme = "README.md" requires-python = ">=3.9" license = { text = "Apache-2.0" } diff --git a/python/README.md b/python/README.md index 97cf79cd..4ca5fa40 100644 --- a/python/README.md +++ b/python/README.md @@ -1,6 +1,6 @@ -# Vectorless Python Bindings +# Vectorless Python SDK -Python bindings for [vectorless](https://github.com/vectorlessflow/vectorless) - a hierarchical document intelligence engine. 
+Python bindings for [vectorless](https://github.com/vectorlessflow/vectorless) — a reasoning-native document intelligence engine for AI. ## Installation @@ -12,23 +12,24 @@ pip install vectorless ```python import asyncio -from vectorless import Engine, IndexContext +from vectorless import Engine, IndexContext, QueryContext async def main(): # Create engine — api_key and model are required engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", ) # Index a document - result = await engine.index(IndexContext.from_file("./report.pdf")) + result = await engine.index(IndexContext.from_path("./report.pdf")) doc_id = result.doc_id print(f"Indexed: {doc_id}") # Query the document - result = await engine.query(doc_id, "What is the total revenue?") + result = await engine.query( + QueryContext("What is the total revenue?").with_doc_ids([doc_id]) + ) item = result.single() print(f"Answer: {item.content}") print(f"Score: {item.score:.2f}") @@ -53,7 +54,6 @@ The main entry point for vectorless. class Engine: def __init__( self, - workspace: str | None = None, config_path: str | None = None, api_key: str | None = None, model: str | None = None, @@ -61,7 +61,7 @@ class Engine: ): ... async def index(self, ctx: IndexContext) -> IndexResult: ... - async def query(self, doc_id: str | list[str], question: str) -> QueryResult: ... + async def query(self, ctx: QueryContext) -> QueryResult: ... async def list(self) -> list[DocumentInfo]: ... async def remove(self, doc_id: str) -> bool: ... async def clear(self) -> int: ... @@ -76,13 +76,13 @@ Context for indexing documents. ```python class IndexContext: @staticmethod - def from_file(path: str, name: str | None = None) -> IndexContext: ... + def from_path(path: str, name: str | None = None) -> IndexContext: ... @staticmethod - def from_files(paths: list[str]) -> IndexContext: ... + def from_paths(paths: list[str]) -> IndexContext: ... @staticmethod - def from_dir(path: str) -> IndexContext: ... 
+ def from_dir(path: str, recursive: bool = False) -> IndexContext: ... @staticmethod def from_content( @@ -106,16 +106,19 @@ class IndexContext: - `"markdown"` / `"md"` - Markdown content - `"pdf"` - PDF documents -### IndexOptions +### QueryContext + +Context for querying documents. ```python -class IndexOptions: - def __init__( - self, - mode: str = "default", - summaries: bool = False, - description: bool = False, - ): ... +class QueryContext: + def __init__(self, query: str): ... + + def with_doc_ids(self, doc_ids: list[str]) -> QueryContext: ... + def with_workspace(self) -> QueryContext: ... + def with_max_tokens(self, tokens: int) -> QueryContext: ... + def with_include_reasoning(self, include: bool) -> QueryContext: ... + def with_depth_limit(self, depth: int) -> QueryContext: ... ``` ### IndexResult @@ -160,6 +163,26 @@ class QueryResultItem: def node_ids(self) -> list[str]: ... ``` +### IndexItem + +```python +class IndexItem: + @property + def doc_id(self) -> str: ... + @property + def name(self) -> str: ... + @property + def format(self) -> str: ... + @property + def description(self) -> str | None: ... + @property + def source_path(self) -> str | None: ... + @property + def page_count(self) -> int | None: ... + @property + def metrics(self) -> IndexMetrics | None: ... +``` + ### DocumentInfo ```python @@ -173,6 +196,8 @@ class DocumentInfo: @property def description(self) -> str | None: ... @property + def source_path(self) -> str | None: ... + @property def page_count(self) -> int | None: ... @property def line_count(self) -> int | None: ... 
@@ -196,8 +221,7 @@ class VectorlessError(Exception): # Install maturin pip install maturin -# Build and install -cd python +# Build and install (from project root) maturin develop # Run tests diff --git a/python/src/lib.rs b/python/src/lib.rs index 640b1024..c0649759 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -9,7 +9,6 @@ use pyo3_async_runtimes::tokio::future_into_py; use std::sync::Arc; use tokio::runtime::Runtime; -use ::vectorless::StrategyPreference; use ::vectorless::client::{ DocumentFormat, DocumentInfo, Engine, EngineBuilder, FailedItem, IndexContext, IndexItem, IndexMode, IndexOptions, IndexResult, QueryContext, QueryResult, QueryResultItem, @@ -226,11 +225,7 @@ impl PyIndexContext { #[staticmethod] #[pyo3(signature = (path, recursive=false))] fn from_dir(path: String, recursive: bool) -> Self { - let inner = if recursive { - IndexContext::from_dir_recursive(&path) - } else { - IndexContext::from_dir(&path) - }; + let inner = IndexContext::from_dir(&path, recursive); Self { inner } } @@ -298,83 +293,6 @@ impl PyIndexContext { } } -// ============================================================ -// StrategyPreference -// ============================================================ - -/// Retrieval strategy preference. -/// -/// Controls how the engine searches the document tree. 
-/// -/// ```python -/// from vectorless import QueryContext, StrategyPreference -/// -/// # Force keyword-only (fastest, no LLM calls during search) -/// ctx = QueryContext("revenue").with_doc_id(doc_id).with_strategy(StrategyPreference.KEYWORD) -/// -/// # Force LLM-guided navigation (most accurate, uses more tokens) -/// ctx = QueryContext("explain the architecture").with_doc_id(doc_id).with_strategy(StrategyPreference.LLM) -/// -/// # Force hybrid (BM25 + LLM refinement) -/// ctx = QueryContext("growth trends").with_doc_id(doc_id).with_strategy(StrategyPreference.HYBRID) -/// ``` -#[pyclass(name = "StrategyPreference", skip_from_py_object)] -#[derive(Clone)] -pub struct PyStrategyPreference { - inner: StrategyPreference, -} - -#[pymethods] -impl PyStrategyPreference { - /// Auto-select based on query complexity (default). - #[classattr] - const AUTO: PyStrategyPreference = PyStrategyPreference { - inner: StrategyPreference::Auto, - }; - - /// Force keyword-based strategy (fast, no LLM during search). - #[classattr] - const KEYWORD: PyStrategyPreference = PyStrategyPreference { - inner: StrategyPreference::ForceKeyword, - }; - - /// Force LLM-guided navigation (deep reasoning). - #[classattr] - const LLM: PyStrategyPreference = PyStrategyPreference { - inner: StrategyPreference::ForceLlm, - }; - - /// Force hybrid strategy (BM25 + LLM refinement). - #[classattr] - const HYBRID: PyStrategyPreference = PyStrategyPreference { - inner: StrategyPreference::ForceHybrid, - }; - - /// Force cross-document strategy (multi-document retrieval). - #[classattr] - const CROSS_DOCUMENT: PyStrategyPreference = PyStrategyPreference { - inner: StrategyPreference::ForceCrossDocument, - }; - - /// Force page-range strategy (filter by page range). 
- #[classattr] - const PAGE_RANGE: PyStrategyPreference = PyStrategyPreference { - inner: StrategyPreference::ForcePageRange, - }; - - fn __repr__(&self) -> String { - let name = match self.inner { - StrategyPreference::Auto => "AUTO", - StrategyPreference::ForceKeyword => "KEYWORD", - StrategyPreference::ForceLlm => "LLM", - StrategyPreference::ForceHybrid => "HYBRID", - StrategyPreference::ForceCrossDocument => "CROSS_DOCUMENT", - StrategyPreference::ForcePageRange => "PAGE_RANGE", - }; - format!("StrategyPreference.{}", name) - } -} - // ============================================================ // QueryContext // ============================================================ @@ -384,8 +302,8 @@ impl PyStrategyPreference { /// ```python /// from vectorless import QueryContext /// -/// # Query a single document -/// ctx = QueryContext("What is the total revenue?").with_doc_id(doc_id) +/// # Query specific documents +/// ctx = QueryContext("What is the total revenue?").with_doc_ids([doc_id]) /// /// # Query multiple documents /// ctx = QueryContext("What is the architecture?").with_doc_ids(["doc-1", "doc-2"]) @@ -408,13 +326,7 @@ impl PyQueryContext { } } - /// Set scope to a single document. - fn with_doc_id(&self, doc_id: String) -> Self { - let ctx = self.inner.clone().with_doc_id(&doc_id); - Self { inner: ctx } - } - - /// Set scope to multiple documents. + /// Set scope to specific documents. fn with_doc_ids(&self, doc_ids: Vec) -> Self { let ctx = self.inner.clone().with_doc_ids(doc_ids); Self { inner: ctx } @@ -444,15 +356,6 @@ impl PyQueryContext { Self { inner: ctx } } - /// Set the retrieval strategy. - /// - /// Args: - /// strategy: A StrategyPreference constant, e.g. StrategyPreference.LLM. 
- fn with_strategy(&self, strategy: &PyStrategyPreference) -> Self { - let ctx = self.inner.clone().with_strategy(strategy.inner); - Self { inner: ctx } - } - fn __repr__(&self) -> String { "QueryContext(...)".to_string() } @@ -1087,6 +990,11 @@ impl PyIndexItem { self.inner.description.as_deref() } + #[getter] + fn source_path(&self) -> Option<&str> { + self.inner.source_path.as_deref() + } + #[getter] fn page_count(&self) -> Option { self.inner.page_count @@ -1199,6 +1107,11 @@ impl PyDocumentInfo { self.inner.description.as_deref() } + #[getter] + fn source_path(&self) -> Option<&str> { + self.inner.source_path.as_deref() + } + #[getter] fn page_count(&self) -> Option { self.inner.page_count @@ -1484,7 +1397,6 @@ fn run_metrics_report(engine: Arc) -> PyMetricsReport { /// from vectorless import Engine, IndexContext, QueryContext /// /// engine = Engine( -/// workspace="./data", /// api_key="sk-...", /// model="gpt-4o", /// ) @@ -1494,7 +1406,7 @@ fn run_metrics_report(engine: Arc) -> PyMetricsReport { /// doc_id = result.doc_id /// /// # Query -/// answer = await engine.query(QueryContext("What is the revenue?").with_doc_id(doc_id)) +/// answer = await engine.query(QueryContext("What is the revenue?").with_doc_ids([doc_id])) /// print(answer.single().content) /// ``` #[pyclass(name = "Engine")] @@ -1507,7 +1419,6 @@ impl PyEngine { /// Create a new Engine. /// /// Args: - /// workspace: Path to the workspace directory. /// config_path: Path to configuration file (optional). /// api_key: **Required**. LLM API key. /// model: **Required**. LLM model name. @@ -1516,9 +1427,8 @@ impl PyEngine { /// Raises: /// VectorlessError: If engine creation fails. 
#[new] - #[pyo3(signature = (workspace=None, config_path=None, api_key=None, model=None, endpoint=None))] + #[pyo3(signature = (config_path=None, api_key=None, model=None, endpoint=None))] fn new( - workspace: Option, config_path: Option, api_key: Option, model: Option, @@ -1537,9 +1447,6 @@ impl PyEngine { if let Some(path) = &config_path { builder = builder.with_config_path(path); } - if let Some(ws) = &workspace { - builder = builder.with_workspace(ws); - } if let Some(m) = &model { builder = builder.with_model(m); } @@ -1661,9 +1568,9 @@ impl PyEngine { /// ```python /// from vectorless import Engine, IndexContext, QueryContext /// -/// engine = Engine(workspace="./data", api_key="sk-...", model="gpt-4o") +/// engine = Engine(api_key="sk-...", model="gpt-4o") /// result = await engine.index(IndexContext.from_path("./report.pdf")) -/// answer = await engine.query(QueryContext("What is the revenue?").with_doc_id(result.doc_id)) +/// answer = await engine.query(QueryContext("What is the revenue?").with_doc_ids([result.doc_id])) /// print(answer.single().content) /// ``` #[pymodule] @@ -1671,7 +1578,6 @@ fn _vectorless(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/python/vectorless/__init__.py b/python/vectorless/__init__.py index c046ed90..a7f599ad 100644 --- a/python/vectorless/__init__.py +++ b/python/vectorless/__init__.py @@ -1,14 +1,15 @@ """ -Vectorless - Hierarchical document intelligence without vectors. +Vectorless - Reasoning-native document intelligence engine for AI. -A document intelligence engine that uses tree-based understanding -instead of vector databases for accurate, explainable retrieval. +An ultra-performant reasoning-native document intelligence engine +that transforms documents into rich semantic trees and uses LLMs to +intelligently traverse the hierarchy for accurate, explainable retrieval. 
Quick Start: from vectorless import Engine, IndexContext, QueryContext # Create engine - engine = Engine(workspace="./data", api_key="sk-...", model="gpt-4o") + engine = Engine(api_key="sk-...", model="gpt-4o") # Index a document ctx = IndexContext.from_path("./report.pdf") @@ -16,7 +17,7 @@ doc_id = result.doc_id # Query - answer = await engine.query(QueryContext("What is the revenue?").with_doc_id(doc_id)) + answer = await engine.query(QueryContext("What is the revenue?").with_doc_ids([doc_id])) print(answer.single().content) """ @@ -30,7 +31,6 @@ QueryContext, QueryResult, QueryResultItem, - StrategyPreference, DocumentInfo, DocumentGraph, DocumentGraphNode, @@ -52,7 +52,6 @@ "QueryContext", "QueryResult", "QueryResultItem", - "StrategyPreference", "DocumentInfo", "DocumentGraph", "DocumentGraphNode", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index e52d251f..d6984f6e 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -3,7 +3,7 @@ name = "vectorless" version.workspace = true edition.workspace = true authors.workspace = true -description = "Hierarchical, reasoning-native document intelligence engine" +description = "Reasoning-native document intelligence engine for AI" license.workspace = true repository.workspace = true homepage.workspace = true diff --git a/rust/examples/advanced.rs b/rust/examples/advanced.rs index 1316a68d..2df75f9b 100644 --- a/rust/examples/advanced.rs +++ b/rust/examples/advanced.rs @@ -55,7 +55,7 @@ async fn main() -> vectorless::Result<()> { // Query let result = client - .query(QueryContext::new("What features does Vectorless provide?").with_doc_id(&doc_id)) + .query(QueryContext::new("What features does Vectorless provide?").with_doc_ids(vec![doc_id.clone()])) .await?; println!("Query: What features does Vectorless provide?"); if let Some(item) = result.single() { diff --git a/rust/examples/events.rs b/rust/examples/events.rs index a0cefbb2..51398da8 100644 --- a/rust/examples/events.rs +++ b/rust/examples/events.rs @@ -108,7 
+108,6 @@ async fn main() -> Result<(), Box> { // 2. Create engine with events println!("Step 2: Creating engine with event emitter..."); let engine = EngineBuilder::new() - .with_workspace("./workspace_events_example") .with_key(&api_key) .with_model(&model) .with_endpoint(&endpoint) @@ -128,7 +127,7 @@ async fn main() -> Result<(), Box> { // 4. Query with events println!("Step 4: Querying (with events)..."); let result = engine - .query(QueryContext::new("What is vectorless?").with_doc_id(&doc_id)) + .query(QueryContext::new("What is vectorless?").with_doc_ids(vec![doc_id.clone()])) .await?; if let Some(item) = result.single() { println!(" ✓ Found result ({} chars)", item.content.len()); diff --git a/rust/examples/flow.rs b/rust/examples/flow.rs index 758ddbe3..57d92891 100644 --- a/rust/examples/flow.rs +++ b/rust/examples/flow.rs @@ -69,7 +69,6 @@ async fn main() -> vectorless::Result<()> { println!("Step 1: Creating Vectorless client..."); let engine = EngineBuilder::new() - .with_workspace("./workspace_flow_example") .with_key(&api_key) .with_model(&model) .with_endpoint(&endpoint) @@ -112,7 +111,7 @@ async fn main() -> vectorless::Result<()> { println!(" Query: \"{}\"", query); match engine - .query(QueryContext::new(query).with_doc_id(&doc_id)) + .query(QueryContext::new(query).with_doc_ids(vec![doc_id.clone()])) .await { Ok(result) => { diff --git a/rust/examples/graph.rs b/rust/examples/graph.rs index 940bf7ee..5fccd084 100644 --- a/rust/examples/graph.rs +++ b/rust/examples/graph.rs @@ -34,7 +34,6 @@ async fn main() -> vectorless::Result<()> { // 1. 
Create engine let engine = EngineBuilder::new() - .with_workspace("./workspace_graph_example") .with_key(&api_key) .with_model(&model) .build() diff --git a/rust/examples/index_directory.rs b/rust/examples/index_directory.rs index 289cb8a2..922c38a5 100644 --- a/rust/examples/index_directory.rs +++ b/rust/examples/index_directory.rs @@ -38,7 +38,6 @@ async fn main() -> vectorless::Result<()> { .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); let engine = EngineBuilder::new() - .with_workspace("./workspace_directory_example") .with_key(&api_key) .with_model(&model) .with_endpoint(&endpoint) @@ -47,13 +46,8 @@ async fn main() -> vectorless::Result<()> { .map_err(|e| vectorless::Error::Config(e.to_string()))?; // Index directory - let ctx = if recursive { - println!("Recursively indexing: {}", dir); - IndexContext::from_dir_recursive(dir) - } else { - println!("Indexing top-level files in: {}", dir); - IndexContext::from_dir(dir) - }; + println!("{}indexing: {}", if recursive { "Recursively " } else { "" }, dir); + let ctx = IndexContext::from_dir(dir, recursive); if ctx.is_empty() { println!("No supported files found in: {}", dir); diff --git a/rust/examples/index_incremental.rs b/rust/examples/index_incremental.rs index b85a01e9..6500a992 100644 --- a/rust/examples/index_incremental.rs +++ b/rust/examples/index_incremental.rs @@ -28,7 +28,6 @@ async fn main() -> vectorless::Result<()> { .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); let engine = EngineBuilder::new() - .with_workspace("./workspace_incremental_example") .with_key(&api_key) .with_model(&model) .with_endpoint(&endpoint) diff --git a/rust/examples/index_pdf.rs b/rust/examples/index_pdf.rs index d8d8b57c..0f9ae607 100644 --- a/rust/examples/index_pdf.rs +++ b/rust/examples/index_pdf.rs @@ -62,7 +62,6 @@ async fn main() -> vectorless::Result<()> { ); let engine = EngineBuilder::new() - .with_workspace("./workspace_pdf_example") .with_key(&api_key) .with_model(&model) 
.with_endpoint(&endpoint) diff --git a/rust/examples/index_single.rs b/rust/examples/index_single.rs index 623b4cb3..edaa2460 100644 --- a/rust/examples/index_single.rs +++ b/rust/examples/index_single.rs @@ -28,7 +28,6 @@ async fn main() -> vectorless::Result<()> { .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); let engine = EngineBuilder::new() - .with_workspace("./workspace_single_example") .with_key(&api_key) .with_model(&model) .with_endpoint(&endpoint) diff --git a/rust/examples/indexing.rs b/rust/examples/indexing.rs index ee77e5f2..fe78c254 100644 --- a/rust/examples/indexing.rs +++ b/rust/examples/indexing.rs @@ -28,7 +28,6 @@ async fn main() -> vectorless::Result<()> { .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); let engine = EngineBuilder::new() - .with_workspace("./workspace_batch_example") .with_key(&api_key) .with_model(&model) .with_endpoint(&endpoint) diff --git a/rust/src/client/builder.rs b/rust/src/client/builder.rs index d042d6aa..06519d8f 100644 --- a/rust/src/client/builder.rs +++ b/rust/src/client/builder.rs @@ -24,7 +24,6 @@ //! # #[tokio::main] //! # async fn main() -> Result<(), vectorless::BuildError> { //! let engine = EngineBuilder::new() -//! .with_workspace("./data") //! .with_key("sk-...") //! .with_model("gpt-4o") //! .build() @@ -41,7 +40,6 @@ //! # #[tokio::main] //! # async fn main() -> Result<(), vectorless::BuildError> { //! let engine = EngineBuilder::new() -//! .with_workspace("./data") //! .with_key("sk-...") //! .with_model("deepseek-chat") //! .with_endpoint("https://api.deepseek.com/v1") @@ -51,15 +49,13 @@ //! # } //! ``` -use std::path::PathBuf; - use crate::config::{Config, ConfigLoader, RetrievalConfig}; use crate::memo::MemoStore; use crate::retrieval::PipelineRetriever; use crate::storage::Workspace; use super::engine::Engine; -use super::events::EventEmitter; +use crate::events::EventEmitter; /// Builder for creating a [`Engine`] client. 
/// @@ -74,7 +70,6 @@ use super::events::EventEmitter; /// # #[tokio::main] /// # async fn main() -> Result<(), vectorless::BuildError> { /// let client = EngineBuilder::new() -/// .with_workspace("./my_workspace") /// .with_key("sk-...") /// .with_model("gpt-4o") /// .build() @@ -84,11 +79,8 @@ use super::events::EventEmitter; /// ``` #[derive(Debug)] pub struct EngineBuilder { - /// Workspace path. - workspace: Option, - /// Configuration file path. - config_path: Option, + config_path: Option, /// Custom configuration. config: Option, @@ -126,7 +118,6 @@ impl EngineBuilder { #[must_use] pub fn new() -> Self { Self { - workspace: None, config_path: None, config: None, retrieval_config: None, @@ -145,36 +136,11 @@ impl EngineBuilder { // Basic Configuration // ============================================================ - /// Set the workspace path for document persistence. - /// - /// The workspace stores indexed documents and metadata. - /// If not set, defaults to `./workspace` or the value in config. - /// - /// # Example - /// - /// ```rust,no_run - /// use vectorless::client::EngineBuilder; - /// - /// # #[tokio::main] - /// # async fn main() -> Result<(), vectorless::BuildError> { - /// let engine = EngineBuilder::new() - /// .with_workspace("./data") - /// .build() - /// .await?; - /// # Ok(()) - /// # } - /// ``` - #[must_use] - pub fn with_workspace(mut self, path: impl Into) -> Self { - self.workspace = Some(path.into()); - self - } - /// Set the configuration file path. /// /// The file must be a valid TOML configuration. No auto-detection is performed. 
#[must_use] - pub fn with_config_path(mut self, path: impl Into) -> Self { + pub fn with_config_path(mut self, path: impl Into) -> Self { self.config_path = Some(path.into()); self } @@ -222,7 +188,8 @@ impl EngineBuilder { /// .with_model("gpt-4o"); /// /// let engine = EngineBuilder::new() - /// .with_workspace("./data") + /// .with_key("sk-...") + /// .with_model("gpt-4o") /// .with_memo_store(memo_store) /// .build() /// .await?; @@ -249,7 +216,6 @@ impl EngineBuilder { /// # #[tokio::main] /// # async fn main() -> Result<(), vectorless::BuildError> { /// let engine = EngineBuilder::new() - /// .with_workspace("./data") /// .with_key("sk-...") /// .build() /// .await?; @@ -274,7 +240,6 @@ impl EngineBuilder { /// # #[tokio::main] /// # async fn main() -> Result<(), vectorless::BuildError> { /// let engine = EngineBuilder::new() - /// .with_workspace("./data") /// .with_model("gpt-4o-mini") /// .build() /// .await?; @@ -299,7 +264,6 @@ impl EngineBuilder { /// # #[tokio::main] /// # async fn main() -> Result<(), vectorless::BuildError> { /// let engine = EngineBuilder::new() - /// .with_workspace("./data") /// .with_model("deepseek-chat") /// .with_endpoint("https://api.deepseek.com/v1") /// .build() @@ -375,7 +339,6 @@ impl EngineBuilder { /// # #[tokio::main] /// # async fn main() -> Result<(), vectorless::BuildError> { /// let engine = EngineBuilder::new() - /// .with_workspace("./data") /// .with_key("sk-...") /// .with_model("gpt-4o") /// .build() @@ -401,24 +364,31 @@ impl EngineBuilder { config.retrieval = retrieval_config; } - // Apply individual overrides + // Apply individual overrides to LlmPoolConfig (primary) + legacy config (compat) if let Some(api_key) = self.api_key { - // Set API key for both retrieval and index + config.llm.api_key = Some(api_key.clone()); + // Legacy compat config.retrieval.api_key = Some(api_key.clone()); config.summary.api_key = Some(api_key); - // Also set LLM pool config - if config.llm.index.api_key.is_none() { - 
config.llm.index.api_key = config.summary.api_key.clone(); - } - if config.llm.retrieval.api_key.is_none() { - config.llm.retrieval.api_key = config.summary.api_key.clone(); - } } if let Some(model) = self.model { + // Apply model to pool slots + if config.llm.index.model.is_empty() { + config.llm.index.model = model.clone(); + } + if config.llm.retrieval.model.is_empty() { + config.llm.retrieval.model = model.clone(); + } + if config.llm.pilot.model.is_empty() { + config.llm.pilot.model = model.clone(); + } + // Legacy compat config.retrieval.model = model.clone(); config.summary.model = model; } if let Some(endpoint) = self.endpoint { + config.llm.endpoint = Some(endpoint.clone()); + // Legacy compat config.retrieval.endpoint = endpoint.clone(); config.summary.endpoint = endpoint; } @@ -435,55 +405,51 @@ impl EngineBuilder { } // Validate required settings - if config.summary.api_key.is_none() && config.retrieval.api_key.is_none() { + let resolved_key = config + .llm + .api_key + .as_ref() + .or_else(|| config.llm.retrieval.api_key.as_ref()) + .or_else(|| config.summary.api_key.as_ref()) + .or_else(|| config.retrieval.api_key.as_ref()); + if resolved_key.is_none() { return Err(BuildError::MissingApiKey); } - if config.retrieval.model.is_empty() { + let retrieval_model = if config.llm.retrieval.model.is_empty() { + &config.retrieval.model + } else { + &config.llm.retrieval.model + }; + if retrieval_model.is_empty() { return Err(BuildError::MissingModel); } - // Open workspace: prefer explicit path, fallback to config - let workspace_path = self - .workspace - .as_ref() - .unwrap_or(&config.storage.workspace_dir); - - let workspace = Workspace::new(workspace_path) + // Open workspace from config + let workspace = Workspace::new(&config.storage.workspace_dir) .await .map_err(|e| BuildError::Workspace(e.to_string()))?; - // Create indexer client with LLM-enabled factory if API key is available - let indexer = if let Some(api_key) = config.summary.api_key.clone() { - 
let llm_config = crate::llm::LlmConfig::new(&config.summary.model) - .with_endpoint(config.summary.endpoint.clone()) - .with_api_key(api_key) - .with_max_tokens(config.summary.max_tokens) - .with_temperature(config.summary.temperature); - - let llm_client = crate::llm::LlmClient::new(llm_config); - crate::client::indexer::IndexerClient::with_llm(llm_client) - } else { - crate::client::indexer::IndexerClient::new(crate::index::PipelineExecutor::new()) + // Build LlmPool from config.llm — centralizes all LLM client creation + let llm_configs: crate::llm::LlmConfigs = config.llm.clone().into(); + let pool = { + let controller = crate::throttle::ConcurrencyController::new( + crate::throttle::ConcurrencyConfig::new() + .with_max_concurrent_requests(config.concurrency.max_concurrent_requests) + .with_requests_per_minute(config.concurrency.requests_per_minute) + .with_enabled(config.concurrency.enabled), + ); + crate::llm::LlmPool::new(llm_configs).with_concurrency(controller) }; - // Create pipeline retriever with config + // Indexer uses pool.index() + let indexer = + crate::client::indexer::IndexerClient::with_llm(pool.index().clone()); + + // Retriever uses pool.retrieval() let retrieval_config = config.retrieval.clone(); let mut retriever = PipelineRetriever::new().with_max_iterations(retrieval_config.search.max_iterations); - - // Resolve API key: retrieval config first, then summary config - let retrieval_api_key = retrieval_config - .api_key - .clone() - .or_else(|| config.summary.api_key.clone()) - .ok_or(BuildError::MissingApiKey)?; - - let llm_config = crate::llm::LlmConfig::new(&retrieval_config.model) - .with_endpoint(retrieval_config.endpoint.clone()) - .with_api_key(retrieval_api_key) - .with_temperature(retrieval_config.temperature); - let llm_client = crate::llm::LlmClient::new(llm_config); - retriever = retriever.with_llm_client(llm_client); + retriever = retriever.with_llm_client(pool.retrieval().clone()); // Configure content aggregator if enabled if 
retrieval_config.content.enabled { @@ -497,7 +463,7 @@ impl EngineBuilder { } else { // Create default memo store with model from config let memo_store = MemoStore::new() - .with_model(&retrieval_config.model) + .with_model(retrieval_model) .with_version(1); retriever = retriever.with_memo_store(memo_store); } @@ -547,18 +513,10 @@ mod tests { #[test] fn test_builder_defaults() { let builder = EngineBuilder::new(); - assert!(builder.workspace.is_none()); assert!(!builder.fast_mode); assert!(!builder.precise_mode); } - #[test] - fn test_builder_with_workspace() { - let builder = EngineBuilder::new().with_workspace("./test_workspace"); - - assert_eq!(builder.workspace, Some(PathBuf::from("./test_workspace"))); - } - #[test] fn test_builder_with_key() { let builder = EngineBuilder::new().with_key("sk-test-key"); diff --git a/rust/src/client/engine.rs b/rust/src/client/engine.rs index 94cbcfb4..ebb302f4 100644 --- a/rust/src/client/engine.rs +++ b/rust/src/client/engine.rs @@ -17,7 +17,8 @@ //! # #[tokio::main] //! # async fn main() -> Result<(), Box> { //! let engine = EngineBuilder::new() -//! .with_workspace("./data") +//! .with_key("sk-...") +//! .with_model("gpt-4o") //! .build() //! .await?; //! @@ -27,7 +28,7 @@ //! //! // Query //! let result = engine.query( -//! QueryContext::new("What is this?").with_doc_id(doc_id) +//! QueryContext::new("What is this?").with_doc_ids(vec![doc_id.to_string()]) //! ).await?; //! //! println!("Found: {}", result.content); @@ -50,7 +51,7 @@ use crate::retrieval::{PipelineRetriever, RetrieveEventReceiver}; use crate::storage::{PersistedDocument, Workspace}; use crate::{DocumentTree, Error}; -use super::events::EventEmitter; +use crate::events::EventEmitter; use super::index_context::{IndexContext, IndexSource}; use super::indexer::IndexerClient; use super::query_context::{QueryContext, QueryScope}; @@ -84,6 +85,9 @@ pub struct Engine { /// Workspace client for persistence. 
workspace: Option, + /// Workspace root directory (for checkpoint path). + workspace_dir: Option, + /// Event emitter. events: EventEmitter, @@ -105,6 +109,7 @@ impl Engine { events: EventEmitter, ) -> Result { let config = Arc::new(config); + let workspace_dir = Some(std::path::PathBuf::from(&config.storage.workspace_dir)); // Attach event emitter to indexer let indexer = indexer.with_events(events.clone()); @@ -123,6 +128,7 @@ impl Engine { indexer, retriever, workspace: Some(workspace_client), + workspace_dir, events, metrics_hub: Arc::new(MetricsHub::with_defaults()), }) @@ -147,7 +153,8 @@ impl Engine { /// # #[tokio::main] /// # async fn main() -> Result<(), Box> { /// let engine = EngineBuilder::new() - /// .with_workspace("./data") + /// .with_key("sk-...") + /// .with_model("gpt-4o") /// .build() /// .await?; /// @@ -275,6 +282,12 @@ impl Engine { doc.description.clone(), doc.page_count, ) + .with_source_path( + doc.source_path + .as_ref() + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_default(), + ) .with_metrics_opt(metrics); let persisted = self .indexer @@ -326,6 +339,12 @@ impl Engine { doc.description.clone(), doc.page_count, ) + .with_source_path( + doc.source_path + .as_ref() + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_default(), + ) .with_metrics_opt(metrics); let persisted = self .indexer @@ -380,14 +399,15 @@ impl Engine { /// # #[tokio::main] /// # async fn main() -> Result<(), Box> { /// let engine = EngineBuilder::new() - /// .with_workspace("./data") + /// .with_key("sk-...") + /// .with_model("gpt-4o") /// .build() /// .await?; /// /// // Single document /// let result = engine.query( /// QueryContext::new("What is the total revenue?") - /// .with_doc_id("doc-123") + /// .with_doc_ids(vec!["doc-123".to_string()]) /// ).await?; /// /// if let Some(item) = result.single() { @@ -420,6 +440,7 @@ impl Engine { let mut items = Vec::with_capacity(doc_ids.len()); let mut failed = Vec::new(); + // TODO: if doc_ids.len() > 1, 
consider parallelizing queries across documents (with concurrency limit) for doc_id in doc_ids { let (tree, reasoning_index) = match self.get_structure(&doc_id).await { Ok((t, ri)) => (t, ri), @@ -467,13 +488,13 @@ impl Engine { /// Returns a [`RetrieveEventReceiver`] that yields [`RetrieveEvent`](crate::retrieval::RetrieveEvent)s /// as the retrieval pipeline progresses through each stage. /// - /// Only supports single-document scope (via `with_doc_id`). + /// Only supports single-document scope (via `with_doc_ids` with one ID). pub async fn query_stream(&self, ctx: QueryContext) -> Result { let doc_id = match &ctx.scope { - QueryScope::Single(id) => id.clone(), + QueryScope::Documents(ids) if ids.len() == 1 => ids[0].clone(), _ => { return Err(Error::Config( - "query_stream requires a single doc_id".to_string(), + "query_stream requires a single doc_id via with_doc_ids".to_string(), )); } }; @@ -581,8 +602,7 @@ impl Engine { /// Resolve QueryScope into a list of document IDs. async fn resolve_scope(&self, scope: &QueryScope) -> Result> { match scope { - QueryScope::Single(id) => Ok(vec![id.clone()]), - QueryScope::Multiple(ids) => Ok(ids.clone()), + QueryScope::Documents(ids) => Ok(ids.clone()), QueryScope::Workspace => { let docs = self.list().await?; if docs.is_empty() { @@ -600,6 +620,7 @@ impl Engine { format: crate::index::parse::DocumentFormat, ) -> PipelineOptions { use crate::index::SummaryStrategy; + let checkpoint_dir = self.workspace_dir.as_ref().map(|p| p.join("checkpoints")); PipelineOptions { mode: match format { crate::index::parse::DocumentFormat::Markdown => crate::index::IndexMode::Markdown, @@ -612,6 +633,7 @@ impl Engine { SummaryStrategy::none() }, generate_description: options.generate_description, + checkpoint_dir, ..Default::default() } } @@ -739,6 +761,7 @@ impl Clone for Engine { indexer: self.indexer.clone(), retriever: self.retriever.clone(), workspace: self.workspace.clone(), + workspace_dir: self.workspace_dir.clone(), events: 
self.events.clone(), metrics_hub: Arc::clone(&self.metrics_hub), } diff --git a/rust/src/client/events.rs b/rust/src/client/events.rs deleted file mode 100644 index 433498ee..00000000 --- a/rust/src/client/events.rs +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Re-export shim — event types and emitter live in the top-level -//! [`events`](crate::events) module. - -pub use crate::events::{Event, EventEmitter, IndexEvent, QueryEvent, WorkspaceEvent}; diff --git a/rust/src/client/index_context.rs b/rust/src/client/index_context.rs index 989252b8..1ee324f1 100644 --- a/rust/src/client/index_context.rs +++ b/rust/src/client/index_context.rs @@ -30,10 +30,10 @@ //! use vectorless::client::IndexContext; //! //! // Non-recursive (top-level only) -//! let ctx = IndexContext::from_dir("./documents"); +//! let ctx = IndexContext::from_dir("./documents", false); //! //! // Recursive (includes subdirectories) -//! let ctx = IndexContext::from_dir_recursive("./documents"); +//! let ctx = IndexContext::from_dir("./documents", true); //! ``` use std::path::PathBuf; @@ -65,23 +65,6 @@ pub(crate) enum IndexSource { }, } -impl IndexSource { - /// Check if this is a path source. - pub fn is_path(&self) -> bool { - matches!(self, IndexSource::Path(_)) - } - - /// Check if this is a content source. - pub fn is_content(&self) -> bool { - matches!(self, IndexSource::Content { .. }) - } - - /// Check if this is a bytes source. - pub fn is_bytes(&self) -> bool { - matches!(self, IndexSource::Bytes { .. 
}) - } -} - // ============================================================ // Index Context // ============================================================ @@ -110,7 +93,7 @@ impl IndexSource { /// ).await?; /// /// // Entire directory -/// let result = engine.index(IndexContext::from_dir("./docs")).await?; +/// let result = engine.index(IndexContext::from_dir("./docs", false)).await?; /// # Ok(()) /// # } /// ``` @@ -152,18 +135,12 @@ impl IndexContext { /// Create from a directory path. /// - /// Indexes all supported files in the directory (non-recursive). + /// Indexes all supported files in the directory. /// Supported extensions: `.md`, `.pdf`. - pub fn from_dir(dir: impl Into) -> Self { - Self::scan_dir(dir, false) - } - - /// Create from a directory path with recursive scanning. /// - /// Recursively indexes all supported files in the directory and its - /// subdirectories. Supported extensions: `.md`, `.pdf`. - pub fn from_dir_recursive(dir: impl Into) -> Self { - Self::scan_dir(dir, true) + /// Set `recursive` to `true` to include subdirectories. + pub fn from_dir(dir: impl Into, recursive: bool) -> Self { + Self::scan_dir(dir, recursive) } /// Internal: scan a directory for supported document files. 
@@ -171,6 +148,10 @@ impl IndexContext { let dir = dir.into(); let supported_extensions = ["md", "pdf"]; + if !dir.exists() { + tracing::warn!("Directory not found: {}", dir.display()); + } + let mut sources = Vec::new(); Self::collect_files(&dir, &supported_extensions, recursive, &mut sources); @@ -353,7 +334,7 @@ mod tests { } #[test] - fn test_from_dir_recursive() { + fn test_from_dir_with_recursive() { // Create a temp directory structure: // tmp/ // a.md @@ -370,11 +351,11 @@ mod tests { std::fs::write(tmp.join("sub/deep/ignore.dat"), b"xxx").unwrap(); // Non-recursive: only top-level - let ctx = IndexContext::from_dir(&tmp); + let ctx = IndexContext::from_dir(&tmp, false); assert_eq!(ctx.len(), 1); // only a.md // Recursive: all levels - let ctx = IndexContext::from_dir_recursive(&tmp); + let ctx = IndexContext::from_dir(&tmp, true); assert_eq!(ctx.len(), 3); // a.md, b.md, c.pdf let _ = std::fs::remove_dir_all(&tmp); diff --git a/rust/src/client/indexer.rs b/rust/src/client/indexer.rs index 4d9dd6f0..693000b3 100644 --- a/rust/src/client/indexer.rs +++ b/rust/src/client/indexer.rs @@ -34,7 +34,7 @@ use crate::index::{ use crate::llm::LlmClient; use crate::storage::{DocumentMeta, PersistedDocument}; -use super::events::{EventEmitter, IndexEvent}; +use crate::events::{EventEmitter, IndexEvent}; use super::index_context::IndexSource; use super::types::{IndexOptions, IndexedDocument}; @@ -49,51 +49,15 @@ pub(crate) struct IndexerClient { /// Event emitter. events: EventEmitter, - - /// Configuration. - config: IndexerConfig, -} - -/// Indexer configuration. -#[derive(Debug, Clone)] -pub struct IndexerConfig { - /// Minimum content tokens required to generate a summary. - pub min_summary_tokens: usize, - - /// Whether to generate IDs by default. - pub generate_ids: bool, - - /// Whether to generate descriptions by default. 
- pub generate_descriptions: bool, -} - -impl Default for IndexerConfig { - fn default() -> Self { - Self { - min_summary_tokens: 20, - generate_ids: true, - generate_descriptions: false, - } - } } impl IndexerClient { - /// Create a new indexer client with a default pipeline executor. - pub fn new(_executor: PipelineExecutor) -> Self { - Self { - executor_factory: Arc::new(PipelineExecutor::new), - events: EventEmitter::new(), - config: IndexerConfig::default(), - } - } - /// Create with an LLM-enabled pipeline. pub fn with_llm(client: LlmClient) -> Self { let client = Arc::new(client); Self { executor_factory: Arc::new(move || PipelineExecutor::with_llm((*client).clone())), events: EventEmitter::new(), - config: IndexerConfig::default(), } } @@ -103,25 +67,6 @@ impl IndexerClient { self } - /// Create with configuration. - pub fn with_config(mut self, config: IndexerConfig) -> Self { - self.config = config; - self - } - - /// Create from an executor factory function. - pub(crate) fn from_factory( - factory: Arc PipelineExecutor + Send + Sync>, - events: EventEmitter, - config: IndexerConfig, - ) -> Self { - Self { - executor_factory: factory, - events, - config, - } - } - /// Index a document from an index context. 
pub async fn index( &self, @@ -166,8 +111,15 @@ impl IndexerClient { ) -> Result { let path = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); - if !path.exists() { - return Err(Error::Parse(format!("File not found: {}", path.display()))); + // Validate file before indexing + let validation = crate::utils::validate_file(&path)?; + if !validation.valid { + return Err(Error::Parse( + validation.errors.first().cloned().unwrap_or_else(|| "Invalid file".to_string()), + )); + } + for warning in &validation.warnings { + tracing::warn!("{}", warning); } // Emit start event @@ -206,6 +158,14 @@ impl IndexerClient { options: &IndexOptions, existing_tree: Option<&crate::DocumentTree>, ) -> Result { + // Validate content before indexing + let validation = crate::utils::validate_content(content, format); + if !validation.valid { + return Err(Error::Parse( + validation.errors.first().cloned().unwrap_or_else(|| "Invalid content".to_string()), + )); + } + self.events.emit_index(IndexEvent::Started { path: name.unwrap_or("content").to_string(), }); @@ -235,6 +195,14 @@ impl IndexerClient { options: &IndexOptions, existing_tree: Option<&crate::DocumentTree>, ) -> Result { + // Validate bytes before indexing + let validation = crate::utils::validate_bytes(bytes, format); + if !validation.valid { + return Err(Error::Parse( + validation.errors.first().cloned().unwrap_or_else(|| "Invalid bytes".to_string()), + )); + } + self.events.emit_index(IndexEvent::Started { path: name.unwrap_or("bytes").to_string(), }); @@ -259,15 +227,6 @@ impl IndexerClient { self.build_indexed_document(doc_id, result, format, name, None) } - /// Build pipeline options from client options. - fn build_pipeline_options( - &self, - options: &IndexOptions, - format: DocumentFormat, - ) -> PipelineOptions { - self.build_pipeline_options_with_existing(options, format, None) - } - /// Build pipeline options with optional existing tree for incremental updates. 
fn build_pipeline_options_with_existing( &self, @@ -352,63 +311,6 @@ impl IndexerClient { .ok_or_else(|| Error::Parse(format!("Unsupported format: {}", ext))) } - /// Validate a document before indexing. - /// - /// # Errors - /// - /// Returns an error if the file doesn't exist or is not readable. - pub fn validate(&self, path: impl AsRef) -> Result { - let path = path.as_ref(); - - if !path.exists() { - return Ok(ValidationResult { - valid: false, - errors: vec![format!("File not found: {}", path.display())], - warnings: vec![], - format: None, - estimated_size: 0, - }); - } - - let metadata = std::fs::metadata(path) - .map_err(|e| Error::Parse(format!("Cannot read file metadata: {}", e)))?; - - let estimated_size = metadata.len() as usize; - let mut warnings = Vec::new(); - - // Check file size - if estimated_size > 100 * 1024 * 1024 { - warnings.push("Large file (>100MB) may take longer to index".to_string()); - } - - // Detect format - let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); - let format = DocumentFormat::from_extension(ext); - - if format.is_none() { - return Ok(ValidationResult { - valid: false, - errors: vec![format!("Unsupported format: {}", ext)], - warnings, - format: None, - estimated_size, - }); - } - - Ok(ValidationResult { - valid: true, - errors: vec![], - warnings, - format, - estimated_size, - }) - } - - /// Convert IndexedDocument to PersistedDocument for storage. - pub fn to_persisted(&self, doc: IndexedDocument) -> PersistedDocument { - self.to_persisted_with_options(doc, &PipelineOptions::default()) - } - /// Convert IndexedDocument to PersistedDocument, storing fingerprints from pipeline options. pub fn to_persisted_with_options( &self, @@ -466,48 +368,6 @@ impl Clone for IndexerClient { Self { executor_factory: Arc::clone(&self.executor_factory), events: self.events.clone(), - config: self.config.clone(), } } } - -/// Document validation result. 
-#[derive(Debug, Clone)] -pub(crate) struct ValidationResult { - /// Whether the document is valid for indexing. - pub valid: bool, - - /// Validation errors (prevents indexing). - pub errors: Vec, - - /// Validation warnings (non-blocking). - pub warnings: Vec, - - /// Detected document format. - pub format: Option, - - /// Estimated file size in bytes. - pub estimated_size: usize, -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_indexer_client_creation() { - let executor = PipelineExecutor::new(); - let client = IndexerClient::new(executor); - assert_eq!(client.config.min_summary_tokens, 20); - } - - #[test] - fn test_validate_missing_file() { - let executor = PipelineExecutor::new(); - let client = IndexerClient::new(executor); - - let result = client.validate("./nonexistent.md").unwrap(); - assert!(!result.valid); - assert!(!result.errors.is_empty()); - } -} diff --git a/rust/src/client/mod.rs b/rust/src/client/mod.rs index 286e0511..ce00ff34 100644 --- a/rust/src/client/mod.rs +++ b/rust/src/client/mod.rs @@ -18,7 +18,8 @@ //! # async fn main() -> Result<(), Box> { //! // Create a client with default settings //! let client = EngineBuilder::new() -//! .with_workspace("./my_workspace") +//! .with_key("sk-...") +//! .with_model("gpt-4o") //! .build() //! .await?; //! @@ -28,7 +29,7 @@ //! //! // Query the document //! let result = client.query( -//! QueryContext::new("What is this?").with_doc_id(doc_id) +//! QueryContext::new("What is this?").with_doc_ids(vec![doc_id.to_string()]) //! ).await?; //! if let Some(item) = result.single() { //! 
println!("{}", item.content); @@ -66,7 +67,6 @@ mod builder; mod engine; -pub mod events; mod index_context; mod indexer; mod query_context; @@ -88,12 +88,6 @@ pub use engine::Engine; pub use index_context::IndexContext; pub use query_context::QueryContext; -// ============================================================ -// Events -// ============================================================ - -pub use events::EventEmitter; - // ============================================================ // Result & Info Types // ============================================================ diff --git a/rust/src/client/query_context.rs b/rust/src/client/query_context.rs index 991acf4d..bb32d05a 100644 --- a/rust/src/client/query_context.rs +++ b/rust/src/client/query_context.rs @@ -4,20 +4,16 @@ //! Query context for the Engine API. //! //! [`QueryContext`] encapsulates all parameters for a query operation, -//! supporting single document, multiple documents, or entire workspace queries. +//! supporting specific documents or entire workspace queries. //! //! # Example //! //! ```rust //! use vectorless::client::QueryContext; //! -//! // Query a single document +//! // Query specific documents //! let ctx = QueryContext::new("What is the total revenue?") -//! .with_doc_id("doc-abc123"); -//! -//! // Query multiple documents -//! let ctx = QueryContext::new("What is the architecture?") -//! .with_doc_ids(vec!["doc-1", "doc-2"]); +//! .with_doc_ids(vec!["doc-1".to_string()]); //! //! // Query entire workspace //! let ctx = QueryContext::new("Explain the algorithm"); @@ -29,19 +25,16 @@ use crate::retrieval::{RetrieveOptions, StrategyPreference}; /// Query scope — determines which documents to search. #[derive(Debug, Clone)] pub(crate) enum QueryScope { - /// Query a single document. - Single(String), - /// Query multiple specific documents. - Multiple(Vec), + /// Query specific documents. + Documents(Vec), /// Query all documents in the workspace. 
Workspace, } /// Context for a query operation. /// -/// Supports three scopes: -/// - **Single document** — via `with_doc_id()` -/// - **Multiple documents** — via `with_doc_ids()` +/// Supports two scopes: +/// - **Specific documents** — via `with_doc_ids()` /// - **Entire workspace** — default when no scope is set /// /// # Convenience @@ -82,15 +75,12 @@ impl QueryContext { } } - /// Set scope to a single document. - pub fn with_doc_id(mut self, doc_id: impl Into) -> Self { - self.scope = QueryScope::Single(doc_id.into()); - self - } - - /// Set scope to multiple documents. + /// Set scope to specific documents. + /// + /// Pass a single ID or multiple IDs to restrict the query + /// to those documents only. pub fn with_doc_ids(mut self, doc_ids: Vec) -> Self { - self.scope = QueryScope::Multiple(doc_ids); + self.scope = QueryScope::Documents(doc_ids); self } @@ -180,14 +170,14 @@ mod tests { #[test] fn test_single_doc_scope() { - let ctx = QueryContext::new("test").with_doc_id("doc-1"); - assert!(matches!(ctx.scope, QueryScope::Single(ref id) if id == "doc-1")); + let ctx = QueryContext::new("test").with_doc_ids(vec!["doc-1".to_string()]); + assert!(matches!(ctx.scope, QueryScope::Documents(ref ids) if ids == &["doc-1".to_string()])); } #[test] fn test_multi_doc_scope() { let ctx = QueryContext::new("test").with_doc_ids(vec!["a".into(), "b".into()]); - assert!(matches!(ctx.scope, QueryScope::Multiple(ref ids) if ids.len() == 2)); + assert!(matches!(ctx.scope, QueryScope::Documents(ref ids) if ids.len() == 2)); } #[test] @@ -199,7 +189,7 @@ mod tests { #[test] fn test_builder_options() { let ctx = QueryContext::new("test") - .with_doc_id("doc-1") + .with_doc_ids(vec!["doc-1".to_string()]) .with_max_tokens(4000) .with_include_reasoning(false) .with_depth_limit(5); diff --git a/rust/src/client/retriever.rs b/rust/src/client/retriever.rs index 29c0e0d4..f1b38a4d 100644 --- a/rust/src/client/retriever.rs +++ b/rust/src/client/retriever.rs @@ -21,14 +21,13 @@ use 
std::sync::Arc; use tracing::info; -use super::events::{EventEmitter, QueryEvent}; +use crate::events::{EventEmitter, QueryEvent}; use super::types::QueryResultItem; use crate::config::Config; -use crate::document::{DocumentTree, NodeId, ReasoningIndex}; +use crate::document::{DocumentTree, ReasoningIndex}; use crate::error::{Error, Result}; -use crate::retrieval::content::ContentAggregatorConfig; use crate::retrieval::stream::RetrieveEventReceiver; -use crate::retrieval::{RetrievalResult, RetrieveOptions, RetrieveResponse}; +use crate::retrieval::{RetrieveOptions, RetrieveResponse}; /// Document retrieval client. /// @@ -47,33 +46,6 @@ pub(crate) struct RetrieverClient { default_options: RetrieveOptions, } -/// Retriever configuration. -#[derive(Debug, Clone)] -pub(crate) struct RetrieverClientConfig { - /// Default top_k for retrieval. - pub default_top_k: usize, - - /// Default token budget. - pub default_token_budget: usize, - - /// Content aggregator config. - pub content_config: Option, - - /// Enable result caching. - pub enable_cache: bool, -} - -impl Default for RetrieverClientConfig { - fn default() -> Self { - Self { - default_top_k: 5, - default_token_budget: 4000, - content_config: None, - enable_cache: true, - } - } -} - impl RetrieverClient { /// Create a new retriever client. pub fn new(retriever: crate::retrieval::PipelineRetriever, config: Arc) -> Self { @@ -91,44 +63,6 @@ impl RetrieverClient { self } - /// Create with configuration. - pub fn with_config(mut self, config: RetrieverClientConfig) -> Self { - self.default_options = RetrieveOptions::new() - .with_top_k(config.default_top_k) - .with_max_tokens(config.default_token_budget) - .with_enable_cache(config.enable_cache); - self - } - - /// Create from existing retriever Arc. 
- pub(crate) fn from_arc( - retriever: Arc, - config: Arc, - events: EventEmitter, - ) -> Self { - Self { - retriever, - config, - events, - default_options: RetrieveOptions::default(), - } - } - - /// Query a document tree. - /// - /// # Errors - /// - /// Returns an error if the retrieval pipeline fails. - pub async fn query( - &self, - tree: &DocumentTree, - question: &str, - options: &RetrieveOptions, - ) -> Result { - self.query_with_reasoning_index(tree, question, options, None) - .await - } - /// Query a document tree with optional reasoning index for fast-path lookup. /// /// # Errors @@ -269,165 +203,6 @@ impl RetrieverClient { score: response.confidence, } } - - /// Get similar nodes to a given node. - /// - /// Uses tree structure and content to find similar nodes. - pub fn find_similar( - &self, - tree: &DocumentTree, - node_id: NodeId, - top_k: usize, - ) -> Result> { - let mut results = Vec::new(); - - // Get the target node's content for comparison - let target_content = tree - .get(node_id) - .map(|n| n.content.clone()) - .unwrap_or_default(); - - if target_content.is_empty() { - return Ok(results); - } - - // Extract keywords from target content - let target_keywords = self.extract_keywords(&target_content); - - // Search all nodes for similarity - let root = tree.root(); - let mut stack = vec![root]; - - while let Some(current_id) = stack.pop() { - if current_id == node_id { - // Skip the target node itself - stack.extend(tree.children(current_id)); - continue; - } - - if let Some(node) = tree.get(current_id) { - let node_keywords = self.extract_keywords(&node.content); - let similarity = self.calculate_similarity(&target_keywords, &node_keywords); - - if similarity > 0.3 { - results.push( - RetrievalResult::new(&node.title) - .with_node_id(format!("{:?}", current_id)) - .with_content(node.content.clone()) - .with_score(similarity) - .with_depth(tree.depth(current_id)), - ); - } - } - - stack.extend(tree.children(current_id)); - } - - // Sort by 
score and take top_k - results.sort_by(|a, b| { - b.score - .partial_cmp(&a.score) - .unwrap_or(std::cmp::Ordering::Equal) - }); - results.truncate(top_k); - - Ok(results) - } - - /// Extract keywords from content. - fn extract_keywords(&self, content: &str) -> Vec { - content - .to_lowercase() - .split_whitespace() - .filter(|w| w.len() > 3) - .take(20) - .map(|s| s.to_string()) - .collect() - } - - /// Calculate similarity between keyword sets. - fn calculate_similarity(&self, set1: &[String], set2: &[String]) -> f32 { - if set1.is_empty() || set2.is_empty() { - return 0.0; - } - - let set1_set: std::collections::HashSet<_> = set1.iter().collect(); - let set2_set: std::collections::HashSet<_> = set2.iter().collect(); - - let intersection = set1_set.intersection(&set2_set).count(); - let union = set1_set.union(&set2_set).count(); - - intersection as f32 / union as f32 - } - - /// Get node context (ancestors and siblings). - /// - /// Returns the node's ancestors up to the specified depth, - /// along with sibling nodes at each level. 
- pub fn get_node_context( - &self, - tree: &DocumentTree, - node_id: NodeId, - ancestor_depth: usize, - ) -> Result { - let mut ancestors = Vec::new(); - let mut siblings = Vec::new(); - - // Get ancestors - let mut current_id = Some(node_id); - let mut depth = 0; - - while let Some(id) = current_id { - if depth >= ancestor_depth { - break; - } - - if let Some(node) = tree.get(id) { - ancestors.push( - RetrievalResult::new(&node.title) - .with_node_id(format!("{:?}", id)) - .with_depth(tree.depth(id)), - ); - - // Get siblings at this level - if let Some(parent_id) = tree.parent(id) { - for child_id in tree.children(parent_id) { - if child_id != id { - if let Some(sibling) = tree.get(child_id) { - siblings.push( - RetrievalResult::new(&sibling.title) - .with_node_id(format!("{:?}", child_id)) - .with_depth(tree.depth(child_id)), - ); - } - } - } - } - } - - current_id = tree.parent(id); - depth += 1; - } - - // Get the target node - let target = tree.get(node_id).map(|n| { - RetrievalResult::new(&n.title) - .with_node_id(format!("{:?}", node_id)) - .with_content(n.content.clone()) - .with_depth(tree.depth(node_id)) - }); - - Ok(NodeContext { - target, - ancestors, - siblings, - }) - } - - /// Get the underlying retriever Arc. - pub(crate) fn inner(&self) -> Arc { - Arc::clone(&self.retriever) - } } impl Clone for RetrieverClient { @@ -441,19 +216,6 @@ impl Clone for RetrieverClient { } } -/// Node context information. -#[derive(Debug, Clone)] -pub(crate) struct NodeContext { - /// The target node. - pub target: Option, - - /// Ancestor nodes (ordered from parent to root). - pub ancestors: Vec, - - /// Sibling nodes at each ancestor level. - pub siblings: Vec, -} - #[cfg(test)] mod tests { use super::*; diff --git a/rust/src/client/types.rs b/rust/src/client/types.rs index 4ab82590..5c638846 100644 --- a/rust/src/client/types.rs +++ b/rust/src/client/types.rs @@ -37,9 +37,6 @@ pub struct IndexedDocument { /// Page count (for PDFs). 
pub page_count: Option, - /// Line count (for text files). - pub line_count: Option, - /// The document tree structure. pub tree: Option, @@ -63,7 +60,6 @@ impl IndexedDocument { description: None, source_path: None, page_count: None, - line_count: None, tree: None, pages: Vec::new(), metrics: None, @@ -95,12 +91,6 @@ impl IndexedDocument { self } - /// Set the line count. - pub fn with_line_count(mut self, count: usize) -> Self { - self.line_count = Some(count); - self - } - /// Set the document tree. pub fn with_tree(mut self, tree: DocumentTree) -> Self { self.tree = Some(tree); @@ -112,19 +102,6 @@ impl IndexedDocument { self.metrics = Some(metrics); self } - - /// Add a page content. - pub fn add_page(&mut self, page: usize, content: impl Into) { - self.pages.push(PageContent { - page, - content: content.into(), - }); - } - - /// Check if the tree is loaded. - pub fn is_loaded(&self) -> bool { - self.tree.is_some() - } } /// Content for a single page. @@ -326,6 +303,8 @@ pub struct IndexItem { pub format: DocumentFormat, /// Document description (from root summary). pub description: Option, + /// Source file path (if indexed from a file). + pub source_path: Option, /// Page count (for PDFs). pub page_count: Option, /// Indexing pipeline metrics (timing, LLM usage, node stats). @@ -346,11 +325,18 @@ impl IndexItem { name: name.into(), format, description, + source_path: None, page_count, metrics: None, } } + /// Set the source file path. + pub fn with_source_path(mut self, path: impl Into) -> Self { + self.source_path = Some(path.into()); + self + } + /// Set the indexing metrics. pub fn with_metrics(mut self, metrics: IndexMetrics) -> Self { self.metrics = Some(metrics); @@ -466,6 +452,9 @@ pub struct DocumentInfo { /// Document description. pub description: Option, + /// Source file path. + pub source_path: Option, + /// Page count (for PDFs). 
pub page_count: Option, @@ -481,6 +470,7 @@ impl DocumentInfo { name: name.into(), format: String::new(), description: None, + source_path: None, page_count: None, line_count: None, } diff --git a/rust/src/client/workspace.rs b/rust/src/client/workspace.rs index 061533ed..7a27d1d6 100644 --- a/rust/src/client/workspace.rs +++ b/rust/src/client/workspace.rs @@ -30,7 +30,7 @@ use tracing::{debug, info}; use crate::error::Result; use crate::storage::{PersistedDocument, Workspace}; -use super::events::{EventEmitter, WorkspaceEvent}; +use crate::events::{EventEmitter, WorkspaceEvent}; use super::types::DocumentInfo; /// Workspace management client. @@ -49,28 +49,6 @@ pub(crate) struct WorkspaceClient { /// Event emitter. events: EventEmitter, - - /// Configuration. - config: WorkspaceClientConfig, -} - -/// Workspace client configuration. -#[derive(Debug, Clone)] -pub(crate) struct WorkspaceClientConfig { - /// Auto-save interval in seconds (None = disabled). - pub auto_save_interval: Option, - - /// Enable verbose logging. - pub verbose: bool, -} - -impl Default for WorkspaceClientConfig { - fn default() -> Self { - Self { - auto_save_interval: None, - verbose: false, - } - } } impl WorkspaceClient { @@ -79,7 +57,6 @@ impl WorkspaceClient { Self { workspace: Arc::new(workspace), events: EventEmitter::new(), - config: WorkspaceClientConfig::default(), } } @@ -89,21 +66,6 @@ impl WorkspaceClient { self } - /// Create with configuration. - pub fn with_config(mut self, config: WorkspaceClientConfig) -> Self { - self.config = config; - self - } - - /// Create from an existing workspace Arc. - pub(crate) fn from_arc(workspace: Arc, events: EventEmitter) -> Self { - Self { - workspace, - events, - config: WorkspaceClientConfig::default(), - } - } - /// Save a document to the workspace. 
/// /// # Errors @@ -192,6 +154,7 @@ impl WorkspaceClient { name: meta.doc_name, format: meta.doc_type, description: meta.doc_description, + source_path: meta.path, page_count: meta.page_count, line_count: meta.line_count, }); @@ -216,37 +179,12 @@ impl WorkspaceClient { name: meta.doc_name, format: meta.doc_type, description: meta.doc_description, + source_path: meta.path, page_count: meta.page_count, line_count: meta.line_count, })) } - /// Remove multiple documents from the workspace. - /// - /// Returns the number of documents successfully removed. - /// - /// # Errors - /// - /// Returns an error if the workspace write fails. - pub async fn batch_remove(&self, doc_ids: &[&str]) -> Result { - let mut removed = 0; - - for doc_id in doc_ids { - if self.workspace.remove(doc_id).await? { - removed += 1; - self.events.emit_workspace(WorkspaceEvent::Removed { - doc_id: doc_id.to_string(), - }); - } - } - - if removed > 0 { - info!("Batch removed {} documents", removed); - } - - Ok(removed) - } - /// Clear all documents from the workspace. /// /// Returns the number of documents removed. @@ -271,23 +209,6 @@ impl WorkspaceClient { Ok(count) } - /// Get workspace statistics. - pub async fn stats(&self) -> Result { - Ok(WorkspaceStats { - document_count: self.workspace.len().await, - }) - } - - /// Get the number of documents in the workspace. - pub async fn len(&self) -> usize { - self.workspace.len().await - } - - /// Check if the workspace is empty. - pub async fn is_empty(&self) -> bool { - self.workspace.is_empty().await - } - /// Get the underlying workspace Arc (for advanced use). pub(crate) fn inner(&self) -> Arc { Arc::clone(&self.workspace) @@ -309,11 +230,4 @@ impl WorkspaceClient { pub async fn set_graph(&self, graph: &crate::graph::DocumentGraph) -> Result<()> { self.workspace.set_graph(graph).await } -} - -/// Workspace statistics. -#[derive(Debug, Clone)] -pub(crate) struct WorkspaceStats { - /// Number of documents in the workspace. 
- pub document_count: usize, -} +} \ No newline at end of file diff --git a/rust/src/config/merge.rs b/rust/src/config/merge.rs index c6d995a0..7e524aad 100644 --- a/rust/src/config/merge.rs +++ b/rust/src/config/merge.rs @@ -217,9 +217,7 @@ impl Merge for ContentAggregatorConfig { impl Merge for StorageConfig { fn merge(&mut self, other: &Self, strategy: MergeStrategy) { - if strategy == MergeStrategy::Replace - || self.workspace_dir == std::path::PathBuf::from("./workspace") - { + if strategy == MergeStrategy::Replace { self.workspace_dir = other.workspace_dir.clone(); } } diff --git a/rust/src/config/mod.rs b/rust/src/config/mod.rs index af96c518..f2bedd85 100644 --- a/rust/src/config/mod.rs +++ b/rust/src/config/mod.rs @@ -14,6 +14,6 @@ mod validator; pub(crate) use loader::ConfigLoader; pub(crate) use types::{ CacheConfig, CompressionAlgorithm, ConcurrencyConfig, Config, FallbackBehavior, FallbackConfig, - IndexerConfig, LlmConfig, LlmMetricsConfig, MetricsConfig, OnAllFailedBehavior, + IndexerConfig, LlmClientConfig, LlmConfig, LlmMetricsConfig, LlmPoolConfig, MetricsConfig, OnAllFailedBehavior, PilotMetricsConfig, RetrievalConfig, RetrievalMetricsConfig, SufficiencyConfig, SummaryConfig, }; diff --git a/rust/src/config/types/mod.rs b/rust/src/config/types/mod.rs index 32634a60..8ca3b434 100644 --- a/rust/src/config/types/mod.rs +++ b/rust/src/config/types/mod.rs @@ -23,7 +23,7 @@ pub(crate) use content::ContentAggregatorConfig; pub(crate) use fallback::{FallbackBehavior, FallbackConfig, OnAllFailedBehavior}; pub(crate) use indexer::IndexerConfig; pub(crate) use llm::{LlmConfig, SummaryConfig}; -pub(crate) use llm_pool::LlmPoolConfig; +pub(crate) use llm_pool::{LlmClientConfig, LlmPoolConfig}; pub(crate) use metrics::{ LlmMetricsConfig, MetricsConfig, PilotMetricsConfig, RetrievalMetricsConfig, }; diff --git a/rust/src/config/types/storage.rs b/rust/src/config/types/storage.rs index b50e86e6..00b9b7ea 100644 --- a/rust/src/config/types/storage.rs +++ 
b/rust/src/config/types/storage.rs @@ -36,7 +36,50 @@ pub struct StorageConfig { } fn default_workspace_dir() -> PathBuf { - PathBuf::from("./workspace") + default_workspace_path_for_cwd() +} + +/// Compute the default workspace path for the current working directory. +/// +/// Returns a platform-appropriate path: +/// - **Linux/macOS**: `~/.vectorless/workspaces/{cwd_hash}/` +/// - **Windows**: `%LOCALAPPDATA%\.vectorless\workspaces\{cwd_hash}\` (falling back to `%APPDATA%`) +/// +/// where `cwd_hash` is a hex hash (zero-padded to at least 12 characters) derived from the current working +/// directory. This ensures different projects automatically get isolated +/// workspaces. +/// +/// # Environment variable resolution order +/// +/// | Platform | Primary | Fallback | Last resort | +/// |----------|-----------------|---------------------|-------------| +/// | Unix | `$HOME` | — | `"."` | +/// | Windows | `%LOCALAPPDATA%`| `%APPDATA%` | `"."` | +pub fn default_workspace_path_for_cwd() -> PathBuf { + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let base_dir = if cfg!(windows) { + // Windows: prefer %LOCALAPPDATA% (e.g. C:\Users\xxx\AppData\Local) + // then %APPDATA% (e.g.
C:\Users\xxx\AppData\Roaming) + std::env::var("LOCALAPPDATA") + .or_else(|_| std::env::var("APPDATA")) + .map(PathBuf::from) + .unwrap_or_else(|_| PathBuf::from(".")) + } else { + // Unix (Linux, macOS): use $HOME + std::env::var("HOME") + .map(PathBuf::from) + .unwrap_or_else(|_| PathBuf::from(".")) + }; + + let cwd = std::env::current_dir().unwrap_or_else(|_| PathBuf::from(".")); + + let mut hasher = DefaultHasher::new(); + cwd.to_string_lossy().hash(&mut hasher); + let hash = format!("{:012x}", hasher.finish()); + + base_dir.join(".vectorless").join("workspaces").join(hash) } fn default_cache_size() -> usize { @@ -580,7 +623,22 @@ mod tests { #[test] fn test_storage_config_defaults() { let config = StorageConfig::default(); - assert_eq!(config.workspace_dir, PathBuf::from("./workspace")); + // Default workspace should be under .vectorless/workspaces/ (Unix) + // or vectorless/workspaces/ (Windows via AppData) + let path_str = config.workspace_dir.to_string_lossy(); + if cfg!(windows) { + assert!( + path_str.contains("vectorless"), + "expected ...\\vectorless\\workspaces\\..., got {:?}", + config.workspace_dir, + ); + } else { + assert!( + path_str.contains(".vectorless"), + "expected ~/.vectorless/workspaces/..., got {:?}", + config.workspace_dir, + ); + } assert_eq!(config.cache_size, 100); assert!(config.atomic_writes); assert!(config.file_lock); diff --git a/rust/src/document/tree.rs b/rust/src/document/tree.rs index e0ca6a59..a23b8c0e 100644 --- a/rust/src/document/tree.rs +++ b/rust/src/document/tree.rs @@ -820,31 +820,6 @@ mod tests { assert!(children.contains(&child2)); } - #[test] - fn test_children_with_refs_includes_resolved_references() { - let mut tree = DocumentTree::new("Root", "root content"); - let section1 = tree.add_child(tree.root(), "Section 1", "content 1"); - let section2 = tree.add_child(tree.root(), "Section 2", "content 2"); - let appendix = tree.add_child(tree.root(), "Appendix A", "appendix content"); - - // Add a resolved reference 
from Section 1 to Appendix A - let refs = vec![NodeReference::resolved( - "see Appendix A".to_string(), - "A".to_string(), - RefType::Appendix, - 10, - appendix, - 0.9, - )]; - tree.set_references(section1, refs); - - // section1's children_with_refs should include appendix as a reference target - let children = tree.children_with_refs(section1); - // section1 has no direct children, but has a resolved reference to appendix - assert_eq!(children.len(), 1); - assert!(children.contains(&appendix)); - } - #[test] fn test_children_with_refs_deduplicates() { let mut tree = DocumentTree::new("Root", "root content"); diff --git a/rust/src/index/incremental/detector.rs b/rust/src/index/incremental/detector.rs index c69e653e..23107bb1 100644 --- a/rust/src/index/incremental/detector.rs +++ b/rust/src/index/incremental/detector.rs @@ -594,27 +594,6 @@ mod tests { assert!(detector.needs_reindex_by_hash("doc1", "new content")); } - #[test] - fn test_detect_changes() { - let detector = ChangeDetector::new(); - - // Create two simple trees - let mut tree1 = DocumentTree::new("Root", ""); - let child1 = tree1.add_child(tree1.root(), "Section 1", "Content 1"); - tree1.add_child(tree1.root(), "Section 2", "Content 2"); - - let mut tree2 = DocumentTree::new("Root", ""); - tree2.add_child(tree2.root(), "Section 1", "Content 1"); // Same - tree2.add_child(tree2.root(), "Section 2", "Content modified"); // Changed - tree2.add_child(tree2.root(), "Section 3", "Content 3"); // New - - let changes = detector.detect_changes(&tree1, &tree2); - - assert!(!changes.is_empty()); - assert!(!changes.added.is_empty()); // Section 3 added - assert!(!changes.modified.is_empty()); // Section 2 modified - } - #[test] fn test_change_set() { let mut changes = ChangeSet::new(); diff --git a/rust/src/index/pipeline/context.rs b/rust/src/index/pipeline/context.rs index 21e61ddb..502d241e 100644 --- a/rust/src/index/pipeline/context.rs +++ b/rust/src/index/pipeline/context.rs @@ -227,6 +227,9 @@ pub struct 
IndexContext { /// Source file path (if from file). pub source_path: Option, + /// SHA-256 hash of the source for checkpoint validation (file path for file inputs, content otherwise). + pub source_hash: String, + /// Parsed raw nodes. pub raw_nodes: Vec, @@ -268,12 +271,14 @@ pub struct IndexContext { impl IndexContext { /// Create a new context from input. pub fn new(input: IndexInput, options: PipelineOptions) -> Self { + let source_hash = Self::compute_source_hash(&input); Self { doc_id: uuid::Uuid::new_v4().to_string(), input, format: DocumentFormat::Markdown, name: String::new(), source_path: None, + source_hash, raw_nodes: Vec::new(), tree: None, options, @@ -289,6 +294,22 @@ impl IndexContext { } } + /// Compute the SHA-256 source hash: the path for file inputs, the content otherwise. + fn compute_source_hash(input: &IndexInput) -> String { + use sha2::{Sha256, Digest}; + let hash = match input { + IndexInput::File(path) => { + // Hash the file path as proxy — actual content may not be readable yet + // (the parse stage reads it). This only distinguishes different paths: + // NOTE an in-place edit to the file at the same path leaves this hash unchanged. + Sha256::digest(path.to_string_lossy().as_bytes()) + } + IndexInput::Content { content, .. } => Sha256::digest(content.as_bytes()), + IndexInput::Bytes { data, .. } => Sha256::digest(data), + }; + format!("{:x}", hash) + } + /// Set the document ID. pub fn with_doc_id(mut self, doc_id: impl Into) -> Self { self.doc_id = doc_id.into(); diff --git a/rust/src/index/pipeline/executor.rs b/rust/src/index/pipeline/executor.rs index 1538c7b3..cee63645 100644 --- a/rust/src/index/pipeline/executor.rs +++ b/rust/src/index/pipeline/executor.rs @@ -81,8 +81,9 @@ impl PipelineExecutor { /// 7. `reasoning_index` - Build pre-computed reasoning index /// 8.
`optimize` - Optimize tree pub fn with_llm(client: LlmClient) -> Self { - tracing::info!("PipelineExecutor::with_llm — cloning client to ParseStage + EnhanceStage"); + tracing::info!("PipelineExecutor::with_llm — cloning client to ParseStage + EnhanceStage + context"); let orchestrator = PipelineOrchestrator::new() + .with_llm_client(client.clone()) .stage_with_priority(ParseStage::with_llm_client(client.clone()), 10) .stage_with_priority(BuildStage::new(), 20) .stage_with_priority(ValidateStage::new(), 22) diff --git a/rust/src/index/pipeline/orchestrator.rs b/rust/src/index/pipeline/orchestrator.rs index 892497f2..95ace65a 100644 --- a/rust/src/index/pipeline/orchestrator.rs +++ b/rust/src/index/pipeline/orchestrator.rs @@ -31,6 +31,7 @@ use crate::error::Result; use super::super::PipelineOptions; use super::super::stages::IndexStage; +use super::checkpoint::{CheckpointContextData, CheckpointManager, PipelineCheckpoint}; use super::context::{IndexContext, IndexInput, PipelineResult, StageResult}; use super::policy::FailurePolicy; @@ -93,6 +94,8 @@ pub struct ExecutionGroup { pub struct PipelineOrchestrator { /// Registered stages with metadata. stages: Vec, + /// Shared LLM client injected into pipeline context. + llm_client: Option, } impl Default for PipelineOrchestrator { @@ -104,7 +107,16 @@ impl Default for PipelineOrchestrator { impl PipelineOrchestrator { /// Create a new empty orchestrator. pub fn new() -> Self { - Self { stages: Vec::new() } + Self { + stages: Vec::new(), + llm_client: None, + } + } + + /// Set the shared LLM client (injected into pipeline context). + pub fn with_llm_client(mut self, client: crate::llm::LlmClient) -> Self { + self.llm_client = Some(client); + self } /// Add a stage with default priority (100). 
@@ -452,10 +464,48 @@ impl PipelineOrchestrator { let mut opts = options; let existing_tree = opts.existing_tree.take(); let mut ctx = IndexContext::new(input, opts); + // Inject shared LLM client into context for stages that need it (e.g. ReasoningIndexStage) + if let Some(client) = self.llm_client.take() { + ctx = ctx.with_llm_client(client); + } if let Some(tree) = existing_tree { ctx = ctx.with_existing_tree(tree); } + // Try to resume from checkpoint + if let Some(ref checkpoint_dir) = ctx.options.checkpoint_dir { + let manager = CheckpointManager::new(checkpoint_dir); + if let Some(checkpoint) = manager.load(&ctx.doc_id) { + if CheckpointManager::is_valid_for_resume( + &checkpoint, + &ctx.source_hash, + ctx.options.processing_version, + &ctx.options.logic_fingerprint().to_string(), + ) { + info!( + "Resuming from checkpoint: {} stages already completed", + checkpoint.completed_stages.len() + ); + // Restore context data from checkpoint + ctx.raw_nodes = checkpoint.context_data.raw_nodes; + if let Some(tree) = checkpoint.context_data.tree { + ctx.tree = Some(tree); + } + ctx.metrics = checkpoint.context_data.metrics; + ctx.page_count = checkpoint.context_data.page_count; + ctx.line_count = checkpoint.context_data.line_count; + ctx.description = checkpoint.context_data.description; + // Mark completed stages as done + for stage_name in &checkpoint.completed_stages { + ctx.stage_results + .insert(stage_name.clone(), StageResult::success(stage_name)); + } + } else { + info!("Checkpoint exists but invalid, starting fresh"); + } + } + } + // Execute each group for (group_idx, group) in groups.iter().enumerate() { if group.parallel { @@ -472,6 +522,19 @@ impl PipelineOrchestrator { } if group.parallel && group.stage_indices.len() == 2 { + // Check if all stages in this group are already completed (from checkpoint) + let all_completed = group.stage_indices.iter().all(|&idx| { + let name = self.stages[idx].stage.name(); + ctx.stage_results.contains_key(name) + }); + 
if all_completed { + let names: Vec<&str> = group.stage_indices.iter() + .map(|&i| self.stages[i].stage.name()) + .collect(); + info!("Skipping already completed parallel group: {:?}", names); + continue; + } + // === Parallel execution for 2-stage groups === // One stage gets the main ctx (mutates tree), the other // gets a cloned snapshot (read-only). Results are merged back. @@ -566,6 +629,13 @@ impl PipelineOrchestrator { for &idx in &group.stage_indices { let entry = &mut self.stages[idx]; let stage_name = entry.stage.name().to_string(); + + // Skip stages already completed (from checkpoint resume) + if ctx.stage_results.contains_key(&stage_name) { + info!("Skipping already completed stage: {}", stage_name); + continue; + } + let policy = entry.stage.failure_policy(); info!( @@ -589,12 +659,17 @@ impl PipelineOrchestrator { ); } else { error!("Stage {} failed, stopping pipeline: {}", stage_name, e); + // Save checkpoint before returning error + Self::save_checkpoint(&ctx); return Err(e); } } } } } + + // Save checkpoint after each group completes + Self::save_checkpoint(&ctx); } let total_duration = total_start.elapsed().as_millis() as u64; @@ -603,10 +678,49 @@ impl PipelineOrchestrator { total_duration, ctx.name ); + // Clear checkpoint on successful completion + if let Some(ref checkpoint_dir) = ctx.options.checkpoint_dir { + let manager = CheckpointManager::new(checkpoint_dir); + if let Err(e) = manager.clear(&ctx.doc_id) { + warn!("Failed to clear checkpoint for {}: {}", ctx.doc_id, e); + } + } + // Finalize result Ok(ctx.finalize()) } + /// Save a checkpoint of the current pipeline state. 
+ fn save_checkpoint(ctx: &IndexContext) { + let checkpoint_dir = match ctx.options.checkpoint_dir { + Some(ref dir) => dir.clone(), + None => return, + }; + + let completed_stages: Vec = ctx.stage_results.keys().cloned().collect(); + let checkpoint = PipelineCheckpoint { + doc_id: ctx.doc_id.clone(), + source_hash: ctx.source_hash.clone(), + processing_version: ctx.options.processing_version, + config_fingerprint: ctx.options.logic_fingerprint().to_string(), + completed_stages, + context_data: CheckpointContextData { + raw_nodes: ctx.raw_nodes.clone(), + tree: ctx.tree.clone(), + metrics: ctx.metrics.clone(), + page_count: ctx.page_count, + line_count: ctx.line_count, + description: ctx.description.clone(), + }, + timestamp: chrono::Utc::now(), + }; + + let manager = CheckpointManager::new(checkpoint_dir); + if let Err(e) = manager.save(&ctx.doc_id, &checkpoint) { + warn!("Failed to save checkpoint for {}: {}", ctx.doc_id, e); + } + } + /// Get list of stage names in execution order. pub fn stage_names(&self) -> Result> { let order = self.resolve_order()?; diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 59756a37..780e1f09 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,12 +1,6 @@ // Copyright (c) 2026 vectorless developers // SPDX-License-Identifier: Apache-2.0 - -//! # Vectorless - -// Clippy: allow specific lints that are too noisy for this project -#![allow(clippy::iter_over_hash_type)] -#![allow(clippy::large_enum_variant)] -#![allow(clippy::manual_unwrap_or_default)] +#![allow(dead_code)] //! # Vectorless //! @@ -24,7 +18,6 @@ //! #[tokio::main] //! async fn main() -> Result<(), Box> { //! let client = EngineBuilder::new() -//! .with_workspace("./workspace") //! .with_key("sk-...") //! .with_model("gpt-4o") //! .build() @@ -34,7 +27,7 @@ //! let doc_id = result.doc_id().unwrap(); //! //! let result = client.query( -//! QueryContext::new("What is this about?").with_doc_id(doc_id) +//! 
QueryContext::new("What is this about?").with_doc_ids(vec![doc_id.to_string()]) //! ).await?; //! println!("{}", result.content); //! @@ -64,11 +57,6 @@ pub use client::{ QueryResultItem, }; -// Retrieval types -pub use retrieval::StrategyPreference; -pub use retrieval::pipeline::SearchAlgorithm; -pub use retrieval::QueryComplexity; - // Error types pub use error::{Error, Result}; @@ -84,8 +72,5 @@ pub use graph::DocumentGraph; // Event types pub use events::{EventEmitter, IndexEvent, QueryEvent, WorkspaceEvent}; -// Index metrics -pub use metrics::IndexMetrics; - // Runtime metrics reports -pub use metrics::{LlmMetricsReport, MetricsReport, PilotMetricsReport, RetrievalMetricsReport}; +pub use metrics::{IndexMetrics, LlmMetricsReport, MetricsReport, PilotMetricsReport, RetrievalMetricsReport}; diff --git a/rust/src/llm/config.rs b/rust/src/llm/config.rs index 882ca828..7be140a1 100644 --- a/rust/src/llm/config.rs +++ b/rust/src/llm/config.rs @@ -248,9 +248,42 @@ impl Default for LlmConfigs { } // ============================================================================ -// Conversion from old config types (for backward compatibility) +// Conversion from config types // ============================================================================ +impl From for LlmConfigs { + fn from(pool: crate::config::LlmPoolConfig) -> Self { + // Resolve shared values before moving individual client configs + let default_api_key = pool.api_key.clone(); + let default_endpoint = pool.endpoint.clone(); + + fn to_llm_config( + client: crate::config::LlmClientConfig, + default_api_key: &Option, + default_endpoint: &Option, + ) -> LlmConfig { + LlmConfig { + model: client.model, + endpoint: if client.endpoint.is_empty() { + default_endpoint.clone().unwrap_or_default() + } else { + client.endpoint + }, + api_key: client.api_key.or_else(|| default_api_key.clone()), + max_tokens: client.max_tokens, + temperature: client.temperature, + retry: RetryConfig::default(), + } + } + + Self { 
+ index: to_llm_config(pool.index, &default_api_key, &default_endpoint), + retrieval: to_llm_config(pool.retrieval, &default_api_key, &default_endpoint), + pilot: to_llm_config(pool.pilot, &default_api_key, &default_endpoint), + } + } +} + impl From for LlmConfig { fn from(old: crate::config::LlmConfig) -> Self { Self { diff --git a/rust/src/llm/mod.rs b/rust/src/llm/mod.rs index 6d23e3dd..84fca4f2 100644 --- a/rust/src/llm/mod.rs +++ b/rust/src/llm/mod.rs @@ -72,6 +72,7 @@ mod pool; mod retry; pub use client::LlmClient; -pub use config::LlmConfig; +pub use config::LlmConfigs; pub use error::LlmResult; pub use executor::LlmExecutor; +pub use pool::LlmPool; diff --git a/rust/src/llm/pool.rs b/rust/src/llm/pool.rs index 51b07ff3..d7ddf637 100644 --- a/rust/src/llm/pool.rs +++ b/rust/src/llm/pool.rs @@ -166,20 +166,6 @@ impl LlmPool { _ => None, } } - - /// Create a pool with a single model for all purposes. - /// - /// Useful for testing or simple deployments. - pub fn single_model(model: impl Into) -> Self { - let config = super::config::LlmConfig::new(model); - let client = Arc::new(LlmClient::new(config)); - Self { - index: client.clone(), - retrieval: client.clone(), - pilot: client, - concurrency: None, - } - } } impl Default for LlmPool { @@ -214,16 +200,6 @@ mod tests { assert!(pool.get("navigate").is_some()); } - #[test] - fn test_single_model_pool() { - let pool = LlmPool::single_model("gpt-4o-mini"); - - // All clients should use the same model - assert_eq!(pool.index().config().model, "gpt-4o-mini"); - assert_eq!(pool.retrieval().config().model, "gpt-4o-mini"); - assert_eq!(pool.pilot().config().model, "gpt-4o-mini"); - } - #[test] fn test_pool_with_concurrency() { use crate::throttle::ConcurrencyConfig; diff --git a/rust/src/retrieval/content/scorer.rs b/rust/src/retrieval/content/scorer.rs index 8597c0a1..2f0e66e3 100644 --- a/rust/src/retrieval/content/scorer.rs +++ b/rust/src/retrieval/content/scorer.rs @@ -325,26 +325,6 @@ mod tests { 
assert!(!keywords.contains(&"the".to_string())); // stopword } - #[test] - fn test_keyword_score() { - let scorer = RelevanceScorer::new( - "vectorless architecture", - ScoringStrategyConfig::KeywordOnly, - ); - - let chunk = ContentChunk::new( - make_test_node_id(), - "Test".to_string(), - "Vectorless has a unique architecture for document retrieval.".to_string(), - 0, - ); - - let ctx = ScoringContext::default(); - let score = scorer.compute_keyword_score(&chunk.content); - - assert!(score > 0.5); // Should match both keywords - } - #[test] fn test_density_score() { // High density content diff --git a/rust/src/retrieval/pilot/scorer.rs b/rust/src/retrieval/pilot/scorer.rs index b612a23b..6bf8cedb 100644 --- a/rust/src/retrieval/pilot/scorer.rs +++ b/rust/src/retrieval/pilot/scorer.rs @@ -345,19 +345,6 @@ mod tests { assert!(score > 0.0); } - #[test] - fn test_hybrid_scoring() { - let ctx = ScoringContext::with_strategy("test query", ScoringStrategy::Hybrid); - - let keyword_score = ctx.keyword_overlap("test query content"); - let bm25_score = ctx.bm25_field_score("test query content"); - let hybrid = ctx.keyword_overlap("test query content") * 0.4 - + ctx.bm25_field_score("test query content") * 0.6; - - // Hybrid should be between keyword and bm25 scores (roughly) - assert!(hybrid > 0.0); - } - #[test] fn test_scorer_creation() { let scorer = NodeScorer::for_query("test query"); diff --git a/rust/src/utils/format.rs b/rust/src/utils/format.rs deleted file mode 100644 index 95ceea07..00000000 --- a/rust/src/utils/format.rs +++ /dev/null @@ -1,212 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Text formatting utilities. - -/// Truncate text to a maximum length with ellipsis. 
-/// -/// # Example -/// -/// ``` -/// use vectorless::utils::truncate; -/// -/// assert_eq!(truncate("hello world", 8), "hello..."); -/// assert_eq!(truncate("hi", 10), "hi"); -/// ``` -pub fn truncate(text: &str, max_len: usize) -> String { - if text.len() <= max_len { - return text.to_string(); - } - - if max_len <= 3 { - return ".".repeat(max_len); - } - - format!("{}...", &text[..max_len - 3]) -} - -/// Truncate text to a maximum length, respecting word boundaries. -pub fn truncate_words(text: &str, max_len: usize) -> String { - if text.len() <= max_len { - return text.to_string(); - } - - if max_len <= 3 { - return ".".repeat(max_len); - } - - // Find a good break point - let truncated = &text[..max_len - 3]; - - // Try to break at a word boundary - if let Some(last_space) = truncated.rfind(' ') { - if last_space > max_len / 2 { - return format!("{}...", &truncated[..last_space]); - } - } - - format!("{}...", truncated) -} - -/// Format a number with thousand separators. -/// -/// # Example -/// -/// ``` -/// use vectorless::utils::format_number; -/// -/// assert_eq!(format_number(1000), "1,000"); -/// assert_eq!(format_number(1234567), "1,234,567"); -/// ``` -pub fn format_number(n: usize) -> String { - let s = n.to_string(); - let mut result = String::new(); - let chars: Vec = s.chars().collect(); - - for (i, c) in chars.iter().enumerate() { - if i > 0 && (chars.len() - i) % 3 == 0 { - result.push(','); - } - result.push(*c); - } - - result -} - -/// Format bytes for human-readable display. 
-/// -/// # Example -/// -/// ``` -/// use vectorless::utils::format_bytes; -/// -/// assert_eq!(format_bytes(500), "500 B"); -/// assert_eq!(format_bytes(1024), "1.0 KB"); -/// assert_eq!(format_bytes(1536), "1.5 KB"); -/// assert_eq!(format_bytes(1048576), "1.0 MB"); -/// ``` -pub fn format_bytes(bytes: usize) -> String { - const KB: usize = 1024; - const MB: usize = KB * 1024; - const GB: usize = MB * 1024; - - if bytes >= GB { - format!("{:.1} GB", bytes as f64 / GB as f64) - } else if bytes >= MB { - format!("{:.1} MB", bytes as f64 / MB as f64) - } else if bytes >= KB { - format!("{:.1} KB", bytes as f64 / KB as f64) - } else { - format!("{} B", bytes) - } -} - -/// Format a percentage. -/// -/// # Example -/// -/// ``` -/// use vectorless::utils::format_percent; -/// -/// assert_eq!(format_percent(0.5), "50.0%"); -/// assert_eq!(format_percent(0.123), "12.3%"); -/// ``` -pub fn format_percent(value: f32) -> String { - format!("{:.1}%", value * 100.0) -} - -/// Clean whitespace in text (collapse multiple spaces, trim). -pub fn clean_whitespace(text: &str) -> String { - text.split_whitespace().collect::>().join(" ") -} - -/// Indent each line of text. -pub fn indent(text: &str, spaces: usize) -> String { - let indent_str = " ".repeat(spaces); - text.lines() - .map(|line| format!("{}{}", indent_str, line)) - .collect::>() - .join("\n") -} - -/// Count words in text. -pub fn word_count(text: &str) -> usize { - text.split_whitespace().count() -} - -/// Count lines in text. 
-pub fn line_count(text: &str) -> usize { - if text.is_empty() { - return 0; - } - text.chars().filter(|&c| c == '\n').count() + 1 -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_truncate() { - assert_eq!(truncate("hello", 10), "hello"); - assert_eq!(truncate("hello world", 8), "hello..."); - assert_eq!(truncate("hi", 3), "hi"); - } - - #[test] - fn test_truncate_words() { - // "hello world foo" with max_len=12: - // truncated = "hello wor" (9 chars), last_space at 5 - // 5 > 12/2 is false, so no word boundary break - assert_eq!(truncate_words("hello world foo", 12), "hello wor..."); - // Word boundary break happens when space is past halfway - assert_eq!(truncate_words("hello world foo bar", 15), "hello world..."); - assert_eq!(truncate_words("hello", 10), "hello"); - } - - #[test] - fn test_format_number() { - assert_eq!(format_number(100), "100"); - assert_eq!(format_number(1000), "1,000"); - assert_eq!(format_number(1234567), "1,234,567"); - } - - #[test] - fn test_format_bytes() { - assert_eq!(format_bytes(500), "500 B"); - assert_eq!(format_bytes(1024), "1.0 KB"); - assert_eq!(format_bytes(1536), "1.5 KB"); - assert_eq!(format_bytes(1048576), "1.0 MB"); - } - - #[test] - fn test_format_percent() { - assert_eq!(format_percent(0.5), "50.0%"); - assert_eq!(format_percent(1.0), "100.0%"); - } - - #[test] - fn test_clean_whitespace() { - assert_eq!(clean_whitespace(" hello world "), "hello world"); - assert_eq!(clean_whitespace("single"), "single"); - } - - #[test] - fn test_indent() { - assert_eq!(indent("hello\nworld", 2), " hello\n world"); - } - - #[test] - fn test_word_count() { - assert_eq!(word_count("hello world"), 2); - assert_eq!(word_count(" hello world "), 2); - assert_eq!(word_count(""), 0); - } - - #[test] - fn test_line_count() { - assert_eq!(line_count("hello\nworld"), 2); - assert_eq!(line_count("single"), 1); - assert_eq!(line_count(""), 0); - } -} diff --git a/rust/src/utils/mod.rs b/rust/src/utils/mod.rs index 
c6fd9b17..472bed71 100644 --- a/rust/src/utils/mod.rs +++ b/rust/src/utils/mod.rs @@ -5,13 +5,13 @@ //! //! This module provides common utilities used across the codebase: //! -//! - **Token estimation** — Fast and accurate token counting -//! - **Timing** — Performance measurement utilities -//! - **Format** — Text and number formatting utilities +//! - **Token estimation** — Fast and accurate token counting (tiktoken-based) +//! - **Fingerprint** — BLAKE2b content hashing for change detection +//! - **Validation** — Pre-index source validation (file, content, bytes) pub mod fingerprint; -mod format; -mod timing; mod token; +pub mod validation; pub use token::estimate_tokens; +pub use validation::{validate_bytes, validate_content, validate_file}; diff --git a/rust/src/utils/timing.rs b/rust/src/utils/timing.rs deleted file mode 100644 index f133f484..00000000 --- a/rust/src/utils/timing.rs +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Timing and performance measurement utilities. - -use std::time::{Duration, Instant}; - -/// A simple timing guard that records elapsed time on drop. -/// -/// # Example -/// -/// ```rust -/// use vectorless::utils::Timer; -/// -/// let timer = Timer::start("indexing"); -/// // ... do work ... -/// drop(timer); // Logs elapsed time -/// ``` -#[derive(Debug)] -pub struct Timer { - label: String, - start: Instant, - log_on_drop: bool, -} - -impl Timer { - /// Create and start a new timer. - pub fn start(label: impl Into) -> Self { - Self { - label: label.into(), - start: Instant::now(), - log_on_drop: true, - } - } - - /// Create a silent timer (doesn't log on drop). - pub fn silent() -> Self { - Self { - label: String::new(), - start: Instant::now(), - log_on_drop: false, - } - } - - /// Get the elapsed time without stopping. - pub fn elapsed(&self) -> Duration { - self.start.elapsed() - } - - /// Get elapsed time in milliseconds. 
- pub fn elapsed_ms(&self) -> u64 { - self.elapsed().as_millis() as u64 - } - - /// Get elapsed time in seconds. - pub fn elapsed_secs(&self) -> f64 { - self.elapsed().as_secs_f64() - } - - /// Stop the timer and return the elapsed duration. - pub fn stop(self) -> Duration { - let elapsed = self.elapsed(); - if self.log_on_drop { - tracing::debug!( - "{} completed in {:.2}ms", - self.label, - elapsed.as_secs_f64() * 1000.0 - ); - } - elapsed - } - - /// Stop the timer and return elapsed milliseconds. - pub fn stop_ms(self) -> u64 { - self.stop().as_millis() as u64 - } - - /// Disable logging on drop. - pub fn silent_on_drop(mut self) -> Self { - self.log_on_drop = false; - self - } - - /// Reset the timer. - pub fn reset(&mut self) { - self.start = Instant::now(); - } -} - -impl Drop for Timer { - fn drop(&mut self) { - if self.log_on_drop { - let elapsed = self.elapsed(); - tracing::debug!( - "{} completed in {:.2}ms", - self.label, - elapsed.as_secs_f64() * 1000.0 - ); - } - } -} - -/// Format a duration for human-readable display. -pub fn format_duration(duration: Duration) -> String { - let total_ms = duration.as_millis(); - - if total_ms < 1000 { - format!("{}ms", total_ms) - } else if total_ms < 60_000 { - format!("{:.2}s", duration.as_secs_f64()) - } else { - let secs = duration.as_secs(); - let mins = secs / 60; - let remaining_secs = secs % 60; - format!("{}m {}s", mins, remaining_secs) - } -} - -/// Format a duration as a compact string. 
-pub fn format_duration_compact(duration: Duration) -> String { - let total_ms = duration.as_millis(); - - if total_ms < 1000 { - format!("{}ms", total_ms) - } else if total_ms < 60_000 { - format!("{:.1}s", duration.as_secs_f64()) - } else { - let mins = duration.as_secs() / 60; - let secs = duration.as_secs() % 60; - format!("{}:{:02}", mins, secs) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_timer_elapsed() { - let timer = Timer::silent(); - std::thread::sleep(std::time::Duration::from_millis(10)); - let elapsed = timer.elapsed(); - assert!(elapsed.as_millis() >= 10); - } - - #[test] - fn test_format_duration() { - assert_eq!(format_duration(Duration::from_millis(500)), "500ms"); - assert_eq!(format_duration(Duration::from_millis(1500)), "1.50s"); - assert_eq!(format_duration(Duration::from_secs(90)), "1m 30s"); - } - - #[test] - fn test_format_duration_compact() { - assert_eq!(format_duration_compact(Duration::from_millis(500)), "500ms"); - assert_eq!(format_duration_compact(Duration::from_millis(1500)), "1.5s"); - assert_eq!(format_duration_compact(Duration::from_secs(90)), "1:30"); - } -} diff --git a/rust/src/utils/token.rs b/rust/src/utils/token.rs index 390f20cf..9e23ea85 100644 --- a/rust/src/utils/token.rs +++ b/rust/src/utils/token.rs @@ -46,33 +46,6 @@ pub fn estimate_tokens(text: &str) -> usize { get_bpe().encode_with_special_tokens(text).len() } -/// Estimate token count with a simple character-based approximation. -/// -/// This is faster but less accurate. Use when you don't need exact counts. -/// Approximation: ~4 characters per token for English text. 
-/// -/// # Example -/// -/// ``` -/// use vectorless::estimate_tokens_fast; -/// -/// assert_eq!(estimate_tokens_fast(""), 0); -/// assert_eq!(estimate_tokens_fast("hi"), 1); // 2 chars -> 1 token min -/// assert_eq!(estimate_tokens_fast("hello world"), 3); // 11 chars / 4 = 2.75 -> 3 -/// ``` -pub fn estimate_tokens_fast(text: &str) -> usize { - if text.is_empty() { - return 0; - } - // Use ceiling division for better accuracy - (text.len() + 3) / 4 -} - -/// Count tokens in multiple texts. -pub fn estimate_tokens_batch(texts: &[&str]) -> usize { - texts.iter().map(|t| estimate_tokens(t)).sum() -} - #[cfg(test)] mod tests { use super::*; @@ -88,24 +61,4 @@ mod tests { let count = estimate_tokens("hello world"); assert!(count >= 2, "Expected at least 2 tokens, got {}", count); } - - #[test] - fn test_estimate_tokens_fast_empty() { - assert_eq!(estimate_tokens_fast(""), 0); - } - - #[test] - fn test_estimate_tokens_fast_simple() { - assert_eq!(estimate_tokens_fast("hi"), 1); // 2 chars, (2+3)/4 = 1 - assert_eq!(estimate_tokens_fast("hello world"), 3); // 11 chars, (11+3)/4 = 3 - assert_eq!(estimate_tokens_fast(&"a".repeat(100)), 25); // 100 chars, (100+3)/4 = 25 - } - - #[test] - fn test_estimate_tokens_batch() { - let texts = vec!["hello", "world"]; - let batch_count = estimate_tokens_batch(&texts); - let individual_count = estimate_tokens("hello") + estimate_tokens("world"); - assert_eq!(batch_count, individual_count); - } } diff --git a/rust/src/utils/validation.rs b/rust/src/utils/validation.rs new file mode 100644 index 00000000..fc18aee5 --- /dev/null +++ b/rust/src/utils/validation.rs @@ -0,0 +1,196 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Source validation utilities for indexing. + +use std::path::Path; + +use crate::error::{Error, Result}; +use crate::index::parse::DocumentFormat; + +/// Maximum file size before emitting a warning (100 MB). 
+const LARGE_FILE_THRESHOLD: u64 = 100 * 1024 * 1024;
+
+/// Outcome of checking a source before it is handed to the indexing pipeline.
+#[derive(Debug, Clone)]
+pub struct SourceValidation {
+    /// Whether the source is valid for indexing.
+    pub valid: bool,
+
+    /// Validation errors (prevents indexing).
+    pub errors: Vec<String>,
+
+    /// Validation warnings (non-blocking).
+    pub warnings: Vec<String>,
+}
+
+impl SourceValidation {
+    /// A passing result with no errors and no warnings.
+    fn valid() -> Self {
+        Self {
+            valid: true,
+            errors: vec![],
+            warnings: vec![],
+        }
+    }
+
+    /// A failing result carrying `errors` and no warnings.
+    fn invalid(errors: Vec<String>) -> Self {
+        Self {
+            valid: false,
+            errors,
+            warnings: vec![],
+        }
+    }
+
+    /// Replace the warning list, leaving `valid` and `errors` untouched.
+    fn with_warnings(mut self, warnings: Vec<String>) -> Self {
+        self.warnings = warnings;
+        self
+    }
+}
+
+/// Validate a file path for indexing.
+///
+/// Checks, in order: the path exists, it refers to a regular file, its size
+/// (warning only, above `LARGE_FILE_THRESHOLD`), and its extension maps to a
+/// supported `DocumentFormat`.
+///
+/// Problems with the source itself are reported through the returned
+/// `SourceValidation` (`valid == false`), not through `Err`.
+///
+/// # Errors
+///
+/// Returns `Error::Parse` if the path exists but its metadata cannot be read.
+pub fn validate_file(path: &Path) -> Result<SourceValidation> {
+    if !path.exists() {
+        return Ok(SourceValidation::invalid(vec![format!(
+            "File not found: {}",
+            path.display()
+        )]));
+    }
+
+    let metadata = std::fs::metadata(path)
+        .map_err(|e| Error::Parse(format!("Cannot read file metadata: {}", e)))?;
+
+    // `exists()` is also true for directories (and other non-file entries);
+    // reject them here instead of letting a directory named `foo.md` pass.
+    if !metadata.is_file() {
+        return Ok(SourceValidation::invalid(vec![format!(
+            "Not a regular file: {}",
+            path.display()
+        )]));
+    }
+
+    // Stay in u64: casting `len()` to `usize` would truncate sizes above
+    // 4 GiB on 32-bit targets and could silently drop the warning.
+    let size = metadata.len();
+    let mut warnings = Vec::new();
+
+    if size > LARGE_FILE_THRESHOLD {
+        warnings.push(format!(
+            "Large file ({}MB) may take longer to index",
+            size / (1024 * 1024)
+        ));
+    }
+
+    // A missing or non-UTF-8 extension is treated as the empty extension.
+    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
+    if DocumentFormat::from_extension(ext).is_none() {
+        return Ok(SourceValidation::invalid(vec![format!(
+            "Unsupported format: .{}",
+            ext
+        )])
+        .with_warnings(warnings));
+    }
+
+    Ok(SourceValidation::valid().with_warnings(warnings))
+}
+
+/// Validate content string for indexing.
+///
+/// Checks: non-empty.
+pub fn validate_content(content: &str, _format: DocumentFormat) -> SourceValidation {
+    // Whitespace-only input counts as empty. `_format` is accepted for
+    // signature parity with `validate_bytes` but is not inspected here.
+    if content.trim().is_empty() {
+        SourceValidation::invalid(vec!["Content is empty".to_string()])
+    } else {
+        SourceValidation::valid()
+    }
+}
+
+/// Validate binary data for indexing.
+///
+/// Checks: non-empty, and for `DocumentFormat::Pdf` that the data starts with
+/// the `%PDF` magic number. Empty input reports only the "empty" error.
+pub fn validate_bytes(data: &[u8], format: DocumentFormat) -> SourceValidation {
+    let mut errors = Vec::new();
+
+    if data.is_empty() {
+        errors.push("Byte data is empty".to_string());
+    } else if format == DocumentFormat::Pdf && !data.starts_with(b"%PDF") {
+        // PDF magic number check
+        errors.push("Data does not appear to be a valid PDF (missing %PDF header)".to_string());
+    }
+
+    if errors.is_empty() {
+        SourceValidation::valid()
+    } else {
+        SourceValidation::invalid(errors)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Write `bytes` to a per-process temp file with extension `ext`, run
+    /// `validate_file` on it, and delete the file before returning so that a
+    /// failing assertion in the caller cannot leak the fixture.
+    fn validate_temp_file(ext: &str, bytes: &[u8]) -> SourceValidation {
+        // The process id keeps concurrent test runs on one machine from
+        // clobbering each other's fixtures in the shared temp directory.
+        let tmp = std::env::temp_dir().join(format!(
+            "vectorless_test_validate_{}.{}",
+            std::process::id(),
+            ext
+        ));
+        std::fs::write(&tmp, bytes).unwrap();
+        let result = validate_file(&tmp);
+        let _ = std::fs::remove_file(&tmp);
+        result.unwrap()
+    }
+
+    #[test]
+    fn test_validate_file_missing() {
+        let result = validate_file(Path::new("./nonexistent.md")).unwrap();
+        assert!(!result.valid);
+        assert!(result.errors[0].contains("not found"));
+    }
+
+    #[test]
+    fn test_validate_file_unsupported_format() {
+        let result = validate_temp_file("dat", b"data");
+        assert!(!result.valid);
+        assert!(result.errors[0].contains("Unsupported"));
+    }
+
+    #[test]
+    fn test_validate_file_valid() {
+        let result = validate_temp_file("md", b"# Hello");
+        assert!(result.valid);
+        assert!(result.errors.is_empty());
+    }
+
+    #[test]
+    fn test_validate_content_empty() {
+        let result = validate_content("  \n  ", DocumentFormat::Markdown);
+        assert!(!result.valid);
+        assert!(result.errors[0].contains("empty"));
+    }
+
+    #[test]
+    fn test_validate_content_valid() {
+        let result = validate_content("# Hello", DocumentFormat::Markdown);
+        assert!(result.valid);
+    }
+
+    #[test]
+    fn test_validate_bytes_empty() {
+        let result = validate_bytes(&[], DocumentFormat::Pdf);
+        assert!(!result.valid);
+        assert!(result.errors[0].contains("empty"));
+    }
+
+    #[test]
+    fn test_validate_bytes_invalid_pdf() {
+        let result = validate_bytes(b"not a pdf", DocumentFormat::Pdf);
+        assert!(!result.valid);
+        assert!(result.errors[0].contains("PDF"));
+    }
+
+    #[test]
+    fn test_validate_bytes_valid_pdf() {
+        let result = validate_bytes(b"%PDF-1.4 some content", DocumentFormat::Pdf);
+        assert!(result.valid);
+    }
+
+    #[test]
+    fn test_validate_bytes_valid_markdown() {
+        let result = validate_bytes(b"# Hello", DocumentFormat::Markdown);
+        assert!(result.valid);
+    }
+}