From a47f207ed22597781e5fbbe901027ba821188e5d Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Wed, 22 Apr 2026 22:00:09 +0800
Subject: [PATCH 01/28] feat(python): add Answer wrapper and refactor engine
 interface

- Add PyAnswer wrapper with content, evidence, confidence, and trace getters
- Rename DocumentInfo to reflect "understood" instead of "indexed"
- Change id field to doc_id for clarity
- Replace summary field with concepts extraction
- Update section_count and rename list method to list_documents
- Add Concept class for key concept extraction
- Refactor Engine methods: index->ingest, query->ask, remove->forget
- Remove deprecated streaming and context modules
- Update documentation examples to use new API
---
 python/src/answer.rs               | 103 +++++
 python/src/document.rs             |  45 ++-
 python/src/engine.rs               | 155 ++++----
 python/src/lib.rs                  |  42 +-
 rust/src/client/engine.rs          | 591 +++++++++--------------------
 rust/src/client/mod.rs             |   4 +-
 rust/src/document/mod.rs           |   4 +
 rust/src/document/understanding.rs | 272 +++++++++++++
 rust/src/lib.rs                    |  38 +-
 9 files changed, 695 insertions(+), 559 deletions(-)
 create mode 100644 python/src/answer.rs
 create mode 100644 rust/src/document/understanding.rs

diff --git a/python/src/answer.rs b/python/src/answer.rs
new file mode 100644
index 00000000..d36af66a
--- /dev/null
+++ b/python/src/answer.rs
@@ -0,0 +1,103 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Answer Python wrapper.
+
+use pyo3::prelude::*;
+
+use ::vectorless::Answer;
+
+/// A reasoned answer with evidence and trace.
+#[pyclass(name = "Answer")]
+pub struct PyAnswer {
+    pub(crate) inner: Answer,
+}
+
+#[pymethods]
+impl PyAnswer {
+    /// The answer content.
+    #[getter]
+    fn content(&self) -> &str {
+        &self.inner.content
+    }
+
+    /// Evidence supporting the answer.
+    #[getter]
+    fn evidence(&self) -> Vec<PyEvidence> {
+        self.inner
+            .evidence
+            .iter()
+            .map(|e| PyEvidence {
+                content: e.content.clone(),
+                source_path: e.source_path.clone(),
+                doc_name: e.doc_name.clone(),
+                relevance: e.relevance,
+            })
+            .collect()
+    }
+
+    /// Confidence score (0.0–1.0).
+    #[getter]
+    fn confidence(&self) -> f32 {
+        self.inner.confidence
+    }
+
+    /// Reasoning trace — how the agent arrived at this answer.
+    #[getter]
+    fn trace(&self) -> PyReasoningTrace {
+        PyReasoningTrace {
+            steps: self
+                .inner
+                .trace
+                .steps
+                .iter()
+                .map(|s| PyTraceStep {
+                    action: s.action.clone(),
+                    observation: s.observation.clone(),
+                    round: s.round,
+                })
+                .collect(),
+        }
+    }
+
+    fn __repr__(&self) -> String {
+        format!(
+            "Answer(confidence={:.2}, evidence={}, trace_steps={})",
+            self.inner.confidence,
+            self.inner.evidence.len(),
+            self.inner.trace.steps.len()
+        )
+    }
+}
+
+/// A piece of evidence with source attribution.
+#[pyclass(name = "Evidence")]
+pub struct PyEvidence {
+    #[pyo3(get)]
+    pub content: String,
+    #[pyo3(get)]
+    pub source_path: String,
+    #[pyo3(get)]
+    pub doc_name: String,
+    #[pyo3(get)]
+    pub relevance: f32,
+}
+
+/// Reasoning trace — always present.
+#[pyclass(name = "ReasoningTrace")]
+pub struct PyReasoningTrace {
+    #[pyo3(get)]
+    pub steps: Vec<PyTraceStep>,
+}
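All three wrapper types in this file follow the same PyO3 idiom: hold the core struct, expose scalars through `#[getter]` methods, and expose nested collections through getters that clone into owned `#[pyclass]` values. A condensed sketch of that idiom, with hypothetical `Step`/`Trace` names that are not part of the patch:

```rust
use pyo3::prelude::*;

#[pyclass]
#[derive(Clone)]
struct Step {
    #[pyo3(get)]
    label: String,
}

#[pyclass]
struct Trace {
    steps: Vec<Step>,
}

#[pymethods]
impl Trace {
    // The getter clones into owned pyclass values; pyo3 moves each
    // one into a fresh Python object on return.
    #[getter]
    fn steps(&self) -> Vec<Step> {
        self.steps.clone()
    }
}
```

Cloning per access is the usual trade-off here: it keeps the Rust side free of shared mutable state at the cost of copying on every attribute read.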
+/// A single step in the reasoning trace.
+#[pyclass(name = "TraceStep")]
+#[derive(Clone)]
+pub struct PyTraceStep {
+    #[pyo3(get)]
+    pub action: String,
+    #[pyo3(get)]
+    pub observation: String,
+    #[pyo3(get)]
+    pub round: u32,
+}
diff --git a/python/src/document.rs b/python/src/document.rs
index d5652fba..56dd1570 100644
--- a/python/src/document.rs
+++ b/python/src/document.rs
@@ -7,7 +7,7 @@ use pyo3::prelude::*;
 
 use ::vectorless::DocumentInfo;
 
-/// Information about an indexed document.
+/// Information about an understood document.
 #[pyclass(name = "DocumentInfo")]
 pub struct PyDocumentInfo {
     pub(crate) inner: DocumentInfo,
@@ -16,8 +16,8 @@ pub struct PyDocumentInfo {
 #[pymethods]
 impl PyDocumentInfo {
     #[getter]
-    fn id(&self) -> &str {
-        &self.inner.id
+    fn doc_id(&self) -> &str {
+        &self.inner.doc_id
     }
 
     #[getter]
@@ -31,29 +31,48 @@ impl PyDocumentInfo {
     }
 
     #[getter]
-    fn description(&self) -> Option<&str> {
-        self.inner.description.as_deref()
+    fn summary(&self) -> &str {
+        &self.inner.summary
     }
 
    #[getter]
-    fn source_path(&self) -> Option<&str> {
-        self.inner.source_path.as_deref()
+    fn concepts(&self) -> Vec<PyConcept> {
+        self.inner
+            .concepts
+            .iter()
+            .map(|c| PyConcept {
+                name: c.name.clone(),
+                summary: c.summary.clone(),
+                sections: c.sections.clone(),
+            })
+            .collect()
     }
 
     #[getter]
-    fn page_count(&self) -> Option<usize> {
-        self.inner.page_count
+    fn section_count(&self) -> usize {
+        self.inner.section_count
     }
 
     #[getter]
-    fn line_count(&self) -> Option<usize> {
-        self.inner.line_count
+    fn page_count(&self) -> Option<usize> {
+        self.inner.page_count
     }
 
     fn __repr__(&self) -> String {
         format!(
-            "DocumentInfo(id='{}', name='{}', format='{}')",
-            self.inner.id, self.inner.name, self.inner.format
+            "DocumentInfo(doc_id='{}', name='{}', format='{}')",
+            self.inner.doc_id, self.inner.name, self.inner.format
         )
     }
 }
+
+/// A key concept extracted from a document.
+#[pyclass(name = "Concept")]
+pub struct PyConcept {
+    #[pyo3(get)]
+    pub name: String,
+    #[pyo3(get)]
+    pub summary: String,
+    #[pyo3(get)]
+    pub sections: Vec<String>,
+}
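The reshaped `DocumentInfo` is meant to answer "did the engine get it right?" rather than report indexing bookkeeping. A minimal sketch of consuming it from the Rust side, using only fields this patch defines (the `print_info` helper itself is hypothetical):

```rust
use vectorless::DocumentInfo;

// Hypothetical helper: dump what the engine understood about one document.
fn print_info(info: &DocumentInfo) {
    println!("{} [{}], {} sections", info.name, info.format, info.section_count);
    println!("summary: {}", info.summary);
    for c in &info.concepts {
        println!("  concept '{}': {} (in {})", c.name, c.summary, c.sections.join(", "));
    }
}
```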
diff --git a/python/src/engine.rs b/python/src/engine.rs
index 23924568..b7572d9c 100644
--- a/python/src/engine.rs
+++ b/python/src/engine.rs
@@ -1,69 +1,61 @@
 // Copyright (c) 2026 vectorless developers
 // SPDX-License-Identifier: Apache-2.0
 
-//! Engine Python wrapper and async helpers.
+//! Engine Python wrapper — async ingest/ask/forget/list_documents.
 
 use pyo3::prelude::*;
 use pyo3_async_runtimes::tokio::future_into_py;
 use std::sync::Arc;
 use tokio::runtime::Runtime;
 
-use ::vectorless::{Engine, EngineBuilder, IndexContext, QueryContext};
+use ::vectorless::{Engine, EngineBuilder, IngestInput};
 
-use super::config::PyConfig;
-use super::context::{PyIndexContext, PyQueryContext};
+use super::answer::PyAnswer;
 use super::document::PyDocumentInfo;
 use super::error::VectorlessError;
 use super::error::to_py_err;
 use super::graph::PyDocumentGraph;
 use super::metrics::PyMetricsReport;
-use super::results::{PyIndexResult, PyQueryResult};
-use super::streaming::PyStreamingQuery;
 
 // ============================================================
 // Engine async helpers (named functions to avoid FnOnce HRTB issue)
 // ============================================================
 
-async fn run_index(engine: Arc<Engine>, ctx: IndexContext) -> PyResult<PyIndexResult> {
-    let result = engine.index(ctx).await.map_err(to_py_err)?;
-    Ok(PyIndexResult { inner: result })
+async fn run_ingest(engine: Arc<Engine>, input: IngestInput) -> PyResult<PyDocumentInfo> {
+    let doc = engine.ingest(input).await.map_err(to_py_err)?;
+    Ok(PyDocumentInfo { inner: doc })
 }
 
-async fn run_query(engine: Arc<Engine>, ctx: QueryContext) -> PyResult<PyQueryResult> {
-    let result = engine.query(ctx).await.map_err(to_py_err)?;
-    Ok(PyQueryResult { inner: result })
+async fn run_ask(engine: Arc<Engine>, question: String, doc_ids: Vec<String>) -> PyResult<PyAnswer> {
+    let answer = engine.ask(&question, &doc_ids).await.map_err(to_py_err)?;
+    Ok(PyAnswer { inner: answer })
 }
 
-async fn run_list(engine: Arc<Engine>) -> PyResult<Vec<PyDocumentInfo>> {
-    let docs = engine.list().await.map_err(to_py_err)?;
+async fn run_forget(engine: Arc<Engine>, doc_id: String) -> PyResult<()> {
+    engine.forget(&doc_id).await.map_err(to_py_err)
+}
+
+async fn run_list_documents(engine: Arc<Engine>) -> PyResult<Vec<PyDocumentInfo>> {
+    let docs = engine.list_documents().await.map_err(to_py_err)?;
     Ok(docs
         .into_iter()
         .map(|d| PyDocumentInfo { inner: d })
         .collect())
 }
 
-async fn run_remove(engine: Arc<Engine>, doc_id: String) -> PyResult<bool> {
-    engine.remove(&doc_id).await.map_err(to_py_err)
+async fn run_exists(engine: Arc<Engine>, doc_id: String) -> PyResult<bool> {
+    engine.exists(&doc_id).await.map_err(to_py_err)
 }
 
 async fn run_clear(engine: Arc<Engine>) -> PyResult<usize> {
     engine.clear().await.map_err(to_py_err)
 }
 
-async fn run_exists(engine: Arc<Engine>, doc_id: String) -> PyResult<bool> {
-    engine.exists(&doc_id).await.map_err(to_py_err)
-}
-
 async fn run_get_graph(engine: Arc<Engine>) -> PyResult<Option<PyDocumentGraph>> {
     let graph = engine.get_graph().await.map_err(to_py_err)?;
    Ok(graph.map(|g| PyDocumentGraph { inner: g }))
 }
 
-async fn run_query_stream(engine: Arc<Engine>, ctx: QueryContext) -> PyResult<PyStreamingQuery> {
-    let rx = engine.query_stream(ctx).await.map_err(to_py_err)?;
-    Ok(PyStreamingQuery::new(rx))
-}
-
 fn run_metrics_report(engine: Arc<Engine>) -> PyMetricsReport {
     PyMetricsReport {
         inner: engine.metrics_report(),
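The section header above notes that these helpers are named functions "to avoid FnOnce HRTB issue": a closure capturing `&self` ties the future's lifetime to the method call, which trips higher-ranked lifetime inference when handed to `future_into_py`. A named `async fn` taking an owned `Arc` keeps the future `'static` and `Send`. A minimal sketch of the bridge, with a hypothetical `ping` method (the pyo3 and pyo3-async-runtimes calls are the same ones this patch uses):

```rust
use std::sync::Arc;
use pyo3::prelude::*;
use pyo3_async_runtimes::tokio::future_into_py;

struct Engine;
impl Engine {
    async fn ping(&self) -> String {
        "pong".to_string()
    }
}

// Named helper: owns the Arc, so the returned future is 'static.
async fn run_ping(engine: Arc<Engine>) -> PyResult<String> {
    Ok(engine.ping().await)
}

#[pyclass]
struct PyEngine {
    inner: Arc<Engine>,
}

#[pymethods]
impl PyEngine {
    fn ping<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
        // Clone the handle out of &self before crossing into the async world.
        let engine = Arc::clone(&self.inner);
        future_into_py(py, run_ping(engine))
    }
}
```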
@@ -74,25 +66,29 @@ fn run_metrics_report(engine: Arc<Engine>) -> PyMetricsReport {
 // Engine
 // ============================================================
 
-/// The main vectorless engine.
+/// The vectorless Document Understanding Engine.
 ///
-/// `api_key` and `model` are **required**.
+/// All methods are **async** — use `await` to call them.
 ///
 /// ```python
-/// from vectorless import Engine, IndexContext, QueryContext
+/// from vectorless import Engine
+///
+/// engine = Engine(api_key="sk-...", model="gpt-4o")
 ///
-/// engine = Engine(
-///     api_key="sk-...",
-///     model="gpt-4o",
-/// )
+/// # Understand a document
+/// doc = await engine.ingest("./report.pdf")
+/// print(doc.summary)
 ///
-/// # Index
-/// result = await engine.index(IndexContext.from_path("./report.pdf"))
-/// doc_id = result.doc_id
+/// # Ask a question
+/// answer = await engine.ask("What is the revenue?", doc_ids=[doc.doc_id])
+/// print(answer.content)
+/// print(answer.trace)  # reasoning trace — always present
 ///
-/// # Query
-/// answer = await engine.query(QueryContext("What is the revenue?").with_doc_ids([doc_id]))
-/// print(answer.single().content)
+/// # List all understood documents
+/// docs = await engine.list_documents()
+///
+/// # Forget a document
+/// await engine.forget(doc.doc_id)
 /// ```
 #[pyclass(name = "Engine")]
 pub struct PyEngine {
@@ -117,7 +113,7 @@ impl PyEngine {
         api_key: Option<String>,
         model: Option<String>,
         endpoint: Option<String>,
-        config: Option<Py<PyConfig>>,
+        config: Option<Py<PyConfig>>,
     ) -> PyResult<Self> {
         let rt = Runtime::new().map_err(|e| {
            PyErr::from(VectorlessError::new(
@@ -160,80 +156,73 @@ impl PyEngine {
        })
    }
 
-    /// Index a document.
+    /// Understand a document — parse, analyze, and persist.
     ///
     /// Args:
-    ///     ctx: IndexContext created from from_path, from_paths, from_dir, etc.
+    ///     path: File path to the document (PDF or Markdown).
     ///
     /// Returns:
-    ///     IndexResult with doc_id and items.
+    ///     DocumentInfo with doc_id, summary, structure, concepts.
     ///
     /// Raises:
-    ///     VectorlessError: If indexing fails.
-    fn index<'py>(&self, py: Python<'py>, ctx: &PyIndexContext) -> PyResult<Bound<'py, PyAny>> {
+    ///     VectorlessError: If ingest fails.
+    fn ingest<'py>(&self, py: Python<'py>, path: String) -> PyResult<Bound<'py, PyAny>> {
         let engine = Arc::clone(&self.inner);
-        let index_ctx = ctx.inner.clone();
-        future_into_py(py, run_index(engine, index_ctx))
+        let input = IngestInput::Path(path.into());
+        future_into_py(py, run_ingest(engine, input))
     }
 
-    /// Query indexed documents.
+    /// Ask a question — returns a reasoned answer with evidence and trace.
     ///
     /// Args:
-    ///     ctx: QueryContext with query text and scope.
+    ///     question: The question to ask (required).
+    ///     doc_ids: List of document IDs to search. Empty = search all.
     ///
     /// Returns:
-    ///     QueryResult with answer and score.
+    ///     Answer with content, evidence, confidence, and trace.
     ///
     /// Raises:
-    ///     VectorlessError: If query fails.
-    fn query<'py>(&self, py: Python<'py>, ctx: &PyQueryContext) -> PyResult<Bound<'py, PyAny>> {
+    ///     VectorlessError: If ask fails.
+    #[pyo3(signature = (question, doc_ids=None))]
+    fn ask<'py>(
+        &self,
+        py: Python<'py>,
+        question: String,
+        doc_ids: Option<Vec<String>>,
+    ) -> PyResult<Bound<'py, PyAny>> {
         let engine = Arc::clone(&self.inner);
-        let query_ctx = ctx.inner.clone();
-        future_into_py(py, run_query(engine, query_ctx))
+        let ids = doc_ids.unwrap_or_default();
+        future_into_py(py, run_ask(engine, question, ids))
     }
 
-    /// Query documents with streaming progress events.
-    ///
-    /// Returns a StreamingQuery async iterator that yields real-time
-    /// retrieval events as dicts with a ``"type"`` key.
+    /// Remove a document by ID.
     ///
     /// Args:
-    ///     ctx: QueryContext with query text and scope.
-    ///
-    /// Returns:
-    ///     StreamingQuery async iterator.
+    ///     doc_id: The document ID to remove.
     ///
     /// Raises:
-    ///     VectorlessError: If query setup fails.
-    fn query_stream<'py>(
-        &self,
-        py: Python<'py>,
-        ctx: &PyQueryContext,
-    ) -> PyResult<Bound<'py, PyAny>> {
+    ///     VectorlessError: If removal fails.
+    fn forget<'py>(&self, py: Python<'py>, doc_id: String) -> PyResult<Bound<'py, PyAny>> {
         let engine = Arc::clone(&self.inner);
-        let query_ctx = ctx.inner.clone();
-        future_into_py(py, run_query_stream(engine, query_ctx))
+        future_into_py(py, run_forget(engine, doc_id))
     }
 
-    /// List all indexed documents.
+    /// List all understood documents.
     ///
     /// Returns:
-    ///     List of DocumentInfo objects.
-    fn list<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
+    ///     List of DocumentInfo objects with summary, structure, and concepts.
+    fn list_documents<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {
         let engine = Arc::clone(&self.inner);
-        future_into_py(py, run_list(engine))
+        future_into_py(py, run_list_documents(engine))
     }
 
-    /// Remove a document by ID.
-    ///
-    /// Returns:
-    ///     True if removed, False if not found.
-    fn remove<'py>(&self, py: Python<'py>, doc_id: String) -> PyResult<Bound<'py, PyAny>> {
+    /// Check if a document exists.
+    fn exists<'py>(&self, py: Python<'py>, doc_id: String) -> PyResult<Bound<'py, PyAny>> {
         let engine = Arc::clone(&self.inner);
-        future_into_py(py, run_remove(engine, doc_id))
+        future_into_py(py, run_exists(engine, doc_id))
     }
 
-    /// Remove all indexed documents.
+    /// Remove all documents.
     ///
     /// Returns:
     ///     Number of documents removed.
@@ -242,12 +231,6 @@ impl PyEngine {
         future_into_py(py, run_clear(engine))
     }
 
-    /// Check if a document exists.
-    fn exists<'py>(&self, py: Python<'py>, doc_id: String) -> PyResult<Bound<'py, PyAny>> {
-        let engine = Arc::clone(&self.inner);
-        future_into_py(py, run_exists(engine, doc_id))
-    }
-
     /// Get the cross-document relationship graph.
     ///
     /// Returns:
diff --git a/python/src/lib.rs b/python/src/lib.rs
index d17c5830..e6ad77ea 100644
--- a/python/src/lib.rs
+++ b/python/src/lib.rs
@@ -5,54 +5,42 @@
 
 use pyo3::prelude::*;
 
+mod answer;
 mod config;
-mod context;
 mod document;
 mod engine;
 mod error;
 mod graph;
 mod metrics;
-mod results;
-mod streaming;
 
+use answer::{PyAnswer, PyEvidence, PyReasoningTrace, PyTraceStep};
 use config::PyConfig;
-use context::{PyIndexContext, PyIndexOptions, PyQueryContext};
-use document::PyDocumentInfo;
+use document::{PyConcept, PyDocumentInfo};
 use engine::PyEngine;
 use error::VectorlessError;
 use graph::{PyDocumentGraph, PyDocumentGraphNode, PyEdgeEvidence, PyGraphEdge, PyWeightedKeyword};
 use metrics::{PyLlmMetricsReport, PyMetricsReport, PyRetrievalMetricsReport};
-use results::{
-    PyEvidenceItem, PyFailedItem, PyIndexItem, PyIndexMetrics, PyIndexResult, PyQueryMetrics,
-    PyQueryResult, PyQueryResultItem,
-};
-use streaming::PyStreamingQuery;
 
-/// Vectorless - Reasoning-native document intelligence engine.
+/// Vectorless — Document Understanding Engine for AI.
 ///
 /// ```python
-/// from vectorless import Engine, IndexContext, QueryContext
+/// from vectorless import Engine
 ///
 /// engine = Engine(api_key="sk-...", model="gpt-4o")
-/// result = await engine.index(IndexContext.from_path("./report.pdf"))
-/// answer = await engine.query(QueryContext("What is the revenue?").with_doc_ids([result.doc_id]))
-/// print(answer.single().content)
+/// doc = await engine.ingest("./report.pdf")
+/// answer = await engine.ask("What is the revenue?", doc_ids=[doc.doc_id])
+/// print(answer.content)
 /// ```
 #[pymodule]
 fn _vectorless(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
+    m.add_class::()?;
     m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
+    m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;
@@ -62,8 +50,6 @@ fn _vectorless(m: &Bound<'_, PyModule>) -> PyResult<()> {
     m.add_class::()?;
     m.add_class::()?;
     m.add_class::()?;
-    m.add_class::()?;
-    m.add_class::()?;
 
     m.add("__version__", env!("CARGO_PKG_VERSION"))?;
 
diff --git a/rust/src/client/engine.rs b/rust/src/client/engine.rs
index 47ad9bf3..7ab4896f 100644
--- a/rust/src/client/engine.rs
+++ b/rust/src/client/engine.rs
@@ -3,16 +3,17 @@
 //! Main Engine client - the entry point for vectorless.
 //!
-//! The Engine provides a unified API for document indexing and retrieval:
+//! The Engine provides a unified API for the Document Understanding Engine:
 //!
-//! - [`index`](Engine::index) — Index documents from files, content, or bytes
-//! - [`query`](Engine::query) — Query documents using natural language
-//! - [`query_stream`](Engine::query_stream) — Query with streaming results
+//! - [`ingest`](Engine::ingest) — Understand a document (parse, analyze, persist)
+//! - [`ask`](Engine::ask) — Ask a question (returns answer + evidence + trace)
+//! - [`forget`](Engine::forget) — Remove a document
+//! - [`list_documents`](Engine::list_documents) — List all understood documents
 //!
 //! # Example
 //!
 //! ```rust,no_run
-//! use vectorless::client::{EngineBuilder, IndexContext, QueryContext};
+//! use vectorless::{EngineBuilder, IngestInput};
 //!
 //! # #[tokio::main]
 //! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
 //! let engine = EngineBuilder::new()
@@ -23,16 +24,22 @@
 //!     .build()
 //!     .await?;
 //!
-//! // Index a document
-//! let result = engine.index(IndexContext::from_path("./document.md")).await?;
-//! let doc_id = result.doc_id().unwrap();
+//! // Understand a document
+//! let doc = engine.ingest(IngestInput::Path("./document.md".into())).await?;
+//! println!("{}: {}", doc.name, doc.summary);
 //!
-//! // Query
-//! let result = engine.query(
-//!     QueryContext::new("What is this?").with_doc_ids(vec![doc_id.to_string()])
-//! ).await?;
+//! // Ask a question
+//! let answer = engine.ask("What is this?", &[doc.doc_id.clone()]).await?;
+//! println!("{}", answer.content);
 //!
-//! println!("Found: {}", result.content);
+//! // List all understood documents
+//! let docs = engine.list_documents().await?;
+//! for d in &docs {
+//!     println!("{}: {}", d.name, d.summary);
+//! }
+//!
+//! // Forget a document
+//! engine.forget(&doc.doc_id).await?;
 //! # Ok(())
 //! # }
//! ```
 
@@ -43,7 +50,8 @@
 use futures::StreamExt;
 use tracing::{info, warn};
 
 use crate::{
-    DocumentTree, Error,
+    Answer, Document as UnderstandingDocument, DocumentTree, Error, Evidence, IngestInput,
+    ReasoningTrace,
     config::Config,
     error::Result,
     events::EventEmitter,
@@ -52,16 +60,14 @@ use crate::{
         incremental::{self, IndexAction},
     },
     metrics::MetricsHub,
-    retrieval::RetrieveEventReceiver,
     storage::{PersistedDocument, Workspace},
 };
 
 use super::{
     index_context::{IndexContext, IndexSource},
     indexer::IndexerClient,
-    query_context::{QueryContext, QueryScope},
     retriever::RetrieverClient,
-    types::{DocumentInfo, FailedItem, IndexItem, IndexMode, IndexResult, QueryResult},
+    types::{FailedItem, IndexItem, IndexMode, IndexResult},
     workspace::WorkspaceClient,
 };
@@ -132,18 +138,16 @@ impl Engine {
     }
 
     // ============================================================
-    // Document Indexing
+    // Ingest Pipeline (private — called by ingest())
     // ============================================================
 
-    /// Index one or more documents.
-    ///
-    /// Accepts an [`IndexContext`] that specifies the source (file path,
-    /// directory, content string, or bytes) and indexing options.
-    /// Multiple sources are indexed in parallel.
+    /// Run the ingest pipeline: parse, compile, persist.
     ///
+    /// Accepts an [`IndexContext`] that specifies the source and options.
+    /// Multiple sources are processed in parallel.
     /// Returns an [`IndexResult`] containing the indexed document metadata.
     #[tracing::instrument(skip_all, fields(sources = ctx.sources.len()))]
-    pub async fn index(&self, ctx: IndexContext) -> Result<IndexResult> {
+    async fn ingest_pipeline(&self, ctx: IndexContext) -> Result<IndexResult> {
         if ctx.is_empty() {
             return Err(Error::Config("No document sources provided".into()));
         }
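The public `ingest()` entry point in the next hunk folds every source kind into this private pipeline. A small sketch of the three `IngestInput` shapes it accepts, per the enum added later in this patch (paths and names here are illustrative):

```rust
use vectorless::IngestInput;

// From a file on disk; &str converts into PathBuf.
let from_path = IngestInput::Path("./report.pdf".into());

// From an in-memory string; ingest() treats it as Markdown.
let from_text = IngestInput::Text {
    name: "notes".into(),
    content: "# Meeting notes\n...".into(),
};

// Bytes additionally carry an explicit parser format. Whether the
// parse module's DocumentFormat is re-exported publicly is not shown
// in this patch, so the variant is only sketched here:
// let from_bytes = IngestInput::Bytes { name, data, format };
```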
@@ -407,392 +411,130 @@
     }
 
     // ============================================================
-    // Document Querying
+    // Understanding Engine API
     // ============================================================
 
-    /// Query documents.
+    /// Understand a document — parse, analyze, and persist.
     ///
-    /// Accepts a [`QueryContext`] that specifies the query text and scope
-    /// (single document, multiple documents, or entire workspace).
-    #[tracing::instrument(skip_all, fields(query = %ctx.query))]
-    pub async fn query(&self, ctx: QueryContext) -> Result<QueryResult> {
-        let timeout_secs = ctx.timeout_secs;
-
-        self.with_timeout(timeout_secs, async move {
-            let doc_ids = self.resolve_scope(&ctx.scope).await?;
-            info!(doc_count = doc_ids.len(), "Resolving documents for query");
-
-            let (documents, failed) = self.load_documents(&doc_ids).await?;
-            info!(
-                loaded = documents.len(),
-                failed = failed.len(),
-                "Documents loaded"
-            );
-            if documents.is_empty() {
-                return Err(Error::Config(format!(
-                    "No documents available for query: {} failures",
-                    failed.len()
-                )));
+    /// Returns a [`crate::document::DocumentInfo`] with summary, structure, and concepts.
+    /// The engine builds a full understanding including tree, navigation index,
+    /// reasoning index, summary, and key concepts.
+    pub async fn ingest(&self, input: IngestInput) -> Result<crate::document::DocumentInfo> {
+        let ctx = match &input {
+            IngestInput::Path(path) => IndexContext::from_path(path),
+            IngestInput::Bytes { data, format, .. } => IndexContext::from_bytes(data.clone(), *format),
+            IngestInput::Text { content, .. } => {
+                IndexContext::from_content(content, crate::index::parse::DocumentFormat::Markdown)
             }
+        };
 
-            let skip_analysis = !ctx.force_analysis;
-            let mut result = self
-                .retriever
-                .query(&documents, &ctx.query, skip_analysis)
-                .await?;
-            result.failed.extend(failed);
-            Ok(result)
-        })
-        .await
+        let result = self.ingest_pipeline(ctx).await?;
+
+        let doc_id = result
+            .doc_id()
+            .ok_or_else(|| Error::Config("ingest produced no results".into()))?
+            .to_string();
+
+        // Load the persisted document to build DocumentInfo
+        let persisted = self
+            .workspace
+            .load(&doc_id)
+            .await?
+            .ok_or_else(|| Error::Config("Document not found after ingest".into()))?;
+
+        let doc = Self::persisted_to_understanding_document(persisted);
+        Ok(doc.info())
     }
 
-    /// Query a document with streaming results.
+    /// Ask a question — returns a reasoned answer with evidence and trace.
     ///
-    /// Returns a receiver that yields retrieval events
-    /// as the retrieval agent progresses through navigation.
+    /// - `input`: the question (required)
+    /// - `ids`: document IDs to search. Empty = search all documents.
     ///
-    /// Supports single-document and multi-document scope.
-    /// Events are translated from the agent's internal event stream
-    /// into the public `RetrieveEventReceiver` stream.
-    pub async fn query_stream(&self, ctx: QueryContext) -> Result<RetrieveEventReceiver> {
-        let doc_ids = self.resolve_scope(&ctx.scope).await?;
-        let query = ctx.query.clone();
-
-        // Load all requested documents (need owned PersistedDocument for spawned task)
-        let mut docs = Vec::new();
-        for doc_id in &doc_ids {
-            let doc = match self.workspace.load(doc_id).await? {
-                Some(d) => d,
-                None => return Err(Error::Config(format!("Document not found: {}", doc_id))),
-            };
-            docs.push((doc_id.clone(), doc));
-        }
+    /// Always returns an [`Answer`] with content, evidence, confidence, and
+    /// a mandatory reasoning trace.
+    pub async fn ask(&self, input: &str, ids: &[String]) -> Result<Answer> {
+        // Resolve doc IDs
+        let doc_ids = if ids.is_empty() {
+            let docs = self.list_documents().await?;
+            if docs.is_empty() {
+                return Err(Error::Config("Workspace is empty".into()));
+            }
+            docs.into_iter().map(|d| d.doc_id).collect::<Vec<_>>()
+        } else {
+            ids.to_vec()
+        };
 
-        // Create agent event channel
-        let (agent_tx, mut agent_rx) =
-            crate::agent::events::channel(crate::agent::events::DEFAULT_AGENT_EVENT_BOUND);
-        let (retrieve_tx, retrieve_rx) =
-            crate::retrieval::stream::channel(crate::retrieval::stream::DEFAULT_STREAM_BOUND);
-
-        // Spawn a task that translates AgentEvents → RetrieveEvents
-        tokio::spawn(async move {
-            use crate::agent::AgentEvent;
-            use crate::retrieval::stream::RetrieveEvent;
-
-            while let Some(event) = agent_rx.recv().await {
-                let translated = match event {
-                    // ── Query Understanding ──
-                    AgentEvent::QueryUnderstandingStarted { query } => RetrieveEvent::Started {
-                        query,
-                        strategy: "query_understanding".to_string(),
-                    },
-                    AgentEvent::QueryUnderstandingCompleted { query, ..
} => { - RetrieveEvent::StageCompleted { - stage: format!("query_understanding: {}", query), - elapsed_ms: 0, - } - } + // Load documents + let (documents, failed) = self.load_documents(&doc_ids).await?; + if documents.is_empty() { + return Err(Error::Config(format!( + "No documents available: {} failures", + failed.len() + ))); + } - // ── Orchestrator ── - AgentEvent::OrchestratorStarted { - query, - doc_count, - skip_analysis, - } => RetrieveEvent::Started { - query, - strategy: if skip_analysis { - "orchestrator_skip_analysis".to_string() - } else { - format!("orchestrator({}_docs)", doc_count) - }, - }, - AgentEvent::OrchestratorAnalyzing { - doc_count, - keywords, - } => RetrieveEvent::StageCompleted { - stage: format!( - "orchestrator_analyzing_{}_docs_kw_{}", - doc_count, - keywords.len() - ), - elapsed_ms: 0, - }, - AgentEvent::WorkerDispatched { - doc_idx, - doc_name, - task, - .. - } => RetrieveEvent::StageCompleted { - stage: format!("dispatch_{}_{}_{}", doc_idx, doc_name, task.len().min(30)), - elapsed_ms: 0, - }, - AgentEvent::WorkerCompleted { - doc_idx, - doc_name, - evidence_count, - rounds_used, - llm_calls, - success, - } => RetrieveEvent::StageCompleted { - stage: format!( - "worker_{}_{}_done_e{}_r{}_l{}_{}", - doc_idx, doc_name, evidence_count, rounds_used, llm_calls, success - ), - elapsed_ms: 0, - }, - AgentEvent::OrchestratorEvaluated { - sufficient, - evidence_count, - missing_info: _, - } => RetrieveEvent::SufficiencyCheck { - level: if sufficient { - crate::retrieval::SufficiencyLevel::Sufficient - } else { - crate::retrieval::SufficiencyLevel::Insufficient - }, - tokens: evidence_count, - }, - AgentEvent::OrchestratorReplanning { - reason, - evidence_count, - } => RetrieveEvent::StageCompleted { - stage: format!( - "orchestrator_replan_{}_e{}", - &reason[..reason.len().min(30)], - evidence_count - ), - elapsed_ms: 0, - }, - AgentEvent::OrchestratorCompleted { - evidence_count, - total_llm_calls, - dispatch_rounds, - } => RetrieveEvent::StageCompleted { - stage: format!( - "orchestrator_done_e{}_l{}_r{}", - evidence_count, total_llm_calls, dispatch_rounds - ), - elapsed_ms: 0, - }, - - // ── Worker ── - AgentEvent::WorkerStarted { - doc_name, - task: _, - max_rounds, - } => RetrieveEvent::StageCompleted { - stage: format!("worker_started_{}_r{}", doc_name, max_rounds), - elapsed_ms: 0, - }, - AgentEvent::WorkerPlanGenerated { doc_name, plan_len } => { - RetrieveEvent::StageCompleted { - stage: format!("plan_{}_{}chars", doc_name, plan_len), - elapsed_ms: 0, - } - } - AgentEvent::WorkerRound { - doc_name, - round, - command, - success: _, - elapsed_ms, - } => RetrieveEvent::StageCompleted { - stage: format!("round_{}_{}_{}", doc_name, round, command), - elapsed_ms, - }, - AgentEvent::EvidenceCollected { - doc_name, - node_title, - source_path, - content_len, - total_evidence: _, - } => RetrieveEvent::ContentFound { - node_id: source_path, - title: format!("[{}] {}", doc_name, node_title), - preview: String::new(), - score: if content_len > 0 { 0.8 } else { 0.0 }, - }, - AgentEvent::WorkerSufficiencyCheck { - doc_name: _, - sufficient, - evidence_count, - .. 
- } => RetrieveEvent::SufficiencyCheck { - level: if sufficient { - crate::retrieval::SufficiencyLevel::Sufficient - } else { - crate::retrieval::SufficiencyLevel::Insufficient - }, - tokens: evidence_count, - }, - AgentEvent::WorkerReplan { - doc_name, - missing_info, - plan_len, - } => RetrieveEvent::StageCompleted { - stage: format!( - "replan_{}_{}_{}chars", - doc_name, - &missing_info[..missing_info.len().min(30)], - plan_len - ), - elapsed_ms: 0, - }, - AgentEvent::WorkerBudgetWarning { - doc_name, - warning_type, - round, - } => RetrieveEvent::StageCompleted { - stage: format!( - "budget_warning_{}_{}_round_{}", - doc_name, warning_type, round - ), - elapsed_ms: 0, - }, - AgentEvent::WorkerDone { - doc_name, - evidence_count, - rounds_used, - llm_calls, - budget_exhausted: _, - plan_generated: _, - } => RetrieveEvent::StageCompleted { - stage: format!( - "worker_done_{}_e{}_r{}_l{}", - doc_name, evidence_count, rounds_used, llm_calls - ), - elapsed_ms: 0, - }, - - // ── Answer Pipeline ── - AgentEvent::AnswerStarted { - evidence_count, - multi_doc, - } => RetrieveEvent::StageCompleted { - stage: format!( - "answer_start_{}_e{}", - if multi_doc { "multi" } else { "single" }, - evidence_count - ), - elapsed_ms: 0, - }, - AgentEvent::AnswerCompleted { - answer_len, - confidence, - } => RetrieveEvent::StageCompleted { - stage: format!("synthesis_{}_{}chars", confidence, answer_len), - elapsed_ms: 0, - }, - - // ── Terminal ── - AgentEvent::Completed { - evidence_count, - llm_calls, - answer_len, - } => { - let response = crate::retrieval::RetrieveResponse { - results: Vec::new(), - content: String::new(), - confidence: if evidence_count > 0 { 0.8 } else { 0.0 }, - is_sufficient: true, - strategy_used: format!("agent(l={},a={})", llm_calls, answer_len), - reasoning_chain: crate::retrieval::ReasoningChain::default(), - tokens_used: answer_len, - }; - let _ = retrieve_tx - .send(RetrieveEvent::Completed { response }) - .await; - break; // Completed is terminal - } - AgentEvent::Error { stage, message } => { - let _ = retrieve_tx - .send(RetrieveEvent::Error { - message: format!("[{}] {}", stage, message), - }) - .await; - break; // Error is terminal - } - }; - - // For non-terminal events, send the translated event - if !matches!( - translated, - RetrieveEvent::Completed { .. } | RetrieveEvent::Error { .. 
}
-            ) {
-                if retrieve_tx.send(translated).await.is_err() {
-                    break; // Receiver dropped
-                }
-            }
-        });
+        // Build DocContexts and dispatch
+        let doc_contexts: Vec<crate::agent::DocContext> = documents
+            .iter()
+            .map(|(tree, nav, ridx, id)| crate::agent::DocContext {
+                tree,
+                nav_index: nav,
+                reasoning_index: ridx,
+                doc_name: id.as_str(),
+            })
+            .collect();
+
+        let skip_analysis = !ids.is_empty();
+        let scope = if skip_analysis {
+            crate::agent::Scope::Specified(doc_contexts)
+        } else {
+            crate::agent::Scope::Workspace(crate::agent::WorkspaceContext::new(doc_contexts))
+        };
 
-        // Run the agent in a background task
+        let emitter = crate::agent::EventEmitter::noop();
         let config = self.retriever.config().clone();
         let llm = self.retriever.llm().clone();
-        let emitter = crate::agent::EventEmitter::new(agent_tx);
-        let metrics_hub = Arc::clone(&self.metrics_hub);
-        let start = std::time::Instant::now();
-
-        tokio::spawn(async move {
-            // Prepare owned indices (fill defaults for missing)
-            let owned_docs: Vec<(
-                String,
-                crate::storage::PersistedDocument,
-                crate::document::NavigationIndex,
-                crate::document::ReasoningIndex,
-            )> = docs
-                .into_iter()
-                .map(|(id, doc)| {
-                    let nav = doc.navigation_index.clone().unwrap_or_default();
-                    let ridx = doc.reasoning_index.clone().unwrap_or_default();
-                    (id, doc, nav, ridx)
-                })
-                .collect();
-
-            // All streaming queries are user-specified docs → always use Scope::Specified
-            let doc_contexts: Vec<crate::agent::DocContext> = owned_docs
-                .iter()
-                .map(|(id, doc, nav, ridx)| crate::agent::DocContext {
-                    tree: &doc.tree,
-                    nav_index: nav,
-                    reasoning_index: ridx,
-                    doc_name: id.as_str(),
-                })
-                .collect();
-            let scope = crate::agent::Scope::Specified(doc_contexts);
-            let result =
-                crate::retrieval::dispatcher::dispatch(&query, scope, &config, &llm, &emitter)
-                    .await;
-
-            // Bridge agent metrics into global MetricsHub
-            if let Ok(output) = result {
-                let m = &output.metrics;
-                let elapsed = start.elapsed();
-                metrics_hub.record_retrieval_query(
-                    m.rounds_used as u64,
-                    m.nodes_visited as u64,
-                    elapsed.as_millis() as u64,
-                );
-            }
-        });
+        let output =
+            crate::retrieval::dispatcher::dispatch(input, scope, &config, &llm, &emitter).await?;
 
-        Ok(retrieve_rx)
+        // Convert Output -> Answer
+        Ok(Self::output_to_answer(&output))
     }
 
-    // ============================================================
-    // Document Management
-    // ============================================================
-
-    /// Get a list of all indexed documents.
-    pub async fn list(&self) -> Result<Vec<DocumentInfo>> {
-        self.workspace.list().await
+    /// Remove a document from the workspace.
+    pub async fn forget(&self, doc_id: &str) -> Result<()> {
+        self.workspace.remove(doc_id).await?;
+        Ok(())
     }
 
-    /// Remove a document from the workspace.
-    pub async fn remove(&self, doc_id: &str) -> Result<bool> {
-        self.workspace.remove(doc_id).await
+    /// List all understood documents.
+    ///
+    /// Returns [`Vec<DocumentInfo>`] with summary, structure, and concepts
+    /// for each document.
+    pub async fn list_documents(&self) -> Result<Vec<crate::document::DocumentInfo>> {
+        let ids = self.workspace.inner().list_documents().await;
+        let mut result = Vec::new();
+        for id in ids {
+            match self.workspace.load(&id).await {
+                Ok(Some(persisted)) => {
+                    result.push(Self::persisted_to_understanding_document(persisted).info());
+                }
+                Ok(None) => {
+                    tracing::warn!(doc_id = %id, "Document in index but not in storage");
+                }
+                Err(e) => {
+                    tracing::warn!(doc_id = %id, error = %e, "Failed to load document");
+                }
+            }
+        }
+        Ok(result)
     }
 
+    // ============================================================
+    // Utility Methods
+    // ============================================================
+
     /// Check if a document exists in the workspace.
     pub async fn exists(&self, doc_id: &str) -> Result<bool> {
         self.workspace.exists(doc_id).await
@@ -821,6 +563,55 @@ impl Engine {
         self.metrics_hub.generate_report()
     }
 
+    // ============================================================
+    // Internal: type conversions
+    // ============================================================
+
+    /// Convert a PersistedDocument to a Document (understanding type).
+    fn persisted_to_understanding_document(persisted: PersistedDocument) -> UnderstandingDocument {
+        let nav_index = persisted.navigation_index.unwrap_or_default();
+        let reasoning_index = persisted.reasoning_index.unwrap_or_default();
+        let tree = persisted.tree;
+
+        let section_count = tree.node_count();
+
+        UnderstandingDocument {
+            doc_id: persisted.meta.id,
+            name: persisted.meta.name,
+            format: persisted.meta.format,
+            source_path: persisted.meta.source_path.map(|p| p.to_string_lossy().to_string()),
+            tree,
+            nav_index,
+            reasoning_index,
+            summary: persisted.meta.description.unwrap_or_default(),
+            concepts: Vec::new(), // Will be populated by pipeline Stage 7
+            page_count: persisted.meta.page_count,
+            section_count,
+        }
+    }
+
+    /// Convert agent Output to public Answer type.
+    fn output_to_answer(output: &crate::agent::Output) -> Answer {
+        // Build evidence
+        let evidence: Vec<Evidence> = output
+            .evidence
+            .iter()
+            .map(|e| Evidence {
+                content: e.content.clone(),
+                source_path: e.source_path.clone(),
+                doc_name: e.doc_name.clone().unwrap_or_default(),
+                relevance: 0.0,
+            })
+            .collect();
+
+        Answer {
+            content: output.answer.clone(),
+            evidence,
+            confidence: output.confidence,
+            trace: ReasoningTrace::empty(), // TODO: wire up actual trace collection
+        }
+    }
+
     // ============================================================
     // Internal
     // ============================================================
@@ -875,20 +666,6 @@ impl Engine {
         }
     }
 
-    /// Resolve QueryScope into a list of document IDs.
-    async fn resolve_scope(&self, scope: &QueryScope) -> Result<Vec<String>> {
-        match scope {
-            QueryScope::Documents(ids) => Ok(ids.clone()),
-            QueryScope::Workspace => {
-                let docs = self.list().await?;
-                if docs.is_empty() {
-                    return Err(Error::Config("Workspace is empty".to_string()));
-                }
-                Ok(docs.into_iter().map(|d| d.id).collect())
-            }
-        }
-    }
-
     /// Build pipeline options for pipeline execution (with checkpoint dir).
     ///
     /// This is the single source of truth for pipeline configuration.
@@ -1074,7 +851,7 @@ impl Engine {
         Ok(())
     }
 
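With `output_to_answer` in place, the full ask-side contract is visible from the caller's seat. A minimal sketch of walking the result, using only the types and fields this patch defines (the `show_answer` helper itself is hypothetical):

```rust
use vectorless::{Answer, Engine, Result};

// Hypothetical helper: print an Answer together with its proof trail.
async fn show_answer(engine: &Engine, question: &str) -> Result<()> {
    // An empty id slice means: search every understood document.
    let answer: Answer = engine.ask(question, &[]).await?;
    println!("{} (confidence {:.2})", answer.content, answer.confidence);
    for ev in &answer.evidence {
        println!("  evidence: [{}] {}", ev.doc_name, ev.source_path);
    }
    for step in &answer.trace.steps {
        println!("  round {}: {} -> {}", step.round, step.action, step.observation);
    }
    Ok(())
}
```

Note that with the `ReasoningTrace::empty()` placeholder above, `trace.steps` is currently always empty; the TODO marks where real trace collection will land.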
-    /// Extract keyword → weight map from a persisted document's ReasoningIndex.
+    /// Extract keyword -> weight map from a persisted document's ReasoningIndex.
     fn extract_keywords_from_doc(doc: &PersistedDocument) -> HashMap<String, f32> {
         let mut keywords = HashMap::new();
         if let Some(ref ri) = doc.reasoning_index {
@@ -1111,7 +888,7 @@ mod tests {
     use super::*;
     use crate::client::types::IndexMode;
 
-    // ── resolve_index_action Default mode ──────────────────────────────────
+    // -- resolve_index_action Default mode ----------------------------------------------
 
     // We can't call resolve_index_action without a workspace, but we can
     // verify IndexMode equality logic used inside.
@@ -1123,9 +900,9 @@
         assert_ne!(mode, IndexMode::Incremental);
     }
 
-    // ── build_index_item ──────────────────────────────────────────────────
+    // -- build_index_item ----------------------------------------------------------------
 
-    // Build_index_item only transforms data — no I/O.
+    // Build_index_item only transforms data -- no I/O.
     use crate::client::indexed_document::IndexedDocument;
 
     fn make_doc() -> IndexedDocument {
diff --git a/rust/src/client/mod.rs b/rust/src/client/mod.rs
index 8a370e57..fc33a594 100644
--- a/rust/src/client/mod.rs
+++ b/rust/src/client/mod.rs
@@ -95,8 +95,8 @@ pub use query_context::QueryContext;
 // ============================================================
 
 pub use types::{
-    Confidence, DocumentInfo, EvidenceItem, FailedItem, IndexItem, IndexMode, IndexOptions,
-    IndexResult, QueryMetrics, QueryResult, QueryResultItem,
+    Confidence, EvidenceItem, FailedItem, IndexItem, IndexMode, IndexOptions, IndexResult,
+    QueryMetrics, QueryResult, QueryResultItem,
 };
 
 // ============================================================
diff --git a/rust/src/document/mod.rs b/rust/src/document/mod.rs
index d3de3bfc..18f77ce0 100644
--- a/rust/src/document/mod.rs
+++ b/rust/src/document/mod.rs
@@ -24,6 +24,7 @@ mod serde_helpers;
 mod structure;
 mod toc;
 mod tree;
+pub mod understanding;
 
 pub use navigation::{ChildRoute, DocCard, NavEntry, NavigationIndex, SectionCard};
 pub use node::{NodeId, TreeNode};
@@ -35,3 +36,6 @@ pub use reference::ReferenceExtractor;
 pub use structure::{DocumentStructure, StructureNode};
 pub use toc::{TocConfig, TocEntry, TocNode, TocView};
 pub use tree::{DocumentTree, RetrievalIndex};
+pub use understanding::{
+    Answer, Concept, Document, DocumentInfo, Evidence, IngestInput, ReasoningTrace, TraceStep,
+};
diff --git a/rust/src/document/understanding.rs b/rust/src/document/understanding.rs
new file mode 100644
index 00000000..3249c4c3
--- /dev/null
+++ b/rust/src/document/understanding.rs
@@ -0,0 +1,272 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Understanding types — the core objects that define the Document Understanding Engine.
+//!
+//! These types form the stable public contract:
+//! - [`Document`] — the unified post-ingest artifact (internal first-class citizen)
+//! - [`DocumentInfo`] — what `ingest()` returns to users
+//! - [`Concept`] — key concept extracted from a document
+//! - [`Answer`] — what `ask()` returns
+//! - [`Evidence`] — proof trail for an answer
+//! - [`ReasoningTrace`] / [`TraceStep`] — always-mandatory reasoning trace
+
+use serde::{Deserialize, Serialize};
+
+use super::toc::TocNode;
+
+// ---------------------------------------------------------------------------
+// Document — unified post-ingest artifact
+// ---------------------------------------------------------------------------
+
+/// An understood document — the core artifact of the understand phase.
+///
+/// This is what `ingest()` produces internally and what `ask()` consumes.
+/// It unifies tree + navigation index + reasoning index + summary + concepts
+/// into a single first-class type, replacing the previous loose coupling of
+/// `DocContext { &tree, &nav, &reasoning }`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Document {
+    /// Unique document identifier.
+    pub doc_id: String,
+    /// Document name/title.
+    pub name: String,
+    /// Document format ("pdf", "markdown", "docx").
+    pub format: String,
+    /// Source file path (if indexed from a file).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub source_path: Option<String>,
+
+    // ── Three indexes (engine internal) ──
+    /// Hierarchical semantic tree.
+    pub tree: super::tree::DocumentTree,
+    /// Pre-computed navigation structure.
+    pub nav_index: super::navigation::NavigationIndex,
+    /// Keyword / topic / section summaries.
+    pub reasoning_index: super::reasoning::ReasoningIndex,
+
+    // ── Understanding results (ingest stage output) ──
+    /// Document-level summary.
+    pub summary: String,
+    /// Key concepts the engine identified.
+    #[serde(default)]
+    pub concepts: Vec<Concept>,
+
+    // ── Metadata ──
+    /// Page count (for PDFs).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub page_count: Option<usize>,
+    /// Number of sections in the tree.
+    #[serde(default)]
+    pub section_count: usize,
+}
+
+// ---------------------------------------------------------------------------
+// DocumentInfo — what ingest() returns to users
+// ---------------------------------------------------------------------------
+
+/// The engine's understanding of a document — returned by `ingest()`.
+///
+/// Rich enough for users to confirm the engine "got it right":
+/// summary, structure (TOC), and key concepts.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DocumentInfo {
+    /// Unique document identifier.
+    pub doc_id: String,
+    /// Document name.
+    pub name: String,
+    /// Document format ("pdf", "markdown", "docx").
+    pub format: String,
+    /// Document-level summary — what this document is about.
+    pub summary: String,
+    /// Table of contents — the document's structure as the engine sees it.
+    pub structure: TocNode,
+    /// Key concepts the engine identified.
+    pub concepts: Vec<Concept>,
+    /// Number of sections in the document.
+    pub section_count: usize,
+    /// Page count (for PDFs).
+    pub page_count: Option<usize>,
+}
+
+impl Document {
+    /// Produce the public DocumentInfo view of this document.
+    pub fn info(&self) -> DocumentInfo {
+        let toc = super::toc::TocView::new().generate(&self.tree);
+        DocumentInfo {
+            doc_id: self.doc_id.clone(),
+            name: self.name.clone(),
+            format: self.format.clone(),
+            summary: self.summary.clone(),
+            structure: toc,
+            concepts: self.concepts.clone(),
+            section_count: self.section_count,
+            page_count: self.page_count,
+        }
+    }
+}
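Both structs derive `Serialize`/`Deserialize`, so the result of `ingest()` can be cached or shipped across a process boundary as JSON. A quick sketch, assuming `serde_json` is available as it is in this crate's tests:

```rust
use vectorless::DocumentInfo;

// Serialize the engine's understanding, then read it back unchanged.
fn roundtrip(info: &DocumentInfo) -> Result<DocumentInfo, serde_json::Error> {
    let json = serde_json::to_string_pretty(info)?;
    serde_json::from_str(&json)
}
```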
+// ---------------------------------------------------------------------------
+// Concept
+// ---------------------------------------------------------------------------
+
+/// A key concept extracted from a document.
+///
+/// Produced during the ingest pipeline's final concept extraction step.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Concept {
+    /// Concept name (e.g., "capacitor derating").
+    pub name: String,
+    /// One-sentence explanation.
+    pub summary: String,
+    /// Which sections this concept appears in.
+    pub sections: Vec<String>,
+}
+
+// ---------------------------------------------------------------------------
+// Answer — what ask() returns
+// ---------------------------------------------------------------------------
+
+/// The result of `ask()` — a reasoned answer with evidence and trace.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Answer {
+    /// The answer content.
+    pub content: String,
+    /// Evidence supporting the answer.
+    pub evidence: Vec<Evidence>,
+    /// Confidence score (0.0–1.0).
+    pub confidence: f32,
+    /// Reasoning trace — how the agent arrived at this answer. Always present.
+    pub trace: ReasoningTrace,
+}
+
+// ---------------------------------------------------------------------------
+// Evidence
+// ---------------------------------------------------------------------------
+
+/// A piece of evidence supporting an answer — with source attribution.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Evidence {
+    /// Original document text.
+    pub content: String,
+    /// Navigation path (e.g., "Root/Chapter 3/Section 3.2").
+    pub source_path: String,
+    /// Which document this evidence came from.
+    pub doc_name: String,
+    /// Relevance to the question (0.0–1.0).
+    pub relevance: f32,
+}
+
+// ---------------------------------------------------------------------------
+// ReasoningTrace — always mandatory
+// ---------------------------------------------------------------------------
+
+/// Reasoning trace — how the agent arrived at the answer. Always present.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ReasoningTrace {
+    /// The steps the agent took.
+    pub steps: Vec<TraceStep>,
+}
+
+impl ReasoningTrace {
+    /// Create an empty trace.
+    pub fn empty() -> Self {
+        Self { steps: Vec::new() }
+    }
+
+    /// Create a trace with a single step.
+    pub fn single(action: impl Into<String>, observation: impl Into<String>, round: u32) -> Self {
+        Self {
+            steps: vec![TraceStep {
+                action: action.into(),
+                observation: observation.into(),
+                round,
+            }],
+        }
+    }
+
+    /// Add a step to the trace.
+    pub fn push(&mut self, action: impl Into<String>, observation: impl Into<String>, round: u32) {
+        self.steps.push(TraceStep {
+            action: action.into(),
+            observation: observation.into(),
+            round,
+        });
+    }
+}
+
+/// A single step in the reasoning trace.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TraceStep {
+    /// What the agent did (e.g., "cd Chapter 3").
+    pub action: String,
+    /// What the agent observed (e.g., "Found 5 sections about...").
+    pub observation: String,
+    /// Which round this step was in.
+    pub round: u32,
+}
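Since the trace is mandatory on every `Answer`, whatever produces one has to record as it goes. A hypothetical recording loop built from the helpers above (the commands and observations are invented, not actual agent output):

```rust
use vectorless::ReasoningTrace;

fn record_example() -> ReasoningTrace {
    let mut trace = ReasoningTrace::empty();
    // Each navigation round appends (action, observation, round).
    trace.push("ls", "Root: 3 chapters", 0);
    trace.push("cd Chapter 3", "Found 5 sections about derating", 1);
    trace.push("cat Section 3.2", "Collected 2 evidence snippets", 2);
    trace
}
```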
+// ---------------------------------------------------------------------------
+// IngestInput — what ingest() takes
+// ---------------------------------------------------------------------------
+
+/// Input to `ingest()` — the document to be understood.
+#[derive(Debug, Clone)]
+pub enum IngestInput {
+    /// Index from a file path.
+    Path(std::path::PathBuf),
+    /// Index from raw bytes.
+    Bytes {
+        /// Document name.
+        name: String,
+        /// Raw document bytes.
+        data: Vec<u8>,
+        /// Document format.
+        format: super::super::index::parse::DocumentFormat,
+    },
+    /// Index from a text string.
+    Text {
+        /// Document name.
+        name: String,
+        /// Document content.
+        content: String,
+    },
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_reasoning_trace_empty() {
+        let trace = ReasoningTrace::empty();
+        assert!(trace.steps.is_empty());
+    }
+
+    #[test]
+    fn test_reasoning_trace_single() {
+        let trace = ReasoningTrace::single("cd Chapter 3", "Found 5 sections", 1);
+        assert_eq!(trace.steps.len(), 1);
+        assert_eq!(trace.steps[0].action, "cd Chapter 3");
+        assert_eq!(trace.steps[0].round, 1);
+    }
+
+    #[test]
+    fn test_reasoning_trace_push() {
+        let mut trace = ReasoningTrace::empty();
+        trace.push("ls", "Root with 3 children", 0);
+        trace.push("cd Chapter 2", "Found target section", 1);
+        assert_eq!(trace.steps.len(), 2);
+    }
+
+    #[test]
+    fn test_concept_serialization() {
+        let concept = Concept {
+            name: "capacitor derating".into(),
+            summary: "Reducing capacitor specs for reliability".into(),
+            sections: vec!["Section 3.2".into()],
+        };
+        let json = serde_json::to_string(&concept).unwrap();
+        assert!(json.contains("capacitor derating"));
+    }
+}
diff --git a/rust/src/lib.rs b/rust/src/lib.rs
index a4263042..d2f75e6e 100644
--- a/rust/src/lib.rs
+++ b/rust/src/lib.rs
@@ -5,16 +5,15 @@
 //! # Vectorless
 //!
-//! A reasoning-native document engine for AI.
+//! A Document Understanding Engine for AI.
 //!
-//! It will reason through any of your structured documents — **PDFs, Markdown,
-//! reports, contracts** — and retrieve only what's relevant. Nothing more,
-//! nothing less.
+//! It compiles documents into structured trees of meaning, then dispatches
+//! multiple agents to reason through headings, sections, and paragraphs.
 //!
 //! ## Quick Start
 //!
 //! ```rust,no_run
-//! use vectorless::{EngineBuilder, IndexContext, QueryContext};
+//! use vectorless::{EngineBuilder, IngestInput};
 //!
 //! #[tokio::main]
 //! async fn main() -> Result<(), Box<dyn std::error::Error>> {
 //!     let engine = EngineBuilder::new()
@@ -25,14 +24,13 @@
 //!         .build()
 //!         .await?;
 //!
-//!     let result = engine.index(IndexContext::from_path("./document.md")).await?;
-//!     let doc_id = result.doc_id().unwrap();
+//!     // Understand a document
+//!     let doc = engine.ingest(IngestInput::Path("./report.pdf".into())).await?;
+//!     println!("{}: {}", doc.name, doc.summary);
 //!
-//!     let result = engine.query(
-//!         QueryContext::new("What is this about?")
-//!             .with_doc_ids(vec![doc_id.to_string()]),
-//!     ).await?;
-//!     println!("{}", result.content);
+//!     // Ask a question
+//!     let answer = engine.ask("What is the total revenue?", &[doc.doc_id.clone()]).await?;
+//!     println!("{}", answer.content);
 //!
 //!     Ok(())
 //! 
} @@ -61,19 +59,16 @@ mod utils; // ── Public API ─────────────────────────────────────────────────────────────── // Client -pub use client::{ - BuildError, Confidence, DocumentFormat, DocumentInfo, Engine, EngineBuilder, EvidenceItem, - FailedItem, IndexContext, IndexItem, IndexMode, IndexOptions, IndexResult, QueryContext, - QueryMetrics, QueryResult, QueryResultItem, -}; +pub use client::{BuildError, Engine, EngineBuilder}; // Config pub use config::Config; -// Documents +// Documents (understanding types) pub use document::{ - DocumentStructure, DocumentTree, NodeId, ReasoningIndexConfig, StructureNode, TocConfig, - TocEntry, TocNode, TocView, TreeNode, + Answer, Concept, Document, DocumentInfo, DocumentStructure, DocumentTree, Evidence, + IngestInput, NodeId, ReasoningIndexConfig, ReasoningTrace, StructureNode, TocConfig, + TocEntry, TocNode, TocView, TraceStep, TreeNode, }; // Graph @@ -85,9 +80,6 @@ pub use events::{EventEmitter, IndexEvent, QueryEvent, WorkspaceEvent}; // Metrics pub use metrics::{IndexMetrics, LlmMetricsReport, MetricsReport, RetrievalMetricsReport}; -// Retrieval (streaming) -pub use retrieval::{RetrieveEvent, SufficiencyLevel}; - // Errors pub use error::{Error, Result}; From d5389a2aaf89f489e48eec1038155130a00a5369 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 22 Apr 2026 22:36:59 +0800 Subject: [PATCH 02/28] refactor(docs): update project structure and rename engine description - Rename "reasoning-native document intelligence engine" to "Document Understanding Engine for AI" - Update project structure to reflect cargo workspace with vectorless-core/vectorless and vectorless-py crates - Change Engine.query() to Engine.ask() in retrieval flow - Update build commands to use workspace root - Adjust development workflow paths to use crates/vectorless - Update Python binding paths to crates/vectorless-py/src/lib.rs - Add Python SDK development notes --- CLAUDE.md | 61 ++- Cargo.toml | 2 +- pyproject.toml | 6 +- python/README.md | 239 --------- python/src/context.rs | 282 ---------- python/src/results.rs | 506 ------------------ python/src/streaming.rs | 179 ------- python/vectorless/__init__.py | 71 --- python/vectorless/_core.py | 54 -- rust/tests/integration.rs | 165 ------ {python/tests => tests}/__init__.py | 0 {python/tests => tests}/conftest.py | 0 {python/tests => tests}/test_cli/__init__.py | 0 .../tests => tests}/test_compat/__init__.py | 0 {python/tests => tests}/test_config.py | 0 {python/tests => tests}/test_events.py | 0 {python/tests => tests}/test_session.py | 0 {python/tests => tests}/test_types.py | 0 .../vectorless-py}/Cargo.toml | 8 +- .../vectorless-py}/src/answer.rs | 2 +- .../vectorless-py}/src/config.rs | 0 .../vectorless-py}/src/document.rs | 0 .../vectorless-py}/src/engine.rs | 0 .../vectorless-py}/src/error.rs | 0 .../vectorless-py}/src/graph.rs | 0 .../vectorless-py}/src/lib.rs | 0 .../vectorless-py}/src/metrics.rs | 0 .../vectorless}/Cargo.toml | 2 +- .../vectorless}/examples/deep_retrieval.rs | 0 .../vectorless}/examples/events.rs | 0 .../vectorless}/examples/flow.rs | 0 .../vectorless}/examples/graph.rs | 0 .../vectorless}/examples/index_directory.rs | 0 .../vectorless}/examples/index_incremental.rs | 0 .../vectorless}/examples/index_pdf.rs | 0 .../vectorless}/examples/index_single.rs | 0 .../vectorless}/examples/indexing.rs | 0 .../vectorless}/examples/indexing_flow.rs | 0 .../vectorless}/examples/query.rs | 0 .../examples/single_doc_challenge.rs | 0 .../vectorless}/src/agent/command.rs | 0 
.../vectorless}/src/agent/config.rs | 0 .../vectorless}/src/agent/context.rs | 0 .../vectorless}/src/agent/events.rs | 0 .../vectorless}/src/agent/mod.rs | 0 .../src/agent/orchestrator/analyze.rs | 0 .../src/agent/orchestrator/dispatch.rs | 0 .../src/agent/orchestrator/evaluate.rs | 0 .../vectorless}/src/agent/orchestrator/mod.rs | 0 .../src/agent/orchestrator/replan.rs | 0 .../src/agent/orchestrator/supervisor.rs | 0 .../vectorless}/src/agent/prompts.rs | 0 .../vectorless}/src/agent/state.rs | 0 .../vectorless}/src/agent/tools/common.rs | 0 .../vectorless}/src/agent/tools/mod.rs | 0 .../src/agent/tools/orchestrator.rs | 0 .../vectorless}/src/agent/tools/worker/cat.rs | 0 .../vectorless}/src/agent/tools/worker/cd.rs | 0 .../src/agent/tools/worker/find.rs | 0 .../src/agent/tools/worker/grep.rs | 0 .../src/agent/tools/worker/head.rs | 0 .../vectorless}/src/agent/tools/worker/ls.rs | 0 .../vectorless}/src/agent/tools/worker/mod.rs | 0 .../vectorless}/src/agent/tools/worker/pwd.rs | 0 .../vectorless}/src/agent/tools/worker/wc.rs | 0 .../vectorless}/src/agent/worker/execute.rs | 0 .../vectorless}/src/agent/worker/format.rs | 0 .../vectorless}/src/agent/worker/mod.rs | 0 .../src/agent/worker/navigation.rs | 0 .../vectorless}/src/agent/worker/planning.rs | 0 .../vectorless}/src/client/builder.rs | 0 .../vectorless}/src/client/engine.rs | 0 .../vectorless}/src/client/index_context.rs | 0 .../src/client/indexed_document.rs | 0 .../vectorless}/src/client/indexer.rs | 0 .../vectorless}/src/client/mod.rs | 0 .../vectorless}/src/client/query_context.rs | 0 .../vectorless}/src/client/retriever.rs | 0 .../vectorless}/src/client/test_support.rs | 0 .../vectorless}/src/client/types.rs | 0 .../vectorless}/src/client/workspace.rs | 0 .../vectorless}/src/config/mod.rs | 0 .../vectorless}/src/config/types/indexer.rs | 0 .../vectorless}/src/config/types/llm_pool.rs | 0 .../vectorless}/src/config/types/metrics.rs | 0 .../vectorless}/src/config/types/mod.rs | 0 .../vectorless}/src/config/types/retrieval.rs | 0 .../vectorless}/src/config/types/storage.rs | 0 .../vectorless}/src/config/validator.rs | 0 .../vectorless}/src/document/mod.rs | 0 .../vectorless}/src/document/navigation.rs | 0 .../vectorless}/src/document/node.rs | 0 .../vectorless}/src/document/reasoning.rs | 0 .../vectorless}/src/document/reference.rs | 0 .../vectorless}/src/document/serde_helpers.rs | 0 .../vectorless}/src/document/structure.rs | 0 .../vectorless}/src/document/toc.rs | 0 .../vectorless}/src/document/tree.rs | 0 .../vectorless}/src/document/understanding.rs | 0 .../vectorless}/src/error.rs | 0 .../vectorless}/src/events/emitter.rs | 0 .../vectorless}/src/events/mod.rs | 0 .../vectorless}/src/events/types.rs | 0 .../vectorless}/src/graph/builder.rs | 0 .../vectorless}/src/graph/config.rs | 0 .../vectorless}/src/graph/mod.rs | 0 .../vectorless}/src/graph/types.rs | 0 .../vectorless}/src/index/config.rs | 0 .../src/index/incremental/detector.rs | 0 .../vectorless}/src/index/incremental/mod.rs | 0 .../src/index/incremental/resolver.rs | 0 .../src/index/incremental/updater.rs | 0 .../vectorless}/src/index/mod.rs | 0 .../src/index/parse/markdown/config.rs | 0 .../src/index/parse/markdown/frontmatter.rs | 0 .../src/index/parse/markdown/mod.rs | 0 .../src/index/parse/markdown/parser.rs | 0 .../vectorless}/src/index/parse/mod.rs | 0 .../vectorless}/src/index/parse/pdf/mod.rs | 0 .../vectorless}/src/index/parse/pdf/parser.rs | 0 .../vectorless}/src/index/parse/pdf/types.rs | 0 .../src/index/parse/toc/assigner.rs | 0 
.../src/index/parse/toc/detector.rs | 0 .../vectorless}/src/index/parse/toc/mod.rs | 0 .../vectorless}/src/index/parse/toc/parser.rs | 0 .../src/index/parse/toc/processor.rs | 0 .../src/index/parse/toc/repairer.rs | 0 .../index/parse/toc/structure_extractor.rs | 0 .../vectorless}/src/index/parse/toc/types.rs | 0 .../src/index/parse/toc/verifier.rs | 0 .../vectorless}/src/index/parse/types.rs | 0 .../src/index/pipeline/checkpoint.rs | 0 .../vectorless}/src/index/pipeline/context.rs | 0 .../src/index/pipeline/executor.rs | 0 .../vectorless}/src/index/pipeline/metrics.rs | 0 .../vectorless}/src/index/pipeline/mod.rs | 0 .../src/index/pipeline/orchestrator.rs | 0 .../vectorless}/src/index/pipeline/policy.rs | 0 .../vectorless}/src/index/stages/build.rs | 0 .../vectorless}/src/index/stages/enhance.rs | 0 .../vectorless}/src/index/stages/enrich.rs | 0 .../vectorless}/src/index/stages/mod.rs | 0 .../src/index/stages/navigation.rs | 0 .../vectorless}/src/index/stages/optimize.rs | 0 .../vectorless}/src/index/stages/parse.rs | 0 .../vectorless}/src/index/stages/reasoning.rs | 0 .../vectorless}/src/index/stages/split.rs | 0 .../vectorless}/src/index/stages/validate.rs | 0 .../vectorless}/src/index/summary/full.rs | 0 .../vectorless}/src/index/summary/lazy.rs | 0 .../vectorless}/src/index/summary/mod.rs | 0 .../src/index/summary/selective.rs | 0 .../vectorless}/src/index/summary/strategy.rs | 0 .../vectorless}/src/lib.rs | 0 .../vectorless}/src/llm/client.rs | 0 .../vectorless}/src/llm/config.rs | 0 .../vectorless}/src/llm/error.rs | 0 .../vectorless}/src/llm/executor.rs | 0 .../vectorless}/src/llm/fallback.rs | 0 .../vectorless}/src/llm/memo/mod.rs | 0 .../vectorless}/src/llm/memo/store.rs | 0 .../vectorless}/src/llm/memo/types.rs | 0 .../vectorless}/src/llm/mod.rs | 0 .../vectorless}/src/llm/pool.rs | 0 .../vectorless}/src/llm/throttle.rs | 0 .../vectorless}/src/metrics/hub.rs | 0 .../vectorless}/src/metrics/index.rs | 0 .../vectorless}/src/metrics/llm.rs | 0 .../vectorless}/src/metrics/mod.rs | 0 .../vectorless}/src/metrics/retrieval.rs | 0 .../vectorless}/src/query/mod.rs | 0 .../vectorless}/src/query/types.rs | 0 .../vectorless}/src/query/understand.rs | 0 .../vectorless}/src/rerank/dedup.rs | 0 .../vectorless}/src/rerank/mod.rs | 0 .../vectorless}/src/rerank/types.rs | 0 .../vectorless}/src/retrieval/cache.rs | 0 .../vectorless}/src/retrieval/dispatcher.rs | 0 .../vectorless}/src/retrieval/mod.rs | 0 .../src/retrieval/postprocessor.rs | 0 .../vectorless}/src/retrieval/stream.rs | 0 .../vectorless}/src/retrieval/types.rs | 0 .../vectorless}/src/scoring/bm25.rs | 0 .../vectorless}/src/scoring/mod.rs | 0 .../vectorless}/src/storage/backend/file.rs | 0 .../vectorless}/src/storage/backend/memory.rs | 0 .../vectorless}/src/storage/backend/mod.rs | 0 .../src/storage/backend/trait_def.rs | 0 .../vectorless}/src/storage/cache.rs | 0 .../vectorless}/src/storage/codec.rs | 0 .../vectorless}/src/storage/lock.rs | 0 .../vectorless}/src/storage/migration.rs | 0 .../vectorless}/src/storage/mod.rs | 0 .../vectorless}/src/storage/persistence.rs | 0 .../vectorless}/src/storage/workspace.rs | 0 .../vectorless}/src/utils/fingerprint.rs | 0 .../vectorless}/src/utils/mod.rs | 0 .../vectorless}/src/utils/token.rs | 0 .../vectorless}/src/utils/validation.rs | 0 vectorless/README.md | 165 ++++++ vectorless/__init__.py | 67 +++ .../vectorless => vectorless}/_async_utils.py | 0 .../_compat/__init__.py | 0 .../_compat/langchain.py | 0 .../_compat/llamaindex.py | 0 vectorless/_core.py | 48 ++ .../vectorless => 
vectorless}/cli/__init__.py | 0 .../cli/commands/__init__.py | 0 .../cli/commands/add.py | 0 .../cli/commands/ask.py | 0 .../cli/commands/config_cmd.py | 0 .../cli/commands/info.py | 0 .../cli/commands/init.py | 0 .../cli/commands/list_cmd.py | 0 .../cli/commands/query.py | 0 .../cli/commands/remove.py | 0 .../cli/commands/stats.py | 0 .../cli/commands/tree.py | 0 {python/vectorless => vectorless}/cli/main.py | 0 .../vectorless => vectorless}/cli/output.py | 0 .../config/__init__.py | 0 .../config/loading.py | 0 .../config/models.py | 0 {python/vectorless => vectorless}/events.py | 0 {python/vectorless => vectorless}/jupyter.py | 0 {python/vectorless => vectorless}/py.typed | 0 {python/vectorless => vectorless}/session.py | 0 .../vectorless => vectorless}/streaming.py | 0 .../vectorless => vectorless}/sync_session.py | 0 .../types/__init__.py | 0 .../vectorless => vectorless}/types/graph.py | 0 .../types/results.py | 0 232 files changed, 324 insertions(+), 1533 deletions(-) delete mode 100644 python/README.md delete mode 100644 python/src/context.rs delete mode 100644 python/src/results.rs delete mode 100644 python/src/streaming.rs delete mode 100644 python/vectorless/__init__.py delete mode 100644 python/vectorless/_core.py delete mode 100644 rust/tests/integration.rs rename {python/tests => tests}/__init__.py (100%) rename {python/tests => tests}/conftest.py (100%) rename {python/tests => tests}/test_cli/__init__.py (100%) rename {python/tests => tests}/test_compat/__init__.py (100%) rename {python/tests => tests}/test_config.py (100%) rename {python/tests => tests}/test_events.py (100%) rename {python/tests => tests}/test_session.py (100%) rename {python/tests => tests}/test_types.py (100%) rename {python => vectorless-core/vectorless-py}/Cargo.toml (76%) rename {python => vectorless-core/vectorless-py}/src/answer.rs (97%) rename {python => vectorless-core/vectorless-py}/src/config.rs (100%) rename {python => vectorless-core/vectorless-py}/src/document.rs (100%) rename {python => vectorless-core/vectorless-py}/src/engine.rs (100%) rename {python => vectorless-core/vectorless-py}/src/error.rs (100%) rename {python => vectorless-core/vectorless-py}/src/graph.rs (100%) rename {python => vectorless-core/vectorless-py}/src/lib.rs (100%) rename {python => vectorless-core/vectorless-py}/src/metrics.rs (100%) rename {rust => vectorless-core/vectorless}/Cargo.toml (98%) rename {rust => vectorless-core/vectorless}/examples/deep_retrieval.rs (100%) rename {rust => vectorless-core/vectorless}/examples/events.rs (100%) rename {rust => vectorless-core/vectorless}/examples/flow.rs (100%) rename {rust => vectorless-core/vectorless}/examples/graph.rs (100%) rename {rust => vectorless-core/vectorless}/examples/index_directory.rs (100%) rename {rust => vectorless-core/vectorless}/examples/index_incremental.rs (100%) rename {rust => vectorless-core/vectorless}/examples/index_pdf.rs (100%) rename {rust => vectorless-core/vectorless}/examples/index_single.rs (100%) rename {rust => vectorless-core/vectorless}/examples/indexing.rs (100%) rename {rust => vectorless-core/vectorless}/examples/indexing_flow.rs (100%) rename {rust => vectorless-core/vectorless}/examples/query.rs (100%) rename {rust => vectorless-core/vectorless}/examples/single_doc_challenge.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/command.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/config.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/context.rs (100%) rename {rust => 
vectorless-core/vectorless}/src/agent/events.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/orchestrator/analyze.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/orchestrator/dispatch.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/orchestrator/evaluate.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/orchestrator/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/orchestrator/replan.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/orchestrator/supervisor.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/prompts.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/state.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/tools/common.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/tools/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/tools/orchestrator.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/tools/worker/cat.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/tools/worker/cd.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/tools/worker/find.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/tools/worker/grep.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/tools/worker/head.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/tools/worker/ls.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/tools/worker/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/tools/worker/pwd.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/tools/worker/wc.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/worker/execute.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/worker/format.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/worker/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/worker/navigation.rs (100%) rename {rust => vectorless-core/vectorless}/src/agent/worker/planning.rs (100%) rename {rust => vectorless-core/vectorless}/src/client/builder.rs (100%) rename {rust => vectorless-core/vectorless}/src/client/engine.rs (100%) rename {rust => vectorless-core/vectorless}/src/client/index_context.rs (100%) rename {rust => vectorless-core/vectorless}/src/client/indexed_document.rs (100%) rename {rust => vectorless-core/vectorless}/src/client/indexer.rs (100%) rename {rust => vectorless-core/vectorless}/src/client/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/client/query_context.rs (100%) rename {rust => vectorless-core/vectorless}/src/client/retriever.rs (100%) rename {rust => vectorless-core/vectorless}/src/client/test_support.rs (100%) rename {rust => vectorless-core/vectorless}/src/client/types.rs (100%) rename {rust => vectorless-core/vectorless}/src/client/workspace.rs (100%) rename {rust => vectorless-core/vectorless}/src/config/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/config/types/indexer.rs (100%) rename {rust => vectorless-core/vectorless}/src/config/types/llm_pool.rs (100%) rename {rust => vectorless-core/vectorless}/src/config/types/metrics.rs (100%) rename {rust => vectorless-core/vectorless}/src/config/types/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/config/types/retrieval.rs (100%) rename {rust => vectorless-core/vectorless}/src/config/types/storage.rs (100%) rename {rust => 
vectorless-core/vectorless}/src/config/validator.rs (100%) rename {rust => vectorless-core/vectorless}/src/document/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/document/navigation.rs (100%) rename {rust => vectorless-core/vectorless}/src/document/node.rs (100%) rename {rust => vectorless-core/vectorless}/src/document/reasoning.rs (100%) rename {rust => vectorless-core/vectorless}/src/document/reference.rs (100%) rename {rust => vectorless-core/vectorless}/src/document/serde_helpers.rs (100%) rename {rust => vectorless-core/vectorless}/src/document/structure.rs (100%) rename {rust => vectorless-core/vectorless}/src/document/toc.rs (100%) rename {rust => vectorless-core/vectorless}/src/document/tree.rs (100%) rename {rust => vectorless-core/vectorless}/src/document/understanding.rs (100%) rename {rust => vectorless-core/vectorless}/src/error.rs (100%) rename {rust => vectorless-core/vectorless}/src/events/emitter.rs (100%) rename {rust => vectorless-core/vectorless}/src/events/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/events/types.rs (100%) rename {rust => vectorless-core/vectorless}/src/graph/builder.rs (100%) rename {rust => vectorless-core/vectorless}/src/graph/config.rs (100%) rename {rust => vectorless-core/vectorless}/src/graph/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/graph/types.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/config.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/incremental/detector.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/incremental/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/incremental/resolver.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/incremental/updater.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/markdown/config.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/markdown/frontmatter.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/markdown/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/markdown/parser.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/pdf/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/pdf/parser.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/pdf/types.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/toc/assigner.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/toc/detector.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/toc/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/toc/parser.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/toc/processor.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/toc/repairer.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/toc/structure_extractor.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/toc/types.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/toc/verifier.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/parse/types.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/pipeline/checkpoint.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/pipeline/context.rs (100%) rename {rust => 
vectorless-core/vectorless}/src/index/pipeline/executor.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/pipeline/metrics.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/pipeline/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/pipeline/orchestrator.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/pipeline/policy.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/stages/build.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/stages/enhance.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/stages/enrich.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/stages/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/stages/navigation.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/stages/optimize.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/stages/parse.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/stages/reasoning.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/stages/split.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/stages/validate.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/summary/full.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/summary/lazy.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/summary/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/summary/selective.rs (100%) rename {rust => vectorless-core/vectorless}/src/index/summary/strategy.rs (100%) rename {rust => vectorless-core/vectorless}/src/lib.rs (100%) rename {rust => vectorless-core/vectorless}/src/llm/client.rs (100%) rename {rust => vectorless-core/vectorless}/src/llm/config.rs (100%) rename {rust => vectorless-core/vectorless}/src/llm/error.rs (100%) rename {rust => vectorless-core/vectorless}/src/llm/executor.rs (100%) rename {rust => vectorless-core/vectorless}/src/llm/fallback.rs (100%) rename {rust => vectorless-core/vectorless}/src/llm/memo/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/llm/memo/store.rs (100%) rename {rust => vectorless-core/vectorless}/src/llm/memo/types.rs (100%) rename {rust => vectorless-core/vectorless}/src/llm/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/llm/pool.rs (100%) rename {rust => vectorless-core/vectorless}/src/llm/throttle.rs (100%) rename {rust => vectorless-core/vectorless}/src/metrics/hub.rs (100%) rename {rust => vectorless-core/vectorless}/src/metrics/index.rs (100%) rename {rust => vectorless-core/vectorless}/src/metrics/llm.rs (100%) rename {rust => vectorless-core/vectorless}/src/metrics/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/metrics/retrieval.rs (100%) rename {rust => vectorless-core/vectorless}/src/query/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/query/types.rs (100%) rename {rust => vectorless-core/vectorless}/src/query/understand.rs (100%) rename {rust => vectorless-core/vectorless}/src/rerank/dedup.rs (100%) rename {rust => vectorless-core/vectorless}/src/rerank/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/rerank/types.rs (100%) rename {rust => vectorless-core/vectorless}/src/retrieval/cache.rs (100%) rename {rust => vectorless-core/vectorless}/src/retrieval/dispatcher.rs (100%) rename {rust => vectorless-core/vectorless}/src/retrieval/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/retrieval/postprocessor.rs (100%) rename {rust => 
vectorless-core/vectorless}/src/retrieval/stream.rs (100%) rename {rust => vectorless-core/vectorless}/src/retrieval/types.rs (100%) rename {rust => vectorless-core/vectorless}/src/scoring/bm25.rs (100%) rename {rust => vectorless-core/vectorless}/src/scoring/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/storage/backend/file.rs (100%) rename {rust => vectorless-core/vectorless}/src/storage/backend/memory.rs (100%) rename {rust => vectorless-core/vectorless}/src/storage/backend/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/storage/backend/trait_def.rs (100%) rename {rust => vectorless-core/vectorless}/src/storage/cache.rs (100%) rename {rust => vectorless-core/vectorless}/src/storage/codec.rs (100%) rename {rust => vectorless-core/vectorless}/src/storage/lock.rs (100%) rename {rust => vectorless-core/vectorless}/src/storage/migration.rs (100%) rename {rust => vectorless-core/vectorless}/src/storage/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/storage/persistence.rs (100%) rename {rust => vectorless-core/vectorless}/src/storage/workspace.rs (100%) rename {rust => vectorless-core/vectorless}/src/utils/fingerprint.rs (100%) rename {rust => vectorless-core/vectorless}/src/utils/mod.rs (100%) rename {rust => vectorless-core/vectorless}/src/utils/token.rs (100%) rename {rust => vectorless-core/vectorless}/src/utils/validation.rs (100%) create mode 100644 vectorless/README.md create mode 100644 vectorless/__init__.py rename {python/vectorless => vectorless}/_async_utils.py (100%) rename {python/vectorless => vectorless}/_compat/__init__.py (100%) rename {python/vectorless => vectorless}/_compat/langchain.py (100%) rename {python/vectorless => vectorless}/_compat/llamaindex.py (100%) create mode 100644 vectorless/_core.py rename {python/vectorless => vectorless}/cli/__init__.py (100%) rename {python/vectorless => vectorless}/cli/commands/__init__.py (100%) rename {python/vectorless => vectorless}/cli/commands/add.py (100%) rename {python/vectorless => vectorless}/cli/commands/ask.py (100%) rename {python/vectorless => vectorless}/cli/commands/config_cmd.py (100%) rename {python/vectorless => vectorless}/cli/commands/info.py (100%) rename {python/vectorless => vectorless}/cli/commands/init.py (100%) rename {python/vectorless => vectorless}/cli/commands/list_cmd.py (100%) rename {python/vectorless => vectorless}/cli/commands/query.py (100%) rename {python/vectorless => vectorless}/cli/commands/remove.py (100%) rename {python/vectorless => vectorless}/cli/commands/stats.py (100%) rename {python/vectorless => vectorless}/cli/commands/tree.py (100%) rename {python/vectorless => vectorless}/cli/main.py (100%) rename {python/vectorless => vectorless}/cli/output.py (100%) rename {python/vectorless => vectorless}/config/__init__.py (100%) rename {python/vectorless => vectorless}/config/loading.py (100%) rename {python/vectorless => vectorless}/config/models.py (100%) rename {python/vectorless => vectorless}/events.py (100%) rename {python/vectorless => vectorless}/jupyter.py (100%) rename {python/vectorless => vectorless}/py.typed (100%) rename {python/vectorless => vectorless}/session.py (100%) rename {python/vectorless => vectorless}/streaming.py (100%) rename {python/vectorless => vectorless}/sync_session.py (100%) rename {python/vectorless => vectorless}/types/__init__.py (100%) rename {python/vectorless => vectorless}/types/graph.py (100%) rename {python/vectorless => vectorless}/types/results.py (100%) diff --git a/CLAUDE.md b/CLAUDE.md index 
e4230818..c326169e 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -1,6 +1,6 @@
 # CLAUDE.md
 
-Vectorless is a reasoning-native document intelligence engine written in Rust.
+Vectorless is a Document Understanding Engine for AI written in Rust.
 
 ## Principles
 
@@ -10,32 +10,36 @@ Vectorless is a reasoning-native document intelligence engine written in Rust.
 ## Project Structure
 
-- `rust/` - Rust core engine
-  - `src/client/` - Client API (EngineBuilder, Engine) - facade layer, no business logic
-  - `src/document/` - Document data structures (DocumentTree, NavigationIndex, ReasoningIndex)
-  - `src/index/` - Compile pipeline (8-stage, checkpointing, incremental update)
-  - `src/retrieval/` - Retrieval dispatch layer (preprocessing, dispatch, postprocessing, cache, streaming)
-  - `src/query/` - Query understanding and planning (intent classification, rewrite, decomposition)
-  - `src/agent/` - Retrieval execution (Worker: doc navigation, Orchestrator: supervisor loop + multi-doc fusion)
-  - `src/rerank/` - Result reranking and answer synthesis (dedup, scoring, fusion, synthesis)
-  - `src/scoring/` - Scoring and ranking strategies (BM25, relevance scoring, score combination)
-  - `src/llm/` - LLM client (connection pool, memo/caching, throttle/rate-limiting, fallback)
-  - `src/storage/` - Persistence (Workspace, LRU cache, backend abstraction file/memory)
-  - `src/graph/` - Cross-document relationship graph
-  - `src/metrics/` - Metrics collection and reporting
-  - `src/events/` - Event system for progress monitoring
-  - `src/config/` - Configuration types and validation
-  - `src/error.rs` - Unified error types
-  - `src/utils/` - Utility functions (token counting, fingerprinting, validation)
-  - `examples/` - Rust examples (flow, indexing, pdf, batch, etc.)
-- `python/` - Python SDK (PyO3 bindings) + CLI
+Cargo workspace with 2 crates + pure Python SDK:
+
+- `vectorless-core/` - Rust crates
+  - `vectorless/` - Core engine
+    - `src/client/` - Client API (EngineBuilder, Engine) - facade layer, no business logic
+    - `src/document/` - Document data structures (Document, DocumentTree, NavigationIndex, ReasoningIndex)
+    - `src/index/` - Compile pipeline (8-stage, checkpointing, incremental update)
+    - `src/retrieval/` - Retrieval dispatch layer (preprocessing, dispatch, postprocessing, cache, streaming)
+    - `src/query/` - Query understanding and planning (intent classification, rewrite, decomposition)
+    - `src/agent/` - Retrieval execution (Worker: doc navigation, Orchestrator: supervisor loop + multi-doc fusion)
+    - `src/rerank/` - Result reranking and answer synthesis (dedup, scoring, fusion, synthesis)
+    - `src/scoring/` - Scoring and ranking strategies (BM25, relevance scoring, score combination)
+    - `src/llm/` - LLM client (connection pool, memo/caching, throttle/rate-limiting, fallback)
+    - `src/storage/` - Persistence (Workspace, LRU cache, backend abstraction file/memory)
+    - `src/graph/` - Cross-document relationship graph
+    - `src/metrics/` - Metrics collection and reporting
+    - `src/events/` - Event system for progress monitoring
+    - `src/config/` - Configuration types and validation
+    - `src/error.rs` - Unified error types
+    - `src/utils/` - Utility functions (token counting, fingerprinting, validation)
+    - `examples/` - Rust examples (flow, indexing, pdf, batch, etc.)
+  - `vectorless-py/` - PyO3 bindings (compiled into Python native module)
+- `vectorless/` - Pure Python SDK (high-level wrappers, CLI, config loading, integrations)
 - `docs/` - Docusaurus documentation site
 - `samples/` - Sample files
 
 ### Retrieval Call Flow
 
 ```
-Engine.query()
+Engine.ask()
   → retrieval/dispatcher
   → query/understand() → QueryPlan (LLM intent + concepts + strategy)
   → Orchestrator (always, single or multi-doc)
@@ -49,16 +53,14 @@ Engine.query()
 ## Build Commands
 
 ```bash
-# Rust core
-cd rust
-cargo build          # Build
+# Build (workspace)
+cargo build          # Build all crates
 cargo test           # Run tests
 cargo clippy         # Lint
 cargo fmt            # Format code
 
 # Python SDK
-cd python
-pip install -e .     # Install in editable mode
+pip install -e .     # Install in editable mode (from project root, uses maturin)
 
 # Docs site
 cd docs
@@ -145,7 +147,8 @@ When uncertain whether an operation is safe, **default to asking user confirmati
 ## Common Development Workflow
 
-1. **Adding features**: Implement in the appropriate `rust/src/` module, add tests
+1. **Adding features**: Implement in the appropriate `vectorless-core/vectorless/src/` module, add tests
 2. **Fixing bugs**: Add failing test case first, fix and ensure tests pass
-3. **Python bindings**: Update `python/src/lib.rs` (PyO3) when Rust APIs change
-4. **Committing code**: Use semantic commit messages, format: `type(scope): description`
+3. **Python bindings**: Update `vectorless-core/vectorless-py/src/lib.rs` (PyO3) when Rust APIs change
+4. **Python SDK**: Update `vectorless/` when the API surface changes
+5. **Committing code**: Use semantic commit messages, format: `type(scope): description`
diff --git a/Cargo.toml b/Cargo.toml
index c3c13c88..fa671216 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,5 @@
 [workspace]
-members = ["rust", "python"]
+members = ["vectorless-core/vectorless", "vectorless-core/vectorless-py"]
 resolver = "2"
 
 [workspace.package]
diff --git a/pyproject.toml b/pyproject.toml
index ac8ae458..eeca3a79 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -67,14 +67,14 @@ Repository = "https://github.com/vectorlessflow/vectorless"
 Documentation = "https://www.vectorless.dev/docs/intro"
 
 [tool.maturin]
-python-source = "python"
+python-source = "."
 module-name = "vectorless._vectorless"
-manifest-path = "python/Cargo.toml"
+manifest-path = "vectorless-core/vectorless-py/Cargo.toml"
 features = ["pyo3/extension-module"]
 
 [tool.pytest.ini_options]
 asyncio_mode = "auto"
-testpaths = ["python/tests"]
+testpaths = ["tests"]
 
 [tool.mypy]
 python_version = "3.9"
diff --git a/python/README.md b/python/README.md
deleted file mode 100644
index be4761d0..00000000
--- a/python/README.md
+++ /dev/null
@@ -1,239 +0,0 @@
-# Vectorless Python SDK
-
-Python bindings for [vectorless](https://github.com/vectorlessflow/vectorless) — a reasoning-native document intelligence engine for AI.
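The README deleted below still documents the old `index()`/`query()` surface, while the call-flow diagram above renames `Engine.query()` to `Engine.ask()`. A minimal sketch of what post-rename usage could look like from Python; only `ask()` is named in this patch's call flow, so the `ingest()` method name and the `content`/`confidence` attributes on the returned answer are assumptions for illustration, not text from the patch:

```python
import asyncio
from vectorless import Engine

async def main():
    engine = Engine(api_key="sk-...", model="gpt-4o")

    # Hypothetical ingestion entry point after the rename; "ingest" is an
    # assumption here -- this patch only names ask() in the call flow.
    doc = await engine.ingest("./report.pdf")

    # ask() replaces query() per the Retrieval Call Flow diagram above.
    answer = await engine.ask("What is the total revenue?")
    print(answer.content)     # assumed field: synthesized answer text
    print(answer.confidence)  # assumed field: score in 0.0-1.0

asyncio.run(main())
```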
-
-## Installation
-
-```bash
-pip install vectorless
-```
-
-## Quick Start
-
-```python
-import asyncio
-from vectorless import Engine, IndexContext, QueryContext
-
-async def main():
-    # Create engine — api_key and model are required
-    engine = Engine(
-        api_key="sk-...",
-        model="gpt-4o",
-    )
-
-    # Index a document
-    result = await engine.index(IndexContext.from_path("./report.pdf"))
-    doc_id = result.doc_id
-    print(f"Indexed: {doc_id}")
-
-    # Query the document
-    result = await engine.query(
-        QueryContext("What is the total revenue?").with_doc_ids([doc_id])
-    )
-    item = result.single()
-    print(f"Answer: {item.content}")
-    print(f"Score: {item.score:.2f}")
-
-    # List all documents
-    for doc in await engine.list():
-        print(f"  - {doc.name} ({doc.id})")
-
-    # Cleanup
-    await engine.remove(doc_id)
-
-asyncio.run(main())
-```
-
-## API Reference
-
-### Engine
-
-The main entry point for vectorless.
-
-```python
-class Engine:
-    def __init__(
-        self,
-        config_path: str | None = None,
-        api_key: str | None = None,
-        model: str | None = None,
-        endpoint: str | None = None,
-    ): ...
-
-    async def index(self, ctx: IndexContext) -> IndexResult: ...
-    async def query(self, ctx: QueryContext) -> QueryResult: ...
-    async def list(self) -> list[DocumentInfo]: ...
-    async def remove(self, doc_id: str) -> bool: ...
-    async def clear(self) -> int: ...
-    async def exists(self, doc_id: str) -> bool: ...
-    async def get_graph(self) -> DocumentGraph | None: ...
-```
-
-### IndexContext
-
-Context for indexing documents.
-
-```python
-class IndexContext:
-    @staticmethod
-    def from_path(path: str, name: str | None = None) -> IndexContext: ...
-
-    @staticmethod
-    def from_paths(paths: list[str]) -> IndexContext: ...
-
-    @staticmethod
-    def from_dir(path: str, recursive: bool = True) -> IndexContext: ...
-
-    @staticmethod
-    def from_content(
-        content: str,
-        name: str | None = None,
-        format: str = "markdown",
-    ) -> IndexContext: ...
-
-    @staticmethod
-    def from_bytes(
-        data: bytes,
-        name: str,
-        format: str,
-    ) -> IndexContext: ...
-
-    def with_options(self, options: IndexOptions) -> IndexContext: ...
-    def with_mode(self, mode: str) -> IndexContext: ...
-```
-
-**Supported formats:**
-- `"markdown"` / `"md"` - Markdown content
-- `"pdf"` - PDF documents
-
-### QueryContext
-
-Context for querying documents.
-
-```python
-class QueryContext:
-    def __init__(self, query: str): ...
-
-    def with_doc_ids(self, doc_ids: list[str]) -> QueryContext: ...
-    def with_workspace(self) -> QueryContext: ...
-    def with_timeout_secs(self, secs: int) -> QueryContext: ...
-    def with_force_analysis(self, force: bool) -> QueryContext: ...
-```
-
-### IndexResult
-
-```python
-class IndexResult:
-    @property
-    def doc_id(self) -> str | None: ...
-    @property
-    def items(self) -> list[IndexItem]: ...
-    @property
-    def failed(self) -> list[FailedItem]: ...
-    def has_failures(self) -> bool: ...
-    def total(self) -> int: ...
-    def __len__(self) -> int: ...
-```
-
-### QueryResult
-
-```python
-class QueryResult:
-    @property
-    def items(self) -> list[QueryResultItem]: ...
-    @property
-    def failed(self) -> list[FailedItem]: ...
-    def single(self) -> QueryResultItem | None: ...
-    def has_failures(self) -> bool: ...
-    def __len__(self) -> int: ...
-```
-
-### QueryResultItem
-
-```python
-class QueryResultItem:
-    @property
-    def doc_id(self) -> str: ...
-    @property
-    def content(self) -> str: ...
-    @property
-    def score(self) -> float: ...
-    @property
-    def node_ids(self) -> list[str]: ...
-```
-
-### IndexItem
-
-```python
-class IndexItem:
-    @property
-    def doc_id(self) -> str: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def format(self) -> str: ...
-    @property
-    def description(self) -> str | None: ...
-    @property
-    def source_path(self) -> str | None: ...
-    @property
-    def page_count(self) -> int | None: ...
-    @property
-    def metrics(self) -> IndexMetrics | None: ...
-```
-
-### DocumentInfo
-
-```python
-class DocumentInfo:
-    @property
-    def id(self) -> str: ...
-    @property
-    def name(self) -> str: ...
-    @property
-    def format(self) -> str: ...
-    @property
-    def description(self) -> str | None: ...
-    @property
-    def source_path(self) -> str | None: ...
-    @property
-    def page_count(self) -> int | None: ...
-    @property
-    def line_count(self) -> int | None: ...
-```
-
-### VectorlessError
-
-```python
-class VectorlessError(Exception):
-    @property
-    def message(self) -> str: ...
-    @property
-    def kind(self) -> str: ...  # "config", "parse", "not_found", "llm"
-```
-
-## Development
-
-### Building from source
-
-```bash
-# Install maturin
-pip install maturin
-
-# Build and install (from project root)
-maturin develop
-
-# Run tests
-pytest
-```
-
-### Publishing to PyPI
-
-```bash
-maturin build --release
-maturin publish
-```
-
-## License
-
-Apache-2.0
diff --git a/python/src/context.rs b/python/src/context.rs
deleted file mode 100644
index 2bf0ae94..00000000
--- a/python/src/context.rs
+++ /dev/null
@@ -1,282 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! IndexContext, QueryContext, and IndexOptions Python wrappers.
-
-use pyo3::prelude::*;
-
-use ::vectorless::{DocumentFormat, IndexContext, IndexMode, IndexOptions, QueryContext};
-
-use super::error::VectorlessError;
-
-/// Parse format string to DocumentFormat.
-fn parse_format(format: &str) -> PyResult<DocumentFormat> {
-    match format.to_lowercase().as_str() {
-        "markdown" | "md" => Ok(DocumentFormat::Markdown),
-        "pdf" => Ok(DocumentFormat::Pdf),
-        _ => Err(PyErr::from(VectorlessError::new(
-            format!("Unknown format: {}. Supported: markdown, pdf", format),
-            "config",
-        ))),
-    }
-}
-
-// ============================================================
-// IndexOptions
-// ============================================================
-
-/// Options for controlling indexing behavior.
-///
-/// Args:
-///     mode: Indexing mode - "default", "force", or "incremental".
-///     generate_summaries: Whether to generate summaries. Default: True.
-///     generate_description: Whether to generate document description. Default: False.
-///     generate_ids: Whether to generate node IDs. Default: True.
-///     enable_synonym_expansion: Whether to expand keywords with LLM-generated
-///         synonyms during indexing. Improves recall for differently-worded queries.
-///         Default: True.
-#[pyclass(name = "IndexOptions", skip_from_py_object)] -#[derive(Clone)] -pub struct PyIndexOptions { - pub(crate) inner: IndexOptions, -} - -#[pymethods] -impl PyIndexOptions { - #[new] - #[pyo3(signature = (mode="default", generate_summaries=true, generate_description=false, generate_ids=true, enable_synonym_expansion=true, timeout_secs=None))] - fn new( - mode: &str, - generate_summaries: bool, - generate_description: bool, - generate_ids: bool, - enable_synonym_expansion: bool, - timeout_secs: Option, - ) -> PyResult { - let mut opts = IndexOptions::new(); - match mode { - "default" => {} - "force" => opts = opts.with_mode(IndexMode::Force), - "incremental" => opts = opts.with_mode(IndexMode::Incremental), - _ => { - return Err(PyErr::from(VectorlessError::new( - format!( - "Unknown mode: {}. Supported: default, force, incremental", - mode - ), - "config", - ))); - } - } - opts.generate_summaries = generate_summaries; - opts.generate_description = generate_description; - opts.generate_ids = generate_ids; - opts.enable_synonym_expansion = enable_synonym_expansion; - if let Some(secs) = timeout_secs { - opts = opts.with_timeout_secs(secs); - } - Ok(Self { inner: opts }) - } - - fn __repr__(&self) -> String { - format!( - "IndexOptions(mode='{}', generate_summaries={}, generate_description={}, generate_ids={}, enable_synonym_expansion={})", - match self.inner.mode { - IndexMode::Default => "default", - IndexMode::Force => "force", - IndexMode::Incremental => "incremental", - }, - self.inner.generate_summaries, - self.inner.generate_description, - self.inner.generate_ids, - self.inner.enable_synonym_expansion, - ) - } -} - -// ============================================================ -// IndexContext -// ============================================================ - -/// Context for indexing a document. -/// -/// Create using the static methods: -/// -/// ```python -/// from vectorless import IndexContext -/// -/// # Single file -/// ctx = IndexContext.from_path("./document.pdf") -/// -/// # Multiple files -/// ctx = IndexContext.from_paths(["./a.pdf", "./b.md"]) -/// -/// # Directory -/// ctx = IndexContext.from_dir("./docs/") -/// -/// # From text -/// ctx = IndexContext.from_content("# Title\\nContent...", "markdown").with_name("doc") -/// -/// # From bytes -/// ctx = IndexContext.from_bytes(data, "pdf").with_name("doc") -/// ``` -#[pyclass(name = "IndexContext")] -pub struct PyIndexContext { - pub(crate) inner: IndexContext, -} - -#[pymethods] -impl PyIndexContext { - /// Create an IndexContext from a single file path. - #[staticmethod] - fn from_path(path: String) -> Self { - Self { - inner: IndexContext::from_path(&path), - } - } - - /// Create an IndexContext from multiple file paths. - #[staticmethod] - fn from_paths(paths: Vec) -> Self { - Self { - inner: IndexContext::from_paths(&paths), - } - } - - /// Create an IndexContext from all supported files in a directory. - /// - /// Args: - /// path: Directory path to scan. - /// recursive: If True, scan subdirectories recursively. Default: False. - #[staticmethod] - #[pyo3(signature = (path, recursive=false))] - fn from_dir(path: String, recursive: bool) -> Self { - let inner = IndexContext::from_dir(&path, recursive); - Self { inner } - } - - /// Create an IndexContext from text content. 
- #[staticmethod] - #[pyo3(signature = (content, format="markdown"))] - fn from_content(content: String, format: &str) -> PyResult { - let doc_format = parse_format(format)?; - let ctx = IndexContext::from_content(&content, doc_format); - Ok(Self { inner: ctx }) - } - - /// Create an IndexContext from binary data. - #[staticmethod] - fn from_bytes(data: Vec, format: &str) -> PyResult { - let doc_format = parse_format(format)?; - let ctx = IndexContext::from_bytes(data, doc_format); - Ok(Self { inner: ctx }) - } - - /// Set the document name (single-source only). - fn with_name(&self, name: String) -> Self { - let ctx = self.inner.clone().with_name(&name); - Self { inner: ctx } - } - - /// Apply indexing options. - fn with_options(&self, options: &PyIndexOptions) -> Self { - let ctx = self.inner.clone().with_options(options.inner.clone()); - Self { inner: ctx } - } - - /// Set indexing mode. - fn with_mode(&self, mode: &str) -> PyResult { - let m = match mode { - "default" => IndexMode::Default, - "force" => IndexMode::Force, - "incremental" => IndexMode::Incremental, - _ => { - return Err(PyErr::from(VectorlessError::new( - format!( - "Unknown mode: {}. Supported: default, force, incremental", - mode - ), - "config", - ))); - } - }; - let ctx = self.inner.clone().with_mode(m); - Ok(Self { inner: ctx }) - } - - /// Number of document sources. - fn __len__(&self) -> usize { - self.inner.len() - } - - /// Whether no sources are present. - fn is_empty(&self) -> bool { - self.inner.is_empty() - } - - fn __repr__(&self) -> String { - format!("IndexContext(sources={})", self.inner.len()) - } -} - -// ============================================================ -// QueryContext -// ============================================================ - -/// Context for a query operation. -/// -/// ```python -/// from vectorless import QueryContext -/// -/// # Query specific documents -/// ctx = QueryContext("What is the total revenue?").with_doc_ids([doc_id]) -/// -/// # Query multiple documents -/// ctx = QueryContext("What is the architecture?").with_doc_ids(["doc-1", "doc-2"]) -/// -/// # Query entire workspace -/// ctx = QueryContext("Explain the algorithm") -/// ``` -#[pyclass(name = "QueryContext")] -pub struct PyQueryContext { - pub(crate) inner: QueryContext, -} - -#[pymethods] -impl PyQueryContext { - /// Create a new query context (defaults to workspace scope). - #[new] - fn new(query: String) -> Self { - Self { - inner: QueryContext::new(&query), - } - } - - /// Set scope to specific documents. - fn with_doc_ids(&self, doc_ids: Vec) -> Self { - let ctx = self.inner.clone().with_doc_ids(doc_ids); - Self { inner: ctx } - } - - /// Set scope to entire workspace. - fn with_workspace(&self) -> Self { - let ctx = self.inner.clone().with_workspace(); - Self { inner: ctx } - } - - /// Set per-operation timeout in seconds. - fn with_timeout_secs(&self, secs: u64) -> Self { - let ctx = self.inner.clone().with_timeout_secs(secs); - Self { inner: ctx } - } - - /// Force the Orchestrator to analyze documents before dispatching Workers. - fn with_force_analysis(&self, force: bool) -> Self { - let ctx = self.inner.clone().with_force_analysis(force); - Self { inner: ctx } - } - - fn __repr__(&self) -> String { - "QueryContext(...)".to_string() - } -} diff --git a/python/src/results.rs b/python/src/results.rs deleted file mode 100644 index ba4ea776..00000000 --- a/python/src/results.rs +++ /dev/null @@ -1,506 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! 
Query and index result Python wrappers. - -use pyo3::prelude::*; - -use ::vectorless::IndexMetrics; -use ::vectorless::{ - EvidenceItem, FailedItem, IndexItem, IndexResult, QueryMetrics, QueryResult, QueryResultItem, -}; - -// ============================================================ -// EvidenceItem -// ============================================================ - -/// A single piece of evidence with source attribution. -#[pyclass(name = "EvidenceItem")] -pub struct PyEvidenceItem { - pub(crate) inner: EvidenceItem, -} - -#[pymethods] -impl PyEvidenceItem { - /// Section title where this evidence was found. - #[getter] - fn title(&self) -> &str { - &self.inner.title - } - - /// Navigation path (e.g., "Root/Chapter 1/Section 1.2"). - #[getter] - fn path(&self) -> &str { - &self.inner.path - } - - /// Raw evidence content. - #[getter] - fn content(&self) -> &str { - &self.inner.content - } - - /// Source document name. - #[getter] - fn doc_name(&self) -> Option<&str> { - self.inner.doc_name.as_deref() - } - - fn __repr__(&self) -> String { - format!( - "EvidenceItem(title='{}', path='{}', content_len={})", - self.inner.title, - self.inner.path, - self.inner.content.len() - ) - } -} - -// ============================================================ -// QueryMetrics -// ============================================================ - -/// Query execution metrics. -#[pyclass(name = "QueryMetrics")] -pub struct PyQueryMetrics { - pub(crate) inner: QueryMetrics, -} - -#[pymethods] -impl PyQueryMetrics { - /// Number of LLM calls made. - #[getter] - fn llm_calls(&self) -> u32 { - self.inner.llm_calls - } - - /// Number of navigation rounds used. - #[getter] - fn rounds_used(&self) -> u32 { - self.inner.rounds_used - } - - /// Number of distinct nodes visited. - #[getter] - fn nodes_visited(&self) -> usize { - self.inner.nodes_visited - } - - /// Number of evidence items collected. - #[getter] - fn evidence_count(&self) -> usize { - self.inner.evidence_count - } - - /// Total characters of collected evidence. - #[getter] - fn evidence_chars(&self) -> usize { - self.inner.evidence_chars - } - - fn __repr__(&self) -> String { - format!( - "QueryMetrics(llm_calls={}, rounds={}, evidence={})", - self.inner.llm_calls, self.inner.rounds_used, self.inner.evidence_count - ) - } -} - -// ============================================================ -// QueryResultItem -// ============================================================ - -/// A single document's query result. -#[pyclass(name = "QueryResultItem")] -pub struct PyQueryResultItem { - pub(crate) inner: QueryResultItem, -} - -#[pymethods] -impl PyQueryResultItem { - /// The document ID. - #[getter] - fn doc_id(&self) -> &str { - &self.inner.doc_id - } - - /// The retrieved content (synthesized answer or raw evidence). - #[getter] - fn content(&self) -> &str { - &self.inner.content - } - - /// Confidence score (0.0 to 1.0). - #[getter] - fn score(&self) -> f32 { - self.inner.confidence - } - - /// Node IDs that matched (navigation paths). - #[getter] - fn node_ids(&self) -> Vec { - self.inner.node_ids.clone() - } - - /// Evidence items with source attribution. - #[getter] - fn evidence(&self) -> Vec { - self.inner - .evidence - .iter() - .map(|e| PyEvidenceItem { - inner: EvidenceItem { - title: e.title.clone(), - path: e.path.clone(), - content: e.content.clone(), - doc_name: e.doc_name.clone(), - }, - }) - .collect() - } - - /// Execution metrics for this query. 
- #[getter] - fn metrics(&self) -> Option { - self.inner.metrics.as_ref().map(|m| PyQueryMetrics { - inner: QueryMetrics { - llm_calls: m.llm_calls, - rounds_used: m.rounds_used, - nodes_visited: m.nodes_visited, - evidence_count: m.evidence_count, - evidence_chars: m.evidence_chars, - }, - }) - } - - /// Confidence score (0.0 to 1.0). - #[getter] - fn confidence(&self) -> f32 { - self.inner.confidence - } - - fn __repr__(&self) -> String { - format!( - "QueryResultItem(doc_id='{}', confidence={:.2}, evidence={})", - self.inner.doc_id, - self.inner.confidence, - self.inner.evidence.len() - ) - } -} - -// ============================================================ -// FailedItem -// ============================================================ - -/// A failed item in a batch operation. -#[pyclass(name = "FailedItem")] -pub struct PyFailedItem { - pub(crate) inner: FailedItem, -} - -#[pymethods] -impl PyFailedItem { - /// Source description. - #[getter] - fn source(&self) -> &str { - &self.inner.source - } - - /// Error message. - #[getter] - fn error(&self) -> &str { - &self.inner.error - } - - fn __repr__(&self) -> String { - format!( - "FailedItem(source='{}', error='{}')", - self.inner.source, self.inner.error - ) - } -} - -// ============================================================ -// QueryResult -// ============================================================ - -/// Result of a document query. -#[pyclass(name = "QueryResult")] -pub struct PyQueryResult { - pub(crate) inner: QueryResult, -} - -#[pymethods] -impl PyQueryResult { - /// Result items (one per document). - #[getter] - fn items(&self) -> Vec { - self.inner - .items - .iter() - .map(|i| PyQueryResultItem { - inner: QueryResultItem { - doc_id: i.doc_id.clone(), - node_ids: i.node_ids.clone(), - content: i.content.clone(), - evidence: i.evidence.clone(), - metrics: i.metrics.clone(), - confidence: i.confidence, - }, - }) - .collect() - } - - /// Get the first (single-doc) result item. - fn single(&self) -> Option { - self.inner.single().map(|i| PyQueryResultItem { - inner: QueryResultItem { - doc_id: i.doc_id.clone(), - node_ids: i.node_ids.clone(), - content: i.content.clone(), - evidence: i.evidence.clone(), - metrics: i.metrics.clone(), - confidence: i.confidence, - }, - }) - } - - /// Number of result items. - fn __len__(&self) -> usize { - self.inner.len() - } - - /// Whether any documents failed. - fn has_failures(&self) -> bool { - self.inner.has_failures() - } - - /// Failed items. - #[getter] - fn failed(&self) -> Vec { - self.inner - .failed - .iter() - .map(|f| PyFailedItem { - inner: FailedItem::new(&f.source, &f.error), - }) - .collect() - } - - fn __repr__(&self) -> String { - format!( - "QueryResult(items={}, failed={})", - self.inner.len(), - self.inner.failed.len() - ) - } -} - -// ============================================================ -// IndexMetrics -// ============================================================ - -/// Indexing pipeline metrics. -#[pyclass(name = "IndexMetrics")] -pub struct PyIndexMetrics { - pub(crate) inner: IndexMetrics, -} - -#[pymethods] -impl PyIndexMetrics { - /// Total indexing time (ms). - #[getter] - fn total_time_ms(&self) -> u64 { - self.inner.total_time_ms() - } - - /// Parse stage duration (ms). - #[getter] - fn parse_time_ms(&self) -> u64 { - self.inner.parse_time_ms - } - - /// Build stage duration (ms). - #[getter] - fn build_time_ms(&self) -> u64 { - self.inner.build_time_ms - } - - /// Enhance (summary) stage duration (ms). 
- #[getter] - fn enhance_time_ms(&self) -> u64 { - self.inner.enhance_time_ms - } - - /// Number of nodes processed. - #[getter] - fn nodes_processed(&self) -> usize { - self.inner.nodes_processed - } - - /// Number of summaries successfully generated. - #[getter] - fn summaries_generated(&self) -> usize { - self.inner.summaries_generated - } - - /// Number of summaries that failed to generate. - #[getter] - fn summaries_failed(&self) -> usize { - self.inner.summaries_failed - } - - /// Number of LLM calls made. - #[getter] - fn llm_calls(&self) -> usize { - self.inner.llm_calls - } - - /// Total tokens generated by LLM. - #[getter] - fn total_tokens_generated(&self) -> usize { - self.inner.total_tokens_generated - } - - /// Number of topics in reasoning index. - #[getter] - fn topics_indexed(&self) -> usize { - self.inner.topics_indexed - } - - /// Number of keywords in reasoning index. - #[getter] - fn keywords_indexed(&self) -> usize { - self.inner.keywords_indexed - } - - fn __repr__(&self) -> String { - format!( - "IndexMetrics(total={}ms, summaries={}, failed={}, llm_calls={})", - self.inner.total_time_ms(), - self.inner.summaries_generated, - self.inner.summaries_failed, - self.inner.llm_calls, - ) - } -} - -// ============================================================ -// IndexItem / IndexResult -// ============================================================ - -/// A single indexed document item. -#[pyclass(name = "IndexItem")] -pub struct PyIndexItem { - pub(crate) inner: IndexItem, -} - -#[pymethods] -impl PyIndexItem { - #[getter] - fn doc_id(&self) -> &str { - &self.inner.doc_id - } - - #[getter] - fn name(&self) -> &str { - &self.inner.name - } - - #[getter] - fn format(&self) -> String { - format!("{:?}", self.inner.format).to_lowercase() - } - - #[getter] - fn description(&self) -> Option<&str> { - self.inner.description.as_deref() - } - - #[getter] - fn source_path(&self) -> Option<&str> { - self.inner.source_path.as_deref() - } - - #[getter] - fn page_count(&self) -> Option { - self.inner.page_count - } - - /// Indexing pipeline metrics (timing, LLM usage, etc.). - #[getter] - fn metrics(&self) -> Option { - self.inner - .metrics - .as_ref() - .map(|m| PyIndexMetrics { inner: m.clone() }) - } - - fn __repr__(&self) -> String { - format!( - "IndexItem(doc_id='{}', name='{}')", - self.inner.doc_id, self.inner.name - ) - } -} - -/// Result of a document indexing operation. -#[pyclass(name = "IndexResult")] -pub struct PyIndexResult { - pub(crate) inner: IndexResult, -} - -#[pymethods] -impl PyIndexResult { - /// The document ID (convenience for single-document indexing). - #[getter] - fn doc_id(&self) -> Option { - self.inner.doc_id().map(|s| s.to_string()) - } - - /// All indexed items. - #[getter] - fn items(&self) -> Vec { - self.inner - .items - .iter() - .map(|i| PyIndexItem { inner: i.clone() }) - .collect() - } - - /// Failed items. - #[getter] - fn failed(&self) -> Vec { - self.inner - .failed - .iter() - .map(|f| PyFailedItem { - inner: FailedItem::new(&f.source, &f.error), - }) - .collect() - } - - /// Whether any items failed. - fn has_failures(&self) -> bool { - self.inner.has_failures() - } - - /// Total number of items (successful + failed). 
- fn total(&self) -> usize { - self.inner.total() - } - - fn __len__(&self) -> usize { - self.inner.len() - } - - fn __repr__(&self) -> String { - format!( - "IndexResult(doc_id={:?}, count={}, failed={})", - self.inner.doc_id(), - self.inner.items.len(), - self.inner.failed.len() - ) - } -} diff --git a/python/src/streaming.rs b/python/src/streaming.rs deleted file mode 100644 index eafa688e..00000000 --- a/python/src/streaming.rs +++ /dev/null @@ -1,179 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! PyO3 streaming query wrapper. -//! -//! Bridges Rust's `mpsc::Receiver` to a Python async iterator, -//! yielding real-time retrieval progress events as dicts. - -use pyo3::exceptions::PyStopAsyncIteration; -use pyo3::prelude::*; -use pyo3::types::PyDict; -use pyo3_async_runtimes::tokio::future_into_py; -use std::sync::Arc; -use tokio::sync::{Mutex, mpsc}; - -use ::vectorless::{RetrieveEvent, SufficiencyLevel}; - -/// Convert a `RetrieveEvent` into a Python dict with a `"type"` key. -fn event_to_dict(event: RetrieveEvent, py: Python<'_>) -> PyResult> { - let dict = PyDict::new(py); - match event { - RetrieveEvent::Started { query, strategy } => { - dict.set_item("type", "started")?; - dict.set_item("query", query)?; - dict.set_item("strategy", strategy)?; - } - RetrieveEvent::StageCompleted { stage, elapsed_ms } => { - dict.set_item("type", "stage_completed")?; - dict.set_item("stage", stage)?; - dict.set_item("elapsed_ms", elapsed_ms)?; - } - RetrieveEvent::NodeVisited { - node_id, - title, - score, - } => { - dict.set_item("type", "node_visited")?; - dict.set_item("node_id", node_id)?; - dict.set_item("title", title)?; - dict.set_item("score", score)?; - } - RetrieveEvent::ContentFound { - node_id, - title, - preview, - score, - } => { - dict.set_item("type", "content_found")?; - dict.set_item("node_id", node_id)?; - dict.set_item("title", title)?; - dict.set_item("preview", preview)?; - dict.set_item("score", score)?; - } - RetrieveEvent::Backtracking { from, to, reason } => { - dict.set_item("type", "backtracking")?; - dict.set_item("from", from)?; - dict.set_item("to", to)?; - dict.set_item("reason", reason)?; - } - RetrieveEvent::SufficiencyCheck { level, tokens } => { - let level_str = match level { - SufficiencyLevel::Sufficient => "sufficient", - SufficiencyLevel::PartialSufficient => "partial_sufficient", - SufficiencyLevel::Insufficient => "insufficient", - }; - dict.set_item("type", "sufficiency_check")?; - dict.set_item("level", level_str)?; - dict.set_item("tokens", tokens)?; - } - RetrieveEvent::Completed { response } => { - dict.set_item("type", "completed")?; - dict.set_item("confidence", response.confidence)?; - dict.set_item("is_sufficient", response.is_sufficient)?; - dict.set_item("strategy_used", response.strategy_used)?; - dict.set_item("tokens_used", response.tokens_used)?; - dict.set_item("content", response.content)?; - - let results: Vec> = response - .results - .into_iter() - .map(|r| { - let rd = PyDict::new(py); - rd.set_item("node_id", &r.node_id)?; - rd.set_item("title", &r.title)?; - rd.set_item("content", &r.content)?; - rd.set_item("score", r.score)?; - rd.set_item("depth", r.depth)?; - Ok(rd) - }) - .collect::>>()?; - dict.set_item("results", results)?; - } - RetrieveEvent::Error { message } => { - dict.set_item("type", "error")?; - dict.set_item("message", message)?; - } - } - Ok(dict) -} - -/// Python-facing async iterator over streaming retrieval events. 
-/// -/// Usage:: -/// -/// stream = await engine.query_stream(ctx) -/// async for event in stream: -/// print(event["type"]) -#[pyclass(name = "StreamingQuery")] -pub struct PyStreamingQuery { - rx: Arc>>>, -} - -impl PyStreamingQuery { - pub fn new(rx: mpsc::Receiver) -> Self { - Self { - rx: Arc::new(Mutex::new(Some(rx))), - } - } -} - -#[pymethods] -impl PyStreamingQuery { - fn __aiter__(slf: PyRef<'_, Self>) -> PyRef<'_, Self> { - slf - } - - fn __anext__<'py>(&self, py: Python<'py>) -> PyResult> { - let rx: Arc>>> = Arc::clone(&self.rx); - future_into_py(py, async move { - let mut guard = rx.lock().await; - let receiver: &mut Option> = &mut *guard; - match receiver { - None => Err(PyStopAsyncIteration::new_err("stream exhausted")), - Some(rx) => match rx.recv().await { - Some(event) => { - let is_terminal = matches!( - &event, - RetrieveEvent::Completed { .. } | RetrieveEvent::Error { .. } - ); - if is_terminal { - *guard = None; - } - // We cannot convert to dict here (no Python token in async context). - // Instead, store the event and convert on the Python side. - // PyO3 0.28: future_into_py resolves on the Python thread, - // so we use Python::with_gil equivalent via pyo3_async_runtimes. - // - // The cleanest approach: wrap in a PyO3-compatible type. - // Since RetrieveEvent doesn't implement IntoPyObject, we convert - // to a simple serializable form. - Ok(SerializedEvent(event)) - } - None => { - *guard = None; - Err(PyStopAsyncIteration::new_err("stream closed")) - } - }, - } - }) - } - - fn __repr__(&self) -> String { - "StreamingQuery(...)".to_string() - } -} - -/// Wrapper to carry a RetrieveEvent across the async boundary -/// and convert it to a dict on the Python thread. -struct SerializedEvent(RetrieveEvent); - -impl<'py> IntoPyObject<'py> for SerializedEvent { - type Target = PyDict; - type Output = Bound<'py, Self::Target>; - type Error = PyErr; - - fn into_pyobject(self, py: Python<'py>) -> Result { - event_to_dict(self.0, py) - } -} diff --git a/python/vectorless/__init__.py b/python/vectorless/__init__.py deleted file mode 100644 index d2ff88d0..00000000 --- a/python/vectorless/__init__.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -Vectorless — Reasoning-native document engine. - -Every retrieval is a reasoning act. 
- -Quick Start: - from vectorless import Session - - session = Session(api_key="sk-...", model="gpt-4o") - result = await session.index(path="./report.pdf") - answer = await session.ask("What is the revenue?", doc_ids=[result.doc_id]) - print(answer.single().content) -""" - -# High-level API (recommended) -from vectorless.session import Session -from vectorless.sync_session import SyncSession -from vectorless.config import EngineConfig, load_config, load_config_from_env, load_config_from_file -from vectorless.events import EventEmitter -from vectorless.streaming import StreamingQueryResult -from vectorless.types import ( - DocumentGraphWrapper, - EdgeEvidence, - Evidence, - FailedItem, - GraphEdge, - GraphNode, - IndexItemWrapper, - IndexMetrics, - IndexResultWrapper, - QueryMetrics, - QueryResponse, - QueryResult, - WeightedKeyword, -) - -# Version and error types -from vectorless._vectorless import VectorlessError, __version__ - -__all__ = [ - # Primary API - "Session", - "SyncSession", - # Configuration - "EngineConfig", - "load_config", - "load_config_from_env", - "load_config_from_file", - # Events - "EventEmitter", - # Streaming - "StreamingQueryResult", - # Result types - "QueryResponse", - "QueryResult", - "QueryMetrics", - "Evidence", - "IndexResultWrapper", - "IndexItemWrapper", - "IndexMetrics", - "FailedItem", - # Graph types - "DocumentGraphWrapper", - "GraphNode", - "GraphEdge", - "EdgeEvidence", - "WeightedKeyword", - # Error and version - "VectorlessError", - "__version__", -] diff --git a/python/vectorless/_core.py b/python/vectorless/_core.py deleted file mode 100644 index c83d089a..00000000 --- a/python/vectorless/_core.py +++ /dev/null @@ -1,54 +0,0 @@ -"""Internal re-exports from the Rust PyO3 module. - -This module is NOT part of the public API. Use ``vectorless.Session`` instead. -""" - -from vectorless._vectorless import ( - Config, - DocumentGraph, - DocumentGraphNode, - DocumentInfo, - EdgeEvidence, - Engine, - EvidenceItem, - FailedItem, - GraphEdge, - IndexContext, - IndexItem, - IndexMetrics, - IndexOptions, - IndexResult, - QueryContext, - QueryMetrics, - QueryResult, - QueryResultItem, - StreamingQuery, - VectorlessError, - WeightedKeyword, - __version__, -) - -__all__ = [ - "Config", - "DocumentGraph", - "DocumentGraphNode", - "DocumentInfo", - "EdgeEvidence", - "Engine", - "EvidenceItem", - "FailedItem", - "GraphEdge", - "IndexContext", - "IndexItem", - "IndexMetrics", - "IndexOptions", - "IndexResult", - "QueryContext", - "QueryMetrics", - "QueryResult", - "QueryResultItem", - "StreamingQuery", - "VectorlessError", - "WeightedKeyword", - "__version__", -] diff --git a/rust/tests/integration.rs b/rust/tests/integration.rs deleted file mode 100644 index 00c6b0cf..00000000 --- a/rust/tests/integration.rs +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Integration tests for the Engine client. -//! -//! These tests exercise the full index → persist → query lifecycle -//! without requiring a real LLM endpoint, using the no-LLM pipeline. 
-
-use std::path::PathBuf;
-
-use vectorless::__test_support::build_test_engine;
-use vectorless::{Engine, IndexContext, IndexMode};
-
-async fn setup() -> (Engine, tempfile::TempDir) {
-    let tmp = tempfile::tempdir().unwrap();
-    let engine = build_test_engine(tmp.path()).await;
-    (engine, tmp)
-}
-
-#[tokio::test]
-async fn test_index_and_persist_single_markdown() {
-    let (engine, tmp) = setup().await;
-
-    // Write a test markdown file
-    let md_path = tmp.path().join("test.md");
-    std::fs::write(&md_path, "# Hello\n\nWorld content here.").unwrap();
-
-    let ctx = IndexContext::from_path(&md_path).with_mode(IndexMode::Force);
-    let result = engine.index(ctx).await.unwrap();
-
-    assert_eq!(result.len(), 1);
-    assert!(!result.has_failures());
-    let doc_id = result.doc_id().unwrap();
-    assert!(!doc_id.is_empty());
-
-    // Verify persisted
-    assert!(engine.exists(doc_id).await.unwrap());
-
-    // List should contain 1 doc
-    let docs = engine.list().await.unwrap();
-    assert_eq!(docs.len(), 1);
-    assert_eq!(docs[0].name, "test");
-
-    // Remove
-    assert!(engine.remove(doc_id).await.unwrap());
-    assert!(!engine.exists(doc_id).await.unwrap());
-}
-
-#[tokio::test]
-async fn test_index_from_content() {
-    let (engine, _tmp) = setup().await;
-
-    let ctx = IndexContext::from_content(
-        "# Title\n\nParagraph 1\n\n## Section\n\nParagraph 2",
-        vectorless::DocumentFormat::Markdown,
-    )
-    .with_name("inline-doc");
-
-    let result = engine.index(ctx).await.unwrap();
-    assert_eq!(result.len(), 1);
-    let doc_id = result.doc_id().unwrap();
-
-    // Verify it's persisted and loadable
-    assert!(engine.exists(doc_id).await.unwrap());
-
-    // Clean up
-    engine.remove(doc_id).await.unwrap();
-}
-
-#[tokio::test]
-async fn test_index_multiple_sources_parallel() {
-    let (engine, tmp) = setup().await;
-
-    // Create 3 markdown files
-    let paths: Vec<PathBuf> = (0..3)
-        .map(|i| {
-            let p = tmp.path().join(format!("doc{i}.md"));
-            std::fs::write(&p, format!("# Doc {i}\n\nContent {i}")).unwrap();
-            p
-        })
-        .collect();
-
-    let ctx = IndexContext::from_paths(paths).with_mode(IndexMode::Force);
-    let result = engine.index(ctx).await.unwrap();
-
-    assert_eq!(result.len(), 3);
-    assert!(!result.has_failures());
-
-    let docs = engine.list().await.unwrap();
-    assert_eq!(docs.len(), 3);
-
-    // Clear all
-    let count = engine.clear().await.unwrap();
-    assert_eq!(count, 3);
-}
-
-#[tokio::test]
-async fn test_index_default_mode_skips_existing() {
-    let (engine, tmp) = setup().await;
-
-    let md_path = tmp.path().join("existing.md");
-    std::fs::write(&md_path, "# Original\n\nOriginal content.").unwrap();
-
-    // First index
-    let ctx = IndexContext::from_path(&md_path);
-    let result1 = engine.index(ctx).await.unwrap();
-    assert_eq!(result1.len(), 1);
-    let id1 = result1.doc_id().unwrap().to_string();
-
-    // Second index with Default mode — should skip
-    let ctx = IndexContext::from_path(&md_path);
-    let result2 = engine.index(ctx).await.unwrap();
-    assert_eq!(result2.len(), 1);
-    assert!(!result2.has_failures());
-    // Same doc ID — not re-indexed
-    assert_eq!(result2.doc_id().unwrap(), id1);
}
-
-#[tokio::test]
-async fn test_force_mode_reindexes() {
-    let (engine, tmp) = setup().await;
-
-    let md_path = tmp.path().join("force.md");
-    std::fs::write(&md_path, "# Version 1").unwrap();
-
-    // First index
-    let ctx = IndexContext::from_path(&md_path);
-    let result1 = engine.index(ctx).await.unwrap();
-    let id1 = result1.doc_id().unwrap().to_string();
-
-    // Force re-index — should get a new doc ID
-    let ctx = IndexContext::from_path(&md_path).with_mode(IndexMode::Force);
-    let result2 = engine.index(ctx).await.unwrap();
-    assert_eq!(result2.len(), 1);
-    // Different doc ID — re-indexed
-    assert_ne!(result2.doc_id().unwrap(), id1);
-}
-
-#[tokio::test]
-async fn test_clear_empty_workspace() {
-    let (engine, _tmp) = setup().await;
-
-    let count = engine.clear().await.unwrap();
-    assert_eq!(count, 0);
-}
-
-#[tokio::test]
-async fn test_remove_nonexistent() {
-    let (engine, _tmp) = setup().await;
-
-    let removed = engine.remove("nonexistent-id").await.unwrap();
-    assert!(!removed);
-}
-
-#[tokio::test]
-async fn test_index_from_bytes() {
-    let (engine, _tmp) = setup().await;
-
-    let ctx = IndexContext::from_bytes(vec![1, 2, 3, 4], vectorless::DocumentFormat::Pdf)
-        .with_name("test-bytes");
-
-    // This will fail at parse (not a real PDF), but should error gracefully
-    let result = engine.index(ctx).await;
-    assert!(result.is_err());
-}
diff --git a/python/tests/__init__.py b/tests/__init__.py
similarity index 100%
rename from python/tests/__init__.py
rename to tests/__init__.py
diff --git a/python/tests/conftest.py b/tests/conftest.py
similarity index 100%
rename from python/tests/conftest.py
rename to tests/conftest.py
diff --git a/python/tests/test_cli/__init__.py b/tests/test_cli/__init__.py
similarity index 100%
rename from python/tests/test_cli/__init__.py
rename to tests/test_cli/__init__.py
diff --git a/python/tests/test_compat/__init__.py b/tests/test_compat/__init__.py
similarity index 100%
rename from python/tests/test_compat/__init__.py
rename to tests/test_compat/__init__.py
diff --git a/python/tests/test_config.py b/tests/test_config.py
similarity index 100%
rename from python/tests/test_config.py
rename to tests/test_config.py
diff --git a/python/tests/test_events.py b/tests/test_events.py
similarity index 100%
rename from python/tests/test_events.py
rename to tests/test_events.py
diff --git a/python/tests/test_session.py b/tests/test_session.py
similarity index 100%
rename from python/tests/test_session.py
rename to tests/test_session.py
diff --git a/python/tests/test_types.py b/tests/test_types.py
similarity index 100%
rename from python/tests/test_types.py
rename to tests/test_types.py
diff --git a/python/Cargo.toml b/vectorless-core/vectorless-py/Cargo.toml
similarity index 76%
rename from python/Cargo.toml
rename to vectorless-core/vectorless-py/Cargo.toml
index 93a0d557..a5236730 100644
--- a/python/Cargo.toml
+++ b/vectorless-core/vectorless-py/Cargo.toml
@@ -1,11 +1,12 @@
 [package]
 name = "vectorless-py"
-version = "0.1.0"
+version.workspace = true
 edition.workspace = true
 authors.workspace = true
 description = "Python bindings for vectorless"
 license.workspace = true
 repository.workspace = true
+homepage.workspace = true
 
 [lib]
 name = "vectorless"
@@ -15,4 +16,7 @@ crate-type = ["cdylib"]
 pyo3 = { workspace = true }
 pyo3-async-runtimes = { workspace = true }
 tokio = { version = "1", features = ["rt-multi-thread"] }
-vectorless = { path = "../rust" }
+vectorless = { path = "../vectorless" }
+
+[lints]
+workspace = true
diff --git a/python/src/answer.rs b/vectorless-core/vectorless-py/src/answer.rs
similarity index 97%
rename from python/src/answer.rs
rename to vectorless-core/vectorless-py/src/answer.rs
index d36af66a..d1c9eba0 100644
--- a/python/src/answer.rs
+++ b/vectorless-core/vectorless-py/src/answer.rs
@@ -91,7 +91,7 @@ pub struct PyReasoningTrace {
 }
 
 /// A single step in the reasoning trace.
-#[pyclass(name = "TraceStep")] +#[pyclass(name = "TraceStep", skip_from_py_object)] #[derive(Clone)] pub struct PyTraceStep { #[pyo3(get)] diff --git a/python/src/config.rs b/vectorless-core/vectorless-py/src/config.rs similarity index 100% rename from python/src/config.rs rename to vectorless-core/vectorless-py/src/config.rs diff --git a/python/src/document.rs b/vectorless-core/vectorless-py/src/document.rs similarity index 100% rename from python/src/document.rs rename to vectorless-core/vectorless-py/src/document.rs diff --git a/python/src/engine.rs b/vectorless-core/vectorless-py/src/engine.rs similarity index 100% rename from python/src/engine.rs rename to vectorless-core/vectorless-py/src/engine.rs diff --git a/python/src/error.rs b/vectorless-core/vectorless-py/src/error.rs similarity index 100% rename from python/src/error.rs rename to vectorless-core/vectorless-py/src/error.rs diff --git a/python/src/graph.rs b/vectorless-core/vectorless-py/src/graph.rs similarity index 100% rename from python/src/graph.rs rename to vectorless-core/vectorless-py/src/graph.rs diff --git a/python/src/lib.rs b/vectorless-core/vectorless-py/src/lib.rs similarity index 100% rename from python/src/lib.rs rename to vectorless-core/vectorless-py/src/lib.rs diff --git a/python/src/metrics.rs b/vectorless-core/vectorless-py/src/metrics.rs similarity index 100% rename from python/src/metrics.rs rename to vectorless-core/vectorless-py/src/metrics.rs diff --git a/rust/Cargo.toml b/vectorless-core/vectorless/Cargo.toml similarity index 98% rename from rust/Cargo.toml rename to vectorless-core/vectorless/Cargo.toml index 723e46f7..70c06ab3 100644 --- a/rust/Cargo.toml +++ b/vectorless-core/vectorless/Cargo.toml @@ -10,7 +10,7 @@ homepage.workspace = true documentation = "https://docs.rs/vectorless" keywords = ["rag", "document", "retrieval", "indexing", "llm"] categories = ["text-processing", "data-structures", "algorithms"] -readme = "../README.md" +readme = "../../README.md" exclude = ["docs/", "examples/", ".*"] [dependencies] diff --git a/rust/examples/deep_retrieval.rs b/vectorless-core/vectorless/examples/deep_retrieval.rs similarity index 100% rename from rust/examples/deep_retrieval.rs rename to vectorless-core/vectorless/examples/deep_retrieval.rs diff --git a/rust/examples/events.rs b/vectorless-core/vectorless/examples/events.rs similarity index 100% rename from rust/examples/events.rs rename to vectorless-core/vectorless/examples/events.rs diff --git a/rust/examples/flow.rs b/vectorless-core/vectorless/examples/flow.rs similarity index 100% rename from rust/examples/flow.rs rename to vectorless-core/vectorless/examples/flow.rs diff --git a/rust/examples/graph.rs b/vectorless-core/vectorless/examples/graph.rs similarity index 100% rename from rust/examples/graph.rs rename to vectorless-core/vectorless/examples/graph.rs diff --git a/rust/examples/index_directory.rs b/vectorless-core/vectorless/examples/index_directory.rs similarity index 100% rename from rust/examples/index_directory.rs rename to vectorless-core/vectorless/examples/index_directory.rs diff --git a/rust/examples/index_incremental.rs b/vectorless-core/vectorless/examples/index_incremental.rs similarity index 100% rename from rust/examples/index_incremental.rs rename to vectorless-core/vectorless/examples/index_incremental.rs diff --git a/rust/examples/index_pdf.rs b/vectorless-core/vectorless/examples/index_pdf.rs similarity index 100% rename from rust/examples/index_pdf.rs rename to vectorless-core/vectorless/examples/index_pdf.rs diff 
--git a/rust/examples/index_single.rs b/vectorless-core/vectorless/examples/index_single.rs similarity index 100% rename from rust/examples/index_single.rs rename to vectorless-core/vectorless/examples/index_single.rs diff --git a/rust/examples/indexing.rs b/vectorless-core/vectorless/examples/indexing.rs similarity index 100% rename from rust/examples/indexing.rs rename to vectorless-core/vectorless/examples/indexing.rs diff --git a/rust/examples/indexing_flow.rs b/vectorless-core/vectorless/examples/indexing_flow.rs similarity index 100% rename from rust/examples/indexing_flow.rs rename to vectorless-core/vectorless/examples/indexing_flow.rs diff --git a/rust/examples/query.rs b/vectorless-core/vectorless/examples/query.rs similarity index 100% rename from rust/examples/query.rs rename to vectorless-core/vectorless/examples/query.rs diff --git a/rust/examples/single_doc_challenge.rs b/vectorless-core/vectorless/examples/single_doc_challenge.rs similarity index 100% rename from rust/examples/single_doc_challenge.rs rename to vectorless-core/vectorless/examples/single_doc_challenge.rs diff --git a/rust/src/agent/command.rs b/vectorless-core/vectorless/src/agent/command.rs similarity index 100% rename from rust/src/agent/command.rs rename to vectorless-core/vectorless/src/agent/command.rs diff --git a/rust/src/agent/config.rs b/vectorless-core/vectorless/src/agent/config.rs similarity index 100% rename from rust/src/agent/config.rs rename to vectorless-core/vectorless/src/agent/config.rs diff --git a/rust/src/agent/context.rs b/vectorless-core/vectorless/src/agent/context.rs similarity index 100% rename from rust/src/agent/context.rs rename to vectorless-core/vectorless/src/agent/context.rs diff --git a/rust/src/agent/events.rs b/vectorless-core/vectorless/src/agent/events.rs similarity index 100% rename from rust/src/agent/events.rs rename to vectorless-core/vectorless/src/agent/events.rs diff --git a/rust/src/agent/mod.rs b/vectorless-core/vectorless/src/agent/mod.rs similarity index 100% rename from rust/src/agent/mod.rs rename to vectorless-core/vectorless/src/agent/mod.rs diff --git a/rust/src/agent/orchestrator/analyze.rs b/vectorless-core/vectorless/src/agent/orchestrator/analyze.rs similarity index 100% rename from rust/src/agent/orchestrator/analyze.rs rename to vectorless-core/vectorless/src/agent/orchestrator/analyze.rs diff --git a/rust/src/agent/orchestrator/dispatch.rs b/vectorless-core/vectorless/src/agent/orchestrator/dispatch.rs similarity index 100% rename from rust/src/agent/orchestrator/dispatch.rs rename to vectorless-core/vectorless/src/agent/orchestrator/dispatch.rs diff --git a/rust/src/agent/orchestrator/evaluate.rs b/vectorless-core/vectorless/src/agent/orchestrator/evaluate.rs similarity index 100% rename from rust/src/agent/orchestrator/evaluate.rs rename to vectorless-core/vectorless/src/agent/orchestrator/evaluate.rs diff --git a/rust/src/agent/orchestrator/mod.rs b/vectorless-core/vectorless/src/agent/orchestrator/mod.rs similarity index 100% rename from rust/src/agent/orchestrator/mod.rs rename to vectorless-core/vectorless/src/agent/orchestrator/mod.rs diff --git a/rust/src/agent/orchestrator/replan.rs b/vectorless-core/vectorless/src/agent/orchestrator/replan.rs similarity index 100% rename from rust/src/agent/orchestrator/replan.rs rename to vectorless-core/vectorless/src/agent/orchestrator/replan.rs diff --git a/rust/src/agent/orchestrator/supervisor.rs b/vectorless-core/vectorless/src/agent/orchestrator/supervisor.rs similarity index 100% rename from 
rust/src/agent/orchestrator/supervisor.rs rename to vectorless-core/vectorless/src/agent/orchestrator/supervisor.rs diff --git a/rust/src/agent/prompts.rs b/vectorless-core/vectorless/src/agent/prompts.rs similarity index 100% rename from rust/src/agent/prompts.rs rename to vectorless-core/vectorless/src/agent/prompts.rs diff --git a/rust/src/agent/state.rs b/vectorless-core/vectorless/src/agent/state.rs similarity index 100% rename from rust/src/agent/state.rs rename to vectorless-core/vectorless/src/agent/state.rs diff --git a/rust/src/agent/tools/common.rs b/vectorless-core/vectorless/src/agent/tools/common.rs similarity index 100% rename from rust/src/agent/tools/common.rs rename to vectorless-core/vectorless/src/agent/tools/common.rs diff --git a/rust/src/agent/tools/mod.rs b/vectorless-core/vectorless/src/agent/tools/mod.rs similarity index 100% rename from rust/src/agent/tools/mod.rs rename to vectorless-core/vectorless/src/agent/tools/mod.rs diff --git a/rust/src/agent/tools/orchestrator.rs b/vectorless-core/vectorless/src/agent/tools/orchestrator.rs similarity index 100% rename from rust/src/agent/tools/orchestrator.rs rename to vectorless-core/vectorless/src/agent/tools/orchestrator.rs diff --git a/rust/src/agent/tools/worker/cat.rs b/vectorless-core/vectorless/src/agent/tools/worker/cat.rs similarity index 100% rename from rust/src/agent/tools/worker/cat.rs rename to vectorless-core/vectorless/src/agent/tools/worker/cat.rs diff --git a/rust/src/agent/tools/worker/cd.rs b/vectorless-core/vectorless/src/agent/tools/worker/cd.rs similarity index 100% rename from rust/src/agent/tools/worker/cd.rs rename to vectorless-core/vectorless/src/agent/tools/worker/cd.rs diff --git a/rust/src/agent/tools/worker/find.rs b/vectorless-core/vectorless/src/agent/tools/worker/find.rs similarity index 100% rename from rust/src/agent/tools/worker/find.rs rename to vectorless-core/vectorless/src/agent/tools/worker/find.rs diff --git a/rust/src/agent/tools/worker/grep.rs b/vectorless-core/vectorless/src/agent/tools/worker/grep.rs similarity index 100% rename from rust/src/agent/tools/worker/grep.rs rename to vectorless-core/vectorless/src/agent/tools/worker/grep.rs diff --git a/rust/src/agent/tools/worker/head.rs b/vectorless-core/vectorless/src/agent/tools/worker/head.rs similarity index 100% rename from rust/src/agent/tools/worker/head.rs rename to vectorless-core/vectorless/src/agent/tools/worker/head.rs diff --git a/rust/src/agent/tools/worker/ls.rs b/vectorless-core/vectorless/src/agent/tools/worker/ls.rs similarity index 100% rename from rust/src/agent/tools/worker/ls.rs rename to vectorless-core/vectorless/src/agent/tools/worker/ls.rs diff --git a/rust/src/agent/tools/worker/mod.rs b/vectorless-core/vectorless/src/agent/tools/worker/mod.rs similarity index 100% rename from rust/src/agent/tools/worker/mod.rs rename to vectorless-core/vectorless/src/agent/tools/worker/mod.rs diff --git a/rust/src/agent/tools/worker/pwd.rs b/vectorless-core/vectorless/src/agent/tools/worker/pwd.rs similarity index 100% rename from rust/src/agent/tools/worker/pwd.rs rename to vectorless-core/vectorless/src/agent/tools/worker/pwd.rs diff --git a/rust/src/agent/tools/worker/wc.rs b/vectorless-core/vectorless/src/agent/tools/worker/wc.rs similarity index 100% rename from rust/src/agent/tools/worker/wc.rs rename to vectorless-core/vectorless/src/agent/tools/worker/wc.rs diff --git a/rust/src/agent/worker/execute.rs b/vectorless-core/vectorless/src/agent/worker/execute.rs similarity index 100% rename from 
rust/src/agent/worker/execute.rs rename to vectorless-core/vectorless/src/agent/worker/execute.rs diff --git a/rust/src/agent/worker/format.rs b/vectorless-core/vectorless/src/agent/worker/format.rs similarity index 100% rename from rust/src/agent/worker/format.rs rename to vectorless-core/vectorless/src/agent/worker/format.rs diff --git a/rust/src/agent/worker/mod.rs b/vectorless-core/vectorless/src/agent/worker/mod.rs similarity index 100% rename from rust/src/agent/worker/mod.rs rename to vectorless-core/vectorless/src/agent/worker/mod.rs diff --git a/rust/src/agent/worker/navigation.rs b/vectorless-core/vectorless/src/agent/worker/navigation.rs similarity index 100% rename from rust/src/agent/worker/navigation.rs rename to vectorless-core/vectorless/src/agent/worker/navigation.rs diff --git a/rust/src/agent/worker/planning.rs b/vectorless-core/vectorless/src/agent/worker/planning.rs similarity index 100% rename from rust/src/agent/worker/planning.rs rename to vectorless-core/vectorless/src/agent/worker/planning.rs diff --git a/rust/src/client/builder.rs b/vectorless-core/vectorless/src/client/builder.rs similarity index 100% rename from rust/src/client/builder.rs rename to vectorless-core/vectorless/src/client/builder.rs diff --git a/rust/src/client/engine.rs b/vectorless-core/vectorless/src/client/engine.rs similarity index 100% rename from rust/src/client/engine.rs rename to vectorless-core/vectorless/src/client/engine.rs diff --git a/rust/src/client/index_context.rs b/vectorless-core/vectorless/src/client/index_context.rs similarity index 100% rename from rust/src/client/index_context.rs rename to vectorless-core/vectorless/src/client/index_context.rs diff --git a/rust/src/client/indexed_document.rs b/vectorless-core/vectorless/src/client/indexed_document.rs similarity index 100% rename from rust/src/client/indexed_document.rs rename to vectorless-core/vectorless/src/client/indexed_document.rs diff --git a/rust/src/client/indexer.rs b/vectorless-core/vectorless/src/client/indexer.rs similarity index 100% rename from rust/src/client/indexer.rs rename to vectorless-core/vectorless/src/client/indexer.rs diff --git a/rust/src/client/mod.rs b/vectorless-core/vectorless/src/client/mod.rs similarity index 100% rename from rust/src/client/mod.rs rename to vectorless-core/vectorless/src/client/mod.rs diff --git a/rust/src/client/query_context.rs b/vectorless-core/vectorless/src/client/query_context.rs similarity index 100% rename from rust/src/client/query_context.rs rename to vectorless-core/vectorless/src/client/query_context.rs diff --git a/rust/src/client/retriever.rs b/vectorless-core/vectorless/src/client/retriever.rs similarity index 100% rename from rust/src/client/retriever.rs rename to vectorless-core/vectorless/src/client/retriever.rs diff --git a/rust/src/client/test_support.rs b/vectorless-core/vectorless/src/client/test_support.rs similarity index 100% rename from rust/src/client/test_support.rs rename to vectorless-core/vectorless/src/client/test_support.rs diff --git a/rust/src/client/types.rs b/vectorless-core/vectorless/src/client/types.rs similarity index 100% rename from rust/src/client/types.rs rename to vectorless-core/vectorless/src/client/types.rs diff --git a/rust/src/client/workspace.rs b/vectorless-core/vectorless/src/client/workspace.rs similarity index 100% rename from rust/src/client/workspace.rs rename to vectorless-core/vectorless/src/client/workspace.rs diff --git a/rust/src/config/mod.rs b/vectorless-core/vectorless/src/config/mod.rs similarity index 100% 
rename from rust/src/config/mod.rs rename to vectorless-core/vectorless/src/config/mod.rs diff --git a/rust/src/config/types/indexer.rs b/vectorless-core/vectorless/src/config/types/indexer.rs similarity index 100% rename from rust/src/config/types/indexer.rs rename to vectorless-core/vectorless/src/config/types/indexer.rs diff --git a/rust/src/config/types/llm_pool.rs b/vectorless-core/vectorless/src/config/types/llm_pool.rs similarity index 100% rename from rust/src/config/types/llm_pool.rs rename to vectorless-core/vectorless/src/config/types/llm_pool.rs diff --git a/rust/src/config/types/metrics.rs b/vectorless-core/vectorless/src/config/types/metrics.rs similarity index 100% rename from rust/src/config/types/metrics.rs rename to vectorless-core/vectorless/src/config/types/metrics.rs diff --git a/rust/src/config/types/mod.rs b/vectorless-core/vectorless/src/config/types/mod.rs similarity index 100% rename from rust/src/config/types/mod.rs rename to vectorless-core/vectorless/src/config/types/mod.rs diff --git a/rust/src/config/types/retrieval.rs b/vectorless-core/vectorless/src/config/types/retrieval.rs similarity index 100% rename from rust/src/config/types/retrieval.rs rename to vectorless-core/vectorless/src/config/types/retrieval.rs diff --git a/rust/src/config/types/storage.rs b/vectorless-core/vectorless/src/config/types/storage.rs similarity index 100% rename from rust/src/config/types/storage.rs rename to vectorless-core/vectorless/src/config/types/storage.rs diff --git a/rust/src/config/validator.rs b/vectorless-core/vectorless/src/config/validator.rs similarity index 100% rename from rust/src/config/validator.rs rename to vectorless-core/vectorless/src/config/validator.rs diff --git a/rust/src/document/mod.rs b/vectorless-core/vectorless/src/document/mod.rs similarity index 100% rename from rust/src/document/mod.rs rename to vectorless-core/vectorless/src/document/mod.rs diff --git a/rust/src/document/navigation.rs b/vectorless-core/vectorless/src/document/navigation.rs similarity index 100% rename from rust/src/document/navigation.rs rename to vectorless-core/vectorless/src/document/navigation.rs diff --git a/rust/src/document/node.rs b/vectorless-core/vectorless/src/document/node.rs similarity index 100% rename from rust/src/document/node.rs rename to vectorless-core/vectorless/src/document/node.rs diff --git a/rust/src/document/reasoning.rs b/vectorless-core/vectorless/src/document/reasoning.rs similarity index 100% rename from rust/src/document/reasoning.rs rename to vectorless-core/vectorless/src/document/reasoning.rs diff --git a/rust/src/document/reference.rs b/vectorless-core/vectorless/src/document/reference.rs similarity index 100% rename from rust/src/document/reference.rs rename to vectorless-core/vectorless/src/document/reference.rs diff --git a/rust/src/document/serde_helpers.rs b/vectorless-core/vectorless/src/document/serde_helpers.rs similarity index 100% rename from rust/src/document/serde_helpers.rs rename to vectorless-core/vectorless/src/document/serde_helpers.rs diff --git a/rust/src/document/structure.rs b/vectorless-core/vectorless/src/document/structure.rs similarity index 100% rename from rust/src/document/structure.rs rename to vectorless-core/vectorless/src/document/structure.rs diff --git a/rust/src/document/toc.rs b/vectorless-core/vectorless/src/document/toc.rs similarity index 100% rename from rust/src/document/toc.rs rename to vectorless-core/vectorless/src/document/toc.rs diff --git a/rust/src/document/tree.rs 
b/vectorless-core/vectorless/src/document/tree.rs similarity index 100% rename from rust/src/document/tree.rs rename to vectorless-core/vectorless/src/document/tree.rs diff --git a/rust/src/document/understanding.rs b/vectorless-core/vectorless/src/document/understanding.rs similarity index 100% rename from rust/src/document/understanding.rs rename to vectorless-core/vectorless/src/document/understanding.rs diff --git a/rust/src/error.rs b/vectorless-core/vectorless/src/error.rs similarity index 100% rename from rust/src/error.rs rename to vectorless-core/vectorless/src/error.rs diff --git a/rust/src/events/emitter.rs b/vectorless-core/vectorless/src/events/emitter.rs similarity index 100% rename from rust/src/events/emitter.rs rename to vectorless-core/vectorless/src/events/emitter.rs diff --git a/rust/src/events/mod.rs b/vectorless-core/vectorless/src/events/mod.rs similarity index 100% rename from rust/src/events/mod.rs rename to vectorless-core/vectorless/src/events/mod.rs diff --git a/rust/src/events/types.rs b/vectorless-core/vectorless/src/events/types.rs similarity index 100% rename from rust/src/events/types.rs rename to vectorless-core/vectorless/src/events/types.rs diff --git a/rust/src/graph/builder.rs b/vectorless-core/vectorless/src/graph/builder.rs similarity index 100% rename from rust/src/graph/builder.rs rename to vectorless-core/vectorless/src/graph/builder.rs diff --git a/rust/src/graph/config.rs b/vectorless-core/vectorless/src/graph/config.rs similarity index 100% rename from rust/src/graph/config.rs rename to vectorless-core/vectorless/src/graph/config.rs diff --git a/rust/src/graph/mod.rs b/vectorless-core/vectorless/src/graph/mod.rs similarity index 100% rename from rust/src/graph/mod.rs rename to vectorless-core/vectorless/src/graph/mod.rs diff --git a/rust/src/graph/types.rs b/vectorless-core/vectorless/src/graph/types.rs similarity index 100% rename from rust/src/graph/types.rs rename to vectorless-core/vectorless/src/graph/types.rs diff --git a/rust/src/index/config.rs b/vectorless-core/vectorless/src/index/config.rs similarity index 100% rename from rust/src/index/config.rs rename to vectorless-core/vectorless/src/index/config.rs diff --git a/rust/src/index/incremental/detector.rs b/vectorless-core/vectorless/src/index/incremental/detector.rs similarity index 100% rename from rust/src/index/incremental/detector.rs rename to vectorless-core/vectorless/src/index/incremental/detector.rs diff --git a/rust/src/index/incremental/mod.rs b/vectorless-core/vectorless/src/index/incremental/mod.rs similarity index 100% rename from rust/src/index/incremental/mod.rs rename to vectorless-core/vectorless/src/index/incremental/mod.rs diff --git a/rust/src/index/incremental/resolver.rs b/vectorless-core/vectorless/src/index/incremental/resolver.rs similarity index 100% rename from rust/src/index/incremental/resolver.rs rename to vectorless-core/vectorless/src/index/incremental/resolver.rs diff --git a/rust/src/index/incremental/updater.rs b/vectorless-core/vectorless/src/index/incremental/updater.rs similarity index 100% rename from rust/src/index/incremental/updater.rs rename to vectorless-core/vectorless/src/index/incremental/updater.rs diff --git a/rust/src/index/mod.rs b/vectorless-core/vectorless/src/index/mod.rs similarity index 100% rename from rust/src/index/mod.rs rename to vectorless-core/vectorless/src/index/mod.rs diff --git a/rust/src/index/parse/markdown/config.rs b/vectorless-core/vectorless/src/index/parse/markdown/config.rs similarity index 100% rename from 
rust/src/index/parse/markdown/config.rs rename to vectorless-core/vectorless/src/index/parse/markdown/config.rs diff --git a/rust/src/index/parse/markdown/frontmatter.rs b/vectorless-core/vectorless/src/index/parse/markdown/frontmatter.rs similarity index 100% rename from rust/src/index/parse/markdown/frontmatter.rs rename to vectorless-core/vectorless/src/index/parse/markdown/frontmatter.rs diff --git a/rust/src/index/parse/markdown/mod.rs b/vectorless-core/vectorless/src/index/parse/markdown/mod.rs similarity index 100% rename from rust/src/index/parse/markdown/mod.rs rename to vectorless-core/vectorless/src/index/parse/markdown/mod.rs diff --git a/rust/src/index/parse/markdown/parser.rs b/vectorless-core/vectorless/src/index/parse/markdown/parser.rs similarity index 100% rename from rust/src/index/parse/markdown/parser.rs rename to vectorless-core/vectorless/src/index/parse/markdown/parser.rs diff --git a/rust/src/index/parse/mod.rs b/vectorless-core/vectorless/src/index/parse/mod.rs similarity index 100% rename from rust/src/index/parse/mod.rs rename to vectorless-core/vectorless/src/index/parse/mod.rs diff --git a/rust/src/index/parse/pdf/mod.rs b/vectorless-core/vectorless/src/index/parse/pdf/mod.rs similarity index 100% rename from rust/src/index/parse/pdf/mod.rs rename to vectorless-core/vectorless/src/index/parse/pdf/mod.rs diff --git a/rust/src/index/parse/pdf/parser.rs b/vectorless-core/vectorless/src/index/parse/pdf/parser.rs similarity index 100% rename from rust/src/index/parse/pdf/parser.rs rename to vectorless-core/vectorless/src/index/parse/pdf/parser.rs diff --git a/rust/src/index/parse/pdf/types.rs b/vectorless-core/vectorless/src/index/parse/pdf/types.rs similarity index 100% rename from rust/src/index/parse/pdf/types.rs rename to vectorless-core/vectorless/src/index/parse/pdf/types.rs diff --git a/rust/src/index/parse/toc/assigner.rs b/vectorless-core/vectorless/src/index/parse/toc/assigner.rs similarity index 100% rename from rust/src/index/parse/toc/assigner.rs rename to vectorless-core/vectorless/src/index/parse/toc/assigner.rs diff --git a/rust/src/index/parse/toc/detector.rs b/vectorless-core/vectorless/src/index/parse/toc/detector.rs similarity index 100% rename from rust/src/index/parse/toc/detector.rs rename to vectorless-core/vectorless/src/index/parse/toc/detector.rs diff --git a/rust/src/index/parse/toc/mod.rs b/vectorless-core/vectorless/src/index/parse/toc/mod.rs similarity index 100% rename from rust/src/index/parse/toc/mod.rs rename to vectorless-core/vectorless/src/index/parse/toc/mod.rs diff --git a/rust/src/index/parse/toc/parser.rs b/vectorless-core/vectorless/src/index/parse/toc/parser.rs similarity index 100% rename from rust/src/index/parse/toc/parser.rs rename to vectorless-core/vectorless/src/index/parse/toc/parser.rs diff --git a/rust/src/index/parse/toc/processor.rs b/vectorless-core/vectorless/src/index/parse/toc/processor.rs similarity index 100% rename from rust/src/index/parse/toc/processor.rs rename to vectorless-core/vectorless/src/index/parse/toc/processor.rs diff --git a/rust/src/index/parse/toc/repairer.rs b/vectorless-core/vectorless/src/index/parse/toc/repairer.rs similarity index 100% rename from rust/src/index/parse/toc/repairer.rs rename to vectorless-core/vectorless/src/index/parse/toc/repairer.rs diff --git a/rust/src/index/parse/toc/structure_extractor.rs b/vectorless-core/vectorless/src/index/parse/toc/structure_extractor.rs similarity index 100% rename from rust/src/index/parse/toc/structure_extractor.rs rename to 
vectorless-core/vectorless/src/index/parse/toc/structure_extractor.rs diff --git a/rust/src/index/parse/toc/types.rs b/vectorless-core/vectorless/src/index/parse/toc/types.rs similarity index 100% rename from rust/src/index/parse/toc/types.rs rename to vectorless-core/vectorless/src/index/parse/toc/types.rs diff --git a/rust/src/index/parse/toc/verifier.rs b/vectorless-core/vectorless/src/index/parse/toc/verifier.rs similarity index 100% rename from rust/src/index/parse/toc/verifier.rs rename to vectorless-core/vectorless/src/index/parse/toc/verifier.rs diff --git a/rust/src/index/parse/types.rs b/vectorless-core/vectorless/src/index/parse/types.rs similarity index 100% rename from rust/src/index/parse/types.rs rename to vectorless-core/vectorless/src/index/parse/types.rs diff --git a/rust/src/index/pipeline/checkpoint.rs b/vectorless-core/vectorless/src/index/pipeline/checkpoint.rs similarity index 100% rename from rust/src/index/pipeline/checkpoint.rs rename to vectorless-core/vectorless/src/index/pipeline/checkpoint.rs diff --git a/rust/src/index/pipeline/context.rs b/vectorless-core/vectorless/src/index/pipeline/context.rs similarity index 100% rename from rust/src/index/pipeline/context.rs rename to vectorless-core/vectorless/src/index/pipeline/context.rs diff --git a/rust/src/index/pipeline/executor.rs b/vectorless-core/vectorless/src/index/pipeline/executor.rs similarity index 100% rename from rust/src/index/pipeline/executor.rs rename to vectorless-core/vectorless/src/index/pipeline/executor.rs diff --git a/rust/src/index/pipeline/metrics.rs b/vectorless-core/vectorless/src/index/pipeline/metrics.rs similarity index 100% rename from rust/src/index/pipeline/metrics.rs rename to vectorless-core/vectorless/src/index/pipeline/metrics.rs diff --git a/rust/src/index/pipeline/mod.rs b/vectorless-core/vectorless/src/index/pipeline/mod.rs similarity index 100% rename from rust/src/index/pipeline/mod.rs rename to vectorless-core/vectorless/src/index/pipeline/mod.rs diff --git a/rust/src/index/pipeline/orchestrator.rs b/vectorless-core/vectorless/src/index/pipeline/orchestrator.rs similarity index 100% rename from rust/src/index/pipeline/orchestrator.rs rename to vectorless-core/vectorless/src/index/pipeline/orchestrator.rs diff --git a/rust/src/index/pipeline/policy.rs b/vectorless-core/vectorless/src/index/pipeline/policy.rs similarity index 100% rename from rust/src/index/pipeline/policy.rs rename to vectorless-core/vectorless/src/index/pipeline/policy.rs diff --git a/rust/src/index/stages/build.rs b/vectorless-core/vectorless/src/index/stages/build.rs similarity index 100% rename from rust/src/index/stages/build.rs rename to vectorless-core/vectorless/src/index/stages/build.rs diff --git a/rust/src/index/stages/enhance.rs b/vectorless-core/vectorless/src/index/stages/enhance.rs similarity index 100% rename from rust/src/index/stages/enhance.rs rename to vectorless-core/vectorless/src/index/stages/enhance.rs diff --git a/rust/src/index/stages/enrich.rs b/vectorless-core/vectorless/src/index/stages/enrich.rs similarity index 100% rename from rust/src/index/stages/enrich.rs rename to vectorless-core/vectorless/src/index/stages/enrich.rs diff --git a/rust/src/index/stages/mod.rs b/vectorless-core/vectorless/src/index/stages/mod.rs similarity index 100% rename from rust/src/index/stages/mod.rs rename to vectorless-core/vectorless/src/index/stages/mod.rs diff --git a/rust/src/index/stages/navigation.rs b/vectorless-core/vectorless/src/index/stages/navigation.rs similarity index 100% rename from 
rust/src/index/stages/navigation.rs rename to vectorless-core/vectorless/src/index/stages/navigation.rs diff --git a/rust/src/index/stages/optimize.rs b/vectorless-core/vectorless/src/index/stages/optimize.rs similarity index 100% rename from rust/src/index/stages/optimize.rs rename to vectorless-core/vectorless/src/index/stages/optimize.rs diff --git a/rust/src/index/stages/parse.rs b/vectorless-core/vectorless/src/index/stages/parse.rs similarity index 100% rename from rust/src/index/stages/parse.rs rename to vectorless-core/vectorless/src/index/stages/parse.rs diff --git a/rust/src/index/stages/reasoning.rs b/vectorless-core/vectorless/src/index/stages/reasoning.rs similarity index 100% rename from rust/src/index/stages/reasoning.rs rename to vectorless-core/vectorless/src/index/stages/reasoning.rs diff --git a/rust/src/index/stages/split.rs b/vectorless-core/vectorless/src/index/stages/split.rs similarity index 100% rename from rust/src/index/stages/split.rs rename to vectorless-core/vectorless/src/index/stages/split.rs diff --git a/rust/src/index/stages/validate.rs b/vectorless-core/vectorless/src/index/stages/validate.rs similarity index 100% rename from rust/src/index/stages/validate.rs rename to vectorless-core/vectorless/src/index/stages/validate.rs diff --git a/rust/src/index/summary/full.rs b/vectorless-core/vectorless/src/index/summary/full.rs similarity index 100% rename from rust/src/index/summary/full.rs rename to vectorless-core/vectorless/src/index/summary/full.rs diff --git a/rust/src/index/summary/lazy.rs b/vectorless-core/vectorless/src/index/summary/lazy.rs similarity index 100% rename from rust/src/index/summary/lazy.rs rename to vectorless-core/vectorless/src/index/summary/lazy.rs diff --git a/rust/src/index/summary/mod.rs b/vectorless-core/vectorless/src/index/summary/mod.rs similarity index 100% rename from rust/src/index/summary/mod.rs rename to vectorless-core/vectorless/src/index/summary/mod.rs diff --git a/rust/src/index/summary/selective.rs b/vectorless-core/vectorless/src/index/summary/selective.rs similarity index 100% rename from rust/src/index/summary/selective.rs rename to vectorless-core/vectorless/src/index/summary/selective.rs diff --git a/rust/src/index/summary/strategy.rs b/vectorless-core/vectorless/src/index/summary/strategy.rs similarity index 100% rename from rust/src/index/summary/strategy.rs rename to vectorless-core/vectorless/src/index/summary/strategy.rs diff --git a/rust/src/lib.rs b/vectorless-core/vectorless/src/lib.rs similarity index 100% rename from rust/src/lib.rs rename to vectorless-core/vectorless/src/lib.rs diff --git a/rust/src/llm/client.rs b/vectorless-core/vectorless/src/llm/client.rs similarity index 100% rename from rust/src/llm/client.rs rename to vectorless-core/vectorless/src/llm/client.rs diff --git a/rust/src/llm/config.rs b/vectorless-core/vectorless/src/llm/config.rs similarity index 100% rename from rust/src/llm/config.rs rename to vectorless-core/vectorless/src/llm/config.rs diff --git a/rust/src/llm/error.rs b/vectorless-core/vectorless/src/llm/error.rs similarity index 100% rename from rust/src/llm/error.rs rename to vectorless-core/vectorless/src/llm/error.rs diff --git a/rust/src/llm/executor.rs b/vectorless-core/vectorless/src/llm/executor.rs similarity index 100% rename from rust/src/llm/executor.rs rename to vectorless-core/vectorless/src/llm/executor.rs diff --git a/rust/src/llm/fallback.rs b/vectorless-core/vectorless/src/llm/fallback.rs similarity index 100% rename from rust/src/llm/fallback.rs rename to 
vectorless-core/vectorless/src/llm/fallback.rs diff --git a/rust/src/llm/memo/mod.rs b/vectorless-core/vectorless/src/llm/memo/mod.rs similarity index 100% rename from rust/src/llm/memo/mod.rs rename to vectorless-core/vectorless/src/llm/memo/mod.rs diff --git a/rust/src/llm/memo/store.rs b/vectorless-core/vectorless/src/llm/memo/store.rs similarity index 100% rename from rust/src/llm/memo/store.rs rename to vectorless-core/vectorless/src/llm/memo/store.rs diff --git a/rust/src/llm/memo/types.rs b/vectorless-core/vectorless/src/llm/memo/types.rs similarity index 100% rename from rust/src/llm/memo/types.rs rename to vectorless-core/vectorless/src/llm/memo/types.rs diff --git a/rust/src/llm/mod.rs b/vectorless-core/vectorless/src/llm/mod.rs similarity index 100% rename from rust/src/llm/mod.rs rename to vectorless-core/vectorless/src/llm/mod.rs diff --git a/rust/src/llm/pool.rs b/vectorless-core/vectorless/src/llm/pool.rs similarity index 100% rename from rust/src/llm/pool.rs rename to vectorless-core/vectorless/src/llm/pool.rs diff --git a/rust/src/llm/throttle.rs b/vectorless-core/vectorless/src/llm/throttle.rs similarity index 100% rename from rust/src/llm/throttle.rs rename to vectorless-core/vectorless/src/llm/throttle.rs diff --git a/rust/src/metrics/hub.rs b/vectorless-core/vectorless/src/metrics/hub.rs similarity index 100% rename from rust/src/metrics/hub.rs rename to vectorless-core/vectorless/src/metrics/hub.rs diff --git a/rust/src/metrics/index.rs b/vectorless-core/vectorless/src/metrics/index.rs similarity index 100% rename from rust/src/metrics/index.rs rename to vectorless-core/vectorless/src/metrics/index.rs diff --git a/rust/src/metrics/llm.rs b/vectorless-core/vectorless/src/metrics/llm.rs similarity index 100% rename from rust/src/metrics/llm.rs rename to vectorless-core/vectorless/src/metrics/llm.rs diff --git a/rust/src/metrics/mod.rs b/vectorless-core/vectorless/src/metrics/mod.rs similarity index 100% rename from rust/src/metrics/mod.rs rename to vectorless-core/vectorless/src/metrics/mod.rs diff --git a/rust/src/metrics/retrieval.rs b/vectorless-core/vectorless/src/metrics/retrieval.rs similarity index 100% rename from rust/src/metrics/retrieval.rs rename to vectorless-core/vectorless/src/metrics/retrieval.rs diff --git a/rust/src/query/mod.rs b/vectorless-core/vectorless/src/query/mod.rs similarity index 100% rename from rust/src/query/mod.rs rename to vectorless-core/vectorless/src/query/mod.rs diff --git a/rust/src/query/types.rs b/vectorless-core/vectorless/src/query/types.rs similarity index 100% rename from rust/src/query/types.rs rename to vectorless-core/vectorless/src/query/types.rs diff --git a/rust/src/query/understand.rs b/vectorless-core/vectorless/src/query/understand.rs similarity index 100% rename from rust/src/query/understand.rs rename to vectorless-core/vectorless/src/query/understand.rs diff --git a/rust/src/rerank/dedup.rs b/vectorless-core/vectorless/src/rerank/dedup.rs similarity index 100% rename from rust/src/rerank/dedup.rs rename to vectorless-core/vectorless/src/rerank/dedup.rs diff --git a/rust/src/rerank/mod.rs b/vectorless-core/vectorless/src/rerank/mod.rs similarity index 100% rename from rust/src/rerank/mod.rs rename to vectorless-core/vectorless/src/rerank/mod.rs diff --git a/rust/src/rerank/types.rs b/vectorless-core/vectorless/src/rerank/types.rs similarity index 100% rename from rust/src/rerank/types.rs rename to vectorless-core/vectorless/src/rerank/types.rs diff --git a/rust/src/retrieval/cache.rs 
b/vectorless-core/vectorless/src/retrieval/cache.rs similarity index 100% rename from rust/src/retrieval/cache.rs rename to vectorless-core/vectorless/src/retrieval/cache.rs diff --git a/rust/src/retrieval/dispatcher.rs b/vectorless-core/vectorless/src/retrieval/dispatcher.rs similarity index 100% rename from rust/src/retrieval/dispatcher.rs rename to vectorless-core/vectorless/src/retrieval/dispatcher.rs diff --git a/rust/src/retrieval/mod.rs b/vectorless-core/vectorless/src/retrieval/mod.rs similarity index 100% rename from rust/src/retrieval/mod.rs rename to vectorless-core/vectorless/src/retrieval/mod.rs diff --git a/rust/src/retrieval/postprocessor.rs b/vectorless-core/vectorless/src/retrieval/postprocessor.rs similarity index 100% rename from rust/src/retrieval/postprocessor.rs rename to vectorless-core/vectorless/src/retrieval/postprocessor.rs diff --git a/rust/src/retrieval/stream.rs b/vectorless-core/vectorless/src/retrieval/stream.rs similarity index 100% rename from rust/src/retrieval/stream.rs rename to vectorless-core/vectorless/src/retrieval/stream.rs diff --git a/rust/src/retrieval/types.rs b/vectorless-core/vectorless/src/retrieval/types.rs similarity index 100% rename from rust/src/retrieval/types.rs rename to vectorless-core/vectorless/src/retrieval/types.rs diff --git a/rust/src/scoring/bm25.rs b/vectorless-core/vectorless/src/scoring/bm25.rs similarity index 100% rename from rust/src/scoring/bm25.rs rename to vectorless-core/vectorless/src/scoring/bm25.rs diff --git a/rust/src/scoring/mod.rs b/vectorless-core/vectorless/src/scoring/mod.rs similarity index 100% rename from rust/src/scoring/mod.rs rename to vectorless-core/vectorless/src/scoring/mod.rs diff --git a/rust/src/storage/backend/file.rs b/vectorless-core/vectorless/src/storage/backend/file.rs similarity index 100% rename from rust/src/storage/backend/file.rs rename to vectorless-core/vectorless/src/storage/backend/file.rs diff --git a/rust/src/storage/backend/memory.rs b/vectorless-core/vectorless/src/storage/backend/memory.rs similarity index 100% rename from rust/src/storage/backend/memory.rs rename to vectorless-core/vectorless/src/storage/backend/memory.rs diff --git a/rust/src/storage/backend/mod.rs b/vectorless-core/vectorless/src/storage/backend/mod.rs similarity index 100% rename from rust/src/storage/backend/mod.rs rename to vectorless-core/vectorless/src/storage/backend/mod.rs diff --git a/rust/src/storage/backend/trait_def.rs b/vectorless-core/vectorless/src/storage/backend/trait_def.rs similarity index 100% rename from rust/src/storage/backend/trait_def.rs rename to vectorless-core/vectorless/src/storage/backend/trait_def.rs diff --git a/rust/src/storage/cache.rs b/vectorless-core/vectorless/src/storage/cache.rs similarity index 100% rename from rust/src/storage/cache.rs rename to vectorless-core/vectorless/src/storage/cache.rs diff --git a/rust/src/storage/codec.rs b/vectorless-core/vectorless/src/storage/codec.rs similarity index 100% rename from rust/src/storage/codec.rs rename to vectorless-core/vectorless/src/storage/codec.rs diff --git a/rust/src/storage/lock.rs b/vectorless-core/vectorless/src/storage/lock.rs similarity index 100% rename from rust/src/storage/lock.rs rename to vectorless-core/vectorless/src/storage/lock.rs diff --git a/rust/src/storage/migration.rs b/vectorless-core/vectorless/src/storage/migration.rs similarity index 100% rename from rust/src/storage/migration.rs rename to vectorless-core/vectorless/src/storage/migration.rs diff --git a/rust/src/storage/mod.rs 
b/vectorless-core/vectorless/src/storage/mod.rs
similarity index 100%
rename from rust/src/storage/mod.rs
rename to vectorless-core/vectorless/src/storage/mod.rs
diff --git a/rust/src/storage/persistence.rs b/vectorless-core/vectorless/src/storage/persistence.rs
similarity index 100%
rename from rust/src/storage/persistence.rs
rename to vectorless-core/vectorless/src/storage/persistence.rs
diff --git a/rust/src/storage/workspace.rs b/vectorless-core/vectorless/src/storage/workspace.rs
similarity index 100%
rename from rust/src/storage/workspace.rs
rename to vectorless-core/vectorless/src/storage/workspace.rs
diff --git a/rust/src/utils/fingerprint.rs b/vectorless-core/vectorless/src/utils/fingerprint.rs
similarity index 100%
rename from rust/src/utils/fingerprint.rs
rename to vectorless-core/vectorless/src/utils/fingerprint.rs
diff --git a/rust/src/utils/mod.rs b/vectorless-core/vectorless/src/utils/mod.rs
similarity index 100%
rename from rust/src/utils/mod.rs
rename to vectorless-core/vectorless/src/utils/mod.rs
diff --git a/rust/src/utils/token.rs b/vectorless-core/vectorless/src/utils/token.rs
similarity index 100%
rename from rust/src/utils/token.rs
rename to vectorless-core/vectorless/src/utils/token.rs
diff --git a/rust/src/utils/validation.rs b/vectorless-core/vectorless/src/utils/validation.rs
similarity index 100%
rename from rust/src/utils/validation.rs
rename to vectorless-core/vectorless/src/utils/validation.rs
diff --git a/vectorless/README.md b/vectorless/README.md
new file mode 100644
index 00000000..d03b76a6
--- /dev/null
+++ b/vectorless/README.md
@@ -0,0 +1,165 @@
+# Vectorless Python SDK
+
+Python bindings for [vectorless](https://github.com/vectorlessflow/vectorless) — a Document Understanding Engine for AI.
+
+## Installation
+
+```bash
+pip install vectorless
+```
+
+## Quick Start
+
+```python
+import asyncio
+from vectorless import Engine
+
+async def main():
+    # Create engine — api_key and model are required
+    engine = Engine(
+        api_key="sk-...",
+        model="gpt-4o",
+    )
+
+    # Understand a document
+    doc = await engine.ingest("./report.pdf")
+    print(f"Understood: {doc.name} — {doc.summary}")
+
+    # Ask a question
+    answer = await engine.ask(
+        "What is the total revenue?",
+        doc_ids=[doc.doc_id],
+    )
+    print(f"Answer: {answer.content}")
+    print(f"Confidence: {answer.confidence:.2f}")
+    print(f"Evidence: {len(answer.evidence)} pieces")
+    print(f"Trace: {len(answer.trace.steps)} steps")
+
+    # List all understood documents
+    docs = await engine.list_documents()
+    for d in docs:
+        print(f" - {d.name} ({d.doc_id})")
+
+    # Forget a document
+    await engine.forget(doc.doc_id)
+
+asyncio.run(main())
+```
+
+## API Reference
+
+### Engine
+
+The main entry point. All methods are **async** except `metrics_report`.
+
+```python
+class Engine:
+    def __init__(
+        self,
+        api_key: str | None = None,
+        model: str | None = None,
+        endpoint: str | None = None,
+        config: Config | None = None,
+    ): ...
+
+    async def ingest(self, path: str) -> DocumentInfo: ...
+    async def ask(self, question: str, doc_ids: list[str] | None = None) -> Answer: ...
+    async def forget(self, doc_id: str) -> None: ...
+    async def list_documents(self) -> list[DocumentInfo]: ...
+    async def exists(self, doc_id: str) -> bool: ...
+    async def clear(self) -> int: ...
+    async def get_graph(self) -> DocumentGraph | None: ...
+    def metrics_report(self) -> MetricsReport: ...
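+
+    # A minimal maintenance sketch (hypothetical doc_id; return semantics
+    # assumed from the signatures above). metrics_report() is the one
+    # synchronous method in this class; every other call must be awaited:
+    #
+    #     if await engine.exists(doc_id):
+    #         await engine.forget(doc_id)
+    #     removed = await engine.clear()    # int: number of documents cleared
+    #     report = engine.metrics_report()  # no await needed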
+```
+
+### DocumentInfo
+
+```python
+class DocumentInfo:
+    doc_id: str
+    name: str
+    format: str
+    summary: str
+    concepts: list[Concept]
+    section_count: int
+    page_count: int | None
+```
+
+### Answer
+
+```python
+class Answer:
+    content: str
+    evidence: list[Evidence]
+    confidence: float
+    trace: ReasoningTrace
+```
+
+### Evidence
+
+```python
+class Evidence:
+    content: str
+    source_path: str
+    doc_name: str
+    relevance: float
+```
+
+### ReasoningTrace
+
+```python
+class ReasoningTrace:
+    steps: list[TraceStep]
+```
+
+### TraceStep
+
+```python
+class TraceStep:
+    action: str
+    observation: str
+    round: int
+```
+
+### Concept
+
+```python
+class Concept:
+    name: str
+    summary: str
+    sections: list[str]
+```
+
+### VectorlessError
+
+```python
+class VectorlessError(Exception):
+    message: str
+    kind: str # "config", "parse", "not_found", "llm"
+```
+
+## Development
+
+### Building from source
+
+```bash
+# Install maturin
+pip install maturin
+
+# Build and install (from project root)
+maturin develop
+
+# Run tests
+pytest
+```
+
+### Publishing to PyPI
+
+```bash
+maturin build --release
+maturin publish
+```
+
+## License
+
+Apache-2.0
diff --git a/vectorless/__init__.py b/vectorless/__init__.py
new file mode 100644
index 00000000..7b5f67a4
--- /dev/null
+++ b/vectorless/__init__.py
@@ -0,0 +1,67 @@
+"""
+Vectorless — Document Understanding Engine for AI.
+
+Quick Start:
+    from vectorless import Engine
+
+    engine = Engine(api_key="sk-...", model="gpt-4o")
+    doc = await engine.ingest("./report.pdf")
+    answer = await engine.ask("What is the revenue?", doc_ids=[doc.doc_id])
+    print(answer.content)
+"""
+
+# Core Engine and types from Rust
+from vectorless._vectorless import (
+    Answer,
+    Concept,
+    Config,
+    DocumentGraph,
+    DocumentInfo,
+    EdgeEvidence,
+    Engine,
+    Evidence,
+    GraphEdge,
+    MetricsReport,
+    ReasoningTrace,
+    TraceStep,
+    VectorlessError,
+    WeightedKeyword,
+    __version__,
+)
+
+# Configuration utilities
+from vectorless.config import EngineConfig, load_config, load_config_from_env, load_config_from_file
+
+# Events
+from vectorless.events import EventEmitter
+
+__all__ = [
+    # Primary API
+    "Engine",
+    # Configuration
+    "EngineConfig",
+    "load_config",
+    "load_config_from_env",
+    "load_config_from_file",
+    "Config",
+    # Events
+    "EventEmitter",
+    # Document types
+    "DocumentInfo",
+    "Concept",
+    # Answer types
+    "Answer",
+    "Evidence",
+    "ReasoningTrace",
+    "TraceStep",
+    # Graph types
+    "DocumentGraph",
+    "GraphEdge",
+    "EdgeEvidence",
+    "WeightedKeyword",
+    # Metrics
+    "MetricsReport",
+    # Error and version
+    "VectorlessError",
+    "__version__",
+]
diff --git a/python/vectorless/_async_utils.py b/vectorless/_async_utils.py
similarity index 100%
rename from python/vectorless/_async_utils.py
rename to vectorless/_async_utils.py
diff --git a/python/vectorless/_compat/__init__.py b/vectorless/_compat/__init__.py
similarity index 100%
rename from python/vectorless/_compat/__init__.py
rename to vectorless/_compat/__init__.py
diff --git a/python/vectorless/_compat/langchain.py b/vectorless/_compat/langchain.py
similarity index 100%
rename from python/vectorless/_compat/langchain.py
rename to vectorless/_compat/langchain.py
diff --git a/python/vectorless/_compat/llamaindex.py b/vectorless/_compat/llamaindex.py
similarity index 100%
rename from python/vectorless/_compat/llamaindex.py
rename to vectorless/_compat/llamaindex.py
diff --git a/vectorless/_core.py b/vectorless/_core.py
new file mode 100644
index 00000000..53ea7ce5
--- /dev/null
+++ b/vectorless/_core.py
@@
-0,0 +1,48 @@ +"""Internal re-exports from the Rust PyO3 module. + +This module is NOT part of the public API. Use ``vectorless.Engine`` instead. +""" + +from vectorless._vectorless import ( + Answer, + Concept, + Config, + DocumentGraph, + DocumentGraphEdge, + DocumentGraphNode, + DocumentInfo, + EdgeEvidence, + Engine, + Evidence, + GraphEdge, + LlmMetricsReport, + MetricsReport, + ReasoningTrace, + RetrievalMetricsReport, + TraceStep, + VectorlessError, + WeightedKeyword, + __version__, +) + +__all__ = [ + "Answer", + "Concept", + "Config", + "DocumentGraph", + "DocumentGraphEdge", + "DocumentGraphNode", + "DocumentInfo", + "EdgeEvidence", + "Engine", + "Evidence", + "GraphEdge", + "LlmMetricsReport", + "MetricsReport", + "ReasoningTrace", + "RetrievalMetricsReport", + "TraceStep", + "VectorlessError", + "WeightedKeyword", + "__version__", +] diff --git a/python/vectorless/cli/__init__.py b/vectorless/cli/__init__.py similarity index 100% rename from python/vectorless/cli/__init__.py rename to vectorless/cli/__init__.py diff --git a/python/vectorless/cli/commands/__init__.py b/vectorless/cli/commands/__init__.py similarity index 100% rename from python/vectorless/cli/commands/__init__.py rename to vectorless/cli/commands/__init__.py diff --git a/python/vectorless/cli/commands/add.py b/vectorless/cli/commands/add.py similarity index 100% rename from python/vectorless/cli/commands/add.py rename to vectorless/cli/commands/add.py diff --git a/python/vectorless/cli/commands/ask.py b/vectorless/cli/commands/ask.py similarity index 100% rename from python/vectorless/cli/commands/ask.py rename to vectorless/cli/commands/ask.py diff --git a/python/vectorless/cli/commands/config_cmd.py b/vectorless/cli/commands/config_cmd.py similarity index 100% rename from python/vectorless/cli/commands/config_cmd.py rename to vectorless/cli/commands/config_cmd.py diff --git a/python/vectorless/cli/commands/info.py b/vectorless/cli/commands/info.py similarity index 100% rename from python/vectorless/cli/commands/info.py rename to vectorless/cli/commands/info.py diff --git a/python/vectorless/cli/commands/init.py b/vectorless/cli/commands/init.py similarity index 100% rename from python/vectorless/cli/commands/init.py rename to vectorless/cli/commands/init.py diff --git a/python/vectorless/cli/commands/list_cmd.py b/vectorless/cli/commands/list_cmd.py similarity index 100% rename from python/vectorless/cli/commands/list_cmd.py rename to vectorless/cli/commands/list_cmd.py diff --git a/python/vectorless/cli/commands/query.py b/vectorless/cli/commands/query.py similarity index 100% rename from python/vectorless/cli/commands/query.py rename to vectorless/cli/commands/query.py diff --git a/python/vectorless/cli/commands/remove.py b/vectorless/cli/commands/remove.py similarity index 100% rename from python/vectorless/cli/commands/remove.py rename to vectorless/cli/commands/remove.py diff --git a/python/vectorless/cli/commands/stats.py b/vectorless/cli/commands/stats.py similarity index 100% rename from python/vectorless/cli/commands/stats.py rename to vectorless/cli/commands/stats.py diff --git a/python/vectorless/cli/commands/tree.py b/vectorless/cli/commands/tree.py similarity index 100% rename from python/vectorless/cli/commands/tree.py rename to vectorless/cli/commands/tree.py diff --git a/python/vectorless/cli/main.py b/vectorless/cli/main.py similarity index 100% rename from python/vectorless/cli/main.py rename to vectorless/cli/main.py diff --git a/python/vectorless/cli/output.py b/vectorless/cli/output.py 
similarity index 100% rename from python/vectorless/cli/output.py rename to vectorless/cli/output.py diff --git a/python/vectorless/config/__init__.py b/vectorless/config/__init__.py similarity index 100% rename from python/vectorless/config/__init__.py rename to vectorless/config/__init__.py diff --git a/python/vectorless/config/loading.py b/vectorless/config/loading.py similarity index 100% rename from python/vectorless/config/loading.py rename to vectorless/config/loading.py diff --git a/python/vectorless/config/models.py b/vectorless/config/models.py similarity index 100% rename from python/vectorless/config/models.py rename to vectorless/config/models.py diff --git a/python/vectorless/events.py b/vectorless/events.py similarity index 100% rename from python/vectorless/events.py rename to vectorless/events.py diff --git a/python/vectorless/jupyter.py b/vectorless/jupyter.py similarity index 100% rename from python/vectorless/jupyter.py rename to vectorless/jupyter.py diff --git a/python/vectorless/py.typed b/vectorless/py.typed similarity index 100% rename from python/vectorless/py.typed rename to vectorless/py.typed diff --git a/python/vectorless/session.py b/vectorless/session.py similarity index 100% rename from python/vectorless/session.py rename to vectorless/session.py diff --git a/python/vectorless/streaming.py b/vectorless/streaming.py similarity index 100% rename from python/vectorless/streaming.py rename to vectorless/streaming.py diff --git a/python/vectorless/sync_session.py b/vectorless/sync_session.py similarity index 100% rename from python/vectorless/sync_session.py rename to vectorless/sync_session.py diff --git a/python/vectorless/types/__init__.py b/vectorless/types/__init__.py similarity index 100% rename from python/vectorless/types/__init__.py rename to vectorless/types/__init__.py diff --git a/python/vectorless/types/graph.py b/vectorless/types/graph.py similarity index 100% rename from python/vectorless/types/graph.py rename to vectorless/types/graph.py diff --git a/python/vectorless/types/results.py b/vectorless/types/results.py similarity index 100% rename from python/vectorless/types/results.py rename to vectorless/types/results.py From e106de0b363d1baabe6b2fcf95e5b448ecf03edc Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 22 Apr 2026 22:38:57 +0800 Subject: [PATCH 03/28] docs(CLAUDE.md): update development workflow paths Update the development workflow documentation to reflect new directory structure: - Change feature implementation path from crates/vectorless/src/ to vectorless-core/vectorless/src/ - Update Python bindings path from crates/vectorless-py/src/lib.rs to vectorless-core/vectorless-py/src/lib.rs - Update Python SDK path from python/vectorless/ to vectorless/ --- CLAUDE.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index c326169e..4ade4cd1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -147,8 +147,8 @@ When uncertain whether an operation is safe, **default to asking user confirmati ## Common Development Workflow -1. **Adding features**: Implement in appropriate `crates/vectorless/src/` module, add tests +1. **Adding features**: Implement in appropriate `vectorless-core/vectorless/src/` module, add tests 2. **Fixing bugs**: Add failing test case first, fix and ensure tests pass -3. **Python bindings**: Update `crates/vectorless-py/src/lib.rs` (PyO3) when Rust APIs change -4. **Python SDK**: Update `python/vectorless/` when API surface changes +3. 
**Python bindings**: Update `vectorless-core/vectorless-py/src/lib.rs` (PyO3) when Rust APIs change +4. **Python SDK**: Update `vectorless/` when API surface changes 5. **Committing code**: Use semantic commit messages, format: `type(scope): description` From 2e8a47bee8b3e6455a248e31944cbbccb1db85a0 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 22 Apr 2026 22:47:07 +0800 Subject: [PATCH 04/28] feat(pyproject.toml): update project configuration and dependencies - Update description from "Reasoning-based Document Engine" to "Document Understanding Engine for AI" - Bump minimum Python requirement from 3.9 to 3.10 - Update author name from "vectorless developers" to "Vectorless" - Remove Python 3.9 classifier and add clarifying comment for tomli dependency - Update keywords to better reflect document understanding focus - Update mypy and ruff target versions to Python 3.10 - Add uv tool configuration with dev dependencies --- pyproject.toml | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index eeca3a79..347bab18 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,19 +5,18 @@ build-backend = "maturin" [project] name = "vectorless" version = "0.1.11" -description = "Reasoning-based Document Engine" +description = "Document Understanding Engine for AI" readme = "README.md" -requires-python = ">=3.9" +requires-python = ">=3.10" license = { text = "Apache-2.0" } authors = [ - { name = "vectorless developers", email = "beautifularea@gmail.com" } + { name = "Vectorless", email = "beautifularea@gmail.com" } ] classifiers = [ "Development Status :: 4 - Beta", "Intended Audience :: Developers", "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", @@ -26,12 +25,12 @@ classifiers = [ "Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Text Processing :: Linguistic", ] -keywords = ["rag", "document", "retrieval", "llm", "document-intelligence"] +keywords = ["document", "understanding", "ai", "reasoning", "document-intelligence"] dependencies = [ "pydantic>=2.0", "click>=8.0", - "tomli>=2.0; python_version < '3.11'", + "tomli>=2.0; python_version < '3.11'", # 3.10 only, 3.11+ has tomllib built-in ] [project.optional-dependencies] @@ -77,13 +76,13 @@ asyncio_mode = "auto" testpaths = ["tests"] [tool.mypy] -python_version = "3.9" +python_version = "3.10" warn_return_any = true warn_unused_configs = true [tool.ruff] line-length = 100 -target-version = "py39" +target-version = "py310" [tool.ruff.lint] select = ["E", "F", "W", "I", "N", "UP", "B"] @@ -93,3 +92,12 @@ quote-style = "double" indent-style = "space" skip-magic-trailing-comma = false line-ending = "lf" + +[tool.uv] +dev-dependencies = [ + "pytest>=7.0", + "pytest-asyncio>=0.21", + "mypy>=1.0", + "ruff>=0.4", + "maturin>=1.5", +] From 63354a646b111864cbd27284de9308a93f64f178 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 22 Apr 2026 22:48:36 +0800 Subject: [PATCH 05/28] refactor(core): remove exclusion rules and example files - Remove exclude directive from Cargo.toml that was excluding docs/, examples/, and .* patterns - Delete all example files including deep_retrieval.rs, events.rs, flow.rs, graph.rs, index_directory.rs, index_incremental.rs, and index_pdf.rs --- vectorless-core/vectorless/Cargo.toml | 1 - 
.../vectorless/examples/deep_retrieval.rs | 221 ------------------ vectorless-core/vectorless/examples/events.rs | 155 ------------ vectorless-core/vectorless/examples/flow.rs | 143 ------------ vectorless-core/vectorless/examples/graph.rs | 106 --------- .../vectorless/examples/index_directory.rs | 114 --------- .../vectorless/examples/index_incremental.rs | 122 ---------- .../vectorless/examples/index_pdf.rs | 118 ---------- .../vectorless/examples/index_single.rs | 103 -------- .../vectorless/examples/indexing.rs | 59 ----- .../vectorless/examples/indexing_flow.rs | 173 -------------- vectorless-core/vectorless/examples/query.rs | 83 ------- 12 files changed, 1398 deletions(-) delete mode 100644 vectorless-core/vectorless/examples/deep_retrieval.rs delete mode 100644 vectorless-core/vectorless/examples/events.rs delete mode 100644 vectorless-core/vectorless/examples/flow.rs delete mode 100644 vectorless-core/vectorless/examples/graph.rs delete mode 100644 vectorless-core/vectorless/examples/index_directory.rs delete mode 100644 vectorless-core/vectorless/examples/index_incremental.rs delete mode 100644 vectorless-core/vectorless/examples/index_pdf.rs delete mode 100644 vectorless-core/vectorless/examples/index_single.rs delete mode 100644 vectorless-core/vectorless/examples/indexing.rs delete mode 100644 vectorless-core/vectorless/examples/indexing_flow.rs delete mode 100644 vectorless-core/vectorless/examples/query.rs diff --git a/vectorless-core/vectorless/Cargo.toml b/vectorless-core/vectorless/Cargo.toml index 70c06ab3..a8de0234 100644 --- a/vectorless-core/vectorless/Cargo.toml +++ b/vectorless-core/vectorless/Cargo.toml @@ -11,7 +11,6 @@ documentation = "https://docs.rs/vectorless" keywords = ["rag", "document", "retrieval", "indexing", "llm"] categories = ["text-processing", "data-structures", "algorithms"] readme = "../../README.md" -exclude = ["docs/", "examples/", ".*"] [dependencies] # Async runtime diff --git a/vectorless-core/vectorless/examples/deep_retrieval.rs b/vectorless-core/vectorless/examples/deep_retrieval.rs deleted file mode 100644 index 44877543..00000000 --- a/vectorless-core/vectorless/examples/deep_retrieval.rs +++ /dev/null @@ -1,221 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Complex retrieval example — forces SubAgent navigation, not fast path. -//! -//! This example indexes a document where the answer to a tricky question -//! is NOT directly accessible via keyword lookup in the ReasoningIndex. -//! The SubAgent must navigate through multiple levels, collect evidence -//! from different sections, and synthesize a cross-referenced answer. -//! -//! # Usage -//! -//! ```bash -//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o \ -//! LLM_ENDPOINT=https://api.openai.com/v1 cargo run --example deep_retrieval -//! ``` - -use vectorless::{EngineBuilder, IndexContext, IndexOptions, QueryContext}; - -/// A compact but deeply nested document about a fictional space mission. 
-/// -/// Structure (4 levels deep): -/// -/// Mission Atlas Report -/// ├── Launch Operations -/// │ ├── Vehicle Configuration -/// │ │ ├── Stage 1 Parameters -/// │ │ └── Stage 2 Parameters -/// │ └── Countdown Timeline -/// │ ├── T-48h to T-12h -/// │ └── T-12h to T-0 -/// ├── Orbital Mechanics -/// │ ├── Transfer Orbit Analysis -/// │ │ ├── Delta-V Budget -/// │ │ └── Gravity Assist Profile -/// │ └── Station-Keeping Schedule -/// ├── Payload Operations -/// │ ├── Satellite Alpha Deployment -/// │ │ ├── Separation Sequence -/// │ │ └── Solar Panel Extension -/// │ ├── Satellite Beta Deployment -/// │ │ ├── Antenna Calibration -/// │ │ └── Frequency Assignment -/// │ └── Re-entry Capsule -/// │ ├── Heat Shield Specs -/// │ └── Landing Zone Selection -/// └── Mission Anomalies -/// ├── Day 3 Communication Blackout -/// └── Day 17 Thruster Misfire -const MISSION_REPORT: &str = r#" -# Mission Atlas Report - -## Launch Operations - -### Vehicle Configuration - -#### Stage 1 Parameters - -The first stage utilizes a LOX/RP-1 bipropellant configuration with a sea-level thrust of 7,600 kN. Burn time is 162 seconds with a specific impulse of 282 seconds. The propellant mass fraction is 0.894. Stage separation occurs at T+162s at an altitude of approximately 68 km with a velocity of 2,340 m/s. - -#### Stage 2 Parameters - -The second stage employs a single RL-10C engine using LOX/LH2 with a vacuum thrust of 110 kN. Burn duration extends to 370 seconds with a specific impulse of 448 seconds. The stage carries 20,800 kg of propellant. Engine ignition occurs at T+165s following a 3-second coast phase after stage separation. - -### Countdown Timeline - -#### T-48h to T-12h - -During the early countdown phase, the launch team completed propellant loading verification and navigation system alignment. A minor issue was detected in the Stage 2 fuel temperature sensor at T-36h, which was resolved by recalibrating the sensor threshold from 20.1K to 19.8K. Weather briefing at T-24h indicated 85% probability of favorable conditions with upper-level winds at 45 knots. - -#### T-12h to T-0 - -Final countdown proceeded nominally. Auxiliary power unit start occurred at T-4h. Range safety checks completed at T-2h. Go/No-Go poll at T-30 minutes was unanimous across all stations. Terminal count at T-9 minutes was initiated with no holds. Liftoff occurred at 14:37:22 UTC on March 15, achieving the targeted azimuth of 72.3 degrees. - -## Orbital Mechanics - -### Transfer Orbit Analysis - -#### Delta-V Budget - -The total mission delta-V budget is 4,832 m/s, allocated as follows: ascent to parking orbit 1,890 m/s, trans-target injection 2,210 m/s, orbit insertion 510 m/s, and station-keeping reserve 222 m/s. The parking orbit was achieved at 185 km circular with an inclination of 28.5 degrees. The gravity assist maneuver at Titan contributed an effective delta-V savings of 380 m/s, which allowed the mission to carry 15% more payload than the original baseline design. - -#### Gravity Assist Profile - -The Titan flyby occurred on Day 47 at a closest approach distance of 950 km. The bending angle was 38.7 degrees with an asymptotic velocity of 4.2 km/s relative to Titan. This maneuver shifted the spacecraft trajectory from a Hohmann-type direct transfer to a gravity-assisted trajectory, reducing total flight time from 187 days to 143 days. Post-flyby trajectory correction burn of 3.4 m/s was executed on Day 49 to refine the approach corridor. 
- -### Station-Keeping Schedule - -Station-keeping maneuvers are planned at 14-day intervals with a delta-V allocation of 2.8 m/s per maneuver. The first three maneuvers consumed 2.6, 3.1, and 2.5 m/s respectively, staying within the allocated budget. Orbital decay rate without correction is approximately 0.3 km per 14-day cycle due to atmospheric drag at the operational altitude of 420 km. - -## Payload Operations - -### Satellite Alpha Deployment - -#### Separation Sequence - -Satellite Alpha separated from the payload adapter at T+3h42m using a Marman band release mechanism. Separation velocity was 0.45 m/s with a tip-off rate of 0.02 deg/s. Initial telemetry confirmed solar panel deployment signal at T+3h58m. First ground station contact occurred over Svalbard at T+4h12m confirming nominal spacecraft health. - -#### Solar Panel Extension - -Both solar arrays deployed fully within 8 minutes of the deployment command. Array 1 generated 4,280 W and Array 2 generated 4,310 W, for a combined initial output of 8,590 W against a design target of 8,400 W. The arrays use triple-junction GaAs cells with a beginning-of-life efficiency of 30.7%. Power margin at end-of-life (7 years) is projected at 6,950 W, still above the minimum operational requirement of 6,200 W. - -### Satellite Beta Deployment - -#### Antenna Calibration - -Satellite Beta's high-gain antenna completed calibration in three phases. Phase 1 (boresight alignment) achieved a pointing accuracy of 0.023 degrees against a requirement of 0.05 degrees. Phase 2 (pattern verification) confirmed the sidelobe levels were within specification at -28 dB below main beam. Phase 3 (EIRP verification) measured 52.4 dBW against a required minimum of 51.0 dBW. - -#### Frequency Assignment - -Satellite Beta operates in Ka-band with a downlink center frequency of 20.185 GHz and an uplink at 30.050 GHz. The allocated bandwidth is 500 MHz per polarization, supporting 24 transponders with 36 MHz spacing. Cross-polarization isolation exceeds 30 dB. The link budget supports a minimum data rate of 1.2 Gbps under rain fade conditions corresponding to 99.7% availability in the primary coverage zone. - -### Re-entry Capsule - -#### Heat Shield Specs - -The re-entry capsule thermal protection system uses a phenolic-impregnated carbon ablator (PICA-X) with a thickness of 33 mm on the forebody. Maximum predicted heat flux is 185 W/cm² at the stagnation point during re-entry at 11.2 km/s. The heat shield mass is 86 kg, representing 12% of the total capsule dry mass of 717 kg. The backshell uses a lighter SLA-561V material with a 15 mm thickness rated for 45 W/cm². - -#### Landing Zone Selection - -The primary landing zone is located at 34.2°N 108.7°W in the White Sands Proving Ground, with an elliptical footprint of 15 km × 8 km at the 3-sigma confidence level. Wind drift analysis based on 10 years of upper-atmosphere data predicts a mean offset of 3.2 km northeast. The backup landing zone is at 32.5°N 106.5°W near Fort Bliss, activated only if the primary zone weather violates the surface wind constraint of 12 m/s. - -## Mission Anomalies - -### Day 3 Communication Blackout - -At approximately 07:14 UTC on Day 3, the primary S-band transponder experienced an unexpected carrier loss lasting 4 hours and 22 minutes. Root cause analysis identified a single-event upset (SEU) in the command decoder ASIC, caused by a high-energy proton from the inner Van Allen belt. The transponder recovered autonomously after a watchdog timer reset. 
No command sequences were lost as the onboard computer continued executing the stored timeline. Redundant transponder was not activated because the primary recovery occurred before the 6-hour switchover threshold. - -### Day 17 Thruster Misfire - -At 14:52 UTC on Day 17, thruster cluster B3 (one of eight attitude control clusters) fired for 2.3 seconds during a period when no thruster activity was commanded. This produced an unplanned delta-V of 0.08 m/s and an attitude perturbation of 0.3 degrees. Telemetry analysis revealed a stuck valve in the B3 propellant control valve assembly, likely caused by particulate contamination during ground processing. The flight software detected the anomaly within 500 ms and inhibited the B3 cluster. Subsequent attitude corrections were performed using the remaining seven clusters. The propellant impact of the lost cluster reduces the available delta-V for the mission by approximately 4 m/s, leaving a remaining reserve of 218 m/s against a requirement of 150 m/s. -"#; - -/// Questions designed to force deep navigation: -/// -/// 1. "How much delta-V budget remains after the Day 17 thruster failure, -/// and is it enough to complete the mission?" -/// → Requires finding delta-V budget (Orbital Mechanics > Transfer > Delta-V Budget) -/// AND the anomaly impact (Mission Anomalies > Day 17 Thruster Misfire) -/// AND cross-referencing reserve vs requirement. -/// -/// 2. "What is the total power generation margin at end-of-life for Satellite Alpha -/// compared to its minimum operational requirement?" -/// → Requires finding EOL power (Payload > Alpha > Solar Panel Extension) -/// and computing the difference. -/// -/// 3. "If the B3 thruster cluster had failed during the Day 3 blackout instead of -/// Day 17, would the spacecraft have been able to recover attitude without -/// ground intervention?" -/// → Requires combining anomaly timelines and thruster redundancy info. -const QUERIES: &[&str] = &["where can i find the backup landing zone"]; - -#[tokio::main] -async fn main() -> vectorless::Result<()> { - tracing_subscriber::fmt::init(); - - println!("=== Deep Retrieval Example ===\n"); - - let api_key = std::env::var("LLM_API_KEY").unwrap_or_else(|_| "sk-...".to_string()); - let model = std::env::var("LLM_MODEL").unwrap_or_else(|_| "gpt-4o".to_string()); - let endpoint = std::env::var("LLM_ENDPOINT").unwrap_or_else(|_| "https://api".to_string()); - - // Build engine - let engine = EngineBuilder::new() - .with_key(&api_key) - .with_model(&model) - .with_endpoint(&endpoint) - .build() - .await - .map_err(|e| vectorless::Error::Config(e.to_string()))?; - - // Index document - let temp_dir = tempfile::tempdir()?; - let md_path = temp_dir.path().join("mission_atlas.md"); - tokio::fs::write(&md_path, MISSION_REPORT).await?; - - let index_result = engine - .index(IndexContext::from_path(&md_path).with_options(IndexOptions::new().with_summaries())) - .await?; - let doc_id = index_result.doc_id().unwrap().to_string(); - println!("Indexed document: {}\n", doc_id); - - // Query - for query in QUERIES { - println!("Q: \"{}\"", query); - - match engine - .query( - QueryContext::new(*query) - .with_doc_ids(vec![doc_id.clone()]) - .with_force_analysis(true), - ) - .await - { - Ok(result) => { - if let Some(item) = result.single() { - if item.content.is_empty() { - println!(" No relevant content found"); - } else { - println!(" A:"); - for line in item.content.lines().take(10) { - println!(" {}", line); - } - if item.content.lines().count() > 10 { - println!( - " ... 
({} more lines)", - item.content.lines().count() - 10 - ); - } - } - } - } - Err(e) => println!(" Error: {}", e), - } - println!(); - } - - // Cleanup - engine.remove(&doc_id).await?; - Ok(()) -} diff --git a/vectorless-core/vectorless/examples/events.rs b/vectorless-core/vectorless/examples/events.rs deleted file mode 100644 index 3db97706..00000000 --- a/vectorless-core/vectorless/examples/events.rs +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Event callbacks example. -//! -//! This example demonstrates the event system for: -//! - Monitoring indexing progress -//! - Tracking query execution -//! - Debugging retrieval behavior -//! -//! # Usage -//! -//! ```bash -//! # Using environment variables for LLM config: -//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o \ -//! LLM_ENDPOINT=https://api.openai.com/v1 cargo run --example events -//! -//! # Or with defaults (edit the code to set your key/endpoint): -//! cargo run --example events -//! ``` - -use std::sync::Arc; -use std::sync::atomic::{AtomicUsize, Ordering}; - -use vectorless::{EngineBuilder, IndexContext, QueryContext}; -use vectorless::{EventEmitter, IndexEvent, QueryEvent}; - -#[tokio::main] -async fn main() -> Result<(), Box> { - // Initialize tracing for debug output (set RUST_LOG=debug to see more) - tracing_subscriber::fmt::init(); - - println!("=== Event Callbacks Example ===\n"); - - // 1. Create event emitter with handlers - println!("Step 1: Setting up event handlers...\n"); - - let index_count = Arc::new(AtomicUsize::new(0)); - let query_count = Arc::new(AtomicUsize::new(0)); - let nodes_visited = Arc::new(AtomicUsize::new(0)); - - let index_count_clone = index_count.clone(); - let query_count_clone = query_count.clone(); - let nodes_visited_clone = nodes_visited.clone(); - - let events = EventEmitter::new() - // Index events - .on_index(move |e| match e { - IndexEvent::Started { path } => { - println!(" [INDEX] Started: {}", path); - } - IndexEvent::FormatDetected { format } => { - println!(" [INDEX] Format: {:?}", format); - } - IndexEvent::TreeBuilt { node_count } => { - println!(" [INDEX] Tree built: {} nodes", node_count); - } - IndexEvent::Complete { doc_id } => { - println!(" [INDEX] Complete: {}", &doc_id[..8]); - index_count_clone.fetch_add(1, Ordering::SeqCst); - } - IndexEvent::Error { message } => { - println!(" [INDEX] Error: {}", message); - } - _ => {} - }) - // Query events - .on_query(move |e| match e { - QueryEvent::Started { query } => { - println!(" [QUERY] Started: \"{}\"", query); - query_count_clone.fetch_add(1, Ordering::SeqCst); - } - QueryEvent::NodeVisited { title, score, .. } => { - println!(" [QUERY] Visited: \"{}\" (score: {:.2})", title, score); - nodes_visited_clone.fetch_add(1, Ordering::SeqCst); - } - QueryEvent::CandidateFound { node_id, score } => { - println!( - " [QUERY] Candidate: {} (score: {:.2})", - &node_id[..8], - score - ); - } - QueryEvent::Complete { - total_results, - confidence, - } => { - println!( - " [QUERY] Complete: {} results, confidence: {:.2}", - total_results, confidence - ); - } - QueryEvent::Error { message } => { - println!(" [QUERY] Error: {}", message); - } - _ => {} - }); - - println!(" ✓ Event handlers configured\n"); - - // Build engine with LLM configuration from environment or defaults. - // Adjust the defaults below to match your setup. 
- let api_key = std::env::var("LLM_API_KEY").unwrap_or_else(|_| "sk-...".to_string()); - let model = std::env::var("LLM_MODEL").unwrap_or_else(|_| "gpt-4o".to_string()); - let endpoint = - std::env::var("LLM_ENDPOINT").unwrap_or_else(|_| "https://api.openai.com/v1".to_string()); - - // 2. Create engine with events - println!("Step 2: Creating engine with event emitter..."); - let engine = EngineBuilder::new() - .with_key(&api_key) - .with_model(&model) - .with_endpoint(&endpoint) - .with_events(events) - .build() - .await?; - println!(" ✓ Engine created\n"); - - // 3. Index a document with events - println!("Step 3: Indexing document (with events)..."); - let result = engine - .index(IndexContext::from_path("../README.md")) - .await?; - let doc_id = result.doc_id().unwrap().to_string(); - println!(" ✓ Indexed: {doc_id}\n"); - - // 4. Query with events - println!("Step 4: Querying (with events)..."); - let result = engine - .query(QueryContext::new("What is vectorless?").with_doc_ids(vec![doc_id.clone()])) - .await?; - if let Some(item) = result.single() { - println!(" ✓ Found result ({} chars)", item.content.len()); - if !item.content.is_empty() { - let preview: String = item.content.chars().take(200).collect(); - println!(" Preview: {}...", preview); - } - } - - // 5. Stats - println!("\n--- Stats ---"); - println!( - " Documents indexed: {}", - index_count.load(Ordering::SeqCst) - ); - println!(" Queries executed: {}", query_count.load(Ordering::SeqCst)); - println!(" Nodes visited: {}", nodes_visited.load(Ordering::SeqCst)); - - // Cleanup - engine.remove(&doc_id).await?; - println!("\n Cleaned up"); - - println!("\n=== Done ==="); - Ok(()) -} diff --git a/vectorless-core/vectorless/examples/flow.rs b/vectorless-core/vectorless/examples/flow.rs deleted file mode 100644 index ce13d80b..00000000 --- a/vectorless-core/vectorless/examples/flow.rs +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Complete Markdown processing flow example. -//! -//! This example demonstrates the full pipeline: -//! 1. Create a Vectorless client -//! 2. Index a Markdown document -//! 3. Show document structure in JSON format -//! 4. Query the document -//! -//! # Usage -//! -//! ```bash -//! # Using environment variables for LLM config: -//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o \ -//! LLM_ENDPOINT=https://api.openai.com/v1 cargo run --example flow -//! -//! # Or with defaults (edit the code to set your key/endpoint): -//! cargo run --example flow -//! ``` - -use vectorless::{EngineBuilder, IndexContext, IndexOptions, QueryContext}; - -/// Sample markdown content for demonstration. -const SAMPLE_MARKDOWN: &str = r#" -# Vectorless Architecture Guide - -## Overview - -Vectorless is a reasoning-native document intelligence engine that transforms documents into hierarchical semantic trees. Unlike traditional RAG systems that rely on vector embeddings and similarity search, Vectorless uses LLM-powered tree navigation to retrieve relevant content through deep contextual understanding. - -The core idea is simple: structured documents already have inherent semantic relationships encoded in their headings, sections, and paragraphs. By preserving this structure as a navigable tree, an LLM can efficiently locate relevant information by following the document's own logical organization. - -## Architecture - -The system consists of three main components: an indexing pipeline, a storage layer, and a retrieval engine. 
The indexing pipeline parses documents into tree structures and generates summaries. The storage layer persists indexed documents to disk. The retrieval engine navigates the tree at query time using search algorithms guided by LLM decisions. - -### Indexing Pipeline - -The indexing pipeline processes documents through multiple stages: parsing, tree building, enhancement (LLM summary generation), and reasoning index construction. Each stage is independently configurable and can be enabled or disabled based on requirements. The pipeline supports incremental re-indexing with content fingerprinting to avoid redundant work when documents haven't changed. - -### Retrieval Engine - -The retrieval engine uses an agent-based architecture where an Orchestrator coordinates Workers that navigate the document tree using LLM-guided decisions (ls, cd, cat, find, grep). The Orchestrator evaluates progress after each step and can replan when results are insufficient. The engine is budget-aware, tracking token usage and making cost-conscious decisions about when to invoke the LLM versus using cheaper heuristic scoring. - -## Performance - -Under typical workloads, indexing a 50-page document takes approximately 10-30 seconds depending on LLM response latency and the complexity of the document structure. Query latency ranges from 200ms for simple keyword-matched queries to 3-5 seconds for complex multi-hop reasoning queries that require multiple LLM calls during tree navigation. - -The system is designed for accuracy over speed. By leveraging document structure and LLM reasoning, it achieves higher retrieval quality than vector-based approaches on structured documents like technical reports, legal contracts, and research papers. -"#; - -#[tokio::main] -async fn main() -> vectorless::Result<()> { - // Initialize tracing for debug output (set RUST_LOG=debug to see more) - tracing_subscriber::fmt::init(); - - println!("=== Vectorless Flow Example ===\n"); - - // Build engine with LLM configuration from environment or defaults. - // Adjust the defaults below to match your setup. - let api_key = std::env::var("LLM_API_KEY").unwrap_or_else(|_| "sk-...".to_string()); - let model = std::env::var("LLM_MODEL").unwrap_or_else(|_| "gpt-4o".to_string()); - let endpoint = std::env::var("LLM_ENDPOINT").unwrap_or_else(|_| "https://api".to_string()); - - // Step 1: Create a Vectorless client - println!("Step 1: Creating Vectorless client..."); - - let engine = EngineBuilder::new() - .with_key(&api_key) - .with_model(&model) - .with_endpoint(&endpoint) - .build() - .await - .map_err(|e| vectorless::Error::Config(e.to_string()))?; - - println!(" - Client created successfully"); - println!(); - - // Step 2: Index the sample Markdown document - println!("Step 2: Indexing Markdown document..."); - - let temp_dir = tempfile::tempdir()?; - let md_path = temp_dir.path().join("sample.md"); - tokio::fs::write(&md_path, SAMPLE_MARKDOWN).await?; - - let index_result = engine - .index(IndexContext::from_path(&md_path).with_options(IndexOptions::new().with_summaries())) - .await?; - let doc_id = index_result.doc_id().unwrap().to_string(); - - println!(" - Document indexed successfully"); - println!(" - Document ID: {}", doc_id); - println!(); - - // Step 3: List indexed documents - println!("Step 3: Indexed documents:"); - for doc in engine.list().await? 
{ - println!(" - {} ({})", doc.name, doc.id); - } - println!(); - - // Step 4: Query the document - println!("Step 4: Querying the document..."); - - let queries = vec!["What is the seconds for complex multi-hop?"]; - - for query in queries { - println!(" Query: \"{}\"", query); - - match engine - .query(QueryContext::new(query).with_doc_ids(vec![doc_id.clone()])) - .await - { - Ok(result) => { - if let Some(item) = result.single() { - if item.content.is_empty() { - println!(" - No relevant content found"); - } else { - println!(" - Found relevant content:"); - for line in item.content.lines() { - println!(" {}", line); - } - } - } else { - println!(" - No results"); - } - } - Err(e) => { - println!(" - Error: {}", e); - } - } - println!(); - } - - // Cleanup - // for doc in engine.list().await? { - // engine.remove(&doc.id).await?; - // } - - Ok(()) -} diff --git a/vectorless-core/vectorless/examples/graph.rs b/vectorless-core/vectorless/examples/graph.rs deleted file mode 100644 index 5fccd084..00000000 --- a/vectorless-core/vectorless/examples/graph.rs +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Document graph example for Vectorless. -//! -//! Demonstrates how to retrieve the cross-document relationship graph -//! after indexing. The graph is automatically rebuilt after each index call, -//! connecting documents that share keywords via Jaccard similarity. -//! -//! # Usage -//! -//! ```bash -//! # Using environment variables for LLM config: -//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o \ -//! cargo run --example graph -//! -//! # Or with defaults (edit the code to set your key/endpoint): -//! cargo run --example graph -//! ``` - -use vectorless::{EngineBuilder, IndexContext}; - -#[tokio::main] -async fn main() -> vectorless::Result<()> { - // Initialize tracing for debug output (set RUST_LOG=debug to see more) - tracing_subscriber::fmt::init(); - - println!("=== Document Graph Example ===\n"); - - // Build engine with LLM configuration from environment or defaults. - // Adjust the defaults below to match your setup. - let api_key = std::env::var("LLM_API_KEY").unwrap_or_else(|_| "sk-...".to_string()); - let model = std::env::var("LLM_MODEL").unwrap_or_else(|_| "gpt-4o".to_string()); - - // 1. Create engine - let engine = EngineBuilder::new() - .with_key(&api_key) - .with_model(&model) - .build() - .await - .map_err(|e: vectorless::BuildError| vectorless::Error::Config(e.to_string()))?; - - // 2. Index documents — graph is rebuilt automatically - let result = engine - .index(IndexContext::from_paths(&["../README.md", "../CLAUDE.md"])) - .await?; - - println!("Indexed {} document(s)", result.items.len()); - for item in &result.items { - println!(" - {} ({})", item.name, item.doc_id); - } - println!(); - - // 3. Get the document graph - match engine.get_graph().await? 
{ - Some(graph) => { - println!( - "Document graph: {} nodes, {} edges", - graph.node_count(), - graph.edge_count() - ); - - // Show document nodes - for doc_id in graph.doc_ids() { - if let Some(node) = graph.get_node(doc_id) { - println!( - " Node: {} — {} keyword(s), top: {:?}", - node.title, - node.top_keywords.len(), - node.top_keywords - .iter() - .take(3) - .map(|kw| &kw.keyword) - .collect::>() - ); - - // Show edges (connected documents) - let neighbors = graph.get_neighbors(doc_id); - if !neighbors.is_empty() { - for edge in neighbors { - println!( - " → {} (weight={:.2}, jaccard={:.2}, shared={})", - edge.target_doc_id, - edge.weight, - edge.evidence.keyword_jaccard, - edge.evidence.shared_keyword_count, - ); - } - } else { - println!(" (no connections)"); - } - } - } - } - None => println!("No graph available (no documents with reasoning index)"), - } - - // 4. Cleanup - let docs = engine.list().await?; - for doc in &docs { - engine.remove(&doc.id).await?; - } - - println!("\n=== Done ==="); - Ok(()) -} diff --git a/vectorless-core/vectorless/examples/index_directory.rs b/vectorless-core/vectorless/examples/index_directory.rs deleted file mode 100644 index 2696df99..00000000 --- a/vectorless-core/vectorless/examples/index_directory.rs +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Directory indexing example — recursively index all documents in a directory. -//! -//! ```bash -//! # Using environment variables for LLM config: -//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \ -//! LLM_ENDPOINT=http://localhost:4000/api/v1 \ -//! cargo run --example index_directory -- /path/to/docs -//! -//! # With recursive flag (default): -//! cargo run --example index_directory -- /path/to/docs --recursive -//! -//! # Non-recursive (top-level only): -//! cargo run --example index_directory -- /path/to/docs --no-recursive -//! 
``` - -use vectorless::{EngineBuilder, IndexContext}; - -#[tokio::main] -async fn main() -> vectorless::Result<()> { - tracing_subscriber::fmt::init(); - - // Parse CLI arguments - let args: Vec = std::env::args().collect(); - let dir = args.get(1).map(|s| s.as_str()).unwrap_or("./samples"); - let recursive = !args.iter().any(|a| a == "--no-recursive"); - - // Build engine - let api_key = std::env::var("LLM_API_KEY").unwrap_or_else(|_| "sk-or-v1-...".to_string()); - let model = - std::env::var("LLM_MODEL").unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string()); - let endpoint = std::env::var("LLM_ENDPOINT") - .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); - - let engine = EngineBuilder::new() - .with_key(&api_key) - .with_model(&model) - .with_endpoint(&endpoint) - .build() - .await - .map_err(|e| vectorless::Error::Config(e.to_string()))?; - - // Index directory - println!( - "{}indexing: {}", - if recursive { "Recursively " } else { "" }, - dir - ); - let ctx = IndexContext::from_dir(dir, recursive); - - if ctx.is_empty() { - println!("No supported files found in: {}", dir); - return Ok(()); - } - - println!("Found {} file(s) to index", ctx.len()); - - let result = engine.index(ctx).await?; - - println!("\nIndexed {} document(s):", result.items.len()); - for item in &result.items { - println!(" {} ({})", item.name, item.doc_id); - if let Some(metrics) = &item.metrics { - println!( - " nodes: {}, time: {}ms", - metrics.nodes_processed, - metrics.total_time_ms() - ); - } - } - - if result.has_failures() { - println!("\nFailed:"); - for f in &result.failed { - println!(" {} — {}", f.source, f.error); - } - } - - // Query across all indexed documents - let query = "What is this about?"; - println!("\nQuerying: \"{query}\""); - - let answer = engine.query(vectorless::QueryContext::new(query)).await?; - - for item in &answer.items { - println!(" [{} confidence={:.2}]", item.doc_id, item.confidence); - let preview: String = item.content.chars().take(200).collect(); - println!(" {preview}"); - if item.content.len() > 200 { - println!(" ..."); - } - } - - // Metrics report - let report = engine.metrics_report(); - println!("\nMetrics:"); - println!( - " LLM: {} calls, {} tokens, ${:.4}", - report.llm.total_calls, report.llm.total_tokens, report.llm.estimated_cost_usd, - ); - println!( - " Retrieval: {} queries, avg score {:.2}", - report.retrieval.total_queries, report.retrieval.avg_path_score, - ); - - // Cleanup - for doc in engine.list().await? { - engine.remove(&doc.id).await?; - } - - Ok(()) -} diff --git a/vectorless-core/vectorless/examples/index_incremental.rs b/vectorless-core/vectorless/examples/index_incremental.rs deleted file mode 100644 index 6500a992..00000000 --- a/vectorless-core/vectorless/examples/index_incremental.rs +++ /dev/null @@ -1,122 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Incremental indexing example — re-index with change detection. -//! -//! ```bash -//! # Using environment variables for LLM config: -//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \ -//! LLM_ENDPOINT=http://localhost:4000/api/v1 cargo run --example index_incremental -//! -//! # Or with defaults (edit the code to set your key/endpoint): -//! cargo run --example index_incremental -//! 
``` - -use vectorless::{DocumentFormat, EngineBuilder, IndexContext, IndexMode}; - -#[tokio::main] -async fn main() -> vectorless::Result<()> { - // Initialize tracing for debug output (set RUST_LOG=debug to see more) - tracing_subscriber::fmt::init(); - - // Build engine with LLM configuration from environment or defaults. - // Adjust the defaults below to match your setup. - let api_key = std::env::var("LLM_API_KEY").unwrap_or_else(|_| "sk-or-v1-...".to_string()); - let model = - std::env::var("LLM_MODEL").unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string()); - let endpoint = std::env::var("LLM_ENDPOINT") - .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); - - let engine = EngineBuilder::new() - .with_key(&api_key) - .with_model(&model) - .with_endpoint(&endpoint) - .build() - .await - .map_err(|e| vectorless::Error::Config(e.to_string()))?; - - let content_v1 = r#"# API Reference - -## GET /users - -Returns a list of all users in the system. - -## POST /users - -Creates a new user account. -"#; - - let content_v2 = r#"# API Reference - -## GET /users - -Returns a paginated list of users. Supports `?page=` and `?limit=` parameters. - -## POST /users - -Creates a new user account. Requires email and password fields. - -## DELETE /users/:id - -Deletes a user by their unique identifier. -"#; - - // 1. Initial full index - println!("--- Initial index ---"); - let result = engine - .index(IndexContext::from_content( - content_v1, - DocumentFormat::Markdown, - )) - .await?; - - let doc_id = result.items[0].doc_id.clone(); - if let Some(m) = &result.items[0].metrics { - println!( - "indexed in {}ms, {} nodes", - m.total_time_ms(), - m.nodes_processed - ); - } - - // 2. Re-index unchanged content (incremental) — skips processing - println!("\n--- Re-index unchanged (incremental) ---"); - let result = engine - .index( - IndexContext::from_content(content_v1, DocumentFormat::Markdown) - .with_mode(IndexMode::Incremental), - ) - .await?; - - for item in &result.items { - println!("doc_id: {} (unchanged, skipped)", item.doc_id); - } - - // 3. Re-index with changes (incremental) — detects diff and updates - println!("\n--- Re-index with changes (incremental) ---"); - let result = engine - .index( - IndexContext::from_content(content_v2, DocumentFormat::Markdown) - .with_mode(IndexMode::Incremental), - ) - .await?; - - for item in &result.items { - if let Some(m) = &item.metrics { - println!( - "updated in {}ms, {} nodes", - m.total_time_ms(), - m.nodes_processed - ); - } - } - - println!("\ndoc_id: {doc_id}"); - - // Cleanup - for doc in engine.list().await? { - engine.remove(&doc.id).await?; - } - - Ok(()) -} diff --git a/vectorless-core/vectorless/examples/index_pdf.rs b/vectorless-core/vectorless/examples/index_pdf.rs deleted file mode 100644 index 0f9ae607..00000000 --- a/vectorless-core/vectorless/examples/index_pdf.rs +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! PDF indexing example — index a PDF document via the vectorless engine. -//! -//! ```bash -//! # Using environment variables for LLM config: -//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \ -//! cargo run --example index_pdf -- ../samples/Docker_Cheat_Sheet.pdf -//! -//! # Or with defaults (edit the code to set your key/endpoint): -//! cargo run --example index_pdf -- ../samples/Docker_Cheat_Sheet.pdf -//! 
``` - -use std::path::Path; - -use vectorless::{EngineBuilder, IndexContext}; - -#[tokio::main] -async fn main() -> vectorless::Result<()> { - // Initialize tracing so we can see pipeline logs. - // Set RUST_LOG=info or RUST_LOG=debug for more detail. - tracing_subscriber::fmt::init(); - - let args: Vec = std::env::args().collect(); - - let pdf_path = args.get(1).map(|s| s.as_str()).unwrap_or_else(|| { - eprintln!("Usage: cargo run --example index_pdf -- "); - std::process::exit(1); - }); - - if !Path::new(pdf_path).exists() { - eprintln!("Error: file not found: {}", pdf_path); - std::process::exit(1); - } - - println!("=== Indexing PDF: {} ===\n", pdf_path); - - // LLM configuration is required — set these environment variables: - // LLM_API_KEY — your API key (required) - // LLM_MODEL — model name (default: google/gemini-3-flash-preview) - // LLM_ENDPOINT — API endpoint (default: http://localhost:4000/api/v1) - let api_key = match std::env::var("LLM_API_KEY") { - Ok(key) => key, - Err(_) => { - eprintln!("Error: LLM_API_KEY environment variable is required."); - eprintln!("Set it before running:"); - eprintln!(" LLM_API_KEY=sk-xxx cargo run --example index_pdf -- "); - std::process::exit(1); - } - }; - let model = - std::env::var("LLM_MODEL").unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string()); - let endpoint = std::env::var("LLM_ENDPOINT") - .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); - - tracing::info!( - "LLM config — key: {}..., model: {}, endpoint: {}", - &api_key[..api_key.len().min(8)], - model, - endpoint - ); - - let engine = EngineBuilder::new() - .with_key(&api_key) - .with_model(&model) - .with_endpoint(&endpoint) - .build() - .await - .map_err(|e| vectorless::Error::Config(e.to_string()))?; - - let result = engine.index(IndexContext::from_path(pdf_path)).await?; - - println!( - "Indexed: {}, Failed: {}", - result.items.len(), - result.failed.len() - ); - - for item in &result.items { - println!("\n--- {} ---", item.name); - println!("doc_id: {}", item.doc_id); - println!("format: {:?}", item.format); - - if let Some(metrics) = &item.metrics { - println!("\nMetrics:"); - println!(" total time: {}ms", metrics.total_time_ms()); - println!(" parse: {}ms", metrics.parse_time_ms); - println!(" build: {}ms", metrics.build_time_ms); - println!(" enhance: {}ms", metrics.enhance_time_ms); - println!(" nodes: {}", metrics.nodes_processed); - println!(" summaries: {}", metrics.summaries_generated); - println!(" failed: {}", metrics.summaries_failed); - println!(" llm calls: {}", metrics.llm_calls); - println!(" tokens: {}", metrics.total_tokens_generated); - println!(" topics: {}", metrics.topics_indexed); - println!(" keywords: {}", metrics.keywords_indexed); - - if metrics.llm_calls == 0 { - println!("\n *** WARNING: No LLM calls were made. ***"); - println!(" Set RUST_LOG=info to see pipeline logs:"); - println!(" RUST_LOG=info cargo run --example index_pdf -- "); - println!(" Check LLM_API_KEY, LLM_MODEL, and LLM_ENDPOINT are valid."); - } - } - } - - for fail in &result.failed { - eprintln!("FAILED: {} — {}", fail.source, fail.error); - } - - // Cleanup workspace (uncomment to clean up after run) - for doc in engine.list().await? 
{ - engine.remove(&doc.id).await?; - } - - Ok(()) -} diff --git a/vectorless-core/vectorless/examples/index_single.rs b/vectorless-core/vectorless/examples/index_single.rs deleted file mode 100644 index edaa2460..00000000 --- a/vectorless-core/vectorless/examples/index_single.rs +++ /dev/null @@ -1,103 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Single document indexing example — index one document from content. -//! -//! ```bash -//! # Using environment variables for LLM config: -//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \ -//! LLM_ENDPOINT=http://localhost:4000/api/v1 cargo run --example index_single -//! -//! # Or with defaults (edit the code to set your key/endpoint): -//! cargo run --example index_single -//! ``` - -use vectorless::{DocumentFormat, EngineBuilder, IndexContext}; - -#[tokio::main] -async fn main() -> vectorless::Result<()> { - // Initialize tracing for debug output (set RUST_LOG=debug to see more) - tracing_subscriber::fmt::init(); - - // Build engine with LLM configuration from environment or defaults. - // Adjust the defaults below to match your setup. - let api_key = std::env::var("LLM_API_KEY").unwrap_or_else(|_| "sk-or-v1-...".to_string()); - let model = - std::env::var("LLM_MODEL").unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string()); - let endpoint = std::env::var("LLM_ENDPOINT") - .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); - - let engine = EngineBuilder::new() - .with_key(&api_key) - .with_model(&model) - .with_endpoint(&endpoint) - .build() - .await - .map_err(|e| vectorless::Error::Config(e.to_string()))?; - - let content = r#"# Distributed Data Processing Platform - -## Introduction - -This document provides a comprehensive overview of the distributed data processing platform architecture. The system is designed to handle petabyte-scale data workloads with sub-second query latency, supporting both real-time streaming and batch processing paradigms. The architecture follows a microservices-based approach with independent scaling capabilities for each component, enabling cost-effective resource utilization across varying workload patterns. - - -## System Architecture - -The platform follows a layered architecture pattern with clear separation of concerns between ingestion, processing, storage, and serving layers. Each layer can be independently deployed, scaled, and upgraded without affecting other layers, following the principle of bounded contexts from domain-driven design. Inter-layer communication uses a combination of asynchronous message passing for data flow and synchronous gRPC calls for control plane operations. - -### Ingestion Layer - -The ingestion layer serves as the entry point for all data entering the platform. It supports multiple protocols including HTTP REST, gRPC, Apache Kafka, and AWS Kinesis. The layer is responsible for data validation, schema enforcement, initial transformation, and routing to downstream processing pipelines. Built on a reactive architecture using backpressure-aware operators, the ingestion layer gracefully handles burst traffic patterns without overwhelming downstream services. - - -### Processing Engine - -The processing engine is the core computational component of the platform, responsible for transforming, enriching, aggregating, and analyzing ingested data. It supports both stream processing for real-time analytics and batch processing for historical analysis. 
The engine is built on a custom execution framework that optimizes query plans based on data statistics and available compute resources. - -### Storage Layer - -The storage layer provides a unified abstraction over multiple storage backends, each optimized for different access patterns. The hot tier uses an in-memory columnar cache for frequently accessed dimensions and recent fact data, providing microsecond-level access latency. The warm tier uses a distributed key-value store backed by NVMe SSDs for data accessed within the past 30 days. The cold tier uses object storage with Parquet file format for historical data, achieving cost efficiency at the expense of higher access latency. - -Data is automatically tiered based on configurable policies that consider access frequency, data age, and query patterns. The tiering engine runs as a background service that continuously monitors access patterns and migrates data between tiers. Metadata about data placement is maintained in a distributed metadata service built on etcd, which provides consistent reads and writes with linearizable semantics. - -### Query Serving Layer - -The query serving layer provides the external-facing API for executing analytical queries against the processed data. It supports SQL queries via a PostgreSQL-compatible wire protocol, making it accessible to a wide range of BI tools and existing applications without requiring driver changes. The query router analyzes incoming queries and determines the optimal execution strategy, considering which storage tiers contain the relevant data and whether partial results can be served from cached aggregations. - -Query results are optionally materialized in a result cache that uses a time-to-live (TTL) policy combined with lazy invalidation based on upstream data freshness markers. The cache achieves a hit rate of approximately 85% for dashboard workloads, significantly reducing the computational load on the processing engine for repetitive query patterns. - -## Deployment and Operations - -The platform is deployed on Kubernetes with Helm charts that encapsulate all deployment configurations, resource limits, and scaling policies. Each microservice is packaged as a container image with multi-stage builds that minimize image size and attack surface. The CI/CD pipeline uses a GitOps workflow with ArgoCD, ensuring that all changes to production are auditable, reproducible, and reversible. - -Monitoring is implemented using a Prometheus and Grafana stack, with custom metrics exported by each service using a shared instrumentation library. Key performance indicators including query latency percentiles, ingestion throughput, processing lag, and error rates are tracked on operational dashboards with automated alerting through PagerDuty integration. Distributed tracing using OpenTelemetry provides end-to-end visibility into request flows across microservices, enabling rapid diagnosis of performance anomalies and error root causes. -"#; - - // Index from content string - let result = engine - .index(IndexContext::from_content( - content, - DocumentFormat::Markdown, - )) - .await?; - - for item in &result.items { - println!("doc_id: {}", item.doc_id); - println!("name: {}", item.name); - println!("format: {:?}", item.format); - - if let Some(ref metrics) = item.metrics { - println!("time: {}ms", metrics.total_time_ms()); - println!("nodes: {}", metrics.nodes_processed); - println!("tokens: {}", metrics.total_tokens_generated); - } - } - - // Cleanup - for doc in engine.list().await? 
{ - engine.remove(&doc.id).await?; - } - - Ok(()) -} diff --git a/vectorless-core/vectorless/examples/indexing.rs b/vectorless-core/vectorless/examples/indexing.rs deleted file mode 100644 index fe78c254..00000000 --- a/vectorless-core/vectorless/examples/indexing.rs +++ /dev/null @@ -1,59 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Batch indexing example — index multiple documents via the vectorless engine. -//! -//! ```bash -//! # Using environment variables for LLM config: -//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \ -//! LLM_ENDPOINT=http://localhost:4000/api/v1 cargo run --example indexing -//! -//! # Or with defaults (edit the code to set your key/endpoint): -//! cargo run --example indexing -//! ``` - -use vectorless::{EngineBuilder, IndexContext}; - -#[tokio::main] -async fn main() -> vectorless::Result<()> { - // Initialize tracing for debug output (set RUST_LOG=debug to see more) - tracing_subscriber::fmt::init(); - - // Build engine with LLM configuration from environment or defaults. - // Adjust the defaults below to match your setup. - let api_key = std::env::var("LLM_API_KEY").unwrap_or_else(|_| "sk-or-v1-...".to_string()); - let model = - std::env::var("LLM_MODEL").unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string()); - let endpoint = std::env::var("LLM_ENDPOINT") - .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); - - let engine = EngineBuilder::new() - .with_key(&api_key) - .with_model(&model) - .with_endpoint(&endpoint) - .build() - .await - .map_err(|e| vectorless::Error::Config(e.to_string()))?; - - // Index multiple documents in a single call. - // Paths are resolved relative to the workspace directory. - let result = engine - .index(IndexContext::from_paths(&["../README.md", "../CLAUDE.md"])) - .await?; - - println!("Indexed {} document(s)", result.items.len()); - for item in &result.items { - println!(" - {} ({})", item.name, item.doc_id); - if let Some(metrics) = &item.metrics { - println!(" Time: {}ms", metrics.total_time_ms()); - println!(" Nodes: {}", metrics.nodes_processed); - } - } - - // Cleanup - for doc in engine.list().await? { - engine.remove(&doc.id).await?; - } - - Ok(()) -} diff --git a/vectorless-core/vectorless/examples/indexing_flow.rs b/vectorless-core/vectorless/examples/indexing_flow.rs deleted file mode 100644 index 03eb3a87..00000000 --- a/vectorless-core/vectorless/examples/indexing_flow.rs +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Indexing pipeline flow example — demonstrates the full indexing pipeline -//! with detailed metrics breakdown. -//! -//! This example walks through: -//! 1. Creating a Vectorless engine -//! 2. Indexing a Markdown document from content -//! 3. Inspecting per-stage timing metrics -//! -//! Set `RUST_LOG=info` to see pipeline stage logs, or `RUST_LOG=debug` for -//! detailed internal progress. -//! -//! # Usage -//! -//! ```bash -//! # Using environment variables for LLM config: -//! LLM_API_KEY=sk-xxx LLM_MODEL=google/gemini-3-flash-preview \ -//! LLM_ENDPOINT=http://localhost:4000/api/v1 cargo run --example indexing_flow -//! -//! # Or with defaults (edit the code to set your key/endpoint): -//! cargo run --example indexing_flow -//! ``` - -use vectorless::{DocumentFormat, EngineBuilder, IndexContext}; - -/// Sample document with multi-level headings to exercise tree construction -/// and navigation index building. 
-const SAMPLE_MARKDOWN: &str = r#" -# Payment Platform Technical Guide - -## Overview - -This guide covers the architecture and implementation details of the payment processing platform. The system handles credit card payments, bank transfers, and digital wallets across multiple currencies and regions. It is designed for high availability with 99.99% uptime SLA and supports peak throughput of 10,000 transactions per second. - -## Architecture - -The platform uses a microservices architecture with event-driven communication between services. Each service owns its data store and communicates through a message broker for eventual consistency. The system is deployed on Kubernetes with automatic horizontal scaling based on request queue depth. - -### Ingestion Gateway - -The ingestion gateway is the entry point for all payment requests. It handles request validation, authentication, idempotency checks, and routing to the appropriate payment processor. The gateway implements circuit breaker patterns to gracefully degrade when downstream processors experience issues. - -### Payment Processing Engine - -The payment processing engine orchestrates the lifecycle of each payment transaction. It manages state transitions from initiation through authorization, capture, settlement, and reconciliation. The engine supports both synchronous and asynchronous payment flows, depending on the payment method and processor requirements. - -### Settlement Service - -The settlement service handles batch settlement with acquiring banks and payment networks. It runs on a configurable schedule (typically end-of-day for each banking region) and groups authorized transactions into settlement batches. The service handles currency conversion, fee calculation, and split payments for marketplace scenarios. - -## Security - -All payment data is encrypted at rest using AES-256 and in transit using TLS 1.3. Cardholder data is tokenized immediately upon receipt and stored in a PCI DSS Level 1 compliant vault. The platform undergoes annual PCI DSS audits and quarterly network vulnerability scans. - -### Fraud Detection - -Real-time fraud detection uses a rules engine combined with a machine learning model that scores each transaction based on velocity checks, geolocation anomalies, device fingerprinting, and behavioral patterns. Transactions exceeding configurable risk thresholds are automatically held for manual review. - -### Compliance - -The platform complies with PCI DSS, SOC 2 Type II, GDPR, and regional payment regulations including PSD2 (Europe) and local data residency requirements. Audit logs are retained for 7 years and accessible through a dedicated compliance API. - -## Monitoring and Operations - -Real-time dashboards track transaction volumes, success rates, latency percentiles, and error rates across all payment methods and processors. Automated alerting triggers on-call rotations when key metrics deviate from baseline thresholds. -"#; - -#[tokio::main] -async fn main() -> vectorless::Result<()> { - tracing_subscriber::fmt::init(); - - println!("=== Indexing Pipeline Flow Example ===\n"); - - // Build engine with LLM configuration from environment or defaults. 
- let api_key = std::env::var("LLM_API_KEY").unwrap_or_else(|_| "sk-...".to_string()); - let model = - std::env::var("LLM_MODEL").unwrap_or_else(|_| "google/gemini-3-flash-preview".to_string()); - let endpoint = std::env::var("LLM_ENDPOINT") - .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); - - // Step 1: Create engine - println!("Step 1: Creating engine..."); - let engine = EngineBuilder::new() - .with_key(&api_key) - .with_model(&model) - .with_endpoint(&endpoint) - .build() - .await - .map_err(|e| vectorless::Error::Config(e.to_string()))?; - println!(" Done.\n"); - - // Step 2: Index from content - println!("Step 2: Indexing document from content...\n"); - let result = engine - .index(IndexContext::from_content( - SAMPLE_MARKDOWN, - DocumentFormat::Markdown, - )) - .await?; - - println!(" Indexed {} document(s)\n", result.items.len()); - - // Step 3: Inspect indexing results and metrics - for item in &result.items { - println!("--- Document Info ---"); - println!(" doc_id: {}", item.doc_id); - println!(" name: {}", item.name); - println!(" format: {:?}", item.format); - - if let Some(desc) = &item.description { - println!(" summary: {}...", &desc[..desc.len().min(120)]); - } - - if let Some(ref metrics) = item.metrics { - println!("\n--- Pipeline Stage Metrics ---"); - println!(" Stage Time (ms)"); - println!(" ─────────────────────────────"); - println!(" Parse {:>8}", metrics.parse_time_ms); - println!(" Build {:>8}", metrics.build_time_ms); - println!(" Validate {:>8}", metrics.validate_time_ms); - println!(" Split {:>8}", metrics.split_time_ms); - println!(" Enhance {:>8}", metrics.enhance_time_ms); - println!(" Enrich {:>8}", metrics.enrich_time_ms); - println!( - " Reasoning Index {:>8}", - metrics.reasoning_index_time_ms - ); - println!( - " Navigation Index {:>8}", - metrics.navigation_index_time_ms - ); - println!(" Optimize {:>8}", metrics.optimize_time_ms); - println!(" ─────────────────────────────"); - println!(" Total {:>8}", metrics.total_time_ms()); - - println!("\n--- Index Output ---"); - println!(" Nodes processed: {}", metrics.nodes_processed); - println!(" Summaries generated: {}", metrics.summaries_generated); - println!(" Summaries failed: {}", metrics.summaries_failed); - println!(" LLM calls: {}", metrics.llm_calls); - println!( - " Tokens generated: {}", - metrics.total_tokens_generated - ); - - println!("\n--- Navigation Index ---"); - println!(" Nav entries: {}", metrics.nav_entries_indexed); - println!(" Child routes: {}", metrics.child_routes_indexed); - - println!("\n--- Reasoning Index ---"); - println!(" Topics indexed: {}", metrics.topics_indexed); - println!(" Keywords indexed: {}", metrics.keywords_indexed); - - println!("\n--- Tree Optimization ---"); - println!(" Nodes skipped: {}", metrics.nodes_skipped); - println!(" Nodes merged: {}", metrics.nodes_merged); - } - - println!(); - } - - // Step 4: Cleanup - println!("Step 3: Cleaning up..."); - for doc in engine.list().await? { - engine.remove(&doc.id).await?; - println!(" Removed: {} ({})", doc.name, doc.id); - } - - println!("\n=== Done ==="); - Ok(()) -} diff --git a/vectorless-core/vectorless/examples/query.rs b/vectorless-core/vectorless/examples/query.rs deleted file mode 100644 index 8914081d..00000000 --- a/vectorless-core/vectorless/examples/query.rs +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Query-only example — query an already-indexed document. -//! -//! 
Assumes the workspace already contains indexed documents -//! (e.g. from `cargo run --example flow` or `index_single`). -//! -//! # Usage -//! -//! ```bash -//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o \ -//! LLM_ENDPOINT=https://api.openai.com/v1 cargo run --example query -//! ``` - -use vectorless::{EngineBuilder, QueryContext}; - -#[tokio::main] -async fn main() -> vectorless::Result<()> { - tracing_subscriber::fmt::init(); - - let api_key = std::env::var("LLM_API_KEY").unwrap_or_else(|_| "sk-...".to_string()); - let model = std::env::var("LLM_MODEL").unwrap_or_else(|_| "gpt-4o".to_string()); - let endpoint = std::env::var("LLM_ENDPOINT").unwrap_or_else(|_| "https://api".to_string()); - - let engine = EngineBuilder::new() - .with_key(&api_key) - .with_model(&model) - .with_endpoint(&endpoint) - .build() - .await - .map_err(|e| vectorless::Error::Config(e.to_string()))?; - - // List available documents - let docs = engine.list().await?; - if docs.is_empty() { - println!("No indexed documents found. Run an indexing example first."); - return Ok(()); - } - - println!("Available documents:"); - for doc in &docs { - println!(" - {} ({})", doc.name, doc.id); - } - println!(); - - // Query a specific document - let doc_id = docs[0].id.clone(); - let queries = vec![ - "What is the system architecture?", - "How does the storage layer work?", - ]; - - for query in queries { - println!("Query: \"{}\"", query); - - match engine - .query(QueryContext::new(query).with_doc_ids(vec![doc_id.clone()])) - .await - { - Ok(result) => { - if let Some(item) = result.single() { - if item.content.is_empty() { - println!(" No relevant content found"); - } else { - println!(" Found:"); - for line in item.content.lines() { - println!(" {}", line); - } - } - } else { - println!(" No results"); - } - } - Err(e) => { - println!(" Error: {}", e); - } - } - println!(); - } - - Ok(()) -} From 65208c7f58793a831127b50a396be479d5f72112 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 22 Apr 2026 22:51:40 +0800 Subject: [PATCH 06/28] docs(CLAUDE.md): update examples section documentation - Change Rust examples description from "Rust examples (flow, indexing, pdf, batch, etc.)" to "Rust examples (legacy, no new additions)" - Add Python examples entry with description "Python examples (primary, for Python ecosystem)" - Remove samples/ directory from documentation --- CLAUDE.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 4ade4cd1..6a335513 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -30,11 +30,11 @@ Cargo workspace with 2 crates + pure Python SDK: - `src/config/` - Configuration types and validation - `src/error.rs` - Unified error types - `src/utils/` - Utility functions (token counting, fingerprinting, validation) - - `examples/` - Rust examples (flow, indexing, pdf, batch, etc.) 
+ - `examples/` - Rust examples (legacy, no new additions) - `vectorless-py/` - PyO3 bindings (compiled into Python native module) - `vectorless/` - Pure Python SDK (high-level wrappers, CLI, config loading, integrations) +- `examples/` - Python examples (primary, for Python ecosystem) - `docs/` - Docusaurus documentation site -- `samples/` - Sample files ### Retrieval Call Flow From d7362e361cd0c01650c5583867425c47bbd40555 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 22 Apr 2026 22:58:09 +0800 Subject: [PATCH 07/28] refactor(examples): update single_doc_challenge example with new API - Replace deprecated IndexContext and QueryContext imports with IngestInput - Update method names: index/list/remove to ingest/list_documents/forget - Change query API usage from QueryContext to direct ask method call - Update terminology from 'indexing' to 'ingesting' and 'understanding' - Rename challenge queries to challenge questions - Add confidence, evidence count, and trace steps to output display - Update variable names from doc.id to doc.doc_id for consistency --- .../examples/single_doc_challenge.rs | 71 +++++++++---------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/vectorless-core/vectorless/examples/single_doc_challenge.rs b/vectorless-core/vectorless/examples/single_doc_challenge.rs index cb174acf..7a0f44da 100644 --- a/vectorless-core/vectorless/examples/single_doc_challenge.rs +++ b/vectorless-core/vectorless/examples/single_doc_challenge.rs @@ -14,7 +14,7 @@ //! cargo run --example single_doc_challenge //! ``` -use vectorless::{DocumentFormat, EngineBuilder, IndexContext, QueryContext}; +use vectorless::{EngineBuilder, IngestInput}; /// A research report with information scattered across sections. /// The answers to the challenge questions require connecting dots @@ -191,52 +191,49 @@ async fn main() -> vectorless::Result<()> { .await .map_err(|e| vectorless::Error::Config(e.to_string()))?; - // Index (skip if already indexed — we're testing retrieval, not indexing) + // Ingest (skip if already indexed — we're testing reasoning, not indexing) let doc_name = "qc_report_2025"; let doc_id = { - let existing = engine.list().await?; + let existing = engine.list_documents().await?; if let Some(doc) = existing.iter().find(|d| d.name == doc_name) { - println!("Document already indexed, reusing: {}\n", doc.id); - doc.id.clone() + println!("Document already understood, reusing: {}\n", doc.doc_id); + doc.doc_id.clone() } else { - println!("Indexing research report..."); - let result = engine - .index( - IndexContext::from_content(REPORT, DocumentFormat::Markdown) - .with_name(doc_name), - ) + println!("Understanding research report..."); + let doc = engine + .ingest(IngestInput::Text { + name: doc_name.to_string(), + content: REPORT.to_string(), + }) .await?; - let id = result.doc_id().unwrap().to_string(); - println!(" doc_id: {}\n", id); - id + println!(" doc_id: {}", doc.doc_id); + println!(" summary: {}\n", doc.summary); + doc.doc_id } }; - // Challenge queries + // Challenge questions for (i, question) in CHALLENGE_QUESTIONS.iter().enumerate() { println!("Q{}: {}", i + 1, question); - match engine - .query(QueryContext::new(*question).with_doc_ids(vec![doc_id.clone()])) - .await - { - Ok(response) => { - if let Some(item) = response.single() { - if item.content.is_empty() { - println!(" (no answer found)\n"); - } else { - // Print first 3 lines as preview - for line in item.content.lines().take(3) { - println!(" {}", line); - } - let remaining = 
item.content.lines().count().saturating_sub(3); - if remaining > 0 { - println!(" ... ({} more lines)", remaining); - } - println!(" confidence: {:.2}\n", item.confidence); - } + match engine.ask(question, &[doc_id.clone()]).await { + Ok(answer) => { + if answer.content.is_empty() { + println!(" (no answer found)\n"); } else { - println!(" (no results)\n"); + for line in answer.content.lines().take(3) { + println!(" {}", line); + } + let remaining = answer.content.lines().count().saturating_sub(3); + if remaining > 0 { + println!(" ... ({} more lines)", remaining); + } + println!( + " confidence: {:.2}, evidence: {}, trace_steps: {}\n", + answer.confidence, + answer.evidence.len(), + answer.trace.steps.len() + ); } } Err(e) => { @@ -245,8 +242,8 @@ async fn main() -> vectorless::Result<()> { } } - // Uncomment to remove the document after testing: - // engine.remove(&doc_id).await?; + // Uncomment to forget the document after testing: + // engine.forget(&doc_id).await?; // println!("Cleaned up."); Ok(()) From 6c3ecb57a6bfd07e48d3242474ba0be237c24903 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 22 Apr 2026 22:59:46 +0800 Subject: [PATCH 08/28] docs(HISTORY): add history tracking file - Create HISTORY.md to track project changes and version history --- HISTORY.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 HISTORY.md diff --git a/HISTORY.md b/HISTORY.md new file mode 100644 index 00000000..e69de29b From 1291e8f065774fcd3cb3f7a928f25c004f838c8b Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 22 Apr 2026 23:15:45 +0800 Subject: [PATCH 09/28] docs(HISTORY): add comprehensive history file with version changelog - Add complete history tracking from initial release (0.1.0) to current version (0.1.11) - Document core principles: "Reason don't vector", "Model fails we fail", "No thought no answer" - Include detailed changelog covering agent-based retrieval architecture, navigation commands, orchestrator supervisor loop, and query understanding pipeline - Track evolution from basic indexing to reasoning-based document engine - Document PDF parsing improvements, streaming retrieval, and multi-document support --- HISTORY.md | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index e69de29b..387ed3fb 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -0,0 +1,99 @@ +# HISTORY + +## 0.1.11 (2026-04-21) + +- Project description updated to "reasoning-based document engine" +- Core principles documentation (Reason don't vector, Model fails we fail, No thought no answer) +- Updated homepage with three core principles and key features + +## 0.1.10 (2026-04-21) + +- Description generation enabled by default +- `timeout_secs` option for Python indexing +- Agent-based navigation documentation + +## 0.1.9 (2026-04-20) + +- **Agent-based retrieval architecture**: replaced pilot/search with Orchestrator + Workers +- Navigation commands: `ls`, `cd`, `cat`, `grep`, `find`, `head`, `pwd`, `wc` +- Orchestrator supervisor loop with dynamic re-planning +- Query understanding pipeline with `QueryPlan` +- Evidence evaluation and replanning modules +- `NavigationIndex` with `DocCard` and `SectionCard` +- LLM-based confidence scoring (replaced BM25) +- Unified rerank pipeline (replaced synthesis/fusion) +- `DocCard` catalog in workspace storage +- Shared concurrency control for LLM clients +- Memoization for LLM operations in retrieval pipeline +- LLM request timeout configuration + +## 
0.1.8 (2026-04-16) + +- GitHub Actions workflow for automated releases +- Endpoint parameter support for API configuration +- Custom config option in `EngineBuilder` +- Enhanced error messages with detailed failure info +- Endpoint validation in engine builder + +## 0.1.7 (2026-04-15) + +- Runtime metrics reports (LLM, Pilot, Retrieval) +- Recursive option for `from_dir` method +- Directory indexing support via `IndexContext` +- Centralized `LlmPool` configuration system +- Shared LLM client injected into pipeline context +- Pipeline checkpoint for resumable indexing +- `source_path` field and updated `QueryContext` API + +## 0.1.6 (2026-04-15) + +- `IndexMetrics` binding with detailed indexing statistics +- `StrategyPreference` for controlling retrieval strategies +- Pure Pilot search algorithm, beam search with backtracking +- Per-step reasoning support in search algorithms +- Binary pruning and pre-filtering for wide nodes +- LLM-based query complexity detection +- Cross-document strategy with graph-based boosting +- Synonym expansion for improved query recall +- Default summary strategy changed to Full + +## 0.1.4 (2026-04-13) + +- PDF parser: switch to `pdf-extract` for reliable text extraction +- Concurrent LLM verification for TOC entries +- PDF indexing example + +## 0.1.3 (2026-04-13) + +- Internal module naming cleanup (`_` prefix for private functions) + +## 0.1.2 (2026-04-13) + +- Search-from functionality and ToC-based navigation +- Reasoning chain (replacing navigation trace) +- Adaptive budget controller for pipeline token management +- Structural path constraints and hints extraction +- Reasoning index for fast retrieval path resolution +- Document graph system for cross-document relationships +- Streaming retrieval with `RetrieveEvent` support +- Multi-document query support +- Incremental indexing with content and logic fingerprinting +- Parallel processing for multiple document sources +- Pipeline checkpoint and content merging/splitting support + +## 0.1.1 (2026-04-08) + +- Workspace-managed dependencies and configuration +- LLM pilot functionality and summary generation +- Query decomposition support +- LLM-first search with TOC-based location +- Restructured Python examples + +## 0.1.0 (2026-04-07) + +Initial Python SDK release. 
+ +- PyO3 bindings for the Rust engine core +- Basic `Engine` class with `index()` and `query()` methods +- `pyproject.toml` with maturin build backend +- Ruff formatting configuration From edf97d7040ff296ff46adb69cbd67888c651e793 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 22 Apr 2026 23:26:10 +0800 Subject: [PATCH 10/28] chore(workspace): update project metadata and version management - Downgrade workspace package version from 0.1.32 to 0.1.12 - Update description from "Reasoning-based Document Engine" to "Document Understanding Engine for AI" - Change pyproject.toml to use dynamic version management instead of hardcoded version 0.1.11 --- Cargo.toml | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index fa671216..5c9a82a6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,8 +3,8 @@ members = ["vectorless-core/vectorless", "vectorless-core/vectorless-py"] resolver = "2" [workspace.package] -version = "0.1.32" -description = "Reasoning-based Document Engine" +version = "0.1.12" +description = "Document Understanding Engine for AI" edition = "2024" authors = ["zTgx "] license = "Apache-2.0" diff --git a/pyproject.toml b/pyproject.toml index 347bab18..cee403e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "vectorless" -version = "0.1.11" +dynamic = ["version"] description = "Document Understanding Engine for AI" readme = "README.md" requires-python = ">=3.10" From 67113e76dd364803cbaebddfd0cdb409c112dbba Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 22 Apr 2026 23:26:34 +0800 Subject: [PATCH 11/28] feat(index): add concept extraction stage with LLM support - Introduce ConceptExtractionStage that extracts key concepts from document topics and summaries using LLM calls - Add fallback mechanism for keyword-based concept extraction when LLM is unavailable - Implement maximum limits for topics (20) and concepts (15) to control processing scope - Add proper error handling with fallback to basic extraction on LLM failures feat(document): add utility methods for document navigation - Add `cat()` method to get node content by ID for agent commands - Add `find()` method to search nodes by keyword in title/content - Add `node_title()` method to retrieve node titles by ID - Add `section_count()` method to get total number of sections refactor(index): integrate concept extraction into pipeline - Register ConceptExtractionStage in pipeline executor at priority 47 - Update pipeline documentation to reflect new stage ordering - Modify IndexContext to include concepts field for stage output - Update PipelineResult to include concepts for final output refactor(storage): persist concepts in indexed documents - Add concepts field to PersistedDocument struct with serde serialization - Include concepts in IndexedDocument for runtime access - Ensure concepts are properly saved and loaded during persistence refactor(indexer): pass concepts through indexing workflow - Update indexer to transfer concepts from pipeline results to indexed documents - Ensure concepts are properly persisted along with other document metadata --- .../vectorless/src/client/engine.rs | 2 +- .../vectorless/src/client/indexed_document.rs | 4 + .../vectorless/src/client/indexer.rs | 2 + .../vectorless/src/document/understanding.rs | 34 +++ .../vectorless/src/index/pipeline/context.rs | 10 +- .../vectorless/src/index/pipeline/executor.rs | 19 +- 
.../vectorless/src/index/stages/concept.rs | 238 ++++++++++++++++++ .../vectorless/src/index/stages/mod.rs | 4 + .../vectorless/src/index/stages/split.rs | 1 + .../vectorless/src/index/stages/validate.rs | 1 + .../vectorless/src/storage/persistence.rs | 5 + 11 files changed, 311 insertions(+), 9 deletions(-) create mode 100644 vectorless-core/vectorless/src/index/stages/concept.rs diff --git a/vectorless-core/vectorless/src/client/engine.rs b/vectorless-core/vectorless/src/client/engine.rs index 7ab4896f..a1e30656 100644 --- a/vectorless-core/vectorless/src/client/engine.rs +++ b/vectorless-core/vectorless/src/client/engine.rs @@ -584,7 +584,7 @@ impl Engine { nav_index, reasoning_index, summary: persisted.meta.description.unwrap_or_default(), - concepts: Vec::new(), // Will be populated by pipeline Stage 7 + concepts: persisted.concepts, page_count: persisted.meta.page_count, section_count, } diff --git a/vectorless-core/vectorless/src/client/indexed_document.rs b/vectorless-core/vectorless/src/client/indexed_document.rs index 3aa78f65..b2416392 100644 --- a/vectorless-core/vectorless/src/client/indexed_document.rs +++ b/vectorless-core/vectorless/src/client/indexed_document.rs @@ -52,6 +52,9 @@ pub(crate) struct IndexedDocument { /// Pre-computed navigation index for agent-based retrieval. pub navigation_index: Option, + + /// Key concepts extracted from the document. + pub concepts: Vec, } impl IndexedDocument { @@ -69,6 +72,7 @@ impl IndexedDocument { metrics: None, reasoning_index: None, navigation_index: None, + concepts: Vec::new(), } } diff --git a/vectorless-core/vectorless/src/client/indexer.rs b/vectorless-core/vectorless/src/client/indexer.rs index 2c598382..5b5e558b 100644 --- a/vectorless-core/vectorless/src/client/indexer.rs +++ b/vectorless-core/vectorless/src/client/indexer.rs @@ -280,6 +280,7 @@ impl IndexerClient { doc.reasoning_index = result.reasoning_index; doc.navigation_index = result.navigation_index; + doc.concepts = result.concepts; if let Some(p) = path { doc = doc.with_source_path(p); @@ -367,6 +368,7 @@ impl IndexerClient { persisted.reasoning_index = doc.reasoning_index; persisted.navigation_index = doc.navigation_index; + persisted.concepts = doc.concepts; persisted .meta .update_processing_stats(node_count, summary_tokens, duration_ms); diff --git a/vectorless-core/vectorless/src/document/understanding.rs b/vectorless-core/vectorless/src/document/understanding.rs index 3249c4c3..147a794b 100644 --- a/vectorless-core/vectorless/src/document/understanding.rs +++ b/vectorless-core/vectorless/src/document/understanding.rs @@ -90,6 +90,40 @@ pub struct DocumentInfo { } impl Document { + /// Get node content by ID (Agent `cat` command). + pub fn cat(&self, node_id: super::node::NodeId) -> Option<&str> { + self.tree.get(node_id).map(|n| n.content.as_str()) + } + + /// Find nodes containing a keyword in title or content. + pub fn find(&self, keyword: &str) -> Vec<(super::node::NodeId, &str)> { + let kw = keyword.to_lowercase(); + self.tree + .traverse() + .iter() + .filter_map(|&id| { + let node = self.tree.get(id)?; + if node.title.to_lowercase().contains(&kw) + || node.content.to_lowercase().contains(&kw) + { + Some((id, node.title.as_str())) + } else { + None + } + }) + .collect() + } + + /// Get node title by ID. + pub fn node_title(&self, node_id: super::node::NodeId) -> Option<&str> { + self.tree.get(node_id).map(|n| n.title.as_str()) + } + + /// Number of sections in the tree. 
+    pub fn section_count(&self) -> usize {
+        self.section_count
+    }
+
     /// Produce the public DocumentInfo view of this document.
     pub fn info(&self) -> DocumentInfo {
         let toc = super::toc::TocView::new().generate(&self.tree);
diff --git a/vectorless-core/vectorless/src/index/pipeline/context.rs b/vectorless-core/vectorless/src/index/pipeline/context.rs
index f34876b9..27f638a4 100644
--- a/vectorless-core/vectorless/src/index/pipeline/context.rs
+++ b/vectorless-core/vectorless/src/index/pipeline/context.rs
@@ -6,7 +6,7 @@
 use std::collections::HashMap;
 use std::path::PathBuf;
 
-use crate::document::{DocumentTree, NavigationIndex, NodeId, ReasoningIndex};
+use crate::document::{Concept, DocumentTree, NavigationIndex, NodeId, ReasoningIndex};
 use crate::index::parse::{DocumentFormat, RawNode};
 use crate::llm::LlmClient;
 
@@ -251,6 +251,9 @@ pub struct IndexContext {
     /// Navigation index for Agent-based retrieval (built by NavigationIndexStage).
     pub navigation_index: Option<NavigationIndex>,
 
+    /// Key concepts extracted from the document (built by ConceptExtractionStage).
+    pub concepts: Vec<Concept>,
+
     /// Existing tree from previous indexing (for incremental updates).
     /// When set, the enhance and reasoning stages can reuse data from unchanged nodes.
     pub existing_tree: Option<DocumentTree>,
@@ -289,6 +292,7 @@ impl IndexContext {
             summary_cache: SummaryCache::default(),
             reasoning_index: None,
             navigation_index: None,
+            concepts: Vec::new(),
             existing_tree: None,
             stage_results: HashMap::new(),
             metrics: IndexMetrics::default(),
@@ -387,6 +391,7 @@ impl IndexContext {
             summary_cache: self.summary_cache,
             reasoning_index: self.reasoning_index,
             navigation_index: self.navigation_index,
+            concepts: self.concepts,
         }
     }
 }
@@ -429,6 +434,9 @@ pub struct PipelineResult {
 
     /// Navigation index for Agent-based retrieval.
     pub navigation_index: Option<NavigationIndex>,
+
+    /// Key concepts extracted from the document.
+    pub concepts: Vec<Concept>,
 }
 
 impl PipelineResult {
diff --git a/vectorless-core/vectorless/src/index/pipeline/executor.rs b/vectorless-core/vectorless/src/index/pipeline/executor.rs
index 34c1f43a..0a0ec22c 100644
--- a/vectorless-core/vectorless/src/index/pipeline/executor.rs
+++ b/vectorless-core/vectorless/src/index/pipeline/executor.rs
@@ -13,8 +13,9 @@
 use crate::llm::LlmClient;
 
 use super::super::PipelineOptions;
 use super::super::stages::{
-    BuildStage, EnhanceStage, EnrichStage, IndexStage, NavigationIndexStage, OptimizeStage,
-    ParseStage, ReasoningIndexStage, SplitStage, ValidateStage,
+    BuildStage, ConceptExtractionStage, EnhanceStage, EnrichStage, IndexStage,
+    NavigationIndexStage, OptimizeStage, ParseStage, ReasoningIndexStage, SplitStage,
+    ValidateStage,
 };
 use super::context::{IndexInput, PipelineResult};
 use super::orchestrator::PipelineOrchestrator;
@@ -55,8 +56,9 @@ impl PipelineExecutor {
     /// 4. `split` - Split oversized leaf nodes (optional)
     /// 5. `enrich` - Add metadata and cross-references
     /// 6. `reasoning_index` - Build pre-computed reasoning index
-    /// 7. `navigation_index` - Build Agent navigation index
-    /// 8. `optimize` - Optimize tree structure
+    /// 7. `concept_extraction` - Extract key concepts (optional)
+    /// 8. `navigation_index` - Build Agent navigation index
+    /// 9. 
`optimize` - Optimize tree structure
     pub fn new() -> Self {
         let orchestrator = PipelineOrchestrator::new()
             .stage_with_priority(ParseStage::new(), 10)
@@ -65,6 +67,7 @@
             .stage_with_priority(SplitStage::new(), 25)
             .stage_with_priority(EnrichStage::new(), 40)
             .stage_with_priority(ReasoningIndexStage::new(), 45)
+            .stage_with_priority(ConceptExtractionStage::new(), 47)
             .stage_with_priority(NavigationIndexStage::new(), 50)
             .stage_with_priority(OptimizeStage::new(), 60);
 
         Self { orchestrator }
@@ -81,8 +84,9 @@
     /// 5. `enhance` - LLM-based enhancement (summaries)
     /// 6. `enrich` - Add metadata
     /// 7. `reasoning_index` - Build pre-computed reasoning index
-    /// 8. `navigation_index` - Build Agent navigation index
-    /// 9. `optimize` - Optimize tree
+    /// 8. `concept_extraction` - Extract key concepts via LLM (optional)
+    /// 9. `navigation_index` - Build Agent navigation index
+    /// 10. `optimize` - Optimize tree
     pub fn with_llm(client: LlmClient) -> Self {
         tracing::info!(
             "PipelineExecutor::with_llm — cloning client to ParseStage + EnhanceStage + context"
@@ -93,9 +97,10 @@
             .stage_with_priority(BuildStage::new(), 20)
             .stage_with_priority(ValidateStage::new(), 22)
             .stage_with_priority(SplitStage::new(), 25)
-            .stage_with_priority(EnhanceStage::with_llm_client(client), 30)
+            .stage_with_priority(EnhanceStage::with_llm_client(client.clone()), 30)
             .stage_with_priority(EnrichStage::new(), 40)
             .stage_with_priority(ReasoningIndexStage::new(), 45)
+            .stage_with_priority(ConceptExtractionStage::with_llm_client(client), 47)
             .stage_with_priority(NavigationIndexStage::new(), 50)
             .stage_with_priority(OptimizeStage::new(), 60);
 
diff --git a/vectorless-core/vectorless/src/index/stages/concept.rs b/vectorless-core/vectorless/src/index/stages/concept.rs
new file mode 100644
index 00000000..7bffb660
--- /dev/null
+++ b/vectorless-core/vectorless/src/index/stages/concept.rs
@@ -0,0 +1,238 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Concept extraction stage — extracts key concepts from topics and summaries.
+
+use std::collections::HashMap;
+
+use serde::Deserialize;
+use tracing::{info, warn};
+
+use crate::document::Concept;
+use crate::error::Result;
+use crate::llm::LlmClient;
+
+use super::async_trait;
+use super::{AccessPattern, IndexStage, StageResult};
+use crate::index::pipeline::IndexContext;
+
+/// Maximum number of top keywords to send to the LLM for concept extraction.
+const MAX_TOPICS: usize = 20;
+
+/// Maximum number of concepts to extract.
+const MAX_CONCEPTS: usize = 15;
+
+/// Concept extraction stage.
+///
+/// Takes the reasoning index's topic entries and tree summaries, then uses
+/// a single LLM call to extract structured [`Concept`] values.
+/// Falls back to basic keyword-based concepts when no LLM is available.
+pub struct ConceptExtractionStage {
+    llm_client: Option<LlmClient>,
+}
+
+impl ConceptExtractionStage {
+    /// Create a new stage without LLM support (keyword-based fallback).
+    pub fn new() -> Self {
+        Self { llm_client: None }
+    }
+
+    /// Create a stage with LLM support for rich concept extraction. 
+    pub fn with_llm_client(client: LlmClient) -> Self {
+        Self {
+            llm_client: Some(client),
+        }
+    }
+}
+
+#[async_trait]
+impl IndexStage for ConceptExtractionStage {
+    fn name(&self) -> &str {
+        "concept_extraction"
+    }
+
+    fn depends_on(&self) -> Vec<&'static str> {
+        vec!["reasoning_index"]
+    }
+
+    fn is_optional(&self) -> bool {
+        true
+    }
+
+    fn access_pattern(&self) -> AccessPattern {
+        AccessPattern {
+            reads_tree: true,
+            writes_concepts: true,
+            ..AccessPattern::default()
+        }
+    }
+
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+        let concepts = if let Some(ref client) = self.llm_client {
+            extract_with_llm(ctx, client).await
+        } else {
+            extract_from_topics(ctx)
+        };
+
+        let count = concepts.len();
+        ctx.concepts = concepts;
+        info!("[concept_extraction] Extracted {} concepts", count);
+
+        Ok(StageResult::success("concept_extraction"))
+    }
+}
+
+/// Extract concepts using LLM from topics and summaries.
+async fn extract_with_llm(ctx: &mut IndexContext, client: &LlmClient) -> Vec<Concept> {
+    let (topics, section_titles) = gather_source_data(ctx);
+
+    if topics.is_empty() {
+        warn!("[concept_extraction] No topics available for extraction");
+        return Vec::new();
+    }
+
+    let system = "You are a document analysis assistant. Extract the most important concepts \
+        from the given topics and section titles. For each concept, provide:\n\
+        - name: a short name (2-4 words)\n\
+        - summary: a one-sentence explanation\n\
+        - sections: list of section titles where this concept appears\n\n\
+        Return ONLY a valid JSON array of objects. No explanation, no markdown. \
+        Maximum 15 concepts, ordered by importance.";
+
+    let user_prompt = format!(
+        "Document topics (keyword: relevance weight):\n{}\n\n\
+         Section titles:\n{}",
+        topics
+            .iter()
+            .map(|(k, w)| format!("- {} (weight: {:.2})", k, w))
+            .collect::<Vec<_>>()
+            .join("\n"),
+        section_titles.join(", "),
+    );
+
+    #[derive(Debug, Deserialize)]
+    #[serde(rename_all = "snake_case")]
+    struct RawConcept {
+        name: String,
+        summary: String,
+        #[serde(default)]
+        sections: Vec<String>,
+    }
+
+    match client
+        .complete_json::<Vec<RawConcept>>(&system, &user_prompt)
+        .await
+    {
+        Ok(raw) => raw
+            .into_iter()
+            .take(MAX_CONCEPTS)
+            .map(|c| Concept {
+                name: c.name,
+                summary: c.summary,
+                sections: c.sections,
+            })
+            .collect(),
+        Err(e) => {
+            warn!("[concept_extraction] LLM extraction failed: {}, using fallback", e);
+            extract_from_topics(ctx)
+        }
+    }
+}
+
+/// Fallback: derive basic concepts from topic keywords.
+fn extract_from_topics(ctx: &mut IndexContext) -> Vec<Concept> {
+    let (topics, section_titles) = gather_source_data(ctx);
+
+    topics
+        .into_iter()
+        .take(MAX_CONCEPTS)
+        .map(|(name, _)| Concept {
+            name: name.clone(),
+            summary: String::new(),
+            sections: section_titles.clone(),
+        })
+        .collect()
+}
+
+/// Gather top topics and section titles from the pipeline context. 
+fn gather_source_data(ctx: &IndexContext) -> (Vec<(String, f32)>, Vec<String>) {
+    // Collect top keywords by weight
+    let mut topics: Vec<(String, f32)> = Vec::new();
+
+    if let Some(ref ri) = ctx.reasoning_index {
+        let mut all: Vec<(String, f32)> = ri
+            .all_topic_entries()
+            .map(|(keyword, entries)| {
+                let max_weight = entries.iter().map(|e| e.weight).fold(0.0_f32, f32::max);
+                (keyword.clone(), max_weight)
+            })
+            .collect();
+        all.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
+        all.truncate(MAX_TOPICS);
+        topics = all;
+    }
+
+    // Collect section titles from the tree
+    let section_titles: Vec<String> = ctx
+        .tree
+        .as_ref()
+        .map(|tree| {
+            tree.traverse()
+                .iter()
+                .filter_map(|&id| {
+                    let node = tree.get(id)?;
+                    if !node.title.is_empty() {
+                        Some(node.title.clone())
+                    } else {
+                        None
+                    }
+                })
+                .collect()
+        })
+        .unwrap_or_default();
+
+    (topics, section_titles)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_from_empty_topics() {
+        let topics = Vec::<(String, f32)>::new();
+        let titles = vec!["Section 1".to_string()];
+        // Basic sanity: empty topics produce empty concepts
+        let concepts: Vec<Concept> = topics
+            .into_iter()
+            .take(MAX_CONCEPTS)
+            .map(|(name, _)| Concept {
+                name,
+                summary: String::new(),
+                sections: titles.clone(),
+            })
+            .collect();
+        assert!(concepts.is_empty());
+    }
+
+    #[test]
+    fn test_extract_from_topics_basic() {
+        let topics: Vec<(String, f32)> = vec![
+            ("quantum".to_string(), 0.95),
+            ("error correction".to_string(), 0.88),
+            ("qubit".to_string(), 0.82),
+        ];
+        let titles = vec!["Research Labs".to_string()];
+        let concepts: Vec<Concept> = topics
+            .into_iter()
+            .take(MAX_CONCEPTS)
+            .map(|(name, _)| Concept {
+                name,
+                summary: String::new(),
+                sections: titles.clone(),
+            })
+            .collect();
+        assert_eq!(concepts.len(), 3);
+        assert_eq!(concepts[0].name, "quantum");
+    }
+}
diff --git a/vectorless-core/vectorless/src/index/stages/mod.rs b/vectorless-core/vectorless/src/index/stages/mod.rs
index 9a3c405f..22a424c4 100644
--- a/vectorless-core/vectorless/src/index/stages/mod.rs
+++ b/vectorless-core/vectorless/src/index/stages/mod.rs
@@ -4,6 +4,7 @@
 //! Index pipeline stages.
 
 mod build;
+mod concept;
 mod enhance;
 mod enrich;
 mod navigation;
@@ -14,6 +15,7 @@ mod split;
 mod validate;
 
 pub use build::BuildStage;
+pub use concept::ConceptExtractionStage;
 pub use enhance::EnhanceStage;
 pub use enrich::EnrichStage;
 pub use navigation::NavigationIndexStage;
@@ -41,6 +43,8 @@ pub struct AccessPattern {
     pub writes_navigation_index: bool,
     /// Whether this stage writes to `description`.
     pub writes_description: bool,
+    /// Whether this stage writes to `concepts`.
+    pub writes_concepts: bool,
 }
 
 /// Index pipeline stage. 
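For reference, the contract the concept stage relies on is small: the model must return a bare JSON array, and the stage-local `RawConcept` absorbs a missing `sections` field via `#[serde(default)]` instead of rejecting the whole response. Below is a minimal standalone sketch of that round-trip; the payload is invented, and `serde` (with the derive feature) plus `serde_json` are assumed as dependencies.

```rust
use serde::Deserialize;

// Mirrors the stage-local RawConcept above: `sections` falls back to an
// empty Vec when the model omits it, rather than failing the parse.
#[derive(Debug, Deserialize)]
struct RawConcept {
    name: String,
    summary: String,
    #[serde(default)]
    sections: Vec<String>,
}

fn main() -> Result<(), serde_json::Error> {
    // Invented example of a well-formed model response.
    let llm_output = r#"[
        {"name": "error correction",
         "summary": "Techniques for protecting qubit state.",
         "sections": ["Methods", "Research Labs"]},
        {"name": "qubit coherence",
         "summary": "How long a quantum state survives."}
    ]"#;

    let raw: Vec<RawConcept> = serde_json::from_str(llm_output)?;
    assert_eq!(raw.len(), 2);
    assert!(raw[1].sections.is_empty()); // omitted field defaulted
    Ok(())
}
```

If deserialization fails for any reason, the stage drops to keyword-based extraction, as the `Err` arm in `extract_with_llm` above shows.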
diff --git a/vectorless-core/vectorless/src/index/stages/split.rs b/vectorless-core/vectorless/src/index/stages/split.rs index 245688b8..54fe3edd 100644 --- a/vectorless-core/vectorless/src/index/stages/split.rs +++ b/vectorless-core/vectorless/src/index/stages/split.rs @@ -228,6 +228,7 @@ impl IndexStage for SplitStage { writes_reasoning_index: false, writes_navigation_index: false, writes_description: false, + writes_concepts: false, } } diff --git a/vectorless-core/vectorless/src/index/stages/validate.rs b/vectorless-core/vectorless/src/index/stages/validate.rs index 312ff18a..e0909521 100644 --- a/vectorless-core/vectorless/src/index/stages/validate.rs +++ b/vectorless-core/vectorless/src/index/stages/validate.rs @@ -239,6 +239,7 @@ impl IndexStage for ValidateStage { writes_reasoning_index: false, writes_navigation_index: false, writes_description: false, + writes_concepts: false, } } diff --git a/vectorless-core/vectorless/src/storage/persistence.rs b/vectorless-core/vectorless/src/storage/persistence.rs index b2dac4d4..38700925 100644 --- a/vectorless-core/vectorless/src/storage/persistence.rs +++ b/vectorless-core/vectorless/src/storage/persistence.rs @@ -232,6 +232,10 @@ pub struct PersistedDocument { /// Navigation index for Agent-based retrieval. #[serde(default, skip_serializing_if = "Option::is_none")] pub navigation_index: Option, + + /// Key concepts extracted from the document. + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub concepts: Vec, } impl PersistedDocument { @@ -244,6 +248,7 @@ impl PersistedDocument { pages: Vec::new(), reasoning_index: None, navigation_index: None, + concepts: Vec::new(), } } From 451f3fac6f7639d55013a66861318b50a16c07a4 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 22 Apr 2026 23:47:20 +0800 Subject: [PATCH 12/28] feat(agent): add reasoning trace collection and verification stage Add trace_steps field to Output and WorkerOutput structs to capture reasoning trace steps during agent navigation. Initialize trace_steps in constructors and extend WorkerState with trace collection capabilities. Add navigation index building and verification stage to pipeline that validates ingest output reliability by checking tree structure, document summary, and concept extraction results before persistence. Refactor document loading to use unified Document structure and implement trace collection in agent state management. --- .../vectorless/src/agent/config.rs | 6 ++ vectorless-core/vectorless/src/agent/state.rs | 17 ++++ .../vectorless/src/agent/worker/navigation.rs | 9 ++- .../vectorless/src/client/engine.rs | 27 ++----- .../vectorless/src/document/understanding.rs | 12 +++ .../vectorless/src/index/pipeline/executor.rs | 10 ++- .../vectorless/src/index/stages/mod.rs | 2 + .../src/index/stages/verify_ingest.rs | 79 +++++++++++++++++++ 8 files changed, 138 insertions(+), 24 deletions(-) create mode 100644 vectorless-core/vectorless/src/index/stages/verify_ingest.rs diff --git a/vectorless-core/vectorless/src/agent/config.rs b/vectorless-core/vectorless/src/agent/config.rs index 0873c8c6..54b6897e 100644 --- a/vectorless-core/vectorless/src/agent/config.rs +++ b/vectorless-core/vectorless/src/agent/config.rs @@ -84,6 +84,8 @@ pub struct Output { pub metrics: Metrics, /// Confidence score (0.0–1.0) — derived from LLM evaluate() result. pub confidence: f32, + /// Reasoning trace steps collected during agent navigation. 
+    pub trace_steps: Vec<TraceStep>,
 }
 
 impl Output {
@@ -94,6 +96,7 @@
             evidence: Vec::new(),
             metrics: Metrics::default(),
             confidence: 0.0,
+            trace_steps: Vec::new(),
         }
     }
 }
@@ -148,6 +151,8 @@ pub struct WorkerOutput {
     pub metrics: WorkerMetrics,
     /// Document name this Worker was assigned to.
     pub doc_name: String,
+    /// Reasoning trace steps from this Worker.
+    pub trace_steps: Vec<TraceStep>,
 }
 
 /// Metrics specific to a single Worker's execution.
@@ -184,6 +189,7 @@ impl From<WorkerOutput> for Output {
                 evidence_chars: wo.metrics.evidence_chars,
             },
             confidence: 0.0,
+            trace_steps: wo.trace_steps,
         }
     }
 }
diff --git a/vectorless-core/vectorless/src/agent/state.rs b/vectorless-core/vectorless/src/agent/state.rs
index 9e0612ad..8d59d9a0 100644
--- a/vectorless-core/vectorless/src/agent/state.rs
+++ b/vectorless-core/vectorless/src/agent/state.rs
@@ -6,6 +6,7 @@
 use std::collections::HashSet;
 
 use crate::document::NodeId;
+use crate::document::TraceStep;
 
 use super::config::{Evidence, Output};
 
@@ -47,6 +48,8 @@ pub struct WorkerState {
     pub check_count: u32,
     /// Whether a navigation plan was generated in Phase 1.5.
     pub plan_generated: bool,
+    /// Reasoning trace steps collected during navigation.
+    pub trace_steps: Vec<TraceStep>,
 }
 
 /// Maximum number of history entries to keep for prompt injection.
@@ -69,6 +72,7 @@ impl WorkerState {
             plan: String::new(),
             check_count: 0,
             plan_generated: false,
+            trace_steps: Vec::new(),
         }
     }
 
@@ -184,6 +188,7 @@ impl WorkerState {
                 evidence_chars,
             },
             doc_name: doc_name.to_string(),
+            trace_steps: self.trace_steps,
         }
     }
 }
@@ -259,11 +264,13 @@ impl OrchestratorState {
                 ..Default::default()
             },
             confidence: 0.0,
+            trace_steps: self.collect_trace_steps(),
         }
     }
 
     /// Merge all sub-results into a single Output (consuming self).
     pub fn into_output(self, answer: String) -> Output {
+        let trace_steps = self.collect_trace_steps();
         Output {
             answer,
             evidence: self.all_evidence,
@@ -284,8 +291,18 @@
                 ..Default::default()
             },
             confidence: 0.0,
+            trace_steps,
         }
     }
+
+    /// Collect trace steps from all sub-results.
+    fn collect_trace_steps(&self) -> Vec<TraceStep> {
+        let mut steps = Vec::new();
+        for result in &self.sub_results {
+            steps.extend(result.trace_steps.iter().cloned());
+        }
+        steps
+    }
 }
 
 impl Default for OrchestratorState {
diff --git a/vectorless-core/vectorless/src/agent/worker/navigation.rs b/vectorless-core/vectorless/src/agent/worker/navigation.rs
index 29b1a680..ce5c4ee7 100644
--- a/vectorless-core/vectorless/src/agent/worker/navigation.rs
+++ b/vectorless-core/vectorless/src/agent/worker/navigation.rs
@@ -203,7 +203,7 @@ fn handle_parse_failure(
     (command, is_parse_failure)
 }
 
-/// Push a round's command + feedback preview into history.
+/// Push a round's command + feedback preview into history and trace.
 fn push_round_history(state: &mut WorkerState, cmd_str: &str) {
     let feedback_preview = if state.last_feedback.len() > 120 {
         let boundary = state.last_feedback.ceil_char_boundary(120);
@@ -212,6 +212,13 @@
         state.last_feedback.clone()
     };
     state.push_history(format!("{} → {}", cmd_str, feedback_preview));
+
+    let round = state.max_rounds.saturating_sub(state.remaining);
+    state.trace_steps.push(crate::document::TraceStep {
+        action: cmd_str.to_string(),
+        observation: state.last_feedback.chars().take(200).collect(),
+        round,
+    });
 }
 
 /// Dynamic re-planning after an insufficient check. 
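The trace wiring above is easier to see in isolation: each navigation round appends one `TraceStep` carrying the command, a truncated observation, and the round number, and the orchestrator later concatenates the steps from all workers. Below is a self-contained sketch of that accumulation pattern; `TraceStep` here mirrors the engine's struct, and the commands and observations are invented.

```rust
// Mirrors crate::document::TraceStep; the navigation rounds are invented.
#[derive(Clone, Debug)]
struct TraceStep {
    action: String,
    observation: String,
    round: u32,
}

fn main() {
    let max_rounds: u32 = 10;
    let mut remaining = max_rounds;
    let mut trace: Vec<TraceStep> = Vec::new();

    for (cmd, feedback) in [
        ("ls", "3 sections: Introduction, Methods, Results"),
        ("cd Methods", "entered 'Methods' (2 children)"),
        ("cat 7", "Methods body text ..."),
    ] {
        remaining -= 1;
        trace.push(TraceStep {
            action: cmd.to_string(),
            // Observations are capped, as in push_round_history above.
            observation: feedback.chars().take(200).collect(),
            // Round is derived from the remaining budget, as in the diff.
            round: max_rounds.saturating_sub(remaining),
        });
    }

    assert_eq!(trace.len(), 3);
    assert_eq!(trace.last().map(|s| s.round), Some(3));
}
```

Capping each observation keeps the recorded trace compact while still letting the final output reconstruct how a worker moved through the document.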
diff --git a/vectorless-core/vectorless/src/client/engine.rs b/vectorless-core/vectorless/src/client/engine.rs index a1e30656..0e885c89 100644 --- a/vectorless-core/vectorless/src/client/engine.rs +++ b/vectorless-core/vectorless/src/client/engine.rs @@ -474,15 +474,10 @@ impl Engine { ))); } - // Build DocContexts and dispatch + // Build DocContexts from Documents and dispatch let doc_contexts: Vec = documents .iter() - .map(|(tree, nav, ridx, id)| crate::agent::DocContext { - tree, - nav_index: nav, - reasoning_index: ridx, - doc_name: id.as_str(), - }) + .map(|doc| doc.as_context()) .collect(); let skip_analysis = !ids.is_empty(); @@ -608,7 +603,9 @@ impl Engine { content: output.answer.clone(), evidence, confidence: output.confidence, - trace: ReasoningTrace::empty(), // TODO: wire up actual trace collection + trace: ReasoningTrace { + steps: output.trace_steps.clone(), + }, } } @@ -620,23 +617,13 @@ impl Engine { async fn load_documents( &self, doc_ids: &[String], - ) -> Result<( - Vec<( - crate::document::DocumentTree, - crate::document::NavigationIndex, - crate::document::ReasoningIndex, - String, - )>, - Vec, - )> { + ) -> Result<(Vec, Vec)> { let mut documents = Vec::new(); let mut failed = Vec::new(); for doc_id in doc_ids { match self.workspace.load(doc_id).await { Ok(Some(doc)) => { - let nav_index = doc.navigation_index.unwrap_or_default(); - let reasoning_index = doc.reasoning_index.unwrap_or_default(); - documents.push((doc.tree, nav_index, reasoning_index, doc_id.clone())); + documents.push(Self::persisted_to_understanding_document(doc)); } Ok(None) => { failed.push(FailedItem::new(doc_id, "Document not found")); diff --git a/vectorless-core/vectorless/src/document/understanding.rs b/vectorless-core/vectorless/src/document/understanding.rs index 147a794b..94eaeac4 100644 --- a/vectorless-core/vectorless/src/document/understanding.rs +++ b/vectorless-core/vectorless/src/document/understanding.rs @@ -90,6 +90,18 @@ pub struct DocumentInfo { } impl Document { + /// Create a read-only agent context from this document. + /// + /// Used internally by the retrieval agent for navigation and reasoning. + pub fn as_context(&self) -> crate::agent::DocContext<'_> { + crate::agent::DocContext { + tree: &self.tree, + nav_index: &self.nav_index, + reasoning_index: &self.reasoning_index, + doc_name: &self.name, + } + } + /// Get node content by ID (Agent `cat` command). pub fn cat(&self, node_id: super::node::NodeId) -> Option<&str> { self.tree.get(node_id).map(|n| n.content.as_str()) diff --git a/vectorless-core/vectorless/src/index/pipeline/executor.rs b/vectorless-core/vectorless/src/index/pipeline/executor.rs index 0a0ec22c..4421eeca 100644 --- a/vectorless-core/vectorless/src/index/pipeline/executor.rs +++ b/vectorless-core/vectorless/src/index/pipeline/executor.rs @@ -15,7 +15,7 @@ use super::super::PipelineOptions; use super::super::stages::{ BuildStage, ConceptExtractionStage, EnhanceStage, EnrichStage, IndexStage, NavigationIndexStage, OptimizeStage, ParseStage, ReasoningIndexStage, SplitStage, - ValidateStage, + ValidateStage, VerifyStage, }; use super::context::{IndexInput, PipelineResult}; use super::orchestrator::PipelineOrchestrator; @@ -58,7 +58,8 @@ impl PipelineExecutor { /// 6. `reasoning_index` - Build pre-computed reasoning index /// 7. `concept_extraction` - Extract key concepts (optional) /// 8. `navigation_index` - Build Agent navigation index - /// 9. `optimize` - Optimize tree structure + /// 9. `verify` - Validate ingest output reliability + /// 10. 
`optimize` - Optimize tree structure
     pub fn new() -> Self {
         let orchestrator = PipelineOrchestrator::new()
             .stage_with_priority(ParseStage::new(), 10)
@@ -69,6 +70,7 @@
             .stage_with_priority(ReasoningIndexStage::new(), 45)
             .stage_with_priority(ConceptExtractionStage::new(), 47)
             .stage_with_priority(NavigationIndexStage::new(), 50)
+            .stage_with_priority(VerifyStage, 55)
             .stage_with_priority(OptimizeStage::new(), 60);
 
         Self { orchestrator }
@@ -86,7 +88,8 @@
     /// 7. `reasoning_index` - Build pre-computed reasoning index
     /// 8. `concept_extraction` - Extract key concepts via LLM (optional)
     /// 9. `navigation_index` - Build Agent navigation index
-    /// 10. `optimize` - Optimize tree
+    /// 10. `verify` - Validate ingest output reliability
+    /// 11. `optimize` - Optimize tree
     pub fn with_llm(client: LlmClient) -> Self {
         tracing::info!(
             "PipelineExecutor::with_llm — cloning client to ParseStage + EnhanceStage + context"
@@ -102,6 +105,7 @@
             .stage_with_priority(ReasoningIndexStage::new(), 45)
             .stage_with_priority(ConceptExtractionStage::with_llm_client(client), 47)
             .stage_with_priority(NavigationIndexStage::new(), 50)
+            .stage_with_priority(VerifyStage, 55)
             .stage_with_priority(OptimizeStage::new(), 60);
 
         Self { orchestrator }
diff --git a/vectorless-core/vectorless/src/index/stages/mod.rs b/vectorless-core/vectorless/src/index/stages/mod.rs
index 22a424c4..2efed1e6 100644
--- a/vectorless-core/vectorless/src/index/stages/mod.rs
+++ b/vectorless-core/vectorless/src/index/stages/mod.rs
@@ -13,6 +13,7 @@ mod parse;
 mod reasoning;
 mod split;
 mod validate;
+mod verify_ingest;
 
 pub use build::BuildStage;
 pub use concept::ConceptExtractionStage;
@@ -24,6 +25,7 @@ pub use parse::ParseStage;
 pub use reasoning::ReasoningIndexStage;
 pub use split::SplitStage;
 pub use validate::ValidateStage;
+pub use verify_ingest::VerifyStage;
 
 use super::pipeline::{FailurePolicy, IndexContext, StageResult};
 use crate::error::Result;
diff --git a/vectorless-core/vectorless/src/index/stages/verify_ingest.rs b/vectorless-core/vectorless/src/index/stages/verify_ingest.rs
new file mode 100644
index 00000000..2d7125a6
--- /dev/null
+++ b/vectorless-core/vectorless/src/index/stages/verify_ingest.rs
@@ -0,0 +1,79 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Verify stage — validates ingest output reliability before persist.
+
+use tracing::{info, warn};
+
+use super::{AccessPattern, IndexStage};
+use crate::error::{Error, Result};
+use crate::index::pipeline::{IndexContext, StageResult};
+use super::async_trait;
+
+/// Verification stage — ensures ingest produced reliable output.
+///
+/// Checks:
+/// - Tree is non-empty (at least the root node); failure is a hard error
+/// - Document summary is non-empty; an empty summary logs a warning
+/// - At least one concept was extracted; none found logs a warning
+///
+/// Only the empty-tree check aborts ingest; the soft checks emit warnings instead of failing. 
+pub struct VerifyStage; + +#[async_trait] +impl IndexStage for VerifyStage { + fn name(&self) -> &str { + "verify" + } + + fn depends_on(&self) -> Vec<&'static str> { + vec!["concept_extraction"] + } + + fn is_optional(&self) -> bool { + false + } + + fn access_pattern(&self) -> AccessPattern { + AccessPattern { + reads_tree: true, + ..AccessPattern::default() + } + } + + async fn execute(&mut self, ctx: &mut IndexContext) -> Result { + // Tree must exist and have nodes + let tree = ctx.tree.as_ref().ok_or_else(|| { + Error::InvalidStructure("document tree is empty".into()) + })?; + let node_count = tree.node_count(); + if node_count == 0 { + return Err(Error::InvalidStructure( + "tree has no nodes".into(), + )); + } + + // Summary must be non-empty + let has_summary = ctx + .description + .as_ref() + .is_some_and(|s| !s.trim().is_empty()); + if !has_summary { + warn!("[verify] Document summary is empty"); + } + + // Concepts must be present (warning only — non-fatal) + if ctx.concepts.is_empty() { + warn!("[verify] No concepts extracted from document"); + } + + info!( + "[verify] Passed: {} nodes, summary={}, concepts={}", + node_count, + has_summary, + ctx.concepts.len() + ); + + Ok(StageResult::success("verify")) + } +} From 03d3cc2587c6fc9ba06bd86bc0dd37b9de0817de Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Thu, 23 Apr 2026 08:40:40 +0800 Subject: [PATCH 13/28] feat: split monorepo into modular crates and add agent functionality - Split the main crate into multiple specialized crates including vectorless-error, vectorless-document, vectorless-config, vectorless-utils, vectorless-scoring, vectorless-graph, vectorless-events, vectorless-metrics, vectorless-llm, vectorless-storage, vectorless-query, vectorless-index, vectorless-agent, vectorless-retrieval, vectorless-rerank, and vectorless-engine - Add comprehensive command parsing system for agent navigation with support for ls, cd, cat, find, grep, head, findtree, wc, pwd, check, and done commands - Implement quote-stripping and multi-level target resolution with exact, case-insensitive, substring, and numeric matching - Add extended target resolution with deep search capability up to depth 4 using BFS algorithm - Create agent configuration system with worker and answer pipeline settings including navigation budgets and evidence caps - Implement structured output types for agent results including evidence collection, metrics tracking, and confidence scoring - Add read-only context wrappers for accessing document navigation indices, content trees, and reasoning indexes - Include comprehensive test suite for command parsing and target resolution functionality - Add Python script to fix crate:: import references across split modules --- Cargo.toml | 20 +- vectorless-core/fix_imports.py | 96 ++ vectorless-core/vectorless-agent/Cargo.toml | 27 + .../vectorless-agent/src/command.rs | 629 ++++++++++ .../vectorless-agent/src/config.rs | 248 ++++ .../vectorless-agent/src/context.rs | 120 ++ .../vectorless-agent/src/events.rs | 537 +++++++++ vectorless-core/vectorless-agent/src/lib.rs | 55 + .../src/orchestrator/analyze.rs | 159 +++ .../src/orchestrator/dispatch.rs | 92 ++ .../src/orchestrator/evaluate.rs | 128 ++ .../vectorless-agent/src/orchestrator/mod.rs | 223 ++++ .../src/orchestrator/replan.rs | 249 ++++ .../src/orchestrator/supervisor.rs | 159 +++ .../vectorless-agent/src/prompts.rs | 569 +++++++++ vectorless-core/vectorless-agent/src/state.rs | 312 +++++ .../vectorless-agent/src/tools/common.rs | 69 ++ 
.../vectorless-agent/src/tools/mod.rs | 101 ++ .../src/tools/orchestrator.rs | 203 ++++ .../vectorless-agent/src/tools/worker/cat.rs | 115 ++ .../vectorless-agent/src/tools/worker/cd.rs | 262 +++++ .../vectorless-agent/src/tools/worker/find.rs | 128 ++ .../vectorless-agent/src/tools/worker/grep.rs | 175 +++ .../vectorless-agent/src/tools/worker/head.rs | 119 ++ .../vectorless-agent/src/tools/worker/ls.rs | 124 ++ .../vectorless-agent/src/tools/worker/mod.rs | 39 + .../vectorless-agent/src/tools/worker/pwd.rs | 58 + .../vectorless-agent/src/tools/worker/wc.rs | 109 ++ .../vectorless-agent/src/worker/execute.rs | 278 +++++ .../vectorless-agent/src/worker/format.rs | 20 + .../vectorless-agent/src/worker/mod.rs | 236 ++++ .../vectorless-agent/src/worker/navigation.rs | 448 +++++++ .../vectorless-agent/src/worker/planning.rs | 708 ++++++++++++ vectorless-core/vectorless-config/Cargo.toml | 18 + vectorless-core/vectorless-config/src/lib.rs | 21 + .../vectorless-config/src/types/graph.rs | 51 + .../vectorless-config/src/types/indexer.rs | 108 ++ .../vectorless-config/src/types/llm_pool.rs | 612 ++++++++++ .../vectorless-config/src/types/metrics.rs | 181 +++ .../vectorless-config/src/types/mod.rs | 372 ++++++ .../vectorless-config/src/types/retrieval.rs | 170 +++ .../vectorless-config/src/types/storage.rs | 742 ++++++++++++ .../vectorless-config/src/validator.rs | 323 ++++++ .../vectorless-document/Cargo.toml | 23 + .../vectorless-document/src/format.rs | 62 + .../vectorless-document/src/lib.rs | 43 + .../vectorless-document/src/navigation.rs | 626 ++++++++++ .../vectorless-document/src/node.rs | 144 +++ .../vectorless-document/src/reasoning.rs | 444 +++++++ .../vectorless-document/src/reference.rs | 559 +++++++++ .../vectorless-document/src/serde_helpers.rs | 241 ++++ .../vectorless-document/src/structure.rs | 65 ++ .../vectorless-document/src/toc.rs | 343 ++++++ .../vectorless-document/src/tree.rs | 883 ++++++++++++++ .../vectorless-document/src/understanding.rs | 306 +++++ vectorless-core/vectorless-engine/Cargo.toml | 34 + .../vectorless-engine/src/builder.rs | 268 +++++ .../vectorless-engine/src/engine.rs | 923 +++++++++++++++ .../vectorless-engine/src/index_context.rs | 363 ++++++ .../vectorless-engine/src/indexed_document.rs | 130 +++ .../vectorless-engine/src/indexer.rs | 387 +++++++ vectorless-core/vectorless-engine/src/lib.rs | 106 ++ .../vectorless-engine/src/query_context.rs | 179 +++ .../vectorless-engine/src/retriever.rs | 140 +++ .../vectorless-engine/src/test_support.rs | 54 + .../vectorless-engine/src/types.rs | 536 +++++++++ vectorless-core/vectorless-error/Cargo.toml | 15 + vectorless-core/vectorless-error/src/error.rs | 329 ++++++ vectorless-core/vectorless-error/src/lib.rs | 8 + vectorless-core/vectorless-events/Cargo.toml | 17 + .../vectorless-events/src/emitter.rs | 256 ++++ vectorless-core/vectorless-events/src/lib.rs | 31 + .../vectorless-events/src/types.rs | 138 +++ vectorless-core/vectorless-graph/Cargo.toml | 18 + .../vectorless-graph/src/builder.rs | 400 +++++++ .../vectorless-graph/src/config.rs | 51 + vectorless-core/vectorless-graph/src/lib.rs | 38 + vectorless-core/vectorless-graph/src/types.rs | 310 +++++ vectorless-core/vectorless-index/Cargo.toml | 36 + .../vectorless-index/src/config.rs | 389 +++++++ .../src/incremental/detector.rs | 654 +++++++++++ .../vectorless-index/src/incremental/mod.rs | 81 ++ .../src/incremental/resolver.rs | 105 ++ .../src/incremental/updater.rs | 177 +++ vectorless-core/vectorless-index/src/lib.rs | 73 ++ 
.../src/parse/markdown/config.rs | 219 ++++ .../src/parse/markdown/frontmatter.rs | 219 ++++ .../src/parse/markdown/mod.rs | 30 + .../src/parse/markdown/parser.rs | 601 ++++++++++ .../vectorless-index/src/parse/mod.rs | 0 .../vectorless-index/src/parse/pdf/mod.rs | 32 + .../vectorless-index/src/parse/pdf/parser.rs | 366 ++++++ .../vectorless-index/src/parse/pdf/types.rs | 171 +++ .../src/parse/toc/assigner.rs | 395 +++++++ .../src/parse/toc/detector.rs | 349 ++++++ .../vectorless-index/src/parse/toc/mod.rs | 28 + .../vectorless-index/src/parse/toc/parser.rs | 279 +++++ .../src/parse/toc/processor.rs | 573 +++++++++ .../src/parse/toc/repairer.rs | 247 ++++ .../src/parse/toc/structure_extractor.rs | 481 ++++++++ .../vectorless-index/src/parse/toc/types.rs | 350 ++++++ .../src/parse/toc/verifier.rs | 281 +++++ .../vectorless-index/src/parse/types.rs | 173 +++ .../src/pipeline/checkpoint.rs | 329 ++++++ .../vectorless-index/src/pipeline/context.rs | 465 ++++++++ .../vectorless-index/src/pipeline/executor.rs | 198 ++++ .../vectorless-index/src/pipeline/metrics.rs | 6 + .../vectorless-index/src/pipeline/mod.rs | 24 + .../src/pipeline/orchestrator.rs | 1028 +++++++++++++++++ .../vectorless-index/src/pipeline/policy.rs | 222 ++++ .../vectorless-index/src/stages/build.rs | 334 ++++++ .../vectorless-index/src/stages/concept.rs | 238 ++++ .../vectorless-index/src/stages/enhance.rs | 449 +++++++ .../vectorless-index/src/stages/enrich.rs | 240 ++++ .../vectorless-index/src/stages/mod.rs | 141 +++ .../vectorless-index/src/stages/navigation.rs | 563 +++++++++ .../vectorless-index/src/stages/optimize.rs | 455 ++++++++ .../vectorless-index/src/stages/parse.rs | 166 +++ .../vectorless-index/src/stages/reasoning.rs | 639 ++++++++++ .../vectorless-index/src/stages/split.rs | 347 ++++++ .../vectorless-index/src/stages/validate.rs | 365 ++++++ .../src/stages/verify_ingest.rs | 79 ++ .../vectorless-index/src/summary/full.rs | 65 ++ .../vectorless-index/src/summary/lazy.rs | 153 +++ .../vectorless-index/src/summary/mod.rs | 24 + .../vectorless-index/src/summary/selective.rs | 120 ++ .../vectorless-index/src/summary/strategy.rs | 322 ++++++ vectorless-core/vectorless-llm/Cargo.toml | 32 + vectorless-core/vectorless-llm/src/client.rs | 378 ++++++ vectorless-core/vectorless-llm/src/config.rs | 260 +++++ vectorless-core/vectorless-llm/src/error.rs | 135 +++ .../vectorless-llm/src/executor.rs | 568 +++++++++ .../vectorless-llm/src/fallback.rs | 378 ++++++ vectorless-core/vectorless-llm/src/lib.rs | 45 + .../vectorless-llm/src/memo/mod.rs | 14 + .../vectorless-llm/src/memo/store.rs | 679 +++++++++++ .../vectorless-llm/src/memo/types.rs | 414 +++++++ vectorless-core/vectorless-llm/src/pool.rs | 176 +++ .../vectorless-llm/src/throttle.rs | 270 +++++ vectorless-core/vectorless-metrics/Cargo.toml | 19 + vectorless-core/vectorless-metrics/src/hub.rs | 324 ++++++ .../vectorless-metrics/src/index.rs | 199 ++++ vectorless-core/vectorless-metrics/src/lib.rs | 56 + vectorless-core/vectorless-metrics/src/llm.rs | 207 ++++ .../vectorless-metrics/src/retrieval.rs | 263 +++++ vectorless-core/vectorless-py/Cargo.toml | 2 +- vectorless-core/vectorless-query/Cargo.toml | 22 + vectorless-core/vectorless-query/src/lib.rs | 45 + vectorless-core/vectorless-query/src/types.rs | 114 ++ .../vectorless-query/src/understand.rs | 246 ++++ vectorless-core/vectorless-rerank/Cargo.toml | 19 + .../vectorless-rerank/src/dedup.rs | 216 ++++ vectorless-core/vectorless-rerank/src/lib.rs | 104 ++ .../vectorless-rerank/src/types.rs | 14 + 
 .../vectorless-retrieval/Cargo.toml | 27 +
 .../vectorless-retrieval/src/cache.rs | 577 +++++++++
 .../vectorless-retrieval/src/dispatcher.rs | 78 ++
 .../vectorless-retrieval/src/lib.rs | 28 +
 .../vectorless-retrieval/src/postprocessor.rs | 130 +++
 .../vectorless-retrieval/src/stream.rs | 128 ++
 .../vectorless-retrieval/src/types.rs | 193 ++++
 vectorless-core/vectorless-scoring/Cargo.toml | 17 +
 .../vectorless-scoring/src/bm25.rs | 690 +++++++++++
 vectorless-core/vectorless-scoring/src/lib.rs | 8 +
 vectorless-core/vectorless-storage/Cargo.toml | 35 +
 .../vectorless-storage/src/backend/file.rs | 293 +++++
 .../vectorless-storage/src/backend/memory.rs | 181 +++
 .../vectorless-storage/src/backend/mod.rs | 34 +
 .../src/backend/trait_def.rs | 113 ++
 .../vectorless-storage/src/cache.rs | 381 ++++++
 .../vectorless-storage/src/codec.rs | 245 ++++
 vectorless-core/vectorless-storage/src/lib.rs | 46 +
 .../vectorless-storage/src/lock.rs | 280 +++++
 .../vectorless-storage/src/migration.rs | 385 ++++++
 .../vectorless-storage/src/persistence.rs | 877 ++++++++++++++
 vectorless-core/vectorless-utils/Cargo.toml | 22 +
 .../vectorless-utils/src/fingerprint.rs | 496 ++++++++
 vectorless-core/vectorless-utils/src/lib.rs | 17 +
 vectorless-core/vectorless-utils/src/token.rs | 64 +
 .../vectorless-utils/src/validation.rs | 195 ++++
 .../vectorless/src/client/index_context.rs | 2 +-
 .../vectorless/src/client/indexed_document.rs | 2 +-
 .../vectorless/src/client/indexer.rs | 2 +-
 vectorless-core/vectorless/src/client/mod.rs | 2 +-
 .../vectorless/src/client/types.rs | 2 +-
 .../vectorless/src/document/format.rs | 62 +
 .../vectorless/src/document/mod.rs | 2 +
 .../vectorless/src/document/understanding.rs | 2 +-
 .../vectorless/src/events/types.rs | 4 +-
 .../src/index/incremental/resolver.rs | 2 +-
 .../vectorless/src/index/parse/types.rs | 37 +-
 .../vectorless/src/index/stages/parse.rs | 2 +-
 .../vectorless/src/retrieval/types.rs | 20 +-
 .../vectorless/src/utils/validation.rs | 2 +-
 194 files changed, 41981 insertions(+), 65 deletions(-)
 create mode 100644 vectorless-core/fix_imports.py
 create mode 100644 vectorless-core/vectorless-agent/Cargo.toml
 create mode 100644 vectorless-core/vectorless-agent/src/command.rs
 create mode 100644 vectorless-core/vectorless-agent/src/config.rs
 create mode 100644 vectorless-core/vectorless-agent/src/context.rs
 create mode 100644 vectorless-core/vectorless-agent/src/events.rs
 create mode 100644 vectorless-core/vectorless-agent/src/lib.rs
 create mode 100644 vectorless-core/vectorless-agent/src/orchestrator/analyze.rs
 create mode 100644 vectorless-core/vectorless-agent/src/orchestrator/dispatch.rs
 create mode 100644 vectorless-core/vectorless-agent/src/orchestrator/evaluate.rs
 create mode 100644 vectorless-core/vectorless-agent/src/orchestrator/mod.rs
 create mode 100644 vectorless-core/vectorless-agent/src/orchestrator/replan.rs
 create mode 100644 vectorless-core/vectorless-agent/src/orchestrator/supervisor.rs
 create mode 100644 vectorless-core/vectorless-agent/src/prompts.rs
 create mode 100644 vectorless-core/vectorless-agent/src/state.rs
 create mode 100644 vectorless-core/vectorless-agent/src/tools/common.rs
 create mode 100644 vectorless-core/vectorless-agent/src/tools/mod.rs
 create mode 100644 vectorless-core/vectorless-agent/src/tools/orchestrator.rs
 create mode 100644 vectorless-core/vectorless-agent/src/tools/worker/cat.rs
 create mode 100644 vectorless-core/vectorless-agent/src/tools/worker/cd.rs
 create mode 100644 vectorless-core/vectorless-agent/src/tools/worker/find.rs
 create mode 100644 vectorless-core/vectorless-agent/src/tools/worker/grep.rs
 create mode 100644 vectorless-core/vectorless-agent/src/tools/worker/head.rs
 create mode 100644 vectorless-core/vectorless-agent/src/tools/worker/ls.rs
 create mode 100644 vectorless-core/vectorless-agent/src/tools/worker/mod.rs
 create mode 100644 vectorless-core/vectorless-agent/src/tools/worker/pwd.rs
 create mode 100644 vectorless-core/vectorless-agent/src/tools/worker/wc.rs
 create mode 100644 vectorless-core/vectorless-agent/src/worker/execute.rs
 create mode 100644 vectorless-core/vectorless-agent/src/worker/format.rs
 create mode 100644 vectorless-core/vectorless-agent/src/worker/mod.rs
 create mode 100644 vectorless-core/vectorless-agent/src/worker/navigation.rs
 create mode 100644 vectorless-core/vectorless-agent/src/worker/planning.rs
 create mode 100644 vectorless-core/vectorless-config/Cargo.toml
 create mode 100644 vectorless-core/vectorless-config/src/lib.rs
 create mode 100644 vectorless-core/vectorless-config/src/types/graph.rs
 create mode 100644 vectorless-core/vectorless-config/src/types/indexer.rs
 create mode 100644 vectorless-core/vectorless-config/src/types/llm_pool.rs
 create mode 100644 vectorless-core/vectorless-config/src/types/metrics.rs
 create mode 100644 vectorless-core/vectorless-config/src/types/mod.rs
 create mode 100644 vectorless-core/vectorless-config/src/types/retrieval.rs
 create mode 100644 vectorless-core/vectorless-config/src/types/storage.rs
 create mode 100644 vectorless-core/vectorless-config/src/validator.rs
 create mode 100644 vectorless-core/vectorless-document/Cargo.toml
 create mode 100644 vectorless-core/vectorless-document/src/format.rs
 create mode 100644 vectorless-core/vectorless-document/src/lib.rs
 create mode 100644 vectorless-core/vectorless-document/src/navigation.rs
 create mode 100644 vectorless-core/vectorless-document/src/node.rs
 create mode 100644 vectorless-core/vectorless-document/src/reasoning.rs
 create mode 100644 vectorless-core/vectorless-document/src/reference.rs
 create mode 100644 vectorless-core/vectorless-document/src/serde_helpers.rs
 create mode 100644 vectorless-core/vectorless-document/src/structure.rs
 create mode 100644 vectorless-core/vectorless-document/src/toc.rs
 create mode 100644 vectorless-core/vectorless-document/src/tree.rs
 create mode 100644 vectorless-core/vectorless-document/src/understanding.rs
 create mode 100644 vectorless-core/vectorless-engine/Cargo.toml
 create mode 100644 vectorless-core/vectorless-engine/src/builder.rs
 create mode 100644 vectorless-core/vectorless-engine/src/engine.rs
 create mode 100644 vectorless-core/vectorless-engine/src/index_context.rs
 create mode 100644 vectorless-core/vectorless-engine/src/indexed_document.rs
 create mode 100644 vectorless-core/vectorless-engine/src/indexer.rs
 create mode 100644 vectorless-core/vectorless-engine/src/lib.rs
 create mode 100644 vectorless-core/vectorless-engine/src/query_context.rs
 create mode 100644 vectorless-core/vectorless-engine/src/retriever.rs
 create mode 100644 vectorless-core/vectorless-engine/src/test_support.rs
 create mode 100644 vectorless-core/vectorless-engine/src/types.rs
 create mode 100644 vectorless-core/vectorless-error/Cargo.toml
 create mode 100644 vectorless-core/vectorless-error/src/error.rs
 create mode 100644 vectorless-core/vectorless-error/src/lib.rs
 create mode 100644 vectorless-core/vectorless-events/Cargo.toml
 create mode 100644 vectorless-core/vectorless-events/src/emitter.rs
 create mode 100644 vectorless-core/vectorless-events/src/lib.rs
 create mode 100644 vectorless-core/vectorless-events/src/types.rs
 create mode 100644 vectorless-core/vectorless-graph/Cargo.toml
 create mode 100644 vectorless-core/vectorless-graph/src/builder.rs
 create mode 100644 vectorless-core/vectorless-graph/src/config.rs
 create mode 100644 vectorless-core/vectorless-graph/src/lib.rs
 create mode 100644 vectorless-core/vectorless-graph/src/types.rs
 create mode 100644 vectorless-core/vectorless-index/Cargo.toml
 create mode 100644 vectorless-core/vectorless-index/src/config.rs
 create mode 100644 vectorless-core/vectorless-index/src/incremental/detector.rs
 create mode 100644 vectorless-core/vectorless-index/src/incremental/mod.rs
 create mode 100644 vectorless-core/vectorless-index/src/incremental/resolver.rs
 create mode 100644 vectorless-core/vectorless-index/src/incremental/updater.rs
 create mode 100644 vectorless-core/vectorless-index/src/lib.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/markdown/config.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/markdown/frontmatter.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/markdown/mod.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/markdown/parser.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/mod.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/pdf/mod.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/pdf/parser.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/pdf/types.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/toc/assigner.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/toc/detector.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/toc/mod.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/toc/parser.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/toc/processor.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/toc/repairer.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/toc/structure_extractor.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/toc/types.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/toc/verifier.rs
 create mode 100644 vectorless-core/vectorless-index/src/parse/types.rs
 create mode 100644 vectorless-core/vectorless-index/src/pipeline/checkpoint.rs
 create mode 100644 vectorless-core/vectorless-index/src/pipeline/context.rs
 create mode 100644 vectorless-core/vectorless-index/src/pipeline/executor.rs
 create mode 100644 vectorless-core/vectorless-index/src/pipeline/metrics.rs
 create mode 100644 vectorless-core/vectorless-index/src/pipeline/mod.rs
 create mode 100644 vectorless-core/vectorless-index/src/pipeline/orchestrator.rs
 create mode 100644 vectorless-core/vectorless-index/src/pipeline/policy.rs
 create mode 100644 vectorless-core/vectorless-index/src/stages/build.rs
 create mode 100644 vectorless-core/vectorless-index/src/stages/concept.rs
 create mode 100644 vectorless-core/vectorless-index/src/stages/enhance.rs
 create mode 100644 vectorless-core/vectorless-index/src/stages/enrich.rs
 create mode 100644 vectorless-core/vectorless-index/src/stages/mod.rs
 create mode 100644 vectorless-core/vectorless-index/src/stages/navigation.rs
 create mode 100644 vectorless-core/vectorless-index/src/stages/optimize.rs
 create mode 100644 vectorless-core/vectorless-index/src/stages/parse.rs
 create mode 100644 vectorless-core/vectorless-index/src/stages/reasoning.rs
 create mode 100644 vectorless-core/vectorless-index/src/stages/split.rs
 create mode 100644 vectorless-core/vectorless-index/src/stages/validate.rs
 create mode 100644 vectorless-core/vectorless-index/src/stages/verify_ingest.rs
 create mode 100644 vectorless-core/vectorless-index/src/summary/full.rs
 create mode 100644 vectorless-core/vectorless-index/src/summary/lazy.rs
 create mode 100644 vectorless-core/vectorless-index/src/summary/mod.rs
 create mode 100644 vectorless-core/vectorless-index/src/summary/selective.rs
 create mode 100644 vectorless-core/vectorless-index/src/summary/strategy.rs
 create mode 100644 vectorless-core/vectorless-llm/Cargo.toml
 create mode 100644 vectorless-core/vectorless-llm/src/client.rs
 create mode 100644 vectorless-core/vectorless-llm/src/config.rs
 create mode 100644 vectorless-core/vectorless-llm/src/error.rs
 create mode 100644 vectorless-core/vectorless-llm/src/executor.rs
 create mode 100644 vectorless-core/vectorless-llm/src/fallback.rs
 create mode 100644 vectorless-core/vectorless-llm/src/lib.rs
 create mode 100644 vectorless-core/vectorless-llm/src/memo/mod.rs
 create mode 100644 vectorless-core/vectorless-llm/src/memo/store.rs
 create mode 100644 vectorless-core/vectorless-llm/src/memo/types.rs
 create mode 100644 vectorless-core/vectorless-llm/src/pool.rs
 create mode 100644 vectorless-core/vectorless-llm/src/throttle.rs
 create mode 100644 vectorless-core/vectorless-metrics/Cargo.toml
 create mode 100644 vectorless-core/vectorless-metrics/src/hub.rs
 create mode 100644 vectorless-core/vectorless-metrics/src/index.rs
 create mode 100644 vectorless-core/vectorless-metrics/src/lib.rs
 create mode 100644 vectorless-core/vectorless-metrics/src/llm.rs
 create mode 100644 vectorless-core/vectorless-metrics/src/retrieval.rs
 create mode 100644 vectorless-core/vectorless-query/Cargo.toml
 create mode 100644 vectorless-core/vectorless-query/src/lib.rs
 create mode 100644 vectorless-core/vectorless-query/src/types.rs
 create mode 100644 vectorless-core/vectorless-query/src/understand.rs
 create mode 100644 vectorless-core/vectorless-rerank/Cargo.toml
 create mode 100644 vectorless-core/vectorless-rerank/src/dedup.rs
 create mode 100644 vectorless-core/vectorless-rerank/src/lib.rs
 create mode 100644 vectorless-core/vectorless-rerank/src/types.rs
 create mode 100644 vectorless-core/vectorless-retrieval/Cargo.toml
 create mode 100644 vectorless-core/vectorless-retrieval/src/cache.rs
 create mode 100644 vectorless-core/vectorless-retrieval/src/dispatcher.rs
 create mode 100644 vectorless-core/vectorless-retrieval/src/lib.rs
 create mode 100644 vectorless-core/vectorless-retrieval/src/postprocessor.rs
 create mode 100644 vectorless-core/vectorless-retrieval/src/stream.rs
 create mode 100644 vectorless-core/vectorless-retrieval/src/types.rs
 create mode 100644 vectorless-core/vectorless-scoring/Cargo.toml
 create mode 100644 vectorless-core/vectorless-scoring/src/bm25.rs
 create mode 100644 vectorless-core/vectorless-scoring/src/lib.rs
 create mode 100644 vectorless-core/vectorless-storage/Cargo.toml
 create mode 100644 vectorless-core/vectorless-storage/src/backend/file.rs
 create mode 100644 vectorless-core/vectorless-storage/src/backend/memory.rs
 create mode 100644 vectorless-core/vectorless-storage/src/backend/mod.rs
 create mode 100644 vectorless-core/vectorless-storage/src/backend/trait_def.rs
 create mode 100644 vectorless-core/vectorless-storage/src/cache.rs
 create mode 100644 vectorless-core/vectorless-storage/src/codec.rs
 create mode 100644 vectorless-core/vectorless-storage/src/lib.rs
 create mode 100644 vectorless-core/vectorless-storage/src/lock.rs
 create mode 100644 vectorless-core/vectorless-storage/src/migration.rs
 create mode 100644 vectorless-core/vectorless-storage/src/persistence.rs
 create mode 100644 vectorless-core/vectorless-utils/Cargo.toml
 create mode 100644 vectorless-core/vectorless-utils/src/fingerprint.rs
 create mode 100644 vectorless-core/vectorless-utils/src/lib.rs
 create mode 100644 vectorless-core/vectorless-utils/src/token.rs
 create mode 100644 vectorless-core/vectorless-utils/src/validation.rs
 create mode 100644 vectorless-core/vectorless/src/document/format.rs

diff --git a/Cargo.toml b/Cargo.toml
index 5c9a82a6..30641940 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,5 +1,23 @@
 [workspace]
-members = ["vectorless-core/vectorless", "vectorless-core/vectorless-py"]
+members = [
+    "vectorless-core/vectorless-error",
+    "vectorless-core/vectorless-document",
+    "vectorless-core/vectorless-config",
+    "vectorless-core/vectorless-utils",
+    "vectorless-core/vectorless-scoring",
+    "vectorless-core/vectorless-graph",
+    "vectorless-core/vectorless-events",
+    "vectorless-core/vectorless-metrics",
+    "vectorless-core/vectorless-llm",
+    "vectorless-core/vectorless-storage",
+    "vectorless-core/vectorless-query",
+    "vectorless-core/vectorless-index",
+    "vectorless-core/vectorless-agent",
+    "vectorless-core/vectorless-retrieval",
+    "vectorless-core/vectorless-rerank",
+    "vectorless-core/vectorless-engine",
+    "vectorless-core/vectorless-py",
+]
 resolver = "2"
 
 [workspace.package]
diff --git a/vectorless-core/fix_imports.py b/vectorless-core/fix_imports.py
new file mode 100644
index 00000000..ce2c7aea
--- /dev/null
+++ b/vectorless-core/fix_imports.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+"""Fix crate:: imports for the split crates.
+
+For each crate, self-references (crate::SELF_MODULE::) stay as crate::.
+External references (crate::OTHER_MODULE::) become vectorless_other::.
+Also handles bare `crate::Error` -> `vectorless_error::Error`.
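+
+Example (illustrative, based on the mapping tables below): inside the
+vectorless-llm crate (self module `llm`), `crate::metrics::...` is rewritten
+to `vectorless_metrics::...`, while `crate::llm::...` is left untouched.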
+""" +import os +import re +import sys + +# Mapping: crate_dir -> (self_module, [external_deps]) +CRATES = { + "vectorless-error": ("error", []), + "vectorless-document": ("document", []), + "vectorless-config": ("config", []), + "vectorless-utils": ("utils", ["error", "document"]), + "vectorless-scoring": ("scoring", []), + "vectorless-graph": ("graph", ["document"]), + "vectorless-events": ("events", ["error", "document"]), + "vectorless-metrics": ("metrics", ["config", "error"]), + "vectorless-llm": ("llm", ["config", "error", "metrics", "utils"]), + "vectorless-storage": ("storage", ["config", "document", "error", "utils"]), + "vectorless-query": ("query", ["error", "llm", "scoring"]), + "vectorless-index": ("index", ["config", "document", "error", "llm", "metrics", "scoring", "storage", "utils"]), + "vectorless-agent": ("agent", ["document", "error", "llm", "query", "scoring"]), + "vectorless-retrieval": ("retrieval", ["agent", "document", "error", "llm", "query", "storage", "utils"]), + "vectorless-rerank": ("rerank", ["agent", "error", "query"]), + "vectorless-engine": ("client", ["agent", "config", "document", "error", "events", "index", "llm", "metrics", "retrieval", "rerank", "storage"]), +} + +MODULE_TO_CRATE = { + "error": "vectorless_error", + "document": "vectorless_document", + "config": "vectorless_config", + "utils": "vectorless_utils", + "scoring": "vectorless_scoring", + "graph": "vectorless_graph", + "events": "vectorless_events", + "metrics": "vectorless_metrics", + "llm": "vectorless_llm", + "storage": "vectorless_storage", + "query": "vectorless_query", + "index": "vectorless_index", + "agent": "vectorless_agent", + "retrieval": "vectorless_retrieval", + "rerank": "vectorless_rerank", + "client": "vectorless_engine", +} + +BASE = "/home/ztgx/Desktop/vectorless/vectorless-core" + +def fix_file(filepath, self_module): + with open(filepath, 'r') as f: + content = f.read() + + original = content + + # Replace crate::OTHER_MODULE:: with vectorless_other:: + # But keep crate::SELF_MODULE:: as crate::SELF_MODULE:: + for module, crate_name in MODULE_TO_CRATE.items(): + if module == self_module: + continue + # Match crate::module:: (with word boundary to avoid partial matches) + pattern = r'crate::' + re.escape(module) + r'::' + replacement = crate_name + '::' + content = re.sub(pattern, replacement, content) + + # Replace bare crate::Error (without any module prefix) with vectorless_error::Error + # But only if self_module is not "error" + if self_module != "error": + # Match "crate::Error" that isn't followed by :: (i.e., not crate::error::) + content = re.sub(r'crate::Error(?!::)', 'vectorless_error::Error', content) + # Match "crate::Result" -> "vectorless_error::Result" + content = re.sub(r'crate::Result(?!::)', 'vectorless_error::Result', content) + + if content != original: + with open(filepath, 'w') as f: + f.write(content) + return True + return False + +changed_files = 0 +for crate_dir, (self_module, deps) in CRATES.items(): + src_dir = os.path.join(BASE, crate_dir, "src") + if not os.path.isdir(src_dir): + continue + for root, dirs, files in os.walk(src_dir): + for fname in files: + if fname.endswith('.rs'): + fpath = os.path.join(root, fname) + if fix_file(fpath, self_module): + changed_files += 1 + print(f" Fixed: {os.path.relpath(fpath, BASE)}") + +print(f"\nTotal files changed: {changed_files}") diff --git a/vectorless-core/vectorless-agent/Cargo.toml b/vectorless-core/vectorless-agent/Cargo.toml new file mode 100644 index 00000000..7ecfbaa5 --- /dev/null +++ 
b/vectorless-core/vectorless-agent/Cargo.toml
@@ -0,0 +1,27 @@
+[package]
+name = "vectorless-agent"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+
+[dependencies]
+vectorless-document = { path = "../vectorless-document" }
+vectorless-error = { path = "../vectorless-error" }
+vectorless-llm = { path = "../vectorless-llm" }
+vectorless-query = { path = "../vectorless-query" }
+vectorless-scoring = { path = "../vectorless-scoring" }
+tokio = { workspace = true }
+async-trait = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+tracing = { workspace = true }
+futures = { workspace = true }
+chrono = { workspace = true }
+thiserror = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/vectorless-core/vectorless-agent/src/command.rs b/vectorless-core/vectorless-agent/src/command.rs
new file mode 100644
index 00000000..07385420
--- /dev/null
+++ b/vectorless-core/vectorless-agent/src/command.rs
@@ -0,0 +1,629 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Command parsing for the agent navigation loop.
+//!
+//! LLM output is parsed into `Command` variants. The parser is intentionally
+//! simple and forgiving — unknown input falls back to `Ls` so the agent can
+//! re-observe its surroundings.
+
+use vectorless_document::{NavigationIndex, NodeId};
+
+/// Parsed command from LLM output.
+#[derive(Debug, Clone, PartialEq)]
+pub enum Command {
+    /// List children of the current node.
+    Ls,
+    /// Navigate into a child node by name.
+    Cd { target: String },
+    /// Navigate back to parent.
+    CdUp,
+    /// Read node content (collects as evidence).
+    Cat { target: String },
+    /// Search for a keyword in the ReasoningIndex.
+    Find { keyword: String },
+    /// Regex search across node content in the current subtree.
+    Grep { pattern: String },
+    /// Preview first N lines of a node without collecting evidence.
+    Head { target: String, lines: usize },
+    /// Search for nodes by title pattern in the tree.
+    FindTree { pattern: String },
+    /// Show node content size (lines, chars).
+    Wc { target: String },
+    /// Show current navigation path.
+    Pwd,
+    /// Evaluate evidence sufficiency.
+    Check,
+    /// End navigation.
+    Done,
+}
+
+/// Strip surrounding quotes from a target string.
+///
+/// Handles straight quotes (`"`, `'`) and Unicode smart quotes (U+201C/U+201D, U+2018/U+2019).
+fn strip_quotes(s: &str) -> String {
+    let trimmed = s.trim();
+    let chars: Vec<char> = trimmed.chars().collect();
+    if chars.len() < 2 {
+        return trimmed.to_string();
+    }
+    let (first, last) = (chars[0], chars[chars.len() - 1]);
+    let matching = (first == '"' && last == '"')
+        || (first == '\'' && last == '\'')
+        || (first == '\u{201c}' && last == '\u{201d}')
+        || (first == '\u{2018}' && last == '\u{2019}');
+    if matching {
+        trimmed[chars[0].len_utf8()..trimmed.len() - chars[chars.len() - 1].len_utf8()].to_string()
+    } else {
+        trimmed.to_string()
+    }
+}
+
+/// Parse the first non-empty line of LLM output into a Command.
+pub fn parse_command(llm_output: &str) -> Command {
+    let line = llm_output
+        .lines()
+        .find(|l| !l.trim().is_empty())
+        .unwrap_or("")
+        .trim();
+
+    // Remove common wrapping (markdown code blocks, etc.)
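+    // Illustrative inputs this tolerates (assumed typical LLM replies):
+    //   `ls`                      -> backticks stripped below, parsed as Ls
+    //   cd "Research Labs"        -> quotes removed later via strip_quotes()
+    //   head -n 5 API Reference   -> Head { target: "API Reference", lines: 5 }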
+    let line = line.trim_start_matches('`').trim_end_matches('`').trim();
+
+    let parts: Vec<&str> = line.split_whitespace().collect();
+
+    match parts.as_slice() {
+        ["ls"] => Command::Ls,
+        ["cat"] => Command::Cat {
+            target: ".".to_string(),
+        },
+        ["cd", ".."] => Command::CdUp,
+        ["cd", target] => Command::Cd {
+            target: strip_quotes(target),
+        },
+        ["cd", _target, ..] => Command::Cd {
+            // Handle "cd some name" by joining remaining parts
+            target: strip_quotes(&parts[1..].join(" ")),
+        },
+        ["cat", target] => Command::Cat {
+            target: strip_quotes(target),
+        },
+        ["cat", _target, ..] => Command::Cat {
+            target: strip_quotes(&parts[1..].join(" ")),
+        },
+        ["find", keyword] => Command::Find {
+            keyword: strip_quotes(keyword),
+        },
+        ["find", _keyword, ..] => Command::Find {
+            keyword: strip_quotes(&parts[1..].join(" ")),
+        },
+        ["grep", pattern] => Command::Grep {
+            pattern: strip_quotes(pattern),
+        },
+        ["grep", _pattern, ..] => Command::Grep {
+            pattern: strip_quotes(&parts[1..].join(" ")),
+        },
+        ["head", target] => Command::Head {
+            target: strip_quotes(target),
+            lines: 20, // default
+        },
+        ["head", "-n", n, target @ ..] => Command::Head {
+            target: strip_quotes(&target.join(" ")),
+            lines: n.parse().unwrap_or(20),
+        },
+        ["head", _target, ..] => Command::Head {
+            target: strip_quotes(&parts[1..].join(" ")),
+            lines: 20,
+        },
+        ["findtree", pattern] => Command::FindTree {
+            pattern: strip_quotes(pattern),
+        },
+        ["findtree", _pattern, ..] => Command::FindTree {
+            pattern: strip_quotes(&parts[1..].join(" ")),
+        },
+        ["wc", target] => Command::Wc {
+            target: strip_quotes(target),
+        },
+        ["wc", _target, ..] => Command::Wc {
+            target: strip_quotes(&parts[1..].join(" ")),
+        },
+        ["pwd"] => Command::Pwd,
+        ["check"] => Command::Check,
+        ["done"] => Command::Done,
+        _ => Command::Ls, // fallback: re-observe
+    }
+}
+
+/// Resolve a cd/cat target string to a NodeId using multi-level matching.
+///
+/// Matching priority:
+/// 1. Exact title match
+/// 2. Case-insensitive title match
+/// 3. Substring (contains) match
+/// 4. Numeric index match ("1" → first child, "2" → second, etc.)
+pub fn resolve_target(
+    target: &str,
+    nav_index: &NavigationIndex,
+    current_node: NodeId,
+) -> Option<NodeId> {
+    let target = strip_quotes(target);
+    let routes = nav_index.get_child_routes(current_node)?;
+
+    // 1. Exact match
+    if let Some(r) = routes.iter().find(|r| r.title == target) {
+        return Some(r.node_id);
+    }
+
+    // 2. Case-insensitive match
+    let target_lower = target.to_lowercase();
+    if let Some(r) = routes
+        .iter()
+        .find(|r| r.title.to_lowercase() == target_lower)
+    {
+        return Some(r.node_id);
+    }
+
+    // 3. Substring (contains) match
+    if let Some(r) = routes
+        .iter()
+        .find(|r| r.title.to_lowercase().contains(&target_lower))
+    {
+        return Some(r.node_id);
+    }
+
+    // 4. Numeric index match ("1" → first child)
+    if let Ok(idx) = target.parse::<usize>() {
+        if idx > 0 && idx <= routes.len() {
+            return Some(routes[idx - 1].node_id);
+        }
+    }
+
+    None
+}
+
+/// Resolve a cd/cat target with additional context from the tree node titles.
+///
+/// Matching priority:
+/// 1. Direct children via NavigationIndex (exact, case-insensitive, substring, numeric)
+/// 2. Direct children via TreeNode titles (case-insensitive contains)
+/// 3. Deep descendant search (BFS, up to depth 4) — enables `cd "Research Labs"` from
+///    root when "Research Labs" is a grandchild behind an intermediate wrapper node.
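+///
+/// A sketch of the intended behaviour (hypothetical titles):
+///
+/// ```ignore
+/// // Tree: Root -> "Division" -> "Research Labs"; "labs" is not a direct
+/// // child of root, so the deep search resolves the grandchild instead.
+/// let hit = resolve_target_extended("labs", &nav_index, tree.root(), &tree);
+/// assert!(hit.is_some());
+/// ```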
+pub fn resolve_target_extended(
+    target: &str,
+    nav_index: &NavigationIndex,
+    current_node: NodeId,
+    tree: &vectorless_document::DocumentTree,
+) -> Option<NodeId> {
+    let target = strip_quotes(target);
+    // Try the primary resolver first
+    if let Some(id) = resolve_target(&target, nav_index, current_node) {
+        return Some(id);
+    }
+
+    let target_lower = target.to_lowercase();
+
+    // Extended: check all direct children by their TreeNode titles
+    let children: Vec<NodeId> = tree.children_iter(current_node).collect();
+    for child_id in &children {
+        if let Some(node) = tree.get(*child_id) {
+            if node.title.to_lowercase().contains(&target_lower) {
+                return Some(*child_id);
+            }
+        }
+    }
+
+    // Deep search: BFS through descendants up to depth 4.
+    // Returns the shallowest match so `cd "Research Labs"` from root finds it
+    // at depth 1 even if another "Research Labs" exists deeper.
+    search_descendants(&target_lower, current_node, tree, 4)
+}
+
+/// BFS search through descendants, returning the shallowest matching NodeId.
+fn search_descendants(
+    target_lower: &str,
+    start: NodeId,
+    tree: &vectorless_document::DocumentTree,
+    max_depth: usize,
+) -> Option<NodeId> {
+    // A FIFO queue gives true breadth-first order, so the shallowest match
+    // is found first (popping from the back of a Vec would degenerate to DFS).
+    let mut queue: std::collections::VecDeque<(NodeId, usize)> =
+        std::collections::VecDeque::from([(start, 0)]);
+
+    while let Some((node_id, depth)) = queue.pop_front() {
+        if depth >= max_depth {
+            continue;
+        }
+        for child_id in tree.children_iter(node_id) {
+            if let Some(node) = tree.get(child_id) {
+                if node.title.to_lowercase().contains(target_lower) {
+                    return Some(child_id);
+                }
+            }
+            queue.push_back((child_id, depth + 1));
+        }
+    }
+
+    None
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_ls() {
+        assert_eq!(parse_command("ls"), Command::Ls);
+        assert_eq!(parse_command(" ls "), Command::Ls);
+    }
+
+    #[test]
+    fn test_parse_cd() {
+        assert_eq!(parse_command("cd .."), Command::CdUp);
+        assert_eq!(
+            parse_command("cd Getting Started"),
+            Command::Cd {
+                target: "Getting Started".to_string()
+            }
+        );
+        assert_eq!(
+            parse_command("cd some long name"),
+            Command::Cd {
+                target: "some long name".to_string()
+            }
+        );
+        // Quoted multi-word targets should have quotes stripped
+        assert_eq!(
+            parse_command("cd \"Vectorless Architecture Guide\""),
+            Command::Cd {
+                target: "Vectorless Architecture Guide".to_string()
+            }
+        );
+        assert_eq!(
+            parse_command("cd 'Vectorless Architecture Guide'"),
+            Command::Cd {
+                target: "Vectorless Architecture Guide".to_string()
+            }
+        );
+        // Smart quotes
+        assert_eq!(
+            parse_command("\u{201c}Vectorless Architecture Guide\u{201d}"),
+            Command::Ls // doesn't start with a command keyword
+        );
+    }
+
+    #[test]
+    fn test_strip_quotes_straight() {
+        assert_eq!(strip_quotes("\"hello\""), "hello");
+        assert_eq!(strip_quotes("'hello'"), "hello");
+        assert_eq!(strip_quotes("hello"), "hello");
+        assert_eq!(strip_quotes("\"only left"), "\"only left");
+    }
+
+    #[test]
+    fn test_strip_quotes_smart() {
+        assert_eq!(strip_quotes("\u{201c}hello\u{201d}"), "hello");
+        assert_eq!(strip_quotes("\u{2018}hello\u{2019}"), "hello");
+    }
+
+    #[test]
+    fn test_resolve_target_quoted() {
+        use vectorless_document::{ChildRoute, DocumentTree};
+
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+        let c1 = tree.add_child(root, "Vectorless Architecture Guide", "content");
+
+        let mut nav_index = NavigationIndex::new();
+        nav_index.add_child_routes(
+            root,
+            vec![ChildRoute {
+                node_id: c1,
+                title: "Vectorless Architecture Guide".to_string(),
+                description: "Main guide".to_string(),
+                leaf_count: 5,
+            }],
+        );
+
+        // Quoted target should still resolve
+
assert_eq!( + resolve_target("\"Vectorless Architecture Guide\"", &nav_index, root), + Some(c1) + ); + assert_eq!( + resolve_target("'Vectorless Architecture Guide'", &nav_index, root), + Some(c1) + ); + } + + #[test] + fn test_parse_cat() { + assert_eq!( + parse_command("cat Installation"), + Command::Cat { + target: "Installation".to_string() + } + ); + assert_eq!( + parse_command("cat API Reference"), + Command::Cat { + target: "API Reference".to_string() + } + ); + } + + #[test] + fn test_parse_find() { + assert_eq!( + parse_command("find authentication"), + Command::Find { + keyword: "authentication".to_string() + } + ); + } + + #[test] + fn test_parse_misc() { + assert_eq!(parse_command("pwd"), Command::Pwd); + assert_eq!(parse_command("check"), Command::Check); + assert_eq!(parse_command("done"), Command::Done); + } + + #[test] + fn test_parse_fallback() { + assert_eq!(parse_command(""), Command::Ls); + assert_eq!(parse_command("unknown command"), Command::Ls); + assert_eq!(parse_command("blah blah"), Command::Ls); + } + + #[test] + fn test_parse_with_wrapping() { + assert_eq!(parse_command("`ls`"), Command::Ls); + assert_eq!(parse_command("```ls```"), Command::Ls); + } + + #[test] + fn test_parse_multiline() { + // Should parse the first non-empty line + assert_eq!(parse_command("\n\nls\n\n// listing children"), Command::Ls); + } + + #[test] + fn test_resolve_target_numeric() { + use vectorless_document::{ChildRoute, DocumentTree}; + + let mut tree = DocumentTree::new("Root", ""); + let root = tree.root(); + let c1 = tree.add_child(root, "Getting Started", "content"); + let c2 = tree.add_child(root, "API Reference", "content"); + + let mut nav_index = NavigationIndex::new(); + nav_index.add_child_routes( + root, + vec![ + ChildRoute { + node_id: c1, + title: "Getting Started".to_string(), + description: "Setup guide".to_string(), + leaf_count: 3, + }, + ChildRoute { + node_id: c2, + title: "API Reference".to_string(), + description: "API docs".to_string(), + leaf_count: 7, + }, + ], + ); + + assert_eq!(resolve_target("1", &nav_index, root), Some(c1)); + assert_eq!(resolve_target("2", &nav_index, root), Some(c2)); + assert_eq!(resolve_target("3", &nav_index, root), None); + } + + #[test] + fn test_resolve_target_exact() { + use vectorless_document::{ChildRoute, DocumentTree}; + + let mut tree = DocumentTree::new("Root", ""); + let root = tree.root(); + let c1 = tree.add_child(root, "Getting Started", "content"); + + let mut nav_index = NavigationIndex::new(); + nav_index.add_child_routes( + root, + vec![ChildRoute { + node_id: c1, + title: "Getting Started".to_string(), + description: "Setup".to_string(), + leaf_count: 3, + }], + ); + + assert_eq!( + resolve_target("Getting Started", &nav_index, root), + Some(c1) + ); + } + + #[test] + fn test_resolve_target_case_insensitive() { + use vectorless_document::{ChildRoute, DocumentTree}; + + let mut tree = DocumentTree::new("Root", ""); + let root = tree.root(); + let c1 = tree.add_child(root, "Getting Started", "content"); + + let mut nav_index = NavigationIndex::new(); + nav_index.add_child_routes( + root, + vec![ChildRoute { + node_id: c1, + title: "Getting Started".to_string(), + description: "Setup".to_string(), + leaf_count: 3, + }], + ); + + assert_eq!( + resolve_target("getting started", &nav_index, root), + Some(c1) + ); + assert_eq!( + resolve_target("GETTING STARTED", &nav_index, root), + Some(c1) + ); + } + + #[test] + fn test_resolve_target_contains() { + use vectorless_document::{ChildRoute, DocumentTree}; + + let mut tree 
= DocumentTree::new("Root", ""); + let root = tree.root(); + let c1 = tree.add_child(root, "API Reference", "content"); + + let mut nav_index = NavigationIndex::new(); + nav_index.add_child_routes( + root, + vec![ChildRoute { + node_id: c1, + title: "API Reference".to_string(), + description: "API docs".to_string(), + leaf_count: 7, + }], + ); + + assert_eq!(resolve_target("api", &nav_index, root), Some(c1)); + assert_eq!(resolve_target("reference", &nav_index, root), Some(c1)); + } + + #[test] + fn test_resolve_target_no_routes() { + let nav_index = NavigationIndex::new(); + let tree = vectorless_document::DocumentTree::new("Root", ""); + assert!(resolve_target("anything", &nav_index, tree.root()).is_none()); + } + + #[test] + fn test_resolve_target_extended_deep_search() { + use vectorless_document::{ChildRoute, DocumentTree}; + + // root → "Wrapper" → "Research Labs" → "Lab B" + let mut tree = DocumentTree::new("Root", "root content"); + let root = tree.root(); + let wrapper = tree.add_child(root, "Quantum Computing Division", "wrapper"); + let labs = tree.add_child(wrapper, "Research Labs", "labs content"); + let lab_b = tree.add_child(labs, "Lab B", "lab b content"); + + let mut nav = NavigationIndex::new(); + nav.add_child_routes( + root, + vec![ChildRoute { + node_id: wrapper, + title: "Quantum Computing Division".to_string(), + description: "Division".to_string(), + leaf_count: 7, + }], + ); + nav.add_child_routes( + wrapper, + vec![ChildRoute { + node_id: labs, + title: "Research Labs".to_string(), + description: "Labs".to_string(), + leaf_count: 4, + }], + ); + nav.add_child_routes( + labs, + vec![ChildRoute { + node_id: lab_b, + title: "Lab B".to_string(), + description: "Topological".to_string(), + leaf_count: 1, + }], + ); + + // "Research Labs" is a grandchild of root — deep search should find it + assert_eq!( + resolve_target_extended("Research Labs", &nav, root, &tree), + Some(labs) + ); + + // "Lab B" is a great-grandchild — deep search should find it + assert_eq!( + resolve_target_extended("Lab B", &nav, root, &tree), + Some(lab_b) + ); + + // Direct children should still work via primary resolver + assert_eq!( + resolve_target_extended("Quantum Computing Division", &nav, root, &tree), + Some(wrapper) + ); + } + + #[test] + fn test_parse_grep() { + assert_eq!( + parse_command("grep EBITDA"), + Command::Grep { + pattern: "EBITDA".to_string() + } + ); + assert_eq!( + parse_command("grep revenue.*2024"), + Command::Grep { + pattern: "revenue.*2024".to_string() + } + ); + } + + #[test] + fn test_parse_head() { + assert_eq!( + parse_command("head Installation"), + Command::Head { + target: "Installation".to_string(), + lines: 20 + } + ); + assert_eq!( + parse_command("head -n 5 API Reference"), + Command::Head { + target: "API Reference".to_string(), + lines: 5 + } + ); + } + + #[test] + fn test_parse_findtree() { + assert_eq!( + parse_command("findtree revenue"), + Command::FindTree { + pattern: "revenue".to_string() + } + ); + assert_eq!( + parse_command("findtree API Reference"), + Command::FindTree { + pattern: "API Reference".to_string() + } + ); + } + + #[test] + fn test_parse_wc() { + assert_eq!( + parse_command("wc Installation"), + Command::Wc { + target: "Installation".to_string() + } + ); + assert_eq!( + parse_command("wc API Reference"), + Command::Wc { + target: "API Reference".to_string() + } + ); + } +} diff --git a/vectorless-core/vectorless-agent/src/config.rs b/vectorless-core/vectorless-agent/src/config.rs new file mode 100644 index 00000000..1628d5a8 --- 
/dev/null
+++ b/vectorless-core/vectorless-agent/src/config.rs
@@ -0,0 +1,248 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Configuration and output types for the retrieval agent.
+
+use serde::{Deserialize, Serialize};
+
+// ---------------------------------------------------------------------------
+// Worker configuration
+// ---------------------------------------------------------------------------
+
+/// Worker configuration — navigation budget settings.
+#[derive(Debug, Clone)]
+pub struct WorkerConfig {
+    /// Maximum navigation rounds per Worker loop (ls/cd/cat/grep/head/find etc.).
+    /// `check` does NOT count against this budget.
+    pub max_rounds: u32,
+    /// Hard cap on total LLM calls per Worker (planning + nav + check).
+    /// Prevents runaway costs regardless of max_rounds. 0 = no limit.
+    pub max_llm_calls: u32,
+}
+
+impl Default for WorkerConfig {
+    fn default() -> Self {
+        Self {
+            max_rounds: 100,
+            max_llm_calls: 200,
+        }
+    }
+}
+
+impl WorkerConfig {
+    pub fn new() -> Self {
+        Self::default()
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Answer pipeline configuration
+// ---------------------------------------------------------------------------
+
+/// Answer pipeline configuration — synthesis settings.
+#[derive(Debug, Clone)]
+pub struct AnswerConfig {
+    /// Maximum number of evidence items to feed into synthesis.
+    pub evidence_cap: usize,
+}
+
+impl Default for AnswerConfig {
+    fn default() -> Self {
+        Self { evidence_cap: 20 }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Aggregated agent configuration
+// ---------------------------------------------------------------------------
+
+/// Aggregated configuration for the entire retrieval agent system.
+#[derive(Debug, Clone, Default)]
+pub struct AgentConfig {
+    pub worker: WorkerConfig,
+    pub answer: AnswerConfig,
+}
+
+impl AgentConfig {
+    pub fn new() -> Self {
+        Self::default()
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Output types
+// ---------------------------------------------------------------------------
+
+/// Agent output — the final result of a retrieval operation.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Output {
+    /// Final synthesized answer.
+    pub answer: String,
+    /// Collected evidence from navigation.
+    pub evidence: Vec<Evidence>,
+    /// Agent execution metrics.
+    pub metrics: Metrics,
+    /// Confidence score (0.0–1.0) — derived from LLM evaluate() result.
+    pub confidence: f32,
+    /// Reasoning trace steps collected during agent navigation.
+    pub trace_steps: Vec<TraceStep>,
+}
+
+impl Output {
+    /// Create an empty output (no evidence found).
+    pub fn empty() -> Self {
+        Self {
+            answer: String::new(),
+            evidence: Vec::new(),
+            metrics: Metrics::default(),
+            confidence: 0.0,
+            trace_steps: Vec::new(),
+        }
+    }
+}
+
+/// A single piece of evidence collected during navigation.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Evidence {
+    /// Navigation path where this evidence was found (e.g., "Root/API Reference/Auth").
+    pub source_path: String,
+    /// Title of the node.
+    pub node_title: String,
+    /// Content of the node.
+    pub content: String,
+    /// Source document name (set by Orchestrator in multi-doc scenarios).
+    pub doc_name: Option<String>,
+}
+
+/// Agent execution metrics.
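+///
+/// Defaults to all-zero counters; a minimal reading sketch:
+///
+/// ```ignore
+/// let out = Output::empty();
+/// assert_eq!(out.metrics.llm_calls, 0);
+/// assert!(!out.metrics.budget_exhausted);
+/// ```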
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct Metrics {
+    pub rounds_used: u32,
+    pub llm_calls: u32,
+    pub nodes_visited: usize,
+    pub budget_exhausted: bool,
+    pub plan_generated: bool,
+    pub check_count: u32,
+    pub evidence_chars: usize,
+}
+
+/// Step result from the navigation loop.
+#[derive(Debug, Clone, PartialEq)]
+pub enum Step {
+    /// Continue to the next round.
+    Continue,
+    /// Navigation is done, proceed to synthesis.
+    Done,
+    /// Forced done due to budget exhaustion or error.
+    ForceDone(String),
+}
+
+// ---------------------------------------------------------------------------
+// Worker output (evidence only, no answer)
+// ---------------------------------------------------------------------------
+
+/// Output from a single Worker — pure evidence, no answer synthesis.
+/// Rerank handles all answer generation.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WorkerOutput {
+    /// Collected evidence from document navigation.
+    pub evidence: Vec<Evidence>,
+    /// Worker execution metrics.
+    pub metrics: WorkerMetrics,
+    /// Document name this Worker was assigned to.
+    pub doc_name: String,
+    /// Reasoning trace steps from this Worker.
+    pub trace_steps: Vec<TraceStep>,
+}
+
+/// Metrics specific to a single Worker's execution.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct WorkerMetrics {
+    /// Number of navigation rounds used.
+    pub rounds_used: u32,
+    /// Number of LLM calls made.
+    pub llm_calls: u32,
+    /// Number of distinct nodes visited.
+    pub nodes_visited: usize,
+    /// Whether the LLM call budget was exhausted.
+    pub budget_exhausted: bool,
+    /// Whether a navigation plan was generated.
+    pub plan_generated: bool,
+    /// Number of times `check` was called.
+    pub check_count: u32,
+    /// Total characters of collected evidence.
+    pub evidence_chars: usize,
+}
+
+impl From<WorkerOutput> for Output {
+    fn from(wo: WorkerOutput) -> Self {
+        Output {
+            answer: String::new(),
+            evidence: wo.evidence,
+            metrics: Metrics {
+                rounds_used: wo.metrics.rounds_used,
+                llm_calls: wo.metrics.llm_calls,
+                nodes_visited: wo.metrics.nodes_visited,
+                budget_exhausted: wo.metrics.budget_exhausted,
+                plan_generated: wo.metrics.plan_generated,
+                check_count: wo.metrics.check_count,
+                evidence_chars: wo.metrics.evidence_chars,
+            },
+            confidence: 0.0,
+            trace_steps: wo.trace_steps,
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Scope types
+// ---------------------------------------------------------------------------
+
+/// Scope context — determines which path the dispatcher takes.
+///
+/// Both variants go through the Orchestrator. The difference is:
+/// - `Specified`: user chose specific documents → skip Orchestrator analysis phase
+/// - `Workspace`: user didn't specify → Orchestrator analyzes DocCards to select docs
+pub enum Scope<'a> {
+    /// User specified one or more documents (by doc_id).
+    /// Orchestrator skips analysis, spawns Workers directly.
+    Specified(Vec<DocContext<'a>>),
+    /// Workspace scope — user didn't specify documents.
+    /// Orchestrator analyzes DocCards and selects relevant ones.
+    Workspace(WorkspaceContext<'a>),
+}
+
+/// Read-only access to a single document's compile artifacts.
+pub struct DocContext<'a> {
+    /// Document content tree.
+    pub tree: &'a vectorless_document::DocumentTree,
+    /// Navigation index (includes DocCard).
+    pub nav_index: &'a vectorless_document::NavigationIndex,
+    /// Reasoning index (keyword/topic lookup).
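+    /// Backs the worker's `find` command (see `DocContext::find` in context.rs).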
+    pub reasoning_index: &'a vectorless_document::ReasoningIndex,
+    /// Document name (for evidence source attribution).
+    pub doc_name: &'a str,
+}
+
+/// Read-only access to multiple documents' compile artifacts.
+pub struct WorkspaceContext<'a> {
+    /// All available documents.
+    pub docs: Vec<DocContext<'a>>,
+}
+
+impl<'a> WorkspaceContext<'a> {
+    /// Create a workspace from a slice of DocContexts.
+    pub fn new(docs: Vec<DocContext<'a>>) -> Self {
+        Self { docs }
+    }
+
+    /// Number of documents in the workspace.
+    pub fn doc_count(&self) -> usize {
+        self.docs.len()
+    }
+
+    /// Whether the workspace has only one document.
+    pub fn is_single(&self) -> bool {
+        self.docs.len() == 1
+    }
+}
diff --git a/vectorless-core/vectorless-agent/src/context.rs b/vectorless-core/vectorless-agent/src/context.rs
new file mode 100644
index 00000000..f984bd51
--- /dev/null
+++ b/vectorless-core/vectorless-agent/src/context.rs
@@ -0,0 +1,120 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Read-only data access wrappers over compile artifacts.
+//!
+//! These types provide the agent with structured access to the document's
+//! navigation index, content tree, and reasoning index — all read-only.
+
+use vectorless_document::{ChildRoute, NodeId, TopicEntry};
+
+// Re-export from config for convenience
+pub use super::config::{DocContext, WorkspaceContext};
+
+/// A single hit from a keyword search.
+#[derive(Debug, Clone)]
+pub struct FindHit {
+    /// The matched keyword.
+    pub keyword: String,
+    /// Topic entries matching the keyword.
+    pub entries: Vec<TopicEntry>,
+}
+
+impl<'a> DocContext<'a> {
+    /// List child routes for a given node.
+    pub fn ls(&self, node: NodeId) -> Option<&[ChildRoute]> {
+        self.nav_index.get_child_routes(node)
+    }
+
+    /// Read the full content of a node.
+    pub fn cat(&self, node: NodeId) -> Option<&str> {
+        self.tree.get(node).map(|n| n.content.as_str())
+    }
+
+    /// Get the title of a node.
+    pub fn node_title(&self, node: NodeId) -> Option<&str> {
+        self.tree.get(node).map(|n| n.title.as_str())
+    }
+
+    /// Search for a keyword in the reasoning index.
+    pub fn find(&self, keyword: &str) -> Option<FindHit> {
+        self.reasoning_index
+            .topic_entries(keyword)
+            .map(|entries| FindHit {
+                keyword: keyword.to_string(),
+                entries: entries.to_vec(),
+            })
+    }
+
+    /// Search for multiple keywords, collecting all hits.
+    pub fn find_all(&self, keywords: &[String]) -> Vec<FindHit> {
+        keywords.iter().filter_map(|kw| self.find(kw)).collect()
+    }
+
+    /// Get the root node ID.
+    pub fn root(&self) -> NodeId {
+        self.tree.root()
+    }
+
+    /// Get the document's DocCard, if available.
+    pub fn doc_card(&self) -> Option<&vectorless_document::DocCard> {
+        self.nav_index.doc_card()
+    }
+
+    /// Get the navigation entry for a node (overview, hints, tags).
+    pub fn nav_entry(&self, node: NodeId) -> Option<&vectorless_document::NavEntry> {
+        self.nav_index.get_entry(node)
+    }
+
+    /// Get the summary shortcut (pre-computed overview), if available.
+    pub fn summary_shortcut(&self) -> Option<&vectorless_document::SummaryShortcut> {
+        self.reasoning_index.summary_shortcut()
+    }
+
+    /// Find a top-level section by its title, returning its NodeId.
+    pub fn find_section(&self, title: &str) -> Option<NodeId> {
+        self.reasoning_index.find_section(title)
+    }
+
+    /// Get the parent of a node (by searching the tree).
+    pub fn parent(&self, node: NodeId) -> Option<NodeId> {
+        self.tree.parent(node)
+    }
+}
+
+impl<'a> WorkspaceContext<'a> {
+    /// Search for a keyword across all documents.
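+    ///
+    /// Returns `(doc_idx, hit)` pairs, where `doc_idx` indexes into `self.docs`.
+    /// A usage sketch (hypothetical keyword):
+    ///
+    /// ```ignore
+    /// for (idx, hit) in ws.find_cross("revenue") {
+    ///     println!("doc {idx}: {} topic entries", hit.entries.len());
+    /// }
+    /// ```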
+    pub fn find_cross(&self, keyword: &str) -> Vec<(usize, FindHit)> {
+        self.docs
+            .iter()
+            .enumerate()
+            .filter_map(|(idx, doc)| doc.find(keyword).map(|hit| (idx, hit)))
+            .collect()
+    }
+
+    /// Search multiple keywords across all documents.
+    pub fn find_cross_all(&self, keywords: &[String]) -> Vec<(usize, Vec<FindHit>)> {
+        let mut results: Vec<(usize, Vec<FindHit>)> = Vec::new();
+        for (idx, doc) in self.docs.iter().enumerate() {
+            let hits = doc.find_all(keywords);
+            if !hits.is_empty() {
+                results.push((idx, hits));
+            }
+        }
+        results
+    }
+
+    /// Get all DocCards for documents that have them.
+    pub fn doc_cards(&self) -> Vec<(usize, &vectorless_document::DocCard)> {
+        self.docs
+            .iter()
+            .enumerate()
+            .filter_map(|(idx, doc)| doc.doc_card().map(|card| (idx, card)))
+            .collect()
+    }
+
+    /// Get a specific document context by index.
+    pub fn doc(&self, idx: usize) -> Option<&DocContext<'a>> {
+        self.docs.get(idx)
+    }
+}
diff --git a/vectorless-core/vectorless-agent/src/events.rs b/vectorless-core/vectorless-agent/src/events.rs
new file mode 100644
index 00000000..e4575c93
--- /dev/null
+++ b/vectorless-core/vectorless-agent/src/events.rs
@@ -0,0 +1,537 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Agent events — rich, structured visibility into the entire retrieval pipeline.
+//!
+//! Events are organized by pipeline stage:
+//! 1. **Query Understanding** — intent analysis, keyword extraction
+//! 2. **Orchestrator** — document selection, dispatch, evaluation, replan
+//! 3. **Worker** — navigation, evidence collection, budget management
+//! 4. **Answer** — synthesis and fusion
+//!
+//! The stream terminates with `Completed` or `Error`.
+
+use serde::Serialize;
+
+/// An event emitted during agent-based retrieval.
+///
+/// Each variant carries the data a client needs to understand what happened,
+/// not just that something happened. All events are `Clone + Serialize` so
+/// they can be broadcast or persisted.
+#[derive(Debug, Clone, Serialize)]
+pub enum AgentEvent {
+    // ── Query Understanding ──────────────────────────────────────────
+    /// Query understanding started.
+    QueryUnderstandingStarted { query: String },
+
+    /// Query understanding completed (intent, keywords, strategy decided).
+    QueryUnderstandingCompleted {
+        query: String,
+        intent: String,
+        keywords: Vec<String>,
+        strategy_hint: String,
+        complexity: String,
+    },
+
+    // ── Orchestrator ─────────────────────────────────────────────────
+    /// Orchestrator started.
+    OrchestratorStarted {
+        query: String,
+        doc_count: usize,
+        skip_analysis: bool,
+    },
+
+    /// Orchestrator is analyzing documents to select which to dispatch.
+    OrchestratorAnalyzing {
+        doc_count: usize,
+        keywords: Vec<String>,
+    },
+
+    /// A Worker was dispatched to a document.
+    WorkerDispatched {
+        doc_idx: usize,
+        doc_name: String,
+        task: String,
+        focus_keywords: Vec<String>,
+    },
+
+    /// A Worker finished its task.
+    WorkerCompleted {
+        doc_idx: usize,
+        doc_name: String,
+        evidence_count: usize,
+        rounds_used: u32,
+        llm_calls: u32,
+        success: bool,
+    },
+
+    /// Cross-doc sufficiency evaluation result.
+    OrchestratorEvaluated {
+        sufficient: bool,
+        evidence_count: usize,
+        missing_info: Option<String>,
+    },
+
+    /// Orchestrator is replanning after insufficient evidence.
+    OrchestratorReplanning {
+        reason: String,
+        evidence_count: usize,
+    },
+
+    /// Orchestrator completed.
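+    /// Carries totals for the whole dispatch phase (all rounds, all Workers).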
+    OrchestratorCompleted {
+        evidence_count: usize,
+        total_llm_calls: u32,
+        dispatch_rounds: u32,
+    },
+
+    // ── Worker (per-document navigation) ─────────────────────────────
+    /// Worker started on a document.
+    WorkerStarted {
+        doc_name: String,
+        task: Option<String>,
+        max_rounds: u32,
+    },
+
+    /// Worker generated a navigation plan.
+    WorkerPlanGenerated { doc_name: String, plan_len: usize },
+
+    /// A navigation round completed.
+    WorkerRound {
+        doc_name: String,
+        round: u32,
+        command: String,
+        success: bool,
+        elapsed_ms: u64,
+    },
+
+    /// Evidence was collected from a node.
+    EvidenceCollected {
+        doc_name: String,
+        node_title: String,
+        source_path: String,
+        content_len: usize,
+        total_evidence: usize,
+    },
+
+    /// Worker sufficiency check result.
+    WorkerSufficiencyCheck {
+        doc_name: String,
+        sufficient: bool,
+        evidence_count: usize,
+        missing_info: Option<String>,
+    },
+
+    /// Worker re-planned after insufficient check.
+    WorkerReplan {
+        doc_name: String,
+        missing_info: String,
+        plan_len: usize,
+    },
+
+    /// Worker budget warning (stuck or half-budget).
+    WorkerBudgetWarning {
+        doc_name: String,
+        warning_type: String,
+        round: u32,
+    },
+
+    /// Worker completed.
+    WorkerDone {
+        doc_name: String,
+        evidence_count: usize,
+        rounds_used: u32,
+        llm_calls: u32,
+        budget_exhausted: bool,
+        plan_generated: bool,
+    },
+
+    // ── Answer Pipeline ──────────────────────────────────────────────
+    /// Answer synthesis started.
+    AnswerStarted {
+        evidence_count: usize,
+        multi_doc: bool,
+    },
+
+    /// Answer synthesis completed.
+    AnswerCompleted {
+        answer_len: usize,
+        confidence: String,
+    },
+
+    // ── Terminal ─────────────────────────────────────────────────────
+    /// Entire retrieval pipeline completed.
+    Completed {
+        evidence_count: usize,
+        llm_calls: u32,
+        answer_len: usize,
+    },
+
+    /// An error occurred.
+    Error { stage: String, message: String },
+}
+
+// ---------------------------------------------------------------------------
+// Channel + EventEmitter
+// ---------------------------------------------------------------------------
+
+/// Sender for agent events.
+pub(crate) type AgentEventSender = tokio::sync::mpsc::Sender<AgentEvent>;
+
+/// Receiver for agent events.
+pub type AgentEventReceiver = tokio::sync::mpsc::Receiver<AgentEvent>;
+
+/// Create a bounded channel for agent events.
+pub(crate) fn channel(bound: usize) -> (AgentEventSender, AgentEventReceiver) {
+    tokio::sync::mpsc::channel(bound)
+}
+
+/// Default channel bound for agent events.
+pub const DEFAULT_AGENT_EVENT_BOUND: usize = 256;
+
+/// A handle for emitting agent events.
+///
+/// Wraps an `mpsc::Sender` and silently drops events if the receiver
+/// is closed (no panic on send failure). Cheaply clonable.
+#[derive(Clone)]
+pub struct EventEmitter {
+    tx: Option<AgentEventSender>,
+}
+
+impl EventEmitter {
+    /// Create a new emitter with the given sender.
+    pub fn new(tx: AgentEventSender) -> Self {
+        Self { tx: Some(tx) }
+    }
+
+    /// Create a noop emitter that discards all events.
+    pub fn noop() -> Self {
+        Self { tx: None }
+    }
+
+    /// Emit an event. Silently drops if the receiver is closed.
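+    ///
+    /// A minimal sketch, mirroring the round-trip test below:
+    ///
+    /// ```ignore
+    /// let (tx, mut rx) = channel(DEFAULT_AGENT_EVENT_BOUND);
+    /// let emitter = EventEmitter::new(tx);
+    /// emitter.emit_completed(1, 6, 42);
+    /// assert!(rx.try_recv().is_ok());
+    /// ```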
+ pub fn emit(&self, event: AgentEvent) { + if let Some(ref tx) = self.tx { + let _ = tx.try_send(event); + } + } + + // ── Query Understanding ── + + pub fn emit_query_understanding_started(&self, query: &str) { + self.emit(AgentEvent::QueryUnderstandingStarted { + query: query.to_string(), + }); + } + + pub fn emit_query_understanding_completed( + &self, + query: &str, + intent: &str, + keywords: &[String], + strategy_hint: &str, + complexity: &str, + ) { + self.emit(AgentEvent::QueryUnderstandingCompleted { + query: query.to_string(), + intent: intent.to_string(), + keywords: keywords.to_vec(), + strategy_hint: strategy_hint.to_string(), + complexity: complexity.to_string(), + }); + } + + // ── Orchestrator ── + + pub fn emit_orchestrator_started(&self, query: &str, doc_count: usize, skip_analysis: bool) { + self.emit(AgentEvent::OrchestratorStarted { + query: query.to_string(), + doc_count, + skip_analysis, + }); + } + + pub fn emit_orchestrator_analyzing(&self, doc_count: usize, keywords: &[String]) { + self.emit(AgentEvent::OrchestratorAnalyzing { + doc_count, + keywords: keywords.to_vec(), + }); + } + + pub fn emit_worker_dispatched( + &self, + doc_idx: usize, + doc_name: &str, + task: &str, + focus_keywords: &[String], + ) { + self.emit(AgentEvent::WorkerDispatched { + doc_idx, + doc_name: doc_name.to_string(), + task: task.to_string(), + focus_keywords: focus_keywords.to_vec(), + }); + } + + pub fn emit_worker_completed( + &self, + doc_idx: usize, + doc_name: &str, + evidence_count: usize, + rounds_used: u32, + llm_calls: u32, + success: bool, + ) { + self.emit(AgentEvent::WorkerCompleted { + doc_idx, + doc_name: doc_name.to_string(), + evidence_count, + rounds_used, + llm_calls, + success, + }); + } + + pub fn emit_orchestrator_evaluated( + &self, + sufficient: bool, + evidence_count: usize, + missing_info: Option<&str>, + ) { + self.emit(AgentEvent::OrchestratorEvaluated { + sufficient, + evidence_count, + missing_info: missing_info.map(|s| s.to_string()), + }); + } + + pub fn emit_orchestrator_replanning(&self, reason: &str, evidence_count: usize) { + self.emit(AgentEvent::OrchestratorReplanning { + reason: reason.to_string(), + evidence_count, + }); + } + + pub fn emit_orchestrator_completed( + &self, + evidence_count: usize, + total_llm_calls: u32, + dispatch_rounds: u32, + ) { + self.emit(AgentEvent::OrchestratorCompleted { + evidence_count, + total_llm_calls, + dispatch_rounds, + }); + } + + // ── Worker ── + + pub fn emit_worker_started(&self, doc_name: &str, task: Option<&str>, max_rounds: u32) { + self.emit(AgentEvent::WorkerStarted { + doc_name: doc_name.to_string(), + task: task.map(|s| s.to_string()), + max_rounds, + }); + } + + pub fn emit_worker_plan_generated(&self, doc_name: &str, plan_len: usize) { + self.emit(AgentEvent::WorkerPlanGenerated { + doc_name: doc_name.to_string(), + plan_len, + }); + } + + pub fn emit_worker_round( + &self, + doc_name: &str, + round: u32, + command: &str, + success: bool, + elapsed_ms: u64, + ) { + self.emit(AgentEvent::WorkerRound { + doc_name: doc_name.to_string(), + round, + command: command.to_string(), + success, + elapsed_ms, + }); + } + + pub fn emit_evidence( + &self, + doc_name: &str, + node_title: &str, + source_path: &str, + content_len: usize, + total: usize, + ) { + self.emit(AgentEvent::EvidenceCollected { + doc_name: doc_name.to_string(), + node_title: node_title.to_string(), + source_path: source_path.to_string(), + content_len, + total_evidence: total, + }); + } + + pub fn emit_worker_sufficiency_check( + &self, + 
doc_name: &str,
+        sufficient: bool,
+        evidence_count: usize,
+        missing_info: Option<&str>,
+    ) {
+        self.emit(AgentEvent::WorkerSufficiencyCheck {
+            doc_name: doc_name.to_string(),
+            sufficient,
+            evidence_count,
+            missing_info: missing_info.map(|s| s.to_string()),
+        });
+    }
+
+    pub fn emit_worker_replan(&self, doc_name: &str, missing_info: &str, plan_len: usize) {
+        self.emit(AgentEvent::WorkerReplan {
+            doc_name: doc_name.to_string(),
+            missing_info: missing_info.to_string(),
+            plan_len,
+        });
+    }
+
+    pub fn emit_worker_budget_warning(&self, doc_name: &str, warning_type: &str, round: u32) {
+        self.emit(AgentEvent::WorkerBudgetWarning {
+            doc_name: doc_name.to_string(),
+            warning_type: warning_type.to_string(),
+            round,
+        });
+    }
+
+    pub fn emit_worker_done(
+        &self,
+        doc_name: &str,
+        evidence_count: usize,
+        rounds_used: u32,
+        llm_calls: u32,
+        budget_exhausted: bool,
+        plan_generated: bool,
+    ) {
+        self.emit(AgentEvent::WorkerDone {
+            doc_name: doc_name.to_string(),
+            evidence_count,
+            rounds_used,
+            llm_calls,
+            budget_exhausted,
+            plan_generated,
+        });
+    }
+
+    // ── Answer ──
+
+    pub fn emit_answer_started(&self, evidence_count: usize, multi_doc: bool) {
+        self.emit(AgentEvent::AnswerStarted {
+            evidence_count,
+            multi_doc,
+        });
+    }
+
+    pub fn emit_answer_completed(&self, answer_len: usize, confidence: &str) {
+        self.emit(AgentEvent::AnswerCompleted {
+            answer_len,
+            confidence: confidence.to_string(),
+        });
+    }
+
+    // ── Terminal ──
+
+    pub fn emit_completed(&self, evidence_count: usize, llm_calls: u32, answer_len: usize) {
+        self.emit(AgentEvent::Completed {
+            evidence_count,
+            llm_calls,
+            answer_len,
+        });
+    }
+
+    pub fn emit_error(&self, stage: &str, message: &str) {
+        self.emit(AgentEvent::Error {
+            stage: stage.to_string(),
+            message: message.to_string(),
+        });
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_noop_emitter() {
+        let emitter = EventEmitter::noop();
+        emitter.emit_orchestrator_started("test", 1, false);
+        emitter.emit_worker_started("doc.md", None, 8);
+        emitter.emit_worker_round("doc.md", 1, "ls", true, 50);
+        emitter.emit_worker_done("doc.md", 0, 1, 1, false, false);
+        emitter.emit_completed(0, 1, 0);
+        // No panic — events silently dropped
+    }
+
+    #[test]
+    fn test_event_roundtrip() {
+        let (tx, mut rx) = channel(DEFAULT_AGENT_EVENT_BOUND);
+        let emitter = EventEmitter::new(tx);
+
+        emitter.emit_orchestrator_started("what is X?", 1, true);
+        emitter.emit_worker_started("doc.md", None, 8);
+        emitter.emit_evidence("doc.md", "Intro", "root/Intro", 100, 1);
+        emitter.emit_worker_sufficiency_check("doc.md", true, 1, None);
+        emitter.emit_worker_done("doc.md", 1, 3, 5, false, true);
+        emitter.emit_completed(1, 6, 42);
+
+        let events: Vec<AgentEvent> = (0..6).map(|_| rx.blocking_recv().unwrap()).collect();
+
+        assert!(
+            matches!(&events[0], AgentEvent::OrchestratorStarted { query, .. } if query == "what is X?")
+        );
+        assert!(
+            matches!(&events[1], AgentEvent::WorkerStarted { doc_name, .. } if doc_name == "doc.md")
+        );
+        assert!(
+            matches!(&events[2], AgentEvent::EvidenceCollected { node_title, .. } if node_title == "Intro")
+        );
+        assert!(matches!(
+            &events[3],
+            AgentEvent::WorkerSufficiencyCheck {
+                sufficient: true,
+                ..
+            }
+        ));
+        assert!(matches!(
+            &events[4],
+            AgentEvent::WorkerDone {
+                evidence_count: 1,
+                plan_generated: true,
+                ..
+            }
+        ));
+        assert!(matches!(
+            &events[5],
+            AgentEvent::Completed {
+                evidence_count: 1,
+                answer_len: 42,
+                ..
+ } + )); + } + + #[test] + fn test_serialization() { + let event = AgentEvent::OrchestratorStarted { + query: "test".to_string(), + doc_count: 3, + skip_analysis: false, + }; + let json = serde_json::to_string(&event).unwrap(); + assert!(json.contains("OrchestratorStarted")); + assert!(json.contains("test")); + } +} diff --git a/vectorless-core/vectorless-agent/src/lib.rs b/vectorless-core/vectorless-agent/src/lib.rs new file mode 100644 index 00000000..a566ab6b --- /dev/null +++ b/vectorless-core/vectorless-agent/src/lib.rs @@ -0,0 +1,55 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Retrieval agent — struct-based document intelligence. +//! +//! # Architecture +//! +//! The retrieval dispatcher always goes through the Orchestrator. +//! Based on [`Scope`]: +//! +//! - **User specified doc_ids** → Orchestrator skips analysis, spawns Workers directly. +//! - **Workspace / unspecified** → Orchestrator analyzes DocCards, selects docs, spawns Workers. +//! +//! Both paths produce the same [`Output`] type and share the same synthesis logic. +//! +//! ```text +//! dispatch(query, scope) +//! └── Orchestrator (always) +//! ├── Scope::Specified(docs) → skip analysis → N × Worker → synthesis +//! └── Scope::Workspace(ws) → analysis → N × Worker → fusion → synthesis +//! ``` +//! +//! # Agent trait +//! +//! All retrieval agents implement [`Agent`] with `async fn run(self)` (Edition 2024). +//! The trait uses native async functions — no `async-trait` crate needed. + +pub mod command; +pub mod config; +pub mod context; +pub mod events; +pub mod state; +pub mod tools; + +pub mod orchestrator; +pub mod prompts; +pub mod worker; + +pub use config::{DocContext, Evidence, Output, Scope, WorkspaceContext}; +pub use events::{AgentEvent, EventEmitter}; + +/// Agent trait — async, consuming-self execution. +/// +/// Each agent struct holds its own configuration and context. +/// Calling `run(self)` consumes the agent and produces output. +/// +/// Uses Edition 2024 native `async fn` in trait — no `async-trait` crate. +pub trait Agent { + /// The output type produced by this agent. + type Output; + /// Agent name for logging and events. + fn name(&self) -> &str; + /// Execute the agent, consuming self. + async fn run(self) -> vectorless_error::Result; +} diff --git a/vectorless-core/vectorless-agent/src/orchestrator/analyze.rs b/vectorless-core/vectorless-agent/src/orchestrator/analyze.rs new file mode 100644 index 00000000..4af33e29 --- /dev/null +++ b/vectorless-core/vectorless-agent/src/orchestrator/analyze.rs @@ -0,0 +1,159 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Phase 1: Analyze documents and produce a dispatch plan. +//! +//! Uses the [`QueryPlan`] from query understanding to inform document selection. +//! LLM errors propagate — no silent degradation. + +use tracing::{debug, info}; + +use vectorless_error::Error; +use vectorless_llm::LlmClient; +use vectorless_query::QueryPlan; +use vectorless_scoring::bm25::extract_keywords; + +use super::super::config::WorkspaceContext; +use super::super::prompts::{DispatchEntry, orchestrator_analysis, parse_dispatch_plan}; +use super::super::state::OrchestratorState; +use super::super::tools::orchestrator as orch_tools; + +/// Outcome of the analyze phase. +pub enum AnalyzeOutcome { + /// Produce dispatch entries for Phase 2. + Proceed { + dispatches: Vec, + llm_calls: u32, + }, + /// Cross-doc search already answered the query. 
+    AlreadyAnswered { llm_calls: u32 },
+    /// No relevant documents found.
+    NoResults { llm_calls: u32 },
+}
+
+/// Analyze documents and produce a dispatch plan.
+///
+/// Uses the [`QueryPlan`] for intent-aware analysis:
+/// - Intent and key concepts inform the LLM about what to look for
+/// - Complexity hints at how many documents may be needed
+/// - Strategy hint guides the analysis approach
+///
+/// LLM failures propagate as [`Error::LlmReasoning`] — no fallback.
+pub async fn analyze(
+    query: &str,
+    ws: &WorkspaceContext<'_>,
+    state: &mut OrchestratorState,
+    emitter: &crate::events::EventEmitter,
+    skip_analysis: bool,
+    query_plan: &QueryPlan,
+    llm: &LlmClient,
+) -> vectorless_error::Result<AnalyzeOutcome> {
+    if skip_analysis {
+        debug!("Phase 1: skipping (user-specified documents)");
+        let dispatches = (0..ws.doc_count())
+            .map(|idx| DispatchEntry {
+                doc_idx: idx,
+                reason: "User-specified document".to_string(),
+                task: query.to_string(),
+            })
+            .collect();
+        return Ok(AnalyzeOutcome::Proceed {
+            dispatches,
+            llm_calls: 0,
+        });
+    }
+
+    debug!(
+        intent = %query_plan.intent,
+        complexity = %query_plan.complexity,
+        strategy = query_plan.strategy_hint,
+        "Phase 1: analyzing doc cards with query understanding"
+    );
+
+    let doc_cards_text = orch_tools::ls_docs(ws).feedback;
+    let keywords = extract_keywords(query);
+    let find_text = if keywords.is_empty() {
+        "(no keywords extracted)".to_string()
+    } else {
+        orch_tools::find_cross(&keywords, ws).feedback
+    };
+
+    info!(keywords = ?keywords, "Phase 1: analyzing");
+    debug!(
+        doc_cards_len = doc_cards_text.len(),
+        find_results_len = find_text.len(),
+        "Phase 1: analysis input"
+    );
+
+    // Build analysis prompt enriched with query understanding
+    let concepts_text = if query_plan.key_concepts.is_empty() {
+        String::new()
+    } else {
+        format!("\nKey concepts: {}", query_plan.key_concepts.join(", "))
+    };
+
+    let strategy_text = if query_plan.strategy_hint.is_empty() {
+        String::new()
+    } else {
+        format!("\nRetrieval strategy: {}", query_plan.strategy_hint)
+    };
+
+    let rewritten_text = if query_plan.rewritten.is_empty() {
+        String::new()
+    } else {
+        format!(
+            "\nRewritten queries for matching: {}",
+            query_plan.rewritten.join("; ")
+        )
+    };
+
+    let intent_context = format!(
+        "\nQuery intent: {} (complexity: {}){concepts_text}{strategy_text}{rewritten_text}",
+        query_plan.intent, query_plan.complexity,
+    );
+
+    let (system, user) =
+        orchestrator_analysis(&super::super::prompts::OrchestratorAnalysisParams {
+            query,
+            doc_cards: &doc_cards_text,
+            find_results: &find_text,
+            intent_context: &intent_context,
+        });
+
+    let analysis_output = llm.complete(&system, &user).await.map_err(|e| {
+        emitter.emit_error("orchestrator/analysis", &e.to_string());
+        Error::LlmReasoning {
+            stage: "orchestrator/analysis".to_string(),
+            detail: format!("LLM call failed: {e}"),
+        }
+    })?;
+
+    info!(
+        response_len = analysis_output.len(),
+        response = %if analysis_output.len() > 500 { &analysis_output[..500] } else { &analysis_output },
+        "Phase 1: analysis LLM response"
+    );
+
+    let dispatches = match parse_dispatch_plan(&analysis_output, ws.doc_count()) {
+        Some(entries) => entries,
+        None => {
+            info!("Orchestrator: analysis indicates already answered");
+            return Ok(AnalyzeOutcome::AlreadyAnswered { llm_calls: 1 });
+        }
+    };
+
+    info!(
+        dispatches = dispatches.len(),
+        "Phase 1: parsed dispatch plan"
+    );
+
+    if dispatches.is_empty() {
+        return Ok(AnalyzeOutcome::NoResults { llm_calls: 1 });
+    }
+
+    state.analyze_done = true;
+    
Ok(AnalyzeOutcome::Proceed { + dispatches, + llm_calls: 1, + }) +} diff --git a/vectorless-core/vectorless-agent/src/orchestrator/dispatch.rs b/vectorless-core/vectorless-agent/src/orchestrator/dispatch.rs new file mode 100644 index 00000000..0916ec46 --- /dev/null +++ b/vectorless-core/vectorless-agent/src/orchestrator/dispatch.rs @@ -0,0 +1,92 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Phase 2: Dispatch Workers and collect results. + +use tracing::{info, warn}; + +use vectorless_llm::LlmClient; + +use super::super::Agent; +use super::super::config::{AgentConfig, WorkspaceContext}; +use super::super::events::EventEmitter; +use super::super::prompts::DispatchEntry; +use super::super::state::OrchestratorState; +use super::super::worker::Worker; +use vectorless_query::QueryPlan; + +/// Dispatch Workers in parallel and collect results. +pub async fn dispatch_and_collect( + query: &str, + dispatches: &[DispatchEntry], + ws: &WorkspaceContext<'_>, + config: &AgentConfig, + llm: &LlmClient, + state: &mut OrchestratorState, + emitter: &EventEmitter, + query_plan: &QueryPlan, +) { + let futures: Vec<_> = dispatches + .iter() + .filter_map(|dispatch| { + let doc = match ws.doc(dispatch.doc_idx) { + Some(d) => d, + None => { + warn!(doc_idx = dispatch.doc_idx, "Document not found, skipping"); + return None; + } + }; + + let query = query.to_string(); + let task = dispatch.task.clone(); + let worker_config = config.worker.clone(); + let doc_idx = dispatch.doc_idx; + let doc_name = doc.doc_name.to_string(); + let llm = llm.clone(); + let sub_emitter = EventEmitter::noop(); + let worker_plan = query_plan.clone(); + + Some(async move { + emitter.emit_worker_dispatched(doc_idx, &doc_name, &task, &[]); + let worker = Worker::new( + &query, + Some(&task), + doc, + worker_config, + llm, + sub_emitter, + worker_plan, + ); + let result = worker.run().await; + (doc_idx, doc_name, result) + }) + }) + .collect(); + + let results: Vec<_> = futures::future::join_all(futures).await; + + for (doc_idx, doc_name, result) in results { + match result { + Ok(output) => { + info!( + doc_idx, + evidence = output.evidence.len(), + "Worker completed" + ); + emitter.emit_worker_completed( + doc_idx, + &doc_name, + output.evidence.len(), + output.metrics.rounds_used, + output.metrics.llm_calls, + true, + ); + state.collect_result(doc_idx, output); + } + Err(e) => { + warn!(doc_idx, error = %e, "Worker failed"); + emitter.emit_worker_completed(doc_idx, &doc_name, 0, 0, 0, false); + } + } + } +} diff --git a/vectorless-core/vectorless-agent/src/orchestrator/evaluate.rs b/vectorless-core/vectorless-agent/src/orchestrator/evaluate.rs new file mode 100644 index 00000000..9b0898e1 --- /dev/null +++ b/vectorless-core/vectorless-agent/src/orchestrator/evaluate.rs @@ -0,0 +1,128 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Evaluate cross-document evidence sufficiency via LLM. +//! +//! Replaces the old `integrate` module's heuristic sufficiency check. +//! LLM errors propagate — no silent "assume sufficient" fallback. + +use tracing::info; + +use vectorless_error::Error; +use vectorless_llm::LlmClient; + +use super::super::config::Evidence; +use super::super::prompts::{check_sufficiency, parse_sufficiency_response}; + +/// Result of the evidence sufficiency evaluation. +pub struct EvalResult { + /// Whether the collected evidence is sufficient to answer the query. 
+    pub sufficient: bool,
+    /// Description of what information is still missing (empty if sufficient).
+    pub missing_info: String,
+}
+
+/// Evaluate cross-document evidence sufficiency via LLM.
+///
+/// Propagates LLM errors as [`Error::LlmReasoning`].
+/// The caller decides how to handle insufficiency (replan, abort, etc.).
+pub async fn evaluate(
+    query: &str,
+    evidence: &[Evidence],
+    llm: &LlmClient,
+) -> vectorless_error::Result<EvalResult> {
+    let evidence_summary = format_evidence_summary(evidence);
+    let (system, user) = check_sufficiency(query, &evidence_summary);
+
+    info!(
+        evidence = evidence.len(),
+        "Evaluating evidence sufficiency..."
+    );
+    let response = llm
+        .complete(&system, &user)
+        .await
+        .map_err(|e| Error::LlmReasoning {
+            stage: "orchestrator/evaluate".to_string(),
+            detail: format!("Sufficiency check LLM call failed: {e}"),
+        })?;
+
+    let sufficient = parse_sufficiency_response(&response);
+    let missing_info = if sufficient {
+        String::new()
+    } else {
+        // Extract the reason from the response (everything after SUFFICIENT/INSUFFICIENT)
+        let reason = response
+            .trim()
+            .strip_prefix("INSUFFICIENT")
+            .or_else(|| response.trim().strip_prefix("Insufficient"))
+            .unwrap_or("")
+            .trim_start_matches(|c: char| c == '-' || c == ' ' || c == ':');
+        if reason.is_empty() {
+            "Evidence does not fully address the query.".to_string()
+        } else {
+            reason.to_string()
+        }
+    };
+
+    info!(
+        sufficient,
+        evidence = evidence.len(),
+        missing_info_len = missing_info.len(),
+        "Cross-doc sufficiency evaluation"
+    );
+
+    Ok(EvalResult {
+        sufficient,
+        missing_info,
+    })
+}
+
+/// Format evidence summary for sufficiency check.
+/// Includes actual content so the check LLM can evaluate relevance.
+pub fn format_evidence_summary(evidence: &[Evidence]) -> String {
+    if evidence.is_empty() {
+        return "(no evidence)".to_string();
+    }
+    evidence
+        .iter()
+        .map(|e| {
+            let doc = e.doc_name.as_deref().unwrap_or("unknown");
+            format!("[{}] (from {})\n{}", e.node_title, doc, e.content)
+        })
+        .collect::<Vec<_>>()
+        .join("\n\n")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_format_evidence_summary() {
+        let evidence = vec![
+            Evidence {
+                source_path: "root/A".to_string(),
+                node_title: "A".to_string(),
+                content: "content".to_string(),
+                doc_name: Some("doc1".to_string()),
+            },
+            Evidence {
+                source_path: "root/B".to_string(),
+                node_title: "B".to_string(),
+                content: "more content".to_string(),
+                doc_name: Some("doc2".to_string()),
+            },
+        ];
+        let summary = format_evidence_summary(&evidence);
+        assert!(summary.contains("[A]"));
+        assert!(summary.contains("doc1"));
+        assert!(summary.contains("[B]"));
+        assert!(summary.contains("doc2"));
+    }
+
+    #[test]
+    fn test_format_evidence_summary_empty() {
+        let summary = format_evidence_summary(&[]);
+        assert!(summary.contains("no evidence"));
+    }
+}
diff --git a/vectorless-core/vectorless-agent/src/orchestrator/mod.rs b/vectorless-core/vectorless-agent/src/orchestrator/mod.rs
new file mode 100644
index 00000000..17a3009f
--- /dev/null
+++ b/vectorless-core/vectorless-agent/src/orchestrator/mod.rs
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Orchestrator agent — supervisor loop for multi-document retrieval.
+//!
+//! The Orchestrator is a consuming-self struct implementing [`Agent`]:
+//! 1. Analyze: LLM selects documents + tasks (informed by QueryPlan)
+//! 2. Supervisor loop: dispatch → evaluate → replan if insufficient
+//! 3. 
Rerank: dedup → BM25 scoring → synthesis/fusion
+
+mod analyze;
+mod dispatch;
+mod evaluate;
+mod replan;
+mod supervisor;
+
+use tracing::info;
+
+use vectorless_llm::LlmClient;
+use vectorless_query::QueryPlan;
+
+use super::Agent;
+use super::config::{AgentConfig, Output, WorkspaceContext};
+use super::events::EventEmitter;
+use super::state::OrchestratorState;
+
+use analyze::{AnalyzeOutcome, analyze};
+use supervisor::run_supervisor_loop;
+
+/// Maximum supervisor loop iterations to prevent infinite loops.
+const MAX_SUPERVISOR_ITERATIONS: u32 = 3;
+
+/// Orchestrator agent — coordinates multi-document retrieval.
+///
+/// Holds all execution context. Calling [`run()`](Agent::run) consumes self.
+pub struct Orchestrator<'a> {
+    query: String,
+    ws: &'a WorkspaceContext<'a>,
+    config: AgentConfig,
+    llm: LlmClient,
+    emitter: EventEmitter,
+    skip_analysis: bool,
+    /// Query understanding plan — produced by `QueryPipeline::understand()`.
+    /// Contains intent, complexity, key concepts, and strategy hints.
+    query_plan: QueryPlan,
+}
+
+impl<'a> Orchestrator<'a> {
+    /// Create a new Orchestrator.
+    pub fn new(
+        query: &str,
+        ws: &'a WorkspaceContext<'a>,
+        config: AgentConfig,
+        llm: LlmClient,
+        emitter: EventEmitter,
+        skip_analysis: bool,
+        query_plan: QueryPlan,
+    ) -> Self {
+        Self {
+            query: query.to_string(),
+            ws,
+            config,
+            llm,
+            emitter,
+            skip_analysis,
+            query_plan,
+        }
+    }
+}
+
+impl<'a> Agent for Orchestrator<'a> {
+    type Output = Output;
+
+    fn name(&self) -> &str {
+        "orchestrator"
+    }
+
+    async fn run(self) -> vectorless_error::Result<Self::Output> {
+        let Orchestrator {
+            query,
+            ws,
+            config,
+            llm,
+            emitter,
+            skip_analysis,
+            query_plan,
+        } = self;
+
+        info!(
+            docs = ws.doc_count(),
+            skip_analysis,
+            intent = %query_plan.intent,
+            complexity = %query_plan.complexity,
+            "Orchestrator starting"
+        );
+        emitter.emit_orchestrator_started(&query, ws.doc_count(), skip_analysis);
+
+        let mut state = OrchestratorState::new();
+        let mut orch_llm_calls: u32 = 0;
+
+        // --- Phase 1: Analyze — LLM selects documents + tasks ---
+        let initial_dispatches = match analyze(
+            &query,
+            ws,
+            &mut state,
+            &emitter,
+            skip_analysis,
+            &query_plan,
+            &llm,
+        )
+        .await?
+        {
+            AnalyzeOutcome::Proceed {
+                dispatches,
+                llm_calls,
+            } => {
+                orch_llm_calls += llm_calls;
+                dispatches
+            }
+            AnalyzeOutcome::AlreadyAnswered { llm_calls } => {
+                let mut output = Output::empty();
+                output.answer = "Already answered by cross-document search.".to_string();
+                emitter.emit_orchestrator_completed(0, orch_llm_calls + llm_calls, 0);
+                return Ok(output);
+            }
+            AnalyzeOutcome::NoResults { llm_calls } => {
+                emitter.emit_orchestrator_completed(0, orch_llm_calls + llm_calls, 0);
+                return Ok(Output::empty());
+            }
+        };
+
+        // --- Phase 2: Supervisor loop ---
+        let outcome = run_supervisor_loop(
+            &query,
+            initial_dispatches,
+            ws,
+            &config,
+            &llm,
+            &mut state,
+            &emitter,
+            &query_plan,
+            skip_analysis,
+        )
+        .await?;
+        orch_llm_calls += outcome.llm_calls;
+
+        let confidence = compute_confidence(
+            outcome.eval_sufficient,
+            outcome.iteration,
+            state.all_evidence.is_empty(),
+        );
+
+        // --- Phase 3: Finalize — rerank + synthesize ---
+        if state.all_evidence.is_empty() {
+            emitter.emit_orchestrator_completed(0, orch_llm_calls, 0);
+            return Ok(state.into_output(String::new()));
+        }
+
+        let multi_doc = ws.doc_count() > 1;
+        finalize_output(
+            &query,
+            &state,
+            &emitter,
+            orch_llm_calls,
+            multi_doc,
+            query_plan.intent,
+            confidence,
+        )
+        .await
+    }
+}
+
+/// Compute confidence from LLM evaluate() outcome.
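+///
+/// Worked values from the constants below: sufficient on the first pass → 0.95,
+/// after one replan → 0.80, after two → 0.65 (floor 0.5); never sufficient:
+/// 0.40 / 0.30 / 0.20 by replan round (floor 0.1); no evidence at all → 0.0.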
+fn compute_confidence(eval_sufficient: bool, replan_rounds: u32, no_evidence: bool) -> f32 {
+    if no_evidence {
+        return 0.0;
+    }
+    if eval_sufficient {
+        // LLM said sufficient: first round = 0.95, each replan round drops 0.15
+        (0.95 - replan_rounds as f32 * 0.15).max(0.5)
+    } else {
+        // LLM never said sufficient (budget exhausted or no more docs)
+        (0.4 - replan_rounds as f32 * 0.1).max(0.1)
+    }
+}
+
+/// Rerank evidence and emit completion events.
+pub async fn finalize_output(
+    query: &str,
+    state: &OrchestratorState,
+    emitter: &EventEmitter,
+    orch_llm_calls: u32,
+    multi_doc: bool,
+    intent: vectorless_query::QueryIntent,
+    confidence: f32,
+) -> vectorless_error::Result<Output> {
+    let rerank_result =
+        vectorless_rerank::process(query, &state.all_evidence, multi_doc, intent, confidence).await?;
+
+    let total_llm_calls = orch_llm_calls + rerank_result.llm_calls;
+    if !rerank_result.answer.is_empty() {
+        emitter.emit_answer_completed(rerank_result.answer.len(), "medium");
+    }
+
+    let mut output = state.clone_results_into_output(rerank_result.answer);
+    output.metrics.llm_calls += total_llm_calls;
+    output.confidence = rerank_result.confidence;
+
+    emitter.emit_orchestrator_completed(
+        output.evidence.len(),
+        output.metrics.llm_calls,
+        output.metrics.rounds_used,
+    );
+
+    info!(
+        evidence = output.evidence.len(),
+        llm_calls = output.metrics.llm_calls,
+        confidence = output.confidence,
+        "Orchestrator complete"
+    );
+
+    Ok(output)
+}
diff --git a/vectorless-core/vectorless-agent/src/orchestrator/replan.rs b/vectorless-core/vectorless-agent/src/orchestrator/replan.rs
new file mode 100644
index 00000000..5694c370
--- /dev/null
+++ b/vectorless-core/vectorless-agent/src/orchestrator/replan.rs
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Replan: LLM-driven re-dispatch after insufficient evidence.
+//!
+//! After evaluate() returns insufficient, the Orchestrator replans:
+//! the LLM analyzes what's missing and decides which documents to query next.
+//! This replaces the old heuristic supplement logic.
+
+use tracing::info;
+
+use vectorless_error::Error;
+use vectorless_llm::LlmClient;
+use vectorless_scoring::bm25::extract_keywords;
+
+use super::super::config::Evidence;
+use super::super::prompts::DispatchEntry;
+
+/// Result of the replan phase.
+pub struct ReplanResult {
+    /// New dispatch targets for the next round.
+    pub dispatches: Vec<DispatchEntry>,
+    /// The LLM's reasoning about what was missing.
+    pub reasoning: String,
+}
+
+/// Replan dispatch targets based on missing information.
+///
+/// The LLM reviews:
+/// - The original query
+/// - What evidence has been collected so far
+/// - What information is still missing
+/// - Available documents that haven't been dispatched yet
+///
+/// Returns new dispatch targets. LLM errors propagate.
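+///
+/// The prompt below asks the LLM for either the literal `NO_ADDITIONAL_DOCS`
+/// or blocks of the following shape (values illustrative, mirroring the tests):
+///
+/// ```text
+/// - doc: 3
+///   reason: May contain the missing financial data
+///   task: Find Q4 revenue figures
+/// ```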
+pub async fn replan(
+    query: &str,
+    missing_info: &str,
+    collected_evidence: &[Evidence],
+    dispatched_indices: &[usize],
+    total_docs: usize,
+    doc_cards_text: &str,
+    llm: &LlmClient,
+) -> vectorless_error::Result<ReplanResult> {
+    let evidence_summary = format_evidence_context(collected_evidence);
+    let keywords = extract_keywords(query);
+    let find_text = if keywords.is_empty() {
+        String::new()
+    } else {
+        format!("\nExtracted keywords: {}", keywords.join(", "))
+    };
+
+    let (system, user) = replan_prompt(
+        query,
+        missing_info,
+        &evidence_summary,
+        dispatched_indices,
+        doc_cards_text,
+        &find_text,
+    );
+
+    info!(
+        evidence = collected_evidence.len(),
+        "Replanning dispatch targets..."
+    );
+    let response = llm
+        .complete(&system, &user)
+        .await
+        .map_err(|e| Error::LlmReasoning {
+            stage: "orchestrator/replan".to_string(),
+            detail: format!("Replan LLM call failed: {e}"),
+        })?;
+
+    info!(
+        response_len = response.len(),
+        "Replan LLM response received"
+    );
+
+    let dispatches = parse_replan_response(&response, total_docs, dispatched_indices);
+    let reasoning = response.lines().take(3).collect::<Vec<_>>().join(" ");
+
+    info!(
+        new_dispatches = dispatches.len(),
+        "Replan produced new dispatch targets"
+    );
+
+    Ok(ReplanResult {
+        dispatches,
+        reasoning,
+    })
+}
+
+/// Format collected evidence for the replan prompt.
+/// Includes content so the LLM can reason about what's actually been found.
+fn format_evidence_context(evidence: &[Evidence]) -> String {
+    if evidence.is_empty() {
+        return "(no evidence collected)".to_string();
+    }
+    evidence
+        .iter()
+        .map(|e| {
+            let doc = e.doc_name.as_deref().unwrap_or("unknown");
+            format!("[{}] (from {})\n{}", e.node_title, doc, e.content)
+        })
+        .collect::<Vec<_>>()
+        .join("\n\n")
+}
+
+/// Build the replan prompt.
+fn replan_prompt(
+    query: &str,
+    missing_info: &str,
+    evidence_summary: &str,
+    dispatched: &[usize],
+    doc_cards: &str,
+    keywords_text: &str,
+) -> (String, String) {
+    let dispatched_set: Vec<String> = dispatched
+        .iter()
+        .map(|&i| format!("doc {}", i + 1))
+        .collect();
+    let dispatched_text = if dispatched_set.is_empty() {
+        "None".to_string()
+    } else {
+        dispatched_set.join(", ")
+    };
+
+    let system = "You are a multi-document retrieval coordinator. The first round of evidence \
+        collection was insufficient to fully answer the query. Review what was collected, \
+        what's missing, and decide which additional documents to query.
+
+Output format — for each additional document to query, output a block:
+- doc: <number>
+  reason: <why this document may help>
+  task: <what to search for>
+
+Only include documents not yet dispatched. If no additional documents are likely to help, \
+respond with: NO_ADDITIONAL_DOCS"
+        .to_string();
+
+    let user = format!(
+        "Original question: {query}
+
+Missing information: {missing_info}
+
+Collected evidence so far:
+{evidence_summary}
+
+Already dispatched documents: {dispatched_text}
+
+Available documents (all):
+{doc_cards}{keywords_text}
+
+Additional documents to query:"
+    );
+
+    (system, user)
+}
+
+/// Parse the replan response.
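+/// Entries pointing at out-of-range or already-dispatched documents are
+/// dropped, and `NO_ADDITIONAL_DOCS` yields an empty vec (see the tests below).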
+fn parse_replan_response(
+    response: &str,
+    total_docs: usize,
+    dispatched: &[usize],
+) -> Vec<DispatchEntry> {
+    let trimmed = response.trim();
+
+    if trimmed.starts_with("NO_ADDITIONAL_DOCS") {
+        return Vec::new();
+    }
+
+    let mut entries = Vec::new();
+    let mut current_doc_idx: Option<usize> = None;
+    let mut current_reason = String::new();
+    let mut current_task = String::new();
+
+    for line in trimmed.lines() {
+        let line = line.trim();
+
+        if let Some(rest) = line.strip_prefix("- doc:") {
+            // Flush previous
+            if let Some(idx) = current_doc_idx.take() {
+                entries.push(DispatchEntry {
+                    doc_idx: idx,
+                    reason: std::mem::take(&mut current_reason),
+                    task: std::mem::take(&mut current_task),
+                });
+            }
+
+            let doc_num: usize = rest.trim().trim_end_matches(',').parse().unwrap_or(0);
+            if doc_num > 0 && doc_num <= total_docs {
+                let idx = doc_num - 1;
+                // Only include if not already dispatched
+                if !dispatched.contains(&idx) {
+                    current_doc_idx = Some(idx);
+                }
+            }
+        } else if let Some(rest) = line.strip_prefix("reason:") {
+            current_reason = rest.trim().to_string();
+        } else if let Some(rest) = line.strip_prefix("task:") {
+            current_task = rest.trim().to_string();
+        }
+    }
+
+    // Flush last
+    if let Some(idx) = current_doc_idx {
+        entries.push(DispatchEntry {
+            doc_idx: idx,
+            reason: current_reason,
+            task: current_task,
+        });
+    }
+
+    entries
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_replan_response_basic() {
+        let response = "\
+- doc: 3
+  reason: May contain the missing financial data
+  task: Find Q4 revenue figures";
+        let entries = parse_replan_response(response, 5, &[0, 1]);
+        assert_eq!(entries.len(), 1);
+        assert_eq!(entries[0].doc_idx, 2);
+        assert_eq!(entries[0].task, "Find Q4 revenue figures");
+    }
+
+    #[test]
+    fn test_parse_replan_response_already_dispatched() {
+        let response = "\
+- doc: 1
+  reason: Already queried
+  task: test";
+        let entries = parse_replan_response(response, 3, &[0]);
+        assert!(entries.is_empty()); // doc 1 (idx 0) already dispatched
+    }
+
+    #[test]
+    fn test_parse_replan_response_no_additional() {
+        let response = "NO_ADDITIONAL_DOCS";
+        let entries = parse_replan_response(response, 3, &[0, 1]);
+        assert!(entries.is_empty());
+    }
+}
diff --git a/vectorless-core/vectorless-agent/src/orchestrator/supervisor.rs b/vectorless-core/vectorless-agent/src/orchestrator/supervisor.rs
new file mode 100644
index 00000000..d98dd1a6
--- /dev/null
+++ b/vectorless-core/vectorless-agent/src/orchestrator/supervisor.rs
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Phase 2: Supervisor loop — dispatch → evaluate → replan.
+
+use tracing::info;
+
+use vectorless_llm::LlmClient;
+use vectorless_query::QueryPlan;
+
+use super::super::config::{AgentConfig, WorkspaceContext};
+use super::super::events::EventEmitter;
+use super::super::prompts::DispatchEntry;
+use super::super::state::OrchestratorState;
+use super::super::tools::orchestrator as orch_tools;
+use super::MAX_SUPERVISOR_ITERATIONS;
+use super::dispatch;
+use super::evaluate::evaluate;
+use super::replan::replan;
+
+/// Outcome of the supervisor loop.
+pub struct SupervisorOutcome {
+    /// Number of replan iterations performed.
+    pub iteration: u32,
+    /// Whether the LLM evaluator judged evidence sufficient.
+    pub eval_sufficient: bool,
+    /// LLM calls consumed within the supervisor loop itself.
+    pub llm_calls: u32,
+}
+
+/// Run the supervisor loop: dispatch → evaluate → replan.
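+/// Caps iterations at [`MAX_SUPERVISOR_ITERATIONS`]; with `skip_analysis` set,
+/// the first round's evidence is accepted without the evaluate/replan steps.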
+///
+/// Returns a [`SupervisorOutcome`] summarizing what happened.
+pub async fn run_supervisor_loop(
+    query: &str,
+    initial_dispatches: Vec<DispatchEntry>,
+    ws: &WorkspaceContext<'_>,
+    config: &AgentConfig,
+    llm: &LlmClient,
+    state: &mut OrchestratorState,
+    emitter: &EventEmitter,
+    query_plan: &QueryPlan,
+    skip_analysis: bool,
+) -> vectorless_error::Result<SupervisorOutcome> {
+    let mut current_dispatches = initial_dispatches;
+    let mut iteration: u32 = 0;
+    let mut eval_sufficient = false;
+    let mut llm_calls: u32 = 0;
+
+    loop {
+        if iteration >= MAX_SUPERVISOR_ITERATIONS {
+            info!(iteration, "Supervisor loop budget exhausted");
+            break;
+        }
+
+        // Dispatch current plan
+        if !current_dispatches.is_empty() {
+            info!(
+                docs = current_dispatches.len(),
+                docs_list = ?current_dispatches.iter().map(|d| d.doc_idx).collect::<Vec<_>>(),
+                iteration,
+                "Dispatching Workers"
+            );
+            dispatch::dispatch_and_collect(
+                query,
+                &current_dispatches,
+                ws,
+                config,
+                llm,
+                state,
+                emitter,
+                query_plan,
+            )
+            .await;
+        }
+
+        // No evidence at all — nothing to evaluate
+        if state.all_evidence.is_empty() {
+            info!("No evidence collected from any Worker");
+            break;
+        }
+
+        // Skip evaluation for user-specified documents (no replan needed)
+        if skip_analysis {
+            eval_sufficient = !state.all_evidence.is_empty();
+            break;
+        }
+
+        // Evaluate sufficiency
+        let eval_result = evaluate(query, &state.all_evidence, llm).await?;
+        llm_calls += 1;
+
+        if eval_result.sufficient {
+            eval_sufficient = true;
+            info!(
+                evidence = state.all_evidence.len(),
+                iteration, "Evidence sufficient — exiting supervisor loop"
+            );
+            break;
+        }
+
+        // Insufficient — replan
+        info!(
+            evidence = state.all_evidence.len(),
+            missing = eval_result.missing_info.len(),
+            iteration,
+            "Evidence insufficient — replanning"
+        );
+
+        let doc_cards_text = orch_tools::ls_docs(ws).feedback;
+        let replan_result = replan(
+            query,
+            &eval_result.missing_info,
+            &state.all_evidence,
+            &state.dispatched,
+            ws.doc_count(),
+            &doc_cards_text,
+            llm,
+        )
+        .await?;
+        llm_calls += 1;
+
+        if replan_result.dispatches.is_empty() {
+            info!("Replan produced no new dispatches — exiting supervisor loop");
+            break;
+        }
+
+        current_dispatches = replan_result.dispatches;
+        iteration += 1;
+    }
+
+    Ok(SupervisorOutcome {
+        iteration,
+        eval_sufficient,
+        llm_calls,
+    })
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_supervisor_outcome_fields() {
+        let outcome = SupervisorOutcome {
+            iteration: 2,
+            eval_sufficient: true,
+            llm_calls: 5,
+        };
+        assert_eq!(outcome.iteration, 2);
+        assert!(outcome.eval_sufficient);
+        assert_eq!(outcome.llm_calls, 5);
+    }
+
+    #[test]
+    fn test_max_iterations_constant() {
+        assert_eq!(MAX_SUPERVISOR_ITERATIONS, 3);
+    }
+}
diff --git a/vectorless-core/vectorless-agent/src/prompts.rs b/vectorless-core/vectorless-agent/src/prompts.rs
new file mode 100644
index 00000000..11d26fbf
--- /dev/null
+++ b/vectorless-core/vectorless-agent/src/prompts.rs
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Prompt templates for the retrieval agent.
+//!
+//! Prompts for agent-level operations:
+//! 1. `worker_navigation` — Worker nav loop, every round
+//! 2. `orchestrator_analysis` — Orchestrator Phase 1
+//! 3. `worker_dispatch` — Worker first round (when dispatched by Orchestrator)
+//! 4. `check_sufficiency` — evidence sufficiency evaluation
+//!
+//! Post-processing prompts (answer synthesis, multi-doc fusion) have been
+//! moved to `rerank/synthesis.rs` and `rerank/fusion.rs`.
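+//!
+//! Each builder returns a `(system, user)` string pair ready to feed to the
+//! LLM client. A minimal sketch of the intended call pattern, assuming an
+//! `LlmClient` named `llm` is in scope (the parsing helpers are the ones
+//! defined in this module):
+//!
+//! ```ignore
+//! let (system, user) = check_sufficiency("What is X?", "- [A] some data");
+//! let verdict = llm.complete(&system, &user).await?;
+//! let sufficient = parse_sufficiency_response(&verdict);
+//! ```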
+
+// ---------------------------------------------------------------------------
+// Prompt 1: Worker Navigation (used every round in the nav loop)
+// ---------------------------------------------------------------------------
+
+/// Parameters for the sub-agent navigation prompt.
+pub struct NavigationParams<'a> {
+    pub query: &'a str,
+    /// Sub-task description (None when Worker is called directly).
+    pub task: Option<&'a str>,
+    /// Current breadcrumb path.
+    pub breadcrumb: &'a str,
+    /// Summary of collected evidence.
+    pub evidence_summary: &'a str,
+    /// Description of what's still missing (empty string if nothing).
+    pub missing_info: &'a str,
+    /// Feedback from the last command execution.
+    pub last_feedback: &'a str,
+    /// Remaining rounds.
+    pub remaining: u32,
+    /// Maximum rounds.
+    pub max_rounds: u32,
+    /// ReAct history of recent rounds.
+    pub history: &'a str,
+    /// Titles of already-visited nodes.
+    pub visited_titles: &'a str,
+    /// Navigation plan from bird's-eye analysis (empty if no plan).
+    pub plan: &'a str,
+    /// Query intent context from QueryPlan (e.g. "factual — find specific answer").
+    /// Empty string if not available.
+    pub intent_context: &'a str,
+    /// Formatted keyword index matches (empty if none).
+    pub keyword_hints: &'a str,
+}
+
+pub fn worker_navigation(params: &NavigationParams) -> (String, String) {
+    let query = params.query;
+    let breadcrumb = params.breadcrumb;
+    let evidence_summary = params.evidence_summary;
+    let remaining = params.remaining;
+    let max_rounds = params.max_rounds;
+
+    let task_section = match params.task {
+        Some(task) => format!(
+            "\nYour specific task: {}\n(This is a sub-task for the original query.)",
+            task
+        ),
+        None => String::new(),
+    };
+
+    let missing_section = if params.missing_info.is_empty() {
+        String::new()
+    } else {
+        format!("\nPotentially missing info: {}", params.missing_info)
+    };
+
+    let last_feedback_section = if params.last_feedback.is_empty() {
+        String::new()
+    } else {
+        format!("\nLast command result:\n{}\n", params.last_feedback)
+    };
+
+    let history_section = if params.history == "(no history yet)" {
+        String::new()
+    } else {
+        format!("\nPrevious rounds:\n{}\n", params.history)
+    };
+
+    let visited_section = if params.visited_titles == "(none)" {
+        String::new()
+    } else {
+        format!(
+            "\nAlready visited (do not re-read these): {}",
+            params.visited_titles
+        )
+    };
+
+    let plan_section = if params.plan.is_empty() {
+        String::new()
+    } else {
+        format!(
+            "\nNavigation plan (follow this as guidance, adapt if needed):\n{}\n",
+            params.plan
+        )
+    };
+
+    let keyword_section = if params.keyword_hints.is_empty() {
+        String::new()
+    } else {
+        format!("\n{}", params.keyword_hints)
+    };
+
+    let intent_section = if params.intent_context.is_empty() {
+        String::new()
+    } else {
+        format!("\nQuery context: {}", params.intent_context)
+    };
+
+    let system = format!(
+        "You are a document navigation assistant. You navigate inside a document to find \
+         information that answers the user's question.
+
+Available commands:
+- ls List children at current position (with summaries and leaf counts)
+- cd <name> Enter a child node (supports relative paths like Section/Sub and absolute paths like /root/Section)
+- cd .. 
Go back to parent node
+- cat <name> Read a child node's content (automatically collected as evidence)
+- cat Read the current node's content (useful at leaf nodes)
+- head <name> Preview first 20 lines of a node (does NOT collect evidence)
+- find <keyword> Search for a keyword in the document index (also supports multi-word like 'Lab C')
+- findtree <pattern> Search for nodes by title pattern (case-insensitive)
+- grep <regex> Regex search across all content in current subtree
+- wc <name> Show content size (lines, words, chars)
+- pwd Show current navigation path
+- check Evaluate if collected evidence is sufficient
+- done End navigation
+
+SEARCH STRATEGY (important — follow this priority order):
+- When keyword matches are shown below, navigate directly to the highest-weight matched node. \
+Do NOT explore other branches first — the keyword index has already identified the most relevant location.
+- When find results include content snippets that answer the question, cd to that node and cat it immediately.
+- Use find with the EXACT keyword from the list (single word, \
+not multi-word phrases). Example: if hint shows keyword 'performance' pointing to Performance section, \
+use find performance, NOT find \"performance guide\".
+- Use ls only when you have no keyword hints or need to discover the structure of an unknown section.
+- Use findtree when you know a section title pattern but not the exact name.
+
+NAVIGATION EFFICIENCY (critical — every round counts):
+- Prefer cd with absolute paths (/root/Section/Subsection) or relative paths (Section/Sub) \
+to reach target nodes in ONE command instead of multiple cd steps.
+- Do NOT ls before cd if keyword hints or find results already tell you which node to enter.
+- Do NOT cd into nodes one level at a time when you can use a multi-segment path.
+
+Rules:
+- Output exactly ONE command per response, nothing else.
+- Content from cat is automatically saved as evidence — don't re-cat the same node.
+- Do not cat or cd into nodes you have already visited.
+- If the current branch has nothing relevant, use cd .. to go back.
+- If you're at the root and no children seem relevant, use done.
+
+STOPPING RULES (critical — follow these strictly):
+- After cat collects evidence, immediately check: does the collected text contain information \
+  that answers or relates to the user's question? If YES, output done. Do NOT continue searching.
+- Do NOT run grep after cat — cat already collected the full content. grep is for locating \
+  content BEFORE cat, not after.
+- If ls shows '(no navigation data)' or no children, you are at a leaf node. Use cat to read it \
+  or cd .. to go back. Do NOT ls again.
+- When remaining rounds are low (≤2), prefer done over exploring new branches."
+    );
+
+    let user = format!(
+        "{last_feedback_section}\
+User question: {query}{task_section}{intent_section}
+
+Current position: /{breadcrumb}
+Collected evidence:
+{evidence_summary}{missing_section}{keyword_section}{visited_section}{plan_section}
+{history_section}
+Remaining rounds: {remaining}/{max_rounds}
+
+Command:"
+    );
+
+    (system, user)
+}
+
+// ---------------------------------------------------------------------------
+// Prompt 2: Orchestrator Analysis (multi-doc Phase 1)
+// ---------------------------------------------------------------------------
+
+/// Parameters for the orchestrator analysis prompt.
+pub struct OrchestratorAnalysisParams<'a> {
+    pub query: &'a str,
+    /// Formatted DocCard listing from ls_docs.
+    pub doc_cards: &'a str,
+    /// Formatted cross-document search results.
+    pub find_results: &'a str,
+    /// Query understanding context (intent, concepts, strategy, complexity).
+    pub intent_context: &'a str,
+}
+
+pub fn orchestrator_analysis(params: &OrchestratorAnalysisParams) -> (String, String) {
+    let doc_cards = params.doc_cards;
+    let find_results = params.find_results;
+    let query = params.query;
+    let intent_context = params.intent_context;
+
+    let system =
+        "You are a multi-document retrieval coordinator. Analyze the user's question, \
+         review the available documents, and decide which documents to search and what to look for in each.
+
+Output format — for each relevant document, output a block:
+- doc: <number>
+  reason: <why this document is relevant>
+  task: <what to look for in it>
+
+Only include documents that are likely to contain relevant information.
+If the cross-document search results already fully answer the question, respond with just: ALREADY_ANSWERED".to_string();
+
+    let user = format!(
+        "Available documents:
+{doc_cards}
+
+Cross-document search results:
+{find_results}
+{intent_context}
+
+User question: {query}
+
+Relevant documents:"
+    );
+
+    (system, user)
+}
+
+// ---------------------------------------------------------------------------
+// Prompt 3: Worker Dispatch (first-round prompt when Orchestrator dispatches)
+// ---------------------------------------------------------------------------
+
+/// Parameters for the dispatch prompt.
+pub struct WorkerDispatchParams<'a> {
+    pub original_query: &'a str,
+    pub task: &'a str,
+    pub doc_name: &'a str,
+    pub breadcrumb: &'a str,
+}
+
+pub fn worker_dispatch(params: &WorkerDispatchParams) -> (String, String) {
+    let doc_name = params.doc_name;
+    let original_query = params.original_query;
+    let task = params.task;
+    let breadcrumb = params.breadcrumb;
+
+    let system = format!(
+        "You are a document navigation assistant. You are searching inside the document \
+         \"{doc_name}\" for specific information.
+
+Available commands: ls, cd <name> (supports Section/Sub paths and /root/Section absolute paths), \
+cd .., cat, cat <name>, head <name>, find <keyword>, findtree <pattern>, grep <regex>, wc <name>, \
+pwd, check, done
+
+SEARCH STRATEGY:
+- Prefer find to jump directly to relevant sections over manual ls→cd exploration.
+- When find results include content snippets that answer your task, cd to that node and cat it immediately.
+- Use multi-segment paths (e.g. cd Research Labs/Lab A) to reach targets in ONE command.
+- Do NOT ls before cd if find results already tell you which node to enter.
+- Use findtree when you know a section title pattern but not the exact name.
+
+Rules:
+- Output exactly ONE command per response.
+- Content from cat is automatically saved as evidence.
+- After cat collects evidence, if it relates to your task, use done immediately.
+- Do NOT grep after cat — cat already collected the full content.
+- If ls shows no children, use cat to read the current node or cd .. to go back.
+- When evidence is sufficient, use done."
+    );
+
+    let user = format!(
+        "Original question: {original_query}
+Your task: {task}
+Document: {doc_name}
+Current position: /{breadcrumb}
+
+Command:"
+    );
+
+    (system, user)
+}
+
+// ---------------------------------------------------------------------------
+// Prompt 4: Check (evidence sufficiency evaluation)
+// ---------------------------------------------------------------------------
+
+/// Build the check prompt for LLM-based sufficiency evaluation.
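+///
+/// Returns a `(system, user)` pair; pair it with [`parse_sufficiency_response`]
+/// to turn the LLM's `SUFFICIENT`/`INSUFFICIENT` verdict into a bool.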
+pub fn check_sufficiency(query: &str, evidence_summary: &str) -> (String, String) {
+    let system = "You evaluate whether collected evidence contains information that can answer or \
+        relate to the user's question. The evidence is raw document text — it does not need to be \
+        a complete or perfect answer. If the evidence mentions or addresses the key concepts from \
+        the question, it is sufficient.
+
+Respond with ONLY 'SUFFICIENT' or 'INSUFFICIENT' followed by a one-line reason.
+
+Guidelines:
+- If the evidence text contains any information directly related to the question's key terms, \
+respond SUFFICIENT.
+- If the evidence is completely unrelated or empty, respond INSUFFICIENT.
+- Default to SUFFICIENT unless the evidence is clearly irrelevant."
+        .to_string();
+
+    let user = format!(
+        "Question: {query}\n\n\
+         Collected evidence:\n\
+         {evidence_summary}\n\n\
+         Is this sufficient?"
+    );
+
+    (system, user)
+}
+
+// ---------------------------------------------------------------------------
+// Dispatch plan parsing
+// ---------------------------------------------------------------------------
+
+/// A single dispatch entry parsed from orchestrator analysis.
+#[derive(Debug, Clone)]
+pub struct DispatchEntry {
+    /// Document index (0-based).
+    pub doc_idx: usize,
+    /// Why this document was selected.
+    pub reason: String,
+    /// What to search for in this document.
+    pub task: String,
+}
+
+/// Parse the LLM output from orchestrator analysis into dispatch entries.
+///
+/// Returns `None` if the response is "ALREADY_ANSWERED".
+/// Returns empty vec if no valid dispatch entries found.
+pub fn parse_dispatch_plan(llm_output: &str, total_docs: usize) -> Option<Vec<DispatchEntry>> {
+    let trimmed = llm_output.trim();
+
+    if trimmed.starts_with("ALREADY_ANSWERED") {
+        return None;
+    }
+
+    let mut entries = Vec::new();
+    let mut current_doc_idx: Option<usize> = None;
+    let mut current_reason = String::new();
+    let mut current_task = String::new();
+
+    for line in trimmed.lines() {
+        let line = line.trim();
+
+        if let Some(rest) = line.strip_prefix("- doc:") {
+            // Flush previous entry
+            if let Some(idx) = current_doc_idx.take() {
+                entries.push(DispatchEntry {
+                    doc_idx: idx,
+                    reason: std::mem::take(&mut current_reason),
+                    task: std::mem::take(&mut current_task),
+                });
+            }
+
+            let doc_num: usize = rest.trim().trim_end_matches(',').parse().unwrap_or(0);
+            if doc_num > 0 && doc_num <= total_docs {
+                current_doc_idx = Some(doc_num - 1); // Convert to 0-based
+            } else if doc_num > 0 {
+                tracing::warn!(
+                    requested_doc = doc_num,
+                    total_docs,
+                    "Dispatch plan references out-of-range document, skipping"
+                );
+            }
+        } else if let Some(rest) = line.strip_prefix("reason:") {
+            current_reason = rest.trim().to_string();
+        } else if let Some(rest) = line.strip_prefix("task:") {
+            current_task = rest.trim().to_string();
+        }
+    }
+
+    // Flush last entry
+    if let Some(idx) = current_doc_idx {
+        entries.push(DispatchEntry {
+            doc_idx: idx,
+            reason: current_reason,
+            task: current_task,
+        });
+    }
+
+    Some(entries)
+}
+
+/// Parse the sufficiency check response.
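+/// Case-insensitive on the leading verdict: `"SUFFICIENT - we have all data"`
+/// parses as `true`, `"Insufficient"` as `false` (see the tests below).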
+pub fn parse_sufficiency_response(response: &str) -> bool { + let upper = response.trim().to_uppercase(); + upper.starts_with("SUFFICIENT") && !upper.starts_with("INSUFFICIENT") +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_worker_navigation_without_task() { + let params = NavigationParams { + query: "What is the revenue?", + task: None, + breadcrumb: "root/Financial Statements", + evidence_summary: "- [Revenue] 200 chars", + missing_info: "2024 comparison", + last_feedback: "[1] Q1 Report — Q1 data (5 leaves)\n[2] Q2 Report — Q2 data (5 leaves)", + remaining: 5, + max_rounds: 15, + history: "(no history yet)", + visited_titles: "(none)", + plan: "", + intent_context: "", + keyword_hints: "", + }; + + let (system, user) = worker_navigation(¶ms); + assert!(system.contains("document navigation")); + assert!(system.contains("SEARCH STRATEGY")); + assert!(user.contains("What is the revenue?")); + assert!(user.contains("root/Financial Statements")); + assert!(user.contains("200 chars")); + assert!(user.contains("2024 comparison")); + assert!(user.contains("5/15")); + assert!(!user.contains("sub-task")); + } + + #[test] + fn test_worker_navigation_with_keyword_hints() { + let params = NavigationParams { + query: "What is the revenue?", + task: None, + breadcrumb: "root", + evidence_summary: "(none)", + missing_info: "", + last_feedback: "", + remaining: 8, + max_rounds: 15, + history: "(no history yet)", + visited_titles: "(none)", + plan: "", + intent_context: "", + keyword_hints: "Keyword matches (use find to jump directly):\n - 'revenue' → root > Revenue (weight 0.85)\n", + }; + + let (_, user) = worker_navigation(¶ms); + assert!(user.contains("revenue")); + assert!(user.contains("find")); + } + + #[test] + fn test_worker_navigation_with_task() { + let params = NavigationParams { + query: "Compare 2024 and 2023 revenue", + task: Some("Find revenue data in this document"), + breadcrumb: "root", + evidence_summary: "(none)", + missing_info: "", + last_feedback: "", + remaining: 8, + max_rounds: 15, + history: "(no history yet)", + visited_titles: "(none)", + plan: "", + intent_context: "analytical — comparative analysis", + keyword_hints: "", + }; + + let (_, user) = worker_navigation(¶ms); + assert!(user.contains("Find revenue data")); + assert!(user.contains("sub-task")); + } + + #[test] + fn test_orchestrator_analysis() { + let params = OrchestratorAnalysisParams { + query: "Compare 2024 and 2023 revenue", + doc_cards: "[1] 2024 Report\n[2] 2023 Report", + find_results: "doc 1: keyword 'revenue' matched", + intent_context: "\nQuery intent: analytical (complexity: moderate)", + }; + + let (system, user) = orchestrator_analysis(¶ms); + assert!(system.contains("multi-document")); + assert!(user.contains("2024 Report")); + assert!(user.contains("revenue")); + assert!(user.contains("analytical")); + } + + #[test] + fn test_worker_dispatch() { + let params = WorkerDispatchParams { + original_query: "Compare revenue", + task: "Find 2024 revenue figures", + doc_name: "2024 Annual Report", + breadcrumb: "root", + }; + + let (system, user) = worker_dispatch(¶ms); + assert!(system.contains("2024 Annual Report")); + assert!(user.contains("Compare revenue")); + assert!(user.contains("Find 2024 revenue")); + } + + #[test] + fn test_check_sufficiency() { + let (system, user) = check_sufficiency("What is X?", "- 
[A] some data");
+        assert!(system.contains("SUFFICIENT"));
+        assert!(user.contains("What is X?"));
+    }
+
+    // --- Dispatch plan parsing ---
+
+    #[test]
+    fn test_parse_dispatch_plan_basic() {
+        let output = "\
+- doc: 1
+  reason: Contains revenue data
+  task: Find 2024 revenue figures
+- doc: 2
+  reason: Contains comparison data
+  task: Find 2023 revenue figures";
+
+        let entries = parse_dispatch_plan(output, 3).unwrap();
+        assert_eq!(entries.len(), 2);
+        assert_eq!(entries[0].doc_idx, 0);
+        assert_eq!(entries[0].task, "Find 2024 revenue figures");
+        assert_eq!(entries[1].doc_idx, 1);
+        assert_eq!(entries[1].reason, "Contains comparison data");
+    }
+
+    #[test]
+    fn test_parse_dispatch_plan_already_answered() {
+        let output = "ALREADY_ANSWERED";
+        assert!(parse_dispatch_plan(output, 3).is_none());
+    }
+
+    #[test]
+    fn test_parse_dispatch_plan_empty() {
+        let entries = parse_dispatch_plan("no relevant documents", 3).unwrap();
+        assert!(entries.is_empty());
+    }
+
+    #[test]
+    fn test_parse_dispatch_plan_out_of_range() {
+        let output = "\
+- doc: 99
+  reason: test
+  task: test";
+
+        let entries = parse_dispatch_plan(output, 3).unwrap();
+        assert!(entries.is_empty()); // doc 99 is out of range, skipped
+    }
+
+    // --- Sufficiency parsing ---
+
+    #[test]
+    fn test_parse_sufficiency_sufficient() {
+        assert!(parse_sufficiency_response("SUFFICIENT - we have all data"));
+        assert!(parse_sufficiency_response("Sufficient"));
+    }
+
+    #[test]
+    fn test_parse_sufficiency_insufficient() {
+        assert!(!parse_sufficiency_response("INSUFFICIENT - missing data"));
+        assert!(!parse_sufficiency_response("Insufficient"));
+    }
+}
diff --git a/vectorless-core/vectorless-agent/src/state.rs b/vectorless-core/vectorless-agent/src/state.rs
new file mode 100644
index 00000000..d2d3029c
--- /dev/null
+++ b/vectorless-core/vectorless-agent/src/state.rs
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Agent state types — mutable state that lives within a single retrieve() call.
+
+use std::collections::HashSet;
+
+use vectorless_document::NodeId;
+use vectorless_document::TraceStep;
+
+use super::config::{Evidence, Output};
+
+// ---------------------------------------------------------------------------
+// Worker state
+// ---------------------------------------------------------------------------
+
+/// Mutable navigation state for a Worker loop.
+///
+/// Created at loop start, destroyed at loop end. Never escapes the call.
+pub struct WorkerState {
+    /// Navigation breadcrumb (path from root to current node).
+    pub breadcrumb: Vec<String>,
+    /// Current position in the document tree.
+    pub current_node: NodeId,
+    /// Collected evidence so far.
+    pub evidence: Vec<Evidence>,
+    /// Nodes already visited (prevents redundant reads).
+    pub visited: HashSet<NodeId>,
+    /// Nodes whose content has been collected via cat. Separate from visited
+    /// because cd-ing through a node ≠ reading its content.
+    pub collected_nodes: HashSet<NodeId>,
+    /// Remaining navigation rounds.
+    pub remaining: u32,
+    /// Maximum rounds (for display in prompts).
+    pub max_rounds: u32,
+    /// Feedback from the last executed command (injected into next prompt).
+    pub last_feedback: String,
+    /// Structured description of what information is still missing.
+    /// Updated after `check` returns "insufficient".
+    pub missing_info: String,
+    /// ReAct history: summary of each round's command + result.
+    /// Keeps last N entries for prompt injection.
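+    /// (N = `MAX_HISTORY_ENTRIES`, currently 6.)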
+    pub history: Vec<String>,
+    /// Navigation plan generated after bird's-eye view (Phase 1.5).
+    /// Injected into subsequent prompts as guidance (non-binding).
+    pub plan: String,
+    /// Number of times `check` has been called.
+    pub check_count: u32,
+    /// Whether a navigation plan was generated in Phase 1.5.
+    pub plan_generated: bool,
+    /// Reasoning trace steps collected during navigation.
+    pub trace_steps: Vec<TraceStep>,
+}
+
+/// Maximum number of history entries to keep for prompt injection.
+const MAX_HISTORY_ENTRIES: usize = 6;
+
+impl WorkerState {
+    /// Create a new state starting at the given root node.
+    pub fn new(root: NodeId, max_rounds: u32) -> Self {
+        Self {
+            breadcrumb: vec!["root".to_string()],
+            current_node: root,
+            evidence: Vec::new(),
+            visited: HashSet::new(),
+            collected_nodes: HashSet::new(),
+            remaining: max_rounds,
+            max_rounds,
+            last_feedback: String::new(),
+            missing_info: String::new(),
+            history: Vec::new(),
+            plan: String::new(),
+            check_count: 0,
+            plan_generated: false,
+            trace_steps: Vec::new(),
+        }
+    }
+
+    /// Consume one navigation round.
+    pub fn dec_round(&mut self) {
+        if self.remaining > 0 {
+            self.remaining -= 1;
+        }
+    }
+
+    /// Set feedback from tool execution.
+    pub fn set_feedback(&mut self, feedback: String) {
+        self.last_feedback = feedback;
+    }
+
+    /// Navigate into a child node.
+    pub fn cd(&mut self, node: NodeId, title: &str) {
+        self.breadcrumb.push(title.to_string());
+        self.current_node = node;
+    }
+
+    /// Navigate back to parent.
+    ///
+    /// Returns `false` if already at root.
+    pub fn cd_up(&mut self, parent: NodeId) -> bool {
+        if self.breadcrumb.len() <= 1 {
+            return false;
+        }
+        self.breadcrumb.pop();
+        self.current_node = parent;
+        true
+    }
+
+    /// Add a piece of evidence.
+    pub fn add_evidence(&mut self, evidence: Evidence) {
+        self.evidence.push(evidence);
+    }
+
+    /// Check if evidence has already been collected for a specific node.
+    pub fn has_evidence_for(&self, node_id: vectorless_document::NodeId) -> bool {
+        self.collected_nodes.contains(&node_id)
+    }
+
+    /// Push a history entry (command + result summary).
+    /// Keeps only the last `MAX_HISTORY_ENTRIES` entries.
+    pub fn push_history(&mut self, entry: String) {
+        if self.history.len() >= MAX_HISTORY_ENTRIES {
+            self.history.remove(0);
+        }
+        self.history.push(entry);
+    }
+
+    /// Format history as text for prompt injection.
+    pub fn history_text(&self) -> String {
+        if self.history.is_empty() {
+            return "(no history yet)".to_string();
+        }
+        self.history
+            .iter()
+            .enumerate()
+            .map(|(i, h)| format!("{}. {}", i + 1, h))
+            .collect::<Vec<_>>()
+            .join("\n")
+    }
+
+    /// Format the breadcrumb as a path string (e.g., "root/Chapter 1/Section 1.2").
+    pub fn path_str(&self) -> String {
+        self.breadcrumb.join("/")
+    }
+
+    /// Summary of collected evidence for prompts.
+    pub fn evidence_summary(&self) -> String {
+        if self.evidence.is_empty() {
+            return "(none)".to_string();
+        }
+        self.evidence
+            .iter()
+            .map(|e| format!("- [{}] {} chars", e.node_title, e.content.len()))
+            .collect::<Vec<_>>()
+            .join("\n")
+    }
+
+    /// Evidence with actual content for sufficiency evaluation.
+    pub fn evidence_for_check(&self) -> String {
+        if self.evidence.is_empty() {
+            return "(no evidence collected yet)".to_string();
+        }
+        self.evidence
+            .iter()
+            .map(|e| format!("[{}]\n{}", e.node_title, e.content))
+            .collect::<Vec<_>>()
+            .join("\n\n")
+    }
+
+    /// Convert this state into a WorkerOutput (consuming the state), with budget flag.
+    /// Worker returns evidence only — no answer synthesis.
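+    /// `rounds_used` is derived as `max_rounds - remaining` (saturating).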
+    pub fn into_worker_output(
+        self,
+        llm_calls: u32,
+        budget_exhausted: bool,
+        doc_name: &str,
+    ) -> super::config::WorkerOutput {
+        let evidence_chars: usize = self.evidence.iter().map(|e| e.content.len()).sum();
+        super::config::WorkerOutput {
+            evidence: self.evidence,
+            metrics: super::config::WorkerMetrics {
+                rounds_used: self.max_rounds.saturating_sub(self.remaining),
+                llm_calls,
+                nodes_visited: self.visited.len(),
+                budget_exhausted,
+                plan_generated: self.plan_generated,
+                check_count: self.check_count,
+                evidence_chars,
+            },
+            doc_name: doc_name.to_string(),
+            trace_steps: self.trace_steps,
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Orchestrator state
+// ---------------------------------------------------------------------------
+
+/// Mutable state for the Orchestrator loop.
+///
+/// Tracks which documents have been dispatched and collects Worker results.
+pub struct OrchestratorState {
+    /// Indices of documents that have been dispatched.
+    pub dispatched: Vec<usize>,
+    /// Results returned by dispatched Workers.
+    pub sub_results: Vec<Output>,
+    /// All evidence merged from sub-results.
+    pub all_evidence: Vec<Evidence>,
+    /// Whether the analysis phase is complete.
+    pub analyze_done: bool,
+    /// Total LLM calls across orchestrator + sub-agents.
+    pub total_llm_calls: u32,
+}
+
+impl OrchestratorState {
+    /// Create a new orchestrator state.
+    pub fn new() -> Self {
+        Self {
+            dispatched: Vec::new(),
+            sub_results: Vec::new(),
+            all_evidence: Vec::new(),
+            analyze_done: false,
+            total_llm_calls: 0,
+        }
+    }
+
+    /// Record a dispatch to document at the given index.
+    pub fn record_dispatch(&mut self, doc_idx: usize) {
+        if !self.dispatched.contains(&doc_idx) {
+            self.dispatched.push(doc_idx);
+        }
+    }
+
+    /// Collect a Worker result, converting WorkerOutput to Output for internal tracking.
+    pub fn collect_result(&mut self, doc_idx: usize, result: super::config::WorkerOutput) {
+        self.total_llm_calls += result.metrics.llm_calls;
+        self.all_evidence.extend(result.evidence.iter().cloned());
+        self.sub_results.push(result.into());
+        self.record_dispatch(doc_idx);
+    }
+
+    /// Clone results into an Output without consuming self.
+    ///
+    /// Used by `finalize_output` which needs to borrow state for rerank.
+    pub fn clone_results_into_output(&self, answer: String) -> Output {
+        Output {
+            answer,
+            evidence: self.all_evidence.clone(),
+            metrics: super::config::Metrics {
+                llm_calls: self.total_llm_calls,
+                nodes_visited: self
+                    .sub_results
+                    .iter()
+                    .map(|r| r.metrics.nodes_visited)
+                    .sum(),
+                plan_generated: self.sub_results.iter().any(|r| r.metrics.plan_generated),
+                check_count: self.sub_results.iter().map(|r| r.metrics.check_count).sum(),
+                evidence_chars: self
+                    .sub_results
+                    .iter()
+                    .map(|r| r.metrics.evidence_chars)
+                    .sum(),
+                ..Default::default()
+            },
+            confidence: 0.0,
+            trace_steps: self.collect_trace_steps(),
+        }
+    }
+
+    /// Merge all sub-results into a single Output (consuming self).
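+    /// Unlike [`Self::clone_results_into_output`], this moves the evidence out
+    /// of `self` instead of cloning it.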
+    pub fn into_output(self, answer: String) -> Output {
+        let trace_steps = self.collect_trace_steps();
+        Output {
+            answer,
+            evidence: self.all_evidence,
+            metrics: super::config::Metrics {
+                llm_calls: self.total_llm_calls,
+                nodes_visited: self
+                    .sub_results
+                    .iter()
+                    .map(|r| r.metrics.nodes_visited)
+                    .sum(),
+                plan_generated: self.sub_results.iter().any(|r| r.metrics.plan_generated),
+                check_count: self.sub_results.iter().map(|r| r.metrics.check_count).sum(),
+                evidence_chars: self
+                    .sub_results
+                    .iter()
+                    .map(|r| r.metrics.evidence_chars)
+                    .sum(),
+                ..Default::default()
+            },
+            confidence: 0.0,
+            trace_steps,
+        }
+    }
+
+    /// Collect trace steps from all sub-results.
+    fn collect_trace_steps(&self) -> Vec<TraceStep> {
+        let mut steps = Vec::new();
+        for result in &self.sub_results {
+            steps.extend(result.trace_steps.iter().cloned());
+        }
+        steps
+    }
+}
+
+impl Default for OrchestratorState {
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/vectorless-core/vectorless-agent/src/tools/common.rs b/vectorless-core/vectorless-agent/src/tools/common.rs
new file mode 100644
index 00000000..740510de
--- /dev/null
+++ b/vectorless-core/vectorless-agent/src/tools/common.rs
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Common tools shared between Orchestrator and Worker (find, check, done).
+
+use super::ToolResult;
+
+/// Execute a `find` command — search for a keyword.
+///
+/// Returns formatted search results as feedback text.
+pub fn format_find_result(keyword: &str, hits: &[super::super::context::FindHit]) -> String {
+    if hits.is_empty() {
+        return format!("No results found for '{}'", keyword);
+    }
+
+    let mut output = format!("Results for '{}':\n", keyword);
+    for hit in hits {
+        for entry in &hit.entries {
+            output.push_str(&format!(
+                " - node (depth {}, weight {:.2})\n",
+                entry.depth, entry.weight
+            ));
+        }
+    }
+    output
+}
+
+/// Execute a `check` command — evaluate evidence sufficiency.
+///
+/// Returns a formatted summary of current evidence for the LLM to evaluate.
+pub fn format_check_prompt(evidence_summary: &str, query: &str) -> String {
+    format!(
+        "Please evaluate whether the collected evidence is sufficient to answer the query.\n\n\
+         Query: {}\n\n\
+         Evidence:\n{}\n\n\
+         Is this sufficient? Answer YES or NO and briefly explain.",
+        query, evidence_summary
+    )
+}
+
+/// Execute a `done` command — signal loop termination.
+pub fn format_done() -> ToolResult {
+    ToolResult::done("Navigation complete.")
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_format_find_result_empty() {
+        let result = format_find_result("nonexistent", &[]);
+        assert!(result.contains("No results"));
+    }
+
+    #[test]
+    fn test_format_check_prompt() {
+        let prompt = format_check_prompt("- [Intro] 500 chars", "What is X?");
+        assert!(prompt.contains("What is X?"));
+        assert!(prompt.contains("500 chars"));
+    }
+
+    #[test]
+    fn test_format_done() {
+        let result = format_done();
+        assert!(result.should_stop);
+        assert!(result.success);
+    }
+}
diff --git a/vectorless-core/vectorless-agent/src/tools/mod.rs b/vectorless-core/vectorless-agent/src/tools/mod.rs
new file mode 100644
index 00000000..c44c0021
--- /dev/null
+++ b/vectorless-core/vectorless-agent/src/tools/mod.rs
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Tool definitions for the retrieval agent.
+//!
+//! Tools are organized by role:
+//! 
- `common` — shared between Orchestrator and Worker (find, check, done)
+//! - `worker` — Worker-specific (ls, cd, cd_up, cat, pwd, grep, head, find_tree, wc)
+//! - `orchestrator` — Orchestrator-specific (ls_docs, find_cross, dispatch)
+
+pub mod common;
+pub mod orchestrator;
+pub mod worker;
+
+/// Result of executing a tool command.
+#[derive(Debug, Clone)]
+pub struct ToolResult {
+    /// Text feedback to include in the next LLM prompt.
+    pub feedback: String,
+    /// Whether the loop should stop.
+    pub should_stop: bool,
+    /// Whether the command executed successfully.
+    pub success: bool,
+}
+
+impl ToolResult {
+    /// Create a successful result with feedback.
+    pub fn ok(feedback: impl Into<String>) -> Self {
+        Self {
+            feedback: feedback.into(),
+            should_stop: false,
+            success: true,
+        }
+    }
+
+    /// Create a result that signals loop termination.
+    pub fn done(feedback: impl Into<String>) -> Self {
+        Self {
+            feedback: feedback.into(),
+            should_stop: true,
+            success: true,
+        }
+    }
+
+    /// Create a failed result (parse error, invalid target, etc.).
+    pub fn fail(feedback: impl Into<String>) -> Self {
+        Self {
+            feedback: feedback.into(),
+            should_stop: false,
+            success: false,
+        }
+    }
+}
+
+/// Extract a content snippet around the first occurrence of `keyword`.
+///
+/// Returns `None` if the content is empty. If the keyword is not found,
+/// returns the beginning of the content instead.
+pub fn content_snippet(content: &str, keyword: &str, max_len: usize) -> Option<String> {
+    if content.trim().is_empty() {
+        return None;
+    }
+
+    let keyword_lower = keyword.to_lowercase();
+    let content_lower = content.to_lowercase();
+
+    // Offset is found in the lowercased haystack; start a little before the hit.
+    let start = match content_lower.find(&keyword_lower) {
+        Some(pos) => {
+            let back = (max_len / 4).min(pos);
+            pos - back
+        }
+        None => 0,
+    };
+
+    // Snap to a char boundary in the original content.
+    let start = content
+        .char_indices()
+        .find(|(i, _)| *i >= start)
+        .map(|(i, _)| i)
+        .unwrap_or(0);
+
+    let end = content
+        .char_indices()
+        .take_while(|(i, _)| *i <= start + max_len)
+        .last()
+        .map(|(i, c)| i + c.len_utf8())
+        .unwrap_or(content.len());
+
+    let snippet = content[start..end].trim();
+    if snippet.is_empty() {
+        return None;
+    }
+
+    let mut result = snippet.to_string();
+    if end < content.len() {
+        result.push_str("...");
+    }
+    if start > 0 {
+        result = format!("...{}", result);
+    }
+    Some(result)
+}
diff --git a/vectorless-core/vectorless-agent/src/tools/orchestrator.rs b/vectorless-core/vectorless-agent/src/tools/orchestrator.rs
new file mode 100644
index 00000000..4f9e053e
--- /dev/null
+++ b/vectorless-core/vectorless-agent/src/tools/orchestrator.rs
@@ -0,0 +1,203 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Orchestrator tools: ls_docs, find_cross, dispatch.
+
+use super::ToolResult;
+use crate::agent::config::WorkspaceContext;
+
+/// Execute `ls_docs` — list all document cards.
+///
+/// Returns a formatted view of all DocCards for the Orchestrator's Bird's-Eye View.
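+///
+/// Output shape (illustrative, using the fixtures from the tests below):
+///
+/// ```text
+/// Available documents (2 total):
+///
+/// [1] 2024 Financial Report — Annual financial statements
+///   → Revenue (5 leaves)
+///     Can answer: Revenue?
+///     Topics: finance, 2024
+/// ```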
+pub fn ls_docs(ctx: &WorkspaceContext) -> ToolResult {
+    let cards = ctx.doc_cards();
+
+    if cards.is_empty() {
+        return ToolResult::ok("No documents with DocCards available.");
+    }
+
+    let mut output = format!("Available documents ({} total):\n\n", ctx.doc_count());
+
+    for (idx, card) in &cards {
+        output.push_str(&format!(
+            "[{}] {} — {}\n",
+            idx + 1,
+            card.title,
+            card.overview
+        ));
+
+        for sec in &card.sections {
+            output.push_str(&format!(
+                "  → {} ({} leaves)\n",
+                sec.title, sec.leaf_count
+            ));
+        }
+
+        if !card.question_hints.is_empty() {
+            output.push_str(&format!(
+                "    Can answer: {}\n",
+                card.question_hints.join(", ")
+            ));
+        }
+
+        if !card.topic_tags.is_empty() {
+            output.push_str(&format!("    Topics: {}\n", card.topic_tags.join(", ")));
+        }
+
+        output.push('\n');
+    }
+
+    // Also mention docs without cards
+    let with_cards: Vec<usize> = cards.iter().map(|(idx, _)| *idx).collect();
+    let without_cards: Vec<usize> = (0..ctx.doc_count())
+        .filter(|i| !with_cards.contains(i))
+        .collect();
+
+    if !without_cards.is_empty() {
+        output.push_str(&format!(
+            "Documents without DocCards: {:?}\n",
+            without_cards
+                .iter()
+                .map(|i| format!("doc_{}", i))
+                .collect::<Vec<_>>()
+        ));
+    }
+
+    ToolResult::ok(output)
+}
+
+/// Execute `find_cross` — search keywords across all documents.
+///
+/// Returns formatted results showing which documents matched, with content snippets.
+pub fn find_cross(keywords: &[String], ctx: &WorkspaceContext) -> ToolResult {
+    let results = ctx.find_cross_all(keywords);
+
+    if results.is_empty() {
+        return ToolResult::ok(format!(
+            "No matches found for keywords: {}",
+            keywords.join(", ")
+        ));
+    }
+
+    let mut output = String::new();
+    for (doc_idx, hits) in &results {
+        let doc = ctx.doc(*doc_idx);
+        let doc_name = doc.map(|d| d.doc_name).unwrap_or("unknown");
+        output.push_str(&format!("Document [{}] {}:\n", doc_idx + 1, doc_name));
+
+        for hit in hits {
+            for entry in &hit.entries {
+                let title = doc
+                    .and_then(|d| d.node_title(entry.node_id))
+                    .unwrap_or("unknown");
+                output.push_str(&format!(
+                    "  keyword '{}' → {} (depth {}, weight {:.2})",
+                    hit.keyword, title, entry.depth, entry.weight
+                ));
+                // Include content snippet for cross-doc relevance judgment
+                if let Some(content) = doc.and_then(|d| d.cat(entry.node_id)) {
+                    if let Some(snippet) = super::content_snippet(content, &hit.keyword, 300) {
+                        output.push_str(&format!("\n    \"{}\"", snippet));
+                    }
+                }
+                output.push('\n');
+            }
+        }
+        output.push('\n');
+    }
+
+    ToolResult::ok(output)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use vectorless_document::{DocCard, NavigationIndex, ReasoningIndex, SectionCard};
+
+    fn build_workspace() -> (
+        Vec<vectorless_document::DocumentTree>,
+        Vec<NavigationIndex>,
+        Vec<ReasoningIndex>,
+    ) {
+        let tree1 = vectorless_document::DocumentTree::new("2024 Report", "content");
+        let mut nav1 = NavigationIndex::new();
+        nav1.set_doc_card(DocCard {
+            title: "2024 Financial Report".to_string(),
+            overview: "Annual financial statements".to_string(),
+            question_hints: vec!["Revenue?".to_string()],
+            topic_tags: vec!["finance".to_string(), "2024".to_string()],
+            sections: vec![SectionCard {
+                title: "Revenue".to_string(),
+                description: "Revenue breakdown".to_string(),
+                leaf_count: 5,
+            }],
+            total_leaves: 10,
+        });
+
+        let tree2 = vectorless_document::DocumentTree::new("2023 Report", "content");
+        let mut nav2 = NavigationIndex::new();
+        nav2.set_doc_card(DocCard {
+            title: "2023 Financial Report".to_string(),
+            overview: "Previous year financial statements".to_string(),
+            question_hints: vec!["Sales?".to_string()],
+            topic_tags: 
vec!["finance".to_string(), "2023".to_string()], + sections: vec![SectionCard { + title: "Net Sales".to_string(), + description: "Net sales figures".to_string(), + leaf_count: 4, + }], + total_leaves: 8, + }); + + ( + vec![tree1, tree2], + vec![nav1, nav2], + vec![ReasoningIndex::default(), ReasoningIndex::default()], + ) + } + + #[test] + fn test_ls_docs_shows_cards() { + let (trees, navs, ridxs) = build_workspace(); + let docs = vec![ + crate::agent::config::DocContext { + tree: &trees[0], + nav_index: &navs[0], + reasoning_index: &ridxs[0], + doc_name: "2024", + }, + crate::agent::config::DocContext { + tree: &trees[1], + nav_index: &navs[1], + reasoning_index: &ridxs[1], + doc_name: "2023", + }, + ]; + let ctx = WorkspaceContext::new(docs); + + let result = ls_docs(&ctx); + assert!(result.success); + assert!(result.feedback.contains("2024 Financial Report")); + assert!(result.feedback.contains("2023 Financial Report")); + assert!(result.feedback.contains("Revenue")); + assert!(result.feedback.contains("finance")); + } + + #[test] + fn test_ls_docs_empty() { + let tree = vectorless_document::DocumentTree::new("Empty", ""); + let nav = NavigationIndex::new(); + let ridx = ReasoningIndex::default(); + let docs = vec![crate::agent::config::DocContext { + tree: &tree, + nav_index: &nav, + reasoning_index: &ridx, + doc_name: "empty", + }]; + let ctx = WorkspaceContext::new(docs); + + let result = ls_docs(&ctx); + assert!(result.success); + assert!(result.feedback.contains("No documents with DocCards")); + } +} diff --git a/vectorless-core/vectorless-agent/src/tools/worker/cat.rs b/vectorless-core/vectorless-agent/src/tools/worker/cat.rs new file mode 100644 index 00000000..e675bfa0 --- /dev/null +++ b/vectorless-core/vectorless-agent/src/tools/worker/cat.rs @@ -0,0 +1,115 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! `cat` — read node content and collect as evidence. + +use crate::agent::command; +use crate::agent::config::{DocContext, Evidence}; +use crate::agent::state::WorkerState; + +use super::super::ToolResult; + +/// Execute `cat ` — read node content and collect as evidence. +/// +/// Special targets: +/// - `cat .` or `cat` (no arg) reads the current node's content. +/// - Otherwise resolves the target to a child node by name. +pub fn cat(target: &str, ctx: &DocContext, state: &mut WorkerState) -> ToolResult { + let node_id = if target == "." || target.is_empty() { + state.current_node + } else { + match command::resolve_target_extended(target, ctx.nav_index, state.current_node, ctx.tree) + { + Some(id) => id, + None => { + return ToolResult::fail(format!( + "Target '{}' not found. Use 'ls' to see children, or 'cat .' to read current node.", + target + )); + } + } + }; + + if state.has_evidence_for(node_id) { + let title = ctx.node_title(node_id).unwrap_or("unknown"); + return ToolResult::ok(format!( + "[Already collected: {}]. 
Use a different target or cd to another branch.", + title + )); + } + + match ctx.cat(node_id) { + Some(content) => { + let title = ctx.node_title(node_id).unwrap_or("unknown").to_string(); + let content_string = content.to_string(); + + state.add_evidence(Evidence { + source_path: format!("{}/{}", state.path_str(), title), + node_title: title.clone(), + content: content_string.clone(), + doc_name: Some(ctx.doc_name.to_string()), + }); + + state.collected_nodes.insert(node_id); + state.visited.insert(node_id); + + ToolResult::ok(format!( + "[Evidence collected: {}]\n{}", + title, content_string + )) + } + None => ToolResult::fail(format!("No content available for '{}'.", target)), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use vectorless_document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; + + fn build_test_tree() -> (DocumentTree, NavigationIndex, NodeId, NodeId, NodeId) { + let mut tree = DocumentTree::new("Root", "root content"); + let root = tree.root(); + let c1 = tree.add_child(root, "Getting Started", "gs content"); + let c2 = tree.add_child(root, "API Reference", "api content"); + + let mut nav = NavigationIndex::new(); + nav.add_child_routes( + root, + vec![ + ChildRoute { + node_id: c1, + title: "Getting Started".to_string(), + description: "Setup guide".to_string(), + leaf_count: 3, + }, + ChildRoute { + node_id: c2, + title: "API Reference".to_string(), + description: "API docs".to_string(), + leaf_count: 7, + }, + ], + ); + + (tree, nav, root, c1, c2) + } + + #[test] + fn test_cat_collects_evidence() { + let (tree, nav, root, _, _) = build_test_tree(); + let ctx = DocContext { + tree: &tree, + nav_index: &nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test", + }; + let mut state = WorkerState::new(root, 15); + + let result = cat("Getting Started", &ctx, &mut state); + assert!(result.success); + assert!(result.feedback.contains("Evidence collected")); + assert_eq!(state.evidence.len(), 1); + assert_eq!(state.evidence[0].content, "gs content"); + } +} diff --git a/vectorless-core/vectorless-agent/src/tools/worker/cd.rs b/vectorless-core/vectorless-agent/src/tools/worker/cd.rs new file mode 100644 index 00000000..be6f382f --- /dev/null +++ b/vectorless-core/vectorless-agent/src/tools/worker/cd.rs @@ -0,0 +1,262 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! `cd`, `cd_absolute`, `cd_up` — navigation commands. + +use crate::agent::command; +use crate::agent::config::DocContext; +use crate::agent::state::WorkerState; + +use super::super::ToolResult; + +/// Execute `cd ` — navigate into a child node. +/// +/// Supports: +/// - Relative names (child of current node): `cd "Getting Started"` +/// - Relative paths with `/`: `cd "Research Labs/Lab B"` +/// - Absolute paths starting with `/`: `cd /root/Chapter 1/Section 1.2` +pub fn cd(target: &str, ctx: &DocContext, state: &mut WorkerState) -> ToolResult { + if target.starts_with('/') { + return cd_absolute(target, ctx, state); + } + + // Relative path with segments: "Research Labs/Lab B" + if target.contains('/') { + return cd_relative_path(target, ctx, state); + } + + match command::resolve_target_extended(target, ctx.nav_index, state.current_node, ctx.tree) { + Some(node_id) => { + let title = ctx.node_title(node_id).unwrap_or(target).to_string(); + state.cd(node_id, &title); + ToolResult::ok(format!("Entered: {}", state.path_str())) + } + None => ToolResult::fail(format!( + "Target '{}' not found. 
Use ls to see available children.", + target + )), + } +} + +/// Navigate using a relative multi-segment path (e.g., `"Research Labs/Lab B"`). +fn cd_relative_path(path: &str, ctx: &DocContext, state: &mut WorkerState) -> ToolResult { + let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect(); + if segments.is_empty() { + return ToolResult::fail("Empty path.".to_string()); + } + + let mut current = state.current_node; + let mut breadcrumb = state.breadcrumb.clone(); + + for segment in &segments { + match command::resolve_target_extended(segment, ctx.nav_index, current, ctx.tree) { + Some(node_id) => { + let title = ctx.node_title(node_id).unwrap_or(*segment).to_string(); + breadcrumb.push(title); + current = node_id; + } + None => { + return ToolResult::fail(format!( + "Path segment '{}' not found at '/{}'. Use ls to see available children.", + segment, + breadcrumb.join("/") + )); + } + } + } + + state.breadcrumb = breadcrumb; + state.current_node = current; + state.visited.insert(current); + + ToolResult::ok(format!("Entered: {}", state.path_str())) +} + +/// Navigate using an absolute path (e.g., `/root/Chapter 1/Section 1.2`). +fn cd_absolute(path: &str, ctx: &DocContext, state: &mut WorkerState) -> ToolResult { + let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect(); + + if segments.is_empty() { + return ToolResult::fail("Empty absolute path.".to_string()); + } + + let root = ctx.root(); + let mut current = root; + + let start_idx = if !segments.is_empty() && segments[0].eq_ignore_ascii_case("root") { + 1 + } else { + 0 + }; + + let mut breadcrumb = vec!["root".to_string()]; + + for segment in &segments[start_idx..] { + match command::resolve_target_extended(segment, ctx.nav_index, current, ctx.tree) { + Some(node_id) => { + let title = ctx.node_title(node_id).unwrap_or(*segment).to_string(); + breadcrumb.push(title); + current = node_id; + } + None => { + return ToolResult::fail(format!( + "Path segment '{}' not found. Stopped at: /{}", + segment, + breadcrumb.join("/") + )); + } + } + } + + state.breadcrumb = breadcrumb; + state.current_node = current; + state.visited.insert(current); + + ToolResult::ok(format!("Entered: {}", state.path_str())) +} + +/// Execute `cd ..` — navigate back to parent. 
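+///
+/// At the root this still reports success ("Already at root."), so a stray
+/// `cd ..` never derails the navigation loop. Sketch (marked `ignore`):
+///
+/// ```ignore
+/// cd("Getting Started", &ctx, &mut state);
+/// let result = cd_up(&ctx, &mut state); // back at the root node
+/// assert!(result.success);
+/// ```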
+pub fn cd_up(ctx: &DocContext, state: &mut WorkerState) -> ToolResult { + match ctx.parent(state.current_node) { + Some(parent) => { + if state.cd_up(parent) { + ToolResult::ok(format!("Back to: {}", state.path_str())) + } else { + ToolResult::ok("Already at root.".to_string()) + } + } + None => ToolResult::ok("Already at root (no parent).".to_string()), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use vectorless_document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; + + fn build_test_tree() -> (DocumentTree, NavigationIndex, NodeId, NodeId, NodeId) { + let mut tree = DocumentTree::new("Root", "root content"); + let root = tree.root(); + let c1 = tree.add_child(root, "Getting Started", "gs content"); + let c2 = tree.add_child(root, "API Reference", "api content"); + + let mut nav = NavigationIndex::new(); + nav.add_child_routes( + root, + vec![ + ChildRoute { + node_id: c1, + title: "Getting Started".to_string(), + description: "Setup guide".to_string(), + leaf_count: 3, + }, + ChildRoute { + node_id: c2, + title: "API Reference".to_string(), + description: "API docs".to_string(), + leaf_count: 7, + }, + ], + ); + + (tree, nav, root, c1, c2) + } + + #[test] + fn test_cd_navigates() { + let (tree, nav, root, c1, _) = build_test_tree(); + let ctx = DocContext { + tree: &tree, + nav_index: &nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test", + }; + let mut state = WorkerState::new(root, 15); + + let result = cd("Getting Started", &ctx, &mut state); + assert!(result.success); + assert_eq!(state.current_node, c1); + assert!(state.path_str().contains("Getting Started")); + } + + #[test] + fn test_cd_up_goes_back() { + let (tree, nav, root, _c1, _) = build_test_tree(); + let ctx = DocContext { + tree: &tree, + nav_index: &nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test", + }; + let mut state = WorkerState::new(root, 15); + + cd("Getting Started", &ctx, &mut state); + let result = cd_up(&ctx, &mut state); + assert!(result.success); + assert_eq!(state.current_node, root); + } + + fn build_deep_tree() -> (DocumentTree, NavigationIndex, NodeId, NodeId, NodeId) { + // Root → "Research Labs" → "Lab B" + let mut tree = DocumentTree::new("Root", "root content"); + let root = tree.root(); + let section = tree.add_child(root, "Research Labs", "section content"); + let lab_b = tree.add_child(section, "Lab B", "lab b content"); + + let mut nav = NavigationIndex::new(); + nav.add_child_routes( + root, + vec![ChildRoute { + node_id: section, + title: "Research Labs".to_string(), + description: "Lab sections".to_string(), + leaf_count: 4, + }], + ); + nav.add_child_routes( + section, + vec![ChildRoute { + node_id: lab_b, + title: "Lab B".to_string(), + description: "Topological qubits".to_string(), + leaf_count: 1, + }], + ); + + (tree, nav, root, section, lab_b) + } + + #[test] + fn test_cd_relative_path() { + let (tree, nav, root, _, lab_b) = build_deep_tree(); + let ctx = DocContext { + tree: &tree, + nav_index: &nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test", + }; + let mut state = WorkerState::new(root, 15); + + let result = cd("Research Labs/Lab B", &ctx, &mut state); + assert!(result.success); + assert_eq!(state.current_node, lab_b); + assert!(state.path_str().contains("Research Labs")); + assert!(state.path_str().contains("Lab B")); + } + + #[test] + fn test_cd_relative_path_partial_fail() { + let (tree, nav, root, _, _) = build_deep_tree(); + let ctx = 
DocContext { + tree: &tree, + nav_index: &nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test", + }; + let mut state = WorkerState::new(root, 15); + + let result = cd("Research Labs/Nonexistent", &ctx, &mut state); + assert!(!result.success); + assert!(result.feedback.contains("Nonexistent")); + } +} diff --git a/vectorless-core/vectorless-agent/src/tools/worker/find.rs b/vectorless-core/vectorless-agent/src/tools/worker/find.rs new file mode 100644 index 00000000..0db5dfd2 --- /dev/null +++ b/vectorless-core/vectorless-agent/src/tools/worker/find.rs @@ -0,0 +1,128 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! `find_tree` — search for nodes by title pattern across the entire tree. + +use crate::agent::config::DocContext; + +use super::super::ToolResult; + +/// Execute `findtree ` — search for nodes by title pattern across the entire tree. +/// +/// Returns all nodes whose title contains the pattern (case-insensitive). +pub fn find_tree(pattern: &str, ctx: &DocContext) -> ToolResult { + let pattern_lower = pattern.to_lowercase(); + let all_nodes = ctx.tree.traverse(); + + let mut results = Vec::new(); + for node_id in &all_nodes { + if let Some(node) = ctx.tree.get(*node_id) { + if node.title.to_lowercase().contains(&pattern_lower) { + let depth = ctx.tree.depth(*node_id); + let leaf_count = ctx.nav_entry(*node_id).map(|e| e.leaf_count).unwrap_or(0); + results.push((node.title.clone(), depth, leaf_count)); + } + } + } + + if results.is_empty() { + return ToolResult::ok(format!("No nodes matching '{}'.", pattern)); + } + + let mut output = format!("Nodes matching '{}' ({} found):\n", pattern, results.len()); + for (title, depth, leaves) in &results { + output.push_str(&format!( + " - {} (depth {}, {} leaves)\n", + title, depth, leaves + )); + } + + ToolResult::ok(output) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::agent::config::DocContext; + use vectorless_document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; + + fn build_rich_tree() -> (DocumentTree, NavigationIndex, NodeId) { + let mut tree = DocumentTree::new( + "Root", + "Welcome to the financial report.\nThis document covers 2024 and 2023 figures.", + ); + let root = tree.root(); + let c1 = tree.add_child( + root, + "Revenue", + "Total revenue in 2024 was $10.2M.\nQ1 revenue: $2.5M\nQ2 revenue: $2.8M\nEBITDA margin: 32%", + ); + let c2 = tree.add_child( + root, + "Expenses", + "Operating expenses totaled $6.8M.\nR&D spending: $3.1M\nMarketing: $1.2M", + ); + + let mut nav = NavigationIndex::new(); + nav.add_child_routes( + root, + vec![ + ChildRoute { + node_id: c1, + title: "Revenue".to_string(), + description: "Revenue breakdown".to_string(), + leaf_count: 2, + }, + ChildRoute { + node_id: c2, + title: "Expenses".to_string(), + description: "Cost analysis".to_string(), + leaf_count: 2, + }, + ], + ); + + (tree, nav, root) + } + + macro_rules! 
rich_ctx { + ($tree:expr, $nav:expr) => { + DocContext { + tree: &$tree, + nav_index: &$nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test", + } + }; + } + + #[test] + fn test_find_tree() { + let (tree, nav, _root) = build_rich_tree(); + let ctx = rich_ctx!(tree, nav); + + let result = find_tree("revenue", &ctx); + assert!(result.success); + assert!(result.feedback.contains("Revenue")); + } + + #[test] + fn test_find_tree_case_insensitive() { + let (tree, nav, _root) = build_rich_tree(); + let ctx = rich_ctx!(tree, nav); + + let result = find_tree("EXPENSE", &ctx); + assert!(result.success); + assert!(result.feedback.contains("Expenses")); + } + + #[test] + fn test_find_tree_no_match() { + let (tree, nav, _root) = build_rich_tree(); + let ctx = rich_ctx!(tree, nav); + + let result = find_tree("nonexistent_xyz", &ctx); + assert!(result.success); + assert!(result.feedback.contains("No nodes matching")); + } +} diff --git a/vectorless-core/vectorless-agent/src/tools/worker/grep.rs b/vectorless-core/vectorless-agent/src/tools/worker/grep.rs new file mode 100644 index 00000000..be609c7d --- /dev/null +++ b/vectorless-core/vectorless-agent/src/tools/worker/grep.rs @@ -0,0 +1,175 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! `grep` — regex search across all node content in the current subtree. + +use crate::agent::config::DocContext; +use crate::agent::state::WorkerState; + +use super::super::ToolResult; +use super::collect_subtree; + +/// Execute `grep ` — regex search across all node content in the current subtree. +/// +/// Searches content of the current node and all descendants. Returns matching lines +/// with their node titles, capped at 30 matches to avoid overwhelming feedback. +pub fn grep(pattern: &str, ctx: &DocContext, state: &WorkerState) -> ToolResult { + let re = match regex::Regex::new(pattern) { + Ok(re) => re, + Err(e) => return ToolResult::fail(format!("Invalid regex '{}': {}", pattern, e)), + }; + + let subtree = collect_subtree(state.current_node, ctx.tree); + let mut matches_found = 0; + let mut output = String::new(); + let max_matches = 30; + + for node_id in &subtree { + if matches_found >= max_matches { + output.push_str("\n... 
(truncated, more matches available)"); + break; + } + + let content = match ctx.cat(*node_id) { + Some(c) if !c.is_empty() => c, + _ => continue, + }; + + let title = ctx.node_title(*node_id).unwrap_or("?"); + + for line in content.lines() { + if matches_found >= max_matches { + break; + } + if re.is_match(line) { + output.push_str(&format!("[{}] {}\n", title, line)); + matches_found += 1; + } + } + } + + if matches_found == 0 { + ToolResult::ok(format!("No matches for /{}/ in subtree.", pattern)) + } else { + ToolResult::ok(format!( + "Found {} match(es) for /{}/:\n{}", + matches_found, pattern, output + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::agent::config::DocContext; + use crate::agent::state::WorkerState; + use vectorless_document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; + + fn build_rich_tree() -> (DocumentTree, NavigationIndex, NodeId) { + let mut tree = DocumentTree::new( + "Root", + "Welcome to the financial report.\nThis document covers 2024 and 2023 figures.", + ); + let root = tree.root(); + let c1 = tree.add_child( + root, + "Revenue", + "Total revenue in 2024 was $10.2M.\nQ1 revenue: $2.5M\nQ2 revenue: $2.8M\nEBITDA margin: 32%", + ); + let c2 = tree.add_child( + root, + "Expenses", + "Operating expenses totaled $6.8M.\nR&D spending: $3.1M\nMarketing: $1.2M", + ); + + let mut nav = NavigationIndex::new(); + nav.add_child_routes( + root, + vec![ + ChildRoute { + node_id: c1, + title: "Revenue".to_string(), + description: "Revenue breakdown".to_string(), + leaf_count: 2, + }, + ChildRoute { + node_id: c2, + title: "Expenses".to_string(), + description: "Cost analysis".to_string(), + leaf_count: 2, + }, + ], + ); + + (tree, nav, root) + } + + macro_rules! rich_ctx { + ($tree:expr, $nav:expr) => { + DocContext { + tree: &$tree, + nav_index: &$nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test", + } + }; + } + + #[test] + fn test_grep_finds_matches() { + let (tree, nav, root) = build_rich_tree(); + let ctx = rich_ctx!(tree, nav); + let state = WorkerState::new(root, 15); + + let result = grep("revenue", &ctx, &state); + assert!(result.success); + assert!(result.feedback.contains("revenue")); + assert!(result.feedback.contains("[Revenue]")); + } + + #[test] + fn test_grep_regex() { + let (tree, nav, root) = build_rich_tree(); + let ctx = rich_ctx!(tree, nav); + let state = WorkerState::new(root, 15); + + let result = grep("EBITDA|\\$\\d+", &ctx, &state); + assert!(result.success); + assert!(result.feedback.contains("EBITDA")); + assert!(result.feedback.contains("$10")); + } + + #[test] + fn test_grep_no_matches() { + let (tree, nav, root) = build_rich_tree(); + let ctx = rich_ctx!(tree, nav); + let state = WorkerState::new(root, 15); + + let result = grep("nonexistent_term_xyz", &ctx, &state); + assert!(result.success); + assert!(result.feedback.contains("No matches")); + } + + #[test] + fn test_grep_invalid_regex() { + let (tree, nav, root) = build_rich_tree(); + let ctx = rich_ctx!(tree, nav); + let state = WorkerState::new(root, 15); + + let result = grep("[invalid", &ctx, &state); + assert!(!result.success); + assert!(result.feedback.contains("Invalid regex")); + } + + #[test] + fn test_grep_subtree_only() { + let (tree, nav, root) = build_rich_tree(); + let ctx = rich_ctx!(tree, nav); + let mut state = WorkerState::new(root, 15); + + crate::agent::tools::worker::cd::cd("Expenses", &ctx, &mut state); + let result = grep("revenue", &ctx, &state); + assert!(result.success); + 
assert!(result.feedback.contains("No matches")); + } +} diff --git a/vectorless-core/vectorless-agent/src/tools/worker/head.rs b/vectorless-core/vectorless-agent/src/tools/worker/head.rs new file mode 100644 index 00000000..000f0c10 --- /dev/null +++ b/vectorless-core/vectorless-agent/src/tools/worker/head.rs @@ -0,0 +1,119 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! `head` — preview first N lines of a node without collecting evidence. + +use crate::agent::command; +use crate::agent::config::DocContext; +use crate::agent::state::WorkerState; + +use super::super::ToolResult; + +/// Execute `head ` — preview first N lines of a node without collecting evidence. +pub fn head(target: &str, lines: usize, ctx: &DocContext, state: &WorkerState) -> ToolResult { + let node_id = + match command::resolve_target_extended(target, ctx.nav_index, state.current_node, ctx.tree) + { + Some(id) => id, + None => { + return ToolResult::fail(format!( + "Target '{}' not found. Use ls to see available children.", + target + )); + } + }; + + let content = match ctx.cat(node_id) { + Some(c) => c, + None => return ToolResult::fail(format!("No content for '{}'.", target)), + }; + + let title = ctx.node_title(node_id).unwrap_or("unknown"); + let total_lines = content.lines().count(); + let preview: Vec<&str> = content.lines().take(lines).collect(); + + let mut output = format!( + "[Preview: {} — showing {}/{} lines]\n", + title, + preview.len().min(lines), + total_lines + ); + output.push_str(&preview.join("\n")); + + if total_lines > lines { + output.push_str(&format!( + "\n... ({} more lines, use cat to read all)", + total_lines - lines + )); + } + + ToolResult::ok(output) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::agent::config::DocContext; + use crate::agent::state::WorkerState; + use vectorless_document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; + + fn build_rich_tree() -> (DocumentTree, NavigationIndex, NodeId) { + let mut tree = DocumentTree::new( + "Root", + "Welcome to the financial report.\nThis document covers 2024 and 2023 figures.", + ); + let root = tree.root(); + let c1 = tree.add_child( + root, + "Revenue", + "Total revenue in 2024 was $10.2M.\nQ1 revenue: $2.5M\nQ2 revenue: $2.8M\nEBITDA margin: 32%", + ); + + let mut nav = NavigationIndex::new(); + nav.add_child_routes( + root, + vec![ChildRoute { + node_id: c1, + title: "Revenue".to_string(), + description: "Revenue breakdown".to_string(), + leaf_count: 2, + }], + ); + + (tree, nav, root) + } + + macro_rules! 
rich_ctx { + ($tree:expr, $nav:expr) => { + DocContext { + tree: &$tree, + nav_index: &$nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test", + } + }; + } + + #[test] + fn test_head_preview() { + let (tree, nav, root) = build_rich_tree(); + let ctx = rich_ctx!(tree, nav); + let state = WorkerState::new(root, 15); + + let result = head("Revenue", 2, &ctx, &state); + assert!(result.success); + assert!(result.feedback.contains("Preview")); + assert!(result.feedback.contains("$10.2M")); + assert!(result.feedback.contains("2/4 lines")); + } + + #[test] + fn test_head_not_found() { + let (tree, nav, root) = build_rich_tree(); + let ctx = rich_ctx!(tree, nav); + let state = WorkerState::new(root, 15); + + let result = head("NonExistent", 10, &ctx, &state); + assert!(!result.success); + } +} diff --git a/vectorless-core/vectorless-agent/src/tools/worker/ls.rs b/vectorless-core/vectorless-agent/src/tools/worker/ls.rs new file mode 100644 index 00000000..c06914ea --- /dev/null +++ b/vectorless-core/vectorless-agent/src/tools/worker/ls.rs @@ -0,0 +1,124 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! `ls` — list children of the current node. + +use crate::agent::config::DocContext; +use crate::agent::state::WorkerState; + +use super::super::ToolResult; + +/// Execute `ls` — list children of the current node. +pub fn ls(ctx: &DocContext, state: &WorkerState) -> ToolResult { + let mut output = String::new(); + + if let Some(entry) = ctx.nav_entry(state.current_node) { + output.push_str(&format!("Current section: {}\n", entry.overview)); + if !entry.question_hints.is_empty() { + output.push_str(&format!( + "Can answer: {}\n", + entry.question_hints.join(", ") + )); + } + output.push('\n'); + } + + match ctx.ls(state.current_node) { + Some(routes) => { + if routes.is_empty() { + output + .push_str("(leaf node — no children)\nUse cd .. to go back or done to finish."); + return ToolResult::ok(output); + } + + for (i, route) in routes.iter().enumerate() { + if route.title == route.description { + output.push_str(&format!( + "[{}] {} ({} leaves)", + i + 1, + route.title, + route.leaf_count + )); + } else { + output.push_str(&format!( + "[{}] {} — {} ({} leaves)", + i + 1, + route.title, + route.description, + route.leaf_count + )); + } + if let Some(nav) = ctx.nav_entry(route.node_id) { + if !nav.question_hints.is_empty() { + output.push_str(&format!( + "\n Can answer: {}", + nav.question_hints.join(", ") + )); + } + if !nav.topic_tags.is_empty() { + output.push_str(&format!("\n Topics: {}", nav.topic_tags.join(", "))); + } + } + output.push('\n'); + } + ToolResult::ok(output) + } + None => { + output.push_str( + "(no navigation data for this node)\nUse cat to read content or cd .. 
to go back.", + ); + ToolResult::ok(output) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use vectorless_document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; + + fn build_test_tree() -> (DocumentTree, NavigationIndex, NodeId, NodeId, NodeId) { + let mut tree = DocumentTree::new("Root", "root content"); + let root = tree.root(); + let c1 = tree.add_child(root, "Getting Started", "gs content"); + let c2 = tree.add_child(root, "API Reference", "api content"); + + let mut nav = NavigationIndex::new(); + nav.add_child_routes( + root, + vec![ + ChildRoute { + node_id: c1, + title: "Getting Started".to_string(), + description: "Setup guide".to_string(), + leaf_count: 3, + }, + ChildRoute { + node_id: c2, + title: "API Reference".to_string(), + description: "API docs".to_string(), + leaf_count: 7, + }, + ], + ); + + (tree, nav, root, c1, c2) + } + + #[test] + fn test_ls_shows_children() { + let (tree, nav, root, _, _) = build_test_tree(); + let ctx = DocContext { + tree: &tree, + nav_index: &nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test", + }; + let state = WorkerState::new(root, 15); + + let result = ls(&ctx, &state); + assert!(result.success); + assert!(result.feedback.contains("Getting Started")); + assert!(result.feedback.contains("API Reference")); + } +} diff --git a/vectorless-core/vectorless-agent/src/tools/worker/mod.rs b/vectorless-core/vectorless-agent/src/tools/worker/mod.rs new file mode 100644 index 00000000..4a9fc6e6 --- /dev/null +++ b/vectorless-core/vectorless-agent/src/tools/worker/mod.rs @@ -0,0 +1,39 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Worker tools: ls, cd, cd_up, cat, pwd, grep, head, find_tree, wc. + +mod cat; +mod cd; +mod find; +mod grep; +mod head; +mod ls; +mod pwd; +mod wc; + +pub use cat::cat; +pub use cd::{cd, cd_up}; +pub use find::find_tree; +pub use grep::grep; +pub use head::head; +pub use ls::ls; +pub use pwd::pwd; +pub use wc::wc; + +use vectorless_document::{DocumentTree, NodeId}; + +/// Collect all NodeIds in the subtree rooted at `node` (inclusive). +pub(super) fn collect_subtree(node: NodeId, tree: &DocumentTree) -> Vec { + let mut result = vec![node]; + let mut stack = vec![node]; + + while let Some(current) = stack.pop() { + for child in tree.children_iter(current) { + result.push(child); + stack.push(child); + } + } + + result +} diff --git a/vectorless-core/vectorless-agent/src/tools/worker/pwd.rs b/vectorless-core/vectorless-agent/src/tools/worker/pwd.rs new file mode 100644 index 00000000..eb28cc2e --- /dev/null +++ b/vectorless-core/vectorless-agent/src/tools/worker/pwd.rs @@ -0,0 +1,58 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! `pwd` — show current navigation path. + +use crate::agent::state::WorkerState; + +use super::super::ToolResult; + +/// Execute `pwd` — show current navigation path. 
+pub fn pwd(state: &WorkerState) -> ToolResult { + ToolResult::ok(format!("Current path: {}", state.path_str())) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::agent::config::DocContext; + use crate::agent::tools::worker::cd::cd; + use vectorless_document::{ChildRoute, DocumentTree, NavigationIndex}; + + fn build_test_tree() -> (DocumentTree, NavigationIndex) { + let mut tree = DocumentTree::new("Root", "root content"); + let root = tree.root(); + let c1 = tree.add_child(root, "API Reference", "api content"); + + let mut nav = NavigationIndex::new(); + nav.add_child_routes( + root, + vec![ChildRoute { + node_id: c1, + title: "API Reference".to_string(), + description: "API docs".to_string(), + leaf_count: 7, + }], + ); + + (tree, nav) + } + + #[test] + fn test_pwd() { + let (tree, nav) = build_test_tree(); + let root = tree.root(); + let ctx = DocContext { + tree: &tree, + nav_index: &nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test", + }; + let mut state = WorkerState::new(root, 15); + cd("API Reference", &ctx, &mut state); + + let result = pwd(&state); + assert!(result.success); + assert!(result.feedback.contains("API Reference")); + } +} diff --git a/vectorless-core/vectorless-agent/src/tools/worker/wc.rs b/vectorless-core/vectorless-agent/src/tools/worker/wc.rs new file mode 100644 index 00000000..3dc19782 --- /dev/null +++ b/vectorless-core/vectorless-agent/src/tools/worker/wc.rs @@ -0,0 +1,109 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! `wc` — show node content statistics. + +use crate::agent::command; +use crate::agent::config::DocContext; +use crate::agent::state::WorkerState; + +use super::super::ToolResult; + +/// Execute `wc ` — show node content statistics. +pub fn wc(target: &str, ctx: &DocContext, state: &WorkerState) -> ToolResult { + let node_id = + match command::resolve_target_extended(target, ctx.nav_index, state.current_node, ctx.tree) + { + Some(id) => id, + None => { + return ToolResult::fail(format!( + "Target '{}' not found. Use ls to see available children.", + target + )); + } + }; + + let content = match ctx.cat(node_id) { + Some(c) => c, + None => return ToolResult::fail(format!("No content for '{}'.", target)), + }; + + let title = ctx.node_title(node_id).unwrap_or("unknown"); + let lines = content.lines().count(); + let words = content.split_whitespace().count(); + let chars = content.len(); + + ToolResult::ok(format!( + "[{}] {} lines, {} words, {} chars", + title, lines, words, chars + )) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::agent::config::DocContext; + use crate::agent::state::WorkerState; + use vectorless_document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; + + fn build_rich_tree() -> (DocumentTree, NavigationIndex, NodeId) { + let mut tree = DocumentTree::new( + "Root", + "Welcome to the financial report.\nThis document covers 2024 and 2023 figures.", + ); + let root = tree.root(); + let c1 = tree.add_child( + root, + "Revenue", + "Total revenue in 2024 was $10.2M.\nQ1 revenue: $2.5M\nQ2 revenue: $2.8M\nEBITDA margin: 32%", + ); + + let mut nav = NavigationIndex::new(); + nav.add_child_routes( + root, + vec![ChildRoute { + node_id: c1, + title: "Revenue".to_string(), + description: "Revenue breakdown".to_string(), + leaf_count: 2, + }], + ); + + (tree, nav, root) + } + + macro_rules! 
rich_ctx { + ($tree:expr, $nav:expr) => { + DocContext { + tree: &$tree, + nav_index: &$nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test", + } + }; + } + + #[test] + fn test_wc_stats() { + let (tree, nav, root) = build_rich_tree(); + let ctx = rich_ctx!(tree, nav); + let state = WorkerState::new(root, 15); + + let result = wc("Revenue", &ctx, &state); + assert!(result.success); + assert!(result.feedback.contains("Revenue")); + assert!(result.feedback.contains("lines")); + assert!(result.feedback.contains("words")); + assert!(result.feedback.contains("chars")); + } + + #[test] + fn test_wc_not_found() { + let (tree, nav, root) = build_rich_tree(); + let ctx = rich_ctx!(tree, nav); + let state = WorkerState::new(root, 15); + + let result = wc("NonExistent", &ctx, &state); + assert!(!result.success); + } +} diff --git a/vectorless-core/vectorless-agent/src/worker/execute.rs b/vectorless-core/vectorless-agent/src/worker/execute.rs new file mode 100644 index 00000000..12ac88e5 --- /dev/null +++ b/vectorless-core/vectorless-agent/src/worker/execute.rs @@ -0,0 +1,278 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Command execution — dispatch parsed Command to tool functions. + +use tracing::{info, warn}; + +use vectorless_llm::LlmClient; + +use super::super::command::{Command, parse_command}; +use super::super::config::{DocContext, Step}; +use super::super::events::EventEmitter; +use super::super::prompts::{check_sufficiency, parse_sufficiency_response}; +use super::super::state::WorkerState; +use super::super::tools::worker as tools; + +/// Execute a single parsed command, mutating state. +/// +/// Returns a `Step` indicating whether to continue or stop. +pub async fn execute_command( + command: &Command, + ctx: &DocContext<'_>, + state: &mut WorkerState, + query: &str, + llm: &LlmClient, + llm_calls: &mut u32, + emitter: &EventEmitter, +) -> Step { + info!( + doc = ctx.doc_name, + command = ?command, + "Executing tool" + ); + match command { + Command::Ls => { + let result = tools::ls(ctx, state); + info!(doc = ctx.doc_name, feedback = %truncate_log(&result.feedback), "ls result"); + state.set_feedback(result.feedback); + Step::Continue + } + + Command::Cd { target } => { + let result = tools::cd(target, ctx, state); + info!(doc = ctx.doc_name, target, feedback = %truncate_log(&result.feedback), "cd result"); + state.set_feedback(result.feedback); + Step::Continue + } + + Command::CdUp => { + let result = tools::cd_up(ctx, state); + info!(doc = ctx.doc_name, feedback = %truncate_log(&result.feedback), "cd_up result"); + state.set_feedback(result.feedback); + Step::Continue + } + + Command::Cat { target } => { + let evidence_before = state.evidence.len(); + let result = tools::cat(target, ctx, state); + info!(doc = ctx.doc_name, target, feedback = %truncate_log(&result.feedback), "cat result"); + state.set_feedback(result.feedback); + if state.evidence.len() > evidence_before { + if let Some(ev) = state.evidence.last() { + info!( + doc = ctx.doc_name, + node = %ev.node_title, + path = %ev.source_path, + len = ev.content.len(), + total = state.evidence.len(), + "Evidence collected" + ); + emitter.emit_evidence( + ctx.doc_name, + &ev.node_title, + &ev.source_path, + ev.content.len(), + state.evidence.len(), + ); + } + } + Step::Continue + } + + Command::Find { keyword } => { + let feedback = match ctx.find(keyword) { + Some(hit) => { + let mut entries = hit.entries.clone(); + entries.sort_by(|a, b| { + 
b.weight + .partial_cmp(&a.weight) + .unwrap_or(std::cmp::Ordering::Equal) + }); + let mut seen_nodes = std::collections::HashSet::new(); + let mut output = format!("Results for '{}':\n", keyword); + for entry in &entries { + if !seen_nodes.insert(entry.node_id) { + continue; + } + let title = ctx.node_title(entry.node_id).unwrap_or("unknown"); + output.push_str(&format!( + " - {} (depth {}, weight {:.2})", + title, entry.depth, entry.weight + )); + if let Some(content) = ctx.cat(entry.node_id) { + if let Some(snippet) = + super::super::tools::content_snippet(content, keyword, 300) + { + output.push_str(&format!("\n \"{}\"", snippet)); + } + } + output.push('\n'); + } + output + } + None => { + // Fallback: search node titles (like findtree) with content snippets + let pattern_lower = keyword.to_lowercase(); + let all_nodes = ctx.tree.traverse(); + let mut results = Vec::new(); + for node_id in &all_nodes { + if let Some(node) = ctx.tree.get(*node_id) { + if node.title.to_lowercase().contains(&pattern_lower) { + let depth = ctx.tree.depth(*node_id); + results.push((node.title.clone(), *node_id, depth)); + } + } + } + if results.is_empty() { + format!("No results for '{}' in index or titles.", keyword) + } else { + let mut output = format!( + "Results for '{}' (title match, {} found):\n", + keyword, + results.len() + ); + for (title, node_id, depth) in &results { + output.push_str(&format!(" - {} (depth {})", title, depth)); + if let Some(content) = ctx.cat(*node_id) { + if let Some(snippet) = + super::super::tools::content_snippet(content, keyword, 300) + { + output.push_str(&format!("\n \"{}\"", snippet)); + } + } + output.push('\n'); + } + output + } + } + }; + info!(doc = ctx.doc_name, keyword, feedback = %truncate_log(&feedback), "find result"); + state.set_feedback(feedback); + Step::Continue + } + + Command::Pwd => { + let result = tools::pwd(state); + state.set_feedback(result.feedback); + Step::Continue + } + + Command::Check => { + let evidence_text = state.evidence_for_check(); + + let (system, user) = check_sufficiency(query, &evidence_text); + + info!( + doc = ctx.doc_name, + system = %system, + user = %user, + "Check prompt" + ); + + match llm.complete(&system, &user).await { + Ok(response) => { + *llm_calls += 1; + state.check_count += 1; + let sufficient = parse_sufficiency_response(&response); + info!( + doc = ctx.doc_name, + sufficient, + evidence = state.evidence.len(), + response = %response, + "Sufficiency check" + ); + emitter.emit_worker_sufficiency_check( + ctx.doc_name, + sufficient, + state.evidence.len(), + None, + ); + if sufficient { + state.last_feedback = + "Evidence is sufficient. 
Use done to finish.".to_string();
+                        Step::Done
+                    } else {
+                        let reason = response
+                            .trim()
+                            .strip_prefix("INSUFFICIENT")
+                            .unwrap_or(response.trim())
+                            .trim()
+                            .trim_start_matches(|c: char| c == '-' || c == ' ');
+                        if !reason.is_empty() {
+                            state.missing_info = reason.to_string();
+                        }
+                        state.set_feedback(format!(
+                            "Evidence not yet sufficient: {}",
+                            response.trim()
+                        ));
+                        Step::Continue
+                    }
+                }
+                Err(e) => {
+                    warn!(error = %e, "Check LLM call failed");
+                    state.last_feedback = "Could not evaluate sufficiency.".to_string();
+                    Step::Continue
+                }
+            }
+        }
+
+        Command::Done => {
+            state.last_feedback = "Navigation complete.".to_string();
+            Step::Done
+        }
+
+        Command::Grep { pattern } => {
+            let result = tools::grep(pattern, ctx, state);
+            info!(doc = ctx.doc_name, pattern, feedback = %truncate_log(&result.feedback), "grep result");
+            state.set_feedback(result.feedback);
+            Step::Continue
+        }
+
+        Command::Head { target, lines } => {
+            let result = tools::head(target, *lines, ctx, state);
+            info!(doc = ctx.doc_name, target, lines, feedback = %truncate_log(&result.feedback), "head result");
+            state.set_feedback(result.feedback);
+            Step::Continue
+        }
+
+        Command::FindTree { pattern } => {
+            let result = tools::find_tree(pattern, ctx);
+            info!(doc = ctx.doc_name, pattern, feedback = %truncate_log(&result.feedback), "find_tree result");
+            state.set_feedback(result.feedback);
+            Step::Continue
+        }
+
+        Command::Wc { target } => {
+            let result = tools::wc(target, ctx, state);
+            info!(doc = ctx.doc_name, target, feedback = %truncate_log(&result.feedback), "wc result");
+            state.set_feedback(result.feedback);
+            Step::Continue
+        }
+    }
+}
+
+/// Truncate feedback for log output — keep roughly the first 300 bytes, cut
+/// at a char boundary so multi-byte UTF-8 content never panics the slice.
+fn truncate_log(s: &str) -> std::borrow::Cow<'_, str> {
+    const MAX: usize = 300;
+    if s.len() <= MAX {
+        std::borrow::Cow::Borrowed(s)
+    } else {
+        let boundary = s.ceil_char_boundary(MAX);
+        std::borrow::Cow::Owned(format!(
+            "{}...(truncated, {} bytes total)",
+            &s[..boundary],
+            s.len()
+        ))
+    }
+}
+
+/// Parse the LLM output and detect parse failures.
+///
+/// Returns `(command, is_parse_failure)`.
+pub fn parse_and_detect_failure(llm_output: &str) -> (Command, bool) {
+    let command = parse_command(llm_output);
+    let trimmed = llm_output.trim();
+    let is_parse_failure =
+        matches!(command, Command::Ls) && !trimmed.starts_with("ls") && !trimmed.is_empty();
+    (command, is_parse_failure)
+}
diff --git a/vectorless-core/vectorless-agent/src/worker/format.rs b/vectorless-core/vectorless-agent/src/worker/format.rs
new file mode 100644
index 00000000..be9e029f
--- /dev/null
+++ b/vectorless-core/vectorless-agent/src/worker/format.rs
@@ -0,0 +1,20 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Formatting helpers for Worker prompts.
+
+use super::super::config::DocContext;
+use super::super::state::WorkerState;
+
+/// Resolve visited NodeIds to their titles for prompt injection.
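+///
+/// Returns `"(none)"` when nothing has been visited; otherwise a
+/// comma-separated title list (illustrative):
+///
+/// ```text
+/// Getting Started, API Reference
+/// ```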
+pub fn format_visited_titles(state: &WorkerState, ctx: &DocContext<'_>) -> String {
+    if state.visited.is_empty() {
+        return "(none)".to_string();
+    }
+    state
+        .visited
+        .iter()
+        .filter_map(|&node_id| ctx.node_title(node_id).map(|t| t.to_string()))
+        .collect::<Vec<_>>()
+        .join(", ")
+}
diff --git a/vectorless-core/vectorless-agent/src/worker/mod.rs b/vectorless-core/vectorless-agent/src/worker/mod.rs
new file mode 100644
index 00000000..d4ac5453
--- /dev/null
+++ b/vectorless-core/vectorless-agent/src/worker/mod.rs
@@ -0,0 +1,236 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Worker agent — document navigation and evidence collection.
+//!
+//! The Worker is a consuming-self struct implementing [`Agent`]:
+//! 1. Bird's-eye: ls(root) for initial overview
+//! 2. Navigation planning: LLM generates a plan (keyword hits as context)
+//! 3. Navigation loop: LLM → parse → execute → repeat (max N rounds)
+//!
+//! Dispatched by the Orchestrator, one per document.
+//! Returns raw evidence — no answer synthesis. Rerank owns all answer generation.
+
+mod execute;
+mod format;
+mod navigation;
+mod planning;
+
+use tracing::info;
+
+use super::Agent;
+use super::config::{DocContext, WorkerConfig, WorkerOutput};
+use super::context::FindHit;
+use super::events::EventEmitter;
+use super::state::WorkerState;
+use super::tools::worker as tools;
+use vectorless_error::Error;
+use vectorless_llm::LlmClient;
+use vectorless_query::QueryPlan;
+use vectorless_scoring::bm25::extract_keywords;
+
+use navigation::run_navigation_loop;
+use planning::build_plan_prompt;
+
+/// Worker agent — navigates a single document to collect evidence.
+///
+/// Holds all execution context. Calling [`run()`](Agent::run) consumes self.
+pub struct Worker<'a> {
+    query: String,
+    task: Option<String>,
+    ctx: &'a DocContext<'a>,
+    config: WorkerConfig,
+    llm: LlmClient,
+    emitter: EventEmitter,
+    query_plan: QueryPlan,
+}
+
+impl<'a> Worker<'a> {
+    /// Create a new Worker.
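+    ///
+    /// Construction sketch (marked `ignore`; `ctx`, `llm`, `emitter`, and
+    /// `plan` stand in for values built elsewhere in the engine):
+    ///
+    /// ```ignore
+    /// let worker = Worker::new(query, Some(task), &ctx, config, llm, emitter, plan);
+    /// let output = worker.run().await?; // consumes the Worker
+    /// ```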
+    pub fn new(
+        query: &str,
+        task: Option<&str>,
+        ctx: &'a DocContext<'a>,
+        config: WorkerConfig,
+        llm: LlmClient,
+        emitter: EventEmitter,
+        query_plan: QueryPlan,
+    ) -> Self {
+        Self {
+            query: query.to_string(),
+            task: task.map(|s| s.to_string()),
+            ctx,
+            config,
+            llm,
+            emitter,
+            query_plan,
+        }
+    }
+}
+
+impl<'a> Agent for Worker<'a> {
+    type Output = WorkerOutput;
+
+    fn name(&self) -> &str {
+        "worker"
+    }
+
+    async fn run(self) -> vectorless_error::Result<Self::Output> {
+        let Worker {
+            query,
+            task,
+            ctx,
+            config,
+            llm,
+            emitter,
+            query_plan,
+        } = self;
+        let task_ref = task.as_deref();
+
+        let intent_context = format!("{} — {}", query_plan.intent, query_plan.strategy_hint);
+
+        emitter.emit_worker_started(ctx.doc_name, task_ref, config.max_rounds);
+
+        info!(
+            doc = ctx.doc_name,
+            task = task_ref.unwrap_or("(full query)"),
+            max_rounds = config.max_rounds,
+            max_llm_calls = config.max_llm_calls,
+            "Worker starting"
+        );
+
+        let mut llm_calls: u32 = 0;
+
+        // Gather keyword hits as context for LLM planning (not routing rules)
+        let keywords = extract_keywords(&query);
+        let index_hits: Vec<FindHit> = ctx.find_all(&keywords);
+        if !index_hits.is_empty() {
+            tracing::debug!(
+                doc = ctx.doc_name,
+                hit_count = index_hits.len(),
+                "ReasoningIndex keyword hits available for planning"
+            );
+        }
+
+        // --- Phase 1: Bird's-eye view ---
+        let mut state = WorkerState::new(ctx.root(), config.max_rounds);
+        let ls_result = tools::ls(ctx, &state);
+        state.set_feedback(ls_result.feedback);
+
+        // --- Phase 1.5: Navigation planning ---
+        if state.remaining > 0 && (config.max_llm_calls == 0 || llm_calls < config.max_llm_calls) {
+            info!(doc = ctx.doc_name, "Generating navigation plan...");
+            let plan_prompt = build_plan_prompt(
+                &query,
+                task_ref,
+                &state.last_feedback,
+                ctx.doc_name,
+                &index_hits,
+                ctx,
+                query_plan.intent,
+            );
+            let plan_output = llm
+                .complete(&plan_prompt.0, &plan_prompt.1)
+                .await
+                .map_err(|e| Error::LlmReasoning {
+                    stage: "worker/plan".to_string(),
+                    detail: format!("Navigation plan LLM call failed: {e}"),
+                })?;
+            llm_calls += 1;
+            let plan_text = plan_output.trim().to_string();
+            if !plan_text.is_empty() {
+                info!(
+                    doc = ctx.doc_name,
+                    plan = %plan_text,
+                    "Navigation plan generated"
+                );
+                emitter.emit_worker_plan_generated(ctx.doc_name, plan_text.len());
+                state.plan = plan_text;
+                state.plan_generated = true;
+            }
+        }
+
+        // --- Phase 2: Navigation loop ---
+        run_navigation_loop(
+            &query,
+            task_ref,
+            ctx,
+            &config,
+            &llm,
+            &mut state,
+            &emitter,
+            &index_hits,
+            &intent_context,
+            &mut llm_calls,
+        )
+        .await?;
+
+        let budget_exhausted =
+            state.remaining == 0 || (config.max_llm_calls > 0 && llm_calls >= config.max_llm_calls);
+
+        let output = state.into_worker_output(llm_calls, budget_exhausted, ctx.doc_name);
+
+        emitter.emit_worker_done(
+            ctx.doc_name,
+            output.evidence.len(),
+            output.metrics.rounds_used,
+            output.metrics.llm_calls,
+            output.metrics.budget_exhausted,
+            output.metrics.plan_generated,
+        );
+
+        info!(
+            doc = ctx.doc_name,
+            evidence = output.evidence.len(),
+            rounds = output.metrics.rounds_used,
+            llm_calls = output.metrics.llm_calls,
+            "Worker complete"
+        );
+
+        Ok(output)
+    }
+}
+
+#[cfg(test)]
+mod truncation_tests {
+    /// Verify that truncating feedback with multi-byte UTF-8 characters
+    /// never panics. This mirrors the truncation logic in the navigation loop.
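+    ///
+    /// The pattern being mirrored, as used in `push_round_history` (sketch):
+    ///
+    /// ```ignore
+    /// let boundary = feedback.ceil_char_boundary(120); // never splits a code point
+    /// let preview = &feedback[..boundary];
+    /// ```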
+ #[test] + fn test_utf8_safe_truncation_ascii() { + let feedback = "a".repeat(200); + let boundary = feedback.ceil_char_boundary(120); + let truncated = &feedback[..boundary]; + assert!(truncated.len() <= 123); // 120 + "..." fits + assert!(truncated.is_char_boundary(truncated.len())); + } + + #[test] + fn test_utf8_safe_truncation_multibyte() { + // Each '中' is 3 bytes in UTF-8 + let feedback = "中文反馈内容测试截断安全".repeat(20); + assert!(feedback.len() > 120); + let boundary = feedback.ceil_char_boundary(120); + let truncated = &feedback[..boundary]; + assert!(truncated.len() <= 120); + assert!(truncated.is_char_boundary(truncated.len())); + } + + #[test] + fn test_utf8_safe_truncation_emoji() { + // Emojis are 4 bytes each + let feedback = "🦀🎉🚀".repeat(50); + assert!(feedback.len() > 120); + let boundary = feedback.ceil_char_boundary(120); + let truncated = &feedback[..boundary]; + assert!(truncated.len() <= 120); + assert!(truncated.is_char_boundary(truncated.len())); + } + + #[test] + fn test_utf8_safe_truncation_short_string() { + // String shorter than limit — no truncation needed + let feedback = "short feedback".to_string(); + let boundary = feedback.ceil_char_boundary(120); + assert_eq!(boundary, feedback.len()); + } +} diff --git a/vectorless-core/vectorless-agent/src/worker/navigation.rs b/vectorless-core/vectorless-agent/src/worker/navigation.rs new file mode 100644 index 00000000..bb6a5812 --- /dev/null +++ b/vectorless-core/vectorless-agent/src/worker/navigation.rs @@ -0,0 +1,448 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Phase 2: Navigation loop — LLM-driven command loop until done or budget exhausted. + +use tracing::{debug, info}; + +use super::super::command::Command; +use super::super::config::{DocContext, Step, WorkerConfig}; +use super::super::context::FindHit; +use super::super::events::EventEmitter; +use super::super::prompts::{NavigationParams, worker_dispatch, worker_navigation}; +use super::super::state::WorkerState; +use super::execute::{execute_command, parse_and_detect_failure}; +use super::format::format_visited_titles; +use super::planning::{build_replan_prompt, format_keyword_hints}; +use vectorless_error::Error; +use vectorless_llm::LlmClient; + +/// Run the Phase 2 navigation loop. +/// +/// Loops until budget exhausted, `done`/`force_done`, or error. +/// Mutates `state` and `llm_calls` in place. +pub async fn run_navigation_loop( + query: &str, + task: Option<&str>, + ctx: &DocContext<'_>, + config: &WorkerConfig, + llm: &LlmClient, + state: &mut WorkerState, + emitter: &EventEmitter, + index_hits: &[FindHit], + intent_context: &str, + llm_calls: &mut u32, +) -> vectorless_error::Result<()> { + let use_dispatch_prompt = task.is_some(); + let keyword_hints = format_keyword_hints(index_hits, ctx); + let max_llm = config.max_llm_calls; + + loop { + if state.remaining == 0 { + info!(doc = ctx.doc_name, "Navigation budget exhausted"); + break; + } + if max_llm > 0 && *llm_calls >= max_llm { + info!( + doc = ctx.doc_name, + llm_calls, max_llm, "LLM call budget exhausted" + ); + break; + } + + // Build prompt + let (system, user) = build_round_prompt( + query, + task, + ctx, + state, + intent_context, + &keyword_hints, + use_dispatch_prompt, + config.max_rounds, + ); + + // LLM decision + let round_num = config.max_rounds - state.remaining + 1; + let round_start = std::time::Instant::now(); + info!( + doc = ctx.doc_name, + round = round_num, + max_rounds = config.max_rounds, + "Navigation round: calling LLM..." 
+ ); + let llm_output = llm + .complete(&system, &user) + .await + .map_err(|e| Error::LlmReasoning { + stage: "worker/navigation".to_string(), + detail: format!("Nav loop LLM call failed (round {round_num}): {e}"), + })?; + *llm_calls += 1; + + // Parse command + let (command, is_parse_failure) = handle_parse_failure(&llm_output, ctx.doc_name, state); + if is_parse_failure { + continue; + } + + debug!(doc = ctx.doc_name, ?command, "Parsed command"); + + let is_check = matches!(command, Command::Check); + + // Execute + let step = execute_command(&command, ctx, state, query, llm, llm_calls, emitter).await; + + // Dynamic re-planning after insufficient check + handle_replan( + is_check, query, task, ctx, llm, state, emitter, llm_calls, max_llm, + ) + .await?; + + // Emit round event + let cmd_str = format!("{:?}", command); + let success = !matches!(step, Step::ForceDone(_)); + let round_elapsed = round_start.elapsed().as_millis() as u64; + emitter.emit_worker_round(ctx.doc_name, round_num, &cmd_str, success, round_elapsed); + + push_round_history(state, &cmd_str); + + // Check termination + match step { + Step::Done => { + info!( + doc = ctx.doc_name, + evidence = state.evidence.len(), + "Navigation done" + ); + break; + } + Step::ForceDone(reason) => { + info!(doc = ctx.doc_name, reason = %reason, "Forced done"); + break; + } + Step::Continue => { + if !is_check { + state.dec_round(); + } + } + } + } + + Ok(()) +} + +/// Build the (system, user) prompt pair for a single navigation round. +fn build_round_prompt( + query: &str, + task: Option<&str>, + ctx: &DocContext<'_>, + state: &WorkerState, + intent_context: &str, + keyword_hints: &str, + use_dispatch_prompt: bool, + max_rounds: u32, +) -> (String, String) { + if use_dispatch_prompt && state.remaining == max_rounds { + worker_dispatch(&super::super::prompts::WorkerDispatchParams { + original_query: query, + task: task.unwrap_or(query), + doc_name: ctx.doc_name, + breadcrumb: &state.path_str(), + }) + } else { + let visited_titles = format_visited_titles(state, ctx); + worker_navigation(&NavigationParams { + query, + task, + breadcrumb: &state.path_str(), + evidence_summary: &state.evidence_summary(), + missing_info: &state.missing_info, + last_feedback: &state.last_feedback, + remaining: state.remaining, + max_rounds: state.max_rounds, + history: &state.history_text(), + visited_titles: &visited_titles, + plan: &state.plan, + intent_context, + keyword_hints, + }) + } +} + +/// Parse LLM output and handle parse failures. +/// +/// Returns `(command, is_parse_failure)`. On parse failure, updates state +/// with feedback and pushes a history entry. 
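+///
+/// A usage sketch mirroring the unit tests below (names from this module):
+///
+/// ```rust,ignore
+/// let (cmd, failed) = handle_parse_failure("ls", ctx.doc_name, &mut state);
+/// assert!(!failed); // "ls" parses as Command::Ls
+/// ```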
+fn handle_parse_failure(
+    llm_output: &str,
+    doc_name: &str,
+    state: &mut WorkerState,
+) -> (Command, bool) {
+    if llm_output.trim().len() < 2 {
+        tracing::warn!(
+            doc = doc_name,
+            response = llm_output.trim(),
+            "LLM response unusually short"
+        );
+    }
+    let (command, is_parse_failure) = parse_and_detect_failure(llm_output);
+    if is_parse_failure {
+        let trimmed = llm_output.trim();
+        let raw_preview = if trimmed.len() > 200 {
+            // Round up to a char boundary so multi-byte UTF-8 never panics,
+            // mirroring the truncation convention used elsewhere in this module.
+            let boundary = trimmed.ceil_char_boundary(200);
+            format!("{}...", &trimmed[..boundary])
+        } else {
+            trimmed.to_string()
+        };
+        state.last_feedback = format!(
+            "Your output was not recognized as a valid command:\n\"{}\"\n\n\
+            Please output exactly one command (ls, cd, cat, head, find, findtree, grep, wc, pwd, check, or done).",
+            raw_preview
+        );
+        state.push_history("(unrecognized) → parse failure".to_string());
+    }
+    (command, is_parse_failure)
+}
+
+/// Push a round's command + feedback preview into history and trace.
+fn push_round_history(state: &mut WorkerState, cmd_str: &str) {
+    let feedback_preview = if state.last_feedback.len() > 120 {
+        let boundary = state.last_feedback.ceil_char_boundary(120);
+        format!("{}...", &state.last_feedback[..boundary])
+    } else {
+        state.last_feedback.clone()
+    };
+    state.push_history(format!("{} → {}", cmd_str, feedback_preview));
+
+    let round = state.max_rounds.saturating_sub(state.remaining);
+    state.trace_steps.push(vectorless_document::TraceStep {
+        action: cmd_str.to_string(),
+        observation: state.last_feedback.chars().take(200).collect(),
+        round,
+    });
+}
+
+/// Dynamic re-planning after an insufficient check.
+///
+/// If check returned INSUFFICIENT with enough remaining rounds and LLM budget,
+/// generates a new navigation plan. Otherwise clears stale replan state.
+async fn handle_replan(
+    is_check: bool,
+    query: &str,
+    task: Option<&str>,
+    ctx: &DocContext<'_>,
+    llm: &LlmClient,
+    state: &mut WorkerState,
+    emitter: &EventEmitter,
+    llm_calls: &mut u32,
+    max_llm: u32,
+) -> vectorless_error::Result<()> {
+    if !is_check {
+        return Ok(());
+    }
+
+    if !state.missing_info.is_empty()
+        && state.remaining >= 3
+        && (max_llm == 0 || *llm_calls < max_llm)
+    {
+        let missing = state.missing_info.clone();
+        info!(doc = ctx.doc_name, missing = %missing, "Re-planning navigation...");
+        let replan = build_replan_prompt(query, task, state, ctx);
+        let new_plan =
+            llm.complete(&replan.0, &replan.1)
+                .await
+                .map_err(|e| Error::LlmReasoning {
+                    stage: "worker/replan".to_string(),
+                    detail: format!("Re-plan LLM call failed: {e}"),
+                })?;
+        *llm_calls += 1;
+        let plan_text = new_plan.trim().to_string();
+        if !plan_text.is_empty() {
+            info!(
+                doc = ctx.doc_name,
+                plan = %plan_text,
+                "Re-plan generated"
+            );
+            emitter.emit_worker_replan(ctx.doc_name, &missing, plan_text.len());
+            state.plan = plan_text;
+        }
+        state.missing_info.clear();
+    } else if !state.missing_info.is_empty() {
+        state.plan.clear();
+        state.missing_info.clear();
+    }
+
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use vectorless_document::{DocumentTree, NodeId};
+
+    fn test_ctx() -> (DocumentTree, NodeId) {
+        let tree = DocumentTree::new("Root", "root content");
+        let root = tree.root();
+        (tree, root)
+    }
+
+    #[test]
+    fn test_handle_parse_failure_valid_command() {
+        let (tree, root) = test_ctx();
+        let nav = vectorless_document::NavigationIndex::new();
+        let ctx = DocContext {
+            tree: &tree,
+            nav_index: &nav,
+            reasoning_index: &vectorless_document::ReasoningIndex::default(),
+            doc_name: "test",
+        };
+        let mut state = WorkerState::new(root, 10);
+
+        let (cmd, is_failure) = handle_parse_failure("ls", ctx.doc_name, &mut state);
+        assert!(!is_failure);
+        assert!(matches!(cmd, Command::Ls));
+    }
+
+    #[test]
+    fn test_handle_parse_failure_unrecognized() {
+        let (tree, root) = test_ctx();
+        let nav = vectorless_document::NavigationIndex::new();
+        let ctx = DocContext {
+            tree: &tree,
+            nav_index: &nav,
+            reasoning_index: &vectorless_document::ReasoningIndex::default(),
+            doc_name: "test",
+        };
+        let mut state = WorkerState::new(root, 10);
+
+        let (_cmd, is_failure) =
+            handle_parse_failure("random garbage text", ctx.doc_name, &mut state);
+        assert!(is_failure);
+        assert!(state.last_feedback.contains("not recognized"));
+        assert!(state.history.last().unwrap().contains("unrecognized"));
+    }
+
+    #[test]
+    fn test_handle_parse_failure_short_response() {
+        let (tree, root) = test_ctx();
+        let nav = vectorless_document::NavigationIndex::new();
+        let ctx = DocContext {
+            tree: &tree,
+            nav_index: &nav,
+            reasoning_index: &vectorless_document::ReasoningIndex::default(),
+            doc_name: "test",
+        };
+        let mut state = WorkerState::new(root, 10);
+
+        // "ls" is only two characters: just above the short-response warning
+        // threshold, and it parses as a valid command, so no parse failure.
+        let (cmd, is_failure) = handle_parse_failure("ls", ctx.doc_name, &mut state);
+        assert!(!is_failure);
+        assert!(matches!(cmd, Command::Ls));
+    }
+
+    #[test]
+    fn test_push_round_history_short_feedback() {
+        let (_, root) = test_ctx();
+        let mut state = WorkerState::new(root, 10);
+        state.last_feedback = "short feedback".to_string();
+
+        push_round_history(&mut state, "ls");
+        assert_eq!(state.history.len(), 1);
+        assert!(state.history[0].contains("ls → short feedback"));
+    }
+
+    #[test]
+    fn test_push_round_history_long_feedback() {
+        let (_, root) = test_ctx();
+        let mut state = WorkerState::new(root, 10);
+        state.last_feedback = "a".repeat(200);
+
+        push_round_history(&mut state, "cat");
+        assert_eq!(state.history.len(), 1);
+        assert!(state.history[0].contains("cat → "));
+        // Should be truncated with ...
+ assert!(state.history[0].contains("...")); + } + + #[test] + fn test_push_round_history_respects_max_entries() { + let (_, root) = test_ctx(); + let mut state = WorkerState::new(root, 10); + state.last_feedback = "ok".to_string(); + + for i in 0..8 { + push_round_history(&mut state, &format!("cmd_{i}")); + } + // MAX_HISTORY_ENTRIES is 6, so only last 6 should remain + assert_eq!(state.history.len(), 6); + } + + #[test] + fn test_build_round_prompt_dispatch_first_round() { + let (tree, root) = test_ctx(); + let nav = vectorless_document::NavigationIndex::new(); + let ctx = DocContext { + tree: &tree, + nav_index: &nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test_doc", + }; + let mut state = WorkerState::new(root, 10); + // remaining == max_rounds means first round + assert_eq!(state.remaining, 10); + + let (system, user) = build_round_prompt( + "test query", + Some("sub-task"), + &ctx, + &state, + "factual — find answer", + "", + true, // use_dispatch_prompt + 10, + ); + assert!(system.contains("dispatch") || !system.is_empty()); + assert!(user.contains("test query") || user.contains("sub-task")); + } + + #[test] + fn test_build_round_prompt_navigation_subsequent_round() { + let (tree, root) = test_ctx(); + let nav = vectorless_document::NavigationIndex::new(); + let ctx = DocContext { + tree: &tree, + nav_index: &nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test_doc", + }; + let mut state = WorkerState::new(root, 10); + state.remaining = 8; // not first round + + let (system, _user) = build_round_prompt( + "test query", + None, + &ctx, + &state, + "factual", + "keyword hints here", + false, // use_dispatch_prompt + 10, + ); + assert!(!system.is_empty()); + } + + #[test] + fn test_utf8_safe_truncation_in_history() { + let (_, root) = test_ctx(); + let mut state = WorkerState::new(root, 10); + // Each '中' is 3 bytes in UTF-8 + state.last_feedback = "中文反馈内容测试截断安全".repeat(20); + + push_round_history(&mut state, "cat"); + let entry = &state.history[0]; + // Should be truncated without panicking + assert!(entry.contains("cat → ")); + assert!(entry.len() < state.last_feedback.len() + 20); + } +} diff --git a/vectorless-core/vectorless-agent/src/worker/planning.rs b/vectorless-core/vectorless-agent/src/worker/planning.rs new file mode 100644 index 00000000..37998071 --- /dev/null +++ b/vectorless-core/vectorless-agent/src/worker/planning.rs @@ -0,0 +1,708 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Navigation planning prompts — initial plan, re-plan, semantic hints, deep expansion. + +use std::collections::HashSet; + +use vectorless_query::QueryIntent; +use vectorless_scoring::bm25::{Bm25Engine, FieldDocument, extract_keywords}; + +use super::super::config::DocContext; +use super::super::context::FindHit; +use super::super::state::WorkerState; +use super::format::format_visited_titles; + +/// Maximum keyword/semantic hit entries in plan prompt. +const MAX_PLAN_ENTRIES: usize = 15; +/// Maximum section summaries in plan prompt. +const MAX_SECTION_SUMMARIES: usize = 10; +/// Maximum deep expansion entries. +const MAX_EXPANSION_ENTRIES: usize = 8; + +/// Build the navigation planning prompt (Phase 1.5). 
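+///
+/// Returns a `(system, user)` prompt pair. A hedged call sketch, mirroring
+/// the worker's Phase 1.5 call site (variable names illustrative):
+///
+/// ```rust,ignore
+/// let (system, user) = build_plan_prompt(
+///     "What is the Q1 revenue?",
+///     None,                 // no dispatched sub-task
+///     &state.last_feedback, // Phase 1 bird's-eye `ls` output
+///     ctx.doc_name,
+///     &index_hits,          // keyword hits gathered up front
+///     &ctx,
+///     QueryIntent::Factual,
+/// );
+/// let plan = llm.complete(&system, &user).await?;
+/// ```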
+pub fn build_plan_prompt( + query: &str, + task: Option<&str>, + ls_output: &str, + doc_name: &str, + keyword_hits: &[FindHit], + ctx: &DocContext<'_>, + intent: QueryIntent, +) -> (String, String) { + let task_section = match task { + Some(t) => format!("\nYour specific task: {}", t), + None => String::new(), + }; + + let query_keywords = extract_keywords(query); + let query_lower = query.to_lowercase(); + + let mut keyword_section = if keyword_hits.is_empty() { + String::new() + } else { + let mut section = + String::from("\nKeyword index matches (use these to prioritize navigation):\n"); + let mut entry_count = 0; + for hit in keyword_hits { + let mut entries = hit.entries.clone(); + entries.sort_by(|a, b| { + b.weight + .partial_cmp(&a.weight) + .unwrap_or(std::cmp::Ordering::Equal) + }); + let mut seen = HashSet::new(); + for entry in &entries { + if !seen.insert(entry.node_id) { + continue; + } + let ancestor_path = build_ancestor_path(entry.node_id, ctx); + section.push_str(&format!( + " - keyword '{}' → {} (depth {}, weight {:.2})\n", + hit.keyword, ancestor_path, entry.depth, entry.weight + )); + if let Some(content) = ctx.cat(entry.node_id) { + if let Some(snippet) = + super::super::tools::content_snippet(content, &hit.keyword, 300) + { + section.push_str(&format!(" \"{}\"\n", snippet)); + } + } + entry_count += 1; + if entry_count >= MAX_PLAN_ENTRIES { + section.push_str(" ... (more hits omitted)\n"); + break; + } + } + if entry_count >= MAX_PLAN_ENTRIES { + break; + } + } + section + }; + + let deep_expansion = build_deep_expansion(keyword_hits, ctx); + if !deep_expansion.is_empty() { + keyword_section.push_str(&deep_expansion); + } + + let semantic_section = build_semantic_hints(&query_keywords, &query_lower, ctx); + + let intent_section = build_intent_signals(intent, ctx); + + let system = "You are a document navigation planner. Given a user question, the top-level \ + document structure, keyword index matches, and semantic hints, output a brief navigation \ + plan: which sections to visit and in what order. Prioritize sections that matched keywords \ + or semantic hints. The plan should be 2-5 steps. Each step should be a specific action \ + like \"cd to X, then cat Y\" or \"grep for Z in current subtree\". \ + Pay attention to 'Can answer' and 'Topics' annotations in the structure listing — \ + they indicate what questions each section addresses. \ + Output only the plan, nothing else.\n\n\ + Example plan for \"What is the Q1 revenue?\":\n\ + 1. cd to Revenue (matched keyword 'revenue')\n\ + 2. ls to see sub-sections\n\ + 3. cat Q1 Report\n\ + 4. check\n\ + 5. done".to_string(); + + let user = format!( + "Document: {doc_name}\n\ + Top-level structure:\n{ls_output}{keyword_section}{semantic_section}{intent_section}\ + User question: {query}{task_section}\n\n\ + Navigation plan:" + ); + + (system, user) +} + +/// Build a focused re-planning prompt when check returns INSUFFICIENT. 
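+///
+/// The prompt folds in the evidence summary, `state.missing_info`, the
+/// visited-section list, the children at the current position, and any
+/// unvisited sibling branches, so the revised plan steers toward
+/// unexplored parts of the tree.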
+pub fn build_replan_prompt( + query: &str, + task: Option<&str>, + state: &WorkerState, + ctx: &DocContext<'_>, +) -> (String, String) { + let task_section = match task { + Some(t) => format!("\nOriginal sub-task: {}", t), + None => String::new(), + }; + + let visited = format_visited_titles(state, ctx); + let evidence_summary = state.evidence_summary(); + + let current_children = match ctx.ls(state.current_node) { + Some(routes) if !routes.is_empty() => { + let items: Vec = routes + .iter() + .map(|r| format!(" - {} ({} leaves)", r.title, r.leaf_count)) + .collect(); + format!("Children at current position:\n{}\n", items.join("\n")) + } + _ => "Current position is a leaf node — consider cd .. to go back.\n".to_string(), + }; + + let sibling_hints = build_sibling_hints(state, ctx); + + let system = "You are re-planning a document navigation strategy. The previous plan did not \ + find sufficient evidence. Given what's been found and what's still missing, generate a \ + focused 2-3 step plan. Each step should be a specific action like \ + \"cd to X, then cat Y\" or \"grep for Z in current subtree\". \ + Prefer exploring unvisited branches. If current branch is exhausted, cd .. and try \ + a different path. Output only the plan, nothing else." + .to_string(); + + let user = format!( + "Original question: {query}{task_section}\n\ + Current position: /{}\n\ + Evidence collected so far:\n{evidence_summary}\n\ + What's missing: {}\n\ + Already visited: {visited}\n\ + {current_children}\ + {sibling_hints}\ + Remaining rounds: {}/{}\n\n\ + Revised navigation plan:", + state.path_str(), + state.missing_info, + state.remaining, + state.max_rounds, + ); + + (system, user) +} + +/// Format keyword index hits into a compact string for LLM context. +/// +/// Returns a string like: +/// ```text +/// Keyword matches (use find to jump directly): +/// - 'complex' → Performance (weight 0.85) +/// "...complexity analysis shows..." +/// - 'latency' → Performance (weight 0.72) +/// "...latency benchmarks indicate..." +/// ``` +pub fn format_keyword_hints(keyword_hits: &[FindHit], ctx: &DocContext<'_>) -> String { + if keyword_hits.is_empty() { + return String::new(); + } + + let mut section = String::from("Keyword matches (use find to jump directly):\n"); + let mut entry_count = 0; + for hit in keyword_hits { + let mut entries = hit.entries.clone(); + entries.sort_by(|a, b| { + b.weight + .partial_cmp(&a.weight) + .unwrap_or(std::cmp::Ordering::Equal) + }); + let mut seen = HashSet::new(); + for entry in &entries { + if !seen.insert(entry.node_id) { + continue; + } + let title = ctx.node_title(entry.node_id).unwrap_or("unknown"); + section.push_str(&format!( + " - '{}' → {} (weight {:.2})\n", + hit.keyword, title, entry.weight + )); + if let Some(content) = ctx.cat(entry.node_id) { + if let Some(snippet) = + super::super::tools::content_snippet(content, &hit.keyword, 300) + { + section.push_str(&format!(" \"{}\"\n", snippet)); + } + } + entry_count += 1; + if entry_count >= MAX_PLAN_ENTRIES { + section.push_str(" ... (more omitted)\n"); + return section; + } + } + } + section +} + +/// Build the ancestor path string for a node (e.g., "root/Chapter 1/Section 1.2"). +pub fn build_ancestor_path(node_id: vectorless_document::NodeId, ctx: &DocContext<'_>) -> String { + let mut path: Vec = ctx.tree.ancestors_iter(node_id).collect(); + path.reverse(); + path.iter() + .filter_map(|&id| ctx.node_title(id)) + .collect::>() + .join("/") +} + +/// Build intent-specific index signals for the planning prompt. 
+/// +/// Injects pre-computed ReasoningIndex data as context for the LLM: +/// - Summary intent → summary_shortcut (document overview + section summaries) +/// - Navigational intent → section_map matches from query keywords +/// - Factual/Analytical → no additional signals (keyword hits already injected) +fn build_intent_signals(intent: QueryIntent, ctx: &DocContext<'_>) -> String { + match intent { + QueryIntent::Summary => { + let shortcut = match ctx.summary_shortcut() { + Some(s) => s, + None => return String::new(), + }; + let mut section = String::from( + "\nPre-computed document overview (use this to plan breadth-first scan):\n", + ); + if !shortcut.document_summary.is_empty() { + section.push_str(&format!( + "Document summary: {}\n", + shortcut.document_summary + )); + } + let mut summary_count = 0; + for ss in &shortcut.section_summaries { + section.push_str(&format!( + " - Section '{}' (depth {}): {}\n", + ss.title, ss.depth, ss.summary + )); + summary_count += 1; + if summary_count >= MAX_SECTION_SUMMARIES { + section.push_str(" ... (more sections omitted)\n"); + break; + } + } + section + } + QueryIntent::Navigational => { + let root = ctx.root(); + let routes = match ctx.ls(root) { + Some(r) => r, + None => return String::new(), + }; + let mut section = + String::from("\nSection map (known top-level sections for direct navigation):\n"); + for route in routes { + section.push_str(&format!( + " - {} ({} leaves)\n", + route.title, route.leaf_count + )); + } + section + } + _ => String::new(), + } +} + +/// Build semantic hints section using BM25 scoring over child routes. +fn build_semantic_hints( + query_keywords: &[String], + query_lower: &str, + ctx: &DocContext<'_>, +) -> String { + let root = ctx.root(); + let routes = match ctx.ls(root) { + Some(r) => r, + None => return String::new(), + }; + + if routes.is_empty() { + return String::new(); + } + + let field_docs: Vec> = routes + .iter() + .map(|route| { + let nav = ctx.nav_entry(route.node_id); + let overview = nav.map(|n| n.overview.as_str()).unwrap_or(""); + let hints_text = nav.map(|n| n.question_hints.join(" ")).unwrap_or_default(); + let tags_text = nav.map(|n| n.topic_tags.join(" ")).unwrap_or_default(); + let content = if overview.is_empty() && hints_text.is_empty() && tags_text.is_empty() { + String::new() + } else { + format!("{} {} {}", overview, hints_text, tags_text) + }; + FieldDocument::new( + route.title.clone(), + route.title.clone(), + route.description.clone(), + content, + ) + }) + .collect(); + + let engine = Bm25Engine::fit_to_corpus(&field_docs); + let bm25_results: std::collections::HashMap = engine + .search_weighted(query_lower, routes.len()) + .into_iter() + .collect(); + + let mut section = String::new(); + let mut entry_count = 0; + + for route in routes { + let nav = match ctx.nav_entry(route.node_id) { + Some(n) => n, + None => continue, + }; + + let bm25_score = bm25_results.get(&route.title).copied().unwrap_or(0.0); + if bm25_score <= 0.0 { + continue; + } + + let mut annotations = Vec::new(); + + for hint in &nav.question_hints { + let hint_lower = hint.to_lowercase(); + for kw in query_keywords { + if hint_lower.contains(&kw.to_lowercase()) { + annotations.push(format!("question \"{}\"", hint)); + break; + } + } + if !annotations.iter().any(|a| a.contains(&hint.clone())) { + for word in hint_lower.split_whitespace() { + if word.len() > 3 && query_lower.contains(word) { + annotations.push(format!("question \"{}\"", hint)); + break; + } + } + } + } + + for tag in &nav.topic_tags { + let 
tag_lower = tag.to_lowercase();
+            for kw in query_keywords {
+                if tag_lower.contains(&kw.to_lowercase()) || kw.to_lowercase().contains(&tag_lower)
+                {
+                    annotations.push(format!("topic \"{}\"", tag));
+                    break;
+                }
+            }
+            if !annotations
+                .iter()
+                .any(|a| a.contains(&format!("topic \"{}\"", tag)))
+                && query_lower.contains(&tag_lower)
+                && tag.len() > 2
+            {
+                annotations.push(format!("topic \"{}\"", tag));
+            }
+        }
+
+        let annotation_str = if annotations.is_empty() {
+            String::new()
+        } else {
+            format!(", {}", annotations.join(", "))
+        };
+
+        let line = format!(
+            " - Section '{}' — BM25: {:.2}{}\n",
+            route.title, bm25_score, annotation_str
+        );
+        section.push_str(&line);
+        entry_count += 1;
+        if entry_count >= MAX_PLAN_ENTRIES {
+            break;
+        }
+    }
+
+    if section.is_empty() {
+        String::new()
+    } else {
+        format!(
+            "\nSemantic hints (BM25-scored sections, higher = more relevant):\n{}",
+            section
+        )
+    }
+}
+
+/// For keyword hits that land in deep nodes (depth >= 2), expand the parent node's children.
+fn build_deep_expansion(keyword_hits: &[FindHit], ctx: &DocContext<'_>) -> String {
+    if keyword_hits.is_empty() {
+        return String::new();
+    }
+
+    let mut seen_parents = HashSet::new();
+    let mut expansion = String::new();
+    let mut expansion_count = 0;
+
+    for hit in keyword_hits {
+        for entry in &hit.entries {
+            if entry.depth < 2 {
+                continue;
+            }
+            let parent = match ctx.parent(entry.node_id) {
+                Some(p) => p,
+                None => continue,
+            };
+            if !seen_parents.insert(parent) {
+                continue;
+            }
+            let routes = match ctx.ls(parent) {
+                Some(r) => r,
+                None => continue,
+            };
+            let parent_title = ctx.node_title(parent).unwrap_or("unknown");
+            expansion.push_str(&format!(
+                "Siblings near keyword hit '{}' (under {}):\n",
+                hit.keyword, parent_title
+            ));
+            for route in routes {
+                // Compare as `&str` on both sides; `node_title` yields `Option<&str>`.
+                let marker = if ctx.node_title(entry.node_id) == Some(route.title.as_str()) {
+                    " ← keyword hit"
+                } else {
+                    ""
+                };
+                expansion.push_str(&format!(
+                    " - {} ({} leaves){}\n",
+                    route.title, route.leaf_count, marker
+                ));
+            }
+            expansion.push('\n');
+            expansion_count += 1;
+            if expansion_count >= MAX_EXPANSION_ENTRIES {
+                expansion.push_str(" ... (more expansions omitted)\n");
+                break;
+            }
+        }
+        if expansion_count >= MAX_EXPANSION_ENTRIES {
+            break;
+        }
+    }
+
+    expansion
+}
+
+/// Build unvisited sibling branch hints for structured backtracking.
+fn build_sibling_hints(state: &WorkerState, ctx: &DocContext<'_>) -> String {
+    let mut hints = String::new();
+
+    if let Some(parent) = ctx.parent(state.current_node) {
+        if let Some(routes) = ctx.ls(parent) {
+            let unvisited: Vec<&vectorless_document::ChildRoute> = routes
+                .iter()
+                .filter(|r| !state.visited.contains(&r.node_id))
+                .collect();
+            if !unvisited.is_empty() {
+                hints.push_str("Unvisited sibling branches at current level:\n");
+                for route in &unvisited {
+                    hints.push_str(&format!(
+                        " - {} ({} leaves)\n",
+                        route.title, route.leaf_count
+                    ));
+                }
+            }
+        }
+
+        if let Some(grandparent) = ctx.parent(parent) {
+            if let Some(routes) = ctx.ls(grandparent) {
+                let unvisited_parent_siblings: Vec<&vectorless_document::ChildRoute> = routes
+                    .iter()
+                    .filter(|r| !state.visited.contains(&r.node_id) && r.node_id != parent)
+                    .collect();
+                if !unvisited_parent_siblings.is_empty() {
+                    hints.push_str("Unvisited branches at parent level (cd .. then explore):\n");
+                    for route in &unvisited_parent_siblings {
+                        hints.push_str(&format!(
+                            " - {} ({} leaves)\n",
+                            route.title, route.leaf_count
+                        ));
+                    }
+                }
+            }
+        }
+    }
+
+    if hints.is_empty() {
+        String::new()
+    } else {
+        format!("\n{}", hints)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::config::Evidence;
+    use vectorless_document::{ChildRoute, NavEntry, NodeId};
+    use vectorless_scoring::bm25::extract_keywords;
+
+    fn build_semantic_test_tree() -> (
+        vectorless_document::DocumentTree,
+        vectorless_document::NavigationIndex,
+        NodeId,
+        NodeId,
+        NodeId,
+    ) {
+        let mut tree = vectorless_document::DocumentTree::new("Root", "root content");
+        let root = tree.root();
+        let revenue = tree.add_child(root, "Revenue", "revenue content");
+        let expenses = tree.add_child(root, "Expenses", "expense content");
+
+        let mut nav = vectorless_document::NavigationIndex::new();
+        nav.add_entry(
+            root,
+            NavEntry {
+                overview: "Annual financial report".to_string(),
+                question_hints: vec!["What is the financial overview?".to_string()],
+                topic_tags: vec!["finance".to_string()],
+                leaf_count: 4,
+                level: 0,
+            },
+        );
+        nav.add_child_routes(
+            root,
+            vec![
+                ChildRoute {
+                    node_id: revenue,
+                    title: "Revenue".to_string(),
+                    description: "Revenue breakdown".to_string(),
+                    leaf_count: 2,
+                },
+                ChildRoute {
+                    node_id: expenses,
+                    title: "Expenses".to_string(),
+                    description: "Cost analysis".to_string(),
+                    leaf_count: 2,
+                },
+            ],
+        );
+        nav.add_entry(
+            revenue,
+            NavEntry {
+                overview: "Revenue figures for 2024".to_string(),
+                question_hints: vec![
+                    "What is the total revenue?".to_string(),
+                    "What was the Q1 revenue?".to_string(),
+                ],
+                topic_tags: vec![
+                    "revenue".to_string(),
+                    "sales".to_string(),
+                    "income".to_string(),
+                ],
+                leaf_count: 2,
+                level: 1,
+            },
+        );
+        nav.add_entry(
+            expenses,
+            NavEntry {
+                overview: "Operating expenses".to_string(),
+                question_hints: vec!["What are the operating costs?".to_string()],
+                topic_tags: vec!["expenses".to_string(), "costs".to_string()],
+                leaf_count: 2,
+                level: 1,
+            },
+        );
+
+        (tree, nav, root, revenue, expenses)
+    }
+
+    #[test]
+    fn test_build_ancestor_path() {
+        let (tree, nav, root, revenue, _) = build_semantic_test_tree();
+        let ctx = DocContext {
+            tree: &tree,
+            nav_index: &nav,
+            reasoning_index: &vectorless_document::ReasoningIndex::default(),
+            doc_name: "test",
+        };
+        assert_eq!(build_ancestor_path(revenue, &ctx), "Root/Revenue");
+        assert_eq!(build_ancestor_path(root, &ctx), "Root");
+    }
+
+    #[test]
+    fn test_semantic_hints_keyword_match() {
+        let (tree, nav, _, _, _) = build_semantic_test_tree();
+        let ctx = DocContext {
+            tree: &tree,
+            nav_index: &nav,
+            reasoning_index: &vectorless_document::ReasoningIndex::default(),
+            doc_name: "test",
+        };
+        let keywords = extract_keywords("What is the revenue?");
+        let hints = build_semantic_hints(&keywords, &"what is the revenue".to_lowercase(), &ctx);
+        assert!(
+            hints.contains("Revenue"),
+            "Should match Revenue section, got: {}",
+            hints
+        );
+        assert!(hints.contains("BM25"));
+    }
+
+    #[test]
+    fn test_semantic_hints_topic_match() {
+        let (tree, nav, _, _, _) = build_semantic_test_tree();
+        let ctx = DocContext {
+            tree: &tree,
+            nav_index: &nav,
+            reasoning_index: &vectorless_document::ReasoningIndex::default(),
+            doc_name: "test",
+        };
+        let keywords = extract_keywords("operating costs analysis");
+        let hints =
+            build_semantic_hints(&keywords, &"operating costs 
analysis".to_lowercase(), &ctx); + assert!( + hints.contains("Expenses"), + "Should match Expenses via topic 'costs', got: {}", + hints + ); + } + + #[test] + fn test_semantic_hints_no_match() { + let (tree, nav, _, _, _) = build_semantic_test_tree(); + let ctx = DocContext { + tree: &tree, + nav_index: &nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test", + }; + let keywords = extract_keywords("xyzzy foobar"); + let hints = build_semantic_hints(&keywords, &"xyzzy foobar".to_lowercase(), &ctx); + assert!(hints.is_empty(), "Should not match, got: {}", hints); + } + + #[test] + fn test_build_replan_prompt() { + let (tree, nav, root, _, _) = build_semantic_test_tree(); + let mut state = WorkerState::new(root, 15); + state.missing_info = "Need Q2 revenue figures".to_string(); + state.add_evidence(Evidence { + source_path: "root/Revenue".to_string(), + node_title: "Revenue".to_string(), + content: "Q1 revenue was $2.5M".to_string(), + doc_name: None, + }); + let ctx = DocContext { + tree: &tree, + nav_index: &nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "test", + }; + let (system, user) = build_replan_prompt("What is total revenue?", None, &state, &ctx); + assert!(system.contains("re-planning")); + assert!(user.contains("What is total revenue?")); + assert!(user.contains("Q2 revenue")); + } + + #[test] + fn test_build_plan_prompt_with_semantic_hints() { + let (tree, nav, _, _, _) = build_semantic_test_tree(); + let ctx = DocContext { + tree: &tree, + nav_index: &nav, + reasoning_index: &vectorless_document::ReasoningIndex::default(), + doc_name: "Financial Report", + }; + let ls_output = + "[1] Revenue — Revenue breakdown (2 leaves)\n[2] Expenses — Cost analysis (2 leaves)\n"; + let (system, user) = build_plan_prompt( + "What is the revenue?", + None, + ls_output, + "Financial Report", + &[], + &ctx, + QueryIntent::Factual, + ); + assert!(system.contains("semantic hints")); + assert!(user.contains("What is the revenue?")); + } +} diff --git a/vectorless-core/vectorless-config/Cargo.toml b/vectorless-core/vectorless-config/Cargo.toml new file mode 100644 index 00000000..c94f8713 --- /dev/null +++ b/vectorless-core/vectorless-config/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "vectorless-config" +version.workspace = true +edition.workspace = true +authors.workspace = true +description.workspace = true +license.workspace = true +repository.workspace = true +homepage.workspace = true + +[dependencies] +tracing = { workspace = true } +tokio = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } + +[lints] +workspace = true diff --git a/vectorless-core/vectorless-config/src/lib.rs b/vectorless-core/vectorless-config/src/lib.rs new file mode 100644 index 00000000..feaa5fe5 --- /dev/null +++ b/vectorless-core/vectorless-config/src/lib.rs @@ -0,0 +1,21 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Internal configuration management. +//! +//! Users configure vectorless via [`EngineBuilder`](vectorless_engine::EngineBuilder) methods, +//! not by directly interacting with this module. 
+ +mod types; +mod validator; + +pub use types::Config; +pub use types::DocumentGraphConfig; +pub use types::MetricsConfig; +pub use types::LlmMetricsConfig; +pub use types::RetrievalMetricsConfig; +pub use types::{ + CompressionAlgorithm, FallbackBehavior, FallbackConfig, IndexerConfig, LlmConfig, + OnAllFailedBehavior, RetrievalConfig, RetryConfig, SlotConfig, StorageConfig, + ThrottleConfig, +}; diff --git a/vectorless-core/vectorless-config/src/types/graph.rs b/vectorless-core/vectorless-config/src/types/graph.rs new file mode 100644 index 00000000..40b1d888 --- /dev/null +++ b/vectorless-core/vectorless-config/src/types/graph.rs @@ -0,0 +1,51 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Configuration for document graph building and retrieval. + +use serde::{Deserialize, Serialize}; + +/// Configuration for building the document graph. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DocumentGraphConfig { + /// Whether graph building is enabled. + pub enabled: bool, + /// Minimum Jaccard similarity for creating an edge. + pub min_keyword_jaccard: f32, + /// Minimum shared keywords to create an edge. + pub min_shared_keywords: usize, + /// Maximum top keywords per document node. + pub max_keywords_per_doc: usize, + /// Maximum edges per document node. + pub max_edges_per_node: usize, + /// Boost factor applied to graph-connected documents during retrieval. + pub retrieval_boost_factor: f32, +} + +impl Default for DocumentGraphConfig { + fn default() -> Self { + Self { + enabled: true, + min_keyword_jaccard: 0.1, + min_shared_keywords: 2, + max_keywords_per_doc: 50, + max_edges_per_node: 20, + retrieval_boost_factor: 0.15, + } + } +} + +impl DocumentGraphConfig { + /// Create a new config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Create a disabled config. + pub fn disabled() -> Self { + Self { + enabled: false, + ..Self::default() + } + } +} diff --git a/vectorless-core/vectorless-config/src/types/indexer.rs b/vectorless-core/vectorless-config/src/types/indexer.rs new file mode 100644 index 00000000..6353122a --- /dev/null +++ b/vectorless-core/vectorless-config/src/types/indexer.rs @@ -0,0 +1,108 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Indexer configuration types. + +use serde::{Deserialize, Serialize}; + +/// Indexer configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct IndexerConfig { + /// Word count threshold for splitting sections into subsections. + #[serde(default = "default_subsection_threshold")] + pub subsection_threshold: usize, + + /// Maximum tokens to send in a single segmentation request. + #[serde(default = "default_max_segment_tokens")] + pub max_segment_tokens: usize, + + /// Maximum tokens for each summary. + #[serde(default = "default_max_summary_tokens")] + pub max_summary_tokens: usize, + + /// Minimum content tokens required to generate a summary. 
+ #[serde(default = "default_min_summary_tokens")] + pub min_summary_tokens: usize, +} + +fn default_subsection_threshold() -> usize { + 300 +} + +fn default_max_segment_tokens() -> usize { + 3000 +} + +fn default_max_summary_tokens() -> usize { + 200 +} + +fn default_min_summary_tokens() -> usize { + 20 +} + +impl Default for IndexerConfig { + fn default() -> Self { + Self { + subsection_threshold: default_subsection_threshold(), + max_segment_tokens: default_max_segment_tokens(), + max_summary_tokens: default_max_summary_tokens(), + min_summary_tokens: default_min_summary_tokens(), + } + } +} + +impl IndexerConfig { + /// Create a new indexer config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the subsection threshold. + pub fn with_subsection_threshold(mut self, threshold: usize) -> Self { + self.subsection_threshold = threshold; + self + } + + /// Set the maximum segment tokens. + pub fn with_max_segment_tokens(mut self, tokens: usize) -> Self { + self.max_segment_tokens = tokens; + self + } + + /// Set the maximum summary tokens. + pub fn with_max_summary_tokens(mut self, tokens: usize) -> Self { + self.max_summary_tokens = tokens; + self + } + + /// Set the minimum summary tokens. + pub fn with_min_summary_tokens(mut self, tokens: usize) -> Self { + self.min_summary_tokens = tokens; + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_indexer_config_defaults() { + let config = IndexerConfig::default(); + assert_eq!(config.subsection_threshold, 300); + assert_eq!(config.max_segment_tokens, 3000); + assert_eq!(config.max_summary_tokens, 200); + assert_eq!(config.min_summary_tokens, 20); + } + + #[test] + fn test_indexer_config_builder() { + let config = IndexerConfig::new() + .with_subsection_threshold(500) + .with_max_summary_tokens(300); + + assert_eq!(config.subsection_threshold, 500); + assert_eq!(config.max_summary_tokens, 300); + } +} diff --git a/vectorless-core/vectorless-config/src/types/llm_pool.rs b/vectorless-core/vectorless-config/src/types/llm_pool.rs new file mode 100644 index 00000000..2cd129f1 --- /dev/null +++ b/vectorless-core/vectorless-config/src/types/llm_pool.rs @@ -0,0 +1,612 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Unified LLM configuration. +//! +//! This module consolidates all LLM-related configuration into a single +//! cohesive structure. Users configure via [`EngineBuilder`](vectorless_engine::EngineBuilder) +//! for simple cases, or construct [`LlmConfig`] programmatically for advanced use. + +use serde::{Deserialize, Serialize}; + +/// Unified LLM configuration — the single entry point for all LLM settings. 
+///
+/// Contains:
+/// - Global credentials (`api_key`, `model`, `endpoint`)
+/// - Per-purpose slot overrides (`index`, `retrieval`)
+/// - Infrastructure settings (`retry`, `throttle`, `fallback`)
+///
+/// # Simple usage (via EngineBuilder)
+///
+/// ```rust,no_run
+/// use vectorless::client::EngineBuilder;
+///
+/// # async fn example() -> Result<(), vectorless::BuildError> {
+/// let engine = EngineBuilder::new()
+///     .with_key("sk-...")
+///     .with_model("gpt-4o")
+///     .with_endpoint("https://api.openai.com/v1")
+///     .build()
+///     .await?;
+/// # Ok(())
+/// # }
+/// ```
+///
+/// # Advanced usage (programmatic config)
+///
+/// ```rust,ignore
+/// use vectorless::config::{Config, LlmConfig, SlotConfig};
+///
+/// let config = Config::new().with_llm(
+///     LlmConfig::new("gpt-4o")
+///         .with_api_key("sk-...")
+///         .with_endpoint("https://api.openai.com/v1")
+///         .with_index(SlotConfig::fast().with_model("gpt-4o-mini"))
+///         .with_retrieval(SlotConfig::default().with_max_tokens(200))
+/// );
+/// ```
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LlmConfig {
+    /// API key — **required**.
+    #[serde(default)]
+    pub api_key: Option<String>,
+
+    /// Default model name — **required**.
+    ///
+    /// Individual slots can override this via [`SlotConfig::model`].
+    #[serde(default)]
+    pub model: String,
+
+    /// API endpoint URL — **required**.
+    #[serde(default)]
+    pub endpoint: Option<String>,
+
+    /// Index slot (document indexing / summarization).
+    /// Inherits the default model and token limits unless overridden,
+    /// e.g. `SlotConfig::fast().with_model("gpt-4o-mini")` for cheaper indexing.
+    #[serde(default)]
+    pub index: SlotConfig,
+
+    /// Retrieval slot (document navigation).
+    /// Inherits the default model, with a lower response cap (100 tokens).
+    #[serde(default = "default_retrieval_slot")]
+    pub retrieval: SlotConfig,
+
+    /// Retry configuration for LLM calls.
+    #[serde(default)]
+    pub retry: RetryConfig,
+
+    /// Throttle / rate-limiting configuration.
+    #[serde(default)]
+    pub throttle: ThrottleConfig,
+
+    /// Fallback configuration for error recovery.
+    #[serde(default)]
+    pub fallback: FallbackConfig,
+}
+
+fn default_retrieval_slot() -> SlotConfig {
+    SlotConfig {
+        max_tokens: 100,
+        ..SlotConfig::default()
+    }
+}
+
+impl Default for LlmConfig {
+    fn default() -> Self {
+        Self {
+            api_key: None,
+            model: String::new(),
+            endpoint: None,
+            index: SlotConfig::default(),
+            retrieval: default_retrieval_slot(),
+            retry: RetryConfig::default(),
+            throttle: ThrottleConfig::default(),
+            fallback: FallbackConfig::default(),
+        }
+    }
+}
+
+impl LlmConfig {
+    /// Create a new config with a specific model.
+    pub fn new(model: impl Into<String>) -> Self {
+        Self {
+            model: model.into(),
+            ..Self::default()
+        }
+    }
+
+    /// Set the API key.
+    pub fn with_api_key(mut self, key: impl Into<String>) -> Self {
+        self.api_key = Some(key.into());
+        self
+    }
+
+    /// Set the default model.
+    pub fn with_model(mut self, model: impl Into<String>) -> Self {
+        self.model = model.into();
+        self
+    }
+
+    /// Set the endpoint URL.
+    pub fn with_endpoint(mut self, url: impl Into<String>) -> Self {
+        self.endpoint = Some(url.into());
+        self
+    }
+
+    /// Set the index slot configuration.
+    pub fn with_index(mut self, slot: SlotConfig) -> Self {
+        self.index = slot;
+        self
+    }
+
+    /// Set the retrieval slot configuration.
+    pub fn with_retrieval(mut self, slot: SlotConfig) -> Self {
+        self.retrieval = slot;
+        self
+    }
+
+    /// Set the retry configuration.
+    pub fn with_retry(mut self, retry: RetryConfig) -> Self {
+        self.retry = retry;
+        self
+    }
+
+    /// Set the throttle configuration.
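+    ///
+    /// A hedged sketch (values illustrative):
+    ///
+    /// ```rust,ignore
+    /// // Cap in-flight LLM calls at 4 and stay under 120 requests/minute.
+    /// let cfg = LlmConfig::new("gpt-4o")
+    ///     .with_throttle(ThrottleConfig::new().with_max_concurrent(4).with_rpm(120));
+    /// ```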
+ pub fn with_throttle(mut self, throttle: ThrottleConfig) -> Self { + self.throttle = throttle; + self + } + + /// Set the fallback configuration. + pub fn with_fallback(mut self, fallback: FallbackConfig) -> Self { + self.fallback = fallback; + self + } + + /// Convenience: set max concurrent requests (delegates to throttle). + pub fn with_max_concurrent(mut self, max: usize) -> Self { + self.throttle.max_concurrent_requests = max; + self + } + + /// Resolve the effective model for a given slot. + /// + /// Returns the slot-specific model if set, otherwise the default model. + pub fn resolve_model(&self, slot: &SlotConfig) -> String { + slot.model.clone().unwrap_or_else(|| self.model.clone()) + } +} + +/// Per-purpose LLM slot override. +/// +/// Controls model selection and generation parameters for a specific +/// LLM usage (index or retrieval). +/// +/// - `model`: Override the default model (optional). +/// - `max_tokens`: Maximum response tokens. +/// - `temperature`: Generation temperature. +/// +/// `api_key` and `endpoint` are **not** here — they are always inherited +/// from the parent [`LlmConfig`]. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SlotConfig { + /// Override the default model for this purpose. + /// When `None`, uses [`LlmConfig::model`]. + #[serde(default)] + pub model: Option, + + /// Maximum tokens for responses. + #[serde(default = "default_max_tokens")] + pub max_tokens: usize, + + /// Temperature for generation. + #[serde(default = "default_temperature")] + pub temperature: f32, +} + +fn default_max_tokens() -> usize { + 200 +} + +fn default_temperature() -> f32 { + 0.0 +} + +impl Default for SlotConfig { + fn default() -> Self { + Self { + model: None, + max_tokens: default_max_tokens(), + temperature: default_temperature(), + } + } +} + +impl SlotConfig { + /// Create a new slot config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Create a "fast" preset (low tokens). + pub fn fast() -> Self { + Self { + max_tokens: 100, + ..Self::default() + } + } + + /// Set the model override. + pub fn with_model(mut self, model: impl Into) -> Self { + self.model = Some(model.into()); + self + } + + /// Set the max tokens. + pub fn with_max_tokens(mut self, max_tokens: usize) -> Self { + self.max_tokens = max_tokens; + self + } + + /// Set the temperature. + pub fn with_temperature(mut self, temperature: f32) -> Self { + self.temperature = temperature; + self + } +} + +// ============================================================ +// Supporting configuration types +// ============================================================ + +/// Retry configuration for LLM calls. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RetryConfig { + /// Maximum number of retry attempts. + #[serde(default = "default_max_attempts")] + pub max_attempts: usize, + + /// Initial delay before first retry (milliseconds). + #[serde(default = "default_initial_delay_ms")] + pub initial_delay_ms: u64, + + /// Maximum delay between retries (milliseconds). + #[serde(default = "default_max_delay_ms")] + pub max_delay_ms: u64, + + /// Multiplier for exponential backoff. + #[serde(default = "default_multiplier")] + pub multiplier: f64, + + /// Whether to retry on rate limit errors. 
+ #[serde(default = "default_true")] + pub retry_on_rate_limit: bool, +} + +fn default_max_attempts() -> usize { + 3 +} + +fn default_initial_delay_ms() -> u64 { + 500 +} + +fn default_max_delay_ms() -> u64 { + 30000 +} + +fn default_multiplier() -> f64 { + 2.0 +} + +fn default_true() -> bool { + true +} + +impl Default for RetryConfig { + fn default() -> Self { + Self { + max_attempts: default_max_attempts(), + initial_delay_ms: default_initial_delay_ms(), + max_delay_ms: default_max_delay_ms(), + multiplier: default_multiplier(), + retry_on_rate_limit: default_true(), + } + } +} + +impl RetryConfig { + /// Create a new retry config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the max attempts. + pub fn with_max_attempts(mut self, max_attempts: usize) -> Self { + self.max_attempts = max_attempts; + self + } + + /// Calculate delay for a given attempt (0-indexed). + pub fn delay_for_attempt(&self, attempt: usize) -> std::time::Duration { + let delay_ms = (self.initial_delay_ms as f64) * self.multiplier.powi(attempt as i32); + let delay_ms = delay_ms.min(self.max_delay_ms as f64); + std::time::Duration::from_millis(delay_ms as u64) + } +} + +/// Throttle / rate-limiting configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ThrottleConfig { + /// Maximum concurrent LLM API calls. + #[serde(default = "default_max_concurrent")] + pub max_concurrent_requests: usize, + + /// Rate limit: requests per minute. + #[serde(default = "default_rpm")] + pub requests_per_minute: usize, + + /// Enable rate limiting. + #[serde(default = "default_true")] + pub enabled: bool, + + /// Enable semaphore-based concurrency limiting. + #[serde(default = "default_true")] + pub semaphore_enabled: bool, +} + +fn default_max_concurrent() -> usize { + 10 +} + +fn default_rpm() -> usize { + 500 +} + +impl Default for ThrottleConfig { + fn default() -> Self { + Self { + max_concurrent_requests: default_max_concurrent(), + requests_per_minute: default_rpm(), + enabled: default_true(), + semaphore_enabled: default_true(), + } + } +} + +impl ThrottleConfig { + /// Create a new throttle config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the max concurrent requests. + pub fn with_max_concurrent(mut self, max: usize) -> Self { + self.max_concurrent_requests = max; + self + } + + /// Set the requests per minute. + pub fn with_rpm(mut self, rpm: usize) -> Self { + self.requests_per_minute = rpm; + self + } +} + +/// Fallback behavior on errors. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +#[serde(rename_all = "snake_case")] +pub enum FallbackBehavior { + /// Retry the same model. + Retry, + /// Immediately fall back to next model. + Fallback, + /// Retry first, then fall back. + #[default] + RetryThenFallback, + /// Fail immediately. + Fail, +} + +/// Behavior when all fallback attempts fail. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] +#[serde(rename_all = "snake_case")] +pub enum OnAllFailedBehavior { + /// Return an error. + #[default] + ReturnError, + /// Return cached result if available. + ReturnCache, +} + +/// Fallback configuration for error recovery. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FallbackConfig { + /// Enable fallback mechanism. + #[serde(default = "default_true")] + pub enabled: bool, + + /// Fallback models in priority order. + #[serde(default = "default_fallback_models")] + pub models: Vec, + + /// Fallback endpoints (optional). 
+ #[serde(default)] + pub endpoints: Vec, + + /// Behavior on rate limit error. + #[serde(default)] + pub on_rate_limit: FallbackBehavior, + + /// Behavior on timeout error. + #[serde(default)] + pub on_timeout: FallbackBehavior, + + /// Behavior when all attempts fail. + #[serde(default)] + pub on_all_failed: OnAllFailedBehavior, + + /// Maximum retry attempts. + #[serde(default = "default_max_retries")] + pub max_retries: usize, + + /// Initial retry delay in milliseconds. + #[serde(default = "default_initial_retry_delay_ms")] + pub initial_retry_delay_ms: u64, + + /// Maximum retry delay in milliseconds. + #[serde(default = "default_max_retry_delay_ms")] + pub max_retry_delay_ms: u64, + + /// Retry delay multiplier (exponential backoff). + #[serde(default = "default_retry_multiplier")] + pub retry_multiplier: f32, +} + +fn default_fallback_models() -> Vec { + vec!["gpt-4o-mini".to_string(), "glm-4-flash".to_string()] +} + +fn default_max_retries() -> usize { + 3 +} + +fn default_initial_retry_delay_ms() -> u64 { + 1000 +} + +fn default_max_retry_delay_ms() -> u64 { + 30000 +} + +fn default_retry_multiplier() -> f32 { + 2.0 +} + +impl Default for FallbackConfig { + fn default() -> Self { + Self { + enabled: default_true(), + models: default_fallback_models(), + endpoints: Vec::new(), + on_rate_limit: FallbackBehavior::default(), + on_timeout: FallbackBehavior::default(), + on_all_failed: OnAllFailedBehavior::default(), + max_retries: default_max_retries(), + initial_retry_delay_ms: default_initial_retry_delay_ms(), + max_retry_delay_ms: default_max_retry_delay_ms(), + retry_multiplier: default_retry_multiplier(), + } + } +} + +impl FallbackConfig { + /// Create a new fallback config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Disable fallback entirely. + pub fn disabled() -> Self { + Self { + enabled: false, + ..Self::default() + } + } + + /// Set fallback models. + pub fn with_models(mut self, models: Vec) -> Self { + self.models = models; + self + } + + /// Set behavior on rate limit. + pub fn with_on_rate_limit(mut self, behavior: FallbackBehavior) -> Self { + self.on_rate_limit = behavior; + self + } + + /// Calculate retry delay with exponential backoff. 
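+    ///
+    /// With the defaults (1s initial delay, 2.0x multiplier, 30s cap) this
+    /// yields 1s, 2s, 4s, ... capped at 30s:
+    ///
+    /// ```rust,ignore
+    /// let fb = FallbackConfig::default();
+    /// assert_eq!(fb.calculate_retry_delay(0).as_millis(), 1000);
+    /// assert_eq!(fb.calculate_retry_delay(1).as_millis(), 2000);
+    /// assert_eq!(fb.calculate_retry_delay(6).as_millis(), 30_000);
+    /// ```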
+ pub fn calculate_retry_delay(&self, attempt: usize) -> std::time::Duration { + let delay_ms = if attempt == 0 { + self.initial_retry_delay_ms + } else { + let delay = + self.initial_retry_delay_ms as f32 * self.retry_multiplier.powi(attempt as i32); + delay.min(self.max_retry_delay_ms as f32) as u64 + }; + std::time::Duration::from_millis(delay_ms) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_llm_config_defaults() { + let config = LlmConfig::default(); + assert!(config.api_key.is_none()); + assert!(config.model.is_empty()); + assert!(config.endpoint.is_none()); + assert!(config.index.model.is_none()); + assert!(config.retrieval.model.is_none()); + assert_eq!(config.index.max_tokens, 200); + assert_eq!(config.retrieval.max_tokens, 100); + } + + #[test] + fn test_llm_config_builder() { + let config = LlmConfig::new("gpt-4o") + .with_api_key("sk-test") + .with_endpoint("https://api.openai.com/v1") + .with_index(SlotConfig::fast().with_model("gpt-4o-mini")); + + assert_eq!(config.model, "gpt-4o"); + assert_eq!(config.api_key, Some("sk-test".to_string())); + assert_eq!(config.index.model, Some("gpt-4o-mini".to_string())); + assert_eq!(config.index.max_tokens, 100); + } + + #[test] + fn test_resolve_model() { + let config = + LlmConfig::new("gpt-4o").with_retrieval(SlotConfig::new().with_model("gpt-4o-mini")); + + assert_eq!(config.resolve_model(&config.index), "gpt-4o"); + assert_eq!(config.resolve_model(&config.retrieval), "gpt-4o-mini"); + } + + #[test] + fn test_slot_config_fast() { + let slot = SlotConfig::fast(); + assert_eq!(slot.max_tokens, 100); + } + + #[test] + fn test_retry_delay_calculation() { + let config = RetryConfig::default(); + assert_eq!( + config.delay_for_attempt(0), + std::time::Duration::from_millis(500) + ); + assert_eq!( + config.delay_for_attempt(1), + std::time::Duration::from_millis(1000) + ); + } + + #[test] + fn test_throttle_config_defaults() { + let config = ThrottleConfig::default(); + assert_eq!(config.max_concurrent_requests, 10); + assert_eq!(config.requests_per_minute, 500); + } + + #[test] + fn test_fallback_config_defaults() { + let config = FallbackConfig::default(); + assert!(config.enabled); + assert!(!config.models.is_empty()); + } +} diff --git a/vectorless-core/vectorless-config/src/types/metrics.rs b/vectorless-core/vectorless-config/src/types/metrics.rs new file mode 100644 index 00000000..c1f4e766 --- /dev/null +++ b/vectorless-core/vectorless-config/src/types/metrics.rs @@ -0,0 +1,181 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Metrics configuration for unified observability. + +use serde::{Deserialize, Serialize}; + +/// Unified metrics configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MetricsConfig { + /// Enable metrics collection. + #[serde(default = "default_true")] + pub enabled: bool, + + /// Storage path for persisted metrics. + #[serde(default = "default_storage_path")] + pub storage_path: String, + + /// Retention period in days. + #[serde(default = "default_retention_days")] + pub retention_days: usize, + + /// LLM metrics configuration. + #[serde(default)] + pub llm: LlmMetricsConfig, + + /// Retrieval metrics configuration. 
+ #[serde(default)] + pub retrieval: RetrievalMetricsConfig, +} + +fn default_storage_path() -> String { + "./workspace/metrics".to_string() +} + +fn default_retention_days() -> usize { + 30 +} + +fn default_true() -> bool { + true +} + +impl Default for MetricsConfig { + fn default() -> Self { + Self { + enabled: default_true(), + storage_path: default_storage_path(), + retention_days: default_retention_days(), + llm: LlmMetricsConfig::default(), + retrieval: RetrievalMetricsConfig::default(), + } + } +} + +impl MetricsConfig { + /// Create a new metrics config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Disable metrics collection. + pub fn disabled() -> Self { + Self { + enabled: false, + ..Self::default() + } + } +} + +/// LLM-specific metrics configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct LlmMetricsConfig { + /// Track token usage. + #[serde(default = "default_true")] + pub track_tokens: bool, + + /// Track latency. + #[serde(default = "default_true")] + pub track_latency: bool, + + /// Track estimated cost. + #[serde(default = "default_true")] + pub track_cost: bool, + + /// Cost per 1K input tokens (in USD). + #[serde(default = "default_cost_per_1k_input")] + pub cost_per_1k_input_tokens: f64, + + /// Cost per 1K output tokens (in USD). + #[serde(default = "default_cost_per_1k_output")] + pub cost_per_1k_output_tokens: f64, +} + +fn default_cost_per_1k_input() -> f64 { + 0.00015 // gpt-4o-mini +} + +fn default_cost_per_1k_output() -> f64 { + 0.0006 // gpt-4o-mini +} + +impl Default for LlmMetricsConfig { + fn default() -> Self { + Self { + track_tokens: default_true(), + track_latency: default_true(), + track_cost: default_true(), + cost_per_1k_input_tokens: default_cost_per_1k_input(), + cost_per_1k_output_tokens: default_cost_per_1k_output(), + } + } +} + +impl LlmMetricsConfig { + /// Calculate cost for given tokens. + pub fn calculate_cost(&self, input_tokens: u64, output_tokens: u64) -> f64 { + (input_tokens as f64 / 1000.0) * self.cost_per_1k_input_tokens + + (output_tokens as f64 / 1000.0) * self.cost_per_1k_output_tokens + } +} + +/// Retrieval-specific metrics configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RetrievalMetricsConfig { + /// Track search paths. + #[serde(default = "default_true")] + pub track_paths: bool, + + /// Track relevance scores. + #[serde(default = "default_true")] + pub track_scores: bool, + + /// Track iterations. + #[serde(default = "default_true")] + pub track_iterations: bool, + + /// Track cache hits/misses. 
+ #[serde(default = "default_true")] + pub track_cache: bool, +} + +impl Default for RetrievalMetricsConfig { + fn default() -> Self { + Self { + track_paths: default_true(), + track_scores: default_true(), + track_iterations: default_true(), + track_cache: default_true(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_metrics_config_defaults() { + let config = MetricsConfig::default(); + assert!(config.enabled); + assert_eq!(config.retention_days, 30); + } + + #[test] + fn test_llm_cost_calculation() { + let config = LlmMetricsConfig::default(); + + // 1000 input + 500 output tokens + let cost = config.calculate_cost(1000, 500); + + // 1 * 0.00015 + 0.5 * 0.0006 = 0.00015 + 0.0003 = 0.00045 + assert!((cost - 0.00045).abs() < 0.000001); + } + + #[test] + fn test_disabled_metrics() { + let config = MetricsConfig::disabled(); + assert!(!config.enabled); + } +} diff --git a/vectorless-core/vectorless-config/src/types/mod.rs b/vectorless-core/vectorless-config/src/types/mod.rs new file mode 100644 index 00000000..377842c6 --- /dev/null +++ b/vectorless-core/vectorless-config/src/types/mod.rs @@ -0,0 +1,372 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Configuration type definitions. + +mod graph; +mod indexer; +mod llm_pool; +mod metrics; +mod retrieval; +mod storage; + +use serde::{Deserialize, Serialize}; + +pub use indexer::IndexerConfig; +pub use llm_pool::{ + FallbackBehavior, FallbackConfig, LlmConfig, OnAllFailedBehavior, RetryConfig, SlotConfig, + ThrottleConfig, +}; +pub use metrics::{LlmMetricsConfig, MetricsConfig, RetrievalMetricsConfig}; +pub use retrieval::RetrievalConfig; +pub use storage::{CompressionAlgorithm, StorageConfig}; +pub use graph::DocumentGraphConfig; + +/// Main configuration for vectorless. +/// +/// Users typically configure via [`EngineBuilder`](vectorless_engine::EngineBuilder): +/// +/// ```rust,no_run +/// use vectorless::client::EngineBuilder; +/// +/// # async fn example() -> Result<(), vectorless::BuildError> { +/// let engine = EngineBuilder::new() +/// .with_key("sk-...") +/// .with_model("gpt-4o") +/// .with_endpoint("https://api.openai.com/v1") +/// .build() +/// .await?; +/// # Ok(()) +/// # } +/// ``` +/// +/// Advanced users can construct this programmatically: +/// +/// ```rust,ignore +/// use vectorless::config::{Config, LlmConfig, SlotConfig}; +/// +/// let config = Config::new().with_llm( +/// LlmConfig::new("gpt-4o") +/// .with_api_key("sk-...") +/// .with_endpoint("https://api.openai.com/v1") +/// .with_index(SlotConfig::fast().with_model("gpt-4o-mini")) +/// ); +/// ``` +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct Config { + /// LLM configuration (model, credentials, retry, throttle, fallback). + #[serde(default)] + pub llm: LlmConfig, + + /// Metrics configuration. + #[serde(default)] + pub metrics: MetricsConfig, + + /// Indexer configuration. + #[serde(default)] + pub indexer: IndexerConfig, + + /// Retrieval strategy configuration (search, content aggregation, etc.). + #[serde(default)] + pub retrieval: RetrievalConfig, + + /// Storage configuration. + #[serde(default)] + pub storage: StorageConfig, + + /// Document graph configuration. 
+    #[serde(default)]
+    pub graph: DocumentGraphConfig,
+}
+
+impl Default for Config {
+    fn default() -> Self {
+        Self {
+            llm: LlmConfig::default(),
+            metrics: MetricsConfig::default(),
+            indexer: IndexerConfig::default(),
+            retrieval: RetrievalConfig::default(),
+            storage: StorageConfig::default(),
+            graph: DocumentGraphConfig::default(),
+        }
+    }
+}
+
+impl Config {
+    /// Create a new configuration with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the LLM configuration.
+    pub fn with_llm(mut self, llm: LlmConfig) -> Self {
+        self.llm = llm;
+        self
+    }
+
+    /// Set the metrics configuration.
+    pub fn with_metrics(mut self, metrics: MetricsConfig) -> Self {
+        self.metrics = metrics;
+        self
+    }
+
+    /// Set the indexer configuration.
+    pub fn with_indexer(mut self, indexer: IndexerConfig) -> Self {
+        self.indexer = indexer;
+        self
+    }
+
+    /// Set the retrieval configuration.
+    pub fn with_retrieval(mut self, retrieval: RetrievalConfig) -> Self {
+        self.retrieval = retrieval;
+        self
+    }
+
+    /// Set the storage configuration.
+    pub fn with_storage(mut self, storage: StorageConfig) -> Self {
+        self.storage = storage;
+        self
+    }
+
+    /// Set the document graph configuration.
+    pub fn with_graph(mut self, graph: DocumentGraphConfig) -> Self {
+        self.graph = graph;
+        self
+    }
+
+    /// Validate the configuration.
+    pub fn validate(&self) -> Result<(), ConfigValidationError> {
+        let mut errors = Vec::new();
+
+        // Validate indexer
+        if self.indexer.subsection_threshold == 0 {
+            errors.push(ValidationError::error(
+                "indexer.subsection_threshold",
+                "Subsection threshold must be greater than 0",
+            ));
+        }
+
+        // Validate LLM slot tokens
+        if self.llm.index.max_tokens == 0 {
+            errors.push(ValidationError::error(
+                "llm.index.max_tokens",
+                "Index max tokens must be greater than 0",
+            ));
+        }
+
+        if self.llm.retrieval.max_tokens == 0 {
+            errors.push(ValidationError::error(
+                "llm.retrieval.max_tokens",
+                "Retrieval max tokens must be greater than 0",
+            ));
+        }
+
+        // Validate retrieval
+        if self.retrieval.top_k == 0 {
+            errors.push(ValidationError::error(
+                "retrieval.top_k",
+                "Top K must be greater than 0",
+            ));
+        }
+
+        // Validate throttle
+        if self.llm.throttle.max_concurrent_requests == 0 {
+            errors.push(ValidationError::error(
+                "llm.throttle.max_concurrent_requests",
+                "Max concurrent requests must be greater than 0",
+            ));
+        }
+
+        // Validate graph
+        if self.graph.min_keyword_jaccard < 0.0 || self.graph.min_keyword_jaccard > 1.0 {
+            errors.push(ValidationError::error(
+                "graph.min_keyword_jaccard",
+                "Must be between 0.0 and 1.0",
+            ));
+        }
+        if self.graph.max_edges_per_node == 0 {
+            errors.push(ValidationError::error(
+                "graph.max_edges_per_node",
+                "Must be greater than 0",
+            ));
+        }
+
+        // Validate fallback
+        if self.llm.fallback.enabled && self.llm.fallback.models.is_empty() {
+            errors.push(ValidationError::warning(
+                "llm.fallback.models",
+                "Fallback enabled but no fallback models configured",
+            ));
+        }
+
+        if errors.is_empty() {
+            Ok(())
+        } else {
+            Err(ConfigValidationError { errors })
+        }
+    }
+}
+
+/// Configuration validation error.
+#[derive(Debug, Clone)]
+pub struct ConfigValidationError {
+    /// Validation errors.
+    pub errors: Vec<ValidationError>,
+}
+
+impl std::fmt::Display for ConfigValidationError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Configuration validation failed with {} error(s)", self.errors.len())
+    }
+}
+
+impl std::error::Error for ConfigValidationError {}
+
+/// A single validation error.
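+///
+/// Built via [`ValidationError::error`], [`ValidationError::warning`], or
+/// [`ValidationError::info`]; a typical construction:
+///
+/// ```rust,ignore
+/// let err = ValidationError::error("retrieval.top_k", "Top K must be greater than 0")
+///     .with_expected(">= 1")
+///     .with_actual("0");
+/// assert!(format!("{}", err).contains("ERROR"));
+/// ```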
+#[derive(Debug, Clone)]
+pub struct ValidationError {
+    /// Field path (e.g., "retrieval.content.token_budget").
+    pub path: String,
+
+    /// Error message.
+    pub message: String,
+
+    /// Expected value/range.
+    pub expected: Option<String>,
+
+    /// Actual value.
+    pub actual: Option<String>,
+
+    /// Severity level.
+    pub severity: Severity,
+}
+
+impl ValidationError {
+    /// Create an error-level validation error.
+    pub fn error(path: impl Into<String>, message: impl Into<String>) -> Self {
+        Self {
+            path: path.into(),
+            message: message.into(),
+            expected: None,
+            actual: None,
+            severity: Severity::Error,
+        }
+    }
+
+    /// Create a warning-level validation error.
+    pub fn warning(path: impl Into<String>, message: impl Into<String>) -> Self {
+        Self {
+            path: path.into(),
+            message: message.into(),
+            expected: None,
+            actual: None,
+            severity: Severity::Warning,
+        }
+    }
+
+    /// Create an info-level validation error.
+    pub fn info(path: impl Into<String>, message: impl Into<String>) -> Self {
+        Self {
+            path: path.into(),
+            message: message.into(),
+            expected: None,
+            actual: None,
+            severity: Severity::Info,
+        }
+    }
+
+    /// Set the expected value.
+    pub fn with_expected(mut self, expected: impl Into<String>) -> Self {
+        self.expected = Some(expected.into());
+        self
+    }
+
+    /// Set the actual value.
+    pub fn with_actual(mut self, actual: impl Into<String>) -> Self {
+        self.actual = Some(actual.into());
+        self
+    }
+}
+
+impl std::fmt::Display for ValidationError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let severity = match self.severity {
+            Severity::Error => "ERROR",
+            Severity::Warning => "WARNING",
+            Severity::Info => "INFO",
+        };
+        write!(f, "[{}] {}: {}", severity, self.path, self.message)?;
+        if let Some(ref expected) = self.expected {
+            write!(f, " (expected: {})", expected)?;
+        }
+        if let Some(ref actual) = self.actual {
+            write!(f, " (actual: {})", actual)?;
+        }
+        Ok(())
+    }
+}
+
+/// Validation severity level.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Severity {
+    /// Error - must fix.
+    Error,
+    /// Warning - should fix.
+    Warning,
+    /// Info - suggestion.
+ Info, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_config_defaults() { + let config = Config::default(); + assert!(config.llm.model.is_empty()); + assert!(config.llm.index.model.is_none()); + assert_eq!(config.retrieval.top_k, 3); + assert_eq!(config.indexer.subsection_threshold, 300); + assert!(config.metrics.enabled); + } + + #[test] + fn test_llm_config_defaults() { + let config = LlmConfig::default(); + assert!(config.index.model.is_none()); + assert!(config.retrieval.model.is_none()); + assert_eq!(config.retry.max_attempts, 3); + assert_eq!(config.throttle.max_concurrent_requests, 10); + } + + #[test] + fn test_config_validation_success() { + let config = Config::default(); + assert!(config.validate().is_ok()); + } + + #[test] + fn test_config_validation_errors() { + let mut config = Config::default(); + config.retrieval.top_k = 0; + + let result = config.validate(); + assert!(result.is_err()); + + let err = result.unwrap_err(); + assert!(!err.errors.is_empty()); + } + + #[test] + fn test_validation_error_display() { + let err = ValidationError::error("test.field", "Invalid value") + .with_expected(">= 1") + .with_actual("0"); + + let display = format!("{}", err); + assert!(display.contains("ERROR")); + assert!(display.contains("test.field")); + assert!(display.contains("expected")); + } +} diff --git a/vectorless-core/vectorless-config/src/types/retrieval.rs b/vectorless-core/vectorless-config/src/types/retrieval.rs new file mode 100644 index 00000000..c4300987 --- /dev/null +++ b/vectorless-core/vectorless-config/src/types/retrieval.rs @@ -0,0 +1,170 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Retrieval strategy configuration types. +//! +//! LLM configuration (model, api_key, endpoint) is managed centrally +//! in [`LlmConfig`](super::LlmConfig). This module only contains +//! retrieval strategy parameters. + +use serde::{Deserialize, Serialize}; + +use super::storage::{CacheConfig, StrategyConfig, SufficiencyConfig}; + +/// Retrieval strategy configuration. +/// +/// Controls how documents are searched and retrieved, independent +/// of which LLM model is used for navigation. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RetrievalConfig { + /// Number of top-k results to return. + #[serde(default = "default_top_k")] + pub top_k: usize, + + /// Search algorithm configuration. + #[serde(default)] + pub search: SearchConfig, + + /// Sufficiency checker configuration. + #[serde(default)] + pub sufficiency: SufficiencyConfig, + + /// Cache configuration. + #[serde(default)] + pub cache: CacheConfig, + + /// Strategy-specific configuration. + #[serde(default)] + pub strategy: StrategyConfig, +} + +fn default_top_k() -> usize { + 3 +} + +impl Default for RetrievalConfig { + fn default() -> Self { + Self { + top_k: default_top_k(), + search: SearchConfig::default(), + sufficiency: SufficiencyConfig::default(), + cache: CacheConfig::default(), + strategy: StrategyConfig::default(), + } + } +} + +impl RetrievalConfig { + /// Create a new retrieval config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the top_k. + pub fn with_top_k(mut self, top_k: usize) -> Self { + self.top_k = top_k; + self + } +} + +/// Search algorithm configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SearchConfig { + /// Number of top-k results to return. + #[serde(default = "default_search_top_k")] + pub top_k: usize, + + /// Beam width for multi-path search. 
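+    /// With the default of 3, the search keeps the three highest-scoring
+    /// candidate paths alive at each expansion step.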
+ #[serde(default = "default_beam_width")] + pub beam_width: usize, + + /// Maximum iterations for search algorithms. + #[serde(default = "default_max_iterations")] + pub max_iterations: usize, + + /// Minimum score to include a path. + #[serde(default = "default_min_score")] + pub min_score: f32, + + /// Fallback chain: algorithms tried in order until min_score is met. + /// Options: "beam", "mcts", "pure_pilot". + /// Default: ["beam", "mcts", "pure_pilot"] + #[serde(default = "default_fallback_chain")] + pub fallback_chain: Vec, +} + +fn default_search_top_k() -> usize { + 5 +} + +fn default_beam_width() -> usize { + 3 +} + +fn default_max_iterations() -> usize { + 10 +} + +fn default_min_score() -> f32 { + 0.1 +} +fn default_fallback_chain() -> Vec { + vec!["beam".into(), "mcts".into(), "pure_pilot".into()] +} + +impl Default for SearchConfig { + fn default() -> Self { + Self { + top_k: default_search_top_k(), + beam_width: default_beam_width(), + max_iterations: default_max_iterations(), + min_score: default_min_score(), + fallback_chain: default_fallback_chain(), + } + } +} + +impl SearchConfig { + /// Create new search config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the top_k. + pub fn with_top_k(mut self, top_k: usize) -> Self { + self.top_k = top_k; + self + } + + /// Set the beam width. + pub fn with_beam_width(mut self, width: usize) -> Self { + self.beam_width = width; + self + } + + /// Set the max iterations. + pub fn with_max_iterations(mut self, max: usize) -> Self { + self.max_iterations = max; + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_retrieval_config_defaults() { + let config = RetrievalConfig::default(); + assert_eq!(config.top_k, 3); + assert_eq!(config.search.top_k, 5); + } + + #[test] + fn test_search_config_defaults() { + let config = SearchConfig::default(); + assert_eq!(config.top_k, 5); + assert_eq!(config.beam_width, 3); + assert_eq!(config.max_iterations, 10); + } +} diff --git a/vectorless-core/vectorless-config/src/types/storage.rs b/vectorless-core/vectorless-config/src/types/storage.rs new file mode 100644 index 00000000..b13304ea --- /dev/null +++ b/vectorless-core/vectorless-config/src/types/storage.rs @@ -0,0 +1,742 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Storage and sufficiency configuration types. + +use serde::{Deserialize, Serialize}; +use std::path::PathBuf; + +/// Storage configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StorageConfig { + /// Workspace directory for persisted documents. + #[serde(default = "default_workspace_dir")] + pub workspace_dir: PathBuf, + + /// LRU cache size (number of documents). + #[serde(default = "default_cache_size")] + pub cache_size: usize, + + /// Enable atomic writes (write to temp file, then rename). + /// This prevents data corruption on crash. + #[serde(default = "default_atomic_writes")] + pub atomic_writes: bool, + + /// Enable file locking for multi-process safety. + #[serde(default = "default_file_lock")] + pub file_lock: bool, + + /// Enable checksum verification for data integrity. + #[serde(default = "default_checksum_enabled")] + pub checksum_enabled: bool, + + /// Enable compression for stored documents. + #[serde(default)] + pub compression: CompressionConfig, + + /// Directory for pipeline checkpoints (derived from `workspace_dir`). 
+    #[serde(skip)]
+    pub checkpoint_dir: PathBuf,
+}
+
+fn default_workspace_dir() -> PathBuf {
+    default_workspace_path_for_cwd()
+}
+
+/// Compute the default workspace path for the current working directory.
+///
+/// Returns a platform-appropriate path:
+/// - **Linux/macOS**: `~/.vectorless/workspaces/{cwd_hash}/`
+/// - **Windows**: `%LOCALAPPDATA%\.vectorless\workspaces\{cwd_hash}\` (falls back to `%APPDATA%`)
+///
+/// where `cwd_hash` is a 12-hex-char hash derived from the current working
+/// directory. This ensures different projects automatically get isolated
+/// workspaces.
+///
+/// # Environment variable resolution order
+///
+/// | Platform | Primary         | Fallback    | Last resort |
+/// |----------|-----------------|-------------|-------------|
+/// | Unix     | `$HOME`         | —           | `"."`       |
+/// | Windows  | `%LOCALAPPDATA%`| `%APPDATA%` | `"."`       |
+pub fn default_workspace_path_for_cwd() -> PathBuf {
+    use std::collections::hash_map::DefaultHasher;
+    use std::hash::{Hash, Hasher};
+
+    let base_dir = if cfg!(windows) {
+        // Windows: prefer %LOCALAPPDATA% (e.g. C:\Users\xxx\AppData\Local)
+        // then %APPDATA% (e.g. C:\Users\xxx\AppData\Roaming)
+        std::env::var("LOCALAPPDATA")
+            .or_else(|_| std::env::var("APPDATA"))
+            .map(PathBuf::from)
+            .unwrap_or_else(|_| PathBuf::from("."))
+    } else {
+        // Unix (Linux, macOS): use $HOME
+        std::env::var("HOME")
+            .map(PathBuf::from)
+            .unwrap_or_else(|_| PathBuf::from("."))
+    };
+
+    let cwd = std::env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
+
+    let mut hasher = DefaultHasher::new();
+    cwd.to_string_lossy().hash(&mut hasher);
+    let hash = format!("{:012x}", hasher.finish());
+
+    base_dir.join(".vectorless").join("workspaces").join(hash)
+}
+
+fn default_cache_size() -> usize {
+    100
+}
+
+fn default_atomic_writes() -> bool {
+    true
+}
+
+fn default_file_lock() -> bool {
+    true
+}
+
+fn default_checksum_enabled() -> bool {
+    true
+}
+
+impl Default for StorageConfig {
+    fn default() -> Self {
+        let workspace_dir = default_workspace_dir();
+        let checkpoint_dir = workspace_dir.join("checkpoints");
+        Self {
+            workspace_dir,
+            cache_size: default_cache_size(),
+            atomic_writes: default_atomic_writes(),
+            file_lock: default_file_lock(),
+            checksum_enabled: default_checksum_enabled(),
+            compression: CompressionConfig::default(),
+            checkpoint_dir,
+        }
+    }
+}
+
+impl StorageConfig {
+    /// Create new storage config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the workspace directory (re-derives `checkpoint_dir`).
+    pub fn with_workspace_dir(mut self, dir: impl Into<PathBuf>) -> Self {
+        self.workspace_dir = dir.into();
+        self.checkpoint_dir = self.workspace_dir.join("checkpoints");
+        self
+    }
+
+    /// Set the cache size.
+    pub fn with_cache_size(mut self, size: usize) -> Self {
+        self.cache_size = size;
+        self
+    }
+
+    /// Enable or disable atomic writes.
+    pub fn with_atomic_writes(mut self, enabled: bool) -> Self {
+        self.atomic_writes = enabled;
+        self
+    }
+
+    /// Enable or disable file locking.
+    pub fn with_file_lock(mut self, enabled: bool) -> Self {
+        self.file_lock = enabled;
+        self
+    }
+
+    /// Enable or disable checksum verification.
+    pub fn with_checksum(mut self, enabled: bool) -> Self {
+        self.checksum_enabled = enabled;
+        self
+    }
+
+    /// Set compression configuration.
+    pub fn with_compression(mut self, compression: CompressionConfig) -> Self {
+        self.compression = compression;
+        self
+    }
+}
+
+/// Compression configuration.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CompressionConfig {
+    /// Enable compression.
+    #[serde(default = "default_compression_enabled")]
+    pub enabled: bool,
+
+    /// Compression algorithm.
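+    ///
+    /// Illustrative selection of Zstandard at the maximum level:
+    ///
+    /// ```rust,ignore
+    /// let c = CompressionConfig::new()
+    ///     .with_algorithm(CompressionAlgorithm::Zstd)
+    ///     .with_level(9);
+    /// ```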
+ #[serde(default = "default_compression_algorithm")] + pub algorithm: CompressionAlgorithm, + + /// Compression level (1-9, higher = better compression but slower). + #[serde(default = "default_compression_level")] + pub level: u32, +} + +fn default_compression_enabled() -> bool { + false +} + +fn default_compression_algorithm() -> CompressionAlgorithm { + CompressionAlgorithm::Gzip +} + +fn default_compression_level() -> u32 { + 6 +} + +impl Default for CompressionConfig { + fn default() -> Self { + Self { + enabled: default_compression_enabled(), + algorithm: default_compression_algorithm(), + level: default_compression_level(), + } + } +} + +impl CompressionConfig { + /// Create new compression config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Enable or disable compression. + pub fn with_enabled(mut self, enabled: bool) -> Self { + self.enabled = enabled; + self + } + + /// Set the compression algorithm. + pub fn with_algorithm(mut self, algorithm: CompressionAlgorithm) -> Self { + self.algorithm = algorithm; + self + } + + /// Set the compression level. + pub fn with_level(mut self, level: u32) -> Self { + self.level = level.clamp(1, 9); + self + } +} + +/// Compression algorithm. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum CompressionAlgorithm { + /// Gzip compression. + Gzip, + /// Zstandard compression. + Zstd, +} + +/// Sufficiency checker configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SufficiencyConfig { + /// Minimum tokens for sufficiency. + #[serde(default = "default_min_tokens")] + pub min_tokens: usize, + + /// Target tokens for full sufficiency. + #[serde(default = "default_target_tokens")] + pub target_tokens: usize, + + /// Maximum tokens before stopping. + #[serde(default = "default_max_tokens")] + pub max_tokens: usize, + + /// Minimum content length (characters). + #[serde(default = "default_min_content_length")] + pub min_content_length: usize, + + /// Confidence threshold for LLM judge. + #[serde(default = "default_confidence_threshold")] + pub confidence_threshold: f32, +} + +fn default_min_tokens() -> usize { + 500 +} + +fn default_target_tokens() -> usize { + 2000 +} + +fn default_max_tokens() -> usize { + 4000 +} + +fn default_min_content_length() -> usize { + 200 +} + +fn default_confidence_threshold() -> f32 { + 0.7 +} + +impl Default for SufficiencyConfig { + fn default() -> Self { + Self { + min_tokens: default_min_tokens(), + target_tokens: default_target_tokens(), + max_tokens: default_max_tokens(), + min_content_length: default_min_content_length(), + confidence_threshold: default_confidence_threshold(), + } + } +} + +impl SufficiencyConfig { + /// Create new sufficiency config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the minimum tokens. + pub fn with_min_tokens(mut self, tokens: usize) -> Self { + self.min_tokens = tokens; + self + } + + /// Set the target tokens. + pub fn with_target_tokens(mut self, tokens: usize) -> Self { + self.target_tokens = tokens; + self + } + + /// Set the maximum tokens. + pub fn with_max_tokens(mut self, tokens: usize) -> Self { + self.max_tokens = tokens; + self + } + + /// Set the confidence threshold. + pub fn with_confidence_threshold(mut self, threshold: f32) -> Self { + self.confidence_threshold = threshold; + self + } +} + +/// Cache configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CacheConfig { + /// Maximum number of cache entries. 
+ #[serde(default = "default_max_entries")] + pub max_entries: usize, + + /// Time-to-live for cache entries (seconds). + #[serde(default = "default_ttl_secs")] + pub ttl_secs: u64, +} + +fn default_max_entries() -> usize { + 1000 +} + +fn default_ttl_secs() -> u64 { + 3600 +} + +impl Default for CacheConfig { + fn default() -> Self { + Self { + max_entries: default_max_entries(), + ttl_secs: default_ttl_secs(), + } + } +} + +impl CacheConfig { + /// Create new cache config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the maximum entries. + pub fn with_max_entries(mut self, max: usize) -> Self { + self.max_entries = max; + self + } + + /// Set the TTL in seconds. + pub fn with_ttl_secs(mut self, secs: u64) -> Self { + self.ttl_secs = secs; + self + } +} + +/// Strategy-specific configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StrategyConfig { + /// MCTS exploration weight (sqrt(2) ≈ 1.414). + #[serde(default = "default_exploration_weight")] + pub exploration_weight: f32, + + /// Semantic similarity threshold. + #[serde(default = "default_similarity_threshold")] + pub similarity_threshold: f32, + + /// High similarity threshold for "answer" decision. + #[serde(default = "default_high_similarity_threshold")] + pub high_similarity_threshold: f32, + + /// Low similarity threshold for "explore" decision. + #[serde(default = "default_low_similarity_threshold")] + pub low_similarity_threshold: f32, + + /// Hybrid strategy configuration (BM25 + LLM refinement). + #[serde(default)] + pub hybrid: HybridStrategyConfig, + + /// Cross-document strategy configuration. + #[serde(default)] + pub cross_document: CrossDocumentStrategyConfig, + + /// Page-range strategy configuration. + #[serde(default)] + pub page_range: PageRangeStrategyConfig, +} + +/// Hybrid strategy configuration (BM25 pre-filter + LLM refinement). +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HybridStrategyConfig { + /// Enable hybrid strategy. + #[serde(default = "default_true")] + pub enabled: bool, + + /// BM25 pre-filter: keep top N% of candidates. + #[serde(default = "default_pre_filter_ratio")] + pub pre_filter_ratio: f32, + + /// Minimum candidates to pass to LLM. + #[serde(default = "default_min_candidates")] + pub min_candidates: usize, + + /// Maximum candidates for LLM refinement. + #[serde(default = "default_max_candidates")] + pub max_candidates: usize, + + /// BM25 score for auto-accept (skip LLM). + #[serde(default = "default_auto_accept_threshold")] + pub auto_accept_threshold: f32, + + /// BM25 score for auto-reject (skip LLM). + #[serde(default = "default_auto_reject_threshold")] + pub auto_reject_threshold: f32, + + /// Weight for BM25 score in final scoring. + #[serde(default = "default_bm25_weight")] + pub bm25_weight: f32, + + /// Weight for LLM score in final scoring. 
+ #[serde(default = "default_llm_weight")] + pub llm_weight: f32, +} + +fn default_true() -> bool { + true +} +fn default_pre_filter_ratio() -> f32 { + 0.3 +} +fn default_min_candidates() -> usize { + 2 +} +fn default_max_candidates() -> usize { + 5 +} +fn default_auto_accept_threshold() -> f32 { + 0.85 +} +fn default_auto_reject_threshold() -> f32 { + 0.15 +} +fn default_bm25_weight() -> f32 { + 0.4 +} +fn default_llm_weight() -> f32 { + 0.6 +} + +impl Default for HybridStrategyConfig { + fn default() -> Self { + Self { + enabled: true, + pre_filter_ratio: default_pre_filter_ratio(), + min_candidates: default_min_candidates(), + max_candidates: default_max_candidates(), + auto_accept_threshold: default_auto_accept_threshold(), + auto_reject_threshold: default_auto_reject_threshold(), + bm25_weight: default_bm25_weight(), + llm_weight: default_llm_weight(), + } + } +} + +/// Cross-document strategy configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CrossDocumentStrategyConfig { + /// Enable cross-document strategy. + #[serde(default = "default_true")] + pub enabled: bool, + + /// Maximum documents to search. + #[serde(default = "default_max_documents")] + pub max_documents: usize, + + /// Maximum results per document. + #[serde(default = "default_max_results_per_doc")] + pub max_results_per_doc: usize, + + /// Maximum total results. + #[serde(default = "default_max_total_results")] + pub max_total_results: usize, + + /// Minimum score threshold. + #[serde(default = "default_min_score")] + pub min_score: f32, + + /// Merge strategy: TopK, BestPerDocument, WeightedByRelevance. + #[serde(default = "default_merge_strategy")] + pub merge_strategy: String, + + /// Search documents in parallel. + #[serde(default = "default_true")] + pub parallel_search: bool, +} + +fn default_max_documents() -> usize { + 10 +} +fn default_max_results_per_doc() -> usize { + 3 +} +fn default_max_total_results() -> usize { + 10 +} +fn default_min_score() -> f32 { + 0.3 +} +fn default_merge_strategy() -> String { + "TopK".to_string() +} + +impl Default for CrossDocumentStrategyConfig { + fn default() -> Self { + Self { + enabled: true, + max_documents: default_max_documents(), + max_results_per_doc: default_max_results_per_doc(), + max_total_results: default_max_total_results(), + min_score: default_min_score(), + merge_strategy: default_merge_strategy(), + parallel_search: true, + } + } +} + +/// Page-range strategy configuration. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PageRangeStrategyConfig { + /// Enable page-range strategy. + #[serde(default = "default_true")] + pub enabled: bool, + + /// Include nodes that span across the boundary. + #[serde(default = "default_true")] + pub include_boundary_nodes: bool, + + /// Expand range by N pages for context. + #[serde(default)] + pub expand_context_pages: usize, + + /// Minimum overlap ratio for node inclusion. 
+ #[serde(default = "default_min_overlap_ratio")] + pub min_overlap_ratio: f32, +} + +fn default_min_overlap_ratio() -> f32 { + 0.1 +} + +impl Default for PageRangeStrategyConfig { + fn default() -> Self { + Self { + enabled: true, + include_boundary_nodes: true, + expand_context_pages: 0, + min_overlap_ratio: default_min_overlap_ratio(), + } + } +} + +fn default_exploration_weight() -> f32 { + 1.414 +} + +fn default_similarity_threshold() -> f32 { + 0.5 +} + +fn default_high_similarity_threshold() -> f32 { + 0.8 +} + +fn default_low_similarity_threshold() -> f32 { + 0.3 +} + +impl Default for StrategyConfig { + fn default() -> Self { + Self { + exploration_weight: default_exploration_weight(), + similarity_threshold: default_similarity_threshold(), + high_similarity_threshold: default_high_similarity_threshold(), + low_similarity_threshold: default_low_similarity_threshold(), + hybrid: HybridStrategyConfig::default(), + cross_document: CrossDocumentStrategyConfig::default(), + page_range: PageRangeStrategyConfig::default(), + } + } +} + +impl StrategyConfig { + /// Create new strategy config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the exploration weight. + pub fn with_exploration_weight(mut self, weight: f32) -> Self { + self.exploration_weight = weight; + self + } + + /// Set the similarity threshold. + pub fn with_similarity_threshold(mut self, threshold: f32) -> Self { + self.similarity_threshold = threshold; + self + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_storage_config_defaults() { + let config = StorageConfig::default(); + // Default workspace should be under .vectorless/workspaces/ (Unix) + // or vectorless/workspaces/ (Windows via AppData) + let path_str = config.workspace_dir.to_string_lossy(); + if cfg!(windows) { + assert!( + path_str.contains("vectorless"), + "expected ...\\vectorless\\workspaces\\..., got {:?}", + config.workspace_dir, + ); + } else { + assert!( + path_str.contains(".vectorless"), + "expected ~/.vectorless/workspaces/..., got {:?}", + config.workspace_dir, + ); + } + assert_eq!(config.cache_size, 100); + assert!(config.atomic_writes); + assert!(config.file_lock); + assert!(config.checksum_enabled); + assert!(!config.compression.enabled); + } + + #[test] + fn test_storage_config_builders() { + let config = StorageConfig::new() + .with_workspace_dir("/data/workspace") + .with_cache_size(200) + .with_atomic_writes(false) + .with_file_lock(false) + .with_checksum(false); + + assert_eq!(config.workspace_dir, PathBuf::from("/data/workspace")); + assert_eq!(config.cache_size, 200); + assert!(!config.atomic_writes); + assert!(!config.file_lock); + assert!(!config.checksum_enabled); + } + + #[test] + fn test_compression_config_defaults() { + let config = CompressionConfig::default(); + assert!(!config.enabled); + assert_eq!(config.algorithm, CompressionAlgorithm::Gzip); + assert_eq!(config.level, 6); + } + + #[test] + fn test_compression_config_level_clamp() { + let config = CompressionConfig::new().with_level(15); + assert_eq!(config.level, 9); // clamped to max + + let config = CompressionConfig::new().with_level(0); + assert_eq!(config.level, 1); // clamped to min + } + + #[test] + fn test_sufficiency_config_defaults() { + let config = SufficiencyConfig::default(); + assert_eq!(config.min_tokens, 500); + assert_eq!(config.target_tokens, 2000); + assert_eq!(config.max_tokens, 4000); + } + + #[test] + fn test_cache_config_defaults() { + let config = CacheConfig::default(); + 
assert_eq!(config.max_entries, 1000);
+        assert_eq!(config.ttl_secs, 3600);
+    }
+
+    #[test]
+    fn test_strategy_config_defaults() {
+        let config = StrategyConfig::default();
+        assert!((config.exploration_weight - 1.414).abs() < 0.001);
+        assert_eq!(config.similarity_threshold, 0.5);
+        assert!(config.hybrid.enabled);
+        assert!(config.cross_document.enabled);
+        assert!(config.page_range.enabled);
+    }
+
+    #[test]
+    fn test_hybrid_strategy_config_defaults() {
+        let config = HybridStrategyConfig::default();
+        assert!(config.enabled);
+        assert!((config.pre_filter_ratio - 0.3).abs() < f32::EPSILON);
+        assert_eq!(config.min_candidates, 2);
+        assert_eq!(config.max_candidates, 5);
+        assert!((config.auto_accept_threshold - 0.85).abs() < f32::EPSILON);
+    }
+
+    #[test]
+    fn test_cross_document_strategy_config_defaults() {
+        let config = CrossDocumentStrategyConfig::default();
+        assert!(config.enabled);
+        assert_eq!(config.max_documents, 10);
+        assert_eq!(config.max_results_per_doc, 3);
+        assert_eq!(config.merge_strategy, "TopK");
+        assert!(config.parallel_search);
+    }
+
+    #[test]
+    fn test_page_range_strategy_config_defaults() {
+        let config = PageRangeStrategyConfig::default();
+        assert!(config.enabled);
+        assert!(config.include_boundary_nodes);
+        assert_eq!(config.expand_context_pages, 0);
+        assert!((config.min_overlap_ratio - 0.1).abs() < f32::EPSILON);
+    }
+}
diff --git a/vectorless-core/vectorless-config/src/validator.rs b/vectorless-core/vectorless-config/src/validator.rs
new file mode 100644
index 00000000..c3f55422
--- /dev/null
+++ b/vectorless-core/vectorless-config/src/validator.rs
@@ -0,0 +1,323 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Configuration validation.
+//!
+//! This module provides comprehensive validation for configuration values,
+//! including range checks, consistency checks, and dependency validation.
+
+use super::types::{Config, ConfigValidationError, Severity, ValidationError};
+
+/// Configuration validator.
+#[derive(Debug, Default)]
+pub struct ConfigValidator {
+    /// Validation rules to apply.
+    rules: Vec<Box<dyn ValidationRule>>,
+}
+
+impl ConfigValidator {
+    /// Create a new validator with default rules.
+    pub fn new() -> Self {
+        Self {
+            rules: vec![
+                Box::new(RangeValidator),
+                Box::new(ConsistencyValidator),
+                Box::new(DependencyValidator),
+            ],
+        }
+    }
+
+    /// Add a custom validation rule.
+    pub fn with_rule(mut self, rule: Box<dyn ValidationRule>) -> Self {
+        self.rules.push(rule);
+        self
+    }
+
+    /// Validate the configuration.
+    pub fn validate(&self, config: &Config) -> Result<(), ConfigValidationError> {
+        let mut errors = Vec::new();
+
+        for rule in &self.rules {
+            rule.validate(config, &mut errors);
+        }
+
+        // Only fail on errors, not warnings or info
+        let has_errors = errors.iter().any(|e| e.severity == Severity::Error);
+
+        if has_errors {
+            Err(ConfigValidationError { errors })
+        } else {
+            Ok(())
+        }
+    }
+}
+
+/// Trait for validation rules.
+pub trait ValidationRule: std::fmt::Debug + Send + Sync {
+    /// Validate the configuration, appending errors if found.
+    fn validate(&self, config: &Config, errors: &mut Vec<ValidationError>);
+}
+
+/// Validates value ranges.
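+///
+/// Custom rules can be layered on via [`ConfigValidator::with_rule`];
+/// a minimal sketch with a hypothetical `CacheSizeRule`:
+///
+/// ```rust,ignore
+/// #[derive(Debug)]
+/// struct CacheSizeRule;
+///
+/// impl ValidationRule for CacheSizeRule {
+///     fn validate(&self, config: &Config, errors: &mut Vec<ValidationError>) {
+///         if config.storage.cache_size == 0 {
+///             errors.push(ValidationError::warning(
+///                 "storage.cache_size",
+///                 "Cache size of 0 disables the document LRU cache",
+///             ));
+///         }
+///     }
+/// }
+///
+/// let validator = ConfigValidator::new().with_rule(Box::new(CacheSizeRule));
+/// ```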
+#[derive(Debug)]
+struct RangeValidator;
+
+impl ValidationRule for RangeValidator {
+    fn validate(&self, config: &Config, errors: &mut Vec<ValidationError>) {
+        // Indexer ranges
+        if config.indexer.subsection_threshold == 0 {
+            errors.push(ValidationError::error(
+                "indexer.subsection_threshold",
+                "Subsection threshold must be greater than 0",
+            ));
+        }
+
+        if config.indexer.subsection_threshold > 10000 {
+            errors.push(
+                ValidationError::warning(
+                    "indexer.subsection_threshold",
+                    "Subsection threshold is very high, may impact performance",
+                )
+                .with_actual(config.indexer.subsection_threshold.to_string()),
+            );
+        }
+
+        // LLM slot token ranges
+        if config.llm.index.max_tokens == 0 {
+            errors.push(ValidationError::error(
+                "llm.index.max_tokens",
+                "Index max tokens must be greater than 0",
+            ));
+        }
+
+        if config.llm.retrieval.max_tokens == 0 {
+            errors.push(ValidationError::error(
+                "llm.retrieval.max_tokens",
+                "Retrieval max tokens must be greater than 0",
+            ));
+        }
+
+        // Retrieval ranges
+        if config.retrieval.top_k == 0 {
+            errors.push(ValidationError::error(
+                "retrieval.top_k",
+                "Top K must be greater than 0",
+            ));
+        }
+
+        if config.retrieval.search.beam_width == 0 {
+            errors.push(ValidationError::error(
+                "retrieval.search.beam_width",
+                "Beam width must be greater than 0",
+            ));
+        }
+
+        // Throttle ranges
+        if config.llm.throttle.max_concurrent_requests == 0 {
+            errors.push(ValidationError::error(
+                "llm.throttle.max_concurrent_requests",
+                "Max concurrent requests must be greater than 0",
+            ));
+        }
+
+        if config.llm.throttle.requests_per_minute == 0 {
+            errors.push(ValidationError::error(
+                "llm.throttle.requests_per_minute",
+                "Requests per minute must be greater than 0",
+            ));
+        }
+
+        // Fallback ranges
+        if config.llm.fallback.max_retries == 0 {
+            errors.push(ValidationError::warning(
+                "llm.fallback.max_retries",
+                "Max retries is 0, fallback will not retry",
+            ));
+        }
+    }
+}
+
+/// Validates configuration consistency.
+#[derive(Debug)]
+struct ConsistencyValidator;
+
+impl ValidationRule for ConsistencyValidator {
+    fn validate(&self, config: &Config, errors: &mut Vec<ValidationError>) {
+        // Check if index tokens are reasonable
+        if config.llm.index.max_tokens > config.indexer.max_segment_tokens {
+            errors.push(
+                ValidationError::warning(
+                    "llm.index.max_tokens",
+                    "Index max tokens exceeds max segment tokens",
+                )
+                .with_expected(format!("<= {}", config.indexer.max_segment_tokens))
+                .with_actual(config.llm.index.max_tokens.to_string()),
+            );
+        }
+
+        // Check if sufficiency thresholds are consistent
+        if config.retrieval.sufficiency.min_tokens > config.retrieval.sufficiency.target_tokens {
+            errors.push(
+                ValidationError::error(
+                    "retrieval.sufficiency.min_tokens",
+                    "Min tokens cannot exceed target tokens",
+                )
+                .with_expected(format!("<= {}", config.retrieval.sufficiency.target_tokens))
+                .with_actual(config.retrieval.sufficiency.min_tokens.to_string()),
+            );
+        }
+
+        if config.retrieval.sufficiency.target_tokens > config.retrieval.sufficiency.max_tokens {
+            errors.push(
+                ValidationError::error(
+                    "retrieval.sufficiency.target_tokens",
+                    "Target tokens cannot exceed max tokens",
+                )
+                .with_expected(format!("<= {}", config.retrieval.sufficiency.max_tokens))
+                .with_actual(config.retrieval.sufficiency.target_tokens.to_string()),
+            );
+        }
+    }
+}
+
+/// Validates configuration dependencies.
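+///
+/// Covers cross-field requirements: an enabled fallback needs models or
+/// endpoints, similarity thresholds must be ordered, and a missing API
+/// key downgrades summary generation to an info-level note.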
+#[derive(Debug)]
+struct DependencyValidator;
+
+impl ValidationRule for DependencyValidator {
+    fn validate(&self, config: &Config, errors: &mut Vec<ValidationError>) {
+        // Check if API key is available when summaries are needed
+        if config.llm.api_key.is_none() && config.indexer.max_summary_tokens > 0 {
+            errors.push(ValidationError::info(
+                "llm.api_key",
+                "No API key configured, summary generation will be disabled",
+            ));
+        }
+
+        // Check fallback configuration
+        if config.llm.fallback.enabled {
+            if config.llm.fallback.models.is_empty() && config.llm.fallback.endpoints.is_empty() {
+                errors.push(ValidationError::warning(
+                    "llm.fallback.models",
+                    "Fallback enabled but no fallback models or endpoints configured",
+                ));
+            }
+
+            // Check retry behavior consistency
+            if matches!(
+                config.llm.fallback.on_rate_limit,
+                super::types::FallbackBehavior::Fallback
+            ) && config.llm.fallback.models.is_empty()
+            {
+                errors.push(ValidationError::error(
+                    "llm.fallback.models",
+                    "Rate limit behavior is 'fallback' but no fallback models configured",
+                ));
+            }
+        }
+
+        // Check cache configuration
+        if config.retrieval.cache.max_entries == 0 {
+            errors.push(ValidationError::warning(
+                "retrieval.cache.max_entries",
+                "Cache disabled (max_entries = 0), performance may be impacted",
+            ));
+        }
+
+        // Check strategy configuration
+        if config.retrieval.strategy.exploration_weight <= 0.0 {
+            errors.push(
+                ValidationError::error(
+                    "retrieval.strategy.exploration_weight",
+                    "Exploration weight must be positive",
+                )
+                .with_actual(config.retrieval.strategy.exploration_weight.to_string()),
+            );
+        }
+
+        // Check similarity thresholds are ordered correctly
+        if config.retrieval.strategy.low_similarity_threshold
+            >= config.retrieval.strategy.high_similarity_threshold
+        {
+            errors.push(
+                ValidationError::error(
+                    "retrieval.strategy.low_similarity_threshold",
+                    "Low similarity threshold must be less than high similarity threshold",
+                )
+                .with_expected(format!(
+                    "< {}",
+                    config.retrieval.strategy.high_similarity_threshold
+                ))
+                .with_actual(
+                    config
+                        .retrieval
+                        .strategy
+                        .low_similarity_threshold
+                        .to_string(),
+                ),
+            );
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_validator_valid_config() {
+        let config = Config::default();
+        let validator = ConfigValidator::new();
+        // Default config should pass validation (no errors, warnings are ok)
+        let result = validator.validate(&config);
+        assert!(result.is_ok(), "Default config should pass validation");
+    }
+
+    #[test]
+    fn test_validator_catches_range_errors() {
+        let mut config = Config::default();
+        config.retrieval.top_k = 0;
+
+        let validator = ConfigValidator::new();
+        let result = validator.validate(&config);
+
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert!(err.errors.iter().any(|e| e.path.contains("top_k")));
+    }
+
+    #[test]
+    fn test_validator_catches_consistency_errors() {
+        let mut config = Config::default();
+        config.retrieval.sufficiency.min_tokens = 3000;
+        config.retrieval.sufficiency.target_tokens = 2000;
+
+        let validator = ConfigValidator::new();
+        let result = validator.validate(&config);
+
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert!(err.errors.iter().any(|e| e.path.contains("min_tokens")));
+    }
+
+    #[test]
+    fn test_validator_catches_dependency_warnings() {
+        let mut config = Config::default();
+        config.llm.fallback.enabled = true;
+        config.llm.fallback.models.clear();
+
+        let validator = ConfigValidator::new();
+        let result = validator.validate(&config);
+
+        // Warnings alone do not fail validation; if an error did surface,
+        // it must point at the fallback model configuration.
+        if let Err(err) = result {
+            assert!(
+                err.errors
+                    .iter()
+                    .any(|e| e.path.contains("llm.fallback.models"))
+            );
+        }
+    }
+}
diff --git a/vectorless-core/vectorless-document/Cargo.toml b/vectorless-core/vectorless-document/Cargo.toml
new file mode 100644
index 00000000..ace36491
--- /dev/null
+++ b/vectorless-core/vectorless-document/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "vectorless-document"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+
+[dependencies]
+regex = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+indextree = { workspace = true }
+chrono = { workspace = true }
+uuid = { workspace = true }
+
+[dev-dependencies]
+tempfile = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/vectorless-core/vectorless-document/src/format.rs b/vectorless-core/vectorless-document/src/format.rs
new file mode 100644
index 00000000..78f6e52e
--- /dev/null
+++ b/vectorless-core/vectorless-document/src/format.rs
@@ -0,0 +1,62 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document format and sufficiency types.
+//!
+//! These types are used across multiple modules and are defined here
+//! to avoid circular dependencies between crates.
+
+use serde::{Deserialize, Serialize};
+
+/// Supported document formats.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum DocumentFormat {
+    /// Markdown files (.md, .markdown)
+    Markdown,
+    /// PDF files (.pdf)
+    Pdf,
+}
+
+impl DocumentFormat {
+    /// Detect format from file extension.
+    pub fn from_extension(ext: &str) -> Option<Self> {
+        match ext.to_lowercase().as_str() {
+            "md" | "markdown" => Some(Self::Markdown),
+            "pdf" => Some(Self::Pdf),
+            _ => None,
+        }
+    }
+
+    /// Get the file extension for this format.
+    pub fn extension(&self) -> &'static str {
+        match self {
+            Self::Markdown => "md",
+            Self::Pdf => "pdf",
+        }
+    }
+
+    /// All supported file extensions (lowercase).
+    ///
+    /// Single source of truth — used by directory scanning to
+    /// discover indexable files.
+    pub const SUPPORTED_EXTENSIONS: &'static [&'static str] = &["md", "pdf"];
+}
+
+/// Sufficiency level for incremental retrieval.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SufficiencyLevel {
+    /// Information is sufficient, stop retrieving.
+    Sufficient,
+
+    /// Partial information, can continue if needed.
+    PartialSufficient,
+
+    /// Information is insufficient, continue retrieving.
+    Insufficient,
+}
+
+impl Default for SufficiencyLevel {
+    fn default() -> Self {
+        Self::Insufficient
+    }
+}
diff --git a/vectorless-core/vectorless-document/src/lib.rs b/vectorless-core/vectorless-document/src/lib.rs
new file mode 100644
index 00000000..85ea1ff3
--- /dev/null
+++ b/vectorless-core/vectorless-document/src/lib.rs
@@ -0,0 +1,43 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document types - pure data structures for document tree representation.
+//!
+//! This module contains the core types that represent hierarchical documents.
+//! These types have no dependencies on indexing or retrieval logic.
+//!
+//! # Types
+//!
+//! - [`TreeNode`] - A node in the document tree
+//! - [`DocumentTree`] - Arena-based tree structure
+//! - [`NodeId`] - Unique identifier for tree nodes
+//! - [`TocView`] - Table of Contents generator
+//! - [`StructureNode`] - JSON export structure
+//! - [`NodeReference`] - In-document reference (e.g., "see Appendix G")
+//! - [`RefType`] - Type of reference (Section, Appendix, Table, etc.)
+
+mod format;
+mod navigation;
+mod node;
+mod reasoning;
+mod reference;
+mod serde_helpers;
+mod structure;
+mod toc;
+mod tree;
+pub mod understanding;
+
+pub use format::{DocumentFormat, SufficiencyLevel};
+pub use navigation::{ChildRoute, DocCard, NavEntry, NavigationIndex, SectionCard};
+pub use node::{NodeId, TreeNode};
+pub use reasoning::{
+    ReasoningIndex, ReasoningIndexBuilder, ReasoningIndexConfig, SectionSummary, SummaryShortcut,
+    TopicEntry,
+};
+pub use reference::ReferenceExtractor;
+pub use structure::{DocumentStructure, StructureNode};
+pub use toc::{TocConfig, TocEntry, TocNode, TocView};
+pub use tree::{DocumentTree, RetrievalIndex};
+pub use understanding::{
+    Answer, Concept, Document, DocumentInfo, Evidence, IngestInput, ReasoningTrace, TraceStep,
+};
diff --git a/vectorless-core/vectorless-document/src/navigation.rs b/vectorless-core/vectorless-document/src/navigation.rs
new file mode 100644
index 00000000..dbfeadd4
--- /dev/null
+++ b/vectorless-core/vectorless-document/src/navigation.rs
@@ -0,0 +1,626 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Navigation index for Agent-based retrieval.
+//!
+//! This is the primary data source for the Agent during the query phase.
+//! It provides a compact, pre-computed view of the document tree optimized
+//! for navigation decisions — the Agent can decide where to descend without
+//! reading the actual content.
+//!
+//! # Design
+//!
+//! Based on the Corpus2Skill paper (2604.14572v1), this is the in-memory
+//! equivalent of SKILL.md / INDEX.md. The Agent reads `child_routes` at
+//! each decision point to see all available sub-topics and their descriptions,
+//! then chooses where to navigate next.
+//!
+//! # Data Flow
+//!
+//! ```text
+//! Enhance stage (writes to TreeNode):
+//!     summary, description, routing_keywords, leaf_count
+//!     │
+//!     └──→ Navigation stage (reads TreeNode fields)
+//!          Builds: NavigationIndex (NavEntry + ChildRoute)
+//! ```
+//!
+//! No LLM calls are made during Navigation stage construction.

+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+use super::node::NodeId;
+
+/// Navigation index — Agent's primary data source during the query phase.
+///
+/// Contains pre-computed navigation metadata for every non-leaf node,
+/// allowing the Agent to make routing decisions without accessing the
+/// content layer (DocumentTree).
+///
+/// `HashMap<NodeId, _>` fields use `serde_helpers` (Vec-of-pairs encoding)
+/// because serde_json cannot deserialize integer-keyed maps.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct NavigationIndex {
+    /// Navigation entry for each non-leaf node.
+    #[serde(with = "super::serde_helpers")]
+    nav_entries: HashMap<NodeId, NavEntry>,
+
+    /// Child routes for each non-leaf node.
+    #[serde(with = "super::serde_helpers")]
+    child_routes: HashMap<NodeId, Vec<ChildRoute>>,
+
+    /// Pre-computed document card for multi-document Orchestrator.
+    /// Built during compile phase by NavigationIndexStage.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    doc_card: Option<DocCard>,
+}
+
+impl NavigationIndex {
+    /// Create a new empty navigation index.
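+    ///
+    /// Entries and routes are then attached per non-leaf node; a minimal
+    /// sketch with illustrative values:
+    ///
+    /// ```rust,ignore
+    /// let mut index = NavigationIndex::new();
+    /// index.add_entry(root_id, NavEntry {
+    ///     overview: "Payment integration guide".into(),
+    ///     question_hints: vec!["How to set up Stripe?".into()],
+    ///     topic_tags: vec!["payment".into()],
+    ///     leaf_count: 5,
+    ///     level: 0,
+    /// });
+    /// ```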
+    pub fn new() -> Self {
+        Self {
+            nav_entries: HashMap::new(),
+            child_routes: HashMap::new(),
+            doc_card: None,
+        }
+    }
+
+    /// Add a navigation entry for a non-leaf node.
+    pub fn add_entry(&mut self, node_id: NodeId, entry: NavEntry) {
+        self.nav_entries.insert(node_id, entry);
+    }
+
+    /// Add child routes for a non-leaf node.
+    pub fn add_child_routes(&mut self, parent_id: NodeId, routes: Vec<ChildRoute>) {
+        self.child_routes.insert(parent_id, routes);
+    }
+
+    /// Get the navigation entry for a node.
+    pub fn get_entry(&self, node_id: NodeId) -> Option<&NavEntry> {
+        self.nav_entries.get(&node_id)
+    }
+
+    /// Get the child routes for a node.
+    pub fn get_child_routes(&self, node_id: NodeId) -> Option<&[ChildRoute]> {
+        self.child_routes.get(&node_id).map(|v| v.as_slice())
+    }
+
+    /// Get the number of navigation entries.
+    pub fn entry_count(&self) -> usize {
+        self.nav_entries.len()
+    }
+
+    /// Get the total number of child route records.
+    pub fn total_child_routes(&self) -> usize {
+        self.child_routes.values().map(|v| v.len()).sum()
+    }
+
+    /// Get the root node's navigation entry.
+    pub fn root_entry(&self) -> Option<&NavEntry> {
+        // The root should always be present if the index is non-empty.
+        // Return the first entry with level 0.
+        self.nav_entries.values().find(|e| e.level == 0)
+    }
+
+    /// Iterate over all navigation entries.
+    pub fn entries(&self) -> impl Iterator<Item = (&NodeId, &NavEntry)> {
+        self.nav_entries.iter()
+    }
+
+    /// Iterate over all child route sets.
+    pub fn all_child_routes(&self) -> impl Iterator<Item = (&NodeId, &[ChildRoute])> {
+        self.child_routes.iter().map(|(k, v)| (k, v.as_slice()))
+    }
+
+    /// Check if the index is empty.
+    pub fn is_empty(&self) -> bool {
+        self.nav_entries.is_empty()
+    }
+
+    /// Get the pre-computed document card.
+    pub fn doc_card(&self) -> Option<&DocCard> {
+        self.doc_card.as_ref()
+    }
+
+    /// Set the document card.
+    pub fn set_doc_card(&mut self, card: DocCard) {
+        self.doc_card = Some(card);
+    }
+}
+
+impl Default for NavigationIndex {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Navigation entry for a non-leaf node.
+///
+/// Provides the Agent with enough context to decide whether this subtree
+/// is relevant to the current query, without needing to read the node's
+/// actual content.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct NavEntry {
+    /// Routing summary describing what this subtree covers.
+    /// Comes from Enhance stage's `summary` (routing-oriented).
+    pub overview: String,
+
+    /// Typical questions this subtree can answer.
+    /// Extracted from content/summary during Enhance stage.
+    pub question_hints: Vec<String>,
+
+    /// Topic tags for keyword-based matching.
+    /// Comes from Enhance stage's `routing_keywords`.
+    pub topic_tags: Vec<String>,
+
+    /// Total number of leaf nodes in this subtree.
+    /// Equivalent to the paper's `num_documents`.
+    pub leaf_count: usize,
+
+    /// Depth of this node in the tree.
+    /// Equivalent to the paper's `level`.
+    pub level: usize,
+}
+
+/// Child route — compact routing info for one child node.
+///
+/// The Agent sees a list of `ChildRoute`s when deciding which child
+/// to descend into. This provides progressive disclosure: the Agent
+/// doesn't need to enter the child node to understand what it contains.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChildRoute {
+    /// The child node's ID (for the Agent to navigate to).
+    pub node_id: NodeId,
+
+    /// Child node's title.
+    pub title: String,
+
+    /// One-sentence description of what this child covers.
+    /// Comes from Enhance stage's `description` field.
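+    /// In the test fixtures this reads like "Setup and installation" or
+    /// "REST API endpoints": short enough to scan in a routing decision.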
+    pub description: String,
+
+    /// Number of leaf nodes in this child's subtree.
+    pub leaf_count: usize,
+}
+
+/// Pre-computed document card for multi-document Orchestrator Agent.
+///
+/// Built during the compile phase by `NavigationIndexStage`, this provides
+/// a compact summary of the entire document — enough for the Orchestrator
+/// to decide whether a document is relevant to a query without entering it.
+///
+/// All fields come from data already computed in earlier phases of the
+/// NavigationIndexStage (root NavEntry + root child_routes). No LLM calls.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DocCard {
+    /// Document title (root node title).
+    pub title: String,
+
+    /// Document overview (root NavEntry.overview).
+    pub overview: String,
+
+    /// Questions this document can answer (root NavEntry.question_hints).
+    pub question_hints: Vec<String>,
+
+    /// Topic keywords (root NavEntry.topic_tags).
+    pub topic_tags: Vec<String>,
+
+    /// Top-level section summaries (from root child_routes).
+    pub sections: Vec<SectionCard>,
+
+    /// Total leaf nodes in the document.
+    pub total_leaves: usize,
+}
+
+/// One top-level section in a [`DocCard`].
+///
+/// Provides a compact view of a single top-level section,
+/// allowing the Orchestrator to scan section titles and descriptions
+/// to assess document relevance.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SectionCard {
+    /// Section title.
+    pub title: String,
+
+    /// One-sentence description of this section.
+    pub description: String,
+
+    /// Number of leaf nodes in this section's subtree.
+    pub leaf_count: usize,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::DocumentTree;
+
+    fn build_small_tree() -> DocumentTree {
+        // Root -> [Child1 (leaf), Child2 -> [Grandchild (leaf)]]
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+        let _child1 = tree.add_child(root, "Child1", "leaf content");
+        let child2 = tree.add_child(root, "Child2", "");
+        let _grandchild = tree.add_child(child2, "Grandchild", "leaf content");
+        tree
+    }
+
+    #[test]
+    fn test_empty_navigation_index() {
+        let index = NavigationIndex::new();
+        assert!(index.is_empty());
+        assert_eq!(index.entry_count(), 0);
+        assert_eq!(index.total_child_routes(), 0);
+        assert!(index.root_entry().is_none());
+    }
+
+    #[test]
+    fn test_add_and_retrieve_entry() {
+        let tree = build_small_tree();
+        let root = tree.root();
+
+        let entry = NavEntry {
+            overview: "Payment integration guide".to_string(),
+            question_hints: vec!["How to set up Stripe?".to_string()],
+            topic_tags: vec!["payment".to_string(), "stripe".to_string()],
+            leaf_count: 5,
+            level: 0,
+        };
+
+        let mut index = NavigationIndex::new();
+        index.add_entry(root, entry);
+
+        assert!(!index.is_empty());
+        assert_eq!(index.entry_count(), 1);
+
+        let retrieved = index.get_entry(root).unwrap();
+        assert_eq!(retrieved.overview, "Payment integration guide");
+        assert_eq!(retrieved.leaf_count, 5);
+    }
+
+    #[test]
+    fn test_add_and_retrieve_child_routes() {
+        let tree = build_small_tree();
+        let root = tree.root();
+        let children: Vec<NodeId> = tree.children_iter(root).collect();
+
+        let routes = vec![
+            ChildRoute {
+                node_id: children[0],
+                title: "Getting Started".to_string(),
+                description: "Setup and installation".to_string(),
+                leaf_count: 3,
+            },
+            ChildRoute {
+                node_id: children[1],
+                title: "API Reference".to_string(),
+                description: "REST API endpoints".to_string(),
+                leaf_count: 7,
+            },
+        ];
+
+        let mut index = NavigationIndex::new();
+        index.add_child_routes(root, routes);
+
+        let retrieved = index.get_child_routes(root).unwrap();
+        assert_eq!(retrieved.len(), 2);
+        assert_eq!(retrieved[0].title, "Getting Started");
+        assert_eq!(retrieved[1].leaf_count, 7);
+        assert_eq!(index.total_child_routes(), 2);
+    }
+
+    #[test]
+    fn test_root_entry() {
+        let tree = build_small_tree();
+        let root = tree.root();
+        let children: Vec<NodeId> = tree.children_iter(root).collect();
+
+        let mut index = NavigationIndex::new();
+        index.add_entry(
+            root,
+            NavEntry {
+                overview: "Root".to_string(),
+                question_hints: vec![],
+                topic_tags: vec![],
+                leaf_count: 10,
+                level: 0,
+            },
+        );
+        index.add_entry(
+            children[1],
+            NavEntry {
+                overview: "Child".to_string(),
+                question_hints: vec![],
+                topic_tags: vec![],
+                leaf_count: 5,
+                level: 1,
+            },
+        );
+
+        let root_entry = index.root_entry().unwrap();
+        assert_eq!(root_entry.level, 0);
+        assert_eq!(root_entry.leaf_count, 10);
+    }
+
+    #[test]
+    fn test_get_entry_nonexistent() {
+        let index = NavigationIndex::new();
+        let tree = build_small_tree();
+        // Leaf node should never have an entry
+        let children: Vec<NodeId> = tree.children_iter(tree.root()).collect();
+        assert!(index.get_entry(children[0]).is_none());
+    }
+
+    #[test]
+    fn test_get_child_routes_nonexistent() {
+        let index = NavigationIndex::new();
+        let tree = build_small_tree();
+        assert!(index.get_child_routes(tree.root()).is_none());
+    }
+
+    #[test]
+    fn test_default_trait() {
+        let index = NavigationIndex::default();
+        assert!(index.is_empty());
+    }
+
+    #[test]
+    fn test_entries_iterator() {
+        let tree = build_small_tree();
+        let root = tree.root();
+        let children: Vec<NodeId> = tree.children_iter(root).collect();
+
+        let mut index = NavigationIndex::new();
+        index.add_entry(
+            root,
+            NavEntry {
+                overview: "Root".to_string(),
+                question_hints: vec![],
+                topic_tags: vec![],
+                leaf_count: 2,
+                level: 0,
+            },
+        );
+        index.add_entry(
+            children[1], // Child2 is non-leaf
+            NavEntry {
+                overview: "Child2".to_string(),
+                question_hints: vec![],
+                topic_tags: vec![],
+                leaf_count: 1,
+                level: 1,
+            },
+        );
+
+        let all_entries: Vec<_> = index.entries().collect();
+        assert_eq!(all_entries.len(), 2);
+    }
+
+    #[test]
+    fn test_all_child_routes_iterator() {
+        let tree = build_small_tree();
+        let root = tree.root();
+        let children: Vec<NodeId> = tree.children_iter(root).collect();
+
+        let mut index = NavigationIndex::new();
+        index.add_child_routes(
+            root,
+            vec![ChildRoute {
+                node_id: children[0],
+                title: "C1".to_string(),
+                description: "d".to_string(),
+                leaf_count: 1,
+            }],
+        );
+
+        let all_routes: Vec<_> = index.all_child_routes().collect();
+        assert_eq!(all_routes.len(), 1);
+        assert_eq!(all_routes[0].1.len(), 1);
+    }
+
+    #[test]
+    fn test_serialization_roundtrip() {
+        let tree = build_small_tree();
+        let root = tree.root();
+        let children: Vec<NodeId> = tree.children_iter(root).collect();
+
+        let mut index = NavigationIndex::new();
+        index.add_entry(
+            root,
+            NavEntry {
+                overview: "Root overview".to_string(),
+                question_hints: vec!["What is this?".to_string()],
+                topic_tags: vec!["intro".to_string(), "guide".to_string()],
+                leaf_count: 2,
+                level: 0,
+            },
+        );
+        index.add_child_routes(
+            root,
+            vec![
+                ChildRoute {
+                    node_id: children[0],
+                    title: "Child1".to_string(),
+                    description: "First child desc".to_string(),
+                    leaf_count: 1,
+                },
+                ChildRoute {
+                    node_id: children[1],
+                    title: "Child2".to_string(),
+                    description: "Second child desc".to_string(),
+                    leaf_count: 1,
+                },
+            ],
+        );
+
+        // Serialize
+        let json = serde_json::to_string(&index).expect("serialization failed");
+
+        // Deserialize
+        let deserialized: NavigationIndex =
+            serde_json::from_str(&json).expect("deserialization failed");
+
+        // Verify data survived round-trip
+        assert_eq!(deserialized.entry_count(), 1);
+        assert_eq!(deserialized.total_child_routes(), 2);
+
+        let entry = deserialized.get_entry(root).unwrap();
+        assert_eq!(entry.overview, "Root overview");
+        assert_eq!(entry.question_hints.len(), 1);
+        assert_eq!(entry.topic_tags.len(), 2);
+        assert_eq!(entry.leaf_count, 2);
+        assert_eq!(entry.level, 0);
+
+        let routes = deserialized.get_child_routes(root).unwrap();
+        assert_eq!(routes[0].title, "Child1");
+        assert_eq!(routes[1].title, "Child2");
+    }
+
+    #[test]
+    fn test_doc_card_default_none() {
+        let index = NavigationIndex::new();
+        assert!(index.doc_card().is_none());
+    }
+
+    #[test]
+    fn test_doc_card_set_and_get() {
+        let card = DocCard {
+            title: "Test Doc".to_string(),
+            overview: "A test document".to_string(),
+            question_hints: vec!["What?".to_string()],
+            topic_tags: vec!["test".to_string()],
+            sections: vec![SectionCard {
+                title: "Section 1".to_string(),
+                description: "First section".to_string(),
+                leaf_count: 5,
+            }],
+            total_leaves: 5,
+        };
+
+        let mut index = NavigationIndex::new();
+        index.set_doc_card(card);
+
+        let retrieved = index.doc_card().unwrap();
+        assert_eq!(retrieved.title, "Test Doc");
+        assert_eq!(retrieved.overview, "A test document");
+        assert_eq!(retrieved.question_hints.len(), 1);
+        assert_eq!(retrieved.topic_tags.len(), 1);
+        assert_eq!(retrieved.sections.len(), 1);
+        assert_eq!(retrieved.sections[0].title, "Section 1");
+        assert_eq!(retrieved.sections[0].leaf_count, 5);
+        assert_eq!(retrieved.total_leaves, 5);
+    }
+
+    #[test]
+    fn test_doc_card_serialization_roundtrip() {
+        let tree = build_small_tree();
+        let root = tree.root();
+        let children: Vec<NodeId> = tree.children_iter(root).collect();
+
+        let mut index = NavigationIndex::new();
+        index.add_entry(
+            root,
+            NavEntry {
+                overview: "Root overview".to_string(),
+                question_hints: vec!["What is this?".to_string()],
+                topic_tags: vec!["intro".to_string()],
+                leaf_count: 2,
+                level: 0,
+            },
+        );
+        index.add_child_routes(
+            root,
+            vec![
+                ChildRoute {
+                    node_id: children[0],
+                    title: "Child1".to_string(),
+                    description: "First".to_string(),
+                    leaf_count: 1,
+                },
+                ChildRoute {
+                    node_id: children[1],
+                    title: "Child2".to_string(),
+                    description: "Second".to_string(),
+                    leaf_count: 1,
+                },
+            ],
+        );
+
+        // Build DocCard from index data
+        let root_entry = index.get_entry(root).unwrap();
+        let sections: Vec<SectionCard> = index
+            .get_child_routes(root)
+            .unwrap()
+            .iter()
+            .map(|r| SectionCard {
+                title: r.title.clone(),
+                description: r.description.clone(),
+                leaf_count: r.leaf_count,
+            })
+            .collect();
+        index.set_doc_card(DocCard {
+            title: "Root".to_string(),
+            overview: root_entry.overview.clone(),
+            question_hints: root_entry.question_hints.clone(),
+            topic_tags: root_entry.topic_tags.clone(),
+            sections,
+            total_leaves: root_entry.leaf_count,
+        });
+
+        // Serialize + deserialize
+        let json = serde_json::to_string(&index).expect("serialization failed");
+        let deserialized: NavigationIndex =
+            serde_json::from_str(&json).expect("deserialization failed");
+
+        // Verify DocCard survived round-trip
+        let card = deserialized.doc_card().unwrap();
+        assert_eq!(card.title, "Root");
+        assert_eq!(card.overview, "Root overview");
+        assert_eq!(card.question_hints, vec!["What is this?"]);
+        assert_eq!(card.topic_tags, vec!["intro"]);
+        assert_eq!(card.sections.len(), 2);
+        assert_eq!(card.sections[0].title, "Child1");
+        assert_eq!(card.sections[1].leaf_count, 1);
+        assert_eq!(card.total_leaves, 2);
+    }
+
+    #[test]
+    fn test_doc_card_backward_compat_deserialize_without_card() {
+        // JSON from an older version that doesn't have doc_card
+        let tree = build_small_tree();
+        let root = tree.root();
+
+        let mut index = NavigationIndex::new();
+        index.add_entry(
+            root,
+            NavEntry {
+                overview: "Old index".to_string(),
+                question_hints: vec![],
+                topic_tags: vec![],
+                leaf_count: 2,
+                level: 0,
+            },
+        );
+        // No doc_card set
+
+        let json = serde_json::to_string(&index).expect("serialization failed");
+        let deserialized: NavigationIndex =
+            serde_json::from_str(&json).expect("deserialization failed");
+
+        assert!(deserialized.doc_card().is_none());
+        assert_eq!(deserialized.entry_count(), 1);
+    }
+
+    #[test]
+    fn test_section_card_fields() {
+        let card = SectionCard {
+            title: "Getting Started".to_string(),
+            description: "Quick setup guide".to_string(),
+            leaf_count: 3,
+        };
+        assert_eq!(card.title, "Getting Started");
+        assert_eq!(card.description, "Quick setup guide");
+        assert_eq!(card.leaf_count, 3);
+    }
+}
diff --git a/vectorless-core/vectorless-document/src/node.rs b/vectorless-core/vectorless-document/src/node.rs
new file mode 100644
index 00000000..c0a6ffe6
--- /dev/null
+++ b/vectorless-core/vectorless-document/src/node.rs
@@ -0,0 +1,144 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Tree node definition using indextree (Arena-based).
+//!
+//! This module provides a node type for hierarchical document representation.
+//! Each branch represents a section and each leaf contains the actual text.
+
+use indextree::NodeId as IndexTreeNodeId;
+use serde::{Deserialize, Serialize};
+use std::fmt;
+
+use super::reference::NodeReference;
+
+/// Unique identifier for a node in the document tree.
+///
+/// This is a newtype wrapper around indextree's NodeId to provide
+/// better type safety and domain-specific semantics.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct NodeId(pub IndexTreeNodeId);
+
+// Implement traits for interoperability
+impl fmt::Display for NodeId {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "NodeId({:?})", self.0)
+    }
+}
+
+impl Serialize for NodeId {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        self.0.serialize(serializer)
+    }
+}
+
+impl<'de> Deserialize<'de> for NodeId {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let id = IndexTreeNodeId::deserialize(deserializer)?;
+        Ok(NodeId(id))
+    }
+}
+
+/// A node in the Vectorless document tree.
+///
+/// Each branch represents a section and each leaf contains the actual text.
+/// When a question is asked, an LLM navigates this tree level by level
+/// to find the right answer.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TreeNode {
+    /// Title of this section.
+    pub title: String,
+
+    /// Hierarchical structure index (e.g., "1", "1.1", "1.2.3").
+    ///
+    /// This provides a human-readable path to the node and is useful for:
+    /// - LLM navigation (easier to understand "go to section 2.1.3")
+    /// - Table of contents display
+    /// - Cross-referencing
+    #[serde(default)]
+    pub structure: String,
+
+    /// Raw text content (populated at leaves).
+    #[serde(default)]
+    pub content: String,
+
+    /// Summary generated by the LLM.
+    #[serde(default)]
+    pub summary: String,
+
+    /// Depth in tree (0 = root, 1 = section, 2 = subsection, etc.).
+    #[serde(default)]
+    pub depth: usize,
+
+    /// Starting line number (1-based).
+    #[serde(default)]
+    pub start_index: usize,
+
+    /// Ending line number (1-based).
+    #[serde(default)]
+    pub end_index: usize,
+
+    /// Starting page number (1-based, if applicable).
+    pub start_page: Option<usize>,
+
+    /// Ending page number (1-based, if applicable).
+    pub end_page: Option<usize>,
+
+    /// Unique node identifier (e.g., "0001", "0002").
+    pub node_id: Option<String>,
+
+    /// Physical index marker for line tracking.
+    pub physical_index: Option<usize>,
+
+    /// Token count estimate.
+    pub token_count: Option<usize>,
+
+    /// References found in this node's content.
+    ///
+    /// These are in-document references like "see Appendix G" or
+    /// "refer to Table 5.3" that can be followed during retrieval.
+    #[serde(default)]
+    pub references: Vec<NodeReference>,
+
+    /// Routing keywords for navigation (non-leaf nodes).
+    ///
+    /// Populated by EnhanceStage with LLM-extracted topic tags.
+    /// Used by NavigationIndexStage to populate `NavEntry::topic_tags`.
+    #[serde(default)]
+    pub routing_keywords: Vec<String>,
+
+    /// Typical questions this subtree can answer (non-leaf nodes).
+    ///
+    /// Populated by EnhanceStage with LLM-extracted question hints.
+    /// Used by NavigationIndexStage to populate `NavEntry::question_hints`.
+    #[serde(default)]
+    pub question_hints: Vec<String>,
+}
+
+impl Default for TreeNode {
+    fn default() -> Self {
+        Self {
+            title: String::new(),
+            structure: String::new(),
+            content: String::new(),
+            summary: String::new(),
+            depth: 0,
+            start_index: 1,
+            end_index: 1,
+            start_page: None,
+            end_page: None,
+            node_id: None,
+            physical_index: None,
+            token_count: None,
+            references: Vec::new(),
+            routing_keywords: Vec::new(),
+            question_hints: Vec::new(),
+        }
+    }
+}
diff --git a/vectorless-core/vectorless-document/src/reasoning.rs b/vectorless-core/vectorless-document/src/reasoning.rs
new file mode 100644
index 00000000..2c4ab01b
--- /dev/null
+++ b/vectorless-core/vectorless-document/src/reasoning.rs
@@ -0,0 +1,444 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Pre-computed reasoning index for fast retrieval path resolution.
+//!
+//! Built at index time from the ToC and summaries, the reasoning index provides
+//! topic-to-path mappings, summary shortcuts, and hot node tracking that
+//! accelerate query-time retrieval by bypassing expensive tree traversal.
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+use super::node::NodeId;
+
+/// A pre-computed reasoning index that maps topics and query patterns
+/// to optimal tree paths, built at index time for query-time acceleration.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ReasoningIndex {
+    /// Keyword → list of (NodeId, weight) entries.
+    /// Built from titles and summaries at index time.
+    /// Key = lowercased keyword token.
+    topic_paths: HashMap<String, Vec<TopicEntry>>,
+
+    /// Pre-computed shortcut for "document summary" queries.
+    /// Maps summary-type query patterns directly to the root node
+    /// and its top-level children summaries.
+    summary_shortcut: Option<SummaryShortcut>,
+
+    /// Nodes marked as hot (frequently retrieved).
+    /// NodeId → cumulative hit count and rolling average score.
+    /// Uses `node_id_map` because serde_json cannot deserialize
+    /// `HashMap<NodeId, V>` (integer keys are incompatible with JSON).
+    #[serde(with = "super::serde_helpers")]
+    hot_nodes: HashMap<NodeId, HotNodeEntry>,
+
+    /// Depth-1 section title → NodeId mapping for fast ToC lookup.
+    section_map: HashMap<String, NodeId>,
+
+    /// Configuration used to build this index (for cache invalidation).
+    #[serde(default)]
+    config_hash: u64,
+}
+
+impl ReasoningIndex {
+    /// Create a new empty reasoning index.
+    pub fn new() -> Self {
+        Self {
+            topic_paths: HashMap::new(),
+            summary_shortcut: None,
+            hot_nodes: HashMap::new(),
+            section_map: HashMap::new(),
+            config_hash: 0,
+        }
+    }
+
+    /// Create a builder for constructing the reasoning index.
+    pub fn builder() -> ReasoningIndexBuilder {
+        ReasoningIndexBuilder::new()
+    }
+
+    /// Look up topic entries for a keyword.
+    pub fn topic_entries(&self, keyword: &str) -> Option<&[TopicEntry]> {
+        self.topic_paths.get(keyword).map(Vec::as_slice)
+    }
+
+    /// Get the summary shortcut, if available.
+    pub fn summary_shortcut(&self) -> Option<&SummaryShortcut> {
+        self.summary_shortcut.as_ref()
+    }
+
+    /// Check if a node is marked as hot.
+    pub fn is_hot(&self, node_id: NodeId) -> bool {
+        self.hot_nodes
+            .get(&node_id)
+            .map(|e| e.is_hot)
+            .unwrap_or(false)
+    }
+
+    /// Get the hot node entry for a node.
+    pub fn hot_entry(&self, node_id: NodeId) -> Option<&HotNodeEntry> {
+        self.hot_nodes.get(&node_id)
+    }
+
+    /// Look up a section by its title.
+    pub fn find_section(&self, title: &str) -> Option<NodeId> {
+        self.section_map.get(&title.to_lowercase()).copied()
+    }
+
+    /// Iterate over all keyword → topic entries (for graph building).
+    pub fn all_topic_entries(&self) -> impl Iterator<Item = (&String, &[TopicEntry])> {
+        self.topic_paths.iter().map(|(k, v)| (k, v.as_slice()))
+    }
+
+    /// Get the number of topic keywords indexed.
+    pub fn topic_count(&self) -> usize {
+        self.topic_paths.len()
+    }
+
+    /// Get the number of sections in the section map.
+    pub fn section_count(&self) -> usize {
+        self.section_map.len()
+    }
+
+    /// Get the number of hot nodes.
+    pub fn hot_node_count(&self) -> usize {
+        self.hot_nodes.iter().filter(|(_, e)| e.is_hot).count()
+    }
+
+    /// Update hot node tracking from retrieval results.
+    pub fn update_hot_nodes(&mut self, hits: &[(NodeId, f32)], hot_threshold: u32) {
+        for &(node_id, score) in hits {
+            let entry = self.hot_nodes.entry(node_id).or_insert(HotNodeEntry {
+                hit_count: 0,
+                avg_score: 0.0,
+                is_hot: false,
+            });
+            entry.hit_count += 1;
+            entry.avg_score += (score - entry.avg_score) / entry.hit_count as f32;
+            if entry.hit_count >= hot_threshold {
+                entry.is_hot = true;
+            }
+        }
+    }
+}
+
+impl Default for ReasoningIndex {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Builder for constructing a `ReasoningIndex`.
+pub struct ReasoningIndexBuilder {
+    topic_paths: HashMap<String, Vec<TopicEntry>>,
+    summary_shortcut: Option<SummaryShortcut>,
+    hot_nodes: HashMap<NodeId, HotNodeEntry>,
+    section_map: HashMap<String, NodeId>,
+    config_hash: u64,
+}
+
+impl ReasoningIndexBuilder {
+    /// Create a new builder.
+    pub fn new() -> Self {
+        Self {
+            topic_paths: HashMap::new(),
+            summary_shortcut: None,
+            hot_nodes: HashMap::new(),
+            section_map: HashMap::new(),
+            config_hash: 0,
+        }
+    }
+
+    /// Add a topic entry for a keyword.
+    pub fn add_topic_entry(&mut self, keyword: impl Into<String>, entry: TopicEntry) {
+        self.topic_paths
+            .entry(keyword.into())
+            .or_default()
+            .push(entry);
+    }
+
+    /// Set the summary shortcut.
+    pub fn summary_shortcut(mut self, shortcut: SummaryShortcut) -> Self {
+        self.summary_shortcut = Some(shortcut);
+        self
+    }
+
+    /// Add a section mapping.
+    pub fn add_section(&mut self, title: impl Into<String>, node_id: NodeId) {
+        self.section_map
+            .insert(title.into().to_lowercase(), node_id);
+    }
+
+    /// Set the config hash for cache invalidation.
+    pub fn config_hash(mut self, hash: u64) -> Self {
+        self.config_hash = hash;
+        self
+    }
+
+    /// Sort topic entries by weight (descending) and trim per-keyword lists.
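+    ///
+    /// A minimal usage sketch (`node_id` stands in for a real ID):
+    /// ```ignore
+    /// let mut builder = ReasoningIndex::builder();
+    /// builder.add_topic_entry("retrieval", TopicEntry { node_id, weight: 0.4, depth: 2 });
+    /// builder.add_topic_entry("retrieval", TopicEntry { node_id, weight: 0.9, depth: 1 });
+    /// builder.sort_and_trim(20); // best-weighted entries first, at most 20 per keyword
+    /// ```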
+    pub fn sort_and_trim(&mut self, max_entries: usize) {
+        for entries in self.topic_paths.values_mut() {
+            entries.sort_by(|a, b| {
+                b.weight
+                    .partial_cmp(&a.weight)
+                    .unwrap_or(std::cmp::Ordering::Equal)
+            });
+            entries.truncate(max_entries);
+        }
+    }
+
+    /// Build the reasoning index.
+    pub fn build(self) -> ReasoningIndex {
+        ReasoningIndex {
+            topic_paths: self.topic_paths,
+            summary_shortcut: self.summary_shortcut,
+            hot_nodes: self.hot_nodes,
+            section_map: self.section_map,
+            config_hash: self.config_hash,
+        }
+    }
+}
+
+impl Default for ReasoningIndexBuilder {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// A topic entry mapping a keyword to a node with a weight.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TopicEntry {
+    /// The target node.
+    pub node_id: NodeId,
+    /// Weight indicating how relevant this keyword is to this node (0.0–1.0).
+    pub weight: f32,
+    /// Depth of the node in the tree (for tie-breaking).
+    pub depth: usize,
+}
+
+/// Pre-computed shortcut for summary-style queries.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SummaryShortcut {
+    /// The root node ID (direct answer for "what is this about" queries).
+    pub root_node: NodeId,
+    /// Pre-collected summaries of top-level sections.
+    pub section_summaries: Vec<SectionSummary>,
+    /// Combined summary text for direct return.
+    pub document_summary: String,
+}
+
+/// A pre-collected section summary for quick access.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SectionSummary {
+    /// Section node ID.
+    pub node_id: NodeId,
+    /// Section title.
+    pub title: String,
+    /// Section summary (pre-computed by EnhanceStage).
+    pub summary: String,
+    /// Depth of the section.
+    pub depth: usize,
+}
+
+/// Entry tracking how often a node is retrieved.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct HotNodeEntry {
+    /// Number of times this node appeared in retrieval results.
+    pub hit_count: u32,
+    /// Rolling average score when retrieved.
+    pub avg_score: f32,
+    /// Whether this node is currently marked as "hot"
+    /// (hit_count exceeds the configured threshold).
+    pub is_hot: bool,
+}
+
+/// Configuration for building and using the reasoning index.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ReasoningIndexConfig {
+    /// Whether reasoning index building is enabled.
+    pub enabled: bool,
+    /// Minimum hit count for a node to be considered "hot".
+    pub hot_node_threshold: u32,
+    /// Maximum number of topic entries per keyword.
+    pub max_topic_entries: usize,
+    /// Maximum number of keyword-to-node mappings to keep.
+    pub max_keyword_entries: usize,
+    /// Minimum keyword length to index.
+    pub min_keyword_length: usize,
+    /// Whether to build the summary shortcut.
+    pub build_summary_shortcut: bool,
+    /// Whether to expand keywords with LLM-generated synonyms.
+    /// When enabled, the indexing stage calls the LLM to generate
+    /// synonym terms for each keyword, improving recall for queries
+    /// that use different wording than the document.
+    pub enable_synonym_expansion: bool,
+}
+
+impl Default for ReasoningIndexConfig {
+    fn default() -> Self {
+        Self {
+            enabled: true,
+            hot_node_threshold: 3,
+            max_topic_entries: 20,
+            max_keyword_entries: 5000,
+            min_keyword_length: 2,
+            build_summary_shortcut: true,
+            enable_synonym_expansion: true,
+        }
+    }
+}
+
+impl ReasoningIndexConfig {
+    /// Create a new config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Create a disabled config.
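+    ///
+    /// A minimal usage sketch:
+    /// ```ignore
+    /// let config = ReasoningIndexConfig::disabled();
+    /// assert!(!config.enabled);
+    ///
+    /// let tuned = ReasoningIndexConfig::new()
+    ///     .with_hot_threshold(5)
+    ///     .with_summary_shortcut(false)
+    ///     .with_synonym_expansion(false);
+    /// ```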
+    pub fn disabled() -> Self {
+        Self {
+            enabled: false,
+            ..Self::default()
+        }
+    }
+
+    /// Set the hot node threshold.
+    pub fn with_hot_threshold(mut self, threshold: u32) -> Self {
+        self.hot_node_threshold = threshold;
+        self
+    }
+
+    /// Set whether to build the summary shortcut.
+    pub fn with_summary_shortcut(mut self, build: bool) -> Self {
+        self.build_summary_shortcut = build;
+        self
+    }
+
+    /// Enable or disable synonym expansion.
+    pub fn with_synonym_expansion(mut self, enable: bool) -> Self {
+        self.enable_synonym_expansion = enable;
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_reasoning_index_default() {
+        let index = ReasoningIndex::default();
+        assert_eq!(index.topic_count(), 0);
+        assert_eq!(index.section_count(), 0);
+        assert_eq!(index.hot_node_count(), 0);
+        assert!(index.summary_shortcut().is_none());
+    }
+
+    #[test]
+    fn test_builder_basic() {
+        // Create a simple tree to get valid NodeIds
+        let mut tree = crate::document::DocumentTree::new("Root", "root content");
+        let child1 = tree.add_child(tree.root(), "Introduction", "intro content");
+        let child2 = tree.add_child(tree.root(), "Methods", "methods content");
+
+        let mut builder = ReasoningIndexBuilder::new();
+        builder.add_section("Introduction", child1);
+        builder.add_section("Methods", child2);
+
+        let index = builder.build();
+        assert_eq!(index.section_count(), 2);
+        assert!(index.find_section("introduction").is_some());
+        assert!(index.find_section("INTRODUCTION").is_some());
+        assert!(index.find_section("methods").is_some());
+    }
+
+    #[test]
+    fn test_config_default() {
+        let config = ReasoningIndexConfig::default();
+        assert!(config.enabled);
+        assert_eq!(config.hot_node_threshold, 3);
+        assert!(config.build_summary_shortcut);
+    }
+
+    #[test]
+    fn test_config_disabled() {
+        let config = ReasoningIndexConfig::disabled();
+        assert!(!config.enabled);
+    }
+
+    #[test]
+    fn test_serialization_roundtrip_empty() {
+        let mut tree = crate::document::DocumentTree::new("Root", "content");
+        let child = tree.add_child(tree.root(), "Section 1", "s1 content");
+
+        let mut builder = ReasoningIndexBuilder::new();
+        builder.add_section("Section 1", child);
+        builder.add_topic_entry(
+            "section",
+            TopicEntry {
+                node_id: child,
+                weight: 0.8,
+                depth: 1,
+            },
+        );
+        let index = builder.build();
+
+        let json = serde_json::to_string(&index).unwrap();
+        let restored: ReasoningIndex = serde_json::from_str(&json).unwrap();
+
+        assert_eq!(restored.topic_count(), 1);
+        assert_eq!(restored.section_count(), 1);
+        assert_eq!(restored.hot_node_count(), 0);
+    }
+
+    #[test]
+    fn test_serialization_roundtrip_with_hot_nodes() {
+        let mut tree = crate::document::DocumentTree::new("Root", "");
+        let root = tree.root();
+        let c1 = tree.add_child(root, "S1", "content 1");
+        let c2 = tree.add_child(root, "S2", "content 2");
+
+        let mut index = ReasoningIndex::new();
+        index.update_hot_nodes(&[(c1, 0.9), (c2, 0.7), (c1, 0.8)], 2);
+
+        // c1 should be hot (2 hits >= threshold 2)
+        assert!(index.is_hot(c1));
+        // c2 should not be hot (1 hit < threshold 2)
+        assert!(!index.is_hot(c2));
+
+        let json = serde_json::to_string(&index).unwrap();
+
+        // hot_nodes should serialize as an array of pairs, not as an object
+        assert!(!json.contains("\"hot_nodes\":{}"));
+        assert!(json.contains("\"hot_nodes\":["));
+
+        let restored: ReasoningIndex = serde_json::from_str(&json).unwrap();
+        assert!(restored.is_hot(c1));
+        assert!(!restored.is_hot(c2));
+
+        let entry = restored.hot_entry(c1).unwrap();
+        assert_eq!(entry.hit_count, 2);
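+        // Rolling average for c1: 0.9 after the first hit, then
+        // 0.9 + (0.8 - 0.9) / 2 = 0.85 after the second.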
+        assert!(entry.avg_score > 0.0);
+    }
+
+    #[test]
+    fn test_backward_compat_hot_nodes_empty_object() {
+        // Simulate old JSON where hot_nodes was serialized as {} by derive.
+        let mut tree = crate::document::DocumentTree::new("Root", "");
+        let child = tree.add_child(tree.root(), "S1", "c");
+
+        let mut builder = ReasoningIndexBuilder::new();
+        builder.add_section("s1", child);
+        let index = builder.build();
+
+        // Serialize normally (produces "hot_nodes":[]), then replace with
+        // the old format to test backward compat
+        let json = serde_json::to_string(&index).unwrap();
+        let old_json = json.replace("\"hot_nodes\":[]", "\"hot_nodes\":{}");
+
+        let restored: ReasoningIndex = serde_json::from_str(&old_json).unwrap();
+        assert_eq!(restored.hot_node_count(), 0);
+    }
+}
diff --git a/vectorless-core/vectorless-document/src/reference.rs b/vectorless-core/vectorless-document/src/reference.rs
new file mode 100644
index 00000000..10d08e42
--- /dev/null
+++ b/vectorless-core/vectorless-document/src/reference.rs
@@ -0,0 +1,559 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! In-document reference types and extraction.
+//!
+//! This module provides support for parsing and following references
+//! within documents, such as "see Appendix G" or "refer to Table 5.3".
+//!
+//! # Example
+//!
+//! ```ignore
+//! use vectorless::document::{NodeReference, RefType, ReferenceExtractor};
+//!
+//! let content = "For more details, see Section 2.1 and Appendix G.";
+//! let refs = ReferenceExtractor::extract(content);
+//!
+//! for r#ref in refs {
+//!     println!("Found {:?}: {}", r#ref.ref_type, r#ref.ref_text);
+//! }
+//! ```
+
+use regex::Regex;
+use serde::{Deserialize, Serialize};
+use std::sync::LazyLock;
+
+use super::NodeId;
+
+/// Type of in-document reference.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum RefType {
+    /// Reference to a section (e.g., "Section 2.1", "Chapter 3").
+    Section,
+    /// Reference to an appendix (e.g., "Appendix A", "Appendix G").
+    Appendix,
+    /// Reference to a table (e.g., "Table 5.3", "Table 1").
+    Table,
+    /// Reference to a figure (e.g., "Figure 2.1", "Fig. 3").
+    Figure,
+    /// Reference to a page (e.g., "page 42", "p. 15").
+    Page,
+    /// Reference to an equation (e.g., "Equation 1", "Eq. 2.3").
+    Equation,
+    /// Reference to a footnote (e.g., "footnote 1").
+    Footnote,
+    /// Reference to a listing/code block.
+    Listing,
+    /// Unknown reference type.
+    Unknown,
+}
+
+impl std::fmt::Display for RefType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            RefType::Section => write!(f, "Section"),
+            RefType::Appendix => write!(f, "Appendix"),
+            RefType::Table => write!(f, "Table"),
+            RefType::Figure => write!(f, "Figure"),
+            RefType::Page => write!(f, "Page"),
+            RefType::Equation => write!(f, "Equation"),
+            RefType::Footnote => write!(f, "Footnote"),
+            RefType::Listing => write!(f, "Listing"),
+            RefType::Unknown => write!(f, "Reference"),
+        }
+    }
+}
+
+/// A reference found within document content.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct NodeReference {
+    /// The original reference text (e.g., "see Appendix G").
+    pub ref_text: String,
+    /// The target identifier extracted from the reference (e.g., "G", "5.3").
+    pub target_id: String,
+    /// Type of the reference.
+    pub ref_type: RefType,
+    /// Resolved target node ID (if found in the tree).
+    pub target_node: Option<NodeId>,
+    /// Confidence score for the resolution (0.0–1.0).
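+    /// Currently `extract_and_resolve` assigns a flat 0.8 to any
+    /// reference it resolves and leaves 0.0 otherwise.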
+    pub confidence: f32,
+    /// Position in the original text (character offset).
+    pub position: usize,
+}
+
+impl NodeReference {
+    /// Create a new unresolved reference.
+    pub fn new(ref_text: String, target_id: String, ref_type: RefType, position: usize) -> Self {
+        Self {
+            ref_text,
+            target_id,
+            ref_type,
+            target_node: None,
+            confidence: 0.0,
+            position,
+        }
+    }
+
+    /// Create a resolved reference with a target node.
+    pub fn resolved(
+        ref_text: String,
+        target_id: String,
+        ref_type: RefType,
+        position: usize,
+        target_node: NodeId,
+        confidence: f32,
+    ) -> Self {
+        Self {
+            ref_text,
+            target_id,
+            ref_type,
+            target_node: Some(target_node),
+            confidence,
+            position,
+        }
+    }
+
+    /// Check if this reference has been resolved.
+    pub fn is_resolved(&self) -> bool {
+        self.target_node.is_some()
+    }
+}
+
+/// Reference extraction patterns.
+static SECTION_PATTERNS: LazyLock<Vec<(Regex, RefType)>> = LazyLock::new(|| {
+    vec![
+        // Section references: "Section 2.1", "section 2.1.3", "Sec. 2.1"
+        (
+            Regex::new(r"(?i)(?:see\s+)?(?:section|sec\.?)\s+([\d.]+)").unwrap(),
+            RefType::Section,
+        ),
+        // Chapter references: "Chapter 3", "Ch. 2"
+        (
+            Regex::new(r"(?i)(?:see\s+)?(?:chapter|ch\.?)\s+(\d+)").unwrap(),
+            RefType::Section,
+        ),
+    ]
+});
+
+static APPENDIX_PATTERNS: LazyLock<Vec<(Regex, RefType)>> = LazyLock::new(|| {
+    vec![
+        // Appendix references: "Appendix A", "appendix G", "App. B"
+        (
+            Regex::new(r"(?i)(?:see\s+)?(?:appendix|app\.?)\s+([A-Z]|[a-z])").unwrap(),
+            RefType::Appendix,
+        ),
+    ]
+});
+
+static TABLE_PATTERNS: LazyLock<Vec<(Regex, RefType)>> = LazyLock::new(|| {
+    vec![
+        // Table references: "Table 5.3", "table 1", "Tbl. 2.1"
+        (
+            Regex::new(r"(?i)(?:see\s+)?(?:table|tbl\.?)\s+([\d.]+)").unwrap(),
+            RefType::Table,
+        ),
+    ]
+});
+
+static FIGURE_PATTERNS: LazyLock<Vec<(Regex, RefType)>> = LazyLock::new(|| {
+    vec![
+        // Figure references: "Figure 2.1", "fig. 3", "Fig 1.2"
+        (
+            Regex::new(r"(?i)(?:see\s+)?(?:figure|fig\.?)\s+([\d.]+)").unwrap(),
+            RefType::Figure,
+        ),
+    ]
+});
+
+static PAGE_PATTERNS: LazyLock<Vec<(Regex, RefType)>> = LazyLock::new(|| {
+    vec![
+        // Page references: "page 42", "p. 15", "pp. 20-25"
+        (
+            Regex::new(r"(?i)(?:see\s+)?(?:page|p\.?)\s+(\d+)").unwrap(),
+            RefType::Page,
+        ),
+    ]
+});
+
+static EQUATION_PATTERNS: LazyLock<Vec<(Regex, RefType)>> = LazyLock::new(|| {
+    vec![
+        // Equation references: "Equation 1", "Eq. 2.3"
+        (
+            Regex::new(r"(?i)(?:see\s+)?(?:equation|eq\.?)\s+([\d.]+)").unwrap(),
+            RefType::Equation,
+        ),
+    ]
+});
+
+/// Reference extractor for parsing in-document references.
+///
+/// # Example
+///
+/// ```ignore
+/// let content = "For details, see Section 2.1 and Appendix G.";
+/// let refs = ReferenceExtractor::extract(content);
+/// assert_eq!(refs.len(), 2);
+/// ```
+pub struct ReferenceExtractor;
+
+impl ReferenceExtractor {
+    /// Extract all references from text content.
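+    ///
+    /// Matches from all pattern groups are collected, sorted by character
+    /// offset, and de-duplicated when two patterns match at the same position.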
+    pub fn extract(text: &str) -> Vec<NodeReference> {
+        let mut references = Vec::new();
+
+        // All pattern groups share the same capture shape: group 0 is the
+        // full reference text, group 1 is the target identifier.
+        let groups: [&LazyLock<Vec<(Regex, RefType)>>; 6] = [
+            &SECTION_PATTERNS,
+            &APPENDIX_PATTERNS,
+            &TABLE_PATTERNS,
+            &FIGURE_PATTERNS,
+            &PAGE_PATTERNS,
+            &EQUATION_PATTERNS,
+        ];
+
+        for group in groups {
+            for (regex, ref_type) in group.iter() {
+                for cap in regex.captures_iter(text) {
+                    if let (Some(full_match), Some(target)) = (cap.get(0), cap.get(1)) {
+                        let mut target_id = target.as_str().to_string();
+                        // Normalize appendix letters to uppercase ("g" -> "G")
+                        if *ref_type == RefType::Appendix {
+                            target_id = target_id.to_uppercase();
+                        }
+                        references.push(NodeReference::new(
+                            full_match.as_str().to_string(),
+                            target_id,
+                            *ref_type,
+                            full_match.start(),
+                        ));
+                    }
+                }
+            }
+        }
+
+        // Sort by position and remove duplicates
+        references.sort_by_key(|r| r.position);
+        references.dedup_by(|a, b| a.position == b.position);
+
+        references
+    }
+
+    /// Extract references and attempt to resolve them against a tree.
+    ///
+    /// Uses the tree's structure index and title matching to find targets.
+    pub fn extract_and_resolve(
+        text: &str,
+        tree: &super::DocumentTree,
+        index: &super::RetrievalIndex,
+    ) -> Vec<NodeReference> {
+        let mut references = Self::extract(text);
+
+        for ref_mut in &mut references {
+            ref_mut.target_node = Self::resolve_reference(ref_mut, tree, index);
+            if ref_mut.target_node.is_some() {
+                ref_mut.confidence = 0.8;
+            }
+        }
+
+        references
+    }
+
+    /// Resolve a reference to a node in the tree.
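+    ///
+    /// Resolution strategy by type: sections go through the structure
+    /// index (exact match first, then an "N." prefix match), appendices,
+    /// tables, and figures are matched against node titles, and pages go
+    /// through the page index. Other types are left unresolved.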
+    fn resolve_reference(
+        r#ref: &NodeReference,
+        tree: &super::DocumentTree,
+        index: &super::RetrievalIndex,
+    ) -> Option<NodeId> {
+        match r#ref.ref_type {
+            RefType::Section => {
+                // Try to find by structure index (e.g., "2.1" -> structure "2.1")
+                if let Some(node_id) = index.find_by_structure(&r#ref.target_id) {
+                    return Some(node_id);
+                }
+                // Try partial match (e.g., "2" might match "2.1" or "2.2")
+                for (structure, &node_id) in index.structures() {
+                    if structure.starts_with(&format!("{}.", r#ref.target_id))
+                        || structure.as_str() == r#ref.target_id
+                    {
+                        return Some(node_id);
+                    }
+                }
+                None
+            }
+            RefType::Appendix => {
+                // Search for nodes with "Appendix X" in title
+                for node_id in tree.traverse() {
+                    if let Some(node) = tree.get(node_id) {
+                        let title_lower = node.title.to_lowercase();
+                        if title_lower
+                            .starts_with(&format!("appendix {}", r#ref.target_id.to_lowercase()))
+                        {
+                            return Some(node_id);
+                        }
+                    }
+                }
+                None
+            }
+            RefType::Table => {
+                // Search for nodes with "Table X" in title
+                for node_id in tree.traverse() {
+                    if let Some(node) = tree.get(node_id) {
+                        let title_lower = node.title.to_lowercase();
+                        if title_lower.contains(&format!("table {}", r#ref.target_id)) {
+                            return Some(node_id);
+                        }
+                    }
+                }
+                None
+            }
+            RefType::Figure => {
+                // Search for nodes with "Figure X" in title
+                for node_id in tree.traverse() {
+                    if let Some(node) = tree.get(node_id) {
+                        let title_lower = node.title.to_lowercase();
+                        if title_lower.contains(&format!("figure {}", r#ref.target_id))
+                            || title_lower.contains(&format!("fig {}", r#ref.target_id))
+                        {
+                            return Some(node_id);
+                        }
+                    }
+                }
+                None
+            }
+            RefType::Page => {
+                // Parse the page number and look it up in the page index
+                if let Ok(page) = r#ref.target_id.parse::<usize>() {
+                    return index.find_by_page(page);
+                }
+                None
+            }
+            _ => None,
+        }
+    }
+}
+
+/// Reference resolver for batch resolution.
+///
+/// Caches resolved references for efficient reuse.
+#[derive(Debug, Clone, Default)]
+pub struct ReferenceResolver {
+    /// Cache of resolved references by ref_text.
+    cache: std::collections::HashMap<String, Option<NodeId>>,
+}
+
+impl ReferenceResolver {
+    /// Create a new reference resolver.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Resolve references in batch and cache results.
+    pub fn resolve_batch(
+        &mut self,
+        references: &[NodeReference],
+        tree: &super::DocumentTree,
+        index: &super::RetrievalIndex,
+    ) {
+        for r#ref in references {
+            if !self.cache.contains_key(&r#ref.ref_text) {
+                let resolved = ReferenceExtractor::resolve_reference(r#ref, tree, index);
+                self.cache.insert(r#ref.ref_text.clone(), resolved);
+            }
+        }
+    }
+
+    /// Get a cached resolution.
+    ///
+    /// The outer `Option` signals a cache miss; the inner `Option` signals
+    /// an unresolved reference.
+    pub fn get(&self, ref_text: &str) -> Option<Option<NodeId>> {
+        self.cache.get(ref_text).copied()
+    }
+
+    /// Clear the cache.
+    pub fn clear(&mut self) {
+        self.cache.clear();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_section_references() {
+        let text = "For details, see Section 2.1 and Section 3.2.1.";
+        let refs = ReferenceExtractor::extract(text);
+
+        // Debug: print what was extracted
+        for r in &refs {
+            eprintln!(
+                "Extracted: {:?} '{}' -> '{}'",
+                r.ref_type, r.ref_text, r.target_id
+            );
+        }
+
+        assert!(
+            refs.iter()
+                .any(|r| r.ref_type == RefType::Section && r.target_id == "2.1")
+        );
+        // Note: the regex may not capture all multi-level section numbers
+        // correctly in a single pass, so we only check for the presence of
+        // section references
+        assert!(refs.iter().any(|r| r.ref_type == RefType::Section));
+    }
+
+    #[test]
+    fn test_extract_appendix_references() {
+        let text = "See Appendix G for more information.";
+        let refs = ReferenceExtractor::extract(text);
+
+        assert!(
+            refs.iter()
+                .any(|r| r.ref_type == RefType::Appendix && r.target_id == "G")
+        );
+    }
+
+    #[test]
+    fn test_extract_table_references() {
+        let text = "The data is shown in Table 5.3 and Table 1.";
+        let refs = ReferenceExtractor::extract(text);
+
+        // Debug output
+        for r in &refs {
+            eprintln!(
+                "Extracted: {:?} '{}' -> '{}'",
+                r.ref_type, r.ref_text, r.target_id
+            );
+        }
+
+        assert!(
+            refs.iter()
+                .any(|r| r.ref_type == RefType::Table && r.target_id == "5.3")
+        );
+        // The trailing period may be included, so check for either "1" or "1."
+        assert!(
+            refs.iter().any(
+                |r| r.ref_type == RefType::Table && (r.target_id == "1" || r.target_id == "1.")
+            )
+        );
+    }
+
+    #[test]
+    fn test_extract_figure_references() {
+        let text = "As shown in Figure 2.1 and fig. 3.";
+        let refs = ReferenceExtractor::extract(text);
+
+        // Debug output
+        for r in &refs {
+            eprintln!(
+                "Extracted: {:?} '{}' -> '{}'",
+                r.ref_type, r.ref_text, r.target_id
+            );
+        }
+
+        assert!(
+            refs.iter()
+                .any(|r| r.ref_type == RefType::Figure && r.target_id == "2.1")
+        );
+        // The trailing period may be included, so check for either "3" or "3."
+        assert!(
+            refs.iter()
+                .any(|r| r.ref_type == RefType::Figure
+                    && (r.target_id == "3" || r.target_id == "3."))
+        );
+    }
+
+    #[test]
+    fn test_extract_page_references() {
+        let text = "See page 42 for details.";
+        let refs = ReferenceExtractor::extract(text);
+
+        assert!(
+            refs.iter()
+                .any(|r| r.ref_type == RefType::Page && r.target_id == "42")
+        );
+    }
+
+    #[test]
+    fn test_extract_mixed_references() {
+        let text = "For details, see Section 2.1, Appendix G, and Table 5.3.";
+        let refs = ReferenceExtractor::extract(text);
+
+        assert_eq!(refs.len(), 3);
+        assert!(refs.iter().any(|r| r.ref_type == RefType::Section));
+        assert!(refs.iter().any(|r| r.ref_type == RefType::Appendix));
+        assert!(refs.iter().any(|r| r.ref_type == RefType::Table));
+    }
+
+    #[test]
+    fn test_ref_type_display() {
+        assert_eq!(format!("{}", RefType::Section), "Section");
+        assert_eq!(format!("{}", RefType::Appendix), "Appendix");
+        assert_eq!(format!("{}", RefType::Table), "Table");
+    }
+
+    #[test]
+    fn test_node_reference_is_resolved() {
+        let unresolved = NodeReference::new(
+            "Section 2.1".to_string(),
+            "2.1".to_string(),
+            RefType::Section,
+            0,
+        );
+        assert!(!unresolved.is_resolved());
+
+        // Can't easily test resolved() without a real NodeId
+    }
+}
diff --git a/vectorless-core/vectorless-document/src/serde_helpers.rs b/vectorless-core/vectorless-document/src/serde_helpers.rs
new file mode 100644
index 00000000..cb658c35
--- /dev/null
+++ b/vectorless-core/vectorless-document/src/serde_helpers.rs
@@ -0,0 +1,241 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Serde helpers for types that contain `HashMap<NodeId, V>`.
+//!
+//! JSON requires object keys to be strings, but `NodeId` (wrapping `indextree::NodeId`)
+//! serializes as an integer. When `serde_json` serializes a `HashMap<NodeId, V>`,
+//! it converts the integer key to a string (e.g., `42` → `"42"`), but on deserialization
+//! it cannot parse the string back to `NodeId` because the deserializer expects a number.
+//!
+//! This module provides a `#[serde(with = "node_id_map")]` adapter that serializes
+//! `HashMap<NodeId, V>` as a `Vec<(NodeId, V)>` instead, which is JSON-safe.
+//!
+//! # Usage
+//!
+//! ```rust,ignore
+//! use serde::{Serialize, Deserialize};
+//! use std::collections::HashMap;
+//! use crate::document::serde_helpers::node_id_map;
+//!
+//! #[derive(Serialize, Deserialize)]
+//! struct MyStruct {
+//!     #[serde(with = "node_id_map")]
+//!     entries: HashMap<NodeId, MyEntry>,
+//! }
+//! ```
+
+use std::collections::HashMap;
+
+use serde::de::DeserializeOwned;
+use serde::{Deserialize, Deserializer, Serialize, Serializer};
+
+use super::node::NodeId;
+
+/// Serialize `HashMap<NodeId, V>` as `Vec<(NodeId, V)>` (sorted by key for determinism).
+pub fn serialize<V, S>(map: &HashMap<NodeId, V>, serializer: S) -> Result<S::Ok, S::Error>
+where
+    V: Serialize,
+    S: Serializer,
+{
+    let mut pairs: Vec<_> = map.iter().map(|(k, v)| (*k, v)).collect();
+    pairs.sort_by_key(|(id, _)| usize::from(id.0));
+    pairs.serialize(serializer)
+}
+
+/// Deserialize `Vec<(NodeId, V)>` back into `HashMap<NodeId, V>`.
+///
+/// Also accepts `{}` (empty JSON object) for backward compatibility with
+/// data serialized before this helper was introduced, when `hot_nodes` etc.
+/// were empty and serialized as `{}`.
+pub fn deserialize<'de, V, D>(deserializer: D) -> Result<HashMap<NodeId, V>, D::Error>
+where
+    V: DeserializeOwned,
+    D: Deserializer<'de>,
+{
+    use serde::de;
+
+    // Try to deserialize as either a Vec of pairs or an empty object.
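+    // `deserialize_any` lets the format dispatch on the next JSON token:
+    // `[` is routed to visit_seq (the current pair-list format), while
+    // `{` is routed to visit_map (the legacy empty-object format).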
+    struct VecOrEmptyMap<V>(std::marker::PhantomData<V>);
+
+    impl<'de, V> de::Visitor<'de> for VecOrEmptyMap<V>
+    where
+        V: DeserializeOwned,
+    {
+        type Value = HashMap<NodeId, V>;
+
+        fn expecting(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            f.write_str("a list of (NodeId, value) pairs or an empty object")
+        }
+
+        fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+        where
+            A: de::SeqAccess<'de>,
+        {
+            let pairs: Vec<(NodeId, V)> =
+                Deserialize::deserialize(de::value::SeqAccessDeserializer::new(seq))?;
+            Ok(pairs.into_iter().collect())
+        }
+
+        fn visit_map<A>(self, _map: A) -> Result<Self::Value, A::Error>
+        where
+            A: de::MapAccess<'de>,
+        {
+            // Accept only an empty legacy object: nothing is consumed here,
+            // so serde_json will reject any object that still has entries.
+            Ok(HashMap::new())
+        }
+    }
+
+    deserializer.deserialize_any(VecOrEmptyMap(std::marker::PhantomData))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::document::DocumentTree;
+
+    /// Wrapper struct to test `#[serde(with)]` through a serde_json round-trip.
+    #[derive(Serialize, Deserialize, Debug)]
+    struct Wrap<V: Serialize + DeserializeOwned> {
+        #[serde(with = "super")]
+        map: HashMap<NodeId, V>,
+    }
+
+    #[test]
+    fn test_empty_map_roundtrip() {
+        let original = Wrap {
+            map: HashMap::<NodeId, String>::new(),
+        };
+        let json = serde_json::to_string(&original).unwrap();
+        assert!(json.contains("\"map\":[]"));
+
+        let restored: Wrap<String> = serde_json::from_str(&json).unwrap();
+        assert!(restored.map.is_empty());
+    }
+
+    #[test]
+    fn test_single_entry_roundtrip() {
+        let tree = DocumentTree::new("Root", "content");
+        let root = tree.root();
+
+        let original = Wrap {
+            map: {
+                let mut m = HashMap::new();
+                m.insert(root, "root data".to_string());
+                m
+            },
+        };
+
+        let json = serde_json::to_string(&original).unwrap();
+        let restored: Wrap<String> = serde_json::from_str(&json).unwrap();
+        assert_eq!(restored.map.get(&root), Some(&"root data".to_string()));
+    }
+
+    #[test]
+    fn test_multiple_entries_roundtrip() {
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+        let c1 = tree.add_child(root, "C1", "c1");
+        let c2 = tree.add_child(root, "C2", "c2");
+
+        let original = Wrap {
+            map: {
+                let mut m = HashMap::new();
+                m.insert(root, 0u32);
+                m.insert(c1, 1u32);
+                m.insert(c2, 2u32);
+                m
+            },
+        };
+
+        let json = serde_json::to_string(&original).unwrap();
+        let restored: Wrap<u32> = serde_json::from_str(&json).unwrap();
+
+        assert_eq!(restored.map.len(), 3);
+        assert_eq!(restored.map[&root], 0);
+        assert_eq!(restored.map[&c1], 1);
+        assert_eq!(restored.map[&c2], 2);
+    }
+
+    #[test]
+    fn test_backward_compat_empty_object() {
+        // Old data serialized hot_nodes as {} before node_id_map was used.
+        let json = r#"{"map": {}}"#;
+        let restored: Wrap<String> = serde_json::from_str(json).unwrap();
+        assert!(restored.map.is_empty());
+    }
+
+    #[test]
+    fn test_backward_compat_nonempty_object_rejected() {
+        // A non-empty JSON object with string keys like {"1": "data"} should
+        // fail: the visitor only accepts an empty object, so serde_json
+        // errors out on the leftover entry.
+        let json = r#"{"map": {"1": "data"}}"#;
+        let result: Result<Wrap<String>, _> = serde_json::from_str(json);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_serialized_json_shape() {
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+        let child = tree.add_child(root, "Child", "c");
+
+        let original = Wrap {
+            map: {
+                let mut m = HashMap::new();
+                m.insert(root, "a".to_string());
+                m.insert(child, "b".to_string());
+                m
+            },
+        };
+
+        let json = serde_json::to_string(&original).unwrap();
+        // Verify deterministic ordering: root (id 0) before child (id 1)
+        let root_pos = json.find("\"a\"").unwrap_or(usize::MAX);
+        let child_pos = json.find("\"b\"").unwrap_or(usize::MAX);
+        assert!(
+            root_pos < child_pos,
+            "root entry should come first: {}",
+            json
+        );
+    }
+
+    #[test]
+    fn test_roundtrip_with_complex_value() {
+        // Test with a non-trivial value type (not just String/u32)
+        let tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+
+        #[derive(Serialize, Deserialize, Debug, PartialEq)]
+        struct Entry {
+            count: u32,
+            label: String,
+        }
+
+        #[derive(Serialize, Deserialize, Debug)]
+        struct ComplexWrap {
+            #[serde(with = "super")]
+            data: HashMap<NodeId, Entry>,
+        }
+
+        let original = ComplexWrap {
+            data: {
+                let mut m = HashMap::new();
+                m.insert(
+                    root,
+                    Entry {
+                        count: 42,
+                        label: "test".to_string(),
+                    },
+                );
+                m
+            },
+        };
+
+        let json = serde_json::to_string(&original).unwrap();
+        let restored: ComplexWrap = serde_json::from_str(&json).unwrap();
+        assert_eq!(restored.data[&root].count, 42);
+        assert_eq!(restored.data[&root].label, "test");
+    }
+}
diff --git a/vectorless-core/vectorless-document/src/structure.rs b/vectorless-core/vectorless-document/src/structure.rs
new file mode 100644
index 00000000..455b25cb
--- /dev/null
+++ b/vectorless-core/vectorless-document/src/structure.rs
@@ -0,0 +1,65 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document structure types for JSON export.
+
+use serde::{Deserialize, Serialize};
+
+/// A node in the document structure for JSON export.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct StructureNode {
+    /// Node title.
+    pub title: String,
+    /// Unique node identifier.
+    pub node_id: String,
+    /// Starting line number (1-based).
+    pub start_index: usize,
+    /// Ending line number (1-based).
+    pub end_index: usize,
+    /// Generated summary (optional).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub summary: Option<String>,
+    /// Child nodes.
+    #[serde(skip_serializing_if = "Vec::is_empty")]
+    pub nodes: Vec<StructureNode>,
+}
+
+/// Document structure for JSON export.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DocumentStructure {
+    /// Document name.
+    pub doc_name: String,
+    /// Tree structure.
+    pub structure: Vec<StructureNode>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_structure_node_serialization() {
+        let node = StructureNode {
+            title: "Introduction".to_string(),
+            node_id: "0001".to_string(),
+            start_index: 1,
+            end_index: 10,
+            summary: Some("A brief intro".to_string()),
+            nodes: vec![],
+        };
+
+        let json = serde_json::to_string(&node).unwrap();
+        assert!(json.contains("Introduction"));
+    }
+
+    #[test]
+    fn test_document_structure() {
+        let doc = DocumentStructure {
+            doc_name: "test.md".to_string(),
+            structure: vec![],
+        };
+
+        assert_eq!(doc.doc_name, "test.md");
+    }
+}
diff --git a/vectorless-core/vectorless-document/src/toc.rs b/vectorless-core/vectorless-document/src/toc.rs
new file mode 100644
index 00000000..6f1806ef
--- /dev/null
+++ b/vectorless-core/vectorless-document/src/toc.rs
@@ -0,0 +1,343 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Table of Contents (ToC) view generation.
+//!
+//! Provides utilities for generating different views of the document tree,
+//! including hierarchical ToC, flat ToC, and filtered views.
+
+use serde::{Deserialize, Serialize};
+
+use super::node::NodeId;
+use super::node::TreeNode;
+use super::tree::DocumentTree;
+
+/// A node in the Table of Contents.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TocNode {
+    /// Node title.
+    pub title: String,
+    /// Node ID (if available).
+    pub node_id: Option<String>,
+    /// Depth in the tree.
+    pub depth: usize,
+    /// Page range (for PDFs).
+    pub page_range: Option<(usize, usize)>,
+    /// Brief summary (optional).
+    pub summary: Option<String>,
+    /// Children nodes.
+    pub children: Vec<TocNode>,
+}
+
+impl TocNode {
+    /// Create a new ToC node.
+    pub fn new(title: impl Into<String>, depth: usize) -> Self {
+        Self {
+            title: title.into(),
+            node_id: None,
+            depth,
+            page_range: None,
+            summary: None,
+            children: Vec::new(),
+        }
+    }
+
+    /// Set the node ID.
+    pub fn with_node_id(mut self, id: impl Into<String>) -> Self {
+        self.node_id = Some(id.into());
+        self
+    }
+
+    /// Set the page range.
+    pub fn with_page_range(mut self, start: usize, end: usize) -> Self {
+        self.page_range = Some((start, end));
+        self
+    }
+
+    /// Set the summary.
+    pub fn with_summary(mut self, summary: impl Into<String>) -> Self {
+        self.summary = Some(summary.into());
+        self
+    }
+
+    /// Add a child node.
+    pub fn add_child(&mut self, child: TocNode) {
+        self.children.push(child);
+    }
+
+    /// Count total nodes in this subtree.
+    pub fn count_nodes(&self) -> usize {
+        1 + self.children.iter().map(|c| c.count_nodes()).sum::<usize>()
+    }
+
+    /// Count leaf nodes in this subtree.
+    pub fn count_leaves(&self) -> usize {
+        if self.children.is_empty() {
+            1
+        } else {
+            self.children.iter().map(|c| c.count_leaves()).sum()
+        }
+    }
+
+    /// Get maximum depth in this subtree.
+    pub fn max_depth(&self) -> usize {
+        if self.children.is_empty() {
+            self.depth
+        } else {
+            self.children
+                .iter()
+                .map(|c| c.max_depth())
+                .max()
+                .unwrap_or(self.depth)
+        }
+    }
+}
+
+/// Configuration for ToC generation.
+#[derive(Debug, Clone)]
+pub struct TocConfig {
+    /// Maximum depth to include (None = unlimited).
+    pub max_depth: Option<usize>,
+    /// Whether to include summaries.
+    pub include_summaries: bool,
+    /// Whether to include page ranges.
+    pub include_pages: bool,
+    /// Minimum content length to include (filters out empty nodes).
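+    /// A value of 0 (the default) disables this filter.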
+    pub min_content_length: usize,
+}
+
+impl Default for TocConfig {
+    fn default() -> Self {
+        Self {
+            max_depth: None,
+            include_summaries: true,
+            include_pages: true,
+            min_content_length: 0,
+        }
+    }
+}
+
+impl TocConfig {
+    /// Create new ToC config.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set maximum depth.
+    pub fn with_max_depth(mut self, depth: usize) -> Self {
+        self.max_depth = Some(depth);
+        self
+    }
+
+    /// Set whether to include summaries.
+    pub fn with_summaries(mut self, include: bool) -> Self {
+        self.include_summaries = include;
+        self
+    }
+}
+
+/// ToC view generator.
+#[derive(Clone)]
+pub struct TocView {
+    config: TocConfig,
+}
+
+impl TocView {
+    /// Create a new ToC view generator.
+    pub fn new() -> Self {
+        Self {
+            config: TocConfig::default(),
+        }
+    }
+
+    /// Create with custom configuration.
+    pub fn with_config(config: TocConfig) -> Self {
+        Self { config }
+    }
+
+    /// Generate ToC from a tree.
+    pub fn generate(&self, tree: &DocumentTree) -> TocNode {
+        self.build_toc_node(tree, tree.root(), 0)
+    }
+
+    /// Generate ToC starting from a specific node.
+    pub fn generate_from(&self, tree: &DocumentTree, start: NodeId) -> TocNode {
+        let depth = tree.get(start).map_or(0, |n| n.depth);
+        self.build_toc_node(tree, start, depth)
+    }
+
+    /// Build a ToC node from a tree node.
+    fn build_toc_node(&self, tree: &DocumentTree, node_id: NodeId, depth: usize) -> TocNode {
+        let node = match tree.get(node_id) {
+            Some(n) => n,
+            None => return TocNode::new("Unknown", depth),
+        };
+
+        // Check depth limit
+        if let Some(max) = self.config.max_depth {
+            if depth > max {
+                return TocNode::new("...", depth - 1);
+            }
+        }
+
+        // Check minimum content length
+        if node.content.len() < self.config.min_content_length && tree.children(node_id).is_empty()
+        {
+            return TocNode::new(node.title.clone(), depth);
+        }
+
+        let mut toc_node =
+            TocNode::new(&node.title, depth).with_node_id(node.node_id.clone().unwrap_or_default());
+
+        // Add page range
+        if self.config.include_pages {
+            if let (Some(start), Some(end)) = (node.start_page, node.end_page) {
+                toc_node = toc_node.with_page_range(start, end);
+            }
+        }
+
+        // Add summary
+        if self.config.include_summaries && !node.summary.is_empty() {
+            toc_node = toc_node.with_summary(&node.summary);
+        }
+
+        // Recursively add children
+        for child_id in tree.children(node_id) {
+            let child_toc = self.build_toc_node(tree, child_id, depth + 1);
+            toc_node.add_child(child_toc);
+        }
+
+        toc_node
+    }
+
+    /// Generate a flat list of ToC entries.
+    pub fn generate_flat(&self, tree: &DocumentTree) -> Vec<TocEntry> {
+        let mut entries = Vec::new();
+        self.collect_flat_entries(tree, tree.root(), &mut entries);
+        entries
+    }
+
+    fn collect_flat_entries(
+        &self,
+        tree: &DocumentTree,
+        node_id: NodeId,
+        entries: &mut Vec<TocEntry>,
+    ) {
+        if let Some(node) = tree.get(node_id) {
+            entries.push(TocEntry {
+                title: node.title.clone(),
+                node_id: node.node_id.clone(),
+                depth: node.depth,
+                page_range: node.start_page.zip(node.end_page),
+            });
+
+            for child_id in tree.children(node_id) {
+                self.collect_flat_entries(tree, child_id, entries);
+            }
+        }
+    }
+
+    /// Generate a filtered ToC based on a predicate.
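+    ///
+    /// A minimal usage sketch (assuming a built `tree`):
+    /// ```ignore
+    /// let view = TocView::new();
+    /// // Keep only shallow sections that carry a summary.
+    /// let hits = view.generate_filtered(&tree, |n| n.depth <= 1 && !n.summary.is_empty());
+    /// ```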
+    pub fn generate_filtered<F>(&self, tree: &DocumentTree, filter: F) -> Vec<TocNode>
+    where
+        F: Fn(&TreeNode) -> bool,
+    {
+        let mut result = Vec::new();
+        self.collect_filtered(tree, tree.root(), &filter, &mut result);
+        result
+    }
+
+    fn collect_filtered<F>(
+        &self,
+        tree: &DocumentTree,
+        node_id: NodeId,
+        filter: &F,
+        result: &mut Vec<TocNode>,
+    ) where
+        F: Fn(&TreeNode) -> bool,
+    {
+        if let Some(node) = tree.get(node_id) {
+            if filter(node) {
+                let toc_node = self.build_toc_node(tree, node_id, node.depth);
+                result.push(toc_node);
+            }
+
+            for child_id in tree.children(node_id) {
+                self.collect_filtered(tree, child_id, filter, result);
+            }
+        }
+    }
+
+    /// Format ToC as markdown.
+    pub fn format_markdown(&self, toc: &TocNode) -> String {
+        let mut output = String::new();
+        self.write_markdown(toc, &mut output, 0);
+        output
+    }
+
+    fn write_markdown(&self, toc: &TocNode, output: &mut String, level: usize) {
+        let indent = "  ".repeat(level);
+        let bullet = "-";
+
+        output.push_str(&format!("{}{} {}\n", indent, bullet, toc.title));
+
+        if let Some(ref summary) = toc.summary {
+            output.push_str(&format!("{}  > {}\n", indent, summary));
+        }
+
+        for child in &toc.children {
+            self.write_markdown(child, output, level + 1);
+        }
+    }
+
+    /// Format ToC as JSON.
+    pub fn format_json(&self, toc: &TocNode) -> Result<String, serde_json::Error> {
+        serde_json::to_string_pretty(toc)
+    }
+}
+
+impl Default for TocView {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// A flat ToC entry.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TocEntry {
+    /// Node title.
+    pub title: String,
+    /// Node ID.
+    pub node_id: Option<String>,
+    /// Depth in tree.
+    pub depth: usize,
+    /// Page range.
+    pub page_range: Option<(usize, usize)>,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_toc_node_creation() {
+        let mut root = TocNode::new("Root", 0);
+        let child = TocNode::new("Child", 1)
+            .with_node_id("node-1")
+            .with_summary("A child node");
+
+        root.add_child(child);
+
+        assert_eq!(root.count_nodes(), 2);
+        assert_eq!(root.count_leaves(), 1);
+        assert_eq!(root.max_depth(), 1);
+    }
+
+    #[test]
+    fn test_toc_config() {
+        let config = TocConfig::new().with_max_depth(3).with_summaries(false);
+
+        assert_eq!(config.max_depth, Some(3));
+        assert!(!config.include_summaries);
+    }
+}
diff --git a/vectorless-core/vectorless-document/src/tree.rs b/vectorless-core/vectorless-document/src/tree.rs
new file mode 100644
index 00000000..1659471b
--- /dev/null
+++ b/vectorless-core/vectorless-document/src/tree.rs
@@ -0,0 +1,883 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document tree using arena-based allocation.
+//!
+//! This structure provides better memory locality and simpler
+//! lifetime management compared to `Rc<RefCell<Node>>`.
+
+use std::collections::HashMap;
+
+use indextree::Arena;
+use serde::{Deserialize, Serialize};
+
+use super::node::{NodeId, TreeNode};
+use super::structure::{DocumentStructure, StructureNode};
+
+/// Pre-computed index for efficient retrieval operations.
+///
+/// Built once after the document tree is fully constructed.
+/// Provides O(1) access to commonly needed traversal data.
+#[derive(Debug, Clone)]
+pub struct RetrievalIndex {
+    /// All leaf nodes in the tree.
+    leaves: Vec<NodeId>,
+
+    /// Nodes grouped by depth level.
+    /// level_index[0] = root, level_index[1] = level 1 nodes, etc.
+    level_index: Vec<Vec<NodeId>>,
+
+    /// Path from root to each node (inclusive).
+    path_cache: HashMap<NodeId, Vec<NodeId>>,
+
+    /// Siblings for each node (excluding self).
+    siblings_cache: HashMap<NodeId, Vec<NodeId>>,
+
+    /// Structure string to NodeId mapping.
+    /// e.g., "1.2.3" -> NodeId
+    structure_index: HashMap<String, NodeId>,
+
+    /// Page number to NodeId mapping.
+    /// Maps each page to the most specific (deepest) node containing it.
+    page_index: HashMap<usize, NodeId>,
+
+    /// NodeId to page range mapping.
+    node_page_range: HashMap<NodeId, (usize, usize)>,
+
+    /// Total node count.
+    node_count: usize,
+
+    /// Maximum depth in the tree.
+    max_depth: usize,
+}
+
+impl RetrievalIndex {
+    /// Get all leaf nodes.
+    pub fn leaves(&self) -> &[NodeId] {
+        &self.leaves
+    }
+
+    /// Get nodes at a specific depth level.
+    ///
+    /// Returns None if the level doesn't exist.
+    pub fn level(&self, depth: usize) -> Option<&[NodeId]> {
+        self.level_index.get(depth).map(|v| v.as_slice())
+    }
+
+    /// Get all levels.
+    pub fn levels(&self) -> &[Vec<NodeId>] {
+        &self.level_index
+    }
+
+    /// Get the path from root to a node (inclusive).
+    ///
+    /// Returns None if the node is not in the index.
+    pub fn path_to(&self, node: NodeId) -> Option<&[NodeId]> {
+        self.path_cache.get(&node).map(|v| v.as_slice())
+    }
+
+    /// Get siblings of a node (excluding the node itself).
+    ///
+    /// Returns None if the node is not in the index or has no siblings.
+    pub fn siblings(&self, node: NodeId) -> Option<&[NodeId]> {
+        self.siblings_cache.get(&node).map(|v| v.as_slice())
+    }
+
+    /// Find a node by its structure index.
+    ///
+    /// # Example
+    /// ```ignore
+    /// // Find section 2.1.3
+    /// let node = index.find_by_structure("2.1.3");
+    /// ```
+    pub fn find_by_structure(&self, structure: &str) -> Option<NodeId> {
+        self.structure_index.get(structure).copied()
+    }
+
+    /// Find the most specific node containing a page number.
+    ///
+    /// Returns the deepest node whose page range contains the given page.
+    pub fn find_by_page(&self, page: usize) -> Option<NodeId> {
+        self.page_index.get(&page).copied()
+    }
+
+    /// Find all nodes whose page range overlaps with the given range.
+    ///
+    /// This is useful for retrieving all content that spans a range of pages.
+    ///
+    /// # Example
+    /// ```ignore
+    /// // Find all nodes covering pages 10-15
+    /// let nodes = index.find_nodes_by_page_range(10, 15);
+    /// ```
+    pub fn find_nodes_by_page_range(&self, start: usize, end: usize) -> Vec<NodeId> {
+        let mut result = Vec::new();
+        for (&node_id, &(node_start, node_end)) in &self.node_page_range {
+            // Ranges overlap iff node_start <= end && start <= node_end
+            if node_start <= end && start <= node_end {
+                result.push(node_id);
+            }
+        }
+        // Sort by start page for consistent ordering
+        result.sort_by_key(|&id| self.node_page_range.get(&id).map(|(s, _)| *s).unwrap_or(0));
+        result
+    }
+
+    /// Get all page numbers covered by a node.
+    ///
+    /// Returns None if the node has no page information.
+    pub fn get_pages_for_node(&self, node: NodeId) -> Option<Vec<usize>> {
+        let (start, end) = self.node_page_range.get(&node)?;
+        Some((*start..=*end).collect())
+    }
+
+    /// Get the page range for a node.
+    pub fn page_range(&self, node: NodeId) -> Option<(usize, usize)> {
+        self.node_page_range.get(&node).copied()
+    }
+
+    /// Get all nodes that are leaves within a page range.
+    ///
+    /// This returns only leaf nodes (nodes with no children) that
+    /// overlap with the given page range.
+    pub fn find_leaves_by_page_range(&self, start: usize, end: usize) -> Vec<NodeId> {
+        let leaves_set: std::collections::HashSet<NodeId> = self.leaves.iter().copied().collect();
+        self.find_nodes_by_page_range(start, end)
+            .into_iter()
+            .filter(|id| leaves_set.contains(id))
+            .collect()
+    }
+
+    /// Get the total number of pages in the document.
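+    ///
+    /// Computed as the maximum end page across all indexed page ranges;
+    /// returns 0 when the document carries no page information.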
+    pub fn total_pages(&self) -> usize {
+        self.node_page_range
+            .values()
+            .map(|(_, end)| *end)
+            .max()
+            .unwrap_or(0)
+    }
+
+    /// Get all structure indices.
+    pub fn structures(&self) -> &HashMap<String, NodeId> {
+        &self.structure_index
+    }
+
+    /// Get the total number of nodes.
+    pub fn node_count(&self) -> usize {
+        self.node_count
+    }
+
+    /// Get the maximum depth in the tree.
+    pub fn max_depth(&self) -> usize {
+        self.max_depth
+    }
+
+    /// Get the number of levels.
+    pub fn level_count(&self) -> usize {
+        self.level_index.len()
+    }
+}
+
+/// A hierarchical document tree structure.
+///
+/// Uses an arena-based tree representation for efficient traversal
+/// and node manipulation.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DocumentTree {
+    /// The underlying arena storing all nodes.
+    arena: Arena<TreeNode>,
+
+    /// The root node ID.
+    root_id: NodeId,
+
+    /// Cached leaf nodes (rebuilt on demand).
+    #[serde(skip)]
+    leaves_cache: Option<Vec<NodeId>>,
+}
+
+impl DocumentTree {
+    /// Create a new document tree with a root node.
+    pub fn new(title: &str, content: &str) -> Self {
+        let mut arena = Arena::new();
+        let root_data = TreeNode {
+            title: title.to_string(),
+            structure: String::new(), // Root has no structure index
+            content: content.to_string(),
+            ..TreeNode::default()
+        };
+        let root_id = arena.new_node(root_data);
+
+        // Root is initially a leaf
+        let leaves_cache = Some(vec![NodeId(root_id)]);
+
+        Self {
+            arena,
+            root_id: NodeId(root_id),
+            leaves_cache,
+        }
+    }
+
+    /// Create a document tree from an existing arena and root ID.
+    ///
+    /// This is useful for deserialization and testing.
+    pub fn from_raw(arena: Arena<TreeNode>, root_id: NodeId) -> Self {
+        Self {
+            arena,
+            root_id,
+            leaves_cache: None, // Will be rebuilt on demand
+        }
+    }
+
+    /// Get the root node ID.
+    pub fn root(&self) -> NodeId {
+        self.root_id
+    }
+
+    /// Get a reference to the underlying arena.
+    pub fn arena(&self) -> &Arena<TreeNode> {
+        &self.arena
+    }
+
+    /// Get a node by its ID.
+    ///
+    /// Returns None if the node doesn't exist.
+    pub fn get(&self, id: NodeId) -> Option<&TreeNode> {
+        self.arena.get(id.0).map(|n| n.get())
+    }
+
+    /// Get a mutable reference to a node by its ID.
+    ///
+    /// Returns None if the node doesn't exist.
+    pub fn get_mut(&mut self, id: NodeId) -> Option<&mut TreeNode> {
+        self.arena.get_mut(id.0).map(|n| n.get_mut())
+    }
+
+    /// Add a child node to the specified parent.
+    ///
+    /// Returns the ID of the newly created child node.
+    /// The structure index is calculated automatically from the siblings.
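+    ///
+    /// A minimal sketch of the resulting structure indices:
+    /// ```ignore
+    /// let mut tree = DocumentTree::new("Doc", "");
+    /// let s1 = tree.add_child(tree.root(), "Intro", "...");  // structure "1"
+    /// let s2 = tree.add_child(tree.root(), "Body", "...");   // structure "2"
+    /// let s21 = tree.add_child(s2, "Details", "...");        // structure "2.1"
+    /// ```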
+    pub fn add_child(&mut self, parent: NodeId, title: &str, content: &str) -> NodeId {
+        let parent_depth = self.arena.get(parent.0).map(|n| n.get().depth).unwrap_or(0);
+        let parent_structure = self
+            .arena
+            .get(parent.0)
+            .map(|n| n.get().structure.clone())
+            .unwrap_or_default();
+
+        // Calculate child index (1-based)
+        let child_index = parent.0.children(&self.arena).count() + 1;
+
+        // Calculate structure: parent_structure.child_index
+        let child_structure = if parent_structure.is_empty() {
+            child_index.to_string()
+        } else {
+            format!("{}.{}", parent_structure, child_index)
+        };
+
+        let child_data = TreeNode {
+            title: title.to_string(),
+            structure: child_structure,
+            content: content.to_string(),
+            summary: String::new(),
+            depth: parent_depth + 1,
+            start_index: 1,
+            end_index: 1,
+            start_page: None,
+            end_page: None,
+            node_id: None,
+            physical_index: None,
+            token_count: None,
+            references: Vec::new(),
+            routing_keywords: Vec::new(),
+            question_hints: Vec::new(),
+        };
+        let child_id = self.arena.new_node(child_data);
+        parent.0.append(child_id, &mut self.arena);
+
+        // Update leaves cache
+        if let Some(ref mut cache) = self.leaves_cache {
+            // Remove parent from leaves (it's no longer a leaf)
+            cache.retain(|&id| id != parent);
+            // Add child to leaves
+            cache.push(NodeId(child_id));
+        }
+
+        NodeId(child_id)
+    }
+
+    /// Add a child node with page boundaries.
+    ///
+    /// Returns the ID of the newly created child node.
+    pub fn add_child_with_pages(
+        &mut self,
+        parent: NodeId,
+        title: &str,
+        content: &str,
+        start_page: usize,
+        end_page: usize,
+    ) -> NodeId {
+        let child_id = self.add_child(parent, title, content);
+        if let Some(node) = self.get_mut(child_id) {
+            node.start_page = Some(start_page);
+            node.end_page = Some(end_page);
+        }
+        child_id
+    }
+
+    /// Check if a node is a leaf (has no children).
+    pub fn is_leaf(&self, id: NodeId) -> bool {
+        id.0.children(&self.arena).next().is_none()
+    }
+
+    /// Get the number of children of a node.
+    ///
+    /// This is more efficient than `children().len()` as it doesn't allocate.
+    pub fn child_count(&self, id: NodeId) -> usize {
+        id.0.children(&self.arena).count()
+    }
+
+    /// Get the children of a node as an iterator.
+    ///
+    /// Use this instead of `children()` when you only need to iterate,
+    /// as it avoids allocating a Vec.
+    pub fn children_iter(&self, id: NodeId) -> impl Iterator<Item = NodeId> + '_ {
+        id.0.children(&self.arena).map(NodeId)
+    }
+
+    /// Get the children of a node.
+    ///
+    /// Returns a Vec for cases where you need owned access to the children.
+    /// Consider using `children_iter()` if you only need to iterate.
+    pub fn children(&self, id: NodeId) -> Vec<NodeId> {
+        self.children_iter(id).collect()
+    }
+
+    /// Get the children of a node plus any resolved cross-reference targets.
+    ///
+    /// In addition to direct children, this collects `NodeId`s pointed to by
+    /// resolved references (`node.references[i].target_node`) on the given node.
+    /// Duplicate node IDs (e.g. a reference that happens to be a child) are
+    /// de-duplicated so the caller never sees the same node twice.
+    pub fn children_with_refs(&self, id: NodeId) -> Vec<NodeId> {
+        let mut result: Vec<NodeId> = self.children_iter(id).collect();
+        if let Some(node) = self.get(id) {
+            for r#ref in &node.references {
+                if let Some(target) = r#ref.target_node {
+                    if !result.contains(&target) {
+                        result.push(target);
+                    }
+                }
+            }
+        }
+        result
+    }
+
+    /// Get the parent of a node.
+    ///
+    /// Returns None if the node is the root or doesn't have a parent.
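+    ///
+    /// A small navigation sketch (hypothetical tree):
+    /// ```ignore
+    /// let mut tree = DocumentTree::new("Doc", "");
+    /// let ch1 = tree.add_child(tree.root(), "Chapter 1", "");
+    /// assert_eq!(tree.parent(ch1), Some(tree.root()));
+    /// assert_eq!(tree.parent(tree.root()), None);
+    /// ```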
+    pub fn parent(&self, id: NodeId) -> Option<NodeId> {
+        id.0.parent(&self.arena).map(NodeId)
+    }
+
+    /// Get the siblings of a node (excluding the node itself).
+    ///
+    /// Returns an empty iterator for the root node.
+    pub fn siblings_iter(&self, id: NodeId) -> impl Iterator<Item = NodeId> + '_ {
+        // indextree's sibling iterators start with the node itself; skip it.
+        id.0.preceding_siblings(&self.arena)
+            .skip(1)
+            .chain(id.0.following_siblings(&self.arena).skip(1))
+            .map(NodeId)
+    }
+
+    /// Get the ancestors of a node from parent to root.
+    ///
+    /// Returns an empty iterator for the root node.
+    pub fn ancestors_iter(&self, id: NodeId) -> impl Iterator<Item = NodeId> + '_ {
+        // indextree's `ancestors` starts with the node itself; skip it.
+        id.0.ancestors(&self.arena).skip(1).map(NodeId)
+    }
+
+    /// Get the path from root to a node (inclusive).
+    ///
+    /// Returns the path as a Vec starting from the root.
+    pub fn path_from_root(&self, id: NodeId) -> Vec<NodeId> {
+        let mut path: Vec<NodeId> = self.ancestors_iter(id).collect();
+        path.reverse();
+        path.push(id);
+        path
+    }
+
+    /// Get the depth of a node (root = 0).
+    pub fn depth(&self, id: NodeId) -> usize {
+        self.get(id).map(|n| n.depth).unwrap_or(0)
+    }
+
+    /// Get the maximum depth of any node in the tree (root = 0).
+    ///
+    /// Uses a single depth-first pass. Returns 0 for a single-node tree.
+    pub fn max_depth(&self) -> usize {
+        let mut max_d = 0;
+        let mut stack = vec![(self.root_id, 0usize)];
+        while let Some((id, d)) = stack.pop() {
+            max_d = max_d.max(d);
+            for child in self.children_iter(id) {
+                stack.push((child, d + 1));
+            }
+        }
+        max_d
+    }
+
+    /// Get the first child of a node.
+    ///
+    /// Returns None if the node has no children.
+    pub fn first_child(&self, id: NodeId) -> Option<NodeId> {
+        self.children_iter(id).next()
+    }
+
+    /// Get the last child of a node.
+    ///
+    /// Returns None if the node has no children.
+    pub fn last_child(&self, id: NodeId) -> Option<NodeId> {
+        self.children_iter(id).last()
+    }
+
+    /// Get all leaf nodes in the tree.
+    ///
+    /// Uses cached leaves if available, otherwise rebuilds the cache.
+    pub fn leaves(&self) -> Vec<NodeId> {
+        if let Some(ref cache) = self.leaves_cache {
+            return cache.clone();
+        }
+
+        // Rebuild cache on demand
+        let leaves: Vec<NodeId> = self
+            .traverse()
+            .into_iter()
+            .filter(|id| self.is_leaf(*id))
+            .collect();
+
+        // Note: Can't mutate self here, caller should use rebuild_leaves_cache()
+        leaves
+    }
+
+    /// Rebuild the leaves cache.
+    ///
+    /// Call this after deserialization or batch modifications.
+    pub fn rebuild_leaves_cache(&mut self) {
+        self.leaves_cache = Some(
+            self.traverse()
+                .into_iter()
+                .filter(|id| self.is_leaf(*id))
+                .collect(),
+        );
+    }
+
+    /// Invalidate the leaves cache.
+    ///
+    /// Called automatically by mutation methods.
+    pub fn invalidate_leaves_cache(&mut self) {
+        self.leaves_cache = None;
+    }
+
+    /// Get all nodes in the tree (depth-first order).
+    pub fn traverse(&self) -> Vec<NodeId> {
+        let mut result = Vec::new();
+        let mut stack = vec![self.root_id];
+
+        while let Some(id) = stack.pop() {
+            result.push(id);
+            // Add children in reverse order for correct DFS order
+            let mut children = self.children(id);
+            children.reverse();
+            stack.extend(children);
+        }
+
+        result
+    }
+
+    /// Get the number of nodes in the tree.
+    pub fn node_count(&self) -> usize {
+        self.arena.count()
+    }
+
+    /// Update a node's summary.
+    pub fn set_summary(&mut self, id: NodeId, summary: &str) {
+        if let Some(node) = self.get_mut(id) {
+            node.summary = summary.to_string();
+        }
+    }
+
+    /// Update a node's content.
+    pub fn set_content(&mut self, id: NodeId, content: &str) {
+        if let Some(node) = self.get_mut(id) {
+            node.content = content.to_string();
+        }
+    }
+
+    /// Update a node's structure index.
+    pub fn set_structure(&mut self, id: NodeId, structure: &str) {
+        if let Some(node) = self.get_mut(id) {
+            node.structure = structure.to_string();
+        }
+    }
+
+    /// Set page boundaries for a node.
+    pub fn set_page_boundaries(&mut self, id: NodeId, start: usize, end: usize) {
+        if let Some(node) = self.get_mut(id) {
+            node.start_page = Some(start);
+            node.end_page = Some(end);
+        }
+    }
+
+    /// Set line indices for a node.
+    pub fn set_line_indices(&mut self, id: NodeId, start: usize, end: usize) {
+        if let Some(node) = self.get_mut(id) {
+            node.start_index = start;
+            node.end_index = end;
+        }
+    }
+
+    /// Get page range for a node.
+    pub fn page_range(&self, id: NodeId) -> Option<(usize, usize)> {
+        let node = self.get(id)?;
+        match (node.start_page, node.end_page) {
+            (Some(start), Some(end)) => Some((start, end)),
+            _ => None,
+        }
+    }
+
+    /// Check if a node contains a specific page.
+    pub fn contains_page(&self, id: NodeId, page: usize) -> bool {
+        if let Some((start, end)) = self.page_range(id) {
+            page >= start && page <= end
+        } else {
+            false
+        }
+    }
+
+    /// Find a node by its structure index.
+    ///
+    /// This is a convenience method that builds an index if needed.
+    /// For repeated queries, build a RetrievalIndex once.
+    pub fn find_by_structure(&self, structure: &str) -> Option<NodeId> {
+        // Linear search - for repeated use, build RetrievalIndex
+        for node_id in self.traverse() {
+            if let Some(node) = self.get(node_id) {
+                if node.structure == structure {
+                    return Some(node_id);
+                }
+            }
+        }
+        None
+    }
+
+    /// Find the most specific node containing a page.
+    ///
+    /// This is a convenience method that builds an index if needed.
+    /// For repeated queries, build a RetrievalIndex once.
+    pub fn find_by_page(&self, page: usize) -> Option<NodeId> {
+        let mut best_match: Option<(NodeId, usize)> = None;
+
+        // Find the deepest node containing this page
+        for node_id in self.traverse() {
+            if let Some((start, end)) = self.page_range(node_id) {
+                if page >= start && page <= end {
+                    let depth = self.get(node_id).map(|n| n.depth).unwrap_or(0);
+                    match &best_match {
+                        None => best_match = Some((node_id, depth)),
+                        Some((_, best_depth)) if depth > *best_depth => {
+                            best_match = Some((node_id, depth));
+                        }
+                        _ => {}
+                    }
+                }
+            }
+        }
+
+        best_match.map(|(id, _)| id)
+    }
+
+    /// Get all nodes whose page range overlaps with the given range.
+    pub fn find_nodes_by_page_range(&self, start: usize, end: usize) -> Vec<NodeId> {
+        self.traverse()
+            .into_iter()
+            .filter(|&id| {
+                if let Some((node_start, node_end)) = self.page_range(id) {
+                    node_start <= end && start <= node_end
+                } else {
+                    false
+                }
+            })
+            .collect()
+    }
+
+    /// Set the node ID (identifier string).
+    pub fn set_node_id(&mut self, id: NodeId, node_id: &str) {
+        if let Some(node) = self.get_mut(id) {
+            node.node_id = Some(node_id.to_string());
+        }
+    }
+
+    /// Set the physical index marker.
+    pub fn set_physical_index(&mut self, id: NodeId, index: &str) {
+        if let Some(node) = self.get_mut(id) {
+            node.physical_index = Some(index.to_string());
+        }
+    }
+
+    /// Update token count for a node.
+    pub fn set_token_count(&mut self, id: NodeId, count: usize) {
+        if let Some(node) = self.get_mut(id) {
+            node.token_count = Some(count);
+        }
+    }
+
+    /// Set the references for a node.
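+    ///
+    /// A minimal sketch of wiring a resolved cross-reference (`target` is a
+    /// previously obtained NodeId; other values are illustrative):
+    /// ```ignore
+    /// let r = NodeReference::resolved(
+    ///     "see Section 1".to_string(), "1".to_string(), RefType::Section, 5, target, 0.8,
+    /// );
+    /// tree.set_references(node, vec![r]);
+    /// // `children_with_refs(node)` now also yields `target`.
+    /// ```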
+    pub fn set_references(&mut self, id: NodeId, references: Vec<NodeReference>) {
+        if let Some(node) = self.get_mut(id) {
+            node.references = references;
+        }
+    }
+
+    /// Export the tree structure to JSON format.
+    pub fn to_structure_json(&self, doc_name: &str) -> DocumentStructure {
+        let structure = self.build_structure_nodes(self.root_id);
+        DocumentStructure {
+            doc_name: doc_name.to_string(),
+            structure,
+        }
+    }
+
+    /// Build a retrieval index for efficient operations.
+    ///
+    /// This should be called once after the tree is fully constructed.
+    /// The index provides O(1) access to commonly needed traversal data.
+    ///
+    /// # Example
+    ///
+    /// ```ignore
+    /// let tree = /* build tree */;
+    /// let index = tree.build_retrieval_index();
+    ///
+    /// // Fast access to leaves
+    /// for leaf in index.leaves() {
+    ///     // process leaf
+    /// }
+    ///
+    /// // Fast path lookup
+    /// if let Some(path) = index.path_to(node_id) {
+    ///     // path[0] = root, path.last() = node_id
+    /// }
+    ///
+    /// // Fast structure lookup
+    /// if let Some(node) = index.find_by_structure("2.1.3") {
+    ///     // Found section 2.1.3
+    /// }
+    ///
+    /// // Fast page lookup
+    /// if let Some(node) = index.find_by_page(42) {
+    ///     // Found node containing page 42
+    /// }
+    /// ```
+    pub fn build_retrieval_index(&self) -> RetrievalIndex {
+        let mut leaves = Vec::new();
+        let mut level_index: Vec<Vec<NodeId>> = Vec::new();
+        let mut path_cache: HashMap<NodeId, Vec<NodeId>> = HashMap::new();
+        let mut siblings_cache: HashMap<NodeId, Vec<NodeId>> = HashMap::new();
+        let mut structure_index: HashMap<String, NodeId> = HashMap::new();
+        let mut page_index: HashMap<usize, NodeId> = HashMap::new();
+        let mut node_page_range: HashMap<NodeId, (usize, usize)> = HashMap::new();
+        let mut max_depth = 0;
+        let node_count = self.node_count();
+
+        // BFS to build level index
+        let mut current_level = vec![self.root_id];
+
+        // Initialize root path
+        path_cache.insert(self.root_id, vec![self.root_id]);
+
+        while !current_level.is_empty() {
+            level_index.push(current_level.clone());
+
+            let mut next_level = Vec::new();
+
+            for &node_id in &current_level {
+                let children: Vec<NodeId> = self.children(node_id);
+
+                // Get node data
+                if let Some(node) = self.get(node_id) {
+                    max_depth = max_depth.max(node.depth);
+
+                    // Build structure index
+                    if !node.structure.is_empty() {
+                        structure_index.insert(node.structure.clone(), node_id);
+                    }
+
+                    // Build page index and page range
+                    if let (Some(start), Some(end)) = (node.start_page, node.end_page) {
+                        node_page_range.insert(node_id, (start, end));
+
+                        // Map each page to this node (will be overwritten by deeper nodes)
+                        for page in start..=end {
+                            page_index.insert(page, node_id);
+                        }
+                    }
+                }
+
+                // Check if leaf
+                if children.is_empty() {
+                    leaves.push(node_id);
+                }
+
+                // Build siblings cache for children
+                if children.len() > 1 {
+                    for (i, &child) in children.iter().enumerate() {
+                        let siblings: Vec<NodeId> = children
+                            .iter()
+                            .enumerate()
+                            .filter(|(j, _)| *j != i)
+                            .map(|(_, &c)| c)
+                            .collect();
+                        siblings_cache.insert(child, siblings);
+                    }
+                }
+
+                // Build path cache for children
+                if let Some(parent_path) = path_cache.get(&node_id).cloned() {
+                    for &child in &children {
+                        let mut child_path = parent_path.clone();
+                        child_path.push(child);
+                        path_cache.insert(child, child_path);
+                    }
+                }
+
+                next_level.extend(children);
+            }
+
+            current_level = next_level;
+        }
+
+        RetrievalIndex {
+            leaves,
+            level_index,
+            path_cache,
+            siblings_cache,
+            structure_index,
+            page_index,
+            node_page_range,
+            node_count,
+            max_depth,
+        }
+    }
+
+    /// Recursively build structure nodes starting from the given node.
+    fn build_structure_nodes(&self, node_id: NodeId) -> Vec<StructureNode> {
+        let children = self.children(node_id);
+        children
+            .into_iter()
+            .enumerate()
+            .map(|(idx, child_id)| self.node_to_structure(child_id, idx))
+            .collect()
+    }
+
+    /// Convert a single node to StructureNode format.
+    fn node_to_structure(&self, node_id: NodeId, idx: usize) -> StructureNode {
+        let node = self.get(node_id).cloned().unwrap_or_default();
+        let children = self.children(node_id);
+
+        StructureNode {
+            title: node.title,
+            node_id: node
+                .node_id
+                .clone()
+                .unwrap_or_else(|| format!("{:04}", idx)),
+            start_index: node.start_index,
+            end_index: node.end_index,
+            summary: if node.summary.is_empty() {
+                None
+            } else {
+                Some(node.summary)
+            },
+            nodes: children
+                .into_iter()
+                .enumerate()
+                .map(|(i, c)| self.node_to_structure(c, i))
+                .collect(),
+        }
+    }
+}
+
+impl Default for DocumentTree {
+    fn default() -> Self {
+        Self::new("Root", "")
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::document::reference::{NodeReference, RefType};
+
+    #[test]
+    fn test_children_with_refs_no_references() {
+        let mut tree = DocumentTree::new("Root", "root content");
+        let child1 = tree.add_child(tree.root(), "Section 1", "content 1");
+        let child2 = tree.add_child(tree.root(), "Section 2", "content 2");
+
+        let children = tree.children_with_refs(tree.root());
+        assert_eq!(children.len(), 2);
+        assert!(children.contains(&child1));
+        assert!(children.contains(&child2));
+    }
+
+    #[test]
+    fn test_children_with_refs_deduplicates() {
+        let mut tree = DocumentTree::new("Root", "root content");
+        let child = tree.add_child(tree.root(), "Section 1", "content 1");
+
+        // Add a reference that points to the same node as an existing child
+        let refs = vec![NodeReference::resolved(
+            "see Section 1".to_string(),
+            "1".to_string(),
+            RefType::Section,
+            5,
+            child,
+            0.8,
+        )];
+        tree.set_references(tree.root(), refs);
+
+        let children = tree.children_with_refs(tree.root());
+        // Should not duplicate
+        assert_eq!(children.len(), 1);
+        assert!(children.contains(&child));
+    }
+
+    #[test]
+    fn test_children_with_refs_unresolved_ignored() {
+        let mut tree = DocumentTree::new("Root", "root content");
+        let child = tree.add_child(tree.root(), "Section 1", "content 1");
+
+        // Add an unresolved reference (target_node = None)
+        let refs = vec![NodeReference::new(
+            "see Section 5".to_string(),
+            "5".to_string(),
+            RefType::Section,
+            5,
+        )];
+        tree.set_references(tree.root(), refs);
+
+        let children = tree.children_with_refs(tree.root());
+        // Unresolved reference should not be included
+        assert_eq!(children.len(), 1);
+        assert!(children.contains(&child));
+    }
+}
diff --git a/vectorless-core/vectorless-document/src/understanding.rs b/vectorless-core/vectorless-document/src/understanding.rs
new file mode 100644
index 00000000..4be8e29d
--- /dev/null
+++ b/vectorless-core/vectorless-document/src/understanding.rs
@@ -0,0 +1,306 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Understanding types — the core objects that define the Document Understanding Engine.
+//!
+//! These types form the stable public contract:
+//! - [`Document`] — the unified post-ingest artifact (internal first-class citizen)
+//! - [`DocumentInfo`] — what `ingest()` returns to users
+//! - [`Concept`] — key concept extracted from a document
+//! - [`Answer`] — what `ask()` returns
+//! - [`Evidence`] — proof trail for an answer
+//! - [`ReasoningTrace`] / [`TraceStep`] — always-mandatory reasoning trace
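+//!
+//! A sketch of how these types flow through the public API (the `engine`
+//! handle comes from the engine crate's builder; values are illustrative):
+//!
+//! ```ignore
+//! let info = engine.ingest(IngestInput::Path("./doc.md".into())).await?;
+//! let answer = engine.ask("What is this about?", &[info.doc_id.clone()]).await?;
+//! for step in &answer.trace.steps {
+//!     println!("[round {}] {} -> {}", step.round, step.action, step.observation);
+//! }
+//! ```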
+
+use serde::{Deserialize, Serialize};
+
+use super::toc::TocNode;
+
+// ---------------------------------------------------------------------------
+// Document — unified post-ingest artifact
+// ---------------------------------------------------------------------------
+
+/// An understood document — the core artifact of the understand phase.
+///
+/// This is what `ingest()` produces internally and what `ask()` consumes.
+/// It unifies tree + navigation index + reasoning index + summary + concepts
+/// into a single first-class type, replacing the previous loose coupling of
+/// `DocContext { &tree, &nav, &reasoning }`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Document {
+    /// Unique document identifier.
+    pub doc_id: String,
+    /// Document name/title.
+    pub name: String,
+    /// Document format ("pdf", "markdown", "docx").
+    pub format: String,
+    /// Source file path (if indexed from a file).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub source_path: Option<String>,
+
+    // ── Three indexes (engine internal) ──
+    /// Hierarchical semantic tree.
+    pub tree: super::tree::DocumentTree,
+    /// Pre-computed navigation structure.
+    pub nav_index: super::navigation::NavigationIndex,
+    /// Keyword / topic / section summaries.
+    pub reasoning_index: super::reasoning::ReasoningIndex,
+
+    // ── Understanding results (ingest stage output) ──
+    /// Document-level summary.
+    pub summary: String,
+    /// Key concepts the engine identified.
+    #[serde(default)]
+    pub concepts: Vec<Concept>,
+
+    // ── Metadata ──
+    /// Page count (for PDFs).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub page_count: Option<usize>,
+    /// Number of sections in the tree.
+    #[serde(default)]
+    pub section_count: usize,
+}
+
+// ---------------------------------------------------------------------------
+// DocumentInfo — what ingest() returns to users
+// ---------------------------------------------------------------------------
+
+/// The engine's understanding of a document — returned by `ingest()`.
+///
+/// Rich enough for users to confirm the engine "got it right":
+/// summary, structure (TOC), and key concepts.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DocumentInfo {
+    /// Unique document identifier.
+    pub doc_id: String,
+    /// Document name.
+    pub name: String,
+    /// Document format ("pdf", "markdown", "docx").
+    pub format: String,
+    /// Document-level summary — what this document is about.
+    pub summary: String,
+    /// Table of contents — the document's structure as the engine sees it.
+    pub structure: TocNode,
+    /// Key concepts the engine identified.
+    pub concepts: Vec<Concept>,
+    /// Number of sections in the document.
+    pub section_count: usize,
+    /// Page count (for PDFs).
+    pub page_count: Option<usize>,
+}
+
+impl Document {
+    /// Get node content by ID (Agent `cat` command).
+    pub fn cat(&self, node_id: super::node::NodeId) -> Option<&str> {
+        self.tree.get(node_id).map(|n| n.content.as_str())
+    }
+
+    /// Find nodes containing a keyword in title or content.
+    pub fn find(&self, keyword: &str) -> Vec<(super::node::NodeId, &str)> {
+        let kw = keyword.to_lowercase();
+        self.tree
+            .traverse()
+            .iter()
+            .filter_map(|&id| {
+                let node = self.tree.get(id)?;
+                if node.title.to_lowercase().contains(&kw)
+                    || node.content.to_lowercase().contains(&kw)
+                {
+                    Some((id, node.title.as_str()))
+                } else {
+                    None
+                }
+            })
+            .collect()
+    }
+
+    /// Get node title by ID.
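+    ///
+    /// A small exploration sketch (the keyword is illustrative):
+    /// ```ignore
+    /// for (id, title) in doc.find("derating") {
+    ///     println!("{} -> {:?}", title, doc.node_title(id));
+    ///     if let Some(text) = doc.cat(id) {
+    ///         // Inspect the section's original text.
+    ///     }
+    /// }
+    /// ```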
+    pub fn node_title(&self, node_id: super::node::NodeId) -> Option<&str> {
+        self.tree.get(node_id).map(|n| n.title.as_str())
+    }
+
+    /// Number of sections in the tree.
+    pub fn section_count(&self) -> usize {
+        self.section_count
+    }
+
+    /// Produce the public DocumentInfo view of this document.
+    pub fn info(&self) -> DocumentInfo {
+        let toc = super::toc::TocView::new().generate(&self.tree);
+        DocumentInfo {
+            doc_id: self.doc_id.clone(),
+            name: self.name.clone(),
+            format: self.format.clone(),
+            summary: self.summary.clone(),
+            structure: toc,
+            concepts: self.concepts.clone(),
+            section_count: self.section_count,
+            page_count: self.page_count,
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Concept
+// ---------------------------------------------------------------------------
+
+/// A key concept extracted from a document.
+///
+/// Produced during the ingest pipeline's final concept extraction step.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Concept {
+    /// Concept name (e.g., "capacitor derating").
+    pub name: String,
+    /// One-sentence explanation.
+    pub summary: String,
+    /// Which sections this concept appears in.
+    pub sections: Vec<String>,
+}
+
+// ---------------------------------------------------------------------------
+// Answer — what ask() returns
+// ---------------------------------------------------------------------------
+
+/// The result of `ask()` — a reasoned answer with evidence and trace.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Answer {
+    /// The answer content.
+    pub content: String,
+    /// Evidence supporting the answer.
+    pub evidence: Vec<Evidence>,
+    /// Confidence score (0.0–1.0).
+    pub confidence: f32,
+    /// Reasoning trace — how the agent arrived at this answer. Always present.
+    pub trace: ReasoningTrace,
+}
+
+// ---------------------------------------------------------------------------
+// Evidence
+// ---------------------------------------------------------------------------
+
+/// A piece of evidence supporting an answer — with source attribution.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Evidence {
+    /// Original document text.
+    pub content: String,
+    /// Navigation path (e.g., "Root/Chapter 3/Section 3.2").
+    pub source_path: String,
+    /// Which document this evidence came from.
+    pub doc_name: String,
+    /// Relevance to the question (0.0–1.0).
+    pub relevance: f32,
+}
+
+// ---------------------------------------------------------------------------
+// ReasoningTrace — always mandatory
+// ---------------------------------------------------------------------------
+
+/// Reasoning trace — how the agent arrived at the answer. Always present.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ReasoningTrace {
+    /// The steps the agent took.
+    pub steps: Vec<TraceStep>,
+}
+
+impl ReasoningTrace {
+    /// Create an empty trace.
+    pub fn empty() -> Self {
+        Self { steps: Vec::new() }
+    }
+
+    /// Create a trace with a single step.
+    pub fn single(action: impl Into<String>, observation: impl Into<String>, round: u32) -> Self {
+        Self {
+            steps: vec![TraceStep {
+                action: action.into(),
+                observation: observation.into(),
+                round,
+            }],
+        }
+    }
+
+    /// Add a step to the trace.
+    pub fn push(&mut self, action: impl Into<String>, observation: impl Into<String>, round: u32) {
+        self.steps.push(TraceStep {
+            action: action.into(),
+            observation: observation.into(),
+            round,
+        });
+    }
+}
+
+/// A single step in the reasoning trace.
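+///
+/// A minimal sketch of how steps accumulate (values are illustrative):
+/// ```ignore
+/// let mut trace = ReasoningTrace::empty();
+/// trace.push("ls", "Root with 3 children", 0);
+/// trace.push("cd Chapter 2", "Found target section", 1);
+/// assert_eq!(trace.steps[1].round, 1);
+/// ```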
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TraceStep {
+    /// What the agent did (e.g., "cd Chapter 3").
+    pub action: String,
+    /// What the agent observed (e.g., "Found 5 sections about...").
+    pub observation: String,
+    /// Which round this step was in.
+    pub round: u32,
+}
+
+// ---------------------------------------------------------------------------
+// IngestInput — what ingest() takes
+// ---------------------------------------------------------------------------
+
+/// Input to `ingest()` — the document to be understood.
+#[derive(Debug, Clone)]
+pub enum IngestInput {
+    /// Ingest from a file path.
+    Path(std::path::PathBuf),
+    /// Ingest from raw bytes.
+    Bytes {
+        /// Document name.
+        name: String,
+        /// Raw document bytes.
+        data: Vec<u8>,
+        /// Document format.
+        format: super::format::DocumentFormat,
+    },
+    /// Ingest from a text string.
+    Text {
+        /// Document name.
+        name: String,
+        /// Document content.
+        content: String,
+    },
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_reasoning_trace_empty() {
+        let trace = ReasoningTrace::empty();
+        assert!(trace.steps.is_empty());
+    }
+
+    #[test]
+    fn test_reasoning_trace_single() {
+        let trace = ReasoningTrace::single("cd Chapter 3", "Found 5 sections", 1);
+        assert_eq!(trace.steps.len(), 1);
+        assert_eq!(trace.steps[0].action, "cd Chapter 3");
+        assert_eq!(trace.steps[0].round, 1);
+    }
+
+    #[test]
+    fn test_reasoning_trace_push() {
+        let mut trace = ReasoningTrace::empty();
+        trace.push("ls", "Root with 3 children", 0);
+        trace.push("cd Chapter 2", "Found target section", 1);
+        assert_eq!(trace.steps.len(), 2);
+    }
+
+    #[test]
+    fn test_concept_serialization() {
+        let concept = Concept {
+            name: "capacitor derating".into(),
+            summary: "Reducing capacitor specs for reliability".into(),
+            sections: vec!["Section 3.2".into()],
+        };
+        let json = serde_json::to_string(&concept).unwrap();
+        assert!(json.contains("capacitor derating"));
+    }
+}
diff --git a/vectorless-core/vectorless-engine/Cargo.toml b/vectorless-core/vectorless-engine/Cargo.toml
new file mode 100644
index 00000000..b5d91b07
--- /dev/null
+++ b/vectorless-core/vectorless-engine/Cargo.toml
@@ -0,0 +1,34 @@
+[package]
+name = "vectorless-engine"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+
+[dependencies]
+vectorless-agent = { path = "../vectorless-agent" }
+vectorless-document = { path = "../vectorless-document" }
+vectorless-error = { path = "../vectorless-error" }
+vectorless-events = { path = "../vectorless-events" }
+vectorless-graph = { path = "../vectorless-graph" }
+vectorless-index = { path = "../vectorless-index" }
+vectorless-llm = { path = "../vectorless-llm" }
+vectorless-metrics = { path = "../vectorless-metrics" }
+vectorless-retrieval = { path = "../vectorless-retrieval" }
+vectorless-rerank = { path = "../vectorless-rerank" }
+vectorless-storage = { path = "../vectorless-storage" }
+tokio = { workspace = true }
+tracing = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+uuid = { workspace = true }
+chrono = { workspace = true }
+thiserror = { workspace = true }
+parking_lot = { workspace = true }
+async-trait = { workspace = true }
+futures = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/vectorless-core/vectorless-engine/src/builder.rs b/vectorless-core/vectorless-engine/src/builder.rs
new file mode 100644
index 00000000..d32550f5
--- /dev/null
+++ b/vectorless-core/vectorless-engine/src/builder.rs
@@ -0,0 +1,268 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Builder pattern for creating Engine clients.
+//!
+//! This module provides [`EngineBuilder`] for configuring and building
+//! [`Engine`] instances with sensible defaults.
+
+use crate::{
+    client::engine::Engine, client::retriever::RetrieverClient, config::Config,
+    events::EventEmitter, metrics::MetricsHub, storage::Workspace,
+};
+
+/// Builder for creating an [`Engine`] client.
+///
+/// `api_key`, `model` and `endpoint` are **required** for simple usage.
+/// Advanced users can provide a pre-built [`Config`] via [`with_config`](EngineBuilder::with_config).
+///
+/// # Example (simple)
+///
+/// ```rust,no_run
+/// use vectorless::client::EngineBuilder;
+///
+/// #[tokio::main]
+/// async fn main() -> Result<(), vectorless::BuildError> {
+///     let client = EngineBuilder::new()
+///         .with_key("sk-...")
+///         .with_model("gpt-4o")
+///         .with_endpoint("https://api.xxx.com/v1")
+///         .build()
+///         .await?;
+///     Ok(())
+/// }
+/// ```
+///
+/// # Example (advanced)
+///
+/// ```rust,ignore
+/// use vectorless::client::EngineBuilder;
+/// use vectorless::config::{Config, LlmConfig, SlotConfig};
+///
+/// let config = Config::new().with_llm(
+///     LlmConfig::new("gpt-4o")
+///         .with_api_key("sk-...")
+///         .with_endpoint("https://api.openai.com/v1")
+///         .with_index(SlotConfig::fast().with_model("gpt-4o-mini"))
+/// );
+///
+/// let engine = EngineBuilder::new()
+///     .with_config(config)
+///     .build()
+///     .await?;
+/// ```
+#[derive(Debug)]
+pub struct EngineBuilder {
+    /// Custom configuration for advanced tuning.
+    config: Option<Config>,
+
+    /// Event emitter.
+    events: Option<EventEmitter>,
+
+    /// LLM API key (override).
+    api_key: Option<String>,
+
+    /// LLM model name (override).
+    model: Option<String>,
+
+    /// LLM endpoint URL (override).
+    endpoint: Option<String>,
+}
+
+impl EngineBuilder {
+    /// Create a new builder with defaults.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            config: None,
+            events: None,
+            api_key: None,
+            model: None,
+            endpoint: None,
+        }
+    }
+
+    // ============================================================
+    // Configuration
+    // ============================================================
+
+    /// Set a custom configuration.
+    ///
+    /// When provided, this replaces the default [`Config`] entirely.
+    /// Builder methods (`with_key`, `with_model`, `with_endpoint`)
+    /// will still override the corresponding fields on top of this config.
+    #[must_use]
+    pub fn with_config(mut self, config: Config) -> Self {
+        self.config = Some(config);
+        self
+    }
+
+    /// Set the event emitter for callbacks.
+    #[must_use]
+    pub fn with_events(mut self, events: EventEmitter) -> Self {
+        self.events = Some(events);
+        self
+    }
+
+    // ============================================================
+    // LLM Configuration (simple overrides)
+    // ============================================================
+
+    /// Set the LLM API key. **Required** (unless provided via Config).
+    #[must_use]
+    pub fn with_key(mut self, key: impl Into<String>) -> Self {
+        self.api_key = Some(key.into());
+        self
+    }
+
+    /// Set the LLM model name.
+    #[must_use]
+    pub fn with_model(mut self, model: impl Into<String>) -> Self {
+        self.model = Some(model.into());
+        self
+    }
+
+    /// Set a custom LLM endpoint URL.
+    #[must_use]
+    pub fn with_endpoint(mut self, url: impl Into<String>) -> Self {
+        self.endpoint = Some(url.into());
+        self
+    }
+
+    // ============================================================
+    // Build
+    // ============================================================
+
+    /// Build the Engine client.
+    ///
+    /// # Errors
+    ///
+    /// Returns a [`BuildError`] if:
+    /// - Workspace creation fails
+    /// - Required `api_key`, `model`, or `endpoint` is missing
+    ///
+    /// # Example
+    ///
+    /// ```rust,no_run
+    /// use vectorless::client::EngineBuilder;
+    ///
+    /// # #[tokio::main]
+    /// # async fn main() -> Result<(), vectorless::BuildError> {
+    /// let engine = EngineBuilder::new()
+    ///     .with_key("sk-...")
+    ///     .with_model("gpt-4o")
+    ///     .with_endpoint("https://api.openai.com/v1")
+    ///     .build()
+    ///     .await?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn build(self) -> Result<Engine, BuildError> {
+        // Load user-provided or default configuration
+        let mut config = self.config.unwrap_or_default();
+
+        // Apply simple overrides — write once, no dual-writing
+        if let Some(api_key) = self.api_key {
+            config.llm.api_key = Some(api_key);
+        }
+        if let Some(model) = self.model {
+            config.llm.model = model;
+        }
+        if let Some(endpoint) = self.endpoint {
+            config.llm.endpoint = Some(endpoint);
+        }
+
+        // Validate required settings
+        if config.llm.api_key.is_none() {
+            return Err(BuildError::MissingApiKey);
+        }
+        if config.llm.model.is_empty() {
+            return Err(BuildError::MissingModel);
+        }
+        if config.llm.endpoint.is_none() {
+            return Err(BuildError::MissingEndpoint);
+        }
+
+        // Open workspace from config
+        let workspace = Workspace::new(&config.storage.workspace_dir)
+            .await
+            .map_err(|e| BuildError::Workspace(e.to_string()))?;
+
+        // Build LlmPool from unified LlmConfig (shared metrics hub)
+        let metrics_hub = std::sync::Arc::new(MetricsHub::with_defaults());
+        let pool = vectorless_llm::LlmPool::from_config(&config.llm, Some(metrics_hub.clone()));
+
+        // Indexer uses pool.index()
+        let indexer = crate::client::indexer::IndexerClient::with_llm(pool.index().clone());
+
+        // Retriever uses pool.retrieval() via agent system
+        let retriever = RetrieverClient::new(pool.retrieval().clone());
+
+        // Build engine
+        let events = self.events.unwrap_or_default();
+        Engine::with_components(config, workspace, retriever, indexer, events, metrics_hub)
+            .await
+            .map_err(|e| BuildError::Other(e.to_string()))
+    }
+}
+
+impl Default for EngineBuilder {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Error during client build.
+#[derive(Debug, thiserror::Error)]
+pub enum BuildError {
+    /// Workspace error.
+    #[error("Workspace error: {0}")]
+    Workspace(String),
+
+    /// Missing API key.
+    #[error("Missing API key: call .with_key(\"sk-...\") or set api_key in config")]
+    MissingApiKey,
+
+    /// Missing model name.
+    #[error("Missing model: call .with_model(\"gpt-4o\") or set model in config")]
+    MissingModel,
+
+    /// Missing endpoint URL.
+    #[error(
+        "Missing endpoint: call .with_endpoint(\"https://api.xxx.com/v1\") or set endpoint in config"
+    )]
+    MissingEndpoint,
+
+    /// Other error.
+    #[error("{0}")]
+    Other(String),
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_builder_with_key() {
+        let builder = EngineBuilder::new().with_key("sk-test-key");
+
+        assert_eq!(builder.api_key, Some("sk-test-key".to_string()));
+    }
+
+    #[test]
+    fn test_builder_with_model() {
+        let builder = EngineBuilder::new().with_model("gpt-4o-mini");
+
+        assert_eq!(builder.model, Some("gpt-4o-mini".to_string()));
+    }
+
+    #[test]
+    fn test_builder_with_key_and_model() {
+        let builder = EngineBuilder::new()
+            .with_model("gpt-4o-mini")
+            .with_key("sk-test");
+
+        assert_eq!(builder.model, Some("gpt-4o-mini".to_string()));
+        assert_eq!(builder.api_key, Some("sk-test".to_string()));
+    }
+}
diff --git a/vectorless-core/vectorless-engine/src/engine.rs b/vectorless-core/vectorless-engine/src/engine.rs
new file mode 100644
index 00000000..0507c82e
--- /dev/null
+++ b/vectorless-core/vectorless-engine/src/engine.rs
@@ -0,0 +1,923 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Main Engine client - the entry point for vectorless.
+//!
+//! The Engine provides a unified API for the Document Understanding Engine:
+//!
+//! - [`ingest`](Engine::ingest) — Understand a document (parse, analyze, persist)
+//! - [`ask`](Engine::ask) — Ask a question (returns answer + evidence + trace)
+//! - [`forget`](Engine::forget) — Remove a document
+//! - [`list_documents`](Engine::list_documents) — List all understood documents
+//!
+//! # Example
+//!
+//! ```rust,no_run
+//! use vectorless::{EngineBuilder, IngestInput};
+//!
+//! # #[tokio::main]
+//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
+//! let engine = EngineBuilder::new()
+//!     .with_key("sk-...")
+//!     .with_model("gpt-4o")
+//!     .with_endpoint("https://api.openai.com/v1")
+//!     .build()
+//!     .await?;
+//!
+//! // Understand a document
+//! let doc = engine.ingest(IngestInput::Path("./document.md".into())).await?;
+//! println!("{}: {}", doc.name, doc.summary);
+//!
+//! // Ask a question
+//! let answer = engine.ask("What is this?", &[doc.doc_id.clone()]).await?;
+//! println!("{}", answer.content);
+//!
+//! // List all understood documents
+//! let docs = engine.list_documents().await?;
+//! for d in &docs {
+//!     println!("{}: {}", d.name, d.summary);
+//! }
+//!
+//! // Forget a document
+//! engine.forget(&doc.doc_id).await?;
+//! # Ok(())
+//! # }
+//! ```
+
+use std::{collections::HashMap, sync::Arc};
+
+use futures::StreamExt;
+use tracing::{info, warn};
+
+use crate::{
+    Answer, Document as UnderstandingDocument, DocumentTree, Error, Evidence, IngestInput,
+    ReasoningTrace,
+    config::Config,
+    error::Result,
+    events::EventEmitter,
+    index::{
+        PipelineOptions,
+        incremental::{self, IndexAction},
+    },
+    metrics::MetricsHub,
+    storage::{PersistedDocument, Workspace},
+};
+
+use super::{
+    index_context::{IndexContext, IndexSource},
+    indexer::IndexerClient,
+    retriever::RetrieverClient,
+    types::{FailedItem, IndexItem, IndexMode, IndexResult},
+    workspace::WorkspaceClient,
+};
+
+/// The main Engine client.
+///
+/// Provides high-level operations for document indexing and retrieval.
+/// Uses interior mutability to allow sharing across async tasks.
+///
+/// # Cloning
+///
+/// Cloning is cheap - it only increments reference counts (`Arc`). All clones
+/// share the same underlying resources.
+///
+/// # Thread Safety
+///
+/// The client is `Clone + Send + Sync` and can be safely shared across threads.
+pub struct Engine {
+    /// Configuration (immutable, shared).
+    config: Arc<Config>,
+
+    /// Indexer client for document indexing.
+    indexer: IndexerClient,
+
+    /// Retriever client for queries.
+    retriever: RetrieverClient,
+
+    /// Workspace client for persistence.
+    workspace: WorkspaceClient,
+
+    /// Central metrics hub for unified collection.
+    metrics_hub: Arc<MetricsHub>,
+}
+
+impl Engine {
+    // ============================================================
+    // Constructor (for Builder)
+    // ============================================================
+
+    /// Create a new client with the given components.
+    pub(crate) async fn with_components(
+        config: Config,
+        workspace: Workspace,
+        retriever: RetrieverClient,
+        indexer: IndexerClient,
+        events: EventEmitter,
+        metrics_hub: Arc<MetricsHub>,
+    ) -> Result<Self> {
+        let config = Arc::new(config);
+
+        // Attach event emitter to indexer
+        let indexer = indexer.with_events(events.clone());
+
+        // Attach event emitter to retriever
+        let retriever = retriever.with_events(events.clone());
+
+        // Create workspace client
+        let workspace_client = WorkspaceClient::new(workspace)
+            .await
+            .with_events(events.clone());
+
+        Ok(Self {
+            config,
+            indexer,
+            retriever,
+            workspace: workspace_client,
+            metrics_hub,
+        })
+    }
+
+    // ============================================================
+    // Ingest Pipeline (private — called by ingest())
+    // ============================================================
+
+    /// Run the ingest pipeline: parse, compile, persist.
+    ///
+    /// Accepts an [`IndexContext`] that specifies the source and options.
+    /// Multiple sources are processed in parallel.
+    /// Returns an [`IndexResult`] containing the indexed document metadata.
+    #[tracing::instrument(skip_all, fields(sources = ctx.sources.len()))]
+    async fn ingest_pipeline(&self, ctx: IndexContext) -> Result<IndexResult> {
+        if ctx.is_empty() {
+            return Err(Error::Config("No document sources provided".into()));
+        }
+
+        let timeout_secs = ctx.options.timeout_secs;
+
+        self.with_timeout(timeout_secs, async move {
+            let concurrency = self
+                .config
+                .llm
+                .throttle
+                .max_concurrent_requests
+                .min(ctx.sources.len());
+
+            let (items, failed) = self
+                .process_sources(&ctx.sources, &ctx.options, ctx.name.as_deref(), concurrency)
+                .await;
+
+            if items.is_empty() && !failed.is_empty() {
+                return Err(Error::Config(format!(
+                    "All {} source(s) failed: {}",
+                    failed.len(),
+                    failed
+                        .iter()
+                        .map(|f| format!("{} ({})", f.source, f.error))
+                        .collect::<Vec<_>>()
+                        .join("; ")
+                )));
+            }
+
+            // Rebuild cross-document graph in the background so `ingest` returns immediately.
+            if !items.is_empty() && self.config.graph.enabled {
+                let engine = self.clone();
+                tokio::spawn(async move {
+                    info!("Rebuilding document graph in background...");
+                    if let Err(e) = engine.rebuild_graph().await {
+                        tracing::warn!("Background graph rebuild failed: {e}");
+                    }
+                });
+            }
+
+            Ok(IndexResult::with_partial(items, failed))
+        })
+        .await
+    }
+
+    /// Process multiple sources in parallel.
+    async fn process_sources(
+        &self,
+        sources: &[IndexSource],
+        options: &super::types::IndexOptions,
+        name: Option<&str>,
+        concurrency: usize,
+    ) -> (Vec<IndexItem>, Vec<FailedItem>) {
+        let results: Vec<(Vec<IndexItem>, Vec<FailedItem>)> =
+            futures::stream::iter(sources.iter().cloned())
+                .map(|source| {
+                    let options = options.clone();
+                    let name = name.map(str::to_string);
+                    let engine = self.clone();
+                    async move {
+                        engine
+                            .process_source(&source, &options, name.as_deref())
+                            .await
+                    }
+                })
+                .buffer_unordered(concurrency)
+                .collect()
+                .await;
+
+        results.into_iter().fold(
+            (Vec::new(), Vec::new()),
+            |(mut items, mut failed), (ok, err)| {
+                items.extend(ok);
+                failed.extend(err);
+                (items, failed)
+            },
+        )
+    }
+
+    /// Process a single source — resolve action and index.
+    ///
+    /// Returns `(items, failed)`.
+    #[tracing::instrument(skip_all, fields(source = %source))]
+    async fn process_source(
+        &self,
+        source: &IndexSource,
+        options: &super::types::IndexOptions,
+        name: Option<&str>,
+    ) -> (Vec<IndexItem>, Vec<FailedItem>) {
+        let source_label = source.to_string();
+
+        match self.resolve_index_action(source, options).await {
+            Ok(IndexAction::Skip(skip_info)) => {
+                info!("Skipped (unchanged): {}", source_label);
+                (
+                    vec![IndexItem::new(
+                        skip_info.doc_id,
+                        skip_info.name,
+                        skip_info.format,
+                        skip_info.description,
+                        skip_info.page_count,
+                    )],
+                    Vec::new(),
+                )
+            }
+            Ok(IndexAction::FullIndex { existing_id }) => {
+                let pipeline_options = self.build_pipeline_options(options, source);
+                match self
+                    .index_with_retry(source, name, pipeline_options.clone(), None)
+                    .await
+                {
+                    Ok(doc) => {
+                        self.index_and_persist(
+                            doc,
+                            &pipeline_options,
+                            &source_label,
+                            existing_id.as_deref(),
+                        )
+                        .await
+                    }
+                    Err(e) => {
+                        tracing::warn!("Failed to index {}: {}", source_label, e);
+                        (
+                            Vec::new(),
+                            vec![FailedItem::new(&source_label, e.to_string())],
+                        )
+                    }
+                }
+            }
+            Ok(IndexAction::IncrementalUpdate {
+                old_tree,
+                existing_id,
+            }) => {
+                info!("Incremental update for: {}", source_label);
+                let pipeline_options = self.build_pipeline_options(options, source);
+                match self
+                    .index_with_retry(source, name, pipeline_options.clone(), Some(&old_tree))
+                    .await
+                {
+                    Ok(mut doc) => {
+                        doc.id = existing_id.clone();
+                        self.index_and_persist(doc, &pipeline_options, &source_label, None)
+                            .await
+                    }
+                    Err(e) => {
+                        tracing::warn!("Incremental update failed for {}: {}", source_label, e);
+                        (
+                            Vec::new(),
+                            vec![FailedItem::new(&source_label, e.to_string())],
+                        )
+                    }
+                }
+            }
+            Err(e) => {
+                tracing::warn!("Failed to resolve action for {}: {}", source_label, e);
+                (
+                    Vec::new(),
+                    vec![FailedItem::new(&source_label, e.to_string())],
+                )
+            }
+        }
+    }
+
+    /// Index with retry on retryable errors.
+    ///
+    /// Reads `config.llm.retry` for backoff parameters.
+    /// Returns `Err` only after all retries are exhausted or the error
+    /// is not retryable.
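+    ///
+    /// Backoff sketch (parameters come from `config.llm.retry`; the schedule
+    /// below is illustrative):
+    /// ```ignore
+    /// // attempt 0 fails (retryable) -> sleep retry.delay_for_attempt(0)
+    /// // attempt 1 fails (retryable) -> sleep retry.delay_for_attempt(1)
+    /// // attempt max_attempts - 1   -> the error is returned to the caller
+    /// ```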
+    async fn index_with_retry(
+        &self,
+        source: &IndexSource,
+        name: Option<&str>,
+        pipeline_options: PipelineOptions,
+        existing_tree: Option<&DocumentTree>,
+    ) -> Result<super::indexed_document::IndexedDocument> {
+        let retry = &self.config.llm.retry;
+        let max_attempts = retry.max_attempts;
+
+        for attempt in 0..max_attempts {
+            let result = if let Some(tree) = existing_tree {
+                self.indexer
+                    .index_with_existing(source, name, pipeline_options.clone(), Some(tree))
+                    .await
+            } else {
+                self.indexer
+                    .index(source, name, pipeline_options.clone())
+                    .await
+            };
+
+            match result {
+                Ok(doc) => return Ok(doc),
+                Err(e) if e.is_retryable() && attempt + 1 < max_attempts => {
+                    let delay = retry.delay_for_attempt(attempt);
+                    tracing::warn!(
+                        attempt,
+                        max_attempts,
+                        ?delay,
+                        "Retryable error indexing, retrying: {e}"
+                    );
+                    tokio::time::sleep(delay).await;
+                }
+                Err(e) => return Err(e),
+            }
+        }
+
+        // Unreachable: loop always returns via Ok/Err branches
+        unreachable!()
+    }
+
+    /// Convert an [`IndexedDocument`] to an [`IndexItem`] and persist it.
+    ///
+    /// If `old_id` is provided, the old document is removed after a
+    /// successful save (atomic save-first, then remove old).
+    async fn index_and_persist(
+        &self,
+        doc: super::indexed_document::IndexedDocument,
+        pipeline_options: &PipelineOptions,
+        source_label: &str,
+        old_id: Option<&str>,
+    ) -> (Vec<IndexItem>, Vec<FailedItem>) {
+        let item = Self::build_index_item(&doc);
+
+        info!("[index] Persisting document '{}'...", doc.name);
+        let persisted = IndexerClient::to_persisted(doc, pipeline_options).await;
+
+        if let Err(e) = self.workspace.save(&persisted).await {
+            warn!("[index] Failed to save document: {}", e);
+            return (
+                Vec::new(),
+                vec![FailedItem::new(source_label, e.to_string())],
+            );
+        }
+        // Clean up old document after successful save
+        if let Some(old_id) = old_id {
+            if let Err(e) = self.workspace.remove(old_id).await {
+                warn!("Failed to remove old document {}: {}", old_id, e);
+            }
+        }
+
+        info!("[index] Document persisted: {}", item.doc_id);
+        (vec![item], Vec::new())
+    }
+
+    /// Build an [`IndexItem`] from an [`IndexedDocument`](super::indexed_document::IndexedDocument).
+    fn build_index_item(doc: &super::indexed_document::IndexedDocument) -> IndexItem {
+        IndexItem::new(
+            doc.id.clone(),
+            doc.name.clone(),
+            doc.format.clone(),
+            doc.description.clone(),
+            doc.page_count,
+        )
+        .with_source_path(
+            doc.source_path
+                .as_ref()
+                .map(|p| p.to_string_lossy().to_string())
+                .unwrap_or_default(),
+        )
+        .with_metrics_opt(doc.metrics.clone())
+    }
+
+    // ============================================================
+    // Understanding Engine API
+    // ============================================================
+
+    /// Understand a document — parse, analyze, and persist.
+    ///
+    /// Returns a [`vectorless_document::DocumentInfo`] with summary, structure, and concepts.
+    /// The engine builds a full understanding including tree, navigation index,
+    /// reasoning index, summary, and key concepts.
+    pub async fn ingest(&self, input: IngestInput) -> Result<vectorless_document::DocumentInfo> {
+        let ctx = match &input {
+            IngestInput::Path(path) => IndexContext::from_path(path),
+            IngestInput::Bytes { data, format, .. } => IndexContext::from_bytes(data.clone(), *format),
+            IngestInput::Text { content, .. } => {
+                IndexContext::from_content(content, vectorless_index::parse::DocumentFormat::Markdown)
+            }
+        };
+
+        let result = self.ingest_pipeline(ctx).await?;
+
+        let doc_id = result
+            .doc_id()
+            .ok_or_else(|| Error::Config("ingest produced no results".into()))?
+            .to_string();
+
+        // Load the persisted document to build DocumentInfo
+        let persisted = self
+            .workspace
+            .load(&doc_id)
+            .await?
+            .ok_or_else(|| Error::Config("Document not found after ingest".into()))?;
+
+        let doc = Self::persisted_to_understanding_document(persisted);
+        Ok(doc.info())
+    }
+
+    /// Ask a question — returns a reasoned answer with evidence and trace.
+    ///
+    /// - `input`: the question (required)
+    /// - `ids`: document IDs to search. Empty = search all documents.
+    ///
+    /// Always returns an [`Answer`] with content, evidence, confidence, and
+    /// a mandatory reasoning trace.
+    pub async fn ask(&self, input: &str, ids: &[String]) -> Result<Answer> {
+        // Resolve doc IDs
+        let doc_ids = if ids.is_empty() {
+            let docs = self.list_documents().await?;
+            if docs.is_empty() {
+                return Err(Error::Config("Workspace is empty".into()));
+            }
+            docs.into_iter().map(|d| d.doc_id).collect::<Vec<_>>()
+        } else {
+            ids.to_vec()
+        };
+
+        // Load documents
+        let (documents, failed) = self.load_documents(&doc_ids).await?;
+        if documents.is_empty() {
+            return Err(Error::Config(format!(
+                "No documents available: {} failures",
+                failed.len()
+            )));
+        }
+
+        // Build DocContexts from Documents and dispatch
+        let doc_contexts: Vec<_> = documents
+            .iter()
+            .map(|doc| doc.as_context())
+            .collect();
+
+        let skip_analysis = !ids.is_empty();
+        let scope = if skip_analysis {
+            vectorless_agent::Scope::Specified(doc_contexts)
+        } else {
+            vectorless_agent::Scope::Workspace(vectorless_agent::WorkspaceContext::new(doc_contexts))
+        };
+
+        let emitter = vectorless_agent::EventEmitter::noop();
+        let config = self.retriever.config().clone();
+        let llm = self.retriever.llm().clone();
+        let output =
+            vectorless_retrieval::dispatcher::dispatch(input, scope, &config, &llm, &emitter).await?;
+
+        // Convert Output -> Answer
+        Ok(Self::output_to_answer(&output))
+    }
+
+    /// Remove a document from the workspace.
+    pub async fn forget(&self, doc_id: &str) -> Result<()> {
+        self.workspace.remove(doc_id).await?;
+        Ok(())
+    }
+
+    /// List all understood documents.
+    ///
+    /// Returns a [`DocumentInfo`](vectorless_document::DocumentInfo) with
+    /// summary, structure, and concepts for each document.
+    pub async fn list_documents(&self) -> Result<Vec<vectorless_document::DocumentInfo>> {
+        let ids = self.workspace.inner().list_documents().await;
+        let mut result = Vec::new();
+        for id in ids {
+            match self.workspace.load(&id).await {
+                Ok(Some(persisted)) => {
+                    result.push(Self::persisted_to_understanding_document(persisted).info());
+                }
+                Ok(None) => {
+                    tracing::warn!(doc_id = %id, "Document in index but not in storage");
+                }
+                Err(e) => {
+                    tracing::warn!(doc_id = %id, error = %e, "Failed to load document");
+                }
+            }
+        }
+        Ok(result)
+    }
+
+    // ============================================================
+    // Utility Methods
+    // ============================================================
+
+    /// Check if a document exists in the workspace.
+    pub async fn exists(&self, doc_id: &str) -> Result<bool> {
+        self.workspace.exists(doc_id).await
+    }
+
+    /// Remove all documents from the workspace.
+    ///
+    /// Returns the number of documents removed.
+    pub async fn clear(&self) -> Result<usize> {
+        self.workspace.clear().await
+    }
+
+    /// Get the cross-document relationship graph.
+    ///
+    /// The graph is automatically rebuilt after indexing documents.
+    /// Returns `None` if no graph has been built yet.
+    pub async fn get_graph(&self) -> Result<Option<vectorless_graph::DocumentGraph>> {
+        self.workspace.get_graph().await
+    }
+
+    /// Generate a complete metrics report.
+    ///
+    /// Returns a [`MetricsReport`](vectorless_metrics::MetricsReport) containing
+    /// LLM usage and retrieval operation metrics.
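+    ///
+    /// A minimal sketch:
+    /// ```ignore
+    /// let report = engine.metrics_report();
+    /// // Inspect LLM usage and retrieval operation metrics.
+    /// ```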
+    pub fn metrics_report(&self) -> vectorless_metrics::MetricsReport {
+        self.metrics_hub.generate_report()
+    }
+
+    // ============================================================
+    // Internal: type conversions
+    // ============================================================
+
+    /// Convert a PersistedDocument to a Document (understanding type).
+    fn persisted_to_understanding_document(persisted: PersistedDocument) -> UnderstandingDocument {
+        let nav_index = persisted.navigation_index.unwrap_or_default();
+        let reasoning_index = persisted.reasoning_index.unwrap_or_default();
+        let tree = persisted.tree;
+
+        let section_count = tree.node_count();
+
+        UnderstandingDocument {
+            doc_id: persisted.meta.id,
+            name: persisted.meta.name,
+            format: persisted.meta.format,
+            source_path: persisted.meta.source_path.map(|p| p.to_string_lossy().to_string()),
+            tree,
+            nav_index,
+            reasoning_index,
+            summary: persisted.meta.description.unwrap_or_default(),
+            concepts: persisted.concepts,
+            page_count: persisted.meta.page_count,
+            section_count,
+        }
+    }
+
+    /// Convert agent Output to public Answer type.
+    fn output_to_answer(output: &vectorless_agent::Output) -> Answer {
+        // Build evidence
+        let evidence: Vec<Evidence> = output
+            .evidence
+            .iter()
+            .map(|e| Evidence {
+                content: e.content.clone(),
+                source_path: e.source_path.clone(),
+                doc_name: e.doc_name.clone().unwrap_or_default(),
+                // Agent output carries no per-evidence relevance score yet.
+                relevance: 0.0,
+            })
+            .collect();
+
+        Answer {
+            content: output.answer.clone(),
+            evidence,
+            confidence: output.confidence,
+            trace: ReasoningTrace {
+                steps: output.trace_steps.clone(),
+            },
+        }
+    }
+
+    // ============================================================
+    // Internal
+    // ============================================================
+
+    /// Load documents by ID, returning loaded artifacts and failures.
+    async fn load_documents(
+        &self,
+        doc_ids: &[String],
+    ) -> Result<(Vec<UnderstandingDocument>, Vec<FailedItem>)> {
+        let mut documents = Vec::new();
+        let mut failed = Vec::new();
+        for doc_id in doc_ids {
+            match self.workspace.load(doc_id).await {
+                Ok(Some(doc)) => {
+                    documents.push(Self::persisted_to_understanding_document(doc));
+                }
+                Ok(None) => {
+                    failed.push(FailedItem::new(doc_id, "Document not found"));
+                }
+                Err(e) => {
+                    failed.push(FailedItem::new(doc_id, &e.to_string()));
+                }
+            }
+        }
+        Ok((documents, failed))
+    }
+
+    /// Run a future with an optional timeout.
+    /// If `timeout_secs` is `Some`, wraps the future in `tokio::time::timeout`.
+    async fn with_timeout<F, T>(&self, timeout_secs: Option<u64>, fut: F) -> Result<T>
+    where
+        F: std::future::Future<Output = Result<T>>,
+    {
+        match timeout_secs {
+            Some(secs) => {
+                match tokio::time::timeout(std::time::Duration::from_secs(secs), fut).await {
+                    Ok(result) => result,
+                    Err(_) => Err(Error::Config(format!("Operation timed out after {secs}s"))),
+                }
+            }
+            None => fut.await,
+        }
+    }
+
+    /// Build pipeline options for pipeline execution (with checkpoint dir).
+    ///
+    /// This is the single source of truth for pipeline configuration.
+    fn build_pipeline_options(
+        &self,
+        options: &super::types::IndexOptions,
+        source: &IndexSource,
+    ) -> PipelineOptions {
+        use vectorless_index::{IndexMode, ReasoningIndexConfig, SummaryStrategy};
+
+        let format = match source {
+            IndexSource::Path(path) => self
+                .indexer
+                .detect_format_from_path(path)
+                .unwrap_or(vectorless_index::parse::DocumentFormat::Markdown),
+            IndexSource::Content { format, .. } => *format,
+            IndexSource::Bytes { format, .. } => *format,
+        };
+
+        let checkpoint_dir = Some(self.config.storage.checkpoint_dir.clone());
+
+        PipelineOptions {
+            mode: match format {
+                vectorless_index::parse::DocumentFormat::Markdown => IndexMode::Markdown,
+                vectorless_index::parse::DocumentFormat::Pdf => IndexMode::Pdf,
+            },
+            generate_ids: options.generate_ids,
+            summary_strategy: if options.generate_summaries {
+                SummaryStrategy::full()
+            } else {
+                SummaryStrategy::none()
+            },
+            generate_description: options.generate_description,
+            checkpoint_dir,
+            reasoning_index: ReasoningIndexConfig {
+                enable_synonym_expansion: options.enable_synonym_expansion,
+                ..ReasoningIndexConfig::default()
+            },
+            concurrency: self.config.llm.throttle.to_runtime_config(),
+            ..Default::default()
+        }
+    }
+
+    /// Resolve what action to take for a source.
+    async fn resolve_index_action(
+        &self,
+        source: &IndexSource,
+        options: &super::types::IndexOptions,
+    ) -> Result<IndexAction> {
+        let workspace = &self.workspace;
+
+        // Force mode always re-indexes from scratch
+        if options.mode == IndexMode::Force {
+            return Ok(IndexAction::FullIndex { existing_id: None });
+        }
+
+        // Only path sources support incremental indexing
+        let path = match source {
+            IndexSource::Path(p) => p,
+            _ => return Ok(IndexAction::FullIndex { existing_id: None }),
+        };
+
+        // Find if this file has already been indexed
+        let existing_id = match workspace.find_by_source_path(path).await {
+            Some(id) => id,
+            None => return Ok(IndexAction::FullIndex { existing_id: None }), // New file
+        };
+
+        // Default mode: skip if already indexed (no content check)
+        if options.mode == IndexMode::Default {
+            let info = workspace.get_document_info(&existing_id).await?;
+            let (name, format_str, desc, pages) = match info {
+                Some(i) => (i.name, i.format, i.description, i.page_count),
+                None => (String::new(), String::new(), None, None),
+            };
+            return Ok(IndexAction::Skip(incremental::SkipInfo {
+                doc_id: existing_id,
+                name,
+                format: vectorless_index::parse::DocumentFormat::from_extension(&format_str)
+                    .unwrap_or(vectorless_index::parse::DocumentFormat::Markdown),
+                description: desc,
+                page_count: pages,
+            }));
+        }
+
+        // Incremental mode: load stored document and delegate to resolver
+        let current_bytes = match tokio::fs::read(path).await {
+            Ok(b) => b,
+            Err(_) => return Ok(IndexAction::FullIndex { existing_id: None }),
+        };
+
+        let stored_doc = match workspace.load(&existing_id).await? {
+            Some(d) => d,
+            None => return Ok(IndexAction::FullIndex { existing_id: None }),
+        };
+
+        let format = vectorless_index::parse::DocumentFormat::from_extension(&stored_doc.meta.format)
+            .unwrap_or(vectorless_index::parse::DocumentFormat::Markdown);
+        let pipeline_options = self.build_pipeline_options(options, source);
+
+        // If logic fingerprint changed, remove old doc before full reprocess
+        let action =
+            incremental::resolve_action(&current_bytes, &stored_doc, &pipeline_options, format);
+
+        // Note: if FullIndex, old doc cleanup happens in process_source()
+        // after successful save (save-first, then remove old).
+
+        Ok(action)
+    }
+
+    /// Rebuild the document graph after indexing, if graph is enabled.
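+    ///
+    /// Builder flow sketch (mirrors the body below; inputs are illustrative):
+    /// ```ignore
+    /// let mut builder = vectorless_graph::DocumentGraphBuilder::new(config.graph.clone());
+    /// builder.add_document("doc-1", "a.md", "markdown", 42, keywords);
+    /// let graph = builder.build();
+    /// ```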
+    async fn rebuild_graph(&self) -> Result<()> {
+        if !self.config.graph.enabled {
+            return Ok(());
+        }
+
+        // Load all documents in parallel and extract keyword profiles
+        let doc_ids = self.workspace.inner().list_documents().await;
+        info!(
+            doc_count = doc_ids.len(),
+            "Loading documents for graph rebuild"
+        );
+        let concurrency = self.config.llm.throttle.max_concurrent_requests;
+
+        let doc_ids_clone: Vec<String> = doc_ids.iter().cloned().collect();
+        let loaded: Vec<(String, Result<Option<PersistedDocument>>)> =
+            futures::stream::iter(doc_ids_clone.into_iter())
+                .map(|doc_id| {
+                    let ws = self.workspace.clone();
+                    async move {
+                        let result = ws.load(&doc_id).await;
+                        (doc_id, result)
+                    }
+                })
+                .buffer_unordered(concurrency)
+                .collect()
+                .await;
+
+        let mut failed_count = 0usize;
+        let mut loaded_docs: Vec<PersistedDocument> = Vec::new();
+        for (doc_id, result) in loaded {
+            match result {
+                Ok(Some(doc)) => loaded_docs.push(doc),
+                Ok(None) => {
+                    warn!(
+                        doc_id,
+                        "Document in meta index but not in backend during graph rebuild"
+                    );
+                    failed_count += 1;
+                }
+                Err(e) => {
+                    warn!(doc_id, error = %e, "Failed to load document for graph rebuild");
+                    failed_count += 1;
+                }
+            }
+        }
+
+        info!(
+            loaded = loaded_docs.len(),
+            failed = failed_count,
+            "Documents loaded for graph rebuild"
+        );
+
+        let mut builder = vectorless_graph::DocumentGraphBuilder::new(self.config.graph.clone());
+        for doc in &loaded_docs {
+            let keywords = Self::extract_keywords_from_doc(doc);
+            builder.add_document(
+                &doc.meta.id,
+                &doc.meta.name,
+                &doc.meta.format,
+                doc.meta.node_count,
+                keywords,
+            );
+        }
+
+        let graph = builder.build();
+        info!(
+            nodes = graph.node_count(),
+            edges = graph.edge_count(),
+            "Graph built, persisting"
+        );
+        self.workspace.set_graph(&graph).await?;
+        Ok(())
+    }
+
+    /// Extract a keyword -> weight map from a persisted document's ReasoningIndex.
+    fn extract_keywords_from_doc(doc: &PersistedDocument) -> HashMap<String, f32> {
+        let mut keywords = HashMap::new();
+        if let Some(ref ri) = doc.reasoning_index {
+            for (kw, entries) in ri.all_topic_entries() {
+                let weight: f32 =
+                    entries.iter().map(|e| e.weight).sum::<f32>() / entries.len().max(1) as f32;
+                keywords.insert(kw.clone(), weight);
+            }
+        }
+        keywords
+    }
+}
+
+impl Clone for Engine {
+    fn clone(&self) -> Self {
+        Self {
+            config: Arc::clone(&self.config),
+            indexer: self.indexer.clone(),
+            retriever: self.retriever.clone(),
+            workspace: self.workspace.clone(),
+            metrics_hub: Arc::clone(&self.metrics_hub),
+        }
+    }
+}
+
+impl std::fmt::Debug for Engine {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Engine").finish_non_exhaustive()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::client::types::IndexMode;
+
+    // -- resolve_index_action Default mode ----------------------------------------------
+
+    // We can't call resolve_index_action without a workspace, but we can
+    // verify the IndexMode equality logic used inside.
+    #[test]
+    fn test_index_mode_force_skips_incremental() {
+        let mode = IndexMode::Force;
+        assert_eq!(mode, IndexMode::Force);
+        assert_ne!(mode, IndexMode::Default);
+        assert_ne!(mode, IndexMode::Incremental);
+    }
+
+    // -- build_index_item ----------------------------------------------------------------
+
+    // `build_index_item` only transforms data -- no I/O.
+    use crate::client::indexed_document::IndexedDocument;
+
+    fn make_doc() -> IndexedDocument {
+        IndexedDocument::new("test-id", vectorless_index::parse::DocumentFormat::Markdown)
+            .with_name("test.md")
+            .with_description("test doc")
+            .with_source_path(std::path::PathBuf::from("/tmp/test.md"))
+    }
+
+    #[test]
+    fn test_build_index_item() {
+        let doc = make_doc();
+        let item = Engine::build_index_item(&doc);
+
+        assert_eq!(item.doc_id, "test-id");
+        assert_eq!(item.name, "test.md");
+        assert_eq!(item.format, vectorless_index::parse::DocumentFormat::Markdown);
+        assert_eq!(item.description, Some("test doc".to_string()));
+        assert_eq!(item.source_path, Some("/tmp/test.md".to_string()));
+        assert!(item.metrics.is_none());
+    }
+
+    #[test]
+    fn test_build_index_item_no_source_path() {
+        let doc = IndexedDocument::new("id", vectorless_index::parse::DocumentFormat::Pdf);
+        let item = Engine::build_index_item(&doc);
+
+        assert_eq!(item.source_path, Some(String::new())); // unwrap_or_default
+        assert_eq!(item.format, vectorless_index::parse::DocumentFormat::Pdf);
+    }
+}
diff --git a/vectorless-core/vectorless-engine/src/index_context.rs b/vectorless-core/vectorless-engine/src/index_context.rs
new file mode 100644
index 00000000..df109db6
--- /dev/null
+++ b/vectorless-core/vectorless-engine/src/index_context.rs
@@ -0,0 +1,363 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Index context for document indexing operations.
+//!
+//! [`IndexContext`] supports single or multiple document sources:
+//! - **File path** — Load and parse a file from disk
+//! - **Content string** — Parse content directly (HTML, Markdown, text)
+//! - **Byte data** — Parse binary data (PDF, DOCX)
+//!
+//! # Single document
+//!
+//! ```rust,no_run
+//! use vectorless::client::IndexContext;
+//!
+//! let ctx = IndexContext::from_path("./document.md");
+//! ```
+//!
+//! # Multiple documents
+//!
+//! ```rust,no_run
+//! use vectorless::client::IndexContext;
+//!
+//! let ctx = IndexContext::from_paths(vec!["./doc1.md", "./doc2.pdf"]);
+//! ```
+//!
+//! # From directory
+//!
+//! ```rust,no_run
+//! use vectorless::client::IndexContext;
+//!
+//! // Non-recursive (top-level only)
+//! let ctx = IndexContext::from_dir("./documents", false);
+//!
+//! // Recursive (includes subdirectories)
+//! let ctx = IndexContext::from_dir("./documents", true);
+//! ```

+use std::path::PathBuf;
+
+use vectorless_document::DocumentFormat;
+
+use super::types::{IndexMode, IndexOptions};
+
+// ============================================================
+// Index Source
+// ============================================================
+
+/// The source of document content for indexing.
+#[derive(Debug, Clone)]
+pub(crate) enum IndexSource {
+    /// Load document from a file path.
+    Path(PathBuf),
+
+    /// Parse document from a string.
+    Content {
+        data: String,
+        format: DocumentFormat,
+    },
+
+    /// Parse document from binary data.
+    Bytes {
+        data: Vec<u8>,
+        format: DocumentFormat,
+    },
+}
+
+// ============================================================
+// Index Context
+// ============================================================
+
+/// Context for document indexing operations.
+///
+/// Supports single or multiple document sources. When multiple sources
+/// are provided, each is indexed independently and the results are
+/// collected into [`IndexResult`](super::IndexResult).
+///
+/// # Examples
+///
+/// ```rust,no_run
+/// use vectorless::client::IndexContext;
+/// use vectorless::client::DocumentFormat;
+///
+/// # #[tokio::main]
+/// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
+/// # let engine = vectorless::EngineBuilder::new().build().await?;
+/// // Single file
+/// let result = engine.index(IndexContext::from_path("./doc.md")).await?;
+///
+/// // Multiple files
+/// let result = engine.index(
+///     IndexContext::from_paths(vec!["./doc1.md", "./doc2.pdf"])
+/// ).await?;
+///
+/// // Entire directory
+/// let result = engine.index(IndexContext::from_dir("./docs", false)).await?;
+/// # Ok(())
+/// # }
+/// ```
+#[derive(Debug, Clone)]
+pub struct IndexContext {
+    /// Document sources (supports multiple).
+    pub(crate) sources: Vec<IndexSource>,
+
+    /// Optional document name for metadata (single-source only).
+    pub(crate) name: Option<String>,
+
+    /// Indexing options.
+    pub(crate) options: IndexOptions,
+}
+
+impl IndexContext {
+    /// Create from a single file path.
+    ///
+    /// The document format is automatically detected from the file extension.
+    pub fn from_path(path: impl Into<PathBuf>) -> Self {
+        Self {
+            sources: vec![IndexSource::Path(path.into())],
+            name: None,
+            options: IndexOptions::default(),
+        }
+    }
+
+    /// Create from multiple file paths.
+    pub fn from_paths(paths: impl IntoIterator<Item = impl Into<PathBuf>>) -> Self {
+        Self {
+            sources: paths
+                .into_iter()
+                .map(|p| IndexSource::Path(p.into()))
+                .collect(),
+            name: None,
+            options: IndexOptions::default(),
+        }
+    }
+
+    /// Create from a directory path.
+    ///
+    /// Indexes all supported files in the directory.
+    /// Supported extensions: `.md`, `.pdf`.
+    ///
+    /// Set `recursive` to `true` to include subdirectories.
+    pub fn from_dir(dir: impl Into<PathBuf>, recursive: bool) -> Self {
+        Self::scan_dir(dir, recursive)
+    }
+
+    /// Internal: scan a directory for supported document files.
+    fn scan_dir(dir: impl Into<PathBuf>, recursive: bool) -> Self {
+        let dir = dir.into();
+        let supported_extensions = DocumentFormat::SUPPORTED_EXTENSIONS;
+
+        if !dir.exists() {
+            tracing::warn!("Directory not found: {}", dir.display());
+        }
+
+        let mut sources = Vec::new();
+        Self::collect_files(&dir, &supported_extensions, recursive, &mut sources);
+
+        Self {
+            sources,
+            name: None,
+            options: IndexOptions::default(),
+        }
+    }
+
+    /// Recursively or non-recursively collect supported files.
+    fn collect_files(
+        dir: &std::path::Path,
+        extensions: &[&str],
+        recursive: bool,
+        sources: &mut Vec<IndexSource>,
+    ) {
+        if let Ok(entries) = std::fs::read_dir(dir) {
+            let mut subdirs = Vec::new();
+            for entry in entries.flatten() {
+                let path = entry.path();
+                if path.is_dir() {
+                    if recursive {
+                        subdirs.push(path);
+                    }
+                } else if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
+                    if extensions.contains(&ext.to_lowercase().as_str()) {
+                        sources.push(IndexSource::Path(path));
+                    }
+                }
+            }
+            for subdir in subdirs {
+                Self::collect_files(&subdir, extensions, recursive, sources);
+            }
+        }
+    }
+
+    /// Create from a content string.
+    pub fn from_content(content: impl Into<String>, format: DocumentFormat) -> Self {
+        Self {
+            sources: vec![IndexSource::Content {
+                data: content.into(),
+                format,
+            }],
+            name: None,
+            options: IndexOptions::default(),
+        }
+    }
+
+    /// Create from binary data.
+    pub fn from_bytes(bytes: Vec<u8>, format: DocumentFormat) -> Self {
+        Self {
+            sources: vec![IndexSource::Bytes {
+                data: bytes,
+                format,
+            }],
+            name: None,
+            options: IndexOptions::default(),
+        }
+    }
+
+    /// Set the document name (single-source only).
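+    ///
+    /// A minimal usage sketch (same pattern as the unit tests below):
+    ///
+    /// ```rust,no_run
+    /// use vectorless::client::IndexContext;
+    ///
+    /// let ctx = IndexContext::from_path("./doc.md").with_name("My Document");
+    /// ```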
+    pub fn with_name(mut self, name: impl Into<String>) -> Self {
+        self.name = Some(name.into());
+        self
+    }
+
+    /// Set the indexing options.
+    pub fn with_options(mut self, options: IndexOptions) -> Self {
+        self.options = options;
+        self
+    }
+
+    /// Set the indexing mode.
+    pub fn with_mode(mut self, mode: IndexMode) -> Self {
+        self.options.mode = mode;
+        self
+    }
+
+    /// Number of document sources.
+    pub fn len(&self) -> usize {
+        self.sources.len()
+    }
+
+    /// Check if there are no sources.
+    pub fn is_empty(&self) -> bool {
+        self.sources.is_empty()
+    }
+
+    /// Get the document name, if set.
+    pub fn name(&self) -> Option<&str> {
+        self.name.as_deref()
+    }
+
+    /// Get the indexing options.
+    pub fn options(&self) -> &IndexOptions {
+        &self.options
+    }
+}
+
+impl From<PathBuf> for IndexContext {
+    fn from(path: PathBuf) -> Self {
+        Self::from_path(path)
+    }
+}
+
+impl From<&std::path::Path> for IndexContext {
+    fn from(path: &std::path::Path) -> Self {
+        Self::from_path(path.to_path_buf())
+    }
+}
+
+impl From<&str> for IndexContext {
+    fn from(path: &str) -> Self {
+        Self::from_path(path)
+    }
+}
+
+impl From<String> for IndexContext {
+    fn from(path: String) -> Self {
+        Self::from_path(path)
+    }
+}
+
+impl std::fmt::Display for IndexSource {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            IndexSource::Path(p) => write!(f, "path:{}", p.display()),
+            IndexSource::Content { format, .. } => write!(f, "content:{}", format.extension()),
+            IndexSource::Bytes { format, .. } => write!(f, "bytes:{}", format.extension()),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_from_path() {
+        let ctx = IndexContext::from_path("./test.md");
+        assert_eq!(ctx.len(), 1);
+        assert!(ctx.name.is_none());
+    }
+
+    #[test]
+    fn test_from_paths() {
+        let ctx = IndexContext::from_paths(vec!["./a.md", "./b.pdf"]);
+        assert_eq!(ctx.len(), 2);
+    }
+
+    #[test]
+    fn test_from_content() {
+        let ctx = IndexContext::from_content("# Title", DocumentFormat::Markdown);
+        assert_eq!(ctx.len(), 1);
+    }
+
+    #[test]
+    fn test_from_bytes() {
+        let ctx = IndexContext::from_bytes(vec![1, 2, 3], DocumentFormat::Pdf);
+        assert_eq!(ctx.len(), 1);
+    }
+
+    #[test]
+    fn test_with_name() {
+        let ctx = IndexContext::from_path("./test.md").with_name("My Document");
+        assert_eq!(ctx.name(), Some("My Document"));
+    }
+
+    #[test]
+    fn test_with_mode() {
+        let ctx = IndexContext::from_path("./test.md").with_mode(IndexMode::Force);
+        assert_eq!(ctx.options.mode, IndexMode::Force);
+    }
+
+    #[test]
+    fn test_from_path_trait() {
+        let ctx = IndexContext::from(PathBuf::from("./test.md"));
+        assert_eq!(ctx.len(), 1);
+    }
+
+    #[test]
+    fn test_from_dir_with_recursive() {
+        // Create a temp directory structure:
+        //   tmp/
+        //     a.md
+        //     sub/
+        //       b.md
+        //       deep/
+        //         c.pdf
+        let tmp = std::env::temp_dir().join("vectorless_test_dir_recursive");
+        let _ = std::fs::remove_dir_all(&tmp);
+        std::fs::create_dir_all(tmp.join("sub/deep")).unwrap();
+        std::fs::write(tmp.join("a.md"), "# A").unwrap();
+        std::fs::write(tmp.join("sub/b.md"), "# B").unwrap();
+        std::fs::write(tmp.join("sub/deep/c.pdf"), b"%PDF").unwrap();
+        std::fs::write(tmp.join("sub/deep/ignore.dat"), b"xxx").unwrap();
+
+        // Non-recursive: only top-level
+        let ctx = IndexContext::from_dir(&tmp, false);
+        assert_eq!(ctx.len(), 1); // only a.md
+
+        // Recursive: all levels
+        let ctx = IndexContext::from_dir(&tmp, true);
+        assert_eq!(ctx.len(), 3); // a.md, b.md, c.pdf
+
+        let _ = std::fs::remove_dir_all(&tmp);
+    }
+}
diff --git a/vectorless-core/vectorless-engine/src/indexed_document.rs b/vectorless-core/vectorless-engine/src/indexed_document.rs
new file mode 100644
index 00000000..ee1cbbce
--- /dev/null
+++ b/vectorless-core/vectorless-engine/src/indexed_document.rs
@@ -0,0 +1,130 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Internal intermediate type produced by the indexing pipeline.
+//!
+//! [`IndexedDocument`] is an internal-only type that carries data from
+//! [`IndexerClient`](super::indexer::IndexerClient) to [`Engine`](super::Engine).
+//! It is **not** part of the public API.
+
+use std::path::PathBuf;
+
+use vectorless_document::DocumentFormat;
+use vectorless_document::DocumentTree;
+use vectorless_document::{Concept, NavigationIndex, ReasoningIndex};
+use vectorless_metrics::IndexMetrics;
+use vectorless_storage::PageContent;
+
+/// An indexed document with its tree structure and metadata.
+///
+/// Internal intermediate produced by the indexing pipeline and consumed
+/// by [`Engine`](super::Engine) to create a [`PersistedDocument`](vectorless_storage::PersistedDocument).
+#[derive(Debug, Clone)]
+pub(crate) struct IndexedDocument {
+    /// Unique document identifier.
+    pub id: String,
+
+    /// Document format.
+    pub format: DocumentFormat,
+
+    /// Document name/title.
+    pub name: String,
+
+    /// Document description (generated by LLM).
+    pub description: Option<String>,
+
+    /// Source file path.
+    pub source_path: Option<PathBuf>,
+
+    /// Page count (for PDFs).
+    pub page_count: Option<usize>,
+
+    /// The document tree structure.
+    pub tree: Option<DocumentTree>,
+
+    /// Per-page content (for PDFs).
+    pub pages: Vec<PageContent>,
+
+    /// Indexing pipeline metrics.
+    pub metrics: Option<IndexMetrics>,
+
+    /// Pre-computed reasoning index for retrieval acceleration.
+    pub reasoning_index: Option<ReasoningIndex>,
+
+    /// Pre-computed navigation index for agent-based retrieval.
+    pub navigation_index: Option<NavigationIndex>,
+
+    /// Key concepts extracted from the document.
+    pub concepts: Vec<Concept>,
+}
+
+impl IndexedDocument {
+    /// Create a new indexed document.
+    pub fn new(id: impl Into<String>, format: DocumentFormat) -> Self {
+        Self {
+            id: id.into(),
+            format,
+            name: String::new(),
+            description: None,
+            source_path: None,
+            page_count: None,
+            tree: None,
+            pages: Vec::new(),
+            metrics: None,
+            reasoning_index: None,
+            navigation_index: None,
+            concepts: Vec::new(),
+        }
+    }
+
+    /// Set the document name.
+    pub fn with_name(mut self, name: impl Into<String>) -> Self {
+        self.name = name.into();
+        self
+    }
+
+    /// Set the document description.
+    pub fn with_description(mut self, desc: impl Into<String>) -> Self {
+        self.description = Some(desc.into());
+        self
+    }
+
+    /// Set the source path.
+    pub fn with_source_path(mut self, path: impl Into<PathBuf>) -> Self {
+        self.source_path = Some(path.into());
+        self
+    }
+
+    /// Set the page count.
+    pub fn with_page_count(mut self, count: usize) -> Self {
+        self.page_count = Some(count);
+        self
+    }
+
+    /// Set the document tree.
+    pub fn with_tree(mut self, tree: DocumentTree) -> Self {
+        self.tree = Some(tree);
+        self
+    }
+
+    /// Set the indexing metrics.
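+    ///
+    /// A minimal sketch of the builder chain as the indexer uses it
+    /// (`tree` and `metrics` are assumed to come from a pipeline run):
+    ///
+    /// ```rust,ignore
+    /// let doc = IndexedDocument::new("doc-1", DocumentFormat::Markdown)
+    ///     .with_name("notes")
+    ///     .with_tree(tree)
+    ///     .with_metrics(metrics);
+    /// ```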
+    pub fn with_metrics(mut self, metrics: IndexMetrics) -> Self {
+        self.metrics = Some(metrics);
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_indexed_document() {
+        let doc = IndexedDocument::new("doc-1", DocumentFormat::Markdown)
+            .with_name("Test Document")
+            .with_description("A test document");
+
+        assert_eq!(doc.id, "doc-1");
+        assert_eq!(doc.name, "Test Document");
+        assert!(doc.tree.is_none());
+    }
+}
diff --git a/vectorless-core/vectorless-engine/src/indexer.rs b/vectorless-core/vectorless-engine/src/indexer.rs
new file mode 100644
index 00000000..559ccef7
--- /dev/null
+++ b/vectorless-core/vectorless-engine/src/indexer.rs
@@ -0,0 +1,387 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document indexing client.
+//!
+//! This module provides document indexing operations including
+//! format detection, parsing, and tree building.
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use vectorless::client::IndexerClient;
+//!
+//! let indexer = IndexerClient::with_llm(llm_client);
+//!
+//! let doc = indexer
+//!     .index(&source, None, pipeline_options)
+//!     .await?;
+//!
+//! println!("Indexed: {} ({} nodes)", doc.id, doc.tree.as_ref().map(|t| t.node_count()).unwrap_or(0));
+//! ```

+use std::path::Path;
+use std::sync::Arc;
+
+use tracing::info;
+use uuid::Uuid;
+
+use vectorless_document::DocumentFormat;
+use vectorless_error::{Error, Result};
+use vectorless_index::{IndexInput, IndexMode, PipelineExecutor, PipelineOptions};
+use vectorless_llm::LlmClient;
+use vectorless_storage::{DocumentMeta, PersistedDocument};
+
+use super::index_context::IndexSource;
+use super::indexed_document::IndexedDocument;
+use vectorless_events::{EventEmitter, IndexEvent};
+
+/// Document indexing client.
+///
+/// Provides operations for parsing and indexing documents.
+/// Each index operation creates a fresh pipeline executor, enabling
+/// true parallel document indexing without mutex contention.
+pub(crate) struct IndexerClient {
+    /// Factory for creating pipeline executors (one per index operation).
+    executor_factory: Arc<dyn Fn() -> PipelineExecutor + Send + Sync>,
+
+    /// Event emitter.
+    events: EventEmitter,
+}
+
+impl IndexerClient {
+    /// Create with an LLM-enabled pipeline.
+    pub fn with_llm(client: LlmClient) -> Self {
+        let client = Arc::new(client);
+        Self {
+            executor_factory: Arc::new(move || PipelineExecutor::with_llm((*client).clone())),
+            events: EventEmitter::new(),
+        }
+    }
+
+    /// Create with a custom executor factory (for testing).
+    pub(crate) fn with_factory(factory: Arc<dyn Fn() -> PipelineExecutor + Send + Sync>) -> Self {
+        Self {
+            executor_factory: factory,
+            events: EventEmitter::new(),
+        }
+    }
+
+    /// Create with event emitter.
+    pub fn with_events(mut self, events: EventEmitter) -> Self {
+        self.events = events;
+        self
+    }
+
+    /// Index a document from an index context.
+    ///
+    /// The caller provides fully constructed [`PipelineOptions`]
+    /// (including checkpoint dir, reasoning config, etc.).
+    pub async fn index(
+        &self,
+        source: &IndexSource,
+        name: Option<&str>,
+        pipeline_options: PipelineOptions,
+    ) -> Result<IndexedDocument> {
+        self.index_with_existing(source, name, pipeline_options, None)
+            .await
+    }
+
+    /// Index a document, optionally reusing an existing tree for incremental updates.
+    ///
+    /// The caller provides fully constructed [`PipelineOptions`].
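+    ///
+    /// A minimal sketch of an incremental call (`source`, `options`, and the
+    /// stored tree are assumed to come from the engine's incremental resolver):
+    ///
+    /// ```rust,ignore
+    /// let doc = indexer
+    ///     .index_with_existing(&source, None, options, Some(&stored.tree))
+    ///     .await?;
+    /// ```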
+    pub async fn index_with_existing(
+        &self,
+        source: &IndexSource,
+        name: Option<&str>,
+        mut pipeline_options: PipelineOptions,
+        existing_tree: Option<&crate::DocumentTree>,
+    ) -> Result<IndexedDocument> {
+        pipeline_options.existing_tree = existing_tree.cloned();
+        match source {
+            IndexSource::Path(path) => self.index_from_path(path, name, pipeline_options).await,
+            IndexSource::Content { data, format } => {
+                self.index_from_content(data, *format, name, pipeline_options)
+                    .await
+            }
+            IndexSource::Bytes { data, format } => {
+                self.index_from_bytes(data, *format, name, pipeline_options)
+                    .await
+            }
+        }
+    }
+
+    /// Index from a file path.
+    ///
+    /// Uses the format from `PipelineOptions.mode` — no redundant detection.
+    async fn index_from_path(
+        &self,
+        path: &Path,
+        name: Option<&str>,
+        pipeline_options: PipelineOptions,
+    ) -> Result<IndexedDocument> {
+        let path = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
+
+        // Validate file before indexing
+        let validation = vectorless_utils::validate_file(&path)?;
+        if !validation.valid {
+            return Err(Error::Parse(
+                validation
+                    .errors
+                    .first()
+                    .cloned()
+                    .unwrap_or_else(|| "Invalid file".to_string()),
+            ));
+        }
+        for warning in &validation.warnings {
+            tracing::warn!("{}", warning);
+        }
+
+        // Resolve format from pipeline options (set by Engine) — no re-detection
+        let format = Self::format_from_mode(&pipeline_options.mode);
+
+        let input = IndexInput::file(&path);
+        self.run_pipeline(
+            input,
+            format,
+            &path.display().to_string(),
+            name,
+            Some(&path),
+            pipeline_options,
+        )
+        .await
+    }
+
+    /// Index from a content string.
+    async fn index_from_content(
+        &self,
+        content: &str,
+        format: DocumentFormat,
+        name: Option<&str>,
+        pipeline_options: PipelineOptions,
+    ) -> Result<IndexedDocument> {
+        // Validate content before indexing
+        let validation = vectorless_utils::validate_content(content, format);
+        if !validation.valid {
+            return Err(Error::Parse(
+                validation
+                    .errors
+                    .first()
+                    .cloned()
+                    .unwrap_or_else(|| "Invalid content".to_string()),
+            ));
+        }
+
+        let input = IndexInput::content(content);
+        self.run_pipeline(
+            input,
+            format,
+            name.unwrap_or("content"),
+            name,
+            None,
+            pipeline_options,
+        )
+        .await
+    }
+
+    /// Index from binary data.
+    async fn index_from_bytes(
+        &self,
+        bytes: &[u8],
+        format: DocumentFormat,
+        name: Option<&str>,
+        pipeline_options: PipelineOptions,
+    ) -> Result<IndexedDocument> {
+        // Validate bytes before indexing
+        let validation = vectorless_utils::validate_bytes(bytes, format);
+        if !validation.valid {
+            return Err(Error::Parse(
+                validation
+                    .errors
+                    .first()
+                    .cloned()
+                    .unwrap_or_else(|| "Invalid bytes".to_string()),
+            ));
+        }
+
+        info!(
+            "Indexing {:?} document from bytes ({} bytes)",
+            format,
+            bytes.len()
+        );
+
+        let input = IndexInput::bytes(bytes);
+        self.run_pipeline(
+            input,
+            format,
+            name.unwrap_or("bytes"),
+            name,
+            None,
+            pipeline_options,
+        )
+        .await
+    }
+
+    /// Common pipeline execution: emit events → run pipeline → build result.
+    #[tracing::instrument(skip_all, fields(format = ?format, source = %source_label))]
+    async fn run_pipeline(
+        &self,
+        input: IndexInput,
+        format: DocumentFormat,
+        source_label: &str,
+        name: Option<&str>,
+        path: Option<&Path>,
+        pipeline_options: PipelineOptions,
+    ) -> Result<IndexedDocument> {
+        self.events.emit_index(IndexEvent::Started {
+            path: source_label.to_string(),
+        });
+
+        let doc_id = Uuid::new_v4().to_string();
+        self.events
+            .emit_index(IndexEvent::FormatDetected { format });
+
+        info!("Indexing {:?} document: {}", format, source_label);
+
+        let mut executor = (self.executor_factory)();
+        let result = executor.execute(input, pipeline_options).await?;
+
+        self.build_indexed_document(doc_id, result, format, name, path)
+    }
+
+    /// Build an indexed document from a pipeline result.
+    fn build_indexed_document(
+        &self,
+        doc_id: String,
+        result: vectorless_index::PipelineResult,
+        format: DocumentFormat,
+        name: Option<&str>,
+        path: Option<&Path>,
+    ) -> Result<IndexedDocument> {
+        let tree = result
+            .tree
+            .ok_or_else(|| Error::Parse("Document tree not generated".to_string()))?;
+
+        let node_count = tree.node_count();
+        self.events.emit_index(IndexEvent::TreeBuilt { node_count });
+
+        let doc_name = name
+            .map(str::to_string)
+            .or_else(|| {
+                path.and_then(|p| p.file_stem())
+                    .map(|s| s.to_string_lossy().to_string())
+            })
+            .unwrap_or_else(|| result.name.clone());
+
+        let mut doc = IndexedDocument::new(&doc_id, format)
+            .with_name(&doc_name)
+            .with_tree(tree)
+            .with_metrics(result.metrics);
+
+        doc.reasoning_index = result.reasoning_index;
+        doc.navigation_index = result.navigation_index;
+        doc.concepts = result.concepts;
+
+        if let Some(p) = path {
+            doc = doc.with_source_path(p);
+        }
+
+        if let Some(desc) = &result.description {
+            doc = doc.with_description(desc);
+        }
+
+        if let Some(page_count) = result.page_count {
+            doc = doc.with_page_count(page_count);
+        }
+
+        info!("Indexing complete: {} ({} nodes)", doc_id, node_count);
+        self.events.emit_index(IndexEvent::Complete { doc_id });
+
+        Ok(doc)
+    }
+
+    /// Resolve `DocumentFormat` from `PipelineOptions.mode`.
+    ///
+    /// Falls back to Markdown for `Auto` mode (the engine resolves
+    /// `Auto` to a concrete format before calling the indexer).
+    fn format_from_mode(mode: &IndexMode) -> DocumentFormat {
+        match mode {
+            IndexMode::Markdown => DocumentFormat::Markdown,
+            IndexMode::Pdf => DocumentFormat::Pdf,
+            IndexMode::Auto => DocumentFormat::Markdown,
+        }
+    }
+
+    /// Detect document format from file extension.
+    pub(crate) fn detect_format_from_path(&self, path: &Path) -> Result<DocumentFormat> {
+        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
+        DocumentFormat::from_extension(ext)
+            .ok_or_else(|| Error::Parse(format!("Unsupported format: {}", ext)))
+    }
+
+    /// Convert [`IndexedDocument`] to [`PersistedDocument`].
+    ///
+    /// This is an associated function — it does not depend on client state.
+    /// Stores content and logic fingerprints from the pipeline options.
+    ///
+    /// Uses async file I/O to avoid blocking the tokio runtime.
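+    ///
+    /// A minimal sketch of the save path (`workspace.save` is illustrative;
+    /// `doc` and `opts` come from a prior index run):
+    ///
+    /// ```rust,ignore
+    /// let persisted = IndexerClient::to_persisted(doc, &opts).await;
+    /// workspace.save(&persisted).await?;
+    /// ```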
+    pub async fn to_persisted(
+        doc: IndexedDocument,
+        pipeline_options: &PipelineOptions,
+    ) -> PersistedDocument {
+        let mut meta = DocumentMeta::new(&doc.id, &doc.name, doc.format.extension())
+            .with_source_path(
+                doc.source_path
+                    .as_ref()
+                    .map(|p| p.to_string_lossy().to_string())
+                    .unwrap_or_default(),
+            )
+            .with_description(doc.description.clone().unwrap_or_default());
+
+        // Compute content fingerprint for incremental indexing (async I/O)
+        if let Some(ref path) = doc.source_path {
+            if let Ok(bytes) = tokio::fs::read(path).await {
+                let fp = vectorless_utils::fingerprint::Fingerprint::from_bytes(&bytes);
+                meta = meta.with_fingerprint(fp);
+            }
+        }
+
+        // Store logic fingerprint (pipeline configuration hash)
+        let logic_fp = pipeline_options.logic_fingerprint();
+        meta = meta.with_logic_fingerprint(logic_fp);
+
+        let tree = doc.tree.expect("IndexedDocument must have a tree");
+
+        // Extract stats from metrics
+        let node_count = tree.node_count();
+        let (summary_tokens, duration_ms) = if let Some(ref m) = doc.metrics {
+            (m.total_tokens_generated, m.total_time_ms())
+        } else {
+            (0, 0)
+        };
+
+        let mut persisted = PersistedDocument::new(meta, tree);
+
+        for page in doc.pages {
+            persisted.add_page(page.page, &page.content);
+        }
+
+        persisted.reasoning_index = doc.reasoning_index;
+        persisted.navigation_index = doc.navigation_index;
+        persisted.concepts = doc.concepts;
+        persisted
+            .meta
+            .update_processing_stats(node_count, summary_tokens, duration_ms);
+
+        persisted
+    }
+}
+
+impl Clone for IndexerClient {
+    fn clone(&self) -> Self {
+        Self {
+            executor_factory: Arc::clone(&self.executor_factory),
+            events: self.events.clone(),
+        }
+    }
+}
diff --git a/vectorless-core/vectorless-engine/src/lib.rs b/vectorless-core/vectorless-engine/src/lib.rs
new file mode 100644
index 00000000..9976c037
--- /dev/null
+++ b/vectorless-core/vectorless-engine/src/lib.rs
@@ -0,0 +1,106 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! High-level client API for document indexing and retrieval.
+//!
+//! This module provides the main entry point for using vectorless:
+//! - [`Engine`] — The main client for indexing and querying documents
+//! - [`EngineBuilder`] — Builder pattern for client configuration
+//! - [`IndexContext`] — Unified input for document indexing
+//! - [`QueryContext`] — Unified input for document queries
+//!
+//! # Quick Start
+//!
+//! ```rust,no_run
+//! use vectorless::client::{EngineBuilder, IndexContext, QueryContext};
+//!
+//! # #[tokio::main]
+//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
+//! // Create a client with default settings
+//! let client = EngineBuilder::new()
+//!     .with_key("sk-...")
+//!     .with_model("gpt-4o")
+//!     .build()
+//!     .await?;
+//!
+//! // Index a document
+//! let result = client.index(IndexContext::from_path("./document.md")).await?;
+//! let doc_id = result.doc_id().unwrap();
+//!
+//! // Query the document
+//! let result = client.query(
+//!     QueryContext::new("What is this?").with_doc_ids(vec![doc_id.to_string()])
+//! ).await?;
+//! if let Some(item) = result.single() {
+//!     println!("{}", item.content);
+//! }
+//!
+//! // List all documents
+//! for doc in client.list().await? {
+//!     println!("{}: {}", doc.id, doc.name);
+//! }
+//! # Ok(())
+//! # }
+//! ```
+//!
+//! # Events and Progress
+//!
+//! Monitor operation progress with events:
+//!
+//! ```rust,no_run
+//! # use vectorless::client::{EngineBuilder, EventEmitter, IndexEvent};
+//! # #[tokio::main]
+//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
+//! let events = EventEmitter::new()
+//!     .on_index(|e| match e {
+//!         IndexEvent::Complete { doc_id } => println!("Indexed: {}", doc_id),
+//!         _ => {}
+//!     });
+//!
+//! let client = EngineBuilder::new()
+//!     .with_events(events)
+//!     .build()
+//!     .await?;
+//! # Ok(())
+//! # }
+//! ```

+mod builder;
+mod engine;
+mod index_context;
+mod indexed_document;
+mod indexer;
+mod query_context;
+mod retriever;
+pub(crate) mod test_support;
+mod types;
+mod workspace;
+
+// ============================================================
+// Main Types
+// ============================================================
+
+pub use builder::{BuildError, EngineBuilder};
+pub use engine::Engine;
+
+// ============================================================
+// Context Types
+// ============================================================
+
+pub use index_context::IndexContext;
+pub use query_context::QueryContext;
+
+// ============================================================
+// Result & Info Types
+// ============================================================
+
+pub use types::{
+    Confidence, EvidenceItem, FailedItem, IndexItem, IndexMode, IndexOptions, IndexResult,
+    QueryMetrics, QueryResult, QueryResultItem,
+};
+
+// ============================================================
+// Parser Types (needed for IndexContext::from_content)
+// ============================================================
+
+pub use vectorless_document::DocumentFormat;
diff --git a/vectorless-core/vectorless-engine/src/query_context.rs b/vectorless-core/vectorless-engine/src/query_context.rs
new file mode 100644
index 00000000..48d9ad2a
--- /dev/null
+++ b/vectorless-core/vectorless-engine/src/query_context.rs
@@ -0,0 +1,179 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Query context for the Engine API.
+//!
+//! [`QueryContext`] encapsulates all parameters for a query operation,
+//! supporting queries against specific documents or the entire workspace.
+//!
+//! # Example
+//!
+//! ```rust
+//! use vectorless::client::QueryContext;
+//!
+//! // Query specific documents
+//! let ctx = QueryContext::new("What is the total revenue?")
+//!     .with_doc_ids(vec!["doc-1".to_string()]);
+//!
+//! // Query the entire workspace
+//! let ctx = QueryContext::new("Explain the algorithm");
+//! ```

+/// Query scope — determines which documents to search.
+#[derive(Debug, Clone)]
+pub(crate) enum QueryScope {
+    /// Query specific documents.
+    Documents(Vec<String>),
+    /// Query all documents in the workspace.
+    Workspace,
+}
+
+/// Context for a query operation.
+///
+/// Supports two scopes:
+/// - **Specific documents** — via `with_doc_ids()`
+/// - **Entire workspace** — default when no scope is set
+///
+/// # Convenience
+///
+/// Implements `From<String>` and `From<&str>` for quick construction:
+///
+/// ```rust
+/// use vectorless::client::QueryContext;
+///
+/// let ctx: QueryContext = "What is this?".into();
+/// ```
+#[derive(Debug, Clone)]
+pub struct QueryContext {
+    /// The query text.
+    pub(crate) query: String,
+    /// Target scope.
+    pub(crate) scope: QueryScope,
+    /// Per-operation timeout (seconds). `None` means no timeout.
+    pub(crate) timeout_secs: Option<u64>,
+    /// Force Orchestrator analysis even when documents are specified.
+    ///
+    /// When `true`, the Orchestrator analyzes DocCards to select relevant
+    /// documents instead of dispatching all specified docs directly.
+    /// Useful when the user wants the system to decide which documents
+    /// (or sections) are most relevant to the query.
+    pub(crate) force_analysis: bool,
+}
+
+impl QueryContext {
+    /// Create a new query context (defaults to workspace scope).
+    pub fn new(query: impl Into<String>) -> Self {
+        Self {
+            query: query.into(),
+            scope: QueryScope::Workspace,
+            timeout_secs: None,
+            force_analysis: false,
+        }
+    }
+
+    /// Set scope to specific documents.
+    ///
+    /// Pass a single ID or multiple IDs to restrict the query
+    /// to those documents only.
+    pub fn with_doc_ids(mut self, doc_ids: Vec<String>) -> Self {
+        self.scope = QueryScope::Documents(doc_ids);
+        self
+    }
+
+    /// Set scope to the entire workspace.
+    pub fn with_workspace(mut self) -> Self {
+        self.scope = QueryScope::Workspace;
+        self
+    }
+
+    /// Set per-operation timeout in seconds.
+    pub fn with_timeout_secs(mut self, secs: u64) -> Self {
+        self.timeout_secs = Some(secs);
+        self
+    }
+
+    /// Force the Orchestrator to analyze documents before dispatching Workers.
+    ///
+    /// By default, when documents are specified via `with_doc_ids()`, the
+    /// Orchestrator skips its analysis phase and dispatches Workers to all
+    /// specified documents directly. Setting this to `true` forces the
+    /// Orchestrator to analyze DocCards and decide which documents are
+    /// relevant, even when the user specified documents explicitly.
+    ///
+    /// This is useful when querying across many documents where only a subset
+    /// is likely relevant to the specific question.
+    pub fn with_force_analysis(mut self, force: bool) -> Self {
+        self.force_analysis = force;
+        self
+    }
+}
+
+impl From<String> for QueryContext {
+    fn from(query: String) -> Self {
+        Self::new(query)
+    }
+}
+
+impl From<&str> for QueryContext {
+    fn from(query: &str) -> Self {
+        Self::new(query)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_query_context_new() {
+        let ctx = QueryContext::new("What is this?");
+        assert_eq!(ctx.query, "What is this?");
+    }
+
+    #[test]
+    fn test_query_context_from_string() {
+        let ctx: QueryContext = "Hello".to_string().into();
+        assert_eq!(ctx.query, "Hello");
+    }
+
+    #[test]
+    fn test_query_context_from_str() {
+        let ctx: QueryContext = "Hello".into();
+        assert_eq!(ctx.query, "Hello");
+    }
+
+    #[test]
+    fn test_single_doc_scope() {
+        let ctx = QueryContext::new("test").with_doc_ids(vec!["doc-1".to_string()]);
+        assert!(
+            matches!(ctx.scope, QueryScope::Documents(ref ids) if ids == &["doc-1".to_string()])
+        );
+    }
+
+    #[test]
+    fn test_multi_doc_scope() {
+        let ctx = QueryContext::new("test").with_doc_ids(vec!["a".into(), "b".into()]);
+        assert!(matches!(ctx.scope, QueryScope::Documents(ref ids) if ids.len() == 2));
+    }
+
+    #[test]
+    fn test_workspace_scope() {
+        let ctx = QueryContext::new("test");
+        assert!(matches!(ctx.scope, QueryScope::Workspace));
+    }
+
+    #[test]
+    fn test_builder_options() {
+        let ctx = QueryContext::new("test")
+            .with_doc_ids(vec!["doc-1".to_string()])
+            .with_timeout_secs(60);
+
+        assert_eq!(ctx.timeout_secs, Some(60));
+    }
+
+    #[test]
+    fn test_query_context_timeout_default() {
+        let ctx = QueryContext::new("test");
+        assert_eq!(ctx.timeout_secs, None);
+    }
+}
diff --git a/vectorless-core/vectorless-engine/src/retriever.rs b/vectorless-core/vectorless-engine/src/retriever.rs
new file mode 100644
index 00000000..f1b12a89
--- /dev/null
+++ b/vectorless-core/vectorless-engine/src/retriever.rs
@@ -0,0 +1,140 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document retrieval client.
+//!
+//! This module provides query and retrieval operations for document content,
+//! dispatching through the retrieval layer to the agent-based system.

+use tracing::info;
+
+use vectorless_agent::{self as agent, config::AgentConfig, events::EventEmitter as AgentEventEmitter};
+use crate::client::types::QueryResult;
+use vectorless_document::{DocumentTree, NavigationIndex, ReasoningIndex};
+use vectorless_error::Result;
+use vectorless_events::{EventEmitter, QueryEvent};
+use vectorless_llm::LlmClient;
+use vectorless_retrieval::{dispatcher, postprocessor};
+
+/// Document retrieval client.
+///
+/// Delegates to the agent-based retrieval system.
+pub(crate) struct RetrieverClient {
+    /// LLM client for agent navigation decisions.
+    llm: LlmClient,
+
+    /// Agent configuration.
+    config: AgentConfig,
+
+    /// Event emitter.
+    events: EventEmitter,
+}
+
+impl RetrieverClient {
+    /// Create a new retriever client with an LLM client.
+    pub fn new(llm: LlmClient) -> Self {
+        Self {
+            llm,
+            config: AgentConfig::default(),
+            events: EventEmitter::new(),
+        }
+    }
+
+    /// Create with event emitter.
+    pub fn with_events(mut self, events: EventEmitter) -> Self {
+        self.events = events;
+        self
+    }
+
+    /// Set custom agent configuration.
+    pub fn with_config(mut self, config: AgentConfig) -> Self {
+        self.config = config;
+        self
+    }
+
+    /// Get a reference to the agent configuration.
+    pub fn config(&self) -> &AgentConfig {
+        &self.config
+    }
+
+    /// Get a reference to the LLM client.
+    pub fn llm(&self) -> &LlmClient {
+        &self.llm
+    }
+
+    /// Query documents through the agent-based retrieval system.
+    ///
+    /// - `skip_analysis = true` → `Scope::Specified` (user-specified docs, skip Orchestrator analysis)
+    /// - `skip_analysis = false` → `Scope::Workspace` (full Orchestrator analysis flow)
+    #[tracing::instrument(skip_all, fields(question = %question, docs = documents.len()))]
+    pub async fn query(
+        &self,
+        documents: &[(DocumentTree, NavigationIndex, ReasoningIndex, String)],
+        question: &str,
+        skip_analysis: bool,
+    ) -> Result<QueryResult> {
+        self.events.emit_query(QueryEvent::Started {
+            query: question.to_string(),
+        });
+
+        info!(
+            docs = documents.len(),
+            skip_analysis, "Querying: {:?}", question
+        );
+
+        let doc_contexts: Vec<agent::DocContext> = documents
+            .iter()
+            .map(|(tree, nav, ridx, id)| agent::DocContext {
+                tree,
+                nav_index: nav,
+                reasoning_index: ridx,
+                doc_name: id.as_str(),
+            })
+            .collect();
+
+        let scope = if skip_analysis {
+            agent::Scope::Specified(doc_contexts)
+        } else {
+            agent::Scope::Workspace(agent::WorkspaceContext::new(doc_contexts))
+        };
+
+        let emitter = AgentEventEmitter::noop();
+        let output =
+            dispatcher::dispatch(question, scope, &self.config, &self.llm, &emitter).await?;
+
+        let fallback_id = documents
+            .first()
+            .map(|(_, _, _, id)| id.as_str())
+            .unwrap_or("");
+        let items = postprocessor::to_results(&output, fallback_id);
+        let result = QueryResult::new_with_items(items);
+
+        self.events.emit_query(QueryEvent::Complete {
+            total_results: result.len(),
+            confidence: result.single().map(|i| i.confidence).unwrap_or(0.0),
+        });
+
+        Ok(result)
+    }
+}
+
+impl Clone for RetrieverClient {
+    fn clone(&self) -> Self {
+        Self {
+            llm: self.llm.clone(),
+            config: self.config.clone(),
+            events: self.events.clone(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_retriever_client_creation() {
+        let _client =
+            RetrieverClient::new(LlmClient::new(vectorless_llm::config::LlmConfig::default()));
+    }
+}
diff --git a/vectorless-core/vectorless-engine/src/test_support.rs b/vectorless-core/vectorless-engine/src/test_support.rs
new file mode 100644
index 00000000..7747ddbe
--- /dev/null
+++ b/vectorless-core/vectorless-engine/src/test_support.rs
@@ -0,0 +1,54 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Test-only helpers for constructing Engine instances without a real LLM.
+//!
+//! This module is exposed via `vectorless::__test_support` and should **only**
+//! be used in integration tests.

+use std::sync::Arc;
+
+use crate::client::engine::Engine;
+use crate::client::indexer::IndexerClient;
+use crate::client::retriever::RetrieverClient;
+use vectorless_config::Config;
+use vectorless_events::EventEmitter;
+use vectorless_index::PipelineExecutor;
+use vectorless_llm::LlmClient;
+use vectorless_llm::config::LlmConfig;
+use vectorless_metrics::MetricsHub;
+use vectorless_storage::Workspace;
+
+/// Build an `Engine` with a no-LLM pipeline for integration testing.
+///
+/// The pipeline skips enhance/summary stages but exercises:
+/// parse → build → validate → split → enrich → optimize.
+///
+/// # Example
+///
+/// ```rust,ignore
+/// let tmp = tempfile::tempdir().unwrap();
+/// let engine = vectorless::__test_support::build_test_engine(tmp.path()).await;
+/// ```
+pub async fn build_test_engine(workspace_dir: &std::path::Path) -> Engine {
+    let config = Config::default();
+
+    // No-LLM indexer: pipeline without enhance stage
+    let executor_factory: Arc<dyn Fn() -> PipelineExecutor + Send + Sync> =
+        Arc::new(|| PipelineExecutor::new());
+    let indexer = IndexerClient::with_factory(executor_factory);
+
+    let workspace = Workspace::new(workspace_dir).await.unwrap();
+    let retriever = RetrieverClient::new(LlmClient::new(LlmConfig::default()));
+
+    Engine::with_components(
+        config,
+        workspace,
+        retriever,
+        indexer,
+        EventEmitter::new(),
+        Arc::new(MetricsHub::with_defaults()),
+    )
+    .await
+    .unwrap()
+}
diff --git a/vectorless-core/vectorless-engine/src/types.rs b/vectorless-core/vectorless-engine/src/types.rs
new file mode 100644
index 00000000..df5c30f4
--- /dev/null
+++ b/vectorless-core/vectorless-engine/src/types.rs
@@ -0,0 +1,536 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Public API types for the client module.
+//!
+//! This module contains all types exposed in the public API.

+use serde::{Deserialize, Serialize};
+
+use vectorless_document::DocumentFormat;
+use vectorless_metrics::IndexMetrics;
+
+// ============================================================
+// Partial Success
+// ============================================================
+
+/// A failed item in a batch operation.
+#[derive(Debug, Clone)]
+pub struct FailedItem {
+    /// Source description (file path, content name, or doc ID).
+    pub source: String,
+    /// Error message.
+    pub error: String,
+}
+
+impl FailedItem {
+    /// Create a new failed item.
+    pub fn new(source: impl Into<String>, error: impl Into<String>) -> Self {
+        Self {
+            source: source.into(),
+            error: error.into(),
+        }
+    }
+}
+
+// ============================================================
+// Index Types
+// ============================================================
+
+/// Document indexing behavior mode.
+///
+/// Controls how the indexer handles existing documents and re-indexing.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub enum IndexMode {
+    /// Default mode - skip if already indexed.
+    ///
+    /// If a document with the same source has already been indexed,
+    /// the operation is skipped and the existing document ID is returned.
+    #[default]
+    Default,
+
+    /// Force re-indexing.
+    ///
+    /// Always re-index the document, even if it has been indexed before.
+    /// A new document ID is generated.
+    Force,
+
+    /// Incremental mode - only re-index changed files.
+    ///
+    /// Re-index only if the file has been modified since the last index.
+    /// For content/bytes sources, this behaves like [`IndexMode::Default`].
+    Incremental,
+}
+
+/// Options for indexing a document.
+#[derive(Debug, Clone)]
+pub struct IndexOptions {
+    /// Indexing mode.
+    pub mode: IndexMode,
+
+    /// Whether to generate summaries using LLM.
+    pub generate_summaries: bool,
+
+    /// Whether to generate node IDs.
+    pub generate_ids: bool,
+
+    /// Whether to generate a document description.
+    pub generate_description: bool,
+
+    /// Whether to expand keywords with LLM-generated synonyms
+    /// during reasoning index construction. Improves recall for
+    /// queries that use different wording than the document.
+    pub enable_synonym_expansion: bool,
+
+    /// Per-operation timeout (seconds). `None` means no timeout.
+    pub timeout_secs: Option<u64>,
+}
+
+impl Default for IndexOptions {
+    fn default() -> Self {
+        Self {
+            mode: IndexMode::Default,
+            generate_summaries: true,
+            generate_ids: true,
+            generate_description: true,
+            enable_synonym_expansion: true,
+            timeout_secs: None,
+        }
+    }
+}
+
+impl IndexOptions {
+    /// Create new index options with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Enable summary generation.
+    pub fn with_summaries(mut self) -> Self {
+        self.generate_summaries = true;
+        self
+    }
+
+    /// Enable document description generation.
+    pub fn with_description(mut self) -> Self {
+        self.generate_description = true;
+        self
+    }
+
+    /// Set the indexing mode.
+    ///
+    /// # Modes
+    ///
+    /// - [`IndexMode::Default`] - Skip if already indexed
+    /// - [`IndexMode::Force`] - Always re-index
+    /// - [`IndexMode::Incremental`] - Only re-index changed files
+    pub fn with_mode(mut self, mode: IndexMode) -> Self {
+        self.mode = mode;
+        self
+    }
+
+    /// Set per-operation timeout in seconds.
+    pub fn with_timeout_secs(mut self, secs: u64) -> Self {
+        self.timeout_secs = Some(secs);
+        self
+    }
+}
+
+// ============================================================
+// Index Result Types
+// ============================================================
+
+/// Result of a document indexing operation.
+#[derive(Debug, Clone)]
+pub struct IndexResult {
+    /// Successfully indexed items.
+    pub items: Vec<IndexItem>,
+
+    /// Items that failed to index (partial success).
+    pub failed: Vec<FailedItem>,
+}
+
+impl IndexResult {
+    /// Create a new index result.
+    pub fn new(items: Vec<IndexItem>) -> Self {
+        Self {
+            items,
+            failed: Vec::new(),
+        }
+    }
+
+    /// Create with both successes and failures.
+    pub fn with_partial(items: Vec<IndexItem>, failed: Vec<FailedItem>) -> Self {
+        Self { items, failed }
+    }
+
+    /// Get the single document ID (convenience for single-document indexing).
+    pub fn doc_id(&self) -> Option<&str> {
+        if self.items.len() == 1 {
+            Some(&self.items[0].doc_id)
+        } else {
+            None
+        }
+    }
+
+    /// Check if the result is empty.
+    pub fn is_empty(&self) -> bool {
+        self.items.is_empty()
+    }
+
+    /// Get the number of indexed items.
+    pub fn len(&self) -> usize {
+        self.items.len()
+    }
+
+    /// Whether any items failed.
+    pub fn has_failures(&self) -> bool {
+        !self.failed.is_empty()
+    }
+
+    /// Total number of sources (success + failed).
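+    ///
+    /// A minimal sketch of reporting partial success (mirrors the unit
+    /// tests in this module):
+    ///
+    /// ```rust,ignore
+    /// if result.has_failures() {
+    ///     eprintln!("{} of {} sources failed", result.failed.len(), result.total());
+    /// }
+    /// ```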
+    pub fn total(&self) -> usize {
+        self.items.len() + self.failed.len()
+    }
+}
+
+/// A single indexed document item.
+#[derive(Debug, Clone)]
+pub struct IndexItem {
+    /// The unique document ID.
+    pub doc_id: String,
+    /// The document name.
+    pub name: String,
+    /// The document format.
+    pub format: DocumentFormat,
+    /// Document description (from root summary).
+    pub description: Option<String>,
+    /// Source file path (if indexed from a file).
+    pub source_path: Option<String>,
+    /// Page count (for PDFs).
+    pub page_count: Option<usize>,
+    /// Indexing pipeline metrics (timing, LLM usage, node stats).
+    pub metrics: Option<IndexMetrics>,
+}
+
+impl IndexItem {
+    /// Create a new index item.
+    pub fn new(
+        doc_id: impl Into<String>,
+        name: impl Into<String>,
+        format: DocumentFormat,
+        description: Option<String>,
+        page_count: Option<usize>,
+    ) -> Self {
+        Self {
+            doc_id: doc_id.into(),
+            name: name.into(),
+            format,
+            description,
+            source_path: None,
+            page_count,
+            metrics: None,
+        }
+    }
+
+    /// Set the source file path.
+    pub fn with_source_path(mut self, path: impl Into<String>) -> Self {
+        self.source_path = Some(path.into());
+        self
+    }
+
+    /// Set the indexing metrics.
+    pub fn with_metrics(mut self, metrics: IndexMetrics) -> Self {
+        self.metrics = Some(metrics);
+        self
+    }
+
+    /// Set the indexing metrics (optional).
+    pub fn with_metrics_opt(mut self, metrics: Option<IndexMetrics>) -> Self {
+        self.metrics = metrics;
+        self
+    }
+}
+
+// ============================================================
+// Query Types
+// ============================================================
+
+/// A single piece of evidence with source attribution.
+#[derive(Debug, Clone)]
+pub struct EvidenceItem {
+    /// Section title where this evidence was found.
+    pub title: String,
+    /// Navigation path (e.g., "Root/Chapter 1/Section 1.2").
+    pub path: String,
+    /// Raw evidence content.
+    pub content: String,
+    /// Source document name (set in multi-doc scenarios).
+    pub doc_name: Option<String>,
+}
+
+/// Query execution metrics.
+#[derive(Debug, Clone, Default)]
+pub struct QueryMetrics {
+    /// Number of LLM calls made.
+    pub llm_calls: u32,
+    /// Number of navigation rounds used.
+    pub rounds_used: u32,
+    /// Number of distinct nodes visited.
+    pub nodes_visited: usize,
+    /// Number of evidence items collected.
+    pub evidence_count: usize,
+    /// Total characters of collected evidence.
+    pub evidence_chars: usize,
+}
+
+/// Confidence score of the query result (0.0–1.0).
+///
+/// Derived from the LLM's evaluate() step — whether evidence was deemed
+/// sufficient and how many replan rounds were needed.
+pub type Confidence = f32;
+
+/// A single document's query result.
+#[derive(Debug, Clone)]
+pub struct QueryResultItem {
+    /// The document ID.
+    pub doc_id: String,
+
+    /// Matching node IDs (navigation paths).
+    pub node_ids: Vec<String>,
+
+    /// Synthesized answer or raw evidence content.
+    pub content: String,
+
+    /// Evidence items that contributed to this result, with source attribution.
+    pub evidence: Vec<EvidenceItem>,
+
+    /// Execution metrics for this query.
+    pub metrics: Option<QueryMetrics>,
+
+    /// Confidence score (0.0–1.0) — derived from LLM evaluation.
+    pub confidence: Confidence,
+}
+
+/// Result of a document query.
+///
+/// Contains results from one or more documents. For single-document queries,
+/// `items` has one entry. For multi-document or workspace queries, it has
+/// one entry per document that matched.
+#[derive(Debug, Clone)]
+pub struct QueryResult {
+    /// Query results per document.
+    pub items: Vec<QueryResultItem>,
+
+    /// Documents that failed during a multi-doc query.
+    pub failed: Vec<FailedItem>,
+}
+
+impl QueryResult {
+    /// Create a new, empty query result.
+    pub fn new() -> Self {
+        Self {
+            items: Vec::new(),
+            failed: Vec::new(),
+        }
+    }
+
+    /// Create a query result with items.
+    pub fn new_with_items(items: Vec<QueryResultItem>) -> Self {
+        Self {
+            items,
+            failed: Vec::new(),
+        }
+    }
+
+    /// Create a query result with a single item.
+    pub fn from_single(item: QueryResultItem) -> Self {
+        Self {
+            items: vec![item],
+            failed: Vec::new(),
+        }
+    }
+
+    /// Create with both successes and failures.
+    pub fn with_partial(items: Vec<QueryResultItem>, failed: Vec<FailedItem>) -> Self {
+        Self { items, failed }
+    }
+
+    /// Check if the result is empty.
+    pub fn is_empty(&self) -> bool {
+        self.items.is_empty()
+    }
+
+    /// Get the number of result items.
+    pub fn len(&self) -> usize {
+        self.items.len()
+    }
+
+    /// Get the first (single-doc) result item, if any.
+    pub fn single(&self) -> Option<&QueryResultItem> {
+        self.items.first()
+    }
+
+    /// Whether any documents failed.
+    pub fn has_failures(&self) -> bool {
+        !self.failed.is_empty()
+    }
+}
+
+impl Default for QueryResult {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+// ============================================================
+// Document Info Types
+// ============================================================
+
+/// Document info for listing.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DocumentInfo {
+    /// Document ID.
+    pub id: String,
+
+    /// Document name.
+    pub name: String,
+
+    /// Document format.
+    pub format: String,
+
+    /// Document description.
+    pub description: Option<String>,
+
+    /// Source file path.
+    pub source_path: Option<String>,
+
+    /// Page count (for PDFs).
+    pub page_count: Option<usize>,
+
+    /// Line count (for text files).
+    pub line_count: Option<usize>,
+}
+
+impl DocumentInfo {
+    /// Create a new document info.
+    pub fn new(id: impl Into<String>, name: impl Into<String>) -> Self {
+        Self {
+            id: id.into(),
+            name: name.into(),
+            format: String::new(),
+            description: None,
+            source_path: None,
+            page_count: None,
+            line_count: None,
+        }
+    }
+
+    /// Set the format.
+    pub fn with_format(mut self, format: impl Into<String>) -> Self {
+        self.format = format.into();
+        self
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_index_options() {
+        let options = IndexOptions::new()
+            .with_summaries()
+            .with_mode(IndexMode::Force);
+
+        assert!(options.generate_summaries);
+        assert_eq!(options.mode, IndexMode::Force);
+    }
+
+    #[test]
+    fn test_index_options_timeout() {
+        let opts = IndexOptions::new().with_timeout_secs(30);
+        assert_eq!(opts.timeout_secs, Some(30));
+
+        let default = IndexOptions::default();
+        assert_eq!(default.timeout_secs, None);
+    }
+
+    #[test]
+    fn test_query_result() {
+        let result = QueryResult::new();
+        assert!(result.is_empty());
+        assert_eq!(result.len(), 0);
+    }
+
+    #[test]
+    fn test_query_result_single() {
+        let item = QueryResultItem {
+            doc_id: "doc-1".into(),
+            node_ids: vec!["n1".into()],
+            content: "content".into(),
+            evidence: vec![],
+            metrics: None,
+            confidence: 0.9,
+        };
+        let result = QueryResult::from_single(item);
+        assert!(!result.is_empty());
+        assert_eq!(result.len(), 1);
+        assert!(result.single().is_some());
+        assert_eq!(result.single().unwrap().doc_id, "doc-1");
+    }
+
+    #[test]
+    fn test_document_info() {
+        let info = DocumentInfo::new("doc-1", "Test").with_format("markdown");
+
+        assert_eq!(info.id, "doc-1");
+        assert_eq!(info.format, "markdown");
+    }
+
+    #[test]
+    fn test_index_result() {
+        let item = IndexItem::new("doc-1", "Test", DocumentFormat::Markdown, None, None);
+        let result = IndexResult::new(vec![item]);
+
+        assert_eq!(result.doc_id(), Some("doc-1"));
+        assert_eq!(result.len(), 1);
+        assert!(!result.is_empty());
+    }
+
+    #[test]
+    fn test_index_result_empty() {
+        let result = IndexResult::new(vec![]);
+        assert!(result.is_empty());
+        assert_eq!(result.doc_id(), None);
+    }
+
+    #[test]
+    fn test_index_result_multiple() {
+        let items = vec![
+            IndexItem::new("doc-1", "A", DocumentFormat::Markdown, None, None),
+            IndexItem::new("doc-2", "B", DocumentFormat::Pdf, None, None),
+        ];
+        let result = IndexResult::new(items);
+        assert_eq!(result.len(), 2);
+        assert_eq!(result.doc_id(), None);
+    }
+
+    #[test]
+    fn test_partial_success() {
+        let items = vec![IndexItem::new(
+            "doc-1",
+            "A",
+            DocumentFormat::Markdown,
+            None,
+            None,
+        )];
+        let failed = vec![FailedItem::new("missing.pdf", "File not found")];
+        let result = IndexResult::with_partial(items, failed);
+
+        assert_eq!(result.len(), 1);
+        assert!(result.has_failures());
+        assert_eq!(result.total(), 2);
+        assert_eq!(result.failed[0].source, "missing.pdf");
+    }
+}
diff --git a/vectorless-core/vectorless-error/Cargo.toml b/vectorless-core/vectorless-error/Cargo.toml
new file mode 100644
index 00000000..c39a0ab1
--- /dev/null
+++ b/vectorless-core/vectorless-error/Cargo.toml
@@ -0,0 +1,15 @@
+[package]
+name = "vectorless-error"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+
+[dependencies]
+thiserror = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/vectorless-core/vectorless-error/src/error.rs b/vectorless-core/vectorless-error/src/error.rs
new file mode 100644
index 00000000..36acf0e5
--- /dev/null
+++ b/vectorless-core/vectorless-error/src/error.rs
@@ -0,0 +1,329 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Error types for the vectorless library.
+//!
+//! This module provides a comprehensive error type hierarchy for all operations.
+//! All errors are consolidated into [`enum@Error`] with specific variants for each category.

+use thiserror::Error;
+
+/// The main error type for vectorless operations.
+#[derive(Debug, Error)]
+pub enum Error {
+    // =========================================================================
+    // Document & Parsing Errors
+    // =========================================================================
+    /// An error occurred while parsing a document.
+    #[error("Document parsing error: {0}")]
+    Parse(String),
+
+    /// Unsupported document format.
+    #[error("Unsupported document format: {0}")]
+    UnsupportedFormat(String),
+
+    /// Invalid document structure.
+    #[error("Invalid document structure: {0}")]
+    InvalidStructure(String),
+
+    // =========================================================================
+    // Index Errors
+    // =========================================================================
+    /// An error occurred while building the index.
+    #[error("Index building error: {0}")]
+    IndexBuild(String),
+
+    /// Index not found.
+    #[error("Index not found: {0}")]
+    IndexNotFound(String),
+
+    /// Index corrupted.
+    #[error("Index corrupted: {0}")]
+    IndexCorrupted(String),
+
+    /// Document graph build error.
+    #[error("Document graph build error: {0}")]
+    GraphBuild(String),
+
+    // =========================================================================
+    // Retrieval Errors
+    // =========================================================================
+    /// An error occurred during retrieval.
+    #[error("Retrieval error: {0}")]
+    Retrieval(String),
+
+    /// No relevant content found.
+    #[error("No relevant content found for query")]
+    NoRelevantContent,
+
+    /// Search timeout.
+    #[error("Search timeout after {0}ms")]
+    SearchTimeout(u64),
+
+    // =========================================================================
+    // LLM Errors
+    // =========================================================================
+    /// An error occurred during LLM call (transient: network, timeout).
+    #[error("LLM error: {0}")]
+    Llm(String),
+
+    /// LLM rate limit exceeded.
+    #[error("LLM rate limit exceeded, retry after {0}ms")]
+    RateLimitExceeded(u64),
+
+    /// LLM quota exceeded.
+    #[error("LLM quota exceeded")]
+    QuotaExceeded,
+
+    /// LLM reasoning failure — model responded but output is unusable.
+    /// Not transient. Do not retry the same prompt.
+    #[error("LLM reasoning failure at '{stage}': {detail}")]
+    LlmReasoning {
+        /// The pipeline stage where reasoning failed.
+        stage: String,
+        /// Why the output was unusable.
+        detail: String,
+    },
+
+    // =========================================================================
+    // Summary Errors
+    // =========================================================================
+    /// An error occurred during summarization.
+    #[error("Summarization error: {0}")]
+    Summarization(String),
+
+    /// Summary too long.
+    #[error("Summary exceeds maximum length: {0} tokens")]
+    SummaryTooLong(usize),
+
+    // =========================================================================
+    // Storage Errors
+    // =========================================================================
+    /// An error occurred during I/O operations.
+    #[error("IO error: {0}")]
+    Io(#[from] std::io::Error),
+
+    /// Workspace error.
+    #[error("Workspace error: {0}")]
+    Workspace(String),
+
+    /// Cache error.
+    #[error("Cache error: {0}")]
+    Cache(String),
+
+    /// Serialization error.
+    #[error("Serialization error: {0}")]
+    Serialization(String),
+
+    /// Document not found.
+    #[error("Document not found: {0}")]
+    DocumentNotFound(String),
+
+    /// Checksum mismatch.
+    #[error("Checksum mismatch: {0}")]
+    ChecksumMismatch(String),
+
+    /// Workspace locked by another process.
+    #[error("Workspace locked by another process")]
+    WorkspaceLocked,
+
+    /// Format version mismatch.
+    #[error("Format version mismatch: {0}")]
+    VersionMismatch(String),
+
+    // =========================================================================
+    // Configuration Errors
+    // =========================================================================
+    /// TOML parsing error.
+    #[error("TOML parsing error: {0}")]
+    Toml(String),
+
+    /// Invalid configuration.
+    #[error("Invalid configuration: {0}")]
+    Config(String),
+
+    /// Missing required configuration.
+    #[error("Missing required configuration: {0}")]
+    MissingConfig(String),
+
+    // =========================================================================
+    // Node Errors
+    // =========================================================================
+    /// The requested node was not found.
+    #[error("Node not found: {0}")]
+    NodeNotFound(String),
+
+    // =========================================================================
+    // Input Validation Errors
+    // =========================================================================
+    /// Invalid input.
+    #[error("Invalid input: {0}")]
+    InvalidInput(String),
+
+    /// Empty input.
+    #[error("Empty input: {field}")]
+    EmptyInput {
+        /// The field that was empty.
+        field: String,
+    },
+
+    /// Out of range.
+    #[error("{field} out of range: expected {min}-{max}, got {actual}")]
+    OutOfRange {
+        /// The field that was out of range.
+        field: String,
+        /// Minimum allowed value.
+        min: String,
+        /// Maximum allowed value.
+        max: String,
+        /// Actual value received.
+        actual: String,
+    },
+
+    // =========================================================================
+    // Throttle Errors
+    // =========================================================================
+    /// Throttle error.
+    #[error("Throttle error: {0}")]
+    Throttle(String),
+
+    /// Concurrency limit exceeded.
+    #[error("Concurrency limit exceeded: {0} pending")]
+    ConcurrencyLimitExceeded(usize),
+
+    // =========================================================================
+    // Timeout Errors
+    // =========================================================================
+    /// Operation timeout.
+    #[error("Operation timeout: {0}")]
+    Timeout(String),
+
+    // =========================================================================
+    // Generic Errors
+    // =========================================================================
+    /// A generic error with a message.
+    #[error("{0}")]
+    Other(String),
+
+    /// Error with context.
+    #[error("{context}: {source}")]
+    WithContext {
+        /// Additional context describing where/why the error occurred.
+        context: String,
+        /// The underlying error.
+        #[source]
+        source: Box<Error>,
+    },
+}
+
+impl Error {
+    /// Create an error with additional context.
+    #[must_use]
+    pub fn with_context(self, context: impl Into<String>) -> Self {
+        Self::WithContext {
+            context: context.into(),
+            source: Box::new(self),
+        }
+    }
+
+    /// Check if this is a retryable error.
+    #[must_use]
+    pub fn is_retryable(&self) -> bool {
+        matches!(
+            self,
+            Self::RateLimitExceeded(_) | Self::SearchTimeout(_) | Self::Timeout(_) | Self::Llm(_)
+        )
+    }
+
+    /// Check if this is a not found error.
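+    ///
+    /// # Example (illustrative)
+    ///
+    /// A minimal sketch mirroring the unit tests below:
+    ///
+    /// ```rust,ignore
+    /// let err = Error::DocumentNotFound("doc-1".to_string());
+    /// assert!(err.is_not_found());
+    /// assert!(!err.is_retryable());
+    /// ```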
+    #[must_use]
+    pub fn is_not_found(&self) -> bool {
+        matches!(
+            self,
+            Self::NodeNotFound(_) | Self::DocumentNotFound(_) | Self::IndexNotFound(_)
+        )
+    }
+
+    /// Check if this is a timeout error.
+    #[must_use]
+    pub fn is_timeout(&self) -> bool {
+        matches!(self, Self::Timeout(_) | Self::SearchTimeout(_))
+    }
+
+    /// Check if this is a configuration error.
+    #[must_use]
+    pub fn is_config_error(&self) -> bool {
+        matches!(self, Self::Config(_) | Self::MissingConfig(_))
+    }
+
+    /// Create an empty input error.
+    pub fn empty_input(field: impl Into<String>) -> Self {
+        Self::EmptyInput {
+            field: field.into(),
+        }
+    }
+
+    /// Create an out of range error.
+    pub fn out_of_range(
+        field: impl Into<String>,
+        min: impl Into<String>,
+        max: impl Into<String>,
+        actual: impl Into<String>,
+    ) -> Self {
+        Self::OutOfRange {
+            field: field.into(),
+            min: min.into(),
+            max: max.into(),
+            actual: actual.into(),
+        }
+    }
+}
+
+/// A specialized result type for vectorless operations.
+pub type Result<T> = std::result::Result<T, Error>;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_error_context() {
+        let inner = Error::Parse("test".to_string());
+        let with_context = inner.with_context("While processing document");
+
+        let msg = format!("{}", with_context);
+        assert!(msg.contains("While processing document"));
+        assert!(msg.contains("test"));
+    }
+
+    #[test]
+    fn test_is_retryable() {
+        assert!(Error::RateLimitExceeded(1000).is_retryable());
+        assert!(Error::Timeout("test".to_string()).is_retryable());
+        assert!(!Error::Config("test".to_string()).is_retryable());
+    }
+
+    #[test]
+    fn test_is_not_found() {
+        assert!(Error::NodeNotFound("1".to_string()).is_not_found());
+        assert!(Error::DocumentNotFound("doc".to_string()).is_not_found());
+        assert!(!Error::Parse("test".to_string()).is_not_found());
+    }
+
+    #[test]
+    fn test_empty_input() {
+        let err = Error::empty_input("query");
+        let msg = format!("{}", err);
+        assert!(msg.contains("query"));
+    }
+
+    #[test]
+    fn test_out_of_range() {
+        let err = Error::out_of_range("depth", "0", "10", "15");
+        let msg = format!("{}", err);
+        assert!(msg.contains("depth"));
+        assert!(msg.contains("0"));
+        assert!(msg.contains("10"));
+        assert!(msg.contains("15"));
+    }
+}
diff --git a/vectorless-core/vectorless-error/src/lib.rs b/vectorless-core/vectorless-error/src/lib.rs
new file mode 100644
index 00000000..85d6a0bf
--- /dev/null
+++ b/vectorless-core/vectorless-error/src/lib.rs
@@ -0,0 +1,8 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Error types for the vectorless library.
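+//!
+//! # Example (illustrative)
+//!
+//! A minimal sketch of propagating errors through the [`Result`] alias,
+//! using the constructors defined in this crate:
+//!
+//! ```rust,ignore
+//! use vectorless_error::{Error, Result};
+//!
+//! fn validate(query: &str) -> Result<()> {
+//!     if query.is_empty() {
+//!         return Err(Error::empty_input("query"));
+//!     }
+//!     Ok(())
+//! }
+//! ```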
+
+mod error;
+
+pub use error::{Error, Result};
diff --git a/vectorless-core/vectorless-events/Cargo.toml b/vectorless-core/vectorless-events/Cargo.toml
new file mode 100644
index 00000000..c21492d7
--- /dev/null
+++ b/vectorless-core/vectorless-events/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "vectorless-events"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+
+[dependencies]
+parking_lot = { workspace = true }
+vectorless-error = { path = "../vectorless-error" }
+vectorless-document = { path = "../vectorless-document" }
+
+[lints]
+workspace = true
diff --git a/vectorless-core/vectorless-events/src/emitter.rs b/vectorless-core/vectorless-events/src/emitter.rs
new file mode 100644
index 00000000..7804a25c
--- /dev/null
+++ b/vectorless-core/vectorless-events/src/emitter.rs
@@ -0,0 +1,256 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Event emitter for client operations.
+//!
+//! Collects event handlers and dispatches events to them.
+//! Uses `Arc<RwLock<…>>` so cloning shares handlers instead of losing them.
+
+use std::sync::Arc;
+
+use parking_lot::RwLock;
+
+use super::types::{IndexEvent, QueryEvent, WorkspaceEvent};
+
+/// Type alias for sync index handler.
+pub(crate) type IndexHandler = Box<dyn Fn(&IndexEvent) + Send + Sync>;
+
+/// Type alias for sync query handler.
+pub(crate) type QueryHandler = Box<dyn Fn(&QueryEvent) + Send + Sync>;
+
+/// Type alias for sync workspace handler.
+pub(crate) type WorkspaceHandler = Box<dyn Fn(&WorkspaceEvent) + Send + Sync>;
+
+/// Inner state shared via `Arc<RwLock<…>>`.
+struct EventEmitterInner {
+    /// Index event handlers.
+    index_handlers: Vec<IndexHandler>,
+
+    /// Query event handlers.
+    query_handlers: Vec<QueryHandler>,
+
+    /// Workspace event handlers.
+    workspace_handlers: Vec<WorkspaceHandler>,
+}
+
+impl Default for EventEmitterInner {
+    fn default() -> Self {
+        Self {
+            index_handlers: Vec::new(),
+            query_handlers: Vec::new(),
+            workspace_handlers: Vec::new(),
+        }
+    }
+}
+
+/// Event emitter for client operations.
+///
+/// Collects event handlers and dispatches events to them.
+/// Cloning shares the same handlers (via `Arc`), so all clones
+/// dispatch to the same registered handlers.
+///
+/// # Example
+///
+/// ```rust,ignore
+/// let emitter = EventEmitter::new()
+///     .on_index(|e| match e {
+///         IndexEvent::Complete { doc_id } => println!("Indexed: {}", doc_id),
+///         _ => {}
+///     });
+///
+/// let clone = emitter.clone();
+/// // clone shares the same handlers — emitting on either fires on both
+/// ```
+pub struct EventEmitter {
+    inner: Arc<RwLock<EventEmitterInner>>,
+}
+
+impl EventEmitter {
+    /// Create a new event emitter with no handlers.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Add an index event handler.
+    pub fn on_index<F>(self, handler: F) -> Self
+    where
+        F: Fn(&IndexEvent) + Send + Sync + 'static,
+    {
+        self.inner.write().index_handlers.push(Box::new(handler));
+        self
+    }
+
+    /// Add a query event handler.
+    pub fn on_query<F>(self, handler: F) -> Self
+    where
+        F: Fn(&QueryEvent) + Send + Sync + 'static,
+    {
+        self.inner.write().query_handlers.push(Box::new(handler));
+        self
+    }
+
+    /// Add a workspace event handler.
+    pub fn on_workspace<F>(self, handler: F) -> Self
+    where
+        F: Fn(&WorkspaceEvent) + Send + Sync + 'static,
+    {
+        self.inner
+            .write()
+            .workspace_handlers
+            .push(Box::new(handler));
+        self
+    }
+
+    /// Emit an index event.
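+    ///
+    /// Handlers registered via `on_index` are invoked synchronously, in
+    /// registration order.
+    ///
+    /// # Example (illustrative)
+    ///
+    /// ```rust,ignore
+    /// let emitter = EventEmitter::new().on_index(|e| println!("{:?}", e));
+    /// emitter.emit_index(IndexEvent::Started { path: "doc.md".to_string() });
+    /// ```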
+ pub fn emit_index(&self, event: IndexEvent) { + let inner = self.inner.read(); + for handler in &inner.index_handlers { + handler(&event); + } + } + + /// Emit a query event. + pub fn emit_query(&self, event: QueryEvent) { + let inner = self.inner.read(); + for handler in &inner.query_handlers { + handler(&event); + } + } + + /// Emit a workspace event. + pub fn emit_workspace(&self, event: WorkspaceEvent) { + let inner = self.inner.read(); + for handler in &inner.workspace_handlers { + handler(&event); + } + } + + /// Check if there are any handlers registered. + pub fn has_handlers(&self) -> bool { + let inner = self.inner.read(); + !inner.index_handlers.is_empty() + || !inner.query_handlers.is_empty() + || !inner.workspace_handlers.is_empty() + } + + /// Merge another emitter into this one. + pub fn merge(self, other: EventEmitter) -> Self { + let mut other_inner = other.inner.write(); + let mut inner = self.inner.write(); + inner + .index_handlers + .extend(other_inner.index_handlers.drain(..)); + inner + .query_handlers + .extend(other_inner.query_handlers.drain(..)); + inner + .workspace_handlers + .extend(other_inner.workspace_handlers.drain(..)); + drop(inner); + drop(other_inner); + self + } +} + +impl Default for EventEmitter { + fn default() -> Self { + Self { + inner: Arc::new(RwLock::new(EventEmitterInner::default())), + } + } +} + +impl Clone for EventEmitter { + fn clone(&self) -> Self { + Self { + inner: Arc::clone(&self.inner), + } + } +} + +impl std::fmt::Debug for EventEmitter { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let inner = self.inner.read(); + f.debug_struct("EventEmitter") + .field("index_handlers", &inner.index_handlers.len()) + .field("query_handlers", &inner.query_handlers.len()) + .field("workspace_handlers", &inner.workspace_handlers.len()) + .finish() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::{AtomicUsize, Ordering}; + + #[test] + fn test_event_emitter_index() { + let counter = Arc::new(AtomicUsize::new(0)); + let counter_clone = counter.clone(); + + let emitter = EventEmitter::new().on_index(move |_e| { + counter_clone.fetch_add(1, Ordering::SeqCst); + }); + + emitter.emit_index(IndexEvent::Started { + path: "test.md".to_string(), + }); + emitter.emit_index(IndexEvent::Complete { + doc_id: "123".to_string(), + }); + + assert_eq!(counter.load(Ordering::SeqCst), 2); + } + + #[test] + fn test_event_emitter_query() { + let counter = Arc::new(AtomicUsize::new(0)); + let counter_clone = counter.clone(); + + let emitter = EventEmitter::new().on_query(move |_e| { + counter_clone.fetch_add(1, Ordering::SeqCst); + }); + + emitter.emit_query(QueryEvent::Started { + query: "test".to_string(), + }); + + assert_eq!(counter.load(Ordering::SeqCst), 1); + } + + #[test] + fn test_event_emitter_has_handlers() { + let empty = EventEmitter::new(); + assert!(!empty.has_handlers()); + + let with_handler = EventEmitter::new().on_index(|_| {}); + assert!(with_handler.has_handlers()); + } + + #[test] + fn test_event_emitter_clone_shares_handlers() { + let counter = Arc::new(AtomicUsize::new(0)); + let counter_clone = counter.clone(); + + let emitter = EventEmitter::new().on_index(move |_e| { + counter_clone.fetch_add(1, Ordering::SeqCst); + }); + + let cloned = emitter.clone(); + + // Emit on the clone — original's handler should fire + cloned.emit_index(IndexEvent::Started { + path: "test.md".to_string(), + }); + + assert_eq!(counter.load(Ordering::SeqCst), 1); + + // Emit on the original too + 
emitter.emit_index(IndexEvent::Complete { + doc_id: "123".to_string(), + }); + + assert_eq!(counter.load(Ordering::SeqCst), 2); + } +} diff --git a/vectorless-core/vectorless-events/src/lib.rs b/vectorless-core/vectorless-events/src/lib.rs new file mode 100644 index 00000000..e8e55df5 --- /dev/null +++ b/vectorless-core/vectorless-events/src/lib.rs @@ -0,0 +1,31 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Event system for observing and reacting to client operations. +//! +//! This module provides event types and the [`EventEmitter`] for +//! registering handlers and dispatching events during indexing, +//! querying, and workspace operations. +//! +//! # Example +//! +//! ```rust,ignore +//! use vectorless::events::{EventEmitter, IndexEvent}; +//! +//! let emitter = EventEmitter::new() +//! .on_index(|e| match e { +//! IndexEvent::Complete { doc_id } => println!("Indexed: {}", doc_id), +//! _ => {} +//! }); +//! +//! let client = EngineBuilder::new() +//! .with_events(emitter) +//! .build() +//! .await?; +//! ``` + +mod emitter; +mod types; + +pub use emitter::EventEmitter; +pub use types::{IndexEvent, QueryEvent, WorkspaceEvent}; diff --git a/vectorless-core/vectorless-events/src/types.rs b/vectorless-core/vectorless-events/src/types.rs new file mode 100644 index 00000000..45bb5ad4 --- /dev/null +++ b/vectorless-core/vectorless-events/src/types.rs @@ -0,0 +1,138 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Event types for client operations. +//! +//! Provides enums for indexing, query, and workspace events +//! that can be observed via [`EventEmitter`](super::EventEmitter). + +use vectorless_document::DocumentFormat; +use vectorless_document::SufficiencyLevel; + +/// Indexing operation events. +#[derive(Debug, Clone)] +pub enum IndexEvent { + /// Started indexing a document. + Started { + /// File path being indexed. + path: String, + }, + + /// Document format detected. + FormatDetected { + /// Detected format. + format: DocumentFormat, + }, + + /// Parsing progress update. + ParsingProgress { + /// Percentage complete (0-100). + percent: u8, + }, + + /// Document tree built. + TreeBuilt { + /// Number of nodes in the tree. + node_count: usize, + }, + + /// Summary generation progress. + SummaryProgress { + /// Number of summaries completed. + completed: usize, + /// Total summaries to generate. + total: usize, + }, + + /// Indexing completed successfully. + Complete { + /// Generated document ID. + doc_id: String, + }, + + /// Error occurred during indexing. + Error { + /// Error message. + message: String, + }, +} + +/// Query operation events. +#[derive(Debug, Clone)] +pub enum QueryEvent { + /// Search started. + Started { + /// The query string. + query: String, + }, + + /// Node visited during search. + NodeVisited { + /// Node ID. + node_id: String, + /// Node title. + title: String, + /// Relevance score. + score: f32, + }, + + /// Candidate result found. + CandidateFound { + /// Node ID. + node_id: String, + /// Relevance score. + score: f32, + }, + + /// Sufficiency check result. + SufficiencyCheck { + /// Sufficiency level. + level: SufficiencyLevel, + /// Total tokens collected. + tokens: usize, + }, + + /// Query completed. + Complete { + /// Total results found. + total_results: usize, + /// Overall confidence score. + confidence: f32, + }, + + /// Error occurred during query. + Error { + /// Error message. + message: String, + }, +} + +/// Workspace operation events. 
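+///
+/// # Example (illustrative)
+///
+/// A minimal sketch of observing workspace events; the variant fields
+/// match the definitions below.
+///
+/// ```rust,ignore
+/// let emitter = EventEmitter::new().on_workspace(|e| {
+///     if let WorkspaceEvent::Saved { doc_id } = e {
+///         println!("saved {doc_id}");
+///     }
+/// });
+/// ```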
+#[derive(Debug, Clone)]
+pub enum WorkspaceEvent {
+    /// Document saved to workspace.
+    Saved {
+        /// Document ID.
+        doc_id: String,
+    },
+
+    /// Document loaded from workspace.
+    Loaded {
+        /// Document ID.
+        doc_id: String,
+        /// Whether it was a cache hit.
+        cache_hit: bool,
+    },
+
+    /// Document removed from workspace.
+    Removed {
+        /// Document ID.
+        doc_id: String,
+    },
+
+    /// Workspace cleared.
+    Cleared {
+        /// Number of documents removed.
+        count: usize,
+    },
+}
diff --git a/vectorless-core/vectorless-graph/Cargo.toml b/vectorless-core/vectorless-graph/Cargo.toml
new file mode 100644
index 00000000..eb42881e
--- /dev/null
+++ b/vectorless-core/vectorless-graph/Cargo.toml
@@ -0,0 +1,18 @@
+[package]
+name = "vectorless-graph"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+
+[dependencies]
+tracing = { workspace = true }
+vectorless-document = { path = "../vectorless-document" }
+serde = { workspace = true }
+serde_json = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/vectorless-core/vectorless-graph/src/builder.rs b/vectorless-core/vectorless-graph/src/builder.rs
new file mode 100644
index 00000000..6cdf388b
--- /dev/null
+++ b/vectorless-core/vectorless-graph/src/builder.rs
@@ -0,0 +1,400 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document Graph Builder — constructs cross-document relationship graphs.
+//!
+//! This is a standalone builder (not an `IndexStage`) because it operates
+//! on the workspace level across all documents, not on a single document.
+
+use std::collections::HashMap;
+
+use tracing::info;
+
+use super::config::DocumentGraphConfig;
+use super::types::{
+    DocumentGraph, DocumentGraphNode, EdgeEvidence, GraphEdge, SharedKeyword, WeightedKeyword,
+};
+
+/// Intermediate data collected per document during graph building.
+#[derive(Debug, Clone)]
+struct DocProfile {
+    doc_id: String,
+    title: String,
+    format: String,
+    node_count: usize,
+    /// keyword → aggregate weight
+    keywords: HashMap<String, f32>,
+}
+
+/// Builder for constructing a `DocumentGraph` from multiple documents.
+pub struct DocumentGraphBuilder {
+    config: DocumentGraphConfig,
+    profiles: Vec<DocProfile>,
+}
+
+impl DocumentGraphBuilder {
+    /// Create a new builder with the given configuration.
+    pub fn new(config: DocumentGraphConfig) -> Self {
+        Self {
+            config,
+            profiles: Vec::new(),
+        }
+    }
+
+    /// Create a builder with default configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(DocumentGraphConfig::default())
+    }
+
+    /// Add a document's keyword profile to the builder.
+    ///
+    /// `keywords` should map keyword → aggregate weight (from
+    /// `ReasoningIndex::topic_paths` or extracted from content).
+    pub fn add_document(
+        &mut self,
+        doc_id: impl Into<String>,
+        title: impl Into<String>,
+        format: impl Into<String>,
+        node_count: usize,
+        keywords: HashMap<String, f32>,
+    ) {
+        self.profiles.push(DocProfile {
+            doc_id: doc_id.into(),
+            title: title.into(),
+            format: format.into(),
+            node_count,
+            keywords,
+        });
+    }
+
+    /// Build the document graph from accumulated document profiles.
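+    ///
+    /// # Example (illustrative)
+    ///
+    /// A minimal sketch; `keywords` stands in for a `HashMap<String, f32>`
+    /// keyword-weight profile collected during indexing.
+    ///
+    /// ```rust,ignore
+    /// let mut builder = DocumentGraphBuilder::with_defaults();
+    /// builder.add_document("doc-1", "Rust Guide", "md", 10, keywords);
+    /// let graph = builder.build();
+    /// assert_eq!(graph.node_count(), 1);
+    /// ```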
+ pub fn build(self) -> DocumentGraph { + let mut graph = DocumentGraph::new(); + + if self.profiles.is_empty() { + info!("Building document graph: 0 documents, empty graph"); + return graph; + } + + // Step 1: Add document nodes with top-N keywords + for profile in &self.profiles { + let mut weighted: Vec = profile + .keywords + .iter() + .map(|(kw, &w)| WeightedKeyword { + keyword: kw.clone(), + weight: w, + }) + .collect(); + // Sort by weight descending + weighted.sort_by(|a, b| { + b.weight + .partial_cmp(&a.weight) + .unwrap_or(std::cmp::Ordering::Equal) + }); + weighted.truncate(self.config.max_keywords_per_doc); + + graph.add_node(DocumentGraphNode { + doc_id: profile.doc_id.clone(), + title: profile.title.clone(), + format: profile.format.clone(), + top_keywords: weighted, + node_count: profile.node_count, + }); + } + + info!( + "Building document graph: {} document nodes added", + graph.node_count() + ); + + // Step 2: Compute edges using the keyword inverted index + // (already built inside graph.add_node via keyword_index) + self.compute_edges(&mut graph); + + info!( + "Document graph built: {} nodes, {} edges", + graph.node_count(), + graph.edge_count() + ); + + graph + } + + /// Compute edges between documents based on shared keywords. + fn compute_edges(&self, graph: &mut DocumentGraph) { + // Collect candidate pairs: (doc_a, doc_b) → shared keywords + let mut pair_shared: HashMap<(String, String), Vec> = HashMap::new(); + + // Iterate the keyword index: for each keyword, all docs sharing it are candidates + let kw_index = graph.keyword_index_clone(); + + for (keyword, entries) in &kw_index { + if entries.len() < 2 { + continue; // No pair possible + } + // For every pair of documents sharing this keyword + for i in 0..entries.len() { + for j in (i + 1)..entries.len() { + let a = &entries[i]; + let b = &entries[j]; + let pair = if a.doc_id < b.doc_id { + (a.doc_id.clone(), b.doc_id.clone()) + } else { + (b.doc_id.clone(), a.doc_id.clone()) + }; + let shared = SharedKeyword { + keyword: keyword.clone(), + source_weight: a.weight, + target_weight: b.weight, + }; + pair_shared.entry(pair).or_default().push(shared); + } + } + } + + // Step 3: Create edges for pairs that meet thresholds + for ((doc_a, doc_b), shared_kws) in pair_shared { + let shared_count = shared_kws.len(); + if shared_count < self.config.min_shared_keywords { + continue; + } + + // Compute Jaccard: |intersection| / |union| + let kw_a = graph + .get_node(&doc_a) + .map(|n| n.top_keywords.len()) + .unwrap_or(0); + let kw_b = graph + .get_node(&doc_b) + .map(|n| n.top_keywords.len()) + .unwrap_or(0); + let union_size = kw_a + kw_b - shared_count; + let jaccard = if union_size > 0 { + shared_count as f32 / union_size as f32 + } else { + 0.0 + }; + + if jaccard < self.config.min_keyword_jaccard { + continue; + } + + // Edge weight: combine Jaccard with keyword count + let max_kws = self.config.max_keywords_per_doc.max(1) as f32; + let weight = (jaccard * 0.6 + (shared_count as f32 / max_kws).min(1.0) * 0.4).min(1.0); + + // Create bidirectional edges + let evidence_a = EdgeEvidence { + shared_keywords: shared_kws.clone(), + shared_keyword_count: shared_count, + keyword_jaccard: jaccard, + }; + let evidence_b = EdgeEvidence { + shared_keywords: shared_kws + .iter() + .map(|s| SharedKeyword { + keyword: s.keyword.clone(), + source_weight: s.target_weight, + target_weight: s.source_weight, + }) + .collect(), + shared_keyword_count: shared_count, + keyword_jaccard: jaccard, + }; + + graph.add_edge( + &doc_a, + 
GraphEdge { + target_doc_id: doc_b.clone(), + weight, + evidence: evidence_a, + }, + ); + graph.add_edge( + &doc_b, + GraphEdge { + target_doc_id: doc_a.clone(), + weight, + evidence: evidence_b, + }, + ); + } + + // Step 4: Trim edges per node to max_edges_per_node + self.trim_edges(graph); + } + + /// Trim edges per node to the configured maximum. + fn trim_edges(&self, graph: &mut DocumentGraph) { + let max = self.config.max_edges_per_node; + let all_edges = graph.take_edges(); + let mut trimmed: HashMap> = HashMap::new(); + + for (source, mut edges) in all_edges { + edges.sort_by(|a, b| { + b.weight + .partial_cmp(&a.weight) + .unwrap_or(std::cmp::Ordering::Equal) + }); + edges.truncate(max); + trimmed.insert(source, edges); + } + + graph.set_edges(trimmed); + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn make_keywords(pairs: &[(&str, f32)]) -> HashMap { + pairs.iter().map(|&(k, w)| (k.to_string(), w)).collect() + } + + #[test] + fn test_empty_workspace() { + let builder = DocumentGraphBuilder::with_defaults(); + let graph = builder.build(); + assert!(graph.is_empty()); + } + + #[test] + fn test_single_document() { + let mut builder = DocumentGraphBuilder::with_defaults(); + builder.add_document( + "doc1", + "Test", + "md", + 5, + make_keywords(&[("rust", 0.9), ("async", 0.7)]), + ); + let graph = builder.build(); + assert_eq!(graph.node_count(), 1); + assert_eq!(graph.edge_count(), 0); + } + + #[test] + fn test_two_docs_shared_keywords() { + let mut builder = DocumentGraphBuilder::new(DocumentGraphConfig { + min_keyword_jaccard: 0.05, + min_shared_keywords: 2, + ..DocumentGraphConfig::default() + }); + builder.add_document( + "doc1", + "Rust Programming", + "md", + 10, + make_keywords(&[("rust", 0.9), ("async", 0.8), ("tokio", 0.6)]), + ); + builder.add_document( + "doc2", + "Async Rust", + "md", + 8, + make_keywords(&[("rust", 0.7), ("async", 0.9), ("futures", 0.5)]), + ); + + let graph = builder.build(); + assert_eq!(graph.node_count(), 2); + // Should have bidirectional edges + assert!(graph.edge_count() >= 2); + + // Check doc1 → doc2 edge + let neighbors = graph.get_neighbors("doc1"); + assert_eq!(neighbors.len(), 1); + assert_eq!(neighbors[0].target_doc_id, "doc2"); + assert!(neighbors[0].weight > 0.0); + assert!(neighbors[0].evidence.keyword_jaccard > 0.0); + assert!(neighbors[0].evidence.shared_keyword_count >= 2); + + // Check doc2 → doc1 edge (bidirectional) + let neighbors2 = graph.get_neighbors("doc2"); + assert_eq!(neighbors2.len(), 1); + assert_eq!(neighbors2[0].target_doc_id, "doc1"); + } + + #[test] + fn test_unrelated_docs_no_edge() { + let mut builder = DocumentGraphBuilder::new(DocumentGraphConfig { + min_keyword_jaccard: 0.1, + min_shared_keywords: 2, + ..DocumentGraphConfig::default() + }); + builder.add_document( + "doc1", + "Rust Guide", + "md", + 10, + make_keywords(&[("rust", 0.9), ("ownership", 0.8)]), + ); + builder.add_document( + "doc2", + "Cooking Recipes", + "md", + 8, + make_keywords(&[("pasta", 0.9), ("sauce", 0.8)]), + ); + + let graph = builder.build(); + assert_eq!(graph.node_count(), 2); + assert_eq!(graph.edge_count(), 0); + } + + #[test] + fn test_jaccard_threshold() { + let mut builder = DocumentGraphBuilder::new(DocumentGraphConfig { + min_keyword_jaccard: 0.9, // Very high threshold + min_shared_keywords: 1, + ..DocumentGraphConfig::default() + }); + // Two docs with minimal overlap + builder.add_document( + "doc1", + "A", + "md", + 5, + make_keywords(&[("a", 0.9), ("b", 0.8), ("c", 0.7), ("d", 0.6), ("e", 0.5)]), + ); + 
builder.add_document( + "doc2", + "B", + "md", + 5, + make_keywords(&[("a", 0.9), ("x", 0.8), ("y", 0.7), ("z", 0.6)]), + ); + + let graph = builder.build(); + // Only 1 shared keyword out of 5+4=9 unique, Jaccard = 1/8 ≈ 0.125 + // Way below 0.9 threshold → no edge + assert_eq!(graph.edge_count(), 0); + } + + #[test] + fn test_max_edges_per_node() { + let mut builder = DocumentGraphBuilder::new(DocumentGraphConfig { + min_keyword_jaccard: 0.01, + min_shared_keywords: 1, + max_edges_per_node: 2, + ..DocumentGraphConfig::default() + }); + + // 4 docs all sharing keywords with doc1 + for i in 0..4 { + builder.add_document( + format!("doc{}", i), + format!("Doc {}", i), + "md", + 5, + make_keywords(&[("shared", 0.9), ("common", 0.8)]), + ); + } + + let graph = builder.build(); + // doc1 should have at most 2 outgoing edges + let neighbors = graph.get_neighbors("doc0"); + assert!(neighbors.len() <= 2); + } +} diff --git a/vectorless-core/vectorless-graph/src/config.rs b/vectorless-core/vectorless-graph/src/config.rs new file mode 100644 index 00000000..40b1d888 --- /dev/null +++ b/vectorless-core/vectorless-graph/src/config.rs @@ -0,0 +1,51 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Configuration for document graph building and retrieval. + +use serde::{Deserialize, Serialize}; + +/// Configuration for building the document graph. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DocumentGraphConfig { + /// Whether graph building is enabled. + pub enabled: bool, + /// Minimum Jaccard similarity for creating an edge. + pub min_keyword_jaccard: f32, + /// Minimum shared keywords to create an edge. + pub min_shared_keywords: usize, + /// Maximum top keywords per document node. + pub max_keywords_per_doc: usize, + /// Maximum edges per document node. + pub max_edges_per_node: usize, + /// Boost factor applied to graph-connected documents during retrieval. + pub retrieval_boost_factor: f32, +} + +impl Default for DocumentGraphConfig { + fn default() -> Self { + Self { + enabled: true, + min_keyword_jaccard: 0.1, + min_shared_keywords: 2, + max_keywords_per_doc: 50, + max_edges_per_node: 20, + retrieval_boost_factor: 0.15, + } + } +} + +impl DocumentGraphConfig { + /// Create a new config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Create a disabled config. + pub fn disabled() -> Self { + Self { + enabled: false, + ..Self::default() + } + } +} diff --git a/vectorless-core/vectorless-graph/src/lib.rs b/vectorless-core/vectorless-graph/src/lib.rs new file mode 100644 index 00000000..594609e4 --- /dev/null +++ b/vectorless-core/vectorless-graph/src/lib.rs @@ -0,0 +1,38 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Document graph module — workspace-level cross-document relationship graph. +//! +//! This module provides: +//! - [`DocumentGraph`] — the graph data structure connecting documents by shared concepts +//! - [`DocumentGraphBuilder`] — constructs the graph from document keyword profiles +//! - [`DocumentGraphConfig`] — configuration for graph building and retrieval boosting +//! +//! The document graph is a workspace-scoped, weighted graph built from each document's +//! [`ReasoningIndex`](vectorless_document::ReasoningIndex) keyword data. It enables +//! graph-aware retrieval ranking where connected documents receive a relevance boost. +//! +//! # Data Flow +//! +//! ```text +//! Document Indexing → ReasoningIndex (topic_paths) +//! ↓ +//! 
DocumentGraphBuilder::add_document()
+//!                 ↓
+//!          DocumentGraph
+//!                 ↓
+//!       Workspace::set_graph()
+//!                 ↓
+//!     Engine::query() loads graph
+//!                 ↓
+//!   CrossDocumentStrategy (graph boosting)
+//! ```

+
+mod builder;
+mod config;
+mod types;
+
+// Re-export public API
+pub use builder::DocumentGraphBuilder;
+pub use config::DocumentGraphConfig;
+pub use types::{DocumentGraph, DocumentGraphNode, EdgeEvidence, GraphEdge, WeightedKeyword};
diff --git a/vectorless-core/vectorless-graph/src/types.rs b/vectorless-core/vectorless-graph/src/types.rs
new file mode 100644
index 00000000..08f8d00a
--- /dev/null
+++ b/vectorless-core/vectorless-graph/src/types.rs
@@ -0,0 +1,310 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document Graph data types.
+//!
+//! Core data structures for the workspace-scoped, weighted document relationship graph.
+
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+/// A workspace-scoped document relationship graph.
+///
+/// Nodes represent documents, edges represent relationships (shared keywords,
+/// references). The graph is immutable after construction and can be shared
+/// across threads via `Arc`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DocumentGraph {
+    /// All document nodes, indexed by doc_id.
+    nodes: HashMap<String, DocumentGraphNode>,
+
+    /// Adjacency list: doc_id → outgoing edges.
+    edges: HashMap<String, Vec<GraphEdge>>,
+
+    /// Inverted index: keyword → documents containing this keyword.
+    keyword_index: HashMap<String, Vec<KeywordDocEntry>>,
+
+    /// Graph-level metadata.
+    metadata: GraphMetadata,
+}
+
+/// Crate-internal accessors for the graph builder (same crate).
+impl DocumentGraph {
+    /// Take all edges out, leaving an empty map in their place.
+    pub(crate) fn take_edges(&mut self) -> HashMap<String, Vec<GraphEdge>> {
+        std::mem::take(&mut self.edges)
+    }
+
+    /// Set edges directly (used by builder after trimming).
+    pub(crate) fn set_edges(&mut self, edges: HashMap<String, Vec<GraphEdge>>) {
+        self.metadata.edge_count = edges.values().map(|v| v.len()).sum();
+        self.edges = edges;
+    }
+
+    /// Get a clone of the keyword index (used by builder for edge computation).
+    pub(crate) fn keyword_index_clone(&self) -> HashMap<String, Vec<KeywordDocEntry>> {
+        self.keyword_index.clone()
+    }
+}
+
+impl DocumentGraph {
+    /// Create a new empty document graph.
+    pub fn new() -> Self {
+        Self {
+            nodes: HashMap::new(),
+            edges: HashMap::new(),
+            keyword_index: HashMap::new(),
+            metadata: GraphMetadata {
+                document_count: 0,
+                edge_count: 0,
+            },
+        }
+    }
+
+    /// Add a document node to the graph.
+    pub fn add_node(&mut self, node: DocumentGraphNode) {
+        // Populate keyword index from the node's top keywords
+        for kw in &node.top_keywords {
+            self.keyword_index
+                .entry(kw.keyword.clone())
+                .or_default()
+                .push(KeywordDocEntry {
+                    doc_id: node.doc_id.clone(),
+                    weight: kw.weight,
+                });
+        }
+        let doc_id = node.doc_id.clone();
+        self.nodes.insert(doc_id, node);
+        self.metadata.document_count = self.nodes.len();
+    }
+
+    /// Add a directed edge from `source` to `target`.
+    pub fn add_edge(&mut self, source: &str, edge: GraphEdge) {
+        self.edges.entry(source.to_string()).or_default().push(edge);
+        self.metadata.edge_count = self.edges.values().map(|v| v.len()).sum();
+    }
+
+    /// Get a document node by ID.
+    pub fn get_node(&self, doc_id: &str) -> Option<&DocumentGraphNode> {
+        self.nodes.get(doc_id)
+    }
+
+    /// Get all edges outgoing from a document.
+    pub fn get_neighbors(&self, doc_id: &str) -> &[GraphEdge] {
+        self.edges.get(doc_id).map_or(&[], Vec::as_slice)
+    }
+
+    /// Find documents containing a keyword.
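+    ///
+    /// Returns an empty slice when the keyword is unknown.
+    ///
+    /// # Example (illustrative)
+    ///
+    /// ```rust,ignore
+    /// for entry in graph.find_by_keyword("rust") {
+    ///     println!("{} (weight {:.2})", entry.doc_id, entry.weight);
+    /// }
+    /// ```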
+    pub fn find_by_keyword(&self, keyword: &str) -> &[KeywordDocEntry] {
+        self.keyword_index.get(keyword).map_or(&[], Vec::as_slice)
+    }
+
+    /// Get the number of documents in the graph.
+    pub fn node_count(&self) -> usize {
+        self.nodes.len()
+    }
+
+    /// Get the number of edges in the graph.
+    pub fn edge_count(&self) -> usize {
+        self.edges.values().map(|v| v.len()).sum()
+    }
+
+    /// Get all document IDs in the graph.
+    pub fn doc_ids(&self) -> impl Iterator<Item = &str> {
+        self.nodes.keys().map(|s| s.as_str())
+    }
+
+    /// Get graph metadata.
+    pub fn metadata(&self) -> &GraphMetadata {
+        &self.metadata
+    }
+
+    /// Check if the graph is empty.
+    pub fn is_empty(&self) -> bool {
+        self.nodes.is_empty()
+    }
+}
+
+impl Default for DocumentGraph {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// A document node in the graph.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DocumentGraphNode {
+    /// Document ID (matches `PersistedDocument.meta.id`).
+    pub doc_id: String,
+    /// Document title/name.
+    pub title: String,
+    /// Document format (md, pdf).
+    pub format: String,
+    /// Top-N representative keywords extracted from the document's
+    /// ReasoningIndex topic_paths, sorted by aggregate weight.
+    pub top_keywords: Vec<WeightedKeyword>,
+    /// Number of nodes in the document tree.
+    pub node_count: usize,
+}
+
+/// A keyword with its aggregate weight across the document.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WeightedKeyword {
+    /// The keyword string (lowercased).
+    pub keyword: String,
+    /// Aggregate weight across all TopicEntry instances (0.0–1.0).
+    pub weight: f32,
+}
+
+/// An edge connecting two documents.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct GraphEdge {
+    /// Target document ID.
+    pub target_doc_id: String,
+    /// Edge weight (0.0–1.0). Higher = stronger relationship.
+    pub weight: f32,
+    /// Evidence for why these documents are connected.
+    pub evidence: EdgeEvidence,
+}
+
+/// Evidence for why two documents are connected.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct EdgeEvidence {
+    /// Keywords shared between the two documents.
+    pub shared_keywords: Vec<SharedKeyword>,
+    /// Number of shared keywords.
+    pub shared_keyword_count: usize,
+    /// Jaccard similarity of keyword sets.
+    pub keyword_jaccard: f32,
+}
+
+/// A keyword shared between two documents.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SharedKeyword {
+    /// The shared keyword.
+    pub keyword: String,
+    /// Weight in source document.
+    pub source_weight: f32,
+    /// Weight in target document.
+    pub target_weight: f32,
+}
+
+/// Entry in the keyword inverted index.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct KeywordDocEntry {
+    /// Document ID containing this keyword.
+    pub doc_id: String,
+    /// Weight of this keyword in the document.
+    pub weight: f32,
+}
+
+/// Graph-level metadata.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct GraphMetadata {
+    /// Number of documents in the graph.
+    pub document_count: usize,
+    /// Number of edges in the graph.
+ pub edge_count: usize, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_empty_graph() { + let graph = DocumentGraph::new(); + assert!(graph.is_empty()); + assert_eq!(graph.node_count(), 0); + assert_eq!(graph.edge_count(), 0); + } + + #[test] + fn test_add_node() { + let mut graph = DocumentGraph::new(); + graph.add_node(DocumentGraphNode { + doc_id: "doc1".to_string(), + title: "Test Doc".to_string(), + format: "md".to_string(), + top_keywords: vec![ + WeightedKeyword { + keyword: "rust".to_string(), + weight: 0.9, + }, + WeightedKeyword { + keyword: "async".to_string(), + weight: 0.7, + }, + ], + node_count: 10, + }); + + assert_eq!(graph.node_count(), 1); + assert!(graph.get_node("doc1").is_some()); + assert_eq!(graph.find_by_keyword("rust").len(), 1); + assert_eq!(graph.find_by_keyword("async").len(), 1); + assert_eq!(graph.find_by_keyword("missing").len(), 0); + } + + #[test] + fn test_add_edge() { + let mut graph = DocumentGraph::new(); + graph.add_node(DocumentGraphNode { + doc_id: "doc1".to_string(), + title: "A".to_string(), + format: "md".to_string(), + top_keywords: vec![], + node_count: 5, + }); + graph.add_node(DocumentGraphNode { + doc_id: "doc2".to_string(), + title: "B".to_string(), + format: "md".to_string(), + top_keywords: vec![], + node_count: 8, + }); + + graph.add_edge( + "doc1", + GraphEdge { + target_doc_id: "doc2".to_string(), + weight: 0.5, + evidence: EdgeEvidence { + shared_keywords: vec![SharedKeyword { + keyword: "rust".to_string(), + source_weight: 0.9, + target_weight: 0.8, + }], + shared_keyword_count: 1, + keyword_jaccard: 0.3, + }, + }, + ); + + assert_eq!(graph.edge_count(), 1); + assert_eq!(graph.get_neighbors("doc1").len(), 1); + assert_eq!(graph.get_neighbors("doc1")[0].target_doc_id, "doc2"); + assert_eq!(graph.get_neighbors("doc2").len(), 0); + } + + #[test] + fn test_serialization_roundtrip() { + let mut graph = DocumentGraph::new(); + graph.add_node(DocumentGraphNode { + doc_id: "doc1".to_string(), + title: "Test".to_string(), + format: "md".to_string(), + top_keywords: vec![WeightedKeyword { + keyword: "test".to_string(), + weight: 1.0, + }], + node_count: 3, + }); + + let json = serde_json::to_string(&graph).unwrap(); + let deserialized: DocumentGraph = serde_json::from_str(&json).unwrap(); + assert_eq!(deserialized.node_count(), 1); + assert_eq!(deserialized.get_node("doc1").unwrap().title, "Test"); + } +} diff --git a/vectorless-core/vectorless-index/Cargo.toml b/vectorless-core/vectorless-index/Cargo.toml new file mode 100644 index 00000000..132a3b10 --- /dev/null +++ b/vectorless-core/vectorless-index/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "vectorless-index" +version.workspace = true +edition.workspace = true +authors.workspace = true +description.workspace = true +license.workspace = true +repository.workspace = true +homepage.workspace = true + +[dependencies] +vectorless-config = { path = "../vectorless-config" } +vectorless-document = { path = "../vectorless-document" } +vectorless-error = { path = "../vectorless-error" } +vectorless-llm = { path = "../vectorless-llm" } +vectorless-metrics = { path = "../vectorless-metrics" } +vectorless-scoring = { path = "../vectorless-scoring" } +vectorless-storage = { path = "../vectorless-storage" } +vectorless-utils = { path = "../vectorless-utils" } +tokio = { workspace = true } +async-trait = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +tracing = { workspace = true } +pulldown-cmark = { workspace = true } +pdf-extract = { 
workspace = true } +lopdf = { workspace = true } +regex = { workspace = true } +uuid = { workspace = true } +chrono = { workspace = true } +rand = { workspace = true } +futures = { workspace = true } +base64 = { workspace = true } + +[lints] +workspace = true diff --git a/vectorless-core/vectorless-index/src/config.rs b/vectorless-core/vectorless-index/src/config.rs new file mode 100644 index 00000000..e9133c40 --- /dev/null +++ b/vectorless-core/vectorless-index/src/config.rs @@ -0,0 +1,389 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Configuration types for the index pipeline. +//! +//! This module contains all configuration types used by the indexing pipeline: +//! - [`IndexMode`] - Document format selection +//! - [`PipelineOptions`] - Full pipeline configuration +//! - [`OptimizationConfig`] - Tree optimization settings +//! - [`ThinningConfig`] - Node merging settings + +use super::summary::SummaryStrategy; +use vectorless_config::IndexerConfig; +use vectorless_document::{DocumentTree, ReasoningIndexConfig}; +use vectorless_llm::throttle::ConcurrencyConfig; +use vectorless_utils::fingerprint::{Fingerprint, Fingerprinter}; + +use std::path::PathBuf; + +/// Index mode for document processing. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum IndexMode { + /// Auto-detect format from file extension. + Auto, + /// Force Markdown format. + Markdown, + /// Force PDF format. + Pdf, +} + +impl Default for IndexMode { + fn default() -> Self { + Self::Auto + } +} + +/// Configuration for tree optimization. +#[derive(Debug, Clone)] +pub struct OptimizationConfig { + /// Whether optimization is enabled. + pub enabled: bool, + + /// Maximum tree depth (flatten if exceeded). + pub max_depth: Option, + + /// Maximum children per node (group if exceeded). + pub max_children: Option, + + /// Minimum tokens for a leaf node (merge smaller ones). + pub merge_leaf_threshold: usize, +} + +impl Default for OptimizationConfig { + fn default() -> Self { + Self { + enabled: true, + max_depth: None, + max_children: None, + merge_leaf_threshold: 0, + } + } +} + +impl OptimizationConfig { + /// Create a new optimization config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Disable optimization entirely. + pub fn disabled() -> Self { + Self { + enabled: false, + ..Self::default() + } + } + + /// Set maximum depth. + pub fn with_max_depth(mut self, depth: usize) -> Self { + self.max_depth = Some(depth); + self + } + + /// Set maximum children per node. + pub fn with_max_children(mut self, max: usize) -> Self { + self.max_children = Some(max); + self + } +} + +/// Configuration for thinning (merging small nodes). +#[derive(Debug, Clone)] +pub struct ThinningConfig { + /// Whether thinning is enabled. + pub enabled: bool, + + /// Token threshold for merging. + pub threshold: usize, + + /// Whether to merge child content into the parent when removing children. + /// When true, nodes below threshold absorb their children's text before removal. + /// When false, small nodes are simply discarded. + pub merge_content: bool, +} + +impl Default for ThinningConfig { + fn default() -> Self { + Self { + enabled: false, + threshold: 500, + merge_content: true, + } + } +} + +impl ThinningConfig { + /// Create disabled config. + pub fn disabled() -> Self { + Self::default() + } + + /// Create enabled config with threshold. 
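+    ///
+    /// # Example (illustrative)
+    ///
+    /// A minimal sketch using the setters defined below:
+    ///
+    /// ```rust,ignore
+    /// let thinning = ThinningConfig::enabled(300).with_merge_content(false);
+    /// assert!(thinning.enabled);
+    /// assert_eq!(thinning.threshold, 300);
+    /// ```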
+ pub fn enabled(threshold: usize) -> Self { + Self { + enabled: true, + threshold, + merge_content: true, + } + } + + /// Set the token threshold. + pub fn with_threshold(mut self, threshold: usize) -> Self { + self.threshold = threshold; + self + } + + /// Set whether to merge content. + pub fn with_merge_content(mut self, merge: bool) -> Self { + self.merge_content = merge; + self + } +} + +/// Configuration for large node splitting. +#[derive(Debug, Clone)] +pub struct SplitConfig { + /// Whether splitting is enabled. + pub enabled: bool, + + /// Maximum tokens per leaf node. Nodes exceeding this are split. + pub max_tokens_per_node: usize, + + /// Whether to use pattern-based splitting (headings, paragraphs). + /// When false, splits at approximate byte boundaries. + pub pattern_split: bool, +} + +impl Default for SplitConfig { + fn default() -> Self { + Self { + enabled: true, + max_tokens_per_node: 4000, + pattern_split: true, + } + } +} + +impl SplitConfig { + /// Create disabled config. + pub fn disabled() -> Self { + Self { + enabled: false, + ..Self::default() + } + } + + /// Create enabled config with custom token limit. + pub fn with_max_tokens(mut self, max: usize) -> Self { + self.max_tokens_per_node = max; + self + } + + /// Set whether to use pattern-based splitting. + pub fn with_pattern_split(mut self, pattern: bool) -> Self { + self.pattern_split = pattern; + self + } +} + +/// Pipeline options for index execution. +#[derive(Debug, Clone)] +pub struct PipelineOptions { + /// Index mode. + pub mode: IndexMode, + + /// Whether to generate node IDs. + pub generate_ids: bool, + + /// Summary generation strategy. + pub summary_strategy: SummaryStrategy, + + /// Thinning configuration. + pub thinning: ThinningConfig, + + /// Optimization configuration. + pub optimization: OptimizationConfig, + + /// Split configuration. + pub split: SplitConfig, + + /// Whether to generate document description. + pub generate_description: bool, + + /// Concurrency configuration. + pub concurrency: ConcurrencyConfig, + + /// Indexer configuration. + pub indexer: IndexerConfig, + + /// Reasoning index configuration. + pub reasoning_index: ReasoningIndexConfig, + + /// Existing tree from a previous index (for incremental updates). + /// Stages (enhance, reasoning) can reuse data from unchanged nodes. + pub existing_tree: Option, + + /// Current processing version. Bumped when indexing algorithm changes + /// to force reprocessing of existing documents. + pub processing_version: u32, + + /// Directory for pipeline checkpoints. + /// When set, the pipeline saves state after each stage group + /// and can resume from the last completed stage on restart. + /// When `None`, checkpointing is disabled. + pub checkpoint_dir: Option, +} + +impl Default for PipelineOptions { + fn default() -> Self { + Self { + mode: IndexMode::Auto, + generate_ids: true, + summary_strategy: SummaryStrategy::full(), + thinning: ThinningConfig::default(), + optimization: OptimizationConfig::default(), + split: SplitConfig::default(), + generate_description: true, + concurrency: ConcurrencyConfig::default(), + indexer: IndexerConfig::default(), + reasoning_index: ReasoningIndexConfig::default(), + existing_tree: None, + processing_version: 1, + checkpoint_dir: None, + } + } +} + +impl PipelineOptions { + /// Create new pipeline options with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Set the index mode. 
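+    ///
+    /// # Example (illustrative)
+    ///
+    /// Mirrors the builder usage in the unit tests below:
+    ///
+    /// ```rust,ignore
+    /// let opts = PipelineOptions::new().with_mode(IndexMode::Markdown);
+    /// assert_eq!(opts.mode, IndexMode::Markdown);
+    /// ```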
+ pub fn with_mode(mut self, mode: IndexMode) -> Self { + self.mode = mode; + self + } + + /// Set whether to generate node IDs. + pub fn with_generate_ids(mut self, generate: bool) -> Self { + self.generate_ids = generate; + self + } + + /// Set the summary strategy. + pub fn with_summary_strategy(mut self, strategy: SummaryStrategy) -> Self { + self.summary_strategy = strategy; + self + } + + /// Set the thinning configuration. + pub fn with_thinning(mut self, thinning: ThinningConfig) -> Self { + self.thinning = thinning; + self + } + + /// Set the optimization configuration. + pub fn with_optimization(mut self, optimization: OptimizationConfig) -> Self { + self.optimization = optimization; + self + } + + /// Set the split configuration. + pub fn with_split(mut self, split: SplitConfig) -> Self { + self.split = split; + self + } + + /// Set whether to generate document description. + pub fn with_generate_description(mut self, generate: bool) -> Self { + self.generate_description = generate; + self + } + + /// Set the concurrency configuration. + pub fn with_concurrency(mut self, concurrency: ConcurrencyConfig) -> Self { + self.concurrency = concurrency; + self + } + + /// Set the indexer configuration. + pub fn with_indexer(mut self, indexer: IndexerConfig) -> Self { + self.indexer = indexer; + self + } + + /// Set the reasoning index configuration. + pub fn with_reasoning_index(mut self, config: ReasoningIndexConfig) -> Self { + self.reasoning_index = config; + self + } + + /// Set the checkpoint directory. + /// + /// When set, the pipeline saves state after each stage group + /// and can resume from the last completed stage on restart. + pub fn with_checkpoint_dir(mut self, dir: impl Into) -> Self { + self.checkpoint_dir = Some(dir.into()); + self + } + + /// Compute a fingerprint of the pipeline configuration. + /// + /// If this fingerprint changes between runs, all documents need full reprocessing + /// even if their content hasn't changed (because the processing logic is different). 
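+    ///
+    /// # Example (illustrative)
+    ///
+    /// A sketch of the intended check; it assumes `Fingerprint` supports
+    /// equality comparison, as the change detector relies on:
+    ///
+    /// ```rust,ignore
+    /// let stored = PipelineOptions::new().logic_fingerprint();
+    /// let current = PipelineOptions::new()
+    ///     .with_generate_description(false)
+    ///     .logic_fingerprint();
+    /// if stored != current {
+    ///     // The processing logic changed: schedule a full reprocess.
+    /// }
+    /// ```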
+    pub fn logic_fingerprint(&self) -> Fingerprint {
+        Fingerprinter::new()
+            .with_str(&format!("{:?}", self.mode))
+            .with_bool(self.generate_ids)
+            .with_str(&format!("{:?}", self.summary_strategy))
+            .with_bool(self.generate_description)
+            .with_bool(self.optimization.enabled)
+            .with_str(&format!("{:?}", self.reasoning_index))
+            .into_fingerprint()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_index_mode_default() {
+        let mode = IndexMode::default();
+        assert_eq!(mode, IndexMode::Auto);
+    }
+
+    #[test]
+    fn test_optimization_config() {
+        let config = OptimizationConfig::new()
+            .with_max_depth(5)
+            .with_max_children(10);
+
+        assert!(config.enabled);
+        assert_eq!(config.max_depth, Some(5));
+        assert_eq!(config.max_children, Some(10));
+    }
+
+    #[test]
+    fn test_thinning_config() {
+        let config = ThinningConfig::enabled(300);
+        assert!(config.enabled);
+        assert_eq!(config.threshold, 300);
+
+        let disabled = ThinningConfig::disabled();
+        assert!(!disabled.enabled);
+    }
+
+    #[test]
+    fn test_pipeline_options_builder() {
+        let options = PipelineOptions::new()
+            .with_mode(IndexMode::Markdown)
+            .with_generate_ids(false);
+
+        assert_eq!(options.mode, IndexMode::Markdown);
+        assert!(!options.generate_ids);
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/incremental/detector.rs b/vectorless-core/vectorless-index/src/incremental/detector.rs
new file mode 100644
index 00000000..011edab8
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/incremental/detector.rs
@@ -0,0 +1,654 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Change detection for incremental updates.
+//!
+//! This module provides fine-grained change detection using subtree fingerprints,
+//! enabling precise identification of changed nodes without full reprocessing.
+
+use std::collections::HashMap;
+use std::hash::{Hash, Hasher};
+use std::path::Path;
+use std::time::SystemTime;
+
+use serde::{Deserialize, Serialize};
+
+use vectorless_document::{DocumentTree, NodeId};
+use vectorless_utils::fingerprint::{Fingerprint, Fingerprinter, NodeFingerprint};
+
+/// Type of change detected.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub enum ChangeType {
+    /// Node was added.
+    Added,
+    /// Node was removed.
+    Removed,
+    /// Node content changed.
+    Modified,
+    /// Node structure changed (children added/removed).
+    Restructured,
+}
+
+impl std::fmt::Display for ChangeType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ChangeType::Added => write!(f, "added"),
+            ChangeType::Removed => write!(f, "removed"),
+            ChangeType::Modified => write!(f, "modified"),
+            ChangeType::Restructured => write!(f, "restructured"),
+        }
+    }
+}
+
+/// A single change in the document.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct NodeChange {
+    /// Node ID (from old tree).
+    pub node_id: Option<String>,
+    /// Node title (for human-readable output).
+    pub title: String,
+    /// Type of change.
+    pub change_type: ChangeType,
+    /// Node fingerprint (for modified nodes).
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub fingerprint: Option<NodeFingerprint>,
+}
+
+impl NodeChange {
+    /// Create a new node change.
+    pub fn new(node_id: Option<String>, title: String, change_type: ChangeType) -> Self {
+        Self {
+            node_id,
+            title,
+            change_type,
+            fingerprint: None,
+        }
+    }
+
+    /// Add fingerprint information.
+    pub fn with_fingerprint(mut self, fp: NodeFingerprint) -> Self {
+        self.fingerprint = Some(fp);
+        self
+    }
+}
+
+/// Set of changes between two document versions.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct ChangeSet {
+    /// Added nodes.
+    pub added: Vec<NodeChange>,
+    /// Removed nodes.
+    pub removed: Vec<NodeChange>,
+    /// Modified nodes (content changed).
+    pub modified: Vec<NodeChange>,
+    /// Restructured nodes (children changed).
+    pub restructured: Vec<NodeChange>,
+}
+
+impl ChangeSet {
+    /// Create an empty change set.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Check if there are any changes.
+    pub fn is_empty(&self) -> bool {
+        self.added.is_empty()
+            && self.removed.is_empty()
+            && self.modified.is_empty()
+            && self.restructured.is_empty()
+    }
+
+    /// Get total number of changes.
+    pub fn total_changes(&self) -> usize {
+        self.added.len() + self.removed.len() + self.modified.len() + self.restructured.len()
+    }
+
+    /// Merge another change set into this one.
+    pub fn merge(&mut self, other: ChangeSet) {
+        self.added.extend(other.added);
+        self.removed.extend(other.removed);
+        self.modified.extend(other.modified);
+        self.restructured.extend(other.restructured);
+    }
+
+    /// Get all changed node IDs.
+    pub fn changed_node_ids(&self) -> Vec<&str> {
+        let mut ids: Vec<&str> = Vec::new();
+        for change in &self.added {
+            if let Some(ref id) = change.node_id {
+                ids.push(id.as_str());
+            }
+        }
+        for change in &self.modified {
+            if let Some(ref id) = change.node_id {
+                ids.push(id.as_str());
+            }
+        }
+        for change in &self.restructured {
+            if let Some(ref id) = change.node_id {
+                ids.push(id.as_str());
+            }
+        }
+        ids
+    }
+}
+
+/// Document-level change detection result.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DocumentChangeInfo {
+    /// Document ID.
+    pub doc_id: String,
+    /// Overall content fingerprint.
+    pub content_fp: Fingerprint,
+    /// Node-level fingerprints.
+    pub node_fingerprints: HashMap<String, NodeFingerprint>,
+    /// Last modification time.
+    pub modified_at: chrono::DateTime<chrono::Utc>,
+    /// Processing version (incremented when processing algorithm changes).
+    pub processing_version: u32,
+}
+
+impl DocumentChangeInfo {
+    /// Create a new document change info.
+    pub fn new(doc_id: &str) -> Self {
+        Self {
+            doc_id: doc_id.to_string(),
+            content_fp: Fingerprint::zero(),
+            node_fingerprints: HashMap::new(),
+            modified_at: chrono::Utc::now(),
+            processing_version: 1,
+        }
+    }
+
+    /// Update from a tree.
+    pub fn update_from_tree(&mut self, tree: &DocumentTree) {
+        self.content_fp = compute_tree_fingerprint(tree);
+        self.node_fingerprints = compute_all_node_fingerprints(tree);
+        self.modified_at = chrono::Utc::now();
+    }
+}
+
+/// Change detector for incremental updates.
+///
+/// Supports both simple hash-based detection and fine-grained
+/// subtree fingerprint-based detection.
+pub struct ChangeDetector {
+    /// Content fingerprints by document ID.
+    content_fps: HashMap<String, Fingerprint>,
+
+    /// Node-level fingerprints by document ID.
+    node_fps: HashMap<String, HashMap<String, NodeFingerprint>>,
+
+    /// File modification times by document ID.
+    mtimes: HashMap<String, SystemTime>,
+
+    /// Processing versions by document ID.
+    processing_versions: HashMap<String, u32>,
+
+    /// Current processing version (for algorithm upgrades).
+    current_processing_version: u32,
+}
+
+impl ChangeDetector {
+    /// Create a new change detector.
+    pub fn new() -> Self {
+        Self {
+            content_fps: HashMap::new(),
+            node_fps: HashMap::new(),
+            mtimes: HashMap::new(),
+            processing_versions: HashMap::new(),
+            current_processing_version: 1,
+        }
+    }
+
+    /// Set the current processing version.
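+    ///
+    /// # Example (illustrative)
+    ///
+    /// A minimal sketch of forcing reprocessing after an algorithm change:
+    ///
+    /// ```rust,ignore
+    /// // Bump the version when the indexing algorithm changes, so that
+    /// // needs_reindex_by_version() reports true for previously indexed docs.
+    /// let detector = ChangeDetector::new().with_processing_version(2);
+    /// ```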
+    pub fn with_processing_version(mut self, version: u32) -> Self {
+        self.current_processing_version = version;
+        self
+    }
+
+    /// Compute hash of content (simple u64 hash).
+    fn hash_content(content: &str) -> u64 {
+        let mut hasher = std::collections::hash_map::DefaultHasher::new();
+        content.hash(&mut hasher);
+        hasher.finish()
+    }
+
+    /// Check if a file needs reindexing based on mtime.
+    pub fn needs_reindex_by_mtime(&self, doc_id: &str, path: &Path) -> bool {
+        let Some(recorded_mtime) = self.mtimes.get(doc_id) else {
+            return true; // Never indexed
+        };
+
+        let Ok(metadata) = std::fs::metadata(path) else {
+            return true; // Can't read file
+        };
+
+        let Ok(current_mtime) = metadata.modified() else {
+            return true;
+        };
+
+        current_mtime > *recorded_mtime
+    }
+
+    /// Check if content needs reindexing based on fingerprint.
+    pub fn needs_reindex_by_hash(&self, doc_id: &str, content: &str) -> bool {
+        let current_fp = Fingerprint::from_str(content);
+
+        match self.content_fps.get(doc_id) {
+            Some(recorded_fp) => recorded_fp != &current_fp,
+            None => true,
+        }
+    }
+
+    /// Check if document needs reindexing based on fingerprint.
+    pub fn needs_reindex_by_fingerprint(&self, doc_id: &str, new_fp: &Fingerprint) -> bool {
+        match self.content_fps.get(doc_id) {
+            Some(recorded_fp) => recorded_fp != new_fp,
+            None => true,
+        }
+    }
+
+    /// Check if processing version has changed.
+    pub fn needs_reindex_by_version(&self, doc_id: &str) -> bool {
+        match self.processing_versions.get(doc_id) {
+            Some(recorded_version) => *recorded_version < self.current_processing_version,
+            None => true,
+        }
+    }
+
+    /// Record document state after indexing.
+    pub fn record(&mut self, doc_id: &str, content: &str, path: Option<&Path>) {
+        self.record_with_tree(doc_id, content, None, path);
+    }
+
+    /// Record document state with tree (for fine-grained detection).
+    pub fn record_with_tree(
+        &mut self,
+        doc_id: &str,
+        content: &str,
+        tree: Option<&DocumentTree>,
+        path: Option<&Path>,
+    ) {
+        // Record content fingerprint
+        let content_fp = Fingerprint::from_str(content);
+        self.content_fps.insert(doc_id.to_string(), content_fp);
+
+        // Record node fingerprints if tree provided
+        if let Some(tree) = tree {
+            let node_fps = compute_all_node_fingerprints(tree);
+            self.node_fps.insert(doc_id.to_string(), node_fps);
+        }
+
+        // Record mtime if path provided
+        if let Some(path) = path {
+            if let Ok(metadata) = std::fs::metadata(path) {
+                if let Ok(mtime) = metadata.modified() {
+                    self.mtimes.insert(doc_id.to_string(), mtime);
+                }
+            }
+        }
+
+        // Record processing version
+        self.processing_versions
+            .insert(doc_id.to_string(), self.current_processing_version);
+    }
+
+    /// Record document from ChangeInfo.
+    pub fn record_change_info(&mut self, info: &DocumentChangeInfo, path: Option<&Path>) {
+        self.content_fps
+            .insert(info.doc_id.clone(), info.content_fp);
+        self.node_fps
+            .insert(info.doc_id.clone(), info.node_fingerprints.clone());
+        self.processing_versions
+            .insert(info.doc_id.clone(), info.processing_version);
+
+        if let Some(path) = path {
+            if let Ok(metadata) = std::fs::metadata(path) {
+                if let Ok(mtime) = metadata.modified() {
+                    self.mtimes.insert(info.doc_id.clone(), mtime);
+                }
+            }
+        }
+    }
+
+    /// Detect changes between two trees using fingerprints.
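+    ///
+    /// # Example (illustrative)
+    ///
+    /// A minimal sketch, assuming `old_tree` and `new_tree` are parsed
+    /// `DocumentTree` values:
+    ///
+    /// ```rust,ignore
+    /// let changes = detector.detect_changes(&old_tree, &new_tree);
+    /// println!(
+    ///     "{} added, {} removed, {} modified",
+    ///     changes.added.len(),
+    ///     changes.removed.len(),
+    ///     changes.modified.len(),
+    /// );
+    /// ```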
+    pub fn detect_changes(&self, old_tree: &DocumentTree, new_tree: &DocumentTree) -> ChangeSet {
+        let mut changes = ChangeSet::new();
+
+        // Collect fingerprints from both trees
+        let old_fps = compute_all_node_fingerprints(old_tree);
+        let new_fps = compute_all_node_fingerprints(new_tree);
+
+        // Build title -> (string_key, NodeFingerprint) maps by traversing trees.
+        // We store owned Strings to avoid lifetime issues
+        let old_by_title: HashMap<String, (String, NodeFingerprint)> = {
+            let mut map = HashMap::new();
+            for node_id in old_tree.traverse() {
+                if let Some(node) = old_tree.get(node_id) {
+                    let key = node
+                        .node_id
+                        .clone()
+                        .unwrap_or_else(|| format!("node_{:?}", node_id.0));
+                    if let Some(fp) = old_fps.get(&key) {
+                        map.insert(node.title.clone(), (key, fp.clone()));
+                    }
+                }
+            }
+            map
+        };
+
+        let new_by_title: HashMap<String, (String, NodeFingerprint)> = {
+            let mut map = HashMap::new();
+            for node_id in new_tree.traverse() {
+                if let Some(node) = new_tree.get(node_id) {
+                    let key = node
+                        .node_id
+                        .clone()
+                        .unwrap_or_else(|| format!("node_{:?}", node_id.0));
+                    if let Some(fp) = new_fps.get(&key) {
+                        map.insert(node.title.clone(), (key, fp.clone()));
+                    }
+                }
+            }
+            map
+        };
+
+        // Find added nodes
+        for (title, (node_key, fp)) in &new_by_title {
+            if !old_by_title.contains_key(title) {
+                changes.added.push(
+                    NodeChange::new(Some(node_key.clone()), title.clone(), ChangeType::Added)
+                        .with_fingerprint(fp.clone()),
+                );
+            }
+        }
+
+        // Find removed nodes
+        for (title, (node_key, fp)) in &old_by_title {
+            if !new_by_title.contains_key(title) {
+                changes.removed.push(
+                    NodeChange::new(Some(node_key.clone()), title.clone(), ChangeType::Removed)
+                        .with_fingerprint(fp.clone()),
+                );
+            }
+        }
+
+        // Find modified nodes
+        for (title, (new_key, new_fp)) in &new_by_title {
+            if let Some((_old_key, old_fp)) = old_by_title.get(title) {
+                if new_fp.content_changed(old_fp) {
+                    changes.modified.push(
+                        NodeChange::new(Some(new_key.clone()), title.clone(), ChangeType::Modified)
+                            .with_fingerprint(new_fp.clone()),
+                    );
+                } else if new_fp.subtree_changed(old_fp) {
+                    changes.restructured.push(
+                        NodeChange::new(
+                            Some(new_key.clone()),
+                            title.clone(),
+                            ChangeType::Restructured,
+                        )
+                        .with_fingerprint(new_fp.clone()),
+                    );
+                }
+            }
+        }
+
+        changes
+    }
+
+    /// Get nodes that need reprocessing (summary regeneration).
+    ///
+    /// This returns nodes where either:
+    /// - Content changed (summary may need update)
+    /// - Processing version changed (all summaries need update)
+    pub fn get_nodes_needing_reprocess(
+        &self,
+        doc_id: &str,
+        new_tree: &DocumentTree,
+    ) -> Option<Vec<String>> {
+        let old_fps = self.node_fps.get(doc_id)?;
+        let new_fps = compute_all_node_fingerprints(new_tree);
+
+        let mut needs_reprocess = Vec::new();
+
+        // If processing version changed, all nodes need reprocessing
+        if self.needs_reindex_by_version(doc_id) {
+            return Some(new_fps.keys().cloned().collect());
+        }
+
+        // Otherwise, only changed nodes need reprocessing
+        for (node_key, new_fp) in &new_fps {
+            if let Some(old_fp) = old_fps.get(node_key) {
+                // Content changed or subtree structure changed
+                if new_fp.content_changed(old_fp) || new_fp.subtree_changed(old_fp) {
+                    needs_reprocess.push(node_key.clone());
+                }
+            } else {
+                // New node
+                needs_reprocess.push(node_key.clone());
+            }
+        }
+
+        Some(needs_reprocess)
+    }
+
+    /// Clear stored data for a document.
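+    ///
+    /// ```rust,ignore
+    /// // After a document is removed from the index, drop its recorded state:
+    /// detector.clear("doc1");
+    /// assert!(detector.get_content_fingerprint("doc1").is_none());
+    /// ```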
+    pub fn clear(&mut self, doc_id: &str) {
+        self.content_fps.remove(doc_id);
+        self.node_fps.remove(doc_id);
+        self.mtimes.remove(doc_id);
+        self.processing_versions.remove(doc_id);
+    }
+
+    /// Get the current content fingerprint for a document.
+    pub fn get_content_fingerprint(&self, doc_id: &str) -> Option<&Fingerprint> {
+        self.content_fps.get(doc_id)
+    }
+
+    /// Get all node fingerprints for a document.
+    pub fn get_node_fingerprints(&self, doc_id: &str) -> Option<&HashMap<String, NodeFingerprint>> {
+        self.node_fps.get(doc_id)
+    }
+
+    /// Serialize state for persistence.
+    pub fn to_state(&self) -> ChangeDetectorState {
+        ChangeDetectorState {
+            content_fps: self.content_fps.clone(),
+            node_fps: self.node_fps.clone(),
+            processing_versions: self.processing_versions.clone(),
+        }
+    }
+
+    /// Restore state from persistence.
+    pub fn from_state(state: ChangeDetectorState) -> Self {
+        Self {
+            content_fps: state.content_fps,
+            node_fps: state.node_fps,
+            mtimes: HashMap::new(),
+            processing_versions: state.processing_versions,
+            current_processing_version: 1,
+        }
+    }
+}
+
+impl Default for ChangeDetector {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Serializable state for change detector.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ChangeDetectorState {
+    /// Content fingerprints by document ID.
+    pub content_fps: HashMap<String, Fingerprint>,
+    /// Node fingerprints by document ID.
+    pub node_fps: HashMap<String, HashMap<String, NodeFingerprint>>,
+    /// Processing versions by document ID.
+    pub processing_versions: HashMap<String, u32>,
+}
+
+// =============================================================================
+// Helper Functions
+// =============================================================================
+
+/// Compute the overall fingerprint for a tree.
+pub fn compute_tree_fingerprint(tree: &DocumentTree) -> Fingerprint {
+    let root_fp = compute_node_fingerprint(tree, tree.root());
+    root_fp.subtree
+}
+
+/// Compute content fingerprint for a single node.
+fn compute_node_content_fp(tree: &DocumentTree, node_id: NodeId) -> Fingerprint {
+    let node = match tree.get(node_id) {
+        Some(n) => n,
+        None => return Fingerprint::zero(),
+    };
+
+    Fingerprinter::new()
+        .with_str(&node.title)
+        .with_str(&node.content)
+        .with_option_str(node.node_id.as_deref())
+        .into_fingerprint()
+}
+
+/// Compute fingerprint for a node and its subtree.
+fn compute_node_fingerprint(tree: &DocumentTree, node_id: NodeId) -> NodeFingerprint {
+    let node = match tree.get(node_id) {
+        Some(n) => n,
+        None => return NodeFingerprint::zero(),
+    };
+
+    // Content fingerprint
+    let content_fp = Fingerprinter::new()
+        .with_str(&node.title)
+        .with_str(&node.content)
+        .with_option_str(node.node_id.as_deref())
+        .into_fingerprint();
+
+    // Check if leaf node
+    let children = tree.children(node_id);
+    if children.is_empty() {
+        return NodeFingerprint::leaf(content_fp);
+    }
+
+    // Compute subtree fingerprint from children
+    let mut subtree_fp = Fingerprinter::new();
+    subtree_fp.write_fingerprint(&content_fp);
+
+    for child_id in children {
+        let child_fp = compute_node_fingerprint(tree, child_id);
+        subtree_fp.write_fingerprint(&child_fp.subtree);
+    }
+
+    NodeFingerprint::new(content_fp, subtree_fp.into_fingerprint())
+}
+
+/// Compute fingerprints for all nodes in a tree.
+/// Returns a map from string key (for persistence) to NodeFingerprint.
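+///
+/// ```rust,ignore
+/// let fps = compute_all_node_fingerprints(&tree);
+/// for (key, fp) in &fps {
+///     // `content` hashes the node itself; `subtree` folds in its children.
+///     let _ = (key, &fp.content, &fp.subtree);
+/// }
+/// ```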
+pub fn compute_all_node_fingerprints(tree: &DocumentTree) -> HashMap<String, NodeFingerprint> {
+    let mut fingerprints = HashMap::new();
+
+    for node_id in tree.traverse() {
+        if let Some(node) = tree.get(node_id) {
+            let key = node
+                .node_id
+                .clone()
+                .unwrap_or_else(|| format!("node_{:?}", node_id.0));
+            let fp = compute_node_fingerprint(tree, node_id);
+            fingerprints.insert(key, fp);
+        }
+    }
+
+    fingerprints
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use vectorless_document::DocumentTree;
+
+    #[test]
+    fn test_change_detector_new() {
+        let detector = ChangeDetector::new();
+        assert!(detector.content_fps.is_empty());
+    }
+
+    #[test]
+    fn test_needs_reindex_by_hash() {
+        let mut detector = ChangeDetector::new();
+
+        // First time: always needs reindex
+        assert!(detector.needs_reindex_by_hash("doc1", "content"));
+
+        // Record the content
+        detector.record("doc1", "content", None);
+
+        // Same content: no reindex needed
+        assert!(!detector.needs_reindex_by_hash("doc1", "content"));
+
+        // Different content: needs reindex
+        assert!(detector.needs_reindex_by_hash("doc1", "new content"));
+    }
+
+    #[test]
+    fn test_change_set() {
+        let mut changes = ChangeSet::new();
+        assert!(changes.is_empty());
+
+        changes.added.push(NodeChange::new(
+            Some("node1".to_string()),
+            "Title".to_string(),
+            ChangeType::Added,
+        ));
+
+        assert!(!changes.is_empty());
+        assert_eq!(changes.total_changes(), 1);
+    }
+
+    #[test]
+    fn test_processing_version() {
+        let mut detector = ChangeDetector::new().with_processing_version(2);
+        detector.record("doc1", "content", None);
+
+        // Version matches, no reindex needed
+        assert!(!detector.needs_reindex_by_version("doc1"));
+
+        // Create new detector with higher version
+        let detector2 = ChangeDetector::new().with_processing_version(3);
+        assert!(detector2.needs_reindex_by_version("doc1"));
+    }
+
+    #[test]
+    fn test_node_fingerprint() {
+        let mut tree = DocumentTree::new("Root", "root content");
+        let child = tree.add_child(tree.root(), "Child", "child content");
+
+        let root_fp = compute_node_fingerprint(&tree, tree.root());
+        let child_fp = compute_node_fingerprint(&tree, child);
+
+        // Child is a leaf, content == subtree
+        assert_eq!(child_fp.content, child_fp.subtree);
+
+        // Root is not a leaf
+        assert_ne!(root_fp.content, root_fp.subtree);
+    }
+
+    #[test]
+    fn test_fingerprint_serialization() {
+        let mut detector = ChangeDetector::new();
+        let mut tree = DocumentTree::new("Root", "content");
+        tree.add_child(tree.root(), "Section", "section content");
+
+        detector.record_with_tree("doc1", "content", Some(&tree), None);
+
+        let state = detector.to_state();
+        let json = serde_json::to_string(&state).unwrap();
+        let restored: ChangeDetectorState = serde_json::from_str(&json).unwrap();
+
+        assert_eq!(state.content_fps, restored.content_fps);
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/incremental/mod.rs b/vectorless-core/vectorless-index/src/incremental/mod.rs
new file mode 100644
index 00000000..71f28fbe
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/incremental/mod.rs
@@ -0,0 +1,81 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Incremental indexing support.
+//!
+//! This module provides functionality to incrementally update
+//! an existing document index when the source document changes.
+//!
+//! # Features
+//!
+//! - **Fine-grained change detection**: Uses subtree fingerprints to identify
+//!   exactly which nodes changed
+//! - **Processing version tracking**: Automatically reprocesses when algorithm
+//!   versions change
+//! - **Partial updates**: Only reprocess changed nodes
+
+mod detector;
+mod resolver;
+mod updater;
+
+use std::collections::HashMap;
+
+use vectorless_document::DocumentTree;
+
+pub use detector::ChangeDetector;
+pub use resolver::{IndexAction, SkipInfo, resolve_action};
+
+/// Reuse summaries from old tree for unchanged nodes in the new tree.
+///
+/// Uses `ChangeDetector` to find which nodes changed, then copies
+/// summaries from old tree nodes with matching titles that are unchanged.
+///
+/// Returns a map of `title -> summary` for reusable summaries.
+pub fn compute_reusable_summaries(
+    old_tree: &DocumentTree,
+    new_tree: &DocumentTree,
+) -> HashMap<String, String> {
+    let detector = ChangeDetector::new();
+    let changes = detector.detect_changes(old_tree, new_tree);
+
+    let changed_titles: std::collections::HashSet<String> = changes
+        .modified
+        .iter()
+        .chain(changes.restructured.iter())
+        .chain(changes.added.iter())
+        .chain(changes.removed.iter())
+        .map(|c| c.title.clone())
+        .collect();
+
+    let mut reusable = HashMap::new();
+    for node_id in old_tree.traverse() {
+        if let Some(node) = old_tree.get(node_id) {
+            if !changed_titles.contains(&node.title) && !node.summary.is_empty() {
+                reusable.insert(node.title.clone(), node.summary.clone());
+            }
+        }
+    }
+    reusable
+}
+
+/// Apply reusable summaries to a new tree.
+///
+/// For each node in `new_tree` whose title matches a key in `summaries`,
+/// sets the node's summary from the map.
+///
+/// Returns the number of summaries applied.
+pub fn apply_reusable_summaries(
+    new_tree: &mut DocumentTree,
+    summaries: &HashMap<String, String>,
+) -> usize {
+    let mut applied = 0;
+    for node_id in new_tree.traverse() {
+        if let Some(node) = new_tree.get(node_id) {
+            if node.summary.is_empty() {
+                if let Some(summary) = summaries.get(&node.title) {
+                    new_tree.set_summary(node_id, summary);
+                    applied += 1;
+                }
+            }
+        }
+    }
+    applied
+}
diff --git a/vectorless-core/vectorless-index/src/incremental/resolver.rs b/vectorless-core/vectorless-index/src/incremental/resolver.rs
new file mode 100644
index 00000000..c87f571f
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/incremental/resolver.rs
@@ -0,0 +1,105 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Incremental indexing resolver — decides what action to take for a source.
+//!
+//! Three-layer change detection:
+//! 1. **File-level**: content fingerprint → skip if unchanged
+//! 2. **Logic-level**: pipeline config fingerprint → full reprocess if changed
+//! 3. **Node-level**: Merkle subtree diff → incremental update
+
+use tracing::info;
+
+use vectorless_document::DocumentFormat;
+use vectorless_document::DocumentTree;
+use crate::config::PipelineOptions;
+use vectorless_storage::PersistedDocument;
+use vectorless_utils::fingerprint::Fingerprint;
+
+/// Action to take for a source during indexing.
+pub enum IndexAction {
+    /// Skip entirely — content unchanged.
+    Skip(SkipInfo),
+    /// Full index from scratch — new file, logic changed, or force mode.
+    /// If replacing an existing document, `existing_id` contains the old doc ID
+    /// to clean up after the new document is successfully saved.
+    FullIndex {
+        /// Old document ID to remove after successful re-index (if replacing).
+        existing_id: Option<String>,
+    },
+    /// Incremental update — content changed, pipeline unchanged.
+    IncrementalUpdate {
+        /// The old tree to reuse data from.
+        old_tree: DocumentTree,
+        /// The existing document ID (preserved across updates).
+        existing_id: String,
+    },
+}
+
+/// Info returned when a source is skipped.
+pub struct SkipInfo {
+    /// Existing document ID.
+    pub doc_id: String,
+    /// Document name.
+    pub name: String,
+    /// Document format.
+    pub format: DocumentFormat,
+    /// Document description.
+    pub description: Option<String>,
+    /// Page count.
+    pub page_count: Option<usize>,
+}
+
+/// Resolve what action to take for a source file.
+///
+/// This is the core three-layer incremental decision:
+///
+/// 1. **File fingerprint**: Compare file bytes hash with stored `content_fingerprint`.
+///    If equal → `Skip` (nothing changed).
+///
+/// 2. **Logic fingerprint**: Compare pipeline config hash with stored `logic_fingerprint`.
+///    If different → `FullIndex` (processing logic changed, must reprocess everything).
+///
+/// 3. **Incremental**: Content changed but pipeline unchanged → `IncrementalUpdate`
+///    with the old tree for partial reprocessing.
+pub fn resolve_action(
+    file_bytes: &[u8],
+    stored_doc: &PersistedDocument,
+    pipeline_options: &PipelineOptions,
+    format: DocumentFormat,
+) -> IndexAction {
+    let current_fp = Fingerprint::from_bytes(file_bytes);
+
+    // Layer 1: File-level content fingerprint
+    if !stored_doc
+        .meta
+        .needs_reprocessing(&current_fp, pipeline_options.processing_version)
+    {
+        info!("File fingerprint unchanged, skipping");
+        return IndexAction::Skip(SkipInfo {
+            doc_id: stored_doc.meta.id.clone(),
+            name: stored_doc.meta.name.clone(),
+            format,
+            description: stored_doc.meta.description.clone(),
+            page_count: stored_doc.meta.page_count,
+        });
+    }
+
+    // Layer 2: Logic fingerprint (pipeline config changed?)
+    let current_logic_fp = pipeline_options.logic_fingerprint();
+    if stored_doc.meta.logic_fingerprint != current_logic_fp
+        && !stored_doc.meta.logic_fingerprint.is_zero()
+    {
+        info!("Logic fingerprint changed, full reprocess required");
+        return IndexAction::FullIndex {
+            existing_id: Some(stored_doc.meta.id.clone()),
+        };
+    }
+
+    // Layer 3: Content changed, pipeline unchanged → incremental update
+    info!("Content changed, pipeline unchanged → incremental update");
+    IndexAction::IncrementalUpdate {
+        old_tree: stored_doc.tree.clone(),
+        existing_id: stored_doc.meta.id.clone(),
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/incremental/updater.rs b/vectorless-core/vectorless-index/src/incremental/updater.rs
new file mode 100644
index 00000000..3cb838de
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/incremental/updater.rs
@@ -0,0 +1,177 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Partial tree updater for incremental indexing.
+
+use tracing::info;
+
+use vectorless_document::{DocumentTree, NodeId};
+use vectorless_error::Result;
+use crate::parse::RawNode;
+
+use super::detector::ChangeDetector;
+
+/// Result of a partial update.
+#[derive(Debug)]
+pub struct UpdateResult {
+    /// Number of nodes added.
+    pub nodes_added: usize,
+    /// Number of nodes removed.
+    pub nodes_removed: usize,
+    /// Number of nodes modified.
+    pub nodes_modified: usize,
+    /// Number of summaries regenerated.
+    pub summaries_regenerated: usize,
+}
+
+impl Default for UpdateResult {
+    fn default() -> Self {
+        Self {
+            nodes_added: 0,
+            nodes_removed: 0,
+            nodes_modified: 0,
+            summaries_regenerated: 0,
+        }
+    }
+}
+
+/// Partial updater for incremental document updates.
+pub struct PartialUpdater {
+    /// Change detector.
+    detector: ChangeDetector,
+}
+
+impl PartialUpdater {
+    /// Create a new partial updater.
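+    ///
+    /// Sketch of the intended flow (`raw_nodes` would come from a parser):
+    ///
+    /// ```rust,ignore
+    /// let updater = PartialUpdater::new();
+    /// let (new_tree, result) = updater.update(&old_tree, raw_nodes)?;
+    /// println!("{} nodes modified", result.nodes_modified);
+    /// ```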
+    pub fn new() -> Self {
+        Self {
+            detector: ChangeDetector::new(),
+        }
+    }
+
+    /// Get the change detector.
+    pub fn detector(&self) -> &ChangeDetector {
+        &self.detector
+    }
+
+    /// Get mutable change detector.
+    pub fn detector_mut(&mut self) -> &mut ChangeDetector {
+        &mut self.detector
+    }
+
+    /// Update a tree with new raw nodes.
+    ///
+    /// This performs a partial update by:
+    /// 1. Detecting changes between old and new content
+    /// 2. Updating only the affected subtrees
+    /// 3. Regenerating summaries for changed nodes
+    pub fn update(
+        &self,
+        old_tree: &DocumentTree,
+        new_raw_nodes: Vec<RawNode>,
+    ) -> Result<(DocumentTree, UpdateResult)> {
+        let mut result = UpdateResult::default();
+
+        // Build new tree from raw nodes
+        let new_tree = self.build_tree_from_raw(new_raw_nodes)?;
+
+        // Detect changes
+        let changes = self.detector.detect_changes(old_tree, &new_tree);
+
+        info!(
+            "Detected changes: {} added, {} removed, {} modified",
+            changes.added.len(),
+            changes.removed.len(),
+            changes.modified.len()
+        );
+
+        result.nodes_added = changes.added.len();
+        result.nodes_removed = changes.removed.len();
+        result.nodes_modified = changes.modified.len();
+
+        // For now, return the new tree
+        // In a full implementation, we would:
+        // 1. Preserve unchanged summaries
+        // 2. Only regenerate summaries for changed nodes
+        // 3. Merge preserved and new content
+
+        Ok((new_tree, result))
+    }
+
+    /// Build a tree from raw nodes (simple implementation).
+    fn build_tree_from_raw(&self, raw_nodes: Vec<RawNode>) -> Result<DocumentTree> {
+        // This is a simplified implementation
+        // In production, use the BuildStage
+
+        let mut tree = DocumentTree::new("Document", "");
+
+        // Stack to track parent nodes at each level
+        let mut level_stack: Vec<Option<NodeId>> = vec![Some(tree.root())];
+
+        for raw in raw_nodes {
+            let level = raw.level;
+
+            // Ensure stack has enough slots
+            while level_stack.len() <= level {
+                level_stack.push(None);
+            }
+
+            // Find parent
+            let parent_id = (0..level)
+                .rev()
+                .find_map(|l| level_stack.get(l).copied().flatten())
+                .unwrap_or(tree.root());
+
+            // Create node
+            let content = if raw.content.is_empty() {
+                ""
+            } else {
+                &raw.content
+            };
+            let node_id = tree.add_child(parent_id, &raw.title, content);
+
+            // Set line indices
+            tree.set_line_indices(node_id, raw.line_start, raw.line_end);
+
+            // Set page if available
+            if let Some(page) = raw.page {
+                tree.set_page_boundaries(node_id, page, page);
+            }
+
+            // Set token count if available
+            if let Some(count) = raw.token_count {
+                if count > 0 {
+                    tree.set_token_count(node_id, count);
+                }
+            }
+
+            // Update stack
+            if level < level_stack.len() {
+                level_stack[level] = Some(node_id);
+            }
+
+            // Clear deeper levels
+            for i in (level + 1)..level_stack.len() {
+                level_stack[i] = None;
+            }
+        }
+
+        Ok(tree)
+    }
+
+    /// Check if reindexing is needed.
+    pub fn needs_reindex(&self, doc_id: &str, content: &str) -> bool {
+        self.detector.needs_reindex_by_hash(doc_id, content)
+    }
+
+    /// Record document state after indexing.
+    pub fn record(&mut self, doc_id: &str, content: &str) {
+        self.detector.record(doc_id, content, None);
+    }
+}
+
+impl Default for PartialUpdater {
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/lib.rs b/vectorless-core/vectorless-index/src/lib.rs
new file mode 100644
index 00000000..8ea71cab
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/lib.rs
@@ -0,0 +1,73 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Index Pipeline module.
+//!
+//! This module provides a modular, extensible document indexing pipeline.
+//!
+//! # Architecture
+//!
+//! ```text
+//! Priority 10: ┌──────────┐
+//!              │  Parse   │ Parse document into raw nodes
+//!              └────┬─────┘
+//! Priority 20: ┌────▼─────┐
+//!              │  Build   │ Construct tree + thinning (with content merge)
+//!              └────┬─────┘
+//! Priority 22: ┌────▼─────┐
+//!              │ Validate │ Tree integrity checks (optional)
+//!              └────┬─────┘
+//! Priority 25: ┌────▼─────┐
+//!              │  Split   │ Split oversized leaf nodes (optional)
+//!              └────┬─────┘
+//! Priority 30: ┌────▼─────┐
+//!              │ Enhance  │ LLM summaries (when client available)
+//!              └────┬─────┘
+//! Priority 40: ┌────▼─────┐
+//!              │  Enrich  │ Metadata + cross-references
+//!              └────┬─────┘
+//! Priority 45: ┌────▼──────────┐
+//!              │ Reasoning Idx │ Pre-computed reasoning index
+//!              └────┬──────────┘
+//! Priority 50: ┌────▼──────────┐
+//!              │ Navigation Idx│ Agent navigation index
+//!              └────┬──────────┘
+//! Priority 60: ┌────▼──────┐
+//!              │ Optimize  │ Final tree optimization
+//!              └───────────┘
+//! ```
+//!
+//! Checkpointing is available when `PipelineOptions::checkpoint_dir` is set.
+//! State is saved after each stage group and resumed on restart.
+//!
+//! # Usage
+//!
+//! ```rust,ignore
+//! use vectorless::index::{PipelineExecutor, IndexInput, PipelineOptions};
+//! use vectorless::index::summary::SummaryStrategy;
+//!
+//! let options = PipelineOptions::new()
+//!     .with_summary_strategy(SummaryStrategy::selective(100, true));
+//!
+//! let result = PipelineExecutor::new()
+//!     .with_options(options)
+//!     .execute(input)
+//!     .await?;
+//! ```

+pub mod config;
+pub mod incremental;
+pub mod parse;
+pub mod pipeline;
+pub mod stages;
+pub mod summary;
+
+// Re-export main types from pipeline
+pub use pipeline::{IndexInput, IndexMetrics, PipelineExecutor, PipelineResult};
+
+// Re-export config types
+pub use vectorless_document::ReasoningIndexConfig;
+pub use config::{IndexMode, PipelineOptions, ThinningConfig};
+
+// Re-export summary
+pub use summary::SummaryStrategy;
diff --git a/vectorless-core/vectorless-index/src/parse/markdown/config.rs b/vectorless-core/vectorless-index/src/parse/markdown/config.rs
new file mode 100644
index 00000000..7a013f5f
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/markdown/config.rs
@@ -0,0 +1,219 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Configuration options for the Markdown parser.
+
+/// Markdown parser configuration.
+///
+/// Controls parsing behavior, content extraction, and extension support.
+///
+/// # Example
+///
+/// ```rust
+/// use vectorless::parser::markdown::MarkdownConfig;
+///
+/// // Default GFM configuration
+/// let config = MarkdownConfig::default();
+///
+/// // Strict CommonMark
+/// let config = MarkdownConfig::commonmark();
+///
+/// // Documentation-focused
+/// let config = MarkdownConfig::documentation();
+///
+/// // Custom configuration
+/// let config = MarkdownConfig {
+///     max_heading_level: 3,
+///     include_code_blocks: false,
+///     ..Default::default()
+/// };
+/// ```
+#[derive(Debug, Clone)]
+pub struct MarkdownConfig {
+    // ============================================================
+    // Parsing Options
+    // ============================================================
+    /// Enable GitHub Flavored Markdown extensions.
+    ///
+    /// Includes: tables, strikethrough, task lists, autolinks.
+    /// Default: `true`
+    pub enable_gfm: bool,
+
+    /// Enable footnotes extension (`[^1]` syntax).
+    /// Default: `false`
+    pub enable_footnotes: bool,
+
+    /// Enable definition lists.
+    /// Default: `false`
+    pub enable_definition_lists: bool,
+
+    /// Enable superscript/subscript (`^sup^`, `~sub~`).
+    /// Default: `false`
+    pub enable_super_sub: bool,
+
+    /// Maximum heading level to parse (1-6).
+    /// Headings above this level are treated as content.
+    /// Default: `6`
+    pub max_heading_level: usize,
+
+    /// Minimum heading level to create a node.
+    /// Headings below this level are treated as content.
+    /// Default: `1`
+    pub min_heading_level: usize,
+
+    // ============================================================
+    // Content Extraction
+    // ============================================================
+    /// Include code blocks in node content.
+    /// Default: `true`
+    pub include_code_blocks: bool,
+
+    /// Include images (alt text) in content.
+    /// Default: `true`
+    pub include_images: bool,
+
+    /// Include links in content.
+    /// Default: `true`
+    pub include_links: bool,
+
+    /// Include tables in content.
+    /// Default: `true`
+    pub include_tables: bool,
+
+    // ============================================================
+    // Frontmatter
+    // ============================================================
+    /// Parse YAML frontmatter (`---` delimiters).
+    /// Default: `true`
+    pub parse_frontmatter: bool,
+
+    /// Parse TOML frontmatter (`+++` delimiters).
+    /// Default: `false`
+    pub parse_toml_frontmatter: bool,
+
+    /// Fields to extract from frontmatter as metadata.
+    /// Default: `["title", "description"]`
+    pub frontmatter_fields: Vec<String>,
+
+    // ============================================================
+    // Advanced Options
+    // ============================================================
+    /// Minimum characters required for a heading title to be valid.
+    /// Headings with shorter titles are skipped.
+    /// Default: `1`
+    pub min_heading_chars: usize,
+
+    /// Create an implicit root node for content before the first heading.
+    /// Default: `true`
+    pub create_preamble_node: bool,
+
+    /// Title for the preamble node (if created).
+    /// Default: `"Introduction"`
+    pub preamble_title: String,
+}
+
+impl Default for MarkdownConfig {
+    fn default() -> Self {
+        Self {
+            // Parsing options - GFM by default (most common)
+            enable_gfm: true,
+            enable_footnotes: false,
+            enable_definition_lists: false,
+            enable_super_sub: false,
+            max_heading_level: 6,
+            min_heading_level: 1,
+
+            // Content extraction - include all by default
+            include_code_blocks: true,
+            include_images: true,
+            include_links: true,
+            include_tables: true,
+
+            // Frontmatter
+            parse_frontmatter: true,
+            parse_toml_frontmatter: false,
+            frontmatter_fields: vec!["title".into(), "description".into()],
+
+            // Advanced
+            min_heading_chars: 1,
+            create_preamble_node: true,
+            preamble_title: "Introduction".into(),
+        }
+    }
+}
+
+impl MarkdownConfig {
+    /// Create a new configuration with defaults.
+    #[must_use]
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Configuration optimized for GitHub Flavored Markdown.
+    ///
+    /// Enables GFM extensions (tables, strikethrough, task lists).
+    #[must_use]
+    pub fn gfm() -> Self {
+        Self::default()
+    }
+
+    /// Configuration for strict CommonMark (no extensions).
+    #[must_use]
+    pub fn commonmark() -> Self {
+        Self {
+            enable_gfm: false,
+            ..Self::default()
+        }
+    }
+
+    /// Configuration optimized for documentation sites.
+    ///
+    /// Enables footnotes and definition lists.
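+    ///
+    /// ```rust,ignore
+    /// let config = MarkdownConfig::documentation();
+    /// assert!(config.enable_footnotes);
+    /// assert!(config.enable_definition_lists);
+    /// ```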
+    #[must_use]
+    pub fn documentation() -> Self {
+        Self {
+            enable_footnotes: true,
+            enable_definition_lists: true,
+            ..Self::default()
+        }
+    }
+
+    /// Configuration that excludes code blocks from content.
+    ///
+    /// Useful when code blocks are not relevant for retrieval.
+    #[must_use]
+    pub fn no_code_blocks() -> Self {
+        Self {
+            include_code_blocks: false,
+            ..Self::default()
+        }
+    }
+
+    /// Set the maximum heading level.
+    #[must_use]
+    pub fn with_max_heading_level(mut self, level: usize) -> Self {
+        self.max_heading_level = level.clamp(1, 6);
+        self
+    }
+
+    /// Enable or disable code blocks in content.
+    #[must_use]
+    pub fn with_code_blocks(mut self, include: bool) -> Self {
+        self.include_code_blocks = include;
+        self
+    }
+
+    /// Enable or disable frontmatter parsing.
+    #[must_use]
+    pub fn with_frontmatter(mut self, parse: bool) -> Self {
+        self.parse_frontmatter = parse;
+        self
+    }
+
+    /// Set the preamble node title.
+    #[must_use]
+    pub fn with_preamble_title(mut self, title: impl Into<String>) -> Self {
+        self.preamble_title = title.into();
+        self
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/parse/markdown/frontmatter.rs b/vectorless-core/vectorless-index/src/parse/markdown/frontmatter.rs
new file mode 100644
index 00000000..65f7cda0
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/markdown/frontmatter.rs
@@ -0,0 +1,219 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Frontmatter extraction for Markdown documents.
+//!
+//! Supports YAML (`---`) and TOML (`+++`) delimited frontmatter.
+
+use std::collections::HashMap;
+
+/// Parsed frontmatter data.
+#[derive(Debug, Clone, Default)]
+pub struct Frontmatter {
+    /// Extracted key-value pairs.
+    pub fields: HashMap<String, String>,
+}
+
+impl Frontmatter {
+    /// Create an empty frontmatter.
+    #[must_use]
+    pub fn new() -> Self {
+        Self {
+            fields: HashMap::new(),
+        }
+    }
+
+    /// Parse frontmatter from raw content.
+    ///
+    /// Returns `Some((frontmatter, remaining_content))` if valid frontmatter is found.
+    /// Returns `None` if no frontmatter delimiters are present.
+    fn parse<'a>(content: &'a str, delimiter: &str) -> Option<(Self, &'a str)> {
+        // Check if content starts with delimiter
+        let delim_line = format!("{}\n", delimiter);
+        if !content.starts_with(&delim_line) {
+            return None;
+        }
+
+        // Find closing delimiter
+        let content_after_open = &content[delimiter.len() + 1..];
+        let close_pattern = format!("\n{}\n", delimiter);
+
+        if let Some(end_pos) = content_after_open.find(&close_pattern) {
+            let frontmatter_text = &content_after_open[..end_pos];
+            let remaining = &content_after_open[end_pos + close_pattern.len()..];
+
+            let fm = Self::parse_yaml(frontmatter_text);
+            Some((fm, remaining))
+        } else {
+            None
+        }
+    }
+
+    /// Parse YAML-style frontmatter (simple key: value extraction).
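+    ///
+    /// Handles one `key: value` pair per line, e.g.:
+    ///
+    /// ```text
+    /// title: My Document
+    /// description: "Quoted values are unwrapped"
+    /// ```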
+    fn parse_yaml(text: &str) -> Self {
+        let mut fields = HashMap::new();
+
+        for line in text.lines() {
+            let line = line.trim();
+
+            // Skip empty lines and comments
+            if line.is_empty() || line.starts_with('#') {
+                continue;
+            }
+
+            // Parse "key: value" or "key: "quoted value""
+            if let Some((key, value)) = line.split_once(':') {
+                let key = key.trim().to_string();
+                let value = value.trim();
+
+                // Remove quotes if present
+                let value = if (value.starts_with('"') && value.ends_with('"'))
+                    || (value.starts_with('\'') && value.ends_with('\''))
+                {
+                    value[1..value.len() - 1].to_string()
+                } else {
+                    value.to_string()
+                };
+
+                fields.insert(key, value);
+            }
+        }
+
+        Self { fields }
+    }
+
+    /// Get a field value by key.
+    #[must_use]
+    pub fn get(&self, key: &str) -> Option<&String> {
+        self.fields.get(key)
+    }
+
+    /// Check if a field exists.
+    #[must_use]
+    pub fn contains(&self, key: &str) -> bool {
+        self.fields.contains_key(key)
+    }
+
+    /// Get the title field.
+    #[must_use]
+    pub fn title(&self) -> Option<&String> {
+        self.get("title")
+    }
+
+    /// Get the description field.
+    #[must_use]
+    pub fn description(&self) -> Option<&String> {
+        self.get("description")
+    }
+}
+
+/// Extract frontmatter from Markdown content.
+///
+/// Returns a tuple of (frontmatter, remaining_content).
+/// If no frontmatter is found, returns `(None, content)`.
+///
+/// # Supported Formats
+///
+/// - YAML: `---\nkey: value\n---`
+/// - TOML: `+++\nkey = "value"\n+++`
+#[must_use]
+pub fn extract_frontmatter(
+    content: &str,
+    parse_yaml: bool,
+    parse_toml: bool,
+) -> (Option<Frontmatter>, &str) {
+    // Try YAML frontmatter first
+    if parse_yaml {
+        if let Some((fm, remaining)) = Frontmatter::parse(content, "---") {
+            return (Some(fm), remaining);
+        }
+    }
+
+    // Try TOML frontmatter
+    if parse_toml {
+        if let Some((fm, remaining)) = Frontmatter::parse(content, "+++") {
+            return (Some(fm), remaining);
+        }
+    }
+
+    // No frontmatter found
+    (None, content)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_yaml_frontmatter() {
+        let content = r#"---
+title: My Document
+description: A test document
+---
+
+# Content
+
+Body text."#;
+
+        let (fm, remaining) = extract_frontmatter(content, true, false);
+
+        assert!(fm.is_some());
+        let fm = fm.unwrap();
+        assert_eq!(fm.title(), Some(&"My Document".to_string()));
+        assert_eq!(fm.description(), Some(&"A test document".to_string()));
+        assert!(remaining.trim_start().starts_with("# Content"));
+    }
+
+    #[test]
+    fn test_extract_quoted_values() {
+        let content = r#"---
+title: "Quoted Title"
+description: 'Single quoted'
+---
+
+Content"#;
+
+        let (fm, _) = extract_frontmatter(content, true, false);
+
+        assert!(fm.is_some());
+        let fm = fm.unwrap();
+        assert_eq!(fm.title(), Some(&"Quoted Title".to_string()));
+        assert_eq!(fm.description(), Some(&"Single quoted".to_string()));
+    }
+
+    #[test]
+    fn test_no_frontmatter() {
+        let content = "# No Frontmatter\n\nJust content.";
+
+        let (fm, remaining) = extract_frontmatter(content, true, false);
+
+        assert!(fm.is_none());
+        assert_eq!(remaining, content);
+    }
+
+    #[test]
+    fn test_incomplete_frontmatter() {
+        let content = "---\ntitle: Test\n\nNo closing delimiter";
+
+        let (fm, remaining) = extract_frontmatter(content, true, false);
+
+        // Should not match incomplete frontmatter
+        assert!(fm.is_none());
+        assert_eq!(remaining, content);
+    }
+
+    #[test]
+    fn test_toml_frontmatter() {
+        let content = r#"+++
+title = "TOML Doc"
++++
+
+# Content"#;
+
+        let (fm, remaining) = extract_frontmatter(content, false, true);
+
+        // Note: Our simple parser treats TOML as YAML-like
+        assert!(fm.is_some());
+        assert!(remaining.trim_start().starts_with("# Content"));
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/parse/markdown/mod.rs b/vectorless-core/vectorless-index/src/parse/markdown/mod.rs
new file mode 100644
index 00000000..168f3645
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/markdown/mod.rs
@@ -0,0 +1,30 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Production-ready Markdown parser module.
+//!
+//! This module provides a robust Markdown parser built on `pulldown-cmark`,
+//! supporting CommonMark, GFM extensions, and frontmatter extraction.
+//!
+//! # Features
+//!
+//! - **CommonMark compliant** - Full CommonMark specification support
+//! - **GFM extensions** - Tables, strikethrough, task lists, autolinks
+//! - **Frontmatter** - YAML and TOML frontmatter parsing
+//! - **Configurable** - Fine-grained control over parsing behavior
+//!
+//! # Example
+//!
+//! ```rust
+//! use vectorless::parser::markdown::{MarkdownParser, MarkdownConfig};
+//!
+//! let parser = MarkdownParser::new();
+//! // or with custom config:
+//! // let parser = MarkdownParser::with_config(MarkdownConfig::gfm());
+//! ```
+
+mod config;
+mod frontmatter;
+mod parser;
+
+pub use config::MarkdownConfig;
+pub use parser::MarkdownParser;
diff --git a/vectorless-core/vectorless-index/src/parse/markdown/parser.rs b/vectorless-core/vectorless-index/src/parse/markdown/parser.rs
new file mode 100644
index 00000000..a511e50c
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/markdown/parser.rs
@@ -0,0 +1,601 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Main Markdown parser implementation.
+
+use pulldown_cmark::Options;
+use std::collections::HashMap;
+use std::path::Path;
+
+use vectorless_error::Result;
+use crate::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode};
+use vectorless_utils::estimate_tokens;
+
+use super::config::MarkdownConfig;
+use super::frontmatter;
+
+/// Production-ready Markdown parser.
+///
+/// Built on `pulldown-cmark` for robust CommonMark/GFM parsing.
+///
+/// # Features
+///
+/// - CommonMark compliant
+/// - GitHub Flavored Markdown (GFM) extensions
+/// - YAML/TOML frontmatter extraction
+/// - Configurable parsing behavior
+///
+/// # Example
+///
+/// ```rust
+/// use vectorless::parser::markdown::MarkdownParser;
+/// use vectorless::parser::DocumentParser;
+///
+/// # #[tokio::main]
+/// # async fn main() -> vectorless::Result<()> {
+/// let parser = MarkdownParser::new();
+/// let result = parser.parse("# Title\n\nContent").await?;
+///
+/// println!("Found {} nodes", result.node_count());
+/// # Ok(())
+/// # }
+/// ```
+#[derive(Debug, Clone)]
+pub struct MarkdownParser {
+    config: MarkdownConfig,
+}
+
+impl Default for MarkdownParser {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl MarkdownParser {
+    /// Create a new parser with default (GFM) configuration.
+    #[must_use]
+    pub fn new() -> Self {
+        Self::with_config(MarkdownConfig::default())
+    }
+
+    /// Create a parser with custom configuration.
+    #[must_use]
+    pub fn with_config(config: MarkdownConfig) -> Self {
+        Self { config }
+    }
+
+    /// Build pulldown-cmark options from configuration.
+    fn build_options(&self) -> Options {
+        let mut options = Options::empty();
+
+        // GFM extensions
+        if self.config.enable_gfm {
+            options.insert(Options::ENABLE_TABLES);
+            options.insert(Options::ENABLE_STRIKETHROUGH);
+            options.insert(Options::ENABLE_TASKLISTS);
+            options.insert(Options::ENABLE_SMART_PUNCTUATION);
+        }
+
+        // Footnotes
+        if self.config.enable_footnotes {
+            options.insert(Options::ENABLE_FOOTNOTES);
+        }
+
+        // Definition lists
+        if self.config.enable_definition_lists {
+            options.insert(Options::ENABLE_DEFINITION_LIST);
+        }
+
+        // Note: pulldown-cmark 0.12 doesn't have ENABLE_SUPERSCRIPT/ENABLE_SUBSCRIPT
+        // Super/subscript handling would require custom processing if needed
+
+        options
+    }
+
+    /// Parse Markdown content and extract nodes.
+    fn extract_nodes(
+        &self,
+        content: &str,
+    ) -> (
+        Vec<RawNode>,
+        Option<HashMap<String, String>>,
+    ) {
+        // 1. Extract frontmatter (if present)
+        let (fm, remaining_content) = frontmatter::extract_frontmatter(
+            content,
+            self.config.parse_frontmatter,
+            self.config.parse_toml_frontmatter,
+        );
+
+        // 2. Build parser options
+        let options = self.build_options();
+
+        // 3. Parse with pulldown-cmark
+        let parser = pulldown_cmark::Parser::new_ext(remaining_content, options);
+
+        // 4. Extract raw nodes from events
+        let nodes = self.extract_nodes_from_events(parser);
+
+        // 5. Extract frontmatter fields
+        let fm_fields = fm.map(|f| {
+            self.config
+                .frontmatter_fields
+                .iter()
+                .filter_map(|field| f.get(field).map(|v| (field.clone(), v.clone())))
+                .collect()
+        });
+
+        (nodes, fm_fields)
+    }
+
+    /// Extract RawNodes from pulldown-cmark event iterator.
+    fn extract_nodes_from_events<'a, E>(&self, events: E) -> Vec<RawNode>
+    where
+        E: Iterator<Item = pulldown_cmark::Event<'a>>,
+    {
+        use pulldown_cmark::{CodeBlockKind, Event, Tag, TagEnd};
+
+        let mut nodes: Vec<RawNode> = Vec::new();
+        let mut current: Option<InProgressNode> = None;
+        let mut content_buffer = String::new();
+        let mut title_buffer = String::new();
+        let mut preamble_content = String::new();
+        let mut current_line: usize = 1;
+        let mut in_heading = false;
+        let mut skip_content = false;
+
+        for event in events {
+            match event {
+                Event::Start(tag) => match tag {
+                    Tag::Heading { level, .. } => {
+                        let level_num = level as usize;
+
+                        // Check if this heading level should be processed as a node
+                        if level_num > self.config.max_heading_level
+                            || level_num < self.config.min_heading_level
+                        {
+                            // Treat as content - add the heading marker to content
+                            in_heading = false;
+                            skip_content = false;
+                            content_buffer.push_str(&format!("{} ", "#".repeat(level_num)));
+                            continue;
+                        }
+
+                        // Finish any current node first
+                        if let Some(node) = finish_current_node(
+                            &mut current,
+                            &mut content_buffer,
+                            &mut preamble_content,
+                            &mut nodes,
+                            &self.config,
+                            current_line,
+                        ) {
+                            nodes.push(node);
+                        }
+
+                        // Start new heading
+                        in_heading = true;
+                        title_buffer.clear();
+
+                        current = Some(InProgressNode {
+                            title: String::new(),
+                            level: level_num,
+                            line_start: current_line,
+                        });
+                    }
+                    Tag::CodeBlock(kind) => {
+                        if self.config.include_code_blocks {
+                            match kind {
+                                CodeBlockKind::Fenced(lang) => {
+                                    content_buffer.push_str("\n```");
+                                    content_buffer.push_str(&lang);
+                                    content_buffer.push('\n');
+                                }
+                                CodeBlockKind::Indented => {
+                                    content_buffer.push_str("\n```\n");
+                                }
+                            }
+                        } else {
+                            skip_content = true;
+                        }
+                    }
+                    _ => {}
+                },
+                Event::End(tag) => match tag {
+                    TagEnd::Heading(_) => {
+                        if in_heading {
+                            in_heading = false;
+                            if let Some(ref mut node) = current {
+                                node.title = title_buffer.trim().to_string();
+                                title_buffer.clear();
+
+                                if node.title.chars().count() < self.config.min_heading_chars {
+                                    current = None;
+                                }
+                            }
+                        }
+                    }
+                    TagEnd::CodeBlock => {
+                        skip_content = false;
+                        if self.config.include_code_blocks {
+                            content_buffer.push_str("\n```\n");
+                        }
+                    }
+                    _ => {}
+                },
+                Event::Text(text) => {
+                    current_line += text.chars().filter(|&c| c == '\n').count();
+
+                    if in_heading {
+                        title_buffer.push_str(&text);
+                    } else if !skip_content {
+                        content_buffer.push_str(&text);
+                    }
+                }
+                Event::Code(code) => {
+                    if !in_heading && !skip_content {
+                        content_buffer.push('`');
+                        content_buffer.push_str(&code);
+                        content_buffer.push('`');
+                    }
+                }
+                Event::Html(html) | Event::InlineHtml(html) => {
+                    if !skip_content {
+                        content_buffer.push_str(&html);
+                        current_line += html.chars().filter(|&c| c == '\n').count();
+                    }
+                }
+                Event::SoftBreak => {
+                    if !skip_content {
+                        content_buffer.push(' ');
+                    }
+                }
+                Event::HardBreak => {
+                    if !skip_content {
+                        content_buffer.push('\n');
+                        current_line += 1;
+                    }
+                }
+                Event::Rule => {
+                    if !skip_content {
+                        content_buffer.push_str("\n\n---\n\n");
+                    }
+                }
+                _ => {}
+            }
+        }
+
+        // Finish any remaining node
+        if let Some(node) = finish_current_node(
+            &mut current,
+            &mut content_buffer,
+            &mut preamble_content,
+            &mut nodes,
+            &self.config,
+            current_line,
+        ) {
+            nodes.push(node);
+        }
+
+        // Handle document with no headings (only preamble)
+        if nodes.is_empty()
+            && self.config.create_preamble_node
+            && (!content_buffer.trim().is_empty() || !preamble_content.is_empty())
+        {
+            // Use preamble_content if available, otherwise use content_buffer
+            let content = if preamble_content.is_empty() {
+                content_buffer.trim()
+            } else {
+                preamble_content.trim()
+            };
+            nodes.push(RawNode {
+                title: self.config.preamble_title.clone(),
+                level: 0,
+                content: content.to_string(),
+                line_start: 1,
+                line_end: current_line,
+                page: None,
+                token_count: Some(estimate_tokens(content)),
+                total_token_count: None,
+            });
+        }
+
+        nodes
+    }
+}
+
+/// In-progress node being constructed.
+struct InProgressNode {
+    title: String,
+    level: usize,
+    line_start: usize,
+}
+
+/// Finish the current node and return it if valid.
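+///
+/// Content seen before the first heading is stashed as preamble and merged
+/// into the first heading node; documents with no headings at all fall back
+/// to a single preamble node (see `extract_nodes_from_events`).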
+#[allow(clippy::too_many_arguments)]
+fn finish_current_node(
+    current: &mut Option<InProgressNode>,
+    content_buffer: &mut String,
+    preamble_content: &mut String,
+    nodes: &mut Vec<RawNode>,
+    config: &MarkdownConfig,
+    current_line: usize,
+) -> Option<RawNode> {
+    // Handle preamble content (content before first heading)
+    if nodes.is_empty() && !content_buffer.trim().is_empty() {
+        if config.create_preamble_node {
+            let content = content_buffer.trim();
+            *preamble_content = content.to_string();
+        }
+        // Clear the buffer after storing as preamble to avoid duplication
+        content_buffer.clear();
+    }
+
+    // Finish current heading node
+    if let Some(node) = current.take() {
+        let content = content_buffer.trim().to_string();
+
+        // If this is the first heading and we have preamble content,
+        // prepend it to this node's content
+        let final_content = if nodes.is_empty() && !preamble_content.is_empty() {
+            let combined = format!("{}\n\n{}", preamble_content, content);
+            preamble_content.clear();
+            combined
+        } else {
+            content
+        };
+
+        content_buffer.clear();
+
+        return Some(RawNode {
+            title: node.title,
+            level: node.level,
+            content: final_content.trim().to_string(),
+            line_start: node.line_start,
+            line_end: current_line,
+            page: None,
+            token_count: Some(estimate_tokens(&final_content)),
+            total_token_count: None,
+        });
+    }
+
+    content_buffer.clear();
+    None
+}
+
+impl MarkdownParser {
+    /// Parse Markdown content and return result.
+    pub async fn parse(&self, content: &str) -> Result<ParseResult> {
+        let line_count = content.lines().count();
+        let (nodes, fm_fields) = self.extract_nodes(content);
+
+        // Build metadata
+        let mut meta = DocumentMeta {
+            name: String::new(),
+            format: DocumentFormat::Markdown,
+            page_count: None,
+            line_count,
+            source_path: None,
+            description: None,
+        };
+
+        // Apply frontmatter fields
+        if let Some(fields) = fm_fields {
+            if let Some(title) = fields.get("title") {
+                meta.name = title.clone();
+            }
+            if let Some(desc) = fields.get("description") {
+                meta.description = Some(desc.clone());
+            }
+        }
+
+        Ok(ParseResult::new(meta, nodes))
+    }
+
+    /// Parse a Markdown file.
+    pub async fn parse_file(&self, path: &Path) -> Result<ParseResult> {
+        let content = tokio::fs::read_to_string(path)
+            .await
+            .map_err(|e| vectorless_error::Error::Parse(format!("Failed to read file: {}", e)))?;
+
+        let mut result = self.parse(&content).await?;
+
+        // Extract document name from filename (if not set by frontmatter)
+        if result.meta.name.is_empty() {
+            if let Some(stem) = path.file_stem() {
+                result.meta.name = stem.to_string_lossy().to_string();
+            }
+        }
+        result.meta.source_path = Some(path.to_string_lossy().to_string());
+
+        Ok(result)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_parse_simple() {
+        let parser = MarkdownParser::new();
+        let content = "# Title\n\nContent here.";
+        let result = parser.parse(content).await.unwrap();
+
+        assert!(!result.nodes.is_empty());
+        assert!(
+            result
+                .nodes
+                .iter()
+                .any(|n| n.title == "Title" && n.level == 1)
+        );
+    }
+
+    #[tokio::test]
+    async fn test_parse_nested() {
+        let parser = MarkdownParser::new();
+        let content = r#"# Main
+
+## Section 1
+
+Content 1.
+
+## Section 2
+
+Content 2."#;
+        let result = parser.parse(content).await.unwrap();
+
+        let heading_nodes: Vec<_> = result.nodes.iter().filter(|n| n.level > 0).collect();
+        assert!(heading_nodes.len() >= 3);
+    }
+
+    #[tokio::test]
+    async fn test_parse_code_blocks() {
+        let parser = MarkdownParser::new();
+        let content = r#"# Code Example
+
+```rust
+fn main() {
+    println!("Hello");
+}
+```"#;
+        let result = parser.parse(content).await.unwrap();
+
+        // Should have the heading node
+        let heading_node = result.nodes.iter().find(|n| n.title == "Code Example");
+        assert!(heading_node.is_some());
+
+        // Code block should be in content
+        assert!(heading_node.unwrap().content.contains("```rust"));
+    }
+
+    #[tokio::test]
+    async fn test_skip_headers_in_code_blocks() {
+        let parser = MarkdownParser::new();
+        let content = r#"# Title 1
+
+Content before code.
+
+```
+# This is not a header
+# Also not a header
+```
+
+## Title 1.1
+
+Content after code."#;
+
+        let result = parser.parse(content).await.unwrap();
+
+        // Should only have Title 1 and Title 1.1 as heading nodes
+        let heading_titles: Vec<_> = result
+            .nodes
+            .iter()
+            .filter(|n| n.level > 0)
+            .map(|n| n.title.as_str())
+            .collect();
+
+        assert!(heading_titles.contains(&"Title 1"));
+        assert!(heading_titles.contains(&"Title 1.1"));
+        assert!(!heading_titles.contains(&"This is not a header"));
+    }
+
+    #[tokio::test]
+    async fn test_frontmatter_extraction() {
+        let parser = MarkdownParser::new();
+        let content = r#"---
+title: My Document
+description: A test document
+---
+
+# Content
+
+Body text."#;
+
+        let result = parser.parse(content).await.unwrap();
+
+        assert_eq!(result.meta.name, "My Document");
+        assert_eq!(result.meta.description, Some("A test document".to_string()));
+    }
+
+    #[tokio::test]
+    async fn test_gfm_table() {
+        let parser = MarkdownParser::new();
+        let content = r#"# Table Example
+
+| Name | Age |
+|------|-----|
+| Alice | 30 |
+| Bob | 25 |"#;
+
+        let result = parser.parse(content).await.unwrap();
+
+        let table_node = result.nodes.iter().find(|n| n.title == "Table Example");
+        assert!(table_node.is_some());
+        assert!(table_node.unwrap().content.contains("Alice"));
+    }
+
+    #[tokio::test]
+    async fn test_max_heading_level_config() {
+        let config = MarkdownConfig {
+            max_heading_level: 2,
+            ..Default::default()
+        };
+        let parser = MarkdownParser::with_config(config);
+
+        let content = r#"# H1
+
+## H2
+
+### H3
+
+#### H4"#;
+
+        let result = parser.parse(content).await.unwrap();
+
+        // H3 and H4 should not be separate nodes
+        let heading_nodes: Vec<_> = result.nodes.iter().filter(|n| n.level > 0).collect();
+        assert_eq!(heading_nodes.len(), 2);
+    }
+
+    #[tokio::test]
+    async fn test_no_code_blocks_config() {
+        let config = MarkdownConfig::no_code_blocks();
+        let parser = MarkdownParser::with_config(config);
+
+        let content = r#"# Example
+
+```rust
+let x = 1;
+```
+
+Some text."#;
+
+        let result = parser.parse(content).await.unwrap();
+
+        let node = result.nodes.iter().find(|n| n.title == "Example").unwrap();
+        // Code block should not be in content
+        assert!(!node.content.contains("let x = 1"));
+        // But regular text should be
+        assert!(node.content.contains("Some text"));
+    }
+
+    #[tokio::test]
+    async fn test_empty_document() {
+        let parser = MarkdownParser::new();
+        let result = parser.parse("").await.unwrap();
+
+        assert!(result.nodes.is_empty());
+    }
+
+    #[tokio::test]
+    async fn test_document_with_no_headings() {
+        let parser = MarkdownParser::new();
+        let content = "Just some text\nwith no headings.";
+
+        let result = parser.parse(content).await.unwrap();
+
+        assert_eq!(result.nodes.len(), 1);
+        assert_eq!(result.nodes[0].title, "Introduction");
+        assert_eq!(result.nodes[0].level, 0);
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/parse/mod.rs b/vectorless-core/vectorless-index/src/parse/mod.rs
new file mode 100644
index 00000000..e69de29b
diff --git a/vectorless-core/vectorless-index/src/parse/pdf/mod.rs b/vectorless-core/vectorless-index/src/parse/pdf/mod.rs
new file mode 100644
index 00000000..dc92da86
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/pdf/mod.rs
@@ -0,0 +1,32 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! PDF document parsing module.
+//!
+//! This module provides functionality to parse PDF documents:
+//! - **PdfPage** — Single page with text and metadata
+//! - **PdfParser** — Extract pages from PDF files
+//!
+//! # Example
+//!
+//! ```rust,no_run
+//! use vectorless::parser::pdf::PdfParser;
+//! use std::path::Path;
+//!
+//! # #[tokio::main]
+//! # async fn main() -> vectorless::Result<()> {
+//! let parser = PdfParser::new();
+//! let result = parser.parse_file(Path::new("document.pdf")).await?;
+//!
+//! println!("Pages: {:?}", result.meta.page_count);
+//! for node in &result.nodes {
+//!     println!("{}: {} chars", node.title, node.content.len());
+//! }
+//! # Ok(())
+//! # }
+//! ```
+
+mod parser;
+mod types;
+
+pub use parser::PdfParser;
+pub use types::PdfPage;
diff --git a/vectorless-core/vectorless-index/src/parse/pdf/parser.rs b/vectorless-core/vectorless-index/src/parse/pdf/parser.rs
new file mode 100644
index 00000000..af5f2478
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/pdf/parser.rs
@@ -0,0 +1,366 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! PDF document parser.
+//!
+//! Uses [`pdf_extract`] for reliable text extraction (handles CJK, ToUnicode
+//! CMap, font encoding, etc.) and [`lopdf`] only for metadata extraction from
+//! the PDF Info dictionary.
+
+use std::path::Path;
+
+use lopdf::Document as LopdfDocument;
+use tracing::{info, warn};
+
+use vectorless_error::Error;
+use vectorless_error::Result;
+use crate::parse::toc::TocProcessor;
+use vectorless_llm::LlmClient;
+
+use super::types::{PdfMetadata, PdfPage, PdfParseResult};
+use crate::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode};
+
+/// PDF document parser.
+pub struct PdfParser {
+    config: PdfParserConfig,
+    /// Optional LLM client for TOC extraction and structure analysis.
+    llm_client: Option<LlmClient>,
+}
+
+/// PDF parser configuration.
+#[derive(Debug, Clone)]
+pub struct PdfParserConfig {
+    /// Maximum pages to extract (0 = unlimited).
+    pub max_pages: usize,
+
+    /// Enable TOC extraction.
+    pub extract_toc: bool,
+}
+
+impl Default for PdfParserConfig {
+    fn default() -> Self {
+        Self {
+            max_pages: 0,
+            extract_toc: true,
+        }
+    }
+}
+
+impl PdfParser {
+    /// Create a new PDF parser with default configuration.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Create a PDF parser with an externally provided LLM client.
+    pub fn with_llm_client(client: LlmClient) -> Self {
+        Self {
+            config: PdfParserConfig::default(),
+            llm_client: Some(client),
+        }
+    }
+
+    /// Create a parser with custom configuration.
+    pub fn with_config(config: PdfParserConfig) -> Self {
+        Self {
+            config,
+            llm_client: None,
+        }
+    }
+
+    /// Create a parser without TOC extraction.
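+    ///
+    /// ```rust,ignore
+    /// // Page-based nodes only; skips the (possibly LLM-backed) TOC pass.
+    /// let parser = PdfParser::without_toc();
+    /// ```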
+    pub fn without_toc() -> Self {
+        Self {
+            config: PdfParserConfig {
+                extract_toc: false,
+                ..Default::default()
+            },
+            llm_client: None,
+        }
+    }
+
+    /// Parse PDF from bytes and return raw pages.
+    pub async fn parse_bytes_raw(
+        &self,
+        bytes: &[u8],
+        filename: Option<&str>,
+    ) -> Result<PdfParseResult> {
+        // Use pdf-extract for text (handles CJK, ToUnicode CMap, etc.)
+        let pages = self.extract_pages(bytes)?;
+
+        // Use lopdf only for metadata; fall back gracefully if it fails
+        let metadata = match LopdfDocument::load_mem(bytes) {
+            Ok(doc) => self.extract_metadata(&doc, filename),
+            Err(_) => PdfMetadata {
+                title: filename.unwrap_or("Document").to_string(),
+                page_count: pages.len(),
+                ..Default::default()
+            },
+        };
+
+        Ok(PdfParseResult::new(metadata, pages))
+    }
+
+    /// Extract text from all pages using pdf-extract.
+    fn extract_pages(&self, bytes: &[u8]) -> Result<Vec<PdfPage>> {
+        let page_texts = pdf_extract::extract_text_from_mem_by_pages(bytes)
+            .map_err(|e| Error::Parse(format!("pdf-extract failed: {}", e)))?;
+
+        let mut pages = Vec::new();
+        for (i, text) in page_texts.iter().enumerate() {
+            if self.config.max_pages > 0 && i >= self.config.max_pages {
+                break;
+            }
+            let page_num = i + 1; // 1-based
+            if !text.trim().is_empty() {
+                pages.push(PdfPage::new(page_num, text.clone()));
+            }
+        }
+
+        Ok(pages)
+    }
+
+    /// Extract metadata from PDF Info dictionary via lopdf.
+    fn extract_metadata(&self, doc: &LopdfDocument, filename: Option<&str>) -> PdfMetadata {
+        let mut metadata = PdfMetadata {
+            title: filename.unwrap_or("Document").to_string(),
+            page_count: doc.get_pages().len(),
+            ..Default::default()
+        };
+
+        if let Ok(info) = doc.trailer.get(b"Info") {
+            if let Ok(info_ref) = info.as_reference() {
+                if let Ok(info_obj) = doc.get_object(info_ref) {
+                    if let Ok(dict) = info_obj.as_dict() {
+                        if let Ok(title_obj) = dict.get(b"Title") {
+                            if let Ok(title) = title_obj.as_str() {
+                                metadata.title = self.decode_pdf_string(title);
+                            }
+                        }
+
+                        if let Ok(author_obj) = dict.get(b"Author") {
+                            if let Ok(author) = author_obj.as_str() {
+                                metadata.author = Some(self.decode_pdf_string(author));
+                            }
+                        }
+
+                        if let Ok(subject_obj) = dict.get(b"Subject") {
+                            if let Ok(subject) = subject_obj.as_str() {
+                                metadata.subject = Some(self.decode_pdf_string(subject));
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        metadata
+    }
+
+    /// Decode PDF string literal (handles escape sequences).
+    ///
+    /// Used only for metadata field values extracted via lopdf.
+    fn decode_pdf_string(&self, bytes: &[u8]) -> String {
+        let mut result = String::new();
+        let mut i = 0;
+
+        while i < bytes.len() {
+            match bytes[i] {
+                b'\\' if i + 1 < bytes.len() => {
+                    i += 1;
+                    match bytes[i] {
+                        b'n' => result.push('\n'),
+                        b'r' => result.push('\r'),
+                        b't' => result.push('\t'),
+                        b'(' => result.push('('),
+                        b')' => result.push(')'),
+                        b'\\' => result.push('\\'),
+                        _ => {}
+                    }
+                }
+                b if b >= 32 && b < 127 => {
+                    result.push(b as char);
+                }
+                _ => {}
+            }
+            i += 1;
+        }
+
+        result
+    }
+
+    /// Convert TOC entries to RawNodes.
+    fn toc_entries_to_raw_nodes(
+        &self,
+        entries: &[crate::parse::toc::TocEntry],
+        pages: &[PdfPage],
+    ) -> Vec<RawNode> {
+        let mut nodes = Vec::new();
+
+        for entry in entries {
+            let content = self.get_content_for_entry(entry, pages);
+
+            let mut node = RawNode::new(&entry.title)
+                .with_content(content)
+                .with_level(entry.level);
+
+            if let Some(page) = entry.physical_page {
+                node = node.with_page(page);
+            }
+
+            nodes.push(node);
+        }
+
+        nodes
+    }
+
+    /// Get content for a TOC entry from pages.
+    fn get_content_for_entry(
+        &self,
+        entry: &crate::parse::toc::TocEntry,
+        pages: &[PdfPage],
+    ) -> String {
+        let start_page = entry.physical_page.unwrap_or(1);
+
+        pages
+            .iter()
+            .find(|p| p.number == start_page)
+            .map(|p| {
+                let text = &p.text;
+                if let Some(pos) = text.find(&entry.title) {
+                    text[pos + entry.title.len()..].trim().to_string()
+                } else {
+                    text.clone()
+                }
+            })
+            .unwrap_or_default()
+    }
+
+    /// Create RawNodes from pages (fallback when no TOC).
+    fn pages_to_raw_nodes(&self, pages: &[PdfPage]) -> Vec<RawNode> {
+        pages
+            .iter()
+            .map(|page| {
+                RawNode::new(format!("Page {}", page.number))
+                    .with_content(page.text.clone())
+                    .with_level(1)
+                    .with_page(page.number)
+            })
+            .collect()
+    }
+}
+
+impl Default for PdfParser {
+    fn default() -> Self {
+        Self::with_config(PdfParserConfig::default())
+    }
+}
+
+impl PdfParser {
+    /// Parse a PDF file into raw nodes for the index pipeline.
+    pub async fn parse_file(&self, path: &Path) -> Result<ParseResult> {
+        let bytes = tokio::fs::read(path)
+            .await
+            .map_err(|e| Error::Parse(format!("Failed to read PDF file: {}", e)))?;
+        let filename = path.file_stem().and_then(|s| s.to_str());
+        self.parse_bytes_to_result(&bytes, filename, Some(path))
+            .await
+    }
+
+    /// Parse PDF bytes into raw nodes for the index pipeline.
+    pub async fn parse_bytes_async(
+        &self,
+        bytes: &[u8],
+        filename: Option<&str>,
+    ) -> Result<ParseResult> {
+        self.parse_bytes_to_result(bytes, filename, None).await
+    }
+
+    /// Core async parsing logic shared by parse_file and parse_bytes_async.
+    async fn parse_bytes_to_result(
+        &self,
+        bytes: &[u8],
+        filename: Option<&str>,
+        source_path: Option<&Path>,
+    ) -> Result<ParseResult> {
+        let result = self.parse_bytes_raw(bytes, filename).await?;
+        let page_count = result.pages.len();
+
+        // Try TOC extraction if enabled
+        let nodes = if self.config.extract_toc {
+            info!("Extracting TOC from PDF with {} pages", page_count);
+
+            let processor = match &self.llm_client {
+                Some(client) => {
+                    info!("PdfParser: creating TocProcessor with LLM client");
+                    TocProcessor::with_llm_client(client.clone())
+                }
+                None => {
+                    info!(
+                        "PdfParser: creating TocProcessor without LLM client (no key configured)"
+                    );
+                    TocProcessor::new()
+                }
+            };
+            match processor.process(&result.pages).await {
+                Ok(entries) if !entries.is_empty() => {
+                    info!("Extracted {} TOC entries", entries.len());
+                    self.toc_entries_to_raw_nodes(&entries, &result.pages)
+                }
+                Ok(_) => {
+                    warn!("No TOC entries found, falling back to page-based extraction");
+                    self.pages_to_raw_nodes(&result.pages)
+                }
+                Err(e) => {
+                    warn!(
+                        "TOC extraction failed: {}, falling back to page-based extraction",
+                        e
+                    );
+                    self.pages_to_raw_nodes(&result.pages)
+                }
+            }
+        } else {
+            self.pages_to_raw_nodes(&result.pages)
+        };
+
+        let meta = DocumentMeta {
+            name: result.metadata.title,
+            format: DocumentFormat::Pdf,
+            page_count: Some(page_count),
+            line_count: 0,
+            source_path: source_path.map(|p| p.to_string_lossy().to_string()),
+            description: result.metadata.subject,
+        };
+
+        Ok(ParseResult::new(meta, nodes))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parser_creation() {
+        let parser = PdfParser::new();
+        assert_eq!(parser.config.max_pages, 0);
+        assert!(parser.config.extract_toc);
+    }
+
+    #[test]
+    fn test_parser_without_toc() {
+        let parser = PdfParser::without_toc();
+        assert!(!parser.config.extract_toc);
+    }
+
+    #[test]
+    fn test_decode_pdf_string() {
+        let parser = PdfParser::new();
+
+        let decoded = parser.decode_pdf_string(b"Hello World");
+        assert_eq!(decoded, "Hello World");
+
+        let decoded = parser.decode_pdf_string(b"Hello\\nWorld");
+        assert_eq!(decoded, "Hello\nWorld");
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/parse/pdf/types.rs b/vectorless-core/vectorless-index/src/parse/pdf/types.rs
new file mode 100644
index 00000000..6bcfd6bf
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/pdf/types.rs
@@ -0,0 +1,171 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! PDF document types.
+
+use vectorless_utils::estimate_tokens;
+use serde::{Deserialize, Serialize};
+
+/// A single page from a PDF document.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PdfPage {
+    /// Page number (1-based).
+    pub number: usize,
+
+    /// Text content of the page.
+    pub text: String,
+
+    /// Estimated token count.
+    pub token_count: usize,
+}
+
+impl PdfPage {
+    /// Create a new PDF page.
+    pub fn new(number: usize, text: impl Into<String>) -> Self {
+        let text = text.into();
+        let token_count = estimate_tokens(&text);
+        Self {
+            number,
+            text,
+            token_count,
+        }
+    }
+
+    /// Check if the page is empty.
+    pub fn is_empty(&self) -> bool {
+        self.text.trim().is_empty()
+    }
+
+    /// Get character count.
+    pub fn char_count(&self) -> usize {
+        self.text.chars().count()
+    }
+
+    /// Get word count (approximate).
+    pub fn word_count(&self) -> usize {
+        self.text.split_whitespace().count()
+    }
+}
+
+/// PDF document metadata.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PdfMetadata {
+    /// Document title (from metadata or filename).
+    pub title: String,
+
+    /// Total page count.
+    pub page_count: usize,
+
+    /// Author (if available).
+    pub author: Option<String>,
+
+    /// Subject/description (if available).
+    pub subject: Option<String>,
+
+    /// Creator application (if available).
+    pub creator: Option<String>,
+
+    /// Producer application (if available).
+    pub producer: Option<String>,
+}
+
+impl Default for PdfMetadata {
+    fn default() -> Self {
+        Self {
+            title: String::new(),
+            page_count: 0,
+            author: None,
+            subject: None,
+            creator: None,
+            producer: None,
+        }
+    }
+}
+
+/// Result of parsing a PDF document.
+#[derive(Debug, Clone)]
+pub struct PdfParseResult {
+    /// Document metadata.
+    pub metadata: PdfMetadata,
+
+    /// Extracted pages.
+    pub pages: Vec<PdfPage>,
+
+    /// Total token count across all pages.
+    pub total_tokens: usize,
+}
+
+impl PdfParseResult {
+    /// Create a new parse result.
+    pub fn new(metadata: PdfMetadata, pages: Vec<PdfPage>) -> Self {
+        let total_tokens = pages.iter().map(|p| p.token_count).sum();
+        Self {
+            metadata,
+            pages,
+            total_tokens,
+        }
+    }
+
+    /// Check if the document is empty.
+    pub fn is_empty(&self) -> bool {
+        self.pages.is_empty()
+    }
+
+    /// Get a page by number (1-based).
+    pub fn get_page(&self, number: usize) -> Option<&PdfPage> {
+        if number == 0 || number > self.pages.len() {
+            return None;
+        }
+        self.pages.get(number - 1)
+    }
+
+    /// Get text for a page range (inclusive, 1-based).
+    pub fn get_page_range_text(&self, start: usize, end: usize) -> String {
+        let start = start.max(1);
+        let end = end.min(self.pages.len());
+        if start > end {
+            return String::new();
+        }
+
+        self.pages[start - 1..end]
+            .iter()
+            .map(|p| format!("<page_{}>\n{}\n</page_{}>\n\n", p.number, p.text, p.number))
+            .collect()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_pdf_page_creation() {
+        let page = PdfPage::new(1, "Hello world");
+        assert_eq!(page.number, 1);
+        assert_eq!(page.text, "Hello world");
+        assert!(page.token_count > 0);
+    }
+
+    #[test]
+    fn test_estimate_tokens() {
+        // Uses tiktoken for accurate counting
+        assert_eq!(estimate_tokens(""), 0);
+        // "hi" is 1 token in tiktoken
+        assert_eq!(estimate_tokens("hi"), 1);
+        // tiktoken is efficient at encoding text - just verify it returns a positive count
+        let hundred_as = "a".repeat(100);
+        assert!(estimate_tokens(&hundred_as) >= 1);
+    }
+
+    #[test]
+    fn test_page_range_text() {
+        let pages = vec![
+            PdfPage::new(1, "Page 1 content"),
+            PdfPage::new(2, "Page 2 content"),
+            PdfPage::new(3, "Page 3 content"),
+        ];
+        let result = PdfParseResult::new(PdfMetadata::default(), pages);
+
+        let text = result.get_page_range_text(1, 2);
+        assert!(text.contains("Page 1 content"));
+        assert!(text.contains("Page 2 content"));
+        assert!(!text.contains("Page 3 content"));
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/parse/toc/assigner.rs b/vectorless-core/vectorless-index/src/parse/toc/assigner.rs
new file mode 100644
index 00000000..5a298031
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/toc/assigner.rs
@@ -0,0 +1,395 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Page assigner - assigns physical page numbers to TOC entries.
+
+use futures::stream::{self, StreamExt};
+use std::collections::HashMap;
+use tracing::{debug, info};
+
+use vectorless_error::Result;
+use crate::index::parse::pdf::PdfPage;
+use vectorless_llm::config::LlmConfig;
+
+use super::types::{PageOffset, TocEntry};
+use vectorless_llm::LlmClient;
+
+/// Page assigner configuration.
+#[derive(Debug, Clone)]
+pub struct PageAssignerConfig {
+    /// Number of anchor points for offset calculation.
+    pub anchor_count: usize,
+
+    /// LLM configuration.
+    pub llm_config: LlmConfig,
+
+    /// Maximum offset variance allowed.
+    pub max_offset_variance: usize,
+}
+
+impl Default for PageAssignerConfig {
+    fn default() -> Self {
+        Self {
+            anchor_count: 5,
+            llm_config: LlmConfig::default(),
+            max_offset_variance: 3,
+        }
+    }
+}
+
+/// Page assigner - assigns physical page numbers to TOC entries.
+pub struct PageAssigner {
+    config: PageAssignerConfig,
+    client: LlmClient,
+}
+
+impl PageAssigner {
+    /// Create a new page assigner.
+    pub fn new(config: PageAssignerConfig) -> Self {
+        let client = LlmClient::new(config.llm_config.clone().into());
+        Self { config, client }
+    }
+
+    /// Create an assigner with an externally provided LLM client.
+    pub fn with_client(client: LlmClient) -> Self {
+        Self {
+            config: PageAssignerConfig::default(),
+            client,
+        }
+    }
+
+    /// Create an assigner with default configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(PageAssignerConfig::default())
+    }
+
+    /// Assign physical pages to TOC entries.
+    ///
+    /// Strategy:
+    /// 1. If entries have TOC pages → calculate offset → apply offset
+    /// 2. If no TOC pages → use LLM to locate each entry
+    pub async fn assign(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
+        if entries.is_empty() {
+            return Ok(());
+        }
+
+        // Check if we have TOC page numbers
+        let has_toc_pages = entries.iter().any(|e| e.toc_page.is_some());
+
+        if has_toc_pages {
+            self.assign_with_offset(entries, pages).await
+        } else {
+            self.assign_with_llm(entries, pages).await
+        }
+    }
+
+    /// Assign pages using offset calculation.
+    async fn assign_with_offset(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
+        info!("Assigning pages using offset calculation");
+
+        // Step 1: Select anchor entries
+        let anchors = self.select_anchors(entries, self.config.anchor_count);
+
+        // Step 2: Verify anchors and calculate offset
+        let offset = self.calculate_offset(anchors, pages).await?;
+
+        if offset.confidence < 0.5 {
+            debug!("Offset confidence too low, falling back to LLM positioning");
+            return self.assign_with_llm(entries, pages).await;
+        }
+
+        info!(
+            "Calculated offset: {} (confidence: {})",
+            offset.offset, offset.confidence
+        );
+
+        // Step 3: Apply offset to all entries
+        for entry in entries.iter_mut() {
+            if let Some(toc_page) = entry.toc_page {
+                let physical = offset.apply(toc_page);
+                entry.physical_page = Some(physical.min(pages.len()));
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Select anchor entries for offset calculation.
+    fn select_anchors<'a>(&self, entries: &'a [TocEntry], count: usize) -> Vec<&'a TocEntry> {
+        // Select entries with TOC pages, evenly distributed
+        let with_pages: Vec<_> = entries.iter().filter(|e| e.toc_page.is_some()).collect();
+
+        if with_pages.len() <= count {
+            return with_pages;
+        }
+
+        // Select evenly distributed entries
+        let step = with_pages.len() as f32 / count as f32;
+        (0..count)
+            .map(|i| with_pages[(i as f32 * step) as usize])
+            .collect()
+    }
+
+    /// Calculate page offset by verifying anchors concurrently.
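+    ///
+    /// A worked sketch of the arithmetic (numbers illustrative): anchors
+    /// verified at (toc=1 → physical=3), (toc=10 → physical=12) and
+    /// (toc=20 → physical=23) give offsets [2, 2, 3]; the mode is 2, and
+    /// confidence is verified anchors over attempted anchors.
+    ///
+    /// ```rust,ignore
+    /// let offset = PageOffset::new(2, 3, 1.0); // (offset, sample_count, confidence)
+    /// // assuming `apply` adds the offset: TOC page 10 -> physical page 12
+    /// assert_eq!(offset.apply(10), 12);
+    /// ```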
+    async fn calculate_offset(
+        &self,
+        anchors: Vec<&TocEntry>,
+        pages: &[PdfPage],
+    ) -> Result<PageOffset> {
+        if anchors.is_empty() {
+            return Ok(PageOffset::new(0, 0, 0.0));
+        }
+
+        let anchor_count = anchors.len();
+
+        // Verify all anchors concurrently
+        let client = self.client.clone();
+        let pages_owned = pages.to_vec();
+        let futures: Vec<_> = anchors
+            .into_iter()
+            .map(|anchor| {
+                let title = anchor.title.clone();
+                let toc_page = anchor.toc_page.unwrap();
+                let client = client.clone();
+                let pages = pages_owned.clone();
+
+                async move {
+                    let range_pages = Self::pages_around(&pages, toc_page, 3);
+                    if range_pages.is_empty() {
+                        return (0, false);
+                    }
+
+                    let content = Self::format_range_pages(&range_pages);
+                    match Self::locate_with_client(&client, &title, &content).await {
+                        Ok(Some(physical)) => {
+                            let offset = physical as i32 - toc_page as i32;
+                            debug!(
+                                "Anchor '{}' found: toc={}, physical={}, offset={}",
+                                title, toc_page, physical, offset
+                            );
+                            (offset, true)
+                        }
+                        _ => (0, false),
+                    }
+                }
+            })
+            .collect();
+
+        let verified_offsets: Vec<_> = stream::iter(futures).buffer_unordered(5).collect().await;
+
+        // Calculate the mode (most common offset)
+        let successful: Vec<_> = verified_offsets
+            .iter()
+            .filter(|(_, success)| *success)
+            .map(|(offset, _)| *offset)
+            .collect();
+
+        if successful.is_empty() {
+            return Ok(PageOffset::new(0, 0, 0.0));
+        }
+
+        let mode = Self::calculate_mode_static(&successful);
+        let sample_count = successful.len();
+        let confidence = sample_count as f32 / anchor_count as f32;
+
+        Ok(PageOffset::new(mode, sample_count, confidence))
+    }
+
+    /// Calculate mode of offset values.
+    fn calculate_mode(&self, values: &[i32]) -> i32 {
+        Self::calculate_mode_static(values)
+    }
+
+    /// Static version for use in concurrent contexts.
+    fn calculate_mode_static(values: &[i32]) -> i32 {
+        let mut counts: HashMap<i32, usize> = HashMap::new();
+        for &v in values {
+            *counts.entry(v).or_insert(0) += 1;
+        }
+        counts
+            .into_iter()
+            .max_by_key(|&(_, count)| count)
+            .map(|(v, _)| v)
+            .unwrap_or(0)
+    }
+
+    /// Collect pages around a center page number.
+    fn pages_around(pages: &[PdfPage], center: usize, range: usize) -> Vec<PdfPage> {
+        let start = center.saturating_sub(range).max(1);
+        let end = (center + range).min(pages.len());
+        (start..=end)
+            .filter_map(|i| pages.get(i - 1).cloned())
+            .collect()
+    }
+
+    /// Format pages into tagged text for LLM.
+    fn format_range_pages(pages: &[PdfPage]) -> String {
+        pages
+            .iter()
+            .map(|p| {
+                // Truncate on a char boundary (about 500 chars) so multi-byte
+                // text such as CJK cannot cause a slice panic.
+                let cut = p
+                    .text
+                    .char_indices()
+                    .nth(500)
+                    .map_or(p.text.len(), |(i, _)| i);
+                format!("<page_{}>\n{}\n</page_{}>", p.number, &p.text[..cut], p.number)
+            })
+            .collect::<Vec<_>>()
+            .join("\n\n")
+    }
+
+    /// Locate a title in pre-formatted content using LLM (static, for concurrent use).
+    async fn locate_with_client(
+        client: &LlmClient,
+        title: &str,
+        content: &str,
+    ) -> Result<Option<usize>> {
+        let system = "You are a document analysis assistant. Find which page contains a specific section title.";
+        let user = format!(
+            r#"Find which page contains the section titled: "{}"
+
+Pages:
+{}
+
+Reply in JSON format:
+{{"page": <page number or null>}}"#,
+            title, content
+        );
+
+        #[derive(serde::Deserialize)]
+        struct LocateResult {
+            page: Option<usize>,
+        }
+
+        let result: LocateResult = client.complete_json(system, &user).await?;
+        Ok(result.page)
+    }
+
+    /// Assign pages using LLM for each entry (with bounded concurrency).
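+    ///
+    /// A usage sketch (entries illustrative); on success an entry gets a
+    /// `physical_page` and confidence 0.8, otherwise confidence drops to 0.3:
+    ///
+    /// ```rust,ignore
+    /// // `assign` dispatches here when no entry carries a toc_page
+    /// assigner.assign(&mut entries, &pages).await?;
+    /// assert!(entries.iter().any(|e| e.physical_page.is_some()));
+    /// ```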
+    async fn assign_with_llm(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
+        info!("Assigning pages using LLM positioning");
+
+        let client = self.client.clone();
+        let pages_owned = pages.to_vec();
+        let total = entries.len();
+
+        // Launch entry searches with bounded concurrency to avoid rate limiting
+        let futures: Vec<_> = entries
+            .iter()
+            .map(|entry| {
+                let title = entry.title.clone();
+                let client = client.clone();
+                let pages = pages_owned.clone();
+
+                async move {
+                    let groups = Self::group_pages_owned(&pages, 5);
+                    Self::locate_title_in_groups_static(&client, &title, &groups).await
+                }
+            })
+            .collect();
+
+        // `buffered` (not `buffer_unordered`) preserves input order so the
+        // zip with `entries` below stays aligned.
+        let results: Vec<_> = stream::iter(futures).buffered(5).collect().await;
+
+        // Write results back
+        let mut located = 0;
+        for (entry, result) in entries.iter_mut().zip(results.into_iter()) {
+            let physical = result?;
+            if physical.is_some() {
+                located += 1;
+            }
+            entry.physical_page = physical;
+            entry.confidence = if physical.is_some() { 0.8 } else { 0.3 };
+        }
+
+        info!("Assigned pages for {}/{} entries", located, total);
+
+        Ok(())
+    }
+
+    /// Group owned pages for batch processing.
+    fn group_pages_owned(pages: &[PdfPage], group_size: usize) -> Vec<Vec<PdfPage>> {
+        pages
+            .chunks(group_size)
+            .map(|chunk| chunk.to_vec())
+            .collect()
+    }
+
+    /// Locate a title across page groups (static, for concurrent use).
+    ///
+    /// Searches groups sequentially (early return on first match),
+    /// but multiple title searches can run concurrently.
+    async fn locate_title_in_groups_static(
+        client: &LlmClient,
+        title: &str,
+        groups: &[Vec<PdfPage>],
+    ) -> Result<Option<usize>> {
+        let system = "You are a document analysis assistant. Find which page contains a specific section title.";
+
+        for group in groups {
+            let content = group
+                .iter()
+                .map(|p| {
+                    // Char-boundary truncation (about 300 chars) avoids
+                    // panicking on multi-byte text.
+                    let cut = p
+                        .text
+                        .char_indices()
+                        .nth(300)
+                        .map_or(p.text.len(), |(i, _)| i);
+                    format!("<page_{}>\n{}\n</page_{}>", p.number, &p.text[..cut], p.number)
+                })
+                .collect::<Vec<_>>()
+                .join("\n\n");
+
+            let user = format!(
+                r#"Find which page contains the section titled: "{}"
+
+Pages:
+{}
+
+Reply in JSON format:
+{{"found": true/false, "page": <page number>}}"#,
+                title, content
+            );
+
+            #[derive(serde::Deserialize)]
+            struct SearchResult {
+                found: bool,
+                page: Option<usize>,
+            }
+
+            let result: SearchResult = client.complete_json(system, &user).await?;
+
+            if result.found {
+                return Ok(result.page);
+            }
+        }
+
+        Ok(None)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_select_anchors() {
+        let assigner = PageAssigner::with_defaults();
+
+        let entries = vec![
+            TocEntry::new("Chapter 1", 1).with_toc_page(1),
+            TocEntry::new("Chapter 2", 1).with_toc_page(10),
+            TocEntry::new("Chapter 3", 1).with_toc_page(20),
+            TocEntry::new("Chapter 4", 1).with_toc_page(30),
+        ];
+
+        let anchors = assigner.select_anchors(&entries, 2);
+        assert_eq!(anchors.len(), 2);
+    }
+
+    #[test]
+    fn test_calculate_mode() {
+        let assigner = PageAssigner::with_defaults();
+
+        let values = vec![2, 2, 2, 3, 3, 4];
+        assert_eq!(assigner.calculate_mode(&values), 2);
+
+        let values = vec![1, 1, 2, 2, 2];
+        assert_eq!(assigner.calculate_mode(&values), 2);
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/parse/toc/detector.rs b/vectorless-core/vectorless-index/src/parse/toc/detector.rs
new file mode 100644
index 00000000..95e84431
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/toc/detector.rs
@@ -0,0 +1,349 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! TOC (Table of Contents) detection.
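+//!
+//! A minimal usage sketch (pages illustrative; `detect` is async):
+//!
+//! ```rust,ignore
+//! let detector = TocDetector::with_defaults();
+//! let detection = detector.detect(&pages).await?;
+//! if detection.found {
+//!     println!("TOC on pages {:?}", detection.pages);
+//! }
+//! ```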
+
+use regex::Regex;
+use tracing::debug;
+
+use vectorless_error::Result;
+use vectorless_llm::config::LlmConfig;
+
+use super::types::TocDetection;
+use crate::index::parse::pdf::PdfPage;
+use vectorless_llm::LlmClient;
+
+/// TOC detector configuration.
+#[derive(Debug, Clone)]
+pub struct TocDetectorConfig {
+    /// Maximum pages to check for TOC.
+    pub max_check_pages: usize,
+
+    /// Minimum confidence threshold for regex detection.
+    pub regex_confidence_threshold: f32,
+
+    /// Use LLM for uncertain cases.
+    pub use_llm_fallback: bool,
+
+    /// LLM configuration.
+    pub llm_config: LlmConfig,
+}
+
+impl Default for TocDetectorConfig {
+    fn default() -> Self {
+        Self {
+            max_check_pages: 15,
+            regex_confidence_threshold: 0.7,
+            use_llm_fallback: true,
+            llm_config: LlmConfig::default(),
+        }
+    }
+}
+
+/// TOC detector - finds table of contents in PDF documents.
+pub struct TocDetector {
+    config: TocDetectorConfig,
+    llm_client: Option<LlmClient>,
+    patterns: Vec<TocPattern>,
+}
+
+/// A TOC detection pattern.
+#[allow(dead_code)]
+struct TocPattern {
+    /// Pattern name for debugging.
+    name: &'static str,
+    /// Regex pattern to match.
+    regex: Regex,
+    /// Weight for scoring.
+    weight: f32,
+}
+
+impl TocDetector {
+    /// Create a new TOC detector.
+    pub fn new(config: TocDetectorConfig) -> Self {
+        let llm_client = if config.use_llm_fallback {
+            Some(LlmClient::new(config.llm_config.clone().into()))
+        } else {
+            None
+        };
+
+        Self {
+            config,
+            llm_client,
+            patterns: Self::build_patterns(),
+        }
+    }
+
+    /// Create a detector with an externally provided LLM client.
+    pub fn with_client(config: TocDetectorConfig, client: LlmClient) -> Self {
+        let use_llm = config.use_llm_fallback;
+        Self {
+            config,
+            llm_client: if use_llm { Some(client) } else { None },
+            patterns: Self::build_patterns(),
+        }
+    }
+
+    /// Create a detector with default configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(TocDetectorConfig::default())
+    }
+
+    /// Build detection patterns.
+    fn build_patterns() -> Vec<TocPattern> {
+        vec![
+            // Chinese TOC patterns
+            TocPattern {
+                name: "chinese_toc_header",
+                regex: Regex::new(r"(?i)^[\s]*(目\s*录|内\s*容\s*摘\s*要)[\s]*$").unwrap(),
+                weight: 0.9,
+            },
+            TocPattern {
+                name: "chinese_chapter_with_page",
+                regex: Regex::new(r"第[一二三四五六七八九十\d]+[章节部篇].*?[\.\s…·]{2,}\s*\d+")
+                    .unwrap(),
+                weight: 0.85,
+            },
+            TocPattern {
+                name: "chinese_section_dots",
+                regex: Regex::new(r"\d+[\.\d]+\s+.+?\s*[\.\s…·]{3,}\s*\d+").unwrap(),
+                weight: 0.8,
+            },
+            // English TOC patterns
+            TocPattern {
+                name: "english_toc_header",
+                regex: Regex::new(r"(?i)^[\s]*(table\s+of\s+contents|contents|outline)[\s]*$")
+                    .unwrap(),
+                weight: 0.9,
+            },
+            TocPattern {
+                name: "english_chapter_with_page",
+                regex: Regex::new(r"(?i)^[\s]*(chapter|section|part)\s+\d+.*?\d+\s*$").unwrap(),
+                weight: 0.85,
+            },
+            TocPattern {
+                name: "numbered_section_dots",
+                regex: Regex::new(r"^\d+\.\d+(\.\d+)?\s+.+?[\.\s…]{3,}\s*\d+\s*$").unwrap(),
+                weight: 0.75,
+            },
+            // Generic patterns
+            TocPattern {
+                name: "dots_leader",
+                regex: Regex::new(r".+?[\.\s…·]{4,}\s*\d{1,4}\s*$").unwrap(),
+                weight: 0.7,
+            },
+            TocPattern {
+                name: "title_with_page",
+                regex: Regex::new(r"^.{3,50}?\s{2,}\d{1,4}\s*$").unwrap(),
+                weight: 0.5,
+            },
+        ]
+    }
+
+    /// Detect TOC in PDF pages.
+    pub async fn detect(&self, pages: &[PdfPage]) -> Result<TocDetection> {
+        let check_pages = pages
+            .iter()
+            .take(self.config.max_check_pages)
+            .collect::<Vec<_>>();
+
+        if check_pages.is_empty() {
+            return Ok(TocDetection::not_found());
+        }
+
+        // Step 1: Regex detection
+        let regex_result = self.detect_with_regex(&check_pages);
+        debug!(
+            "Regex detection result: found={}, confidence={}",
+            regex_result.found, regex_result.confidence
+        );
+
+        // Step 2: If confidence is high enough, return
+        if regex_result.confidence >= self.config.regex_confidence_threshold {
+            return Ok(regex_result);
+        }
+
+        // Step 3: Use LLM fallback if available and needed
+        if let Some(ref client) = self.llm_client {
+            if regex_result.confidence > 0.3 || regex_result.confidence == 0.0 {
+                debug!("Using LLM fallback for TOC detection");
+                return self.detect_with_llm(client, &check_pages).await;
+            }
+        }
+
+        Ok(regex_result)
+    }
+
+    /// Detect TOC using regex patterns.
+    fn detect_with_regex(&self, pages: &[&PdfPage]) -> TocDetection {
+        let mut toc_pages = Vec::new();
+        let mut has_page_numbers = false;
+        let mut total_score = 0.0;
+        let mut match_count = 0;
+
+        for page in pages {
+            let (score, has_numbers) = self.score_page_for_toc(page);
+
+            if score > 0.5 {
+                toc_pages.push(page.number);
+
+                if has_numbers {
+                    has_page_numbers = true;
+                }
+
+                total_score += score;
+                match_count += 1;
+            }
+        }
+
+        if toc_pages.is_empty() {
+            return TocDetection::not_found();
+        }
+
+        let confidence = if match_count > 0 {
+            total_score / match_count as f32
+        } else {
+            0.0
+        };
+
+        TocDetection::new(true)
+            .with_pages(toc_pages)
+            .with_page_numbers(has_page_numbers)
+            .with_confidence(confidence)
+    }
+
+    /// Score a page for TOC likelihood.
+    fn score_page_for_toc(&self, page: &PdfPage) -> (f32, bool) {
+        let lines: Vec<&str> = page.text.lines().collect();
+
+        if lines.len() < 2 {
+            return (0.0, false);
+        }
+
+        let mut max_score: f32 = 0.0;
+        let mut has_page_numbers = false;
+        let mut match_count = 0;
+
+        for line in &lines {
+            for pattern in &self.patterns {
+                if pattern.regex.is_match(line) {
+                    max_score = max_score.max(pattern.weight);
+                    match_count += 1;
+
+                    // Check if pattern includes page numbers
+                    if line.matches(char::is_numeric).count() > 0 {
+                        has_page_numbers = true;
+                    }
+                }
+            }
+        }
+
+        // Adjust score based on number of matches
+        let score = if match_count >= 3 {
+            max_score
+        } else if match_count >= 1 {
+            max_score * 0.7
+        } else {
+            0.0
+        };
+
+        (score, has_page_numbers)
+    }
+
+    /// Detect TOC using LLM.
+    async fn detect_with_llm(
+        &self,
+        client: &LlmClient,
+        pages: &[&PdfPage],
+    ) -> Result<TocDetection> {
+        // Combine first few pages for analysis
+        let content = pages
+            .iter()
+            .take(5)
+            .map(|p| {
+                // Char-boundary truncation (about 1000 chars) avoids
+                // panicking on multi-byte text.
+                let cut = p
+                    .text
+                    .char_indices()
+                    .nth(1000)
+                    .map_or(p.text.len(), |(i, _)| i);
+                format!("<page_{}>\n{}\n</page_{}>", p.number, &p.text[..cut], p.number)
+            })
+            .collect::<Vec<_>>()
+            .join("\n\n");
+
+        let system = "You are a document analysis assistant. Your task is to detect if the given document contains a Table of Contents (TOC).";
+        let user = format!(
+            r#"Analyze this document and determine if it contains a Table of Contents.
+
+Document content:
+{}
+
+Reply in JSON format:
+{{
+  "has_toc": true/false,
+  "toc_pages": [list of page numbers where TOC appears],
+  "has_page_numbers": true/false (whether TOC entries include page numbers),
+  "confidence": 0.0-1.0
+}}"#,
+            content
+        );
+
+        #[derive(serde::Deserialize)]
+        struct DetectionResponse {
+            has_toc: bool,
+            toc_pages: Vec<usize>,
+            has_page_numbers: bool,
+            confidence: f32,
+        }
+
+        let response: DetectionResponse = client.complete_json(system, &user).await?;
+
+        Ok(TocDetection::new(response.has_toc)
+            .with_pages(response.toc_pages)
+            .with_page_numbers(response.has_page_numbers)
+            .with_confidence(response.confidence))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn make_page(number: usize, text: &str) -> PdfPage {
+        PdfPage::new(number, text)
+    }
+
+    #[test]
+    fn test_detect_chinese_toc() {
+        let detector = TocDetector::with_defaults();
+
+        let pages = vec![
+            make_page(1, "前言"),
+            make_page(2, "目 录\n\n第一章 引言 ... 1\n第二章 方法 ... 5"),
+        ];
+
+        let rt = tokio::runtime::Runtime::new().unwrap();
+        let result = rt.block_on(detector.detect(&pages)).unwrap();
+
+        assert!(result.found);
+        assert!(result.has_page_numbers);
+    }
+
+    #[test]
+    fn test_detect_english_toc() {
+        let detector = TocDetector::with_defaults();
+
+        let pages = vec![
+            make_page(1, "Abstract"),
+            make_page(
+                2,
+                "Table of Contents\n\nChapter 1. Introduction 1\nChapter 2. Methods 5",
+            ),
+        ];
+
+        let rt = tokio::runtime::Runtime::new().unwrap();
+        let result = rt.block_on(detector.detect(&pages)).unwrap();
+
+        assert!(result.found);
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/parse/toc/mod.rs b/vectorless-core/vectorless-index/src/parse/toc/mod.rs
new file mode 100644
index 00000000..beac24d7
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/toc/mod.rs
@@ -0,0 +1,28 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Table of Contents (TOC) processing module.
+//!
+//! This module provides functionality to extract and verify document structure
+//! from PDF Table of Contents:
+//!
+//! - **Detection** — Find TOC in document (regex + LLM fallback)
+//! - **Parsing** — Convert TOC text to structured entries (LLM)
+//! - **Assignment** — Map TOC pages to physical pages
+//! - **Verification** — Sample verification of page assignments
+//! - **Repair** — Fix incorrect assignments
+
+mod assigner;
+mod detector;
+mod parser;
+mod processor;
+mod repairer;
+mod structure_extractor;
+mod types;
+mod verifier;
+
+// Re-export main types
+pub use types::TocEntry;
+
+// Re-export components
+pub use processor::TocProcessor;
diff --git a/vectorless-core/vectorless-index/src/parse/toc/parser.rs b/vectorless-core/vectorless-index/src/parse/toc/parser.rs
new file mode 100644
index 00000000..fe97708a
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/toc/parser.rs
@@ -0,0 +1,279 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! TOC parser - converts TOC text to structured entries.
+
+use tracing::debug;
+
+use vectorless_error::Result;
+use vectorless_llm::config::LlmConfig;
+
+use super::types::TocEntry;
+use vectorless_llm::LlmClient;
+
+/// TOC parser configuration.
+#[derive(Debug, Clone)]
+pub struct TocParserConfig {
+    /// LLM configuration.
+    pub llm_config: LlmConfig,
+
+    /// Maximum retries for incomplete parsing.
+    pub max_retries: usize,
+
+    /// Verify completeness after parsing.
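+    ///
+    /// When enabled, parsing loops up to `max_retries` times, asking the LLM
+    /// whether the entries cover the whole TOC and requesting continuations.
+    /// A configuration sketch (values illustrative):
+    ///
+    /// ```rust,ignore
+    /// let parser = TocParser::new(TocParserConfig {
+    ///     max_retries: 2,
+    ///     verify_completeness: false, // single-shot parse
+    ///     ..Default::default()
+    /// });
+    /// ```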
+    pub verify_completeness: bool,
+}
+
+impl Default for TocParserConfig {
+    fn default() -> Self {
+        Self {
+            llm_config: LlmConfig::default(),
+            max_retries: 3,
+            verify_completeness: true,
+        }
+    }
+}
+
+/// TOC parser - converts raw TOC text to structured entries.
+pub struct TocParser {
+    config: TocParserConfig,
+    client: LlmClient,
+}
+
+impl TocParser {
+    /// Create a new TOC parser.
+    pub fn new(config: TocParserConfig) -> Self {
+        let client = LlmClient::new(config.llm_config.clone().into());
+        Self { config, client }
+    }
+
+    /// Create a parser with an externally provided LLM client.
+    pub fn with_client(client: LlmClient) -> Self {
+        Self {
+            config: TocParserConfig::default(),
+            client,
+        }
+    }
+
+    /// Create a parser with default configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(TocParserConfig::default())
+    }
+
+    /// Parse TOC text into structured entries.
+    pub async fn parse(&self, toc_text: &str) -> Result<Vec<TocEntry>> {
+        if toc_text.trim().is_empty() {
+            return Ok(Vec::new());
+        }
+
+        // Step 1: Initial parse
+        let entries = self.parse_with_llm(toc_text).await?;
+        debug!("Initial parse: {} entries", entries.len());
+
+        if entries.is_empty() {
+            return Ok(entries);
+        }
+
+        // Step 2: Verify completeness (if enabled)
+        if self.config.verify_completeness {
+            self.verify_and_complete(toc_text, entries).await
+        } else {
+            Ok(entries)
+        }
+    }
+
+    /// Parse TOC text using LLM.
+    async fn parse_with_llm(&self, toc_text: &str) -> Result<Vec<TocEntry>> {
+        let system = r#"You are a document structure extraction expert.
+Your task is to parse a Table of Contents (TOC) into a structured format.
+
+Rules:
+1. Extract all sections and subsections
+2. Determine the hierarchy level (1 = top level, 2 = subsection, etc.)
+3. Extract page numbers if present
+4. Preserve original titles exactly (only fix spacing issues)
+5. If the TOC seems incomplete, extract what you can see"#;
+
+        let user = format!(
+            r#"Parse this Table of Contents:
+
+{}
+
+Return a JSON array:
+[
+  {{
+    "title": "Section Title",
+    "level": 1,
+    "page": 10
+  }},
+  ...
+]
+
+Notes:
+- "level" should reflect the hierarchy (1, 2, 3...)
+- "page" is optional if not present in TOC
+- Only output the JSON array, no other text"#,
+            toc_text
+        );
+
+        #[derive(serde::Deserialize)]
+        struct ParsedEntry {
+            title: String,
+            level: usize,
+            #[serde(default)]
+            page: Option<usize>,
+        }
+
+        let entries: Vec<ParsedEntry> = self.client.complete_json(system, &user).await?;
+
+        Ok(entries
+            .into_iter()
+            .map(|e| {
+                let mut entry = TocEntry::new(e.title, e.level);
+                if let Some(page) = e.page {
+                    entry = entry.with_toc_page(page);
+                }
+                entry
+            })
+            .collect())
+    }
+
+    /// Verify completeness and continue if needed.
+    async fn verify_and_complete(
+        &self,
+        toc_text: &str,
+        mut entries: Vec<TocEntry>,
+    ) -> Result<Vec<TocEntry>> {
+        let mut attempts = 0;
+
+        while attempts < self.config.max_retries {
+            // Check if parsing is complete
+            let is_complete = self.check_completeness(toc_text, &entries).await?;
+
+            if is_complete {
+                debug!("TOC parsing complete after {} attempts", attempts + 1);
+                return Ok(entries);
+            }
+
+            debug!(
+                "TOC incomplete, attempting continuation (attempt {})",
+                attempts + 1
+            );
+
+            // Continue parsing
+            let additional = self.continue_parsing(toc_text, &entries).await?;
+            if additional.is_empty() {
+                // No more entries found, stop
+                break;
+            }
+
+            entries.extend(additional);
+            attempts += 1;
+        }
+
+        Ok(entries)
+    }
+
+    /// Check if parsing is complete.
+    async fn check_completeness(&self, toc_text: &str, entries: &[TocEntry]) -> Result<bool> {
+        let system = "You are a document analysis assistant. Determine if the parsed entries completely represent the original TOC.";
+
+        let entries_json =
+            serde_json::to_string_pretty(&entries.iter().map(|e| &e.title).collect::<Vec<_>>())
+                .unwrap_or_default();
+
+        let user = format!(
+            r#"Original TOC:
+{}
+
+Parsed entries:
+{}
+
+Is the parsing complete? Reply with JSON:
+{{"complete": true/false}}"#,
+            toc_text, entries_json
+        );
+
+        #[derive(serde::Deserialize)]
+        struct CompletenessCheck {
+            complete: bool,
+        }
+
+        let result: CompletenessCheck = self.client.complete_json(system, &user).await?;
+        Ok(result.complete)
+    }
+
+    /// Continue parsing from where we left off.
+    async fn continue_parsing(
+        &self,
+        toc_text: &str,
+        existing: &[TocEntry],
+    ) -> Result<Vec<TocEntry>> {
+        let system = "You are a document structure extraction expert. Continue parsing the TOC from where it was left off.";
+
+        let last_titles: Vec<_> = existing.iter().rev().take(5).map(|e| &e.title).collect();
+
+        let user = format!(
+            r#"Original TOC:
+{}
+
+Already parsed (last 5):
+{:?}
+
+Extract the REMAINING entries that were missed. Return a JSON array:
+[
+  {{"title": "...", "level": N, "page": M}},
+  ...
+]
+
+If nothing was missed, return an empty array: []"#,
+            toc_text, last_titles
+        );
+
+        #[derive(serde::Deserialize)]
+        struct ParsedEntry {
+            title: String,
+            level: usize,
+            #[serde(default)]
+            page: Option<usize>,
+        }
+
+        let entries: Vec<ParsedEntry> = self.client.complete_json(system, &user).await?;
+
+        Ok(entries
+            .into_iter()
+            .map(|e| {
+                let mut entry = TocEntry::new(e.title, e.level);
+                if let Some(page) = e.page {
+                    entry = entry.with_toc_page(page);
+                }
+                entry
+            })
+            .collect())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_parse_simple_toc() {
+        let parser = TocParser::with_defaults();
+
+        // This test requires an API key
+        if std::env::var("OPENAI_API_KEY").is_err() {
+            return;
+        }
+
+        let toc_text = r#"
+Chapter 1. Introduction 1
+  1.1 Background 2
+  1.2 Objectives 5
+Chapter 2. Methods 10
+"#;
+
+        let entries = parser.parse(toc_text).await.unwrap();
+        assert!(!entries.is_empty());
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/parse/toc/processor.rs b/vectorless-core/vectorless-index/src/parse/toc/processor.rs
new file mode 100644
index 00000000..1cc43d6c
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/toc/processor.rs
@@ -0,0 +1,573 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! TOC processor - integrates all TOC processing components.
+//!
+//! The processor orchestrates a multi-mode extraction pipeline with automatic
+//! degradation: if one mode fails verification, it falls back to a lower-quality
+//! but more reliable mode.
+
+use futures::stream::{self, StreamExt};
+use tracing::{debug, info, warn};
+
+use vectorless_error::Result;
+use crate::index::parse::pdf::PdfPage;
+use vectorless_llm::LlmClient;
+
+use super::assigner::{PageAssigner, PageAssignerConfig};
+use super::detector::{TocDetector, TocDetectorConfig};
+use super::parser::{TocParser, TocParserConfig};
+use super::repairer::{IndexRepairer, RepairerConfig};
+use super::structure_extractor::{StructureExtractor, StructureExtractorConfig};
+use super::types::{ProcessingMode, TocEntry, VerificationReport};
+use super::verifier::{IndexVerifier, VerifierConfig};
+
+/// TOC processor configuration.
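+///
+/// A construction sketch (values illustrative):
+///
+/// ```rust,ignore
+/// let config = TocProcessorConfig {
+///     accuracy_threshold: 0.8,
+///     max_repair_attempts: 2,
+///     ..Default::default()
+/// };
+/// let processor = TocProcessor::with_config(config);
+/// ```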
+#[derive(Debug, Clone)]
+pub struct TocProcessorConfig {
+    /// TOC detector configuration.
+    pub detector: TocDetectorConfig,
+
+    /// TOC parser configuration.
+    pub parser: TocParserConfig,
+
+    /// Page assigner configuration.
+    pub assigner: PageAssignerConfig,
+
+    /// Verifier configuration.
+    pub verifier: VerifierConfig,
+
+    /// Repairer configuration.
+    pub repairer: RepairerConfig,
+
+    /// Accuracy threshold for acceptance (0.0 - 1.0).
+    pub accuracy_threshold: f32,
+
+    /// Maximum repair attempts per verification cycle.
+    pub max_repair_attempts: usize,
+
+    /// Maximum page span for a single entry before recursive refinement.
+    pub max_pages_per_entry: usize,
+
+    /// Maximum estimated tokens for a single entry before recursive refinement.
+    pub max_tokens_per_entry: usize,
+}
+
+impl Default for TocProcessorConfig {
+    fn default() -> Self {
+        Self {
+            detector: TocDetectorConfig::default(),
+            parser: TocParserConfig::default(),
+            assigner: PageAssignerConfig::default(),
+            verifier: VerifierConfig::default(),
+            repairer: RepairerConfig::default(),
+            accuracy_threshold: 0.6,
+            max_repair_attempts: 3,
+            max_pages_per_entry: 30,
+            max_tokens_per_entry: 20000,
+        }
+    }
+}
+
+/// TOC processor - orchestrates the complete TOC extraction pipeline.
+///
+/// # Processing Pipeline
+///
+/// 1. **Detect** - Find TOC in document (regex + LLM fallback)
+/// 2. **Extract** - Get TOC text from detected pages
+/// 3. **Parse** - Convert TOC text to structured entries (LLM)
+/// 4. **Assign** - Map TOC pages to physical pages
+/// 5. **Verify** - Sample verification of page assignments
+/// 6. **Repair** - Fix incorrect assignments (if needed)
+/// 7. **Refine** - Sub-divide oversized entries (if needed)
+///
+/// # Degradation Strategy
+///
+/// The pipeline tries three modes in order of quality:
+///
+/// 1. `TocWithPageNumbers` - TOC found with page numbers (offset calculation)
+/// 2. `TocWithoutPageNumbers` - TOC found without page numbers (LLM positioning)
+/// 3. `NoToc` - No TOC available (LLM structure extraction from content)
+///
+/// If a mode fails verification (accuracy < threshold), it automatically
+/// degrades to the next mode.
+///
+/// # Example
+///
+/// ```rust,no_run
+/// use vectorless::parser::toc::TocProcessor;
+/// use vectorless::parser::pdf::PdfParser;
+///
+/// # #[tokio::main]
+/// # async fn main() -> vectorless::Result<()> {
+/// let pdf_parser = PdfParser::new();
+/// let bytes = std::fs::read("document.pdf").expect("readable PDF");
+/// // `parse_bytes_raw` keeps the raw pages; `parse_file` returns index nodes.
+/// let result = pdf_parser.parse_bytes_raw(&bytes, Some("document")).await?;
+///
+/// let processor = TocProcessor::new();
+/// let entries = processor.process(&result.pages).await?;
+///
+/// for entry in &entries {
+///     println!("{} - Page {:?}", entry.title, entry.physical_page);
+/// }
+/// # Ok(())
+/// # }
+/// ```
+pub struct TocProcessor {
+    config: TocProcessorConfig,
+    detector: TocDetector,
+    parser: TocParser,
+    assigner: PageAssigner,
+    verifier: IndexVerifier,
+    repairer: IndexRepairer,
+    /// Optional LLM client for StructureExtractor (no-TOC mode and refinement).
+    llm_client: Option<LlmClient>,
+}
+
+impl TocProcessor {
+    /// Create a new TOC processor with default configuration.
+    pub fn new() -> Self {
+        Self::with_config(TocProcessorConfig::default())
+    }
+
+    /// Create a TOC processor with an externally provided LLM client.
+    ///
+    /// All sub-components (detector, parser, assigner, verifier, repairer)
+    /// will use this client instead of creating their own from default config.
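+    ///
+    /// A usage sketch (client construction illustrative; `LlmConfig` comes
+    /// from `vectorless_llm::config`):
+    ///
+    /// ```rust,ignore
+    /// let client = LlmClient::new(LlmConfig::default().into());
+    /// let processor = TocProcessor::with_llm_client(client);
+    /// ```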
+    pub fn with_llm_client(client: LlmClient) -> Self {
+        info!("TocProcessor: created with external LLM client");
+        let config = TocProcessorConfig::default();
+        Self {
+            detector: TocDetector::with_client(config.detector.clone(), client.clone()),
+            parser: TocParser::with_client(client.clone()),
+            assigner: PageAssigner::with_client(client.clone()),
+            verifier: IndexVerifier::with_client(client.clone()),
+            repairer: IndexRepairer::with_client(client.clone()),
+            llm_client: Some(client),
+            config,
+        }
+    }
+
+    /// Create a TOC processor with custom configuration.
+    pub fn with_config(config: TocProcessorConfig) -> Self {
+        info!("TocProcessor: created with config (no external LLM client)");
+        Self {
+            detector: TocDetector::new(config.detector.clone()),
+            parser: TocParser::new(config.parser.clone()),
+            assigner: PageAssigner::new(config.assigner.clone()),
+            verifier: IndexVerifier::new(config.verifier.clone()),
+            repairer: IndexRepairer::new(config.repairer.clone()),
+            llm_client: None,
+            config,
+        }
+    }
+
+    /// Process PDF pages and extract hierarchical structure.
+    ///
+    /// This is the main entry point. It detects TOC, selects the best
+    /// processing mode, and automatically degrades if needed.
+    pub async fn process(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
+        if pages.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        info!("Processing {} pages for TOC extraction", pages.len());
+
+        // Step 1: Detect TOC
+        let detection = self.detector.detect(pages).await?;
+
+        // Step 2: Determine initial mode based on detection result
+        let initial_mode = if !detection.found {
+            info!("No TOC found in document");
+            ProcessingMode::NoToc
+        } else if detection.has_page_numbers {
+            info!("TOC found on pages {:?}, has page numbers", detection.pages);
+            ProcessingMode::TocWithPageNumbers
+        } else {
+            info!("TOC found on pages {:?}, no page numbers", detection.pages);
+            ProcessingMode::TocWithoutPageNumbers
+        };
+
+        // Step 3: Process with degradation
+        let entries = self
+            .process_with_degradation(initial_mode, &detection, pages)
+            .await?;
+
+        // Step 4: Refine oversized entries
+        self.refine_large_entries(entries, pages).await
+    }
+
+    /// Process with automatic mode degradation.
+    ///
+    /// Tries the given mode, verifies the result, and degrades to a
+    /// lower-quality mode if accuracy is below threshold.
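+    ///
+    /// Sketched control flow, assuming `ProcessingMode::degrade` steps through
+    /// the chain `TocWithPageNumbers → TocWithoutPageNumbers → NoToc`:
+    ///
+    /// ```rust,ignore
+    /// let mut mode = ProcessingMode::TocWithPageNumbers;
+    /// while let Some(next) = mode.degrade() {
+    ///     mode = next; // taken only when extraction or verification fails
+    /// }
+    /// ```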
+    async fn process_with_degradation(
+        &self,
+        initial_mode: ProcessingMode,
+        detection: &super::types::TocDetection,
+        pages: &[PdfPage],
+    ) -> Result<Vec<TocEntry>> {
+        let mut mode = initial_mode;
+
+        loop {
+            info!("Attempting extraction with mode {:?}", mode);
+
+            let result = match mode {
+                ProcessingMode::TocWithPageNumbers => {
+                    self.process_toc_with_page_numbers(detection, pages).await
+                }
+                ProcessingMode::TocWithoutPageNumbers => {
+                    self.process_toc_without_page_numbers(detection, pages)
+                        .await
+                }
+                ProcessingMode::NoToc => {
+                    // NoToc always succeeds (produces some structure)
+                    return self.process_without_toc(pages).await;
+                }
+            };
+
+            match result {
+                Ok(entries) if !entries.is_empty() => {
+                    // Verify the entries
+                    let mut mutable_entries = entries;
+                    let report = self.verify_and_repair(&mut mutable_entries, pages).await?;
+
+                    if report.accuracy >= self.config.accuracy_threshold {
+                        info!(
+                            "Mode {:?} succeeded: {} entries, accuracy {:.1}%",
+                            mode,
+                            mutable_entries.len(),
+                            report.accuracy * 100.0
+                        );
+                        return Ok(mutable_entries);
+                    }
+
+                    // Accuracy too low, try degrading
+                    warn!(
+                        "Mode {:?} accuracy {:.1}% below threshold {:.1}%",
+                        mode,
+                        report.accuracy * 100.0,
+                        self.config.accuracy_threshold * 100.0
+                    );
+
+                    match mode.degrade() {
+                        Some(next) => {
+                            info!("Degrading from {:?} to {:?}", mode, next);
+                            mode = next;
+                            // Continue loop with degraded mode
+                        }
+                        None => {
+                            warn!("No further degradation possible, returning best effort");
+                            return Ok(mutable_entries);
+                        }
+                    }
+                }
+                Ok(_) => {
+                    // Empty entries, degrade
+                    warn!("Mode {:?} produced no entries", mode);
+                    match mode.degrade() {
+                        Some(next) => {
+                            mode = next;
+                        }
+                        None => return Ok(Vec::new()),
+                    }
+                }
+                Err(e) => {
+                    warn!("Mode {:?} failed: {}", mode, e);
+                    match mode.degrade() {
+                        Some(next) => {
+                            mode = next;
+                        }
+                        None => return Err(e),
+                    }
+                }
+            }
+        }
+    }
+
+    /// Mode 1: TOC with page numbers.
+    ///
+    /// Parse the TOC, calculate physical-page offset from anchor entries,
+    /// and apply the offset to all entries.
+    async fn process_toc_with_page_numbers(
+        &self,
+        detection: &super::types::TocDetection,
+        pages: &[PdfPage],
+    ) -> Result<Vec<TocEntry>> {
+        let toc_text = self.extract_toc_text(pages, &detection.pages);
+        if toc_text.trim().is_empty() {
+            return Ok(Vec::new());
+        }
+
+        let mut entries = self.parser.parse(&toc_text).await?;
+        if entries.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        // Assign physical pages using offset calculation
+        self.assigner.assign(&mut entries, pages).await?;
+
+        Ok(entries)
+    }
+
+    /// Mode 2: TOC without page numbers.
+    ///
+    /// Parse the TOC, then use LLM to locate each entry in the document.
+    async fn process_toc_without_page_numbers(
+        &self,
+        detection: &super::types::TocDetection,
+        pages: &[PdfPage],
+    ) -> Result<Vec<TocEntry>> {
+        let toc_text = self.extract_toc_text(pages, &detection.pages);
+        if toc_text.trim().is_empty() {
+            return Ok(Vec::new());
+        }
+
+        let mut entries = self.parser.parse(&toc_text).await?;
+        if entries.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        // Clear any TOC page numbers (they're unreliable in this mode)
+        for entry in &mut entries {
+            entry.toc_page = None;
+        }
+
+        // Assign physical pages using LLM positioning
+        self.assigner.assign(&mut entries, pages).await?;
+
+        Ok(entries)
+    }
+
+    /// Mode 3: No TOC available.
+    ///
+    /// Extract document structure directly from page content using LLM.
+    async fn process_without_toc(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
+        info!("Extracting structure from page content (no TOC available)");
+
+        let extractor = match &self.llm_client {
+            Some(client) => {
+                StructureExtractor::with_client(StructureExtractorConfig::default(), client.clone())
+            }
+            None => StructureExtractor::new(StructureExtractorConfig::default()),
+        };
+        extractor.extract(pages).await
+    }
+
+    /// Extract TOC text from pages.
+    fn extract_toc_text(&self, pages: &[PdfPage], toc_pages: &[usize]) -> String {
+        toc_pages
+            .iter()
+            .filter_map(|&page_num| pages.get(page_num - 1))
+            .map(|page| page.text.as_str())
+            .collect::<Vec<_>>()
+            .join("\n\n")
+    }
+
+    /// Verify entries and repair if needed.
+    async fn verify_and_repair(
+        &self,
+        entries: &mut [TocEntry],
+        pages: &[PdfPage],
+    ) -> Result<VerificationReport> {
+        let mut attempts = 0;
+
+        while attempts < self.config.max_repair_attempts {
+            let report = self.verifier.verify(entries, pages).await?;
+
+            if report.accuracy >= self.config.accuracy_threshold {
+                debug!(
+                    "Verification passed: accuracy {:.1}%",
+                    report.accuracy * 100.0
+                );
+                return Ok(report);
+            }
+
+            if report.errors.is_empty() {
+                return Ok(report);
+            }
+
+            let repaired = self.repairer.repair(entries, &report.errors, pages).await?;
+
+            if repaired == 0 {
+                debug!("No repairs possible");
+                return Ok(report);
+            }
+
+            attempts += 1;
+            debug!("Repair attempt {} complete", attempts);
+        }
+
+        self.verifier.verify(entries, pages).await
+    }
+
+    /// Refine oversized entries by extracting sub-structure.
+    ///
+    /// Entries that span too many pages or tokens are broken down using
+    /// the same structure extraction approach used for no-TOC documents.
+    async fn refine_large_entries(
+        &self,
+        entries: Vec<TocEntry>,
+        pages: &[PdfPage],
+    ) -> Result<Vec<TocEntry>> {
+        if entries.is_empty() {
+            return Ok(entries);
+        }
+
+        let page_count = pages.len();
+
+        // Pre-compute next-entry page numbers and classify entries
+        let next_pages: Vec<Option<usize>> = entries
+            .iter()
+            .enumerate()
+            .map(|(i, _)| entries.get(i + 1).and_then(|e| e.physical_page))
+            .collect();
+
+        // Identify oversized entries and launch extractions concurrently
+        let llm_client = self.llm_client.clone();
+        let oversized_futures: Vec<_> = entries
+            .iter()
+            .enumerate()
+            .filter(|(i, entry)| {
+                let span = entry_page_span(entry, next_pages[*i], page_count);
+                let tokens = entry_token_count(entry, pages);
+                span > self.config.max_pages_per_entry && tokens > self.config.max_tokens_per_entry
+            })
+            .map(|(i, entry)| {
+                let start = entry.physical_page.unwrap_or(1);
+                let end = next_pages[i].unwrap_or(page_count);
+                let sub_pages: Vec<PdfPage> = pages
+                    .iter()
+                    .filter(|p| p.number >= start && p.number <= end)
+                    .cloned()
+                    .collect();
+
+                let entry_title = entry.title.clone();
+                let entry_level = entry.level;
+                let llm_client = llm_client.clone();
+
+                async move {
+                    if sub_pages.is_empty() {
+                        return (i, Vec::new());
+                    }
+                    debug!(
+                        "Refining oversized entry '{}' (pages {}-{})",
+                        entry_title, start, end
+                    );
+                    let extractor = match &llm_client {
+                        Some(client) => StructureExtractor::with_client(
+                            StructureExtractorConfig::default(),
+                            client.clone(),
+                        ),
+                        None => StructureExtractor::new(StructureExtractorConfig::default()),
+                    };
+                    match extractor.extract(&sub_pages).await {
+                        Ok(sub_entries) => {
+                            let skip = if sub_entries
+                                .first()
+                                .map(|e| e.title.trim() == entry_title.trim())
+                                .unwrap_or(false)
+                            {
+                                1
+                            } else {
+                                0
+                            };
+
+                            let refined: Vec<TocEntry> = sub_entries[skip..]
+ .iter() + .map(|sub| { + TocEntry::new(&sub.title, sub.level + entry_level) + .with_physical_page(sub.physical_page.unwrap_or(start)) + .with_confidence(sub.confidence * 0.9) + }) + .collect(); + + info!( + "Refined '{}' into {} sub-entries", + entry_title, + refined.len() + ); + (i, refined) + } + Err(e) => { + warn!("Sub-extraction failed for '{}': {}", entry_title, e); + (i, Vec::new()) + } + } + } + }) + .collect(); + + let extraction_results: Vec<_> = stream::iter(oversized_futures) + .buffer_unordered(3) + .collect() + .await; + + // Build a lookup from index → refined sub-entries + let mut refined_map = std::collections::HashMap::new(); + for (idx, sub_entries) in extraction_results { + if !sub_entries.is_empty() { + refined_map.insert(idx, sub_entries); + } + } + + // Assemble final output + let mut result = Vec::with_capacity(entries.len() * 2); + for (i, entry) in entries.into_iter().enumerate() { + if let Some(sub_entries) = refined_map.remove(&i) { + result.extend(sub_entries); + } else { + result.push(entry); + } + } + + Ok(result) + } +} + +impl Default for TocProcessor { + fn default() -> Self { + Self::new() + } +} + +/// Calculate how many pages an entry spans. +/// +/// From its physical_page to the next entry's physical_page (or document end). +fn entry_page_span( + entry: &TocEntry, + next_physical_page: Option, + total_pages: usize, +) -> usize { + let start = entry.physical_page.unwrap_or(1); + let end = next_physical_page.unwrap_or(total_pages); + end.saturating_sub(start) +} + +/// Estimate total tokens for the content covered by an entry. +fn entry_token_count(entry: &TocEntry, pages: &[PdfPage]) -> usize { + let start = entry.physical_page.unwrap_or(1); + pages + .iter() + .filter(|p| p.number >= start) + .take(30) // cap at max_pages_per_entry default + .map(|p| p.token_count) + .sum() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_processor_creation() { + let processor = TocProcessor::new(); + assert_eq!(processor.config.accuracy_threshold, 0.6); + } + + #[tokio::test] + async fn test_empty_pages() { + let processor = TocProcessor::new(); + let entries = processor.process(&[]).await.unwrap(); + assert!(entries.is_empty()); + } +} diff --git a/vectorless-core/vectorless-index/src/parse/toc/repairer.rs b/vectorless-core/vectorless-index/src/parse/toc/repairer.rs new file mode 100644 index 00000000..f8016657 --- /dev/null +++ b/vectorless-core/vectorless-index/src/parse/toc/repairer.rs @@ -0,0 +1,247 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Index repairer - fixes incorrect TOC entry page assignments. + +use futures::stream::{self, StreamExt}; +use tracing::{debug, info}; + +use vectorless_error::Result; +use crate::index::parse::pdf::PdfPage; +use vectorless_llm::config::LlmConfig; + +use super::types::{TocEntry, VerificationError, VerificationReport}; +use super::verifier::IndexVerifier; +use vectorless_llm::LlmClient; + +/// Repairer configuration. +#[derive(Debug, Clone)] +pub struct RepairerConfig { + /// Maximum repair attempts. + pub max_attempts: usize, + + /// LLM configuration. + pub llm_config: LlmConfig, + + /// Page search range around expected page. + pub search_range: usize, +} + +impl Default for RepairerConfig { + fn default() -> Self { + Self { + max_attempts: 3, + llm_config: LlmConfig::default(), + search_range: 5, + } + } +} + +/// Index repairer - fixes incorrect page assignments. 
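+///
+/// A usage sketch (entries and report illustrative); `repair` returns how
+/// many entries had their page assignment corrected:
+///
+/// ```rust,ignore
+/// let repairer = IndexRepairer::with_defaults();
+/// let fixed = repairer.repair(&mut entries, &report.errors, &pages).await?;
+/// ```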
+pub struct IndexRepairer {
+    config: RepairerConfig,
+    client: LlmClient,
+}
+
+impl IndexRepairer {
+    /// Create a new repairer.
+    pub fn new(config: RepairerConfig) -> Self {
+        let client = LlmClient::new(config.llm_config.clone().into());
+        Self { config, client }
+    }
+
+    /// Create a repairer with an externally provided LLM client.
+    pub fn with_client(client: LlmClient) -> Self {
+        Self {
+            config: RepairerConfig::default(),
+            client,
+        }
+    }
+
+    /// Create a repairer with default configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(RepairerConfig::default())
+    }
+
+    /// Repair incorrect entries with bounded concurrency.
+    pub async fn repair(
+        &self,
+        entries: &mut [TocEntry],
+        errors: &[VerificationError],
+        pages: &[PdfPage],
+    ) -> Result<usize> {
+        if errors.is_empty() {
+            return Ok(0);
+        }
+
+        info!("Repairing {} incorrect entries", errors.len());
+
+        // Collect repair tasks (don't borrow entries mutably yet)
+        let client = self.client.clone();
+        let pages_owned = pages.to_vec();
+        let search_range = self.config.search_range;
+
+        let tasks: Vec<_> = errors
+            .iter()
+            .filter(|error| error.index < entries.len())
+            .map(|error| {
+                let title = entries[error.index].title.clone();
+                let expected_page = error.expected_page;
+                let client = client.clone();
+                let pages = pages_owned.clone();
+
+                async move {
+                    let start = expected_page.saturating_sub(search_range).max(1);
+                    let end = (expected_page + search_range).min(pages.len());
+
+                    let result =
+                        Self::find_correct_page_static(&client, &title, &pages, start..=end).await;
+
+                    (title, expected_page, result)
+                }
+            })
+            .collect();
+
+        let results: Vec<_> = stream::iter(tasks).buffer_unordered(5).collect().await;
+
+        // Apply repairs
+        let mut repaired_count = 0;
+        for (title, expected_page, result) in results {
+            match result {
+                Ok(Some(correct_page)) => {
+                    // Find the corresponding error entry and fix it
+                    if let Some(error) = errors.iter().find(|e| e.title == title) {
+                        if error.index < entries.len() {
+                            debug!(
+                                "Repaired '{}': page {} → {}",
+                                title, expected_page, correct_page
+                            );
+                            entries[error.index].physical_page = Some(correct_page);
+                            entries[error.index].confidence = 0.9;
+                            repaired_count += 1;
+                        }
+                    }
+                }
+                Ok(None) => {
+                    debug!(
+                        "Could not repair '{}' (searched around page {})",
+                        title, expected_page
+                    );
+                }
+                Err(e) => {
+                    debug!("Repair failed for '{}': {}", title, e);
+                }
+            }
+        }
+
+        info!("Repaired {}/{} entries", repaired_count, errors.len());
+        Ok(repaired_count)
+    }
+
+    /// Find the correct page for a title within a range (static, for concurrent use).
+    async fn find_correct_page_static(
+        client: &LlmClient,
+        title: &str,
+        pages: &[PdfPage],
+        range: std::ops::RangeInclusive<usize>,
+    ) -> Result<Option<usize>> {
+        let system = "You are a document analysis assistant. Find which page contains a specific section title.";
+
+        // Build content for pages in range
+        let mut content_parts = Vec::new();
+        for page_num in range {
+            if let Some(page) = pages.get(page_num - 1) {
+                // Char-boundary truncation (about 500 chars) avoids panicking
+                // on multi-byte text.
+                let cut = page
+                    .text
+                    .char_indices()
+                    .nth(500)
+                    .map_or(page.text.len(), |(i, _)| i);
+                let text = &page.text[..cut];
+                content_parts.push(format!(
+                    "<page_{}>\n{}\n</page_{}>",
+                    page_num, text, page_num
+                ));
+            }
+        }
+
+        if content_parts.is_empty() {
+            return Ok(None);
+        }
+
+        let content = content_parts.join("\n\n");
+        let user = format!(
+            r#"Find which page contains the section titled: "{}"
+
+Pages:
+{}
+
+Reply in JSON format:
+{{"found": true/false, "page": <page number>}}"#,
+            title, content
+        );
+
+        #[derive(serde::Deserialize)]
+        struct FindResult {
+            found: bool,
+            page: Option<usize>,
+        }
+
+        let result: FindResult = client.complete_json(system, &user).await?;
+
+        if result.found {
+            Ok(result.page)
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Repair with verification loop.
+    pub async fn repair_with_verification(
+        &self,
+        entries: &mut [TocEntry],
+        pages: &[PdfPage],
+        verifier: &IndexVerifier,
+    ) -> Result<VerificationReport> {
+        let mut attempts = 0;
+        let threshold = 0.6; // Hardcoded for now, should be from verifier config
+
+        while attempts < self.config.max_attempts {
+            // Verify current state
+            let report = verifier.verify(entries, pages).await?;
+
+            if report.accuracy >= threshold {
+                info!("Repair complete: accuracy {:.1}%", report.accuracy * 100.0);
+                return Ok(report);
+            }
+
+            if report.errors.is_empty() {
+                return Ok(report);
+            }
+
+            // Repair errors
+            let repaired = self.repair(entries, &report.errors, pages).await?;
+
+            if repaired == 0 {
+                // No repairs made, stop trying
+                debug!("No repairs possible, stopping");
+                return Ok(report);
+            }
+
+            attempts += 1;
+            info!("Repair attempt {} complete, re-verifying", attempts);
+        }
+
+        // Final verification
+        verifier.verify(entries, pages).await
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_repairer_creation() {
+        let repairer = IndexRepairer::with_defaults();
+        assert_eq!(repairer.config.max_attempts, 3);
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/parse/toc/structure_extractor.rs b/vectorless-core/vectorless-index/src/parse/toc/structure_extractor.rs
new file mode 100644
index 00000000..aedd9b36
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/toc/structure_extractor.rs
@@ -0,0 +1,481 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Structure extraction from documents without a Table of Contents.
+//!
+//! When a PDF has no TOC (or all TOC-based extraction modes failed), this
+//! module uses LLM to analyse page content and extract the document's
+//! hierarchical structure directly.
+
+use futures::stream::{self, StreamExt};
+use tracing::{debug, info, warn};
+
+use vectorless_error::Result;
+use crate::index::parse::pdf::PdfPage;
+use vectorless_llm::config::LlmConfig;
+
+use super::types::TocEntry;
+use vectorless_llm::LlmClient;
+
+/// Configuration for structure extraction.
+#[derive(Debug, Clone)]
+pub struct StructureExtractorConfig {
+    /// Maximum estimated tokens per page group sent to LLM.
+    pub max_tokens_per_group: usize,
+
+    /// Number of overlap pages between consecutive groups.
+    pub overlap_pages: usize,
+
+    /// LLM configuration.
+    pub llm_config: LlmConfig,
+}
+
+impl Default for StructureExtractorConfig {
+    fn default() -> Self {
+        Self {
+            max_tokens_per_group: 20_000,
+            overlap_pages: 1,
+            llm_config: LlmConfig::default(),
+        }
+    }
+}
+
+/// A group of consecutive pages with their combined text.
+#[derive(Clone)]
+struct PageGroup {
+    /// Combined text with page markers: `<page_N>\n...\n</page_N>`.
+    text: String,
+    /// Start page number (1-based).
+    start_page: usize,
+    /// End page number (1-based, inclusive).
+    end_page: usize,
+}
+
+/// Extracts document structure from page content using LLM.
+///
+/// Used when a document has no Table of Contents, or when TOC-based extraction
+/// failed. Pages are grouped by token count; the first group seeds an initial
+/// structure and the remaining groups extend it concurrently.
+pub struct StructureExtractor {
+    config: StructureExtractorConfig,
+    client: LlmClient,
+}
+
+impl StructureExtractor {
+    /// Create a new structure extractor.
+    pub fn new(config: StructureExtractorConfig) -> Self {
+        let client = LlmClient::new(config.llm_config.clone().into());
+        Self { config, client }
+    }
+
+    /// Create a structure extractor with an externally provided LLM client.
+    pub fn with_client(config: StructureExtractorConfig, client: LlmClient) -> Self {
+        Self { config, client }
+    }
+
+    /// Create an extractor with default configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(StructureExtractorConfig::default())
+    }
+
+    /// Extract hierarchical structure from all pages.
+    ///
+    /// The first page group is processed alone (initial structure), then all
+    /// remaining groups are processed in parallel, each using the initial
+    /// entries as context. Results are merged and deduplicated.
+    pub async fn extract(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
+        if pages.is_empty() {
+            return Ok(Vec::new());
+        }
+
+        let groups = self.group_pages(pages);
+        let page_count = pages.len();
+        info!(
+            "Extracting structure from {} pages in {} groups",
+            page_count,
+            groups.len()
+        );
+
+        // Phase 1: Generate initial structure from first group
+        let initial_entries = self.generate_initial(&groups[0]).await?;
+        debug!(
+            "Initial group (pages {}-{}): extracted {} entries",
+            groups[0].start_page,
+            groups[0].end_page,
+            initial_entries.len()
+        );
+
+        if groups.len() == 1 {
+            return Ok(Self::finalize_entries(initial_entries, page_count));
+        }
+
+        // Phase 2: Process remaining groups in parallel (bounded concurrency)
+        // Each continuation group uses the initial entries as shared context.
+        let client = self.client.clone();
+        let initial_entries_ref = &initial_entries;
+
+        let continuation_futures: Vec<_> = groups[1..]
+            .iter()
+            .map(|group| {
+                let group = group.clone();
+                let client = client.clone();
+                let initial = initial_entries_ref.to_vec();
+
+                async move {
+                    let result =
+                        Self::generate_continuation_with_client(&client, &group, &initial).await;
+                    (group.start_page, group.end_page, result)
+                }
+            })
+            .collect();
+
+        let continuation_results: Vec<_> = stream::iter(continuation_futures)
+            .buffer_unordered(5)
+            .collect()
+            .await;
+
+        // Phase 3: Merge initial + continuation entries
+        let mut all_entries = initial_entries;
+        for (start, end, result) in continuation_results {
+            match result {
+                Ok(entries) => {
+                    debug!(
+                        "Continuation group (pages {}-{}): extracted {} entries",
+                        start,
+                        end,
+                        entries.len()
+                    );
+                    all_entries.extend(entries);
+                }
+                Err(e) => {
+                    warn!("Continuation group (pages {}-{}) failed: {}", start, end, e);
+                }
+            }
+        }
+
+        // Phase 4: Sort by page number, deduplicate, truncate
+        all_entries.sort_by(|a, b| {
+            a.physical_page
+                .unwrap_or(0)
+                .cmp(&b.physical_page.unwrap_or(0))
+        });
+        all_entries.dedup_by(|a, b| {
+            a.title.trim() == b.title.trim() && a.physical_page == b.physical_page
+        });
+
+        Ok(Self::finalize_entries(all_entries, page_count))
+    }
+
+    /// Truncate out-of-range page numbers and log stats.
+    fn finalize_entries(mut entries: Vec<TocEntry>, page_count: usize) -> Vec<TocEntry> {
+        for entry in &mut entries {
+            if let Some(p) = entry.physical_page {
+                if p > page_count {
+                    warn!("Truncating out-of-range page {} for '{}'", p, entry.title);
+                    entry.physical_page = Some(page_count);
+                }
+            }
+        }
+        info!("Structure extraction complete: {} entries", entries.len());
+        entries
+    }
+
+    /// Group pages by estimated token count.
+    ///
+    /// Each group stays under `max_tokens_per_group`. Consecutive groups
+    /// overlap by `overlap_pages` pages to avoid splitting content at
+    /// section boundaries.
+    fn group_pages(&self, pages: &[PdfPage]) -> Vec<PageGroup> {
+        let mut groups = Vec::new();
+        let mut group_tokens = 0usize;
+        let mut group_pages_buf = Vec::new();
+
+        for (i, page) in pages.iter().enumerate() {
+            let new_tokens = group_tokens + page.token_count;
+
+            if new_tokens > self.config.max_tokens_per_group && !group_pages_buf.is_empty() {
+                // Finalise current group
+                let text = format_group_text(&group_pages_buf);
+                groups.push(PageGroup {
+                    text,
+                    start_page: group_pages_buf.first().unwrap().number,
+                    end_page: group_pages_buf.last().unwrap().number,
+                });
+
+                // Start new group with overlap
+                let overlap_start = i.saturating_sub(self.config.overlap_pages);
+                group_pages_buf = pages[overlap_start..=i].to_vec();
+                group_tokens = group_pages_buf.iter().map(|p| p.token_count).sum();
+            } else {
+                group_tokens = new_tokens;
+                group_pages_buf.push(page.clone());
+            }
+        }
+
+        // Final group
+        if !group_pages_buf.is_empty() {
+            let text = format_group_text(&group_pages_buf);
+            groups.push(PageGroup {
+                text,
+                start_page: group_pages_buf.first().unwrap().number,
+                end_page: group_pages_buf.last().unwrap().number,
+            });
+        }
+
+        groups
+    }
+
+    /// Generate initial structure from the first page group.
+    async fn generate_initial(&self, group: &PageGroup) -> Result<Vec<TocEntry>> {
+        let system = STRUCTURE_EXTRACTION_SYSTEM_PROMPT;
+        let user = format!(
+            r#"Analyze this document content and extract its hierarchical structure.
+
+Document content:
+{}
+
+Return a JSON array:
+[
+  {{"title": "Section Title", "level": 1, "physical_page": 1}},
+  {{"title": "Subsection", "level": 2, "physical_page": 3}},
+  ...
+]
+
+Rules:
+- "level" reflects the hierarchy (1 = chapter/top, 2 = section, 3 = subsection)
+- "physical_page" is the page number where the section begins
+- Preserve original titles as closely as possible
+- Only output the JSON array, no other text"#,
+            group.text
+        );
+
+        let sections: Vec<ExtractedSection> = self.client.complete_json(system, &user).await?;
+
+        Ok(sections
+            .into_iter()
+            .map(|s| {
+                TocEntry::new(s.title, s.level)
+                    .with_physical_page(s.physical_page)
+                    .with_confidence(0.7)
+            })
+            .collect())
+    }
+
+    /// Continue structure extraction for a subsequent group.
+    ///
+    /// Passes previously extracted entries as context so the LLM can
+    /// continue the structure rather than restart.
+    async fn generate_continuation(
+        &self,
+        group: &PageGroup,
+        previous: &[TocEntry],
+    ) -> Result<Vec<TocEntry>> {
+        // Delegate to the static variant so the prompt logic lives in one place.
+        Self::generate_continuation_with_client(&self.client, group, previous).await
+    }
+
+    /// Static version of continuation generation for parallel use.
+    ///
+    /// Uses an owned `LlmClient` reference instead of `&self`.
+    async fn generate_continuation_with_client(
+        client: &LlmClient,
+        group: &PageGroup,
+        previous: &[TocEntry],
+    ) -> Result<Vec<TocEntry>> {
+        let system = STRUCTURE_EXTRACTION_SYSTEM_PROMPT;
+
+        // Summarise the last few previous entries as context
+        let prev_summary = previous
+            .iter()
+            .rev()
+            .take(10)
+            .rev()
+            .map(|e| {
+                format!(
+                    "  {{\"title\": \"{}\", \"level\": {}, \"physical_page\": {}}}",
+                    e.title,
+                    e.level,
+                    e.physical_page.unwrap_or(0)
+                )
+            })
+            .collect::<Vec<_>>()
+            .join(",\n");
+
+        let user = format!(
+            r#"Previously extracted structure:
+[
+{}
+]
+
+Continue extracting structure from these pages:
+{}
+
+Return ONLY the NEW entries (do not repeat previous ones):
+[
+  {{"title": "...", "level": N, "physical_page": M}},
+  ...
+]
+
+If no new structural elements are found, return: []"#,
+            prev_summary, group.text
+        );
+
+        let sections: Vec<ExtractedSection> = client.complete_json(system, &user).await?;
+
+        Ok(sections
+            .into_iter()
+            .map(|s| {
+                TocEntry::new(s.title, s.level)
+                    .with_physical_page(s.physical_page)
+                    .with_confidence(0.7)
+            })
+            .collect())
+    }
+}
+
+/// Format pages into tagged text for LLM consumption.
+fn format_group_text(pages: &[PdfPage]) -> String {
+    pages
+        .iter()
+        .map(|p| {
+            // Truncate very long pages at a char boundary so multi-byte UTF-8
+            // cannot cause a byte-slicing panic.
+            let mut end = 3000.min(p.text.len());
+            while !p.text.is_char_boundary(end) {
+                end -= 1;
+            }
+            let text = &p.text[..end];
+            format!("<page {}>\n{}\n</page {}>", p.number, text, p.number)
+        })
+        .collect::<Vec<_>>()
+        .join("\n\n")
+}
+
+const STRUCTURE_EXTRACTION_SYSTEM_PROMPT: &str = r#"You are a document structure extraction expert. Your task is to analyze document content and extract its hierarchical structure (chapters, sections, subsections).
+
+For each structural element you find, provide:
+- title: The section title exactly as it appears
+- level: The hierarchy level (1 = chapter/top level, 2 = section, 3 = subsection)
+- physical_page: The page number where this section begins
+
+Important:
+- Focus on genuine structural elements (chapters, sections), not paragraph topics
+- Do NOT include the abstract, summary, or bibliography as structural elements unless they are major sections
+- Be conservative: fewer high-quality entries are better than many low-quality ones"#;
+
+/// LLM response type for structure extraction.
+#[derive(serde::Deserialize)]
+struct ExtractedSection {
+    title: String,
+    level: usize,
+    physical_page: usize,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_default_config() {
+        let config = StructureExtractorConfig::default();
+        assert_eq!(config.max_tokens_per_group, 20_000);
+        assert_eq!(config.overlap_pages, 1);
+    }
+
+    #[test]
+    fn test_group_pages_single_group() {
+        let extractor = StructureExtractor::with_defaults();
+
+        let pages: Vec<PdfPage> = (1..=5)
+            .map(|i| PdfPage::new(i, format!("Page {} content", i)))
+            .collect();
+
+        let groups = extractor.group_pages(&pages);
+        assert_eq!(groups.len(), 1);
+        assert_eq!(groups[0].start_page, 1);
+        assert_eq!(groups[0].end_page, 5);
+    }
+
+    #[test]
+    fn test_group_pages_multiple_groups() {
+        let config = StructureExtractorConfig {
+            max_tokens_per_group: 50,
+            overlap_pages: 1,
+            ..Default::default()
+        };
+        let extractor = StructureExtractor::new(config);
+
+        // Create pages with enough text to span multiple groups
+        let pages: Vec<PdfPage> = (1..=10)
+            .map(|i| {
+                let text = format!(
+                    "Page {} content. This is a longer text to use more tokens. ",
+                    i
+                )
+                .repeat(10);
+                PdfPage::new(i, text)
+            })
+            .collect();
+
+        let groups = extractor.group_pages(&pages);
+        assert!(
+            groups.len() > 1,
+            "Expected multiple groups, got {}",
+            groups.len()
+        );
+    }
+
+    #[test]
+    fn test_format_group_text() {
+        let pages = vec![PdfPage::new(1, "Hello"), PdfPage::new(2, "World")];
+        let text = format_group_text(&pages);
+        assert!(text.contains("<page 1>"));
+        assert!(text.contains("<page 2>"));
+        assert!(text.contains("Hello"));
+        assert!(text.contains("World"));
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/parse/toc/types.rs b/vectorless-core/vectorless-index/src/parse/toc/types.rs
new file mode 100644
index 00000000..0438c0d3
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/toc/types.rs
@@ -0,0 +1,350 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! TOC (Table of Contents) types.
+
+use serde::{Deserialize, Serialize};
+
+/// A single TOC entry.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TocEntry {
+    /// Section title.
+    pub title: String,
+
+    /// Hierarchy level (1 = top level, 2 = subsection, etc.).
+    pub level: usize,
+
+    /// Page number from TOC (may have offset).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub toc_page: Option<usize>,
+
+    /// Actual physical page number (after verification/assignment).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub physical_page: Option<usize>,
+
+    /// Confidence score (0.0 - 1.0).
+    #[serde(default)]
+    pub confidence: f32,
+
+    /// Start line index (for tree building).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub start_index: Option<usize>,
+
+    /// End line index (for tree building).
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub end_index: Option<usize>,
+
+    /// Content of this section.
+    #[serde(skip_serializing_if = "String::is_empty")]
+    pub content: String,
+}
+
+impl TocEntry {
+    /// Create a new TOC entry.
+    pub fn new(title: impl Into<String>, level: usize) -> Self {
+        Self {
+            title: title.into(),
+            level,
+            toc_page: None,
+            physical_page: None,
+            confidence: 1.0,
+            start_index: None,
+            end_index: None,
+            content: String::new(),
+        }
+    }
+
+    /// Set the TOC page number.
+    pub fn with_toc_page(mut self, page: usize) -> Self {
+        self.toc_page = Some(page);
+        self
+    }
+
+    /// Set the physical page number.
+    pub fn with_physical_page(mut self, page: usize) -> Self {
+        self.physical_page = Some(page);
+        self
+    }
+
+    /// Set the confidence score.
+    pub fn with_confidence(mut self, confidence: f32) -> Self {
+        self.confidence = confidence.clamp(0.0, 1.0);
+        self
+    }
+
+    /// Check if this entry has a valid physical page.
+    pub fn has_physical_page(&self) -> bool {
+        self.physical_page.is_some()
+    }
+}
+
+impl Default for TocEntry {
+    fn default() -> Self {
+        Self::new("", 1)
+    }
+}
+
+/// Result of TOC detection.
+#[derive(Debug, Clone)]
+pub struct TocDetection {
+    /// Whether a TOC was found.
+    pub found: bool,
+
+    /// Page numbers where TOC appears.
+    pub pages: Vec<usize>,
+
+    /// Whether the TOC contains page numbers.
+    pub has_page_numbers: bool,
+
+    /// Detection confidence (0.0 - 1.0).
+    pub confidence: f32,
+}
+
+impl TocDetection {
+    /// Create a new TOC detection result.
+    pub fn new(found: bool) -> Self {
+        Self {
+            found,
+            pages: Vec::new(),
+            has_page_numbers: false,
+            confidence: 0.0,
+        }
+    }
+
+    /// Create a result indicating no TOC was found.
+    pub fn not_found() -> Self {
+        Self::new(false)
+    }
+
+    /// Set the TOC pages.
+    pub fn with_pages(mut self, pages: Vec<usize>) -> Self {
+        self.pages = pages;
+        self
+    }
+
+    /// Set whether page numbers are present.
+    pub fn with_page_numbers(mut self, has: bool) -> Self {
+        self.has_page_numbers = has;
+        self
+    }
+
+    /// Set the confidence score.
+    pub fn with_confidence(mut self, confidence: f32) -> Self {
+        self.confidence = confidence.clamp(0.0, 1.0);
+        self
+    }
+}
+
+/// Page offset calculation result.
+#[derive(Debug, Clone)]
+pub struct PageOffset {
+    /// Calculated offset: physical_page = toc_page + offset.
+    pub offset: i32,
+
+    /// Number of samples used for calculation.
+    pub sample_count: usize,
+
+    /// Confidence in the offset calculation.
+    pub confidence: f32,
+}
+
+impl PageOffset {
+    /// Create a new page offset.
+    pub fn new(offset: i32, sample_count: usize, confidence: f32) -> Self {
+        Self {
+            offset,
+            sample_count,
+            confidence: confidence.clamp(0.0, 1.0),
+        }
+    }
+
+    /// Apply offset to a TOC page number.
+    pub fn apply(&self, toc_page: usize) -> usize {
+        (toc_page as i32 + self.offset).max(1) as usize
+    }
+}
+
+/// Verification error for a single entry.
+#[derive(Debug, Clone)]
+pub struct VerificationError {
+    /// Index of the entry in the TOC list.
+    pub index: usize,
+
+    /// Entry title.
+    pub title: String,
+
+    /// Expected physical page.
+    pub expected_page: usize,
+
+    /// Type of error.
+    pub error_type: ErrorType,
+}
+
+impl VerificationError {
+    /// Create a new verification error.
+    pub fn new(
+        index: usize,
+        title: impl Into<String>,
+        expected_page: usize,
+        error_type: ErrorType,
+    ) -> Self {
+        Self {
+            index,
+            title: title.into(),
+            expected_page,
+            error_type,
+        }
+    }
+}
+
+/// Type of verification error.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ErrorType {
+    /// Title not found on the expected page.
+    TitleNotFound,
+    /// Title found but not at page start.
+    NotAtPageStart,
+    /// Page number out of document range.
+    PageOutOfRange,
+}
+
+impl std::fmt::Display for ErrorType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ErrorType::TitleNotFound => write!(f, "Title not found on page"),
+            ErrorType::NotAtPageStart => write!(f, "Title not at page start"),
+            ErrorType::PageOutOfRange => write!(f, "Page out of range"),
+        }
+    }
+}
+
+/// Result of TOC verification.
+#[derive(Debug, Clone)]
+pub struct VerificationReport {
+    /// Total entries verified.
+    pub total: usize,
+
+    /// Number of correct entries.
+    pub correct: usize,
+
+    /// Accuracy (0.0 - 1.0).
+    pub accuracy: f32,
+
+    /// List of errors found.
+    pub errors: Vec<VerificationError>,
+}
+
+impl VerificationReport {
+    /// Create a new verification report.
+    pub fn new(total: usize, correct: usize, errors: Vec<VerificationError>) -> Self {
+        let accuracy = if total > 0 {
+            correct as f32 / total as f32
+        } else {
+            1.0
+        };
+        Self {
+            total,
+            correct,
+            accuracy,
+            errors,
+        }
+    }
+
+    /// Create a report indicating all entries are correct.
+    pub fn all_correct(total: usize) -> Self {
+        Self::new(total, total, Vec::new())
+    }
+
+    /// Check if the accuracy meets a threshold.
+    pub fn meets_threshold(&self, threshold: f32) -> bool {
+        self.accuracy >= threshold
+    }
+
+    /// Check if there are any errors.
+    pub fn has_errors(&self) -> bool {
+        !self.errors.is_empty()
+    }
+}
+
+/// Processing mode for the TOC extraction pipeline.
+///
+/// Modes are ordered by quality: higher modes produce more accurate results
+/// when they succeed, but can degrade to lower modes on failure.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ProcessingMode {
+    /// TOC found with page numbers. Highest quality path.
+    TocWithPageNumbers,
+    /// TOC found without page numbers, or page-number accuracy was too low.
+    TocWithoutPageNumbers,
+    /// No TOC, or all TOC-based modes failed. LLM-driven structure extraction.
+    NoToc,
+}
+
+impl ProcessingMode {
+    /// Degrade to the next lower quality mode.
+    ///
+    /// Returns `None` if already at the lowest mode (`NoToc`).
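+    ///
+    /// A minimal sketch of walking the degradation chain:
+    ///
+    /// ```rust,ignore
+    /// let mut mode = ProcessingMode::TocWithPageNumbers;
+    /// while let Some(next) = mode.degrade() {
+    ///     mode = next; // TocWithoutPageNumbers, then NoToc
+    /// }
+    /// assert_eq!(mode, ProcessingMode::NoToc);
+    /// ```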
+    pub fn degrade(self) -> Option<Self> {
+        match self {
+            Self::TocWithPageNumbers => Some(Self::TocWithoutPageNumbers),
+            Self::TocWithoutPageNumbers => Some(Self::NoToc),
+            Self::NoToc => None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_toc_entry() {
+        let entry = TocEntry::new("Chapter 1", 1)
+            .with_toc_page(10)
+            .with_physical_page(12)
+            .with_confidence(0.9);
+
+        assert_eq!(entry.title, "Chapter 1");
+        assert_eq!(entry.level, 1);
+        assert_eq!(entry.toc_page, Some(10));
+        assert_eq!(entry.physical_page, Some(12));
+        assert!((entry.confidence - 0.9).abs() < 0.01);
+    }
+
+    #[test]
+    fn test_page_offset() {
+        let offset = PageOffset::new(2, 5, 0.9);
+        assert_eq!(offset.apply(10), 12);
+        assert_eq!(offset.apply(1), 3);
+    }
+
+    #[test]
+    fn test_verification_report() {
+        let report = VerificationReport::all_correct(10);
+        assert_eq!(report.total, 10);
+        assert_eq!(report.correct, 10);
+        assert_eq!(report.accuracy, 1.0);
+        assert!(!report.has_errors());
+    }
+
+    #[test]
+    fn test_error_type_display() {
+        assert_eq!(
+            format!("{}", ErrorType::TitleNotFound),
+            "Title not found on page"
+        );
+    }
+
+    #[test]
+    fn test_processing_mode_degrade() {
+        assert_eq!(
+            ProcessingMode::TocWithPageNumbers.degrade(),
+            Some(ProcessingMode::TocWithoutPageNumbers)
+        );
+        assert_eq!(
+            ProcessingMode::TocWithoutPageNumbers.degrade(),
+            Some(ProcessingMode::NoToc)
+        );
+        assert_eq!(ProcessingMode::NoToc.degrade(), None);
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/parse/toc/verifier.rs b/vectorless-core/vectorless-index/src/parse/toc/verifier.rs
new file mode 100644
index 00000000..41d6ce29
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/toc/verifier.rs
@@ -0,0 +1,281 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Index verifier - verifies TOC entry page assignments.
+
+use futures::stream::{self, StreamExt};
+use rand::seq::SliceRandom;
+use tracing::{debug, info};
+
+use crate::index::parse::pdf::PdfPage;
+use vectorless_error::Result;
+use vectorless_llm::{config::LlmConfig, LlmClient};
+
+use super::types::{ErrorType, TocEntry, VerificationError, VerificationReport};
+
+/// Verifier configuration.
+#[derive(Debug, Clone)]
+pub struct VerifierConfig {
+    /// Sample size for verification (`None` = all entries).
+    pub sample_size: Option<usize>,
+
+    /// LLM configuration.
+    pub llm_config: LlmConfig,
+
+    /// Accuracy threshold for acceptance.
+    pub accuracy_threshold: f32,
+}
+
+impl Default for VerifierConfig {
+    fn default() -> Self {
+        Self {
+            sample_size: Some(10),
+            llm_config: LlmConfig::default(),
+            accuracy_threshold: 0.6,
+        }
+    }
+}
+
+/// Index verifier - verifies that TOC entries point to correct pages.
+pub struct IndexVerifier {
+    config: VerifierConfig,
+    client: LlmClient,
+}
+
+impl IndexVerifier {
+    /// Create a new verifier.
+    pub fn new(config: VerifierConfig) -> Self {
+        let client = LlmClient::new(config.llm_config.clone().into());
+        Self { config, client }
+    }
+
+    /// Create a verifier with an externally provided LLM client.
+    pub fn with_client(client: LlmClient) -> Self {
+        Self {
+            config: VerifierConfig::default(),
+            client,
+        }
+    }
+
+    /// Create a verifier with default configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(VerifierConfig::default())
+    }
+
+    /// Verify TOC entries against PDF pages.
+    ///
+    /// Sample entries are verified via LLM calls with bounded concurrency.
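+    ///
+    /// A minimal call sketch (`entries` and `pages` are assumed to come from
+    /// the TOC and PDF parsing stages):
+    ///
+    /// ```rust,ignore
+    /// let verifier = IndexVerifier::with_defaults();
+    /// let report = verifier.verify(&entries, &pages).await?;
+    /// if !report.meets_threshold(0.6) {
+    ///     // accuracy too low: degrade to the next ProcessingMode
+    /// }
+    /// ```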
+    pub async fn verify(
+        &self,
+        entries: &[TocEntry],
+        pages: &[PdfPage],
+    ) -> Result<VerificationReport> {
+        if entries.is_empty() {
+            return Ok(VerificationReport::all_correct(0));
+        }
+
+        let sample = self.select_sample(entries);
+
+        // Launch verification checks with bounded concurrency.
+        // Share the pages via Arc so each future does not clone the whole document.
+        let client = self.client.clone();
+        let pages = std::sync::Arc::new(pages.to_vec());
+        let futures: Vec<_> = sample
+            .iter()
+            .map(|(index, entry)| {
+                let index = *index;
+                let title = entry.title.clone();
+                let physical_page = entry.physical_page;
+                let client = client.clone();
+                let pages = std::sync::Arc::clone(&pages);
+
+                async move {
+                    match physical_page {
+                        Some(page) => {
+                            let result =
+                                Self::verify_entry_with_client(&client, &title, page, &pages).await;
+                            (index, title, page, result)
+                        }
+                        None => (index, title, 0, Ok(Err(ErrorType::PageOutOfRange))),
+                    }
+                }
+            })
+            .collect();
+
+        let results: Vec<_> = stream::iter(futures).buffer_unordered(5).collect().await;
+
+        // Aggregate results
+        let total = results.len();
+        let mut errors = Vec::new();
+        let mut correct = 0;
+
+        for (index, title, page, result) in results {
+            match result {
+                Ok(Ok(())) => correct += 1,
+                Ok(Err(error_type)) => {
+                    errors.push(VerificationError::new(index, title, page, error_type));
+                }
+                Err(e) => {
+                    debug!("Verification LLM call failed: {}", e);
+                    errors.push(VerificationError::new(
+                        index,
+                        title,
+                        page,
+                        ErrorType::TitleNotFound,
+                    ));
+                }
+            }
+        }
+
+        let report = VerificationReport::new(total, correct, errors);
+        info!(
+            "Verification complete: {}/{} correct ({:.1}% accuracy)",
+            report.correct,
+            report.total,
+            report.accuracy * 100.0
+        );
+
+        Ok(report)
+    }
+
+    /// Select a sample of entries to verify.
+    fn select_sample<'a>(&self, entries: &'a [TocEntry]) -> Vec<(usize, &'a TocEntry)> {
+        let with_pages: Vec<_> = entries
+            .iter()
+            .enumerate()
+            .filter(|(_, e)| e.physical_page.is_some())
+            .collect();
+
+        match self.config.sample_size {
+            Some(size) if size < with_pages.len() => {
+                // Random sample
+                let mut rng = rand::thread_rng();
+                let mut sample: Vec<_> = with_pages;
+                sample.shuffle(&mut rng);
+                sample.into_iter().take(size).collect()
+            }
+            _ => with_pages,
+        }
+    }
+
+    /// Verify a single entry using a cloned client (for concurrent use).
+    async fn verify_entry_with_client(
+        client: &LlmClient,
+        title: &str,
+        physical_page: usize,
+        pages: &[PdfPage],
+    ) -> Result<Result<(), ErrorType>> {
+        if physical_page == 0 || physical_page > pages.len() {
+            return Ok(Err(ErrorType::PageOutOfRange));
+        }
+
+        let page = &pages[physical_page - 1];
+
+        let found = Self::check_title_on_page_with_client(client, title, &page.text).await?;
+
+        if !found {
+            debug!("Title '{}' not found on page {}", title, physical_page);
+            return Ok(Err(ErrorType::TitleNotFound));
+        }
+
+        Ok(Ok(()))
+    }
+
+    /// Check if a title appears on a page using LLM.
+    async fn check_title_on_page_with_client(
+        client: &LlmClient,
+        title: &str,
+        page_text: &str,
+    ) -> Result<bool> {
+        let system = "You are a document analysis assistant. Determine if a section title appears in the given text.";
+
+        // Truncate at a char boundary so multi-byte UTF-8 cannot cause a
+        // byte-slicing panic.
+        let mut end = 1000.min(page_text.len());
+        while !page_text.is_char_boundary(end) {
+            end -= 1;
+        }
+        let text = &page_text[..end];
+
+        let user = format!(
+            r#"Does the section title "{}" appear in this page text?
+
+Page text:
+{}
+
+Reply in JSON format:
+{{"found": true/false}}"#,
+            title, text
+        );
+
+        #[derive(serde::Deserialize)]
+        struct CheckResult {
+            found: bool,
+        }
+
+        let result: CheckResult = client.complete_json(system, &user).await?;
+        Ok(result.found)
+    }
+
+    /// Check if a title appears at the start of a page.
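+    ///
+    /// Only the beginning of the page (up to ~500 bytes) is sent to the LLM.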
+    pub async fn check_title_at_start(&self, title: &str, page_text: &str) -> Result<bool> {
+        let system = "You are a document analysis assistant. Determine if a section title appears at the START of the given page text.";
+
+        // Only check the first 500 bytes, truncated at a char boundary so
+        // multi-byte UTF-8 cannot cause a slicing panic.
+        let mut end = 500.min(page_text.len());
+        while !page_text.is_char_boundary(end) {
+            end -= 1;
+        }
+        let text = &page_text[..end];
+
+        let user = format!(
+            r#"Does the section title "{}" appear at the BEGINNING of this page text?
+Note: It should be near the start, not in the middle or end.
+
+Page text:
+{}
+
+Reply in JSON format:
+{{"at_start": true/false}}"#,
+            title, text
+        );
+
+        #[derive(serde::Deserialize)]
+        struct StartCheck {
+            at_start: bool,
+        }
+
+        let result: StartCheck = self.client.complete_json(system, &user).await?;
+        Ok(result.at_start)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_select_sample() {
+        let verifier = IndexVerifier::with_defaults();
+
+        let entries: Vec<TocEntry> = (1..=20)
+            .map(|i| TocEntry::new(format!("Entry {}", i), 1).with_physical_page(i))
+            .collect();
+
+        let sample = verifier.select_sample(&entries);
+        assert_eq!(sample.len(), 10); // default sample_size
+    }
+
+    #[test]
+    fn test_select_sample_all() {
+        let config = VerifierConfig {
+            sample_size: None,
+            ..Default::default()
+        };
+        let verifier = IndexVerifier::new(config);
+
+        let entries: Vec<TocEntry> = (1..=5)
+            .map(|i| TocEntry::new(format!("Entry {}", i), 1).with_physical_page(i))
+            .collect();
+
+        let sample = verifier.select_sample(&entries);
+        assert_eq!(sample.len(), 5);
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/parse/types.rs b/vectorless-core/vectorless-index/src/parse/types.rs
new file mode 100644
index 00000000..92dd6b0f
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/parse/types.rs
@@ -0,0 +1,173 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document type definitions.
+//!
+//! This module defines the types used for document parsing:
+//! - [`RawNode`] - A raw node extracted from a document before tree construction
+//! - [`DocumentMeta`] - Metadata about a document
+//! - [`DocumentFormat`] - Supported document formats (re-exported from the document module)
+
+use serde::{Deserialize, Serialize};
+
+/// Re-export [`DocumentFormat`] from the document module.
+pub use vectorless_document::DocumentFormat;
+
+/// A raw node extracted from a document.
+///
+/// This represents a section or element before it's organized into a tree.
+/// Raw nodes are produced by parsers and consumed by the indexer.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RawNode {
+    /// Title or heading of this node.
+    pub title: String,
+
+    /// Text content of this node (including all children's content).
+    pub content: String,
+
+    /// Level in the hierarchy (0 = root, 1 = top-level section, etc.).
+    pub level: usize,
+
+    /// Line number where this node starts (1-based).
+    pub line_start: usize,
+
+    /// Line number where this node ends (1-based).
+    pub line_end: usize,
+
+    /// Page number for PDF documents (1-based).
+    pub page: Option<usize>,
+
+    /// Estimated token count for this node's own content.
+    pub token_count: Option<usize>,
+
+    /// Total token count including all children (recursive, computed by the thinner).
+    #[serde(default)]
+    pub total_token_count: Option<usize>,
+}
+
+impl Default for RawNode {
+    fn default() -> Self {
+        Self {
+            title: String::new(),
+            content: String::new(),
+            level: 0,
+            line_start: 1,
+            line_end: 1,
+            page: None,
+            token_count: None,
+            total_token_count: None,
+        }
+    }
+}
+
+impl RawNode {
+    /// Create a new raw node with the given title.
+    pub fn new(title: impl Into<String>) -> Self {
+        Self {
+            title: title.into(),
+            ..Default::default()
+        }
+    }
+
+    /// Set the content of this node.
+    pub fn with_content(mut self, content: impl Into<String>) -> Self {
+        self.content = content.into();
+        self
+    }
+
+    /// Set the level of this node.
+    pub fn with_level(mut self, level: usize) -> Self {
+        self.level = level;
+        self
+    }
+
+    /// Set the line range of this node.
+    pub fn with_lines(mut self, start: usize, end: usize) -> Self {
+        self.line_start = start;
+        self.line_end = end;
+        self
+    }
+
+    /// Set the page number of this node.
+    pub fn with_page(mut self, page: usize) -> Self {
+        self.page = Some(page);
+        self
+    }
+
+    /// Check if this node has any content.
+    pub fn has_content(&self) -> bool {
+        !self.content.trim().is_empty()
+    }
+
+    /// Get the character count of the content.
+    pub fn char_count(&self) -> usize {
+        self.content.chars().count()
+    }
+
+    /// Get the approximate word count of the content.
+    pub fn word_count(&self) -> usize {
+        self.content.split_whitespace().count()
+    }
+}
+
+/// Document metadata.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct DocumentMeta {
+    /// Document name or title.
+    pub name: String,
+
+    /// Document format.
+    pub format: DocumentFormat,
+
+    /// Total number of pages (for PDF).
+    pub page_count: Option<usize>,
+
+    /// Total number of lines.
+    pub line_count: usize,
+
+    /// Source file path (if applicable).
+    pub source_path: Option<String>,
+
+    /// Document description (generated by LLM).
+    pub description: Option<String>,
+}
+
+impl Default for DocumentMeta {
+    fn default() -> Self {
+        Self {
+            name: String::new(),
+            format: DocumentFormat::Markdown,
+            page_count: None,
+            line_count: 0,
+            source_path: None,
+            description: None,
+        }
+    }
+}
+
+/// Result of parsing a document.
+#[derive(Debug, Clone)]
+pub struct ParseResult {
+    /// Document metadata.
+    pub meta: DocumentMeta,
+
+    /// Raw nodes extracted from the document.
+    pub nodes: Vec<RawNode>,
+}
+
+impl ParseResult {
+    /// Create a new parse result.
+    pub fn new(meta: DocumentMeta, nodes: Vec<RawNode>) -> Self {
+        Self { meta, nodes }
+    }
+
+    /// Get the number of nodes.
+    pub fn node_count(&self) -> usize {
+        self.nodes.len()
+    }
+
+    /// Check if there are no nodes.
+    pub fn is_empty(&self) -> bool {
+        self.nodes.is_empty()
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/pipeline/checkpoint.rs b/vectorless-core/vectorless-index/src/pipeline/checkpoint.rs
new file mode 100644
index 00000000..d192679f
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/pipeline/checkpoint.rs
@@ -0,0 +1,329 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Pipeline checkpoint support for resume-after-interruption.
+//!
+//! Saves pipeline state after each stage group completes.
+//! On restart, completed stages are skipped and the pipeline resumes
+//! from the first incomplete stage.
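+//!
+//! A minimal resume sketch (the path, document ID, hash, and fingerprint are
+//! illustrative):
+//!
+//! ```rust,ignore
+//! let manager = CheckpointManager::new("/tmp/vectorless-checkpoints");
+//! if let Some(cp) = manager.load("doc-123") {
+//!     if CheckpointManager::is_valid_for_resume(&cp, &source_hash, 1, &config_fp) {
+//!         // skip everything in cp.completed_stages and resume
+//!     }
+//! }
+//! ```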
+
+use std::path::PathBuf;
+
+use chrono::{DateTime, Utc};
+use serde::{Deserialize, Serialize};
+use tracing::{info, warn};
+
+use vectorless_document::DocumentTree;
+use crate::index::parse::RawNode;
+
+use super::metrics::IndexMetrics;
+
+/// Serializable checkpoint capturing pipeline state at a point in time.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PipelineCheckpoint {
+    /// Document ID being indexed.
+    pub doc_id: String,
+
+    /// SHA-256 hash of the source content.
+    pub source_hash: String,
+
+    /// Processing version at the time of checkpoint.
+    pub processing_version: u32,
+
+    /// Fingerprint of pipeline configuration.
+    pub config_fingerprint: String,
+
+    /// Names of stages that completed successfully.
+    pub completed_stages: Vec<String>,
+
+    /// Serialized context data that stages need for resume.
+    pub context_data: CheckpointContextData,
+
+    /// When this checkpoint was created.
+    pub timestamp: DateTime<Utc>,
+}
+
+/// Context data that can be serialized for checkpoint persistence.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct CheckpointContextData {
+    /// Raw nodes from parsing (if the parse stage completed).
+    pub raw_nodes: Vec<RawNode>,
+
+    /// Built document tree (if the build stage completed).
+    pub tree: Option<DocumentTree>,
+
+    /// Metrics collected so far.
+    pub metrics: IndexMetrics,
+
+    /// Page count (for PDFs).
+    pub page_count: Option<usize>,
+
+    /// Line count.
+    pub line_count: Option<usize>,
+
+    /// Document description.
+    pub description: Option<String>,
+}
+
+/// Manages checkpoint persistence on disk.
+pub struct CheckpointManager {
+    /// Directory where checkpoints are stored.
+    checkpoint_dir: PathBuf,
+}
+
+impl CheckpointManager {
+    /// Create a new checkpoint manager.
+    ///
+    /// The directory will be created on first save if it doesn't exist.
+    pub fn new(checkpoint_dir: impl Into<PathBuf>) -> Self {
+        Self {
+            checkpoint_dir: checkpoint_dir.into(),
+        }
+    }
+
+    /// Save a checkpoint for the given document.
+    pub fn save(&self, doc_id: &str, checkpoint: &PipelineCheckpoint) -> std::io::Result<()> {
+        // Ensure directory exists
+        std::fs::create_dir_all(&self.checkpoint_dir)?;
+
+        let path = self.checkpoint_path(doc_id);
+        let json = serde_json::to_string(checkpoint)
+            .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
+
+        // Write atomically: write to a temp file, then rename
+        let temp_path = path.with_extension("tmp");
+        std::fs::write(&temp_path, json)?;
+        std::fs::rename(&temp_path, &path)?;
+
+        Ok(())
+    }
+
+    /// Load a checkpoint for the given document.
+    ///
+    /// Returns `None` if no checkpoint exists.
+    pub fn load(&self, doc_id: &str) -> Option<PipelineCheckpoint> {
+        let path = self.checkpoint_path(doc_id);
+        if !path.exists() {
+            return None;
+        }
+
+        let data = std::fs::read(&path).ok()?;
+        match serde_json::from_slice(&data) {
+            Ok(checkpoint) => Some(checkpoint),
+            Err(e) => {
+                warn!("Failed to deserialize checkpoint for {}: {}", doc_id, e);
+                None
+            }
+        }
+    }
+
+    /// Remove a checkpoint after successful completion.
+    pub fn clear(&self, doc_id: &str) -> std::io::Result<()> {
+        let path = self.checkpoint_path(doc_id);
+        if path.exists() {
+            std::fs::remove_file(path)?;
+            info!("Cleared checkpoint for document {}", doc_id);
+        }
+        Ok(())
+    }
+
+    /// Check if a checkpoint exists for the given document.
+    pub fn exists(&self, doc_id: &str) -> bool {
+        self.checkpoint_path(doc_id).exists()
+    }
+
+    /// Get the checkpoint file path for a document.
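+    ///
+    /// Characters that are unsafe in filenames are replaced with `_`.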
+    fn checkpoint_path(&self, doc_id: &str) -> PathBuf {
+        // Use a sanitized version of doc_id for the filename
+        let safe_name = doc_id.replace(['/', '\\', ':', '*', '?', '"', '<', '>', '|'], "_");
+        self.checkpoint_dir
+            .join(format!("{}.checkpoint.json", safe_name))
+    }
+
+    /// Check if a checkpoint is valid for resuming.
+    ///
+    /// A checkpoint is valid if:
+    /// - Source hash matches (content hasn't changed)
+    /// - Processing version matches (algorithm hasn't changed)
+    /// - Config fingerprint matches (options haven't changed)
+    pub fn is_valid_for_resume(
+        checkpoint: &PipelineCheckpoint,
+        source_hash: &str,
+        processing_version: u32,
+        config_fingerprint: &str,
+    ) -> bool {
+        checkpoint.source_hash == source_hash
+            && checkpoint.processing_version == processing_version
+            && checkpoint.config_fingerprint == config_fingerprint
+    }
+
+    /// List all checkpoint files in the directory.
+    pub fn list_checkpoints(&self) -> Vec<String> {
+        let mut result = Vec::new();
+        if let Ok(entries) = std::fs::read_dir(&self.checkpoint_dir) {
+            for entry in entries.flatten() {
+                let path = entry.path();
+                if path.extension().map_or(false, |e| e == "json") {
+                    if let Some(name) = path.file_stem().and_then(|n| n.to_str()) {
+                        // Strip the .checkpoint suffix
+                        if let Some(doc_id) = name.strip_suffix(".checkpoint") {
+                            result.push(doc_id.to_string());
+                        }
+                    }
+                }
+            }
+        }
+        result
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    fn make_checkpoint() -> PipelineCheckpoint {
+        PipelineCheckpoint {
+            doc_id: "test-doc-123".to_string(),
+            source_hash: "abc123".to_string(),
+            processing_version: 1,
+            config_fingerprint: "cfg-fp".to_string(),
+            completed_stages: vec!["parse".to_string(), "build".to_string()],
+            context_data: CheckpointContextData {
+                raw_nodes: Vec::new(),
+                tree: Some(DocumentTree::new("Test", "content")),
+                metrics: IndexMetrics::default(),
+                page_count: None,
+                line_count: Some(10),
+                description: None,
+            },
+            timestamp: Utc::now(),
+        }
+    }
+
+    #[test]
+    fn test_save_and_load() {
+        let dir = TempDir::new().unwrap();
+        let manager = CheckpointManager::new(dir.path());
+
+        let checkpoint = make_checkpoint();
+        manager.save("test-doc-123", &checkpoint).unwrap();
+
+        let loaded = manager.load("test-doc-123").unwrap();
+        assert_eq!(loaded.doc_id, "test-doc-123");
+        assert_eq!(loaded.completed_stages, vec!["parse", "build"]);
+        assert_eq!(loaded.context_data.line_count, Some(10));
+    }
+
+    #[test]
+    fn test_load_nonexistent() {
+        let dir = TempDir::new().unwrap();
+        let manager = CheckpointManager::new(dir.path());
+
+        assert!(manager.load("nonexistent").is_none());
+    }
+
+    #[test]
+    fn test_clear() {
+        let dir = TempDir::new().unwrap();
+        let manager = CheckpointManager::new(dir.path());
+
+        let checkpoint = make_checkpoint();
+        manager.save("test-doc-123", &checkpoint).unwrap();
+        assert!(manager.exists("test-doc-123"));
+
+        manager.clear("test-doc-123").unwrap();
+        assert!(!manager.exists("test-doc-123"));
+    }
+
+    #[test]
+    fn test_is_valid_for_resume() {
+        let checkpoint = make_checkpoint();
+
+        // Matching — valid
+        assert!(CheckpointManager::is_valid_for_resume(
+            &checkpoint,
+            "abc123",
+            1,
+            "cfg-fp"
+        ));
+
+        // Different source hash — invalid
+        assert!(!CheckpointManager::is_valid_for_resume(
+            &checkpoint,
+            "different",
+            1,
+            "cfg-fp"
+        ));
+
+        // Different processing version — invalid
+        assert!(!CheckpointManager::is_valid_for_resume(
+            &checkpoint,
+            "abc123",
+            2,
+            "cfg-fp"
+        ));
+
+        // Different config fingerprint — invalid
+        assert!(!CheckpointManager::is_valid_for_resume(
+            &checkpoint,
+            "abc123",
+            1,
+            "different"
+        ));
+    }
+
+    #[test]
+    fn test_list_checkpoints() {
+        let dir = TempDir::new().unwrap();
+        let manager = CheckpointManager::new(dir.path());
+
+        let mut cp = make_checkpoint();
+        cp.doc_id = "doc-a".to_string();
+        manager.save("doc-a", &cp).unwrap();
+
+        cp.doc_id = "doc-b".to_string();
+        manager.save("doc-b", &cp).unwrap();
+
+        let list = manager.list_checkpoints();
+        assert_eq!(list.len(), 2);
+        assert!(list.contains(&"doc-a".to_string()));
+        assert!(list.contains(&"doc-b".to_string()));
+    }
+
+    #[test]
+    fn test_roundtrip_preserves_tree() {
+        let dir = TempDir::new().unwrap();
+        let manager = CheckpointManager::new(dir.path());
+
+        let mut tree = DocumentTree::new("Root", "");
+        let child = tree.add_child(tree.root(), "Section 1", "Content");
+        tree.set_token_count(child, 42);
+
+        let checkpoint = PipelineCheckpoint {
+            doc_id: "tree-test".to_string(),
+            source_hash: "hash".to_string(),
+            processing_version: 1,
+            config_fingerprint: "fp".to_string(),
+            completed_stages: vec!["build".to_string()],
+            context_data: CheckpointContextData {
+                raw_nodes: Vec::new(),
+                tree: Some(tree),
+                metrics: IndexMetrics::default(),
+                page_count: None,
+                line_count: None,
+                description: None,
+            },
+            timestamp: Utc::now(),
+        };
+
+        manager.save("tree-test", &checkpoint).unwrap();
+        let loaded = manager.load("tree-test").unwrap();
+
+        let tree = loaded.context_data.tree.unwrap();
+        assert_eq!(tree.node_count(), 2); // root + 1 child
+        let child_id = tree.children(tree.root())[0];
+        assert_eq!(tree.get(child_id).unwrap().title, "Section 1");
+        assert_eq!(tree.get(child_id).unwrap().token_count, Some(42));
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/pipeline/context.rs b/vectorless-core/vectorless-index/src/pipeline/context.rs
new file mode 100644
index 00000000..36360d81
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/pipeline/context.rs
@@ -0,0 +1,465 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Index context for passing data between stages.
+
+use std::collections::HashMap;
+use std::path::PathBuf;
+
+use vectorless_document::{Concept, DocumentTree, NavigationIndex, NodeId, ReasoningIndex};
+use crate::index::parse::{DocumentFormat, RawNode};
+use vectorless_llm::LlmClient;
+
+use super::super::{PipelineOptions, SummaryStrategy};
+use super::metrics::IndexMetrics;
+
+/// Input for the index pipeline.
+#[derive(Debug, Clone)]
+pub enum IndexInput {
+    /// Index from a file path.
+    File(PathBuf),
+
+    /// Index from a raw content string.
+    Content {
+        /// Content string.
+        content: String,
+        /// Document name.
+        name: String,
+        /// Document format.
+        format: DocumentFormat,
+    },
+
+    /// Index from binary data.
+    Bytes {
+        /// Binary data.
+        data: Vec<u8>,
+        /// Document name.
+        name: String,
+        /// Document format.
+        format: DocumentFormat,
+    },
+}
+
+impl IndexInput {
+    /// Create input from a file path.
+    pub fn file(path: impl Into<PathBuf>) -> Self {
+        Self::File(path.into())
+    }
+
+    /// Create input from a content string.
+    pub fn content(content: impl Into<String>) -> Self {
+        Self::Content {
+            content: content.into(),
+            name: String::new(),
+            format: DocumentFormat::Markdown,
+        }
+    }
+
+    /// Create input from content with a name and format.
+    pub fn content_with(
+        content: impl Into<String>,
+        name: impl Into<String>,
+        format: DocumentFormat,
+    ) -> Self {
+        Self::Content {
+            content: content.into(),
+            name: name.into(),
+            format,
+        }
+    }
+
+    /// Create input from binary data.
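+    ///
+    /// The format defaults to [`DocumentFormat::Pdf`] with an empty name; use
+    /// [`IndexInput::bytes_with`] to set both explicitly.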
+    pub fn bytes(data: impl Into<Vec<u8>>) -> Self {
+        Self::Bytes {
+            data: data.into(),
+            name: String::new(),
+            format: DocumentFormat::Pdf,
+        }
+    }
+
+    /// Create input from binary data with a name and format.
+    pub fn bytes_with(
+        data: impl Into<Vec<u8>>,
+        name: impl Into<String>,
+        format: DocumentFormat,
+    ) -> Self {
+        Self::Bytes {
+            data: data.into(),
+            name: name.into(),
+            format,
+        }
+    }
+
+    /// Check if this is a file input.
+    pub fn is_file(&self) -> bool {
+        matches!(self, Self::File(_))
+    }
+
+    /// Check if this is a content input.
+    pub fn is_content(&self) -> bool {
+        matches!(self, Self::Content { .. })
+    }
+
+    /// Check if this is a bytes input.
+    pub fn is_bytes(&self) -> bool {
+        matches!(self, Self::Bytes { .. })
+    }
+
+    /// Get the format if available.
+    pub fn format(&self) -> Option<DocumentFormat> {
+        match self {
+            Self::File(_) => None,
+            Self::Content { format, .. } => Some(*format),
+            Self::Bytes { format, .. } => Some(*format),
+        }
+    }
+}
+
+/// Result from a single stage execution.
+#[derive(Debug, Clone)]
+pub struct StageResult {
+    /// Whether the stage succeeded.
+    pub success: bool,
+
+    /// Duration in milliseconds.
+    pub duration_ms: u64,
+
+    /// Additional metadata.
+    pub metadata: HashMap<String, serde_json::Value>,
+}
+
+impl StageResult {
+    /// Create a successful result.
+    pub fn success(name: &str) -> Self {
+        tracing::debug!("Stage '{}' completed successfully", name);
+
+        Self {
+            success: true,
+            duration_ms: 0,
+            metadata: HashMap::new(),
+        }
+    }
+
+    /// Create a failed result.
+    pub fn failure(name: &str, error: &str) -> Self {
+        tracing::warn!("Stage '{}' failed: {}", name, error);
+
+        let mut metadata = HashMap::new();
+        metadata.insert(
+            "error".to_string(),
+            serde_json::Value::String(error.to_string()),
+        );
+        Self {
+            success: false,
+            duration_ms: 0,
+            metadata,
+        }
+    }
+
+    /// Set the duration.
+    pub fn with_duration(mut self, ms: u64) -> Self {
+        self.duration_ms = ms;
+        self
+    }
+
+    /// Add metadata.
+    pub fn with_metadata(mut self, key: &str, value: serde_json::Value) -> Self {
+        self.metadata.insert(key.to_string(), value);
+        self
+    }
+}
+
+/// Summary cache for lazy generation.
+#[derive(Debug, Clone, Default)]
+pub struct SummaryCache {
+    /// Cached summaries: node_id -> summary.
+    summaries: HashMap<NodeId, String>,
+
+    /// Whether to persist to disk.
+    persist: bool,
+}
+
+impl SummaryCache {
+    /// Create a new cache.
+    pub fn new(persist: bool) -> Self {
+        Self {
+            summaries: HashMap::new(),
+            persist,
+        }
+    }
+
+    /// Get a cached summary.
+    pub fn get(&self, node_id: NodeId) -> Option<&str> {
+        self.summaries.get(&node_id).map(|s| s.as_str())
+    }
+
+    /// Store a summary.
+    pub fn put(&mut self, node_id: NodeId, summary: String) {
+        self.summaries.insert(node_id, summary);
+    }
+
+    /// Whether persistence is enabled.
+    pub fn should_persist(&self) -> bool {
+        self.persist
+    }
+
+    /// Get all cached summaries.
+    pub fn all(&self) -> &HashMap<NodeId, String> {
+        &self.summaries
+    }
+}
+
+/// Index context passed between stages.
+#[derive(Debug)]
+pub struct IndexContext {
+    /// Document ID.
+    pub doc_id: String,
+
+    /// Source input.
+    pub input: IndexInput,
+
+    /// Document format.
+    pub format: DocumentFormat,
+
+    /// Document name.
+    pub name: String,
+
+    /// Source file path (if from a file).
+    pub source_path: Option<String>,
+
+    /// SHA-256 hash of source content for checkpoint validation.
+    pub source_hash: String,
+
+    /// Parsed raw nodes.
+    pub raw_nodes: Vec<RawNode>,
+
+    /// Built document tree.
+    pub tree: Option<DocumentTree>,
+
+    /// Index options.
+    pub options: PipelineOptions,
+
+    /// LLM client for enhancement.
+    pub llm_client: Option<LlmClient>,
+
+    /// Summary cache for lazy generation.
+    pub summary_cache: SummaryCache,
+
+    /// Pre-computed reasoning index (built by ReasoningIndexStage).
+    pub reasoning_index: Option<ReasoningIndex>,
+
+    /// Navigation index for Agent-based retrieval (built by NavigationIndexStage).
+    pub navigation_index: Option<NavigationIndex>,
+
+    /// Key concepts extracted from the document (built by ConceptExtractionStage).
+    pub concepts: Vec<Concept>,
+
+    /// Existing tree from previous indexing (for incremental updates).
+    /// When set, the enhance and reasoning stages can reuse data from unchanged nodes.
+    pub existing_tree: Option<DocumentTree>,
+
+    /// Stage execution results.
+    pub stage_results: HashMap<String, StageResult>,
+
+    /// Performance metrics.
+    pub metrics: IndexMetrics,
+
+    /// Document description.
+    pub description: Option<String>,
+
+    /// Page count (for PDFs).
+    pub page_count: Option<usize>,
+
+    /// Line count.
+    pub line_count: Option<usize>,
+}
+
+impl IndexContext {
+    /// Create a new context from input.
+    pub fn new(input: IndexInput, options: PipelineOptions) -> Self {
+        let source_hash = Self::compute_source_hash(&input);
+        Self {
+            doc_id: uuid::Uuid::new_v4().to_string(),
+            input,
+            format: DocumentFormat::Markdown,
+            name: String::new(),
+            source_path: None,
+            source_hash,
+            raw_nodes: Vec::new(),
+            tree: None,
+            options,
+            llm_client: None,
+            summary_cache: SummaryCache::default(),
+            reasoning_index: None,
+            navigation_index: None,
+            concepts: Vec::new(),
+            existing_tree: None,
+            stage_results: HashMap::new(),
+            metrics: IndexMetrics::default(),
+            description: None,
+            page_count: None,
+            line_count: None,
+        }
+    }
+
+    /// Compute the SHA-256 hash of the source content.
+    fn compute_source_hash(input: &IndexInput) -> String {
+        use sha2::{Digest, Sha256};
+        let hash = match input {
+            IndexInput::File(path) => {
+                // Hash the file path as a proxy — the actual content may not be
+                // readable yet (the parse stage reads it). This is sufficient for
+                // checkpoint invalidation since a different file path implies
+                // different content.
+                Sha256::digest(path.to_string_lossy().as_bytes())
+            }
+            IndexInput::Content { content, .. } => Sha256::digest(content.as_bytes()),
+            IndexInput::Bytes { data, .. } => Sha256::digest(data),
+        };
+        format!("{:x}", hash)
+    }
+
+    /// Set the document ID.
+    pub fn with_doc_id(mut self, doc_id: impl Into<String>) -> Self {
+        self.doc_id = doc_id.into();
+        self
+    }
+
+    /// Set the LLM client.
+    pub fn with_llm_client(mut self, client: LlmClient) -> Self {
+        self.llm_client = Some(client);
+        self
+    }
+
+    /// Set the document format.
+    pub fn with_format(mut self, format: DocumentFormat) -> Self {
+        self.format = format;
+        self
+    }
+
+    /// Set the document name.
+    pub fn with_name(mut self, name: impl Into<String>) -> Self {
+        self.name = name.into();
+        self
+    }
+
+    /// Set the source path.
+    pub fn with_source_path(mut self, path: impl Into<String>) -> Self {
+        self.source_path = Some(path.into());
+        self
+    }
+
+    /// Set the existing tree for incremental updates.
+    pub fn with_existing_tree(mut self, tree: DocumentTree) -> Self {
+        self.existing_tree = Some(tree);
+        self
+    }
+
+    /// Initialize the summary cache based on the configured strategy.
+    pub fn init_summary_cache(&mut self) {
+        if let SummaryStrategy::Lazy { persist, .. } = self.options.summary_strategy {
+            self.summary_cache = SummaryCache::new(persist);
+        }
+    }
+
+    /// Record a stage result.
+    pub fn record_stage(&mut self, name: &str, result: StageResult) {
+        self.stage_results.insert(name.to_string(), result);
+    }
+
+    /// Get the tree, returning an error if not built.
+    pub fn tree(&self) -> Result<&DocumentTree, &'static str> {
+        self.tree.as_ref().ok_or("Tree not built")
+    }
+
+    /// Get the mutable tree, returning an error if not built.
+    pub fn tree_mut(&mut self) -> Result<&mut DocumentTree, &'static str> {
+        self.tree.as_mut().ok_or("Tree not built")
+    }
+
+    /// Finalize and build the result.
+    pub fn finalize(self) -> PipelineResult {
+        PipelineResult {
+            doc_id: self.doc_id,
+            name: self.name,
+            format: self.format,
+            source_path: self.source_path,
+            tree: self.tree,
+            description: self.description,
+            page_count: self.page_count,
+            line_count: self.line_count,
+            metrics: self.metrics,
+            summary_cache: self.summary_cache,
+            reasoning_index: self.reasoning_index,
+            navigation_index: self.navigation_index,
+            concepts: self.concepts,
+        }
+    }
+}
+
+/// Final result from the index pipeline.
+#[derive(Debug)]
+pub struct PipelineResult {
+    /// Document ID.
+    pub doc_id: String,
+
+    /// Document name.
+    pub name: String,
+
+    /// Document format.
+    pub format: DocumentFormat,
+
+    /// Source file path.
+    pub source_path: Option<String>,
+
+    /// Built document tree.
+    pub tree: Option<DocumentTree>,
+
+    /// Document description.
+    pub description: Option<String>,
+
+    /// Page count (for PDFs).
+    pub page_count: Option<usize>,
+
+    /// Line count.
+    pub line_count: Option<usize>,
+
+    /// Performance metrics.
+    pub metrics: IndexMetrics,
+
+    /// Summary cache.
+    pub summary_cache: SummaryCache,
+
+    /// Pre-computed reasoning index for retrieval acceleration.
+    pub reasoning_index: Option<ReasoningIndex>,
+
+    /// Navigation index for Agent-based retrieval.
+    pub navigation_index: Option<NavigationIndex>,
+
+    /// Key concepts extracted from the document.
+    pub concepts: Vec<Concept>,
+}
+
+impl PipelineResult {
+    /// Check if the result has a tree.
+    pub fn has_tree(&self) -> bool {
+        self.tree.is_some()
+    }
+
+    /// Get the tree.
+    pub fn tree(&self) -> Option<&DocumentTree> {
+        self.tree.as_ref()
+    }
+
+    /// Get the total indexing time in milliseconds.
+    pub fn total_time_ms(&self) -> u64 {
+        self.metrics.parse_time_ms
+            + self.metrics.build_time_ms
+            + self.metrics.validate_time_ms
+            + self.metrics.split_time_ms
+            + self.metrics.enhance_time_ms
+            + self.metrics.enrich_time_ms
+            + self.metrics.reasoning_index_time_ms
+            + self.metrics.navigation_index_time_ms
+            + self.metrics.optimize_time_ms
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/pipeline/executor.rs b/vectorless-core/vectorless-index/src/pipeline/executor.rs
new file mode 100644
index 00000000..16956888
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/pipeline/executor.rs
@@ -0,0 +1,198 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Pipeline executor for running index stages.
+//!
+//! The executor uses [`PipelineOrchestrator`] internally for flexible
+//! stage management with priority-based ordering and dependency resolution.
+
+use tracing::info;
+
+use vectorless_error::Result;
+use vectorless_llm::LlmClient;
+
+use super::super::PipelineOptions;
+use super::super::stages::{
+    BuildStage, ConceptExtractionStage, EnhanceStage, EnrichStage, IndexStage,
+    NavigationIndexStage, OptimizeStage, ParseStage, ReasoningIndexStage, SplitStage,
+    ValidateStage, VerifyStage,
+};
+use super::context::{IndexInput, PipelineResult};
+use super::orchestrator::PipelineOrchestrator;
+
+/// Pipeline executor for document indexing.
+///
+/// Uses [`PipelineOrchestrator`] internally for stage management.
+/// Supports both preset configurations and custom stage pipelines.
+/// +/// # Example +/// +/// ```rust,ignore +/// // Default pipeline +/// let executor = PipelineExecutor::new(); +/// let result = executor.execute(input, options).await?; +/// +/// // With LLM enhancement +/// let executor = PipelineExecutor::with_llm(client); +/// +/// // Custom pipeline using orchestrator +/// let orchestrator = PipelineOrchestrator::new() +/// .stage(ParseStage::new()) +/// .stage_with_priority(MyCustomStage::new(), 50) +/// .stage(BuildStage::new()); +/// let executor = PipelineExecutor::from_orchestrator(orchestrator); +/// ``` +pub struct PipelineExecutor { + orchestrator: PipelineOrchestrator, +} + +impl PipelineExecutor { + /// Create a new pipeline executor with default stages. + /// + /// Default stages (in order): + /// 1. `parse` - Parse document into raw nodes + /// 2. `build` - Build tree structure + /// 3. `validate` - Verify tree integrity (optional) + /// 4. `split` - Split oversized leaf nodes (optional) + /// 5. `enrich` - Add metadata and cross-references + /// 6. `reasoning_index` - Build pre-computed reasoning index + /// 7. `concept_extraction` - Extract key concepts (optional) + /// 8. `navigation_index` - Build Agent navigation index + /// 9. `verify` - Validate ingest output reliability + /// 10. `optimize` - Optimize tree structure + pub fn new() -> Self { + let orchestrator = PipelineOrchestrator::new() + .stage_with_priority(ParseStage::new(), 10) + .stage_with_priority(BuildStage::new(), 20) + .stage_with_priority(ValidateStage::new(), 22) + .stage_with_priority(SplitStage::new(), 25) + .stage_with_priority(EnrichStage::new(), 40) + .stage_with_priority(ReasoningIndexStage::new(), 45) + .stage_with_priority(ConceptExtractionStage::new(), 47) + .stage_with_priority(NavigationIndexStage::new(), 50) + .stage_with_priority(VerifyStage, 55) + .stage_with_priority(OptimizeStage::new(), 60); + + Self { orchestrator } + } + + /// Create a pipeline with LLM enhancement. + /// + /// Stages (in order): + /// 1. `parse` - Parse document + /// 2. `build` - Build tree + /// 3. `validate` - Verify tree integrity (optional) + /// 4. `split` - Split oversized leaf nodes (optional) + /// 5. `enhance` - LLM-based enhancement (summaries) + /// 6. `enrich` - Add metadata + /// 7. `reasoning_index` - Build pre-computed reasoning index + /// 8. `concept_extraction` - Extract key concepts via LLM (optional) + /// 9. `navigation_index` - Build Agent navigation index + /// 10. `verify` - Validate ingest output reliability + /// 11. `optimize` - Optimize tree + pub fn with_llm(client: LlmClient) -> Self { + tracing::info!( + "PipelineExecutor::with_llm — cloning client to ParseStage + EnhanceStage + context" + ); + let orchestrator = PipelineOrchestrator::new() + .with_llm_client(client.clone()) + .stage_with_priority(ParseStage::with_llm_client(client.clone()), 10) + .stage_with_priority(BuildStage::new(), 20) + .stage_with_priority(ValidateStage::new(), 22) + .stage_with_priority(SplitStage::new(), 25) + .stage_with_priority(EnhanceStage::with_llm_client(client.clone()), 30) + .stage_with_priority(EnrichStage::new(), 40) + .stage_with_priority(ReasoningIndexStage::new(), 45) + .stage_with_priority(ConceptExtractionStage::with_llm_client(client), 47) + .stage_with_priority(NavigationIndexStage::new(), 50) + .stage_with_priority(VerifyStage, 55) + .stage_with_priority(OptimizeStage::new(), 60); + + Self { orchestrator } + } + + /// Create from a custom orchestrator. + /// + /// Use this for full control over stage ordering and dependencies. 
+    ///
+    /// # Example
+    ///
+    /// ```rust,ignore
+    /// let orchestrator = PipelineOrchestrator::new()
+    ///     .stage_with_priority(ParseStage::new(), 10)
+    ///     .stage_with_priority(MyAnalysisStage::new(), 25)
+    ///     .stage_with_priority(BuildStage::new(), 20)
+    ///     .stage_with_deps(MyValidationStage::new(), 50, &["build"]);
+    ///
+    /// let executor = PipelineExecutor::from_orchestrator(orchestrator);
+    /// ```
+    pub fn from_orchestrator(orchestrator: PipelineOrchestrator) -> Self {
+        Self { orchestrator }
+    }
+
+    /// Add a stage with default priority.
+    ///
+    /// The stage will be added after existing stages with the same priority.
+    pub fn add_stage(mut self, stage: impl IndexStage + 'static) -> Self {
+        self.orchestrator = self.orchestrator.stage(stage);
+        self
+    }
+
+    /// Add a stage with custom priority.
+    ///
+    /// Lower priority = earlier execution.
+    pub fn add_stage_with_priority(
+        mut self,
+        stage: impl IndexStage + 'static,
+        priority: i32,
+    ) -> Self {
+        self.orchestrator = self.orchestrator.stage_with_priority(stage, priority);
+        self
+    }
+
+    /// Add a stage with priority and dependencies.
+    ///
+    /// The stage will run after all specified dependencies.
+    pub fn add_stage_with_deps(
+        mut self,
+        stage: impl IndexStage + 'static,
+        priority: i32,
+        depends_on: &[&str],
+    ) -> Self {
+        self.orchestrator = self
+            .orchestrator
+            .stage_with_deps(stage, priority, depends_on);
+        self
+    }
+
+    /// Get the list of stage names in execution order.
+    pub fn stage_names(&self) -> Result<Vec<&str>> {
+        self.orchestrator.stage_names()
+    }
+
+    /// Get the number of stages.
+    pub fn stage_count(&self) -> usize {
+        self.orchestrator.stage_count()
+    }
+
+    /// Execute the pipeline.
+    ///
+    /// Stages are executed in dependency-resolved order.
+    pub async fn execute(
+        &mut self,
+        input: IndexInput,
+        options: PipelineOptions,
+    ) -> Result<PipelineResult> {
+        info!(
+            "Starting index pipeline with {} stages",
+            self.orchestrator.stage_count()
+        );
+        self.orchestrator.execute(input, options).await
+    }
+}
+
+impl Default for PipelineExecutor {
+    fn default() -> Self {
+        Self::new()
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/pipeline/metrics.rs b/vectorless-core/vectorless-index/src/pipeline/metrics.rs
new file mode 100644
index 00000000..9c08d69a
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/pipeline/metrics.rs
@@ -0,0 +1,6 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Re-export IndexMetrics from the metrics module.
+
+pub use vectorless_metrics::IndexMetrics;
diff --git a/vectorless-core/vectorless-index/src/pipeline/mod.rs b/vectorless-core/vectorless-index/src/pipeline/mod.rs
new file mode 100644
index 00000000..e6e3752d
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/pipeline/mod.rs
@@ -0,0 +1,24 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Pipeline execution module.
+//!
+//! This module provides the core pipeline infrastructure:
+//! - [`IndexContext`] - Context passed between stages
+//! - [`PipelineExecutor`] - Executes the indexing pipeline
+//! - [`PipelineOrchestrator`] - Flexible stage orchestration with dependencies
+//! - [`IndexMetrics`] - Performance metrics collection
+//! - [`FailurePolicy`] - Configurable failure handling for stages
+//! - [`StageRetryConfig`] - Retry configuration for stages
+
+mod checkpoint;
+mod context;
+mod executor;
+mod metrics;
+mod orchestrator;
+mod policy;
+
+pub use context::{IndexContext, IndexInput, PipelineResult, StageResult};
+pub use executor::PipelineExecutor;
+pub use metrics::IndexMetrics;
+pub use orchestrator::PipelineOrchestrator;
+pub use policy::{FailurePolicy, StageRetryConfig};
diff --git a/vectorless-core/vectorless-index/src/pipeline/orchestrator.rs b/vectorless-core/vectorless-index/src/pipeline/orchestrator.rs
new file mode 100644
index 00000000..3d830748
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/pipeline/orchestrator.rs
@@ -0,0 +1,1028 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Pipeline orchestrator for managing and executing index stages.
+//!
+//! The orchestrator provides:
+//! - Stage registration with priority
+//! - Dependency-based ordering via topological sort
+//! - Failure policies (Fail, Skip, Retry)
+//! - Execution groups for parallel execution
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use vectorless::index::pipeline::PipelineOrchestrator;
+//! use vectorless::index::stages::{ParseStage, BuildStage};
+//!
+//! let orchestrator = PipelineOrchestrator::new()
+//!     .stage(ParseStage::new())
+//!     .stage(BuildStage::new())
+//!     .stage(MyCustomStage::new());
+//!
+//! let result = orchestrator.execute(input, options).await?;
+//! ```
+
+use std::collections::HashMap;
+use std::time::Instant;
+
+use tracing::{debug, error, info, warn};
+
+use vectorless_error::Result;
+
+use super::super::PipelineOptions;
+use super::super::stages::IndexStage;
+use super::checkpoint::{CheckpointContextData, CheckpointManager, PipelineCheckpoint};
+use super::context::{IndexContext, IndexInput, PipelineResult, StageResult};
+use super::policy::FailurePolicy;
+
+/// Stage entry with metadata for orchestration.
+struct StageEntry {
+    /// The stage implementation.
+    stage: Box<dyn IndexStage>,
+    /// Priority (lower = earlier execution).
+    priority: i32,
+    /// Names of stages this depends on.
+    depends_on: Vec<String>,
+}
+
+impl std::fmt::Debug for StageEntry {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("StageEntry")
+            .field("name", &self.stage.name())
+            .field("priority", &self.priority)
+            .field("depends_on", &self.depends_on)
+            .finish()
+    }
+}
+
+/// Group of stages at the same dependency level (can run in parallel).
+#[derive(Debug, Clone)]
+pub struct ExecutionGroup {
+    /// Indices of stages in this group.
+    pub stage_indices: Vec<usize>,
+    /// Whether this group has multiple stages (parallelizable).
+    pub parallel: bool,
+}
+
+/// Pipeline orchestrator for stage management and execution.
+///
+/// Provides flexible stage registration with:
+/// - Priority-based ordering
+/// - Dependency resolution
+/// - Failure policies (Fail, Skip, Retry)
+/// - Execution groups for parallel execution
+///
+/// # Stage Ordering
+///
+/// Stages are ordered by:
+/// 1. Dependencies (must run after dependencies)
+/// 2. Priority (lower = earlier)
+/// 3. Registration order (tie-breaker)
+///
+/// # Example
+///
+/// ```rust,ignore
+/// // Default pipeline
+/// let orchestrator = PipelineOrchestrator::default();
+///
+/// // Custom pipeline
+/// let orchestrator = PipelineOrchestrator::new()
+///     .stage(ParseStage::new())
+///     .stage_with_priority(MyAnalysisStage::new(), 50) // Run after build (priority 20)
+///     .stage_with_priority(BuildStage::new(), 20);
+/// ```
+pub struct PipelineOrchestrator {
+    /// Registered stages with metadata.
+    stages: Vec<StageEntry>,
+    /// Shared LLM client injected into pipeline context.
+    llm_client: Option<vectorless_llm::LlmClient>,
+}
+
+impl Default for PipelineOrchestrator {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl PipelineOrchestrator {
+    /// Create a new empty orchestrator.
+    pub fn new() -> Self {
+        Self {
+            stages: Vec::new(),
+            llm_client: None,
+        }
+    }
+
+    /// Set the shared LLM client (injected into pipeline context).
+    pub fn with_llm_client(mut self, client: vectorless_llm::LlmClient) -> Self {
+        self.llm_client = Some(client);
+        self
+    }
+
+    /// Add a stage with default priority (100).
+    ///
+    /// Dependencies are automatically read from the stage's `depends_on()` method.
+    pub fn stage<S>(mut self, stage: S) -> Self
+    where
+        S: IndexStage + 'static,
+    {
+        let deps = stage.depends_on();
+        self.stages.push(StageEntry {
+            stage: Box::new(stage),
+            priority: 100,
+            depends_on: deps.into_iter().map(|s| s.to_string()).collect(),
+        });
+        self
+    }
+
+    /// Add a stage with custom priority.
+    ///
+    /// Dependencies are automatically read from the stage's `depends_on()` method.
+    /// Lower priority = earlier execution.
+    /// Default priority is 100.
+    pub fn stage_with_priority<S>(mut self, stage: S, priority: i32) -> Self
+    where
+        S: IndexStage + 'static,
+    {
+        let deps = stage.depends_on();
+        self.stages.push(StageEntry {
+            stage: Box::new(stage),
+            priority,
+            depends_on: deps.into_iter().map(|s| s.to_string()).collect(),
+        });
+        self
+    }
+
+    /// Add a stage with priority and explicit dependencies.
+    ///
+    /// Merges trait-level dependencies with explicitly provided ones.
+    /// The stage will run after all specified dependencies.
+    pub fn stage_with_deps<S>(
+        mut self,
+        stage: S,
+        priority: i32,
+        explicit_depends_on: &[&str],
+    ) -> Self
+    where
+        S: IndexStage + 'static,
+    {
+        let trait_deps = stage.depends_on();
+        let mut all_deps: Vec<String> = trait_deps.into_iter().map(|s| s.to_string()).collect();
+
+        // Add explicit deps that aren't already included
+        for dep in explicit_depends_on {
+            if !all_deps.iter().any(|d| d == dep) {
+                all_deps.push(dep.to_string());
+            }
+        }
+
+        self.stages.push(StageEntry {
+            stage: Box::new(stage),
+            priority,
+            depends_on: all_deps,
+        });
+        self
+    }
+
+    /// Remove all stages with the given name.
+    pub fn remove_stage(mut self, name: &str) -> Self {
+        self.stages.retain(|entry| entry.stage.name() != name);
+        self
+    }
+
+    /// Check if a stage with the given name exists.
+    pub fn has_stage(&self, name: &str) -> bool {
+        self.stages.iter().any(|entry| entry.stage.name() == name)
+    }
+
+    /// Get the number of registered stages.
+    pub fn stage_count(&self) -> usize {
+        self.stages.len()
+    }
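To make the registration semantics concrete, a small sketch (hedged: the stage constructors stand in for real stages elsewhere in this patch; only the builder calls are the API defined above):

```rust,ignore
let orch = PipelineOrchestrator::new()
    .stage(ParseStage::new())                              // priority 100, deps from depends_on()
    .stage_with_priority(BuildStage::new(), 20)            // runs before default-priority stages
    .stage_with_deps(EnrichStage::new(), 100, &["build"]); // explicit dep merged (and deduped)
                                                           // with the trait-level "build" dep

assert!(orch.has_stage("build"));
assert_eq!(orch.stage_count(), 3);
```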
+    /// Resolve dependencies and return stage indices in execution order.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if:
+    /// - A dependency refers to a non-existent stage
+    /// - There's a circular dependency
+    fn resolve_order(&self) -> Result<Vec<usize>> {
+        // Build name -> index map
+        let name_to_idx: HashMap<&str, usize> = self
+            .stages
+            .iter()
+            .enumerate()
+            .map(|(i, entry)| (entry.stage.name(), i))
+            .collect();
+
+        // Validate dependencies
+        for entry in &self.stages {
+            for dep in &entry.depends_on {
+                if !name_to_idx.contains_key(dep.as_str()) {
+                    return Err(vectorless_error::Error::Config(format!(
+                        "Stage '{}' depends on non-existent stage '{}'",
+                        entry.stage.name(),
+                        dep
+                    )));
+                }
+            }
+        }
+
+        // Topological sort with priority consideration (Kahn's algorithm)
+        let n = self.stages.len();
+        let mut in_degree: Vec<usize> = vec![0; n];
+        let mut adjacency: HashMap<usize, Vec<usize>> = HashMap::new();
+
+        for (i, entry) in self.stages.iter().enumerate() {
+            for dep in &entry.depends_on {
+                if let Some(&dep_idx) = name_to_idx.get(dep.as_str()) {
+                    adjacency.entry(dep_idx).or_default().push(i);
+                    in_degree[i] += 1;
+                }
+            }
+        }
+
+        // Collect stages with no dependencies, sorted by priority
+        let mut ready: Vec<usize> = (0..n).filter(|&i| in_degree[i] == 0).collect();
+        ready.sort_by_key(|&i| (self.stages[i].priority, i));
+
+        let mut result: Vec<usize> = Vec::new();
+
+        while let Some(idx) = ready.first().cloned() {
+            ready.remove(0);
+            result.push(idx);
+
+            if let Some(neighbors) = adjacency.get(&idx) {
+                for &neighbor in neighbors {
+                    in_degree[neighbor] -= 1;
+                    if in_degree[neighbor] == 0 {
+                        // Insert in priority order
+                        let entry = &self.stages[neighbor];
+                        let pos = ready
+                            .binary_search_by_key(&(entry.priority, neighbor), |&i| {
+                                (self.stages[i].priority, i)
+                            })
+                            .unwrap_or_else(|e| e);
+                        ready.insert(pos, neighbor);
+                    }
+                }
+            }
+        }
+
+        // Check for cycles
+        if result.len() != n {
+            let remaining: Vec<&str> = (0..n)
+                .filter(|i| !result.contains(i))
+                .map(|i| self.stages[i].stage.name())
+                .collect();
+            return Err(vectorless_error::Error::Config(format!(
+                "Circular dependency detected involving stages: {:?}",
+                remaining
+            )));
+        }
+
+        Ok(result)
+    }
+
+    /// Compute execution groups from resolved order.
+    ///
+    /// Stages with the same "level" in the dependency graph and no
+    /// inter-dependencies can run in parallel.
+    fn compute_execution_groups(&self, order: &[usize]) -> Vec<ExecutionGroup> {
+        if order.is_empty() {
+            return Vec::new();
+        }
+
+        // Build name -> index map
+        let name_to_idx: HashMap<&str, usize> = self
+            .stages
+            .iter()
+            .enumerate()
+            .map(|(i, entry)| (entry.stage.name(), i))
+            .collect();
+
+        // Calculate level for each stage based on dependencies
+        let mut levels: HashMap<usize, usize> = HashMap::new();
+
+        for &idx in order {
+            let entry = &self.stages[idx];
+            let level = if entry.depends_on.is_empty() {
+                0
+            } else {
+                entry
+                    .depends_on
+                    .iter()
+                    .filter_map(|dep| {
+                        name_to_idx
+                            .get(dep.as_str())
+                            .and_then(|&dep_idx| levels.get(&dep_idx))
+                    })
+                    .max()
+                    .map(|&l| l + 1)
+                    .unwrap_or(0)
+            };
+            levels.insert(idx, level);
+        }
+
+        // Group stages by level
+        let mut level_groups: HashMap<usize, Vec<usize>> = HashMap::new();
+        for &idx in order {
+            let level = levels[&idx];
+            level_groups.entry(level).or_default().push(idx);
+        }
+
+        // Convert to execution groups
+        let max_level = *levels.values().max().unwrap_or(&0);
+        (0..=max_level)
+            .filter_map(|level| {
+                level_groups.get(&level).map(|indices| ExecutionGroup {
+                    stage_indices: indices.clone(),
+                    parallel: indices.len() > 1,
+                })
+            })
+            .collect()
+    }
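A worked trace of the two functions above, using the same no-op `MockStage` that this file's tests define at the bottom (a sketch; only the orchestrator calls are real API):

```rust,ignore
// a (prio 10, no deps), b (prio 5, no deps), c (prio 1, depends on "b")
let orch = PipelineOrchestrator::new()
    .stage_with_priority(MockStage::new("a"), 10)
    .stage_with_priority(MockStage::new("b"), 5)
    .stage_with_deps(MockStage::new("c"), 1, &["b"]);

// Kahn's algorithm with a priority-ordered ready queue: b and a start
// ready (b first, priority 5 < 10); popping b unlocks c, whose priority 1
// inserts it ahead of a. Priority can pull a dependent stage forward once
// its dependencies are satisfied.
assert_eq!(orch.stage_names()?, vec!["b", "c", "a"]);

// Levels: a and b have no deps (level 0, parallelizable); c is level 1.
let groups = orch.get_execution_groups()?;
assert_eq!(groups[0].stage_indices.len(), 2);
assert!(groups[0].parallel);
```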
+    /// Execute a stage with its failure policy applied.
+    async fn execute_stage_with_policy(
+        stage: &mut Box<dyn IndexStage>,
+        ctx: &mut IndexContext,
+    ) -> Result<StageResult> {
+        let policy = stage.failure_policy();
+        let stage_name = stage.name().to_string();
+
+        match policy {
+            FailurePolicy::Fail => {
+                // Direct execution, errors propagate
+                stage.execute(ctx).await
+            }
+
+            FailurePolicy::Skip => {
+                // Try once, skip on failure
+                match stage.execute(ctx).await {
+                    Ok(result) => Ok(result),
+                    Err(e) => {
+                        warn!("Stage {} failed, skipping: {}", stage_name, e);
+                        Ok(StageResult::failure(&stage_name, &e.to_string()))
+                    }
+                }
+            }
+
+            FailurePolicy::Retry(config) => {
+                let mut attempts = 0;
+                loop {
+                    attempts += 1;
+                    match stage.execute(ctx).await {
+                        Ok(result) => {
+                            if attempts > 1 {
+                                info!("Stage {} succeeded on attempt {}", stage_name, attempts);
+                            }
+                            return Ok(result);
+                        }
+                        Err(e) => {
+                            if attempts >= config.max_attempts {
+                                warn!(
+                                    "Stage {} failed after {} attempts: {}",
+                                    stage_name, attempts, e
+                                );
+                                return Err(e);
+                            }
+                            let delay = config.delay_for_attempt(attempts - 1);
+                            warn!(
+                                "Stage {} failed on attempt {}, retrying in {:?}: {}",
+                                stage_name, attempts, delay, e
+                            );
+                            tokio::time::sleep(delay).await;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    /// Handle the result of a stage execution (shared between sequential and parallel paths).
+    fn handle_stage_result(
+        result: Result<StageResult>,
+        stage_name: &str,
+        policy: &FailurePolicy,
+        ctx: &mut IndexContext,
+    ) -> Result<()> {
+        match result {
+            Ok(result) => {
+                ctx.stage_results.insert(stage_name.to_string(), result);
+                Ok(())
+            }
+            Err(e) => {
+                if policy.allows_continuation() {
+                    warn!(
+                        "Stage {} failed but policy allows continuation: {}",
+                        stage_name, e
+                    );
+                    ctx.stage_results.insert(
+                        stage_name.to_string(),
+                        StageResult::failure(stage_name, &e.to_string()),
+                    );
+                    Ok(())
+                } else {
+                    error!("Stage {} failed, stopping pipeline: {}", stage_name, e);
+                    Err(e)
+                }
+            }
+        }
+    }
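The retry arm sleeps according to `StageRetryConfig::delay_for_attempt`, defined later in policy.rs. Re-deriving that arithmetic as a tiny standalone program (the constants mirror the config defaults shown in this patch; this is an illustration, not the library code):

```rust
use std::time::Duration;

// Mirrors delay_for_attempt: initial_delay * multiplier^attempt, capped at max_delay.
fn delay_for_attempt(initial_ms: f64, multiplier: f64, max_ms: f64, attempt: i32) -> Duration {
    let delay = (initial_ms * multiplier.powi(attempt)).min(max_ms);
    Duration::from_millis(delay as u64)
}

fn main() {
    // With the defaults (100ms initial, x2.0 multiplier, 10s cap), a
    // three-attempt Retry policy sleeps ~100ms and then ~200ms between
    // attempts; the formula values grow 100ms, 200ms, 400ms, ...
    for attempt in 0..3 {
        println!("{:?}", delay_for_attempt(100.0, 2.0, 10_000.0, attempt));
    }
}
```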
+    /// Execute the pipeline.
+    ///
+    /// Stages are executed in dependency-resolved order.
+    /// Failure policies are applied per-stage.
+    pub async fn execute(
+        &mut self,
+        input: IndexInput,
+        options: PipelineOptions,
+    ) -> Result<PipelineResult> {
+        let total_start = Instant::now();
+        info!(
+            "Starting orchestrated pipeline with {} stages",
+            self.stages.len()
+        );
+
+        // Resolve execution order
+        let order = self.resolve_order()?;
+        let stage_names: Vec<&str> = order.iter().map(|&i| self.stages[i].stage.name()).collect();
+        info!("[pipeline] Execution order: {:?}", stage_names);
+
+        // Compute execution groups for potential parallelization
+        let groups = self.compute_execution_groups(&order);
+        let parallel_count = groups.iter().filter(|g| g.parallel).count();
+        if parallel_count > 0 {
+            info!(
+                "[pipeline] {} execution groups ({} parallelizable)",
+                groups.len(),
+                parallel_count
+            );
+        } else {
+            debug!(
+                "[pipeline] {} execution groups (all sequential)",
+                groups.len()
+            );
+        }
+
+        // Create context
+        let mut opts = options;
+        let existing_tree = opts.existing_tree.take();
+        let mut ctx = IndexContext::new(input, opts);
+        // Inject shared LLM client into context for stages that need it
+        // (e.g. ReasoningIndexStage)
+        if let Some(client) = self.llm_client.take() {
+            ctx = ctx.with_llm_client(client);
+        }
+        if let Some(tree) = existing_tree {
+            ctx = ctx.with_existing_tree(tree);
+        }
+
+        // Try to resume from checkpoint
+        if let Some(ref checkpoint_dir) = ctx.options.checkpoint_dir {
+            let manager = CheckpointManager::new(checkpoint_dir);
+            if let Some(checkpoint) = manager.load(&ctx.doc_id) {
+                if CheckpointManager::is_valid_for_resume(
+                    &checkpoint,
+                    &ctx.source_hash,
+                    ctx.options.processing_version,
+                    &ctx.options.logic_fingerprint().to_string(),
+                ) {
+                    info!(
+                        "Resuming from checkpoint: {} stages already completed",
+                        checkpoint.completed_stages.len()
+                    );
+                    // Restore context data from checkpoint
+                    ctx.raw_nodes = checkpoint.context_data.raw_nodes;
+                    if let Some(tree) = checkpoint.context_data.tree {
+                        ctx.tree = Some(tree);
+                    }
+                    ctx.metrics = checkpoint.context_data.metrics;
+                    ctx.page_count = checkpoint.context_data.page_count;
+                    ctx.line_count = checkpoint.context_data.line_count;
+                    ctx.description = checkpoint.context_data.description;
+                    // Mark completed stages as done
+                    for stage_name in &checkpoint.completed_stages {
+                        ctx.stage_results
+                            .insert(stage_name.clone(), StageResult::success(stage_name));
+                    }
+                } else {
+                    info!("Checkpoint exists but invalid, starting fresh");
+                }
+            }
+        }
+
+        // Execute each group
+        for (group_idx, group) in groups.iter().enumerate() {
+            if group.parallel {
+                let names: Vec<&str> = group
+                    .stage_indices
+                    .iter()
+                    .map(|&i| self.stages[i].stage.name())
+                    .collect();
+                info!("[pipeline] Parallel group {}: {:?}", group_idx, names);
+            }
+
+            if group.parallel && !group.stage_indices.is_empty() {
+                // Check if all stages in this group are already completed (from checkpoint)
+                let all_completed = group.stage_indices.iter().all(|&idx| {
+                    let name = self.stages[idx].stage.name();
+                    ctx.stage_results.contains_key(name)
+                });
+                if all_completed {
+                    let names: Vec<&str> = group
+                        .stage_indices
+                        .iter()
+                        .map(|&i| self.stages[i].stage.name())
+                        .collect();
+                    info!("[pipeline] Skipping completed parallel group: {:?}", names);
+                    continue;
+                }
+
+                // === N-stage parallel execution ===
+                //
+                // At most one stage may write_tree — it gets the main ctx.
+                // All other stages get cloned contexts with tree snapshots.
+                // All stages run concurrently via futures::future::join_all.
+                // After all complete, outputs are merged back by AccessPattern.
+
+                // Identify the tree writer (if any)
+                let tree_writer_idx: Option<usize> = group
+                    .stage_indices
+                    .iter()
+                    .find(|&&idx| self.stages[idx].stage.access_pattern().writes_tree)
+                    .copied();
+
+                // For each stage, prepare (stage, context) pair.
+                // Swap out stages from self.stages to get owned Box<dyn IndexStage>.
+                let mut entries: Vec<ParallelEntry> =
+                    Vec::with_capacity(group.stage_indices.len());
+
+                for &idx in &group.stage_indices {
+                    let stage = std::mem::replace(&mut self.stages[idx].stage, Box::new(NopStage));
+                    let name = stage.name().to_string();
+                    let policy = stage.failure_policy();
+                    let access = stage.access_pattern();
+
+                    let stage_ctx = if Some(idx) == tree_writer_idx {
+                        // Tree writer gets a placeholder; we'll use &mut ctx directly
+                        None
+                    } else {
+                        // Reader gets a cloned context
+                        let mut clone =
+                            IndexContext::new(IndexInput::content(""), ctx.options.clone());
+                        clone.tree = ctx.tree.clone();
+                        clone.existing_tree = ctx.existing_tree.clone();
+                        clone.doc_id = ctx.doc_id.clone();
+                        clone.name = ctx.name.clone();
+                        clone.format = ctx.format;
+                        clone.source_path = ctx.source_path.clone();
+                        if let Some(ref llm) = ctx.llm_client {
+                            clone.llm_client = Some(llm.clone());
+                        }
+                        Some(clone)
+                    };
+
+                    entries.push(ParallelEntry {
+                        idx,
+                        stage,
+                        ctx: stage_ctx,
+                        name,
+                        policy,
+                        access,
+                    });
+                }
+
+                let parallel_names: Vec<&str> = entries.iter().map(|e| e.name.as_str()).collect();
+                info!("[pipeline] Executing in parallel: {:?}", parallel_names);
+
+                // Split into writer and readers
+                let mut writer_entry: Option<ParallelEntry> = None;
+                let mut reader_entries: Vec<ParallelEntry> = Vec::new();
+                for entry in entries {
+                    if entry.ctx.is_none() {
+                        writer_entry = Some(entry);
+                    } else {
+                        reader_entries.push(entry);
+                    }
+                }
+
+                // Execute writer on main ctx concurrently with readers.
+                // Move each reader's stage+ctx into an owned async block.
+                // Each reader future is boxed and pinned (`Box<dyn Future + Send>`);
+                // futures::future::join_all polls them all concurrently within
+                // this task.
+                let reader_futs: Vec<
+                    std::pin::Pin<
+                        Box<
+                            dyn std::future::Future<
+                                    Output = (
+                                        ParallelEntry,
+                                        std::result::Result<StageResult, vectorless_error::Error>,
+                                    ),
+                                > + Send,
+                        >,
+                    >,
+                > = reader_entries
+                    .into_iter()
+                    .map(|mut entry| {
+                        Box::pin(async move {
+                            let res = Self::execute_stage_with_policy(
+                                &mut entry.stage,
+                                entry.ctx.as_mut().unwrap(),
+                            )
+                            .await;
+                            (entry, res)
+                        })
+                            as std::pin::Pin<
+                                Box<
+                                    dyn std::future::Future<
+                                            Output = (
+                                                ParallelEntry,
+                                                std::result::Result<
+                                                    StageResult,
+                                                    vectorless_error::Error,
+                                                >,
+                                            ),
+                                        > + Send,
+                                >,
+                            >
+                    })
+                    .collect();
+
+                // If there's a tree writer, run it concurrently with readers.
+                // If no tree writer (all readers), just run readers.
+                if let Some(mut we) = writer_entry {
+                    // Run writer + readers concurrently.
+                    // The writer borrows &mut ctx; readers use their own cloned ctxs.
+ let (writer_res, completed_readers) = tokio::join!( + Self::execute_stage_with_policy(&mut we.stage, &mut ctx), + futures::future::join_all(reader_futs), + ); + + // Put writer stage back and handle result + self.stages[we.idx].stage = we.stage; + Self::handle_stage_result(writer_res, &we.name, &we.policy, &mut ctx)?; + + // Process reader results + for (re, reader_res) in completed_readers { + Self::merge_reader_outputs(&mut ctx, &re); + self.stages[re.idx].stage = re.stage; + Self::handle_stage_result(reader_res, &re.name, &re.policy, &mut ctx)?; + } + } else { + // All readers, no writer + let completed_readers = futures::future::join_all(reader_futs).await; + for (re, reader_res) in completed_readers { + Self::merge_reader_outputs(&mut ctx, &re); + self.stages[re.idx].stage = re.stage; + Self::handle_stage_result(reader_res, &re.name, &re.policy, &mut ctx)?; + } + } + } else { + // === Sequential execution (single stage or non-parallel group) === + for &idx in &group.stage_indices { + let entry = &mut self.stages[idx]; + let stage_name = entry.stage.name().to_string(); + + // Skip stages already completed (from checkpoint resume) + if ctx.stage_results.contains_key(&stage_name) { + info!("Skipping already completed stage: {}", stage_name); + continue; + } + + let policy = entry.stage.failure_policy(); + + info!( + "Executing stage: {} (priority {})", + stage_name, entry.priority + ); + + match Self::execute_stage_with_policy(&mut entry.stage, &mut ctx).await { + Ok(result) => { + ctx.stage_results.insert(stage_name.clone(), result); + } + Err(e) => { + if policy.allows_continuation() { + warn!( + "Stage {} failed but policy allows continuation: {}", + stage_name, e + ); + ctx.stage_results.insert( + stage_name.clone(), + StageResult::failure(&stage_name, &e.to_string()), + ); + } else { + error!("Stage {} failed, stopping pipeline: {}", stage_name, e); + // Save checkpoint before returning error + Self::save_checkpoint(&ctx); + return Err(e); + } + } + } + } + } + + // Save checkpoint after each group completes + Self::save_checkpoint(&ctx); + } + + let total_duration = total_start.elapsed().as_millis() as u64; + info!( + "[pipeline] Complete: {} stages in {}ms for '{}'", + ctx.stage_results.len(), + total_duration, + ctx.name, + ); + + // Clear checkpoint on successful completion + if let Some(ref checkpoint_dir) = ctx.options.checkpoint_dir { + let manager = CheckpointManager::new(checkpoint_dir); + if let Err(e) = manager.clear(&ctx.doc_id) { + warn!("Failed to clear checkpoint for {}: {}", ctx.doc_id, e); + } + } + + // Finalize result + Ok(ctx.finalize()) + } + + /// Merge a reader stage's outputs back into the main context. + /// + /// Reads the reader's AccessPattern to know which fields to copy, + /// and merges additive metrics (LLM calls, tokens, etc.). 
+    fn merge_reader_outputs(ctx: &mut IndexContext, reader: &ParallelEntry) {
+        if reader.access.writes_reasoning_index {
+            if let Some(ref rctx) = reader.ctx {
+                ctx.reasoning_index = rctx.reasoning_index.clone();
+            }
+        }
+        if reader.access.writes_navigation_index {
+            if let Some(ref rctx) = reader.ctx {
+                ctx.navigation_index = rctx.navigation_index.clone();
+            }
+        }
+        if reader.access.writes_description {
+            if let Some(ref rctx) = reader.ctx {
+                ctx.description = rctx.description.clone();
+            }
+        }
+        // Merge additive metrics from reader
+        if let Some(ref rctx) = reader.ctx {
+            ctx.metrics.llm_calls += rctx.metrics.llm_calls;
+            ctx.metrics.summaries_generated += rctx.metrics.summaries_generated;
+            ctx.metrics.total_tokens_generated += rctx.metrics.total_tokens_generated;
+            ctx.metrics.nodes_processed += rctx.metrics.nodes_processed;
+            ctx.metrics.nodes_merged += rctx.metrics.nodes_merged;
+            ctx.metrics.nodes_skipped += rctx.metrics.nodes_skipped;
+            if rctx.metrics.reasoning_index_time_ms > 0 {
+                ctx.metrics.record_reasoning_index(
+                    rctx.metrics.reasoning_index_time_ms,
+                    rctx.metrics.topics_indexed,
+                    rctx.metrics.keywords_indexed,
+                );
+            }
+            if rctx.metrics.optimize_time_ms > 0 {
+                ctx.metrics.record_optimize(rctx.metrics.optimize_time_ms);
+            }
+            if rctx.metrics.navigation_index_time_ms > 0 {
+                ctx.metrics.record_navigation_index(
+                    rctx.metrics.navigation_index_time_ms,
+                    rctx.metrics.nav_entries_indexed,
+                    rctx.metrics.child_routes_indexed,
+                );
+            }
+            if rctx.metrics.enhance_time_ms > 0 {
+                ctx.metrics.record_enhance(rctx.metrics.enhance_time_ms);
+            }
+            if rctx.metrics.enrich_time_ms > 0 {
+                ctx.metrics.record_enrich(rctx.metrics.enrich_time_ms);
+            }
+        }
+    }
+
+    /// Save a checkpoint of the current pipeline state.
+    fn save_checkpoint(ctx: &IndexContext) {
+        let checkpoint_dir = match ctx.options.checkpoint_dir {
+            Some(ref dir) => dir.clone(),
+            None => return,
+        };
+
+        let completed_stages: Vec<String> = ctx.stage_results.keys().cloned().collect();
+        let checkpoint = PipelineCheckpoint {
+            doc_id: ctx.doc_id.clone(),
+            source_hash: ctx.source_hash.clone(),
+            processing_version: ctx.options.processing_version,
+            config_fingerprint: ctx.options.logic_fingerprint().to_string(),
+            completed_stages,
+            context_data: CheckpointContextData {
+                raw_nodes: ctx.raw_nodes.clone(),
+                tree: ctx.tree.clone(),
+                metrics: ctx.metrics.clone(),
+                page_count: ctx.page_count,
+                line_count: ctx.line_count,
+                description: ctx.description.clone(),
+            },
+            timestamp: chrono::Utc::now(),
+        };
+
+        let manager = CheckpointManager::new(checkpoint_dir);
+        if let Err(e) = manager.save(&ctx.doc_id, &checkpoint) {
+            warn!("Failed to save checkpoint for {}: {}", ctx.doc_id, e);
+        }
+    }
+
+    /// Get list of stage names in execution order.
+    pub fn stage_names(&self) -> Result<Vec<&str>> {
+        let order = self.resolve_order()?;
+        Ok(order.iter().map(|&i| self.stages[i].stage.name()).collect())
+    }
+
+    /// Get execution groups for the current pipeline.
+    ///
+    /// This is useful for visualizing parallelization opportunities.
+    pub fn get_execution_groups(&self) -> Result<Vec<ExecutionGroup>> {
+        let order = self.resolve_order()?;
+        Ok(self.compute_execution_groups(&order))
+    }
+}
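A sketch of how the checkpointing above is driven from options. The field names follow the code in this file, but the `PipelineOptions` construction and the claim that it is `Default`/`Clone` are assumptions for illustration:

```rust,ignore
let mut options = PipelineOptions::default();
options.checkpoint_dir = Some("/tmp/vectorless-checkpoints".into());

// A failed run still leaves a checkpoint behind: execute() calls
// save_checkpoint() after each completed group, and again before
// returning a fatal stage error.
let _ = orchestrator.execute(input.clone(), options.clone()).await;

// Re-running with the same source_hash, processing_version, and
// logic_fingerprint resumes: completed stages are recorded as done and
// skipped, and raw_nodes/tree/metrics are restored from the checkpoint.
// On success the checkpoint is cleared.
let result = orchestrator.execute(input, options).await?;
```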
+
+/// Placeholder stage used during parallel execution when the real stage
+/// is temporarily swapped out via `std::mem::replace`.
+struct NopStage;
+
+#[async_trait::async_trait]
+impl IndexStage for NopStage {
+    fn name(&self) -> &'static str {
+        "_nop"
+    }
+
+    async fn execute(&mut self, _ctx: &mut IndexContext) -> Result<StageResult> {
+        Ok(StageResult::success("_nop"))
+    }
+}
+
+/// Owned entry for parallel stage execution.
+///
+/// Each stage in a parallel group is swapped out from the orchestrator's
+/// stages vec into this struct, along with its own cloned context.
+/// After execution, the stage is swapped back and outputs are merged.
+struct ParallelEntry {
+    /// Index into orchestrator's stages vec (for swapping back).
+    idx: usize,
+    /// The owned stage implementation.
+    stage: Box<dyn IndexStage>,
+    /// Cloned context for reader stages; None for the tree writer
+    /// (which uses the main ctx directly).
+    ctx: Option<IndexContext>,
+    /// Stage name (captured before swap).
+    name: String,
+    /// Failure policy (captured before swap).
+    policy: FailurePolicy,
+    /// Access pattern (captured before swap).
+    access: crate::index::stages::AccessPattern,
+}
+
+/// Builder for creating custom stage configurations.
+///
+/// This is a convenience type for configuring custom stages
+/// without manually calling the orchestrator methods.
+pub struct CustomStageBuilder {
+    name: String,
+    priority: i32,
+    depends_on: Vec<String>,
+    optional: bool,
+}
+
+impl CustomStageBuilder {
+    /// Create a new custom stage builder.
+    pub fn new(name: impl Into<String>) -> Self {
+        Self {
+            name: name.into(),
+            priority: 100,
+            depends_on: Vec::new(),
+            optional: false,
+        }
+    }
+
+    /// Set priority (lower = earlier).
+    pub fn priority(mut self, priority: i32) -> Self {
+        self.priority = priority;
+        self
+    }
+
+    /// Add a dependency.
+    pub fn depends_on(mut self, stage: impl Into<String>) -> Self {
+        self.depends_on.push(stage.into());
+        self
+    }
+
+    /// Mark as optional (failures won't stop pipeline).
+    pub fn optional(mut self) -> Self {
+        self.optional = true;
+        self
+    }
+
+    /// Get the stage name.
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+
+    /// Get the priority.
+    pub fn get_priority(&self) -> i32 {
+        self.priority
+    }
+
+    /// Get dependencies.
+    pub fn get_deps(&self) -> &[String] {
+        &self.depends_on
+    }
+
+    /// Check if optional.
+    pub fn is_optional(&self) -> bool {
+        self.optional
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::super::context::StageResult;
+    use super::*;
+
+    #[test]
+    fn test_orchestrator_creation() {
+        let orchestrator = PipelineOrchestrator::new();
+        assert_eq!(orchestrator.stage_count(), 0);
+    }
+
+    #[test]
+    fn test_add_stages() {
+        let orchestrator = PipelineOrchestrator::new()
+            .stage_with_priority(MockStage::new("a"), 10)
+            .stage_with_priority(MockStage::new("b"), 20)
+            .stage_with_priority(MockStage::new("c"), 5);
+
+        assert_eq!(orchestrator.stage_count(), 3);
+
+        let names = orchestrator.stage_names().unwrap();
+        assert_eq!(names, vec!["c", "a", "b"]); // priority order
+    }
+
+    #[test]
+    fn test_dependency_resolution() {
+        let orchestrator = PipelineOrchestrator::new()
+            .stage_with_priority(MockStage::new("a"), 10)
+            .stage_with_deps(MockStage::new("b"), 5, &["a"]) // b depends on a
+            .stage_with_deps(MockStage::new("c"), 1, &["b"]); // c depends on b
+
+        let names = orchestrator.stage_names().unwrap();
+        assert_eq!(names, vec!["a", "b", "c"]);
+    }
+
+    #[test]
+    fn test_missing_dependency() {
+        let orchestrator =
+            PipelineOrchestrator::new().stage_with_deps(MockStage::new("a"), 10, &["nonexistent"]);
+
+        let result = orchestrator.stage_names();
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_remove_stage() {
+        let orchestrator = PipelineOrchestrator::new()
+            .stage(MockStage::new("a"))
+            .stage(MockStage::new("b"))
+            .remove_stage("a");
+
+        assert_eq!(orchestrator.stage_count(), 1);
+        assert!(!orchestrator.has_stage("a"));
+        assert!(orchestrator.has_stage("b"));
+    }
+
+    #[test]
+    fn test_custom_stage_builder() {
+        let builder = CustomStageBuilder::new("my_stage")
+            .priority(50)
+            .depends_on("parse")
+            .optional();
+
+        assert_eq!(builder.name(), "my_stage");
+        assert_eq!(builder.get_priority(), 50);
+        assert_eq!(builder.get_deps(), &["parse".to_string()]);
+        assert!(builder.is_optional());
+    }
+
+    /// Mock stage for testing.
+    struct MockStage {
+        name: String,
+    }
+
+    impl MockStage {
+        fn new(name: &str) -> Self {
+            Self {
+                name: name.to_string(),
+            }
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl IndexStage for MockStage {
+        fn name(&self) -> &str {
+            &self.name
+        }
+
+        async fn execute(&mut self, _ctx: &mut IndexContext) -> Result<StageResult> {
+            Ok(StageResult::success(&self.name))
+        }
+    }
+}
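To tie the parallel-execution machinery together, here is the contract a custom stage opts into via `AccessPattern` (a hedged sketch: `MyTopicStage` is invented; the field names match the ones this file's `merge_reader_outputs` reads):

```rust,ignore
#[async_trait::async_trait]
impl IndexStage for MyTopicStage {
    fn name(&self) -> &str { "my_topics" }
    fn depends_on(&self) -> Vec<&'static str> { vec!["build"] }

    // A "reader": never writes the tree, only the reasoning index. The
    // orchestrator may therefore run it in parallel with at most one
    // tree-writing stage, on a cloned context, and will copy
    // reasoning_index (plus additive metrics) back into the main context.
    fn access_pattern(&self) -> AccessPattern {
        AccessPattern {
            reads_tree: true,
            writes_reasoning_index: true,
            ..AccessPattern::default()
        }
    }

    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
        // ... read ctx.tree, populate ctx.reasoning_index ...
        Ok(StageResult::success("my_topics"))
    }
}
```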
diff --git a/vectorless-core/vectorless-index/src/pipeline/policy.rs b/vectorless-core/vectorless-index/src/pipeline/policy.rs
new file mode 100644
index 00000000..da3c5b2b
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/pipeline/policy.rs
@@ -0,0 +1,222 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Failure policies for pipeline stages.
+//!
+//! This module provides configurable failure handling for index pipeline stages.
+//!
+//! # Policies
+//!
+//! - **Fail** - Stop the entire pipeline on stage failure (default for required stages)
+//! - **Skip** - Skip the failed stage and continue the pipeline
+//! - **Retry** - Retry the stage with exponential backoff before failing
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use vectorless::index::pipeline::{FailurePolicy, StageRetryConfig};
+//!
+//! // Simple skip policy
+//! let policy = FailurePolicy::skip();
+//!
+//! // Retry with custom config
+//! let policy = FailurePolicy::retry_with(
+//!     StageRetryConfig::new()
+//!         .with_max_attempts(3)
+//!         .with_initial_delay(Duration::from_millis(500))
+//! );
+//! ```
+
+use std::time::Duration;
+
+/// Retry configuration for stage execution.
+#[derive(Debug, Clone)]
+pub struct StageRetryConfig {
+    /// Maximum number of attempts (including initial).
+    pub max_attempts: usize,
+    /// Initial delay before first retry.
+    pub initial_delay: Duration,
+    /// Maximum delay between retries.
+    pub max_delay: Duration,
+    /// Exponential backoff multiplier.
+    pub multiplier: f64,
+}
+
+impl Default for StageRetryConfig {
+    fn default() -> Self {
+        Self {
+            max_attempts: 3,
+            initial_delay: Duration::from_millis(100),
+            max_delay: Duration::from_secs(10),
+            multiplier: 2.0,
+        }
+    }
+}
+
+impl StageRetryConfig {
+    /// Create a new retry config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set maximum number of attempts.
+    pub fn with_max_attempts(mut self, n: usize) -> Self {
+        self.max_attempts = n.max(1);
+        self
+    }
+
+    /// Set initial delay before first retry.
+    pub fn with_initial_delay(mut self, delay: Duration) -> Self {
+        self.initial_delay = delay;
+        self
+    }
+
+    /// Set maximum delay between retries.
+    pub fn with_max_delay(mut self, delay: Duration) -> Self {
+        self.max_delay = delay;
+        self
+    }
+
+    /// Set exponential backoff multiplier.
+    pub fn with_multiplier(mut self, multiplier: f64) -> Self {
+        self.multiplier = multiplier;
+        self
+    }
+
+    /// Calculate delay for a given attempt (0-indexed).
+    ///
+    /// Uses exponential backoff: `initial_delay * multiplier^attempt`
+    pub fn delay_for_attempt(&self, attempt: usize) -> Duration {
+        let delay_ms =
+            (self.initial_delay.as_millis() as f64) * self.multiplier.powi(attempt as i32);
+        let capped_ms = delay_ms.min(self.max_delay.as_millis() as f64);
+        Duration::from_millis(capped_ms as u64)
+    }
+}
+
+/// Policy for handling stage failures.
+#[derive(Debug, Clone)]
+pub enum FailurePolicy {
+    /// Fail the entire pipeline on error (default for required stages).
+    Fail,
+
+    /// Skip this stage on failure, continue pipeline.
+    /// The stage result will record the failure but execution continues.
+    Skip,
+
+    /// Retry with specified configuration before failing.
+    /// If all retries fail, the pipeline behavior depends on `allows_continuation`.
+    Retry(StageRetryConfig),
+}
+
+impl Default for FailurePolicy {
+    fn default() -> Self {
+        Self::Fail
+    }
+}
+
+impl FailurePolicy {
+    /// Create a Fail policy.
+    pub fn fail() -> Self {
+        Self::Fail
+    }
+
+    /// Create a Skip policy.
+    pub fn skip() -> Self {
+        Self::Skip
+    }
+
+    /// Create a Retry policy with default configuration.
+    pub fn retry() -> Self {
+        Self::Retry(StageRetryConfig::default())
+    }
+
+    /// Create a Retry policy with custom configuration.
+    pub fn retry_with(config: StageRetryConfig) -> Self {
+        Self::Retry(config)
+    }
+
+    /// Check if pipeline can continue after failure with this policy.
+    ///
+    /// - `Fail`: No, stops pipeline
+    /// - `Skip`: Yes, continues
+    /// - `Retry`: No (if all retries exhausted, it's treated as failure)
+    pub fn allows_continuation(&self) -> bool {
+        matches!(self, Self::Skip)
+    }
+
+    /// Check if this policy involves retry attempts.
+    pub fn has_retry(&self) -> bool {
+        matches!(self, Self::Retry(_))
+    }
+
+    /// Get retry config if this is a Retry policy.
+ pub fn retry_config(&self) -> Option<&StageRetryConfig> { + match self { + Self::Retry(config) => Some(config), + _ => None, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_retry_config() { + let config = StageRetryConfig::default(); + assert_eq!(config.max_attempts, 3); + assert_eq!(config.initial_delay, Duration::from_millis(100)); + assert_eq!(config.max_delay, Duration::from_secs(10)); + } + + #[test] + fn test_retry_config_builder() { + let config = StageRetryConfig::new() + .with_max_attempts(5) + .with_initial_delay(Duration::from_millis(200)) + .with_max_delay(Duration::from_secs(30)); + + assert_eq!(config.max_attempts, 5); + assert_eq!(config.initial_delay, Duration::from_millis(200)); + assert_eq!(config.max_delay, Duration::from_secs(30)); + } + + #[test] + fn test_delay_for_attempt() { + let config = StageRetryConfig::new() + .with_initial_delay(Duration::from_millis(100)) + .with_multiplier(2.0); + + assert_eq!(config.delay_for_attempt(0), Duration::from_millis(100)); + assert_eq!(config.delay_for_attempt(1), Duration::from_millis(200)); + assert_eq!(config.delay_for_attempt(2), Duration::from_millis(400)); + } + + #[test] + fn test_delay_respects_max() { + let config = StageRetryConfig::new() + .with_initial_delay(Duration::from_secs(1)) + .with_max_delay(Duration::from_secs(5)) + .with_multiplier(10.0); + + assert_eq!(config.delay_for_attempt(0), Duration::from_secs(1)); + assert_eq!(config.delay_for_attempt(1), Duration::from_secs(5)); // capped + assert_eq!(config.delay_for_attempt(2), Duration::from_secs(5)); // capped + } + + #[test] + fn test_failure_policy_constructors() { + assert!(matches!(FailurePolicy::fail(), FailurePolicy::Fail)); + assert!(matches!(FailurePolicy::skip(), FailurePolicy::Skip)); + assert!(matches!(FailurePolicy::retry(), FailurePolicy::Retry(_))); + } + + #[test] + fn test_allows_continuation() { + assert!(!FailurePolicy::fail().allows_continuation()); + assert!(FailurePolicy::skip().allows_continuation()); + assert!(!FailurePolicy::retry().allows_continuation()); + } +} diff --git a/vectorless-core/vectorless-index/src/stages/build.rs b/vectorless-core/vectorless-index/src/stages/build.rs new file mode 100644 index 00000000..3557da8d --- /dev/null +++ b/vectorless-core/vectorless-index/src/stages/build.rs @@ -0,0 +1,334 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Build stage - Build tree from raw nodes. + +use super::async_trait; +use std::time::Instant; +use tracing::{debug, info}; + +use vectorless_document::{DocumentTree, NodeId}; +use vectorless_error::Result; +use crate::index::parse::RawNode; +use vectorless_utils::estimate_tokens; + +use super::{IndexStage, StageResult}; +use crate::index::ThinningConfig; +use crate::index::pipeline::IndexContext; + +/// Build stage - constructs a tree from raw nodes. +pub struct BuildStage; + +impl BuildStage { + /// Create a new build stage. + pub fn new() -> Self { + Self + } + + /// Calculate total token counts for all nodes (recursive, includes children). 
+    fn calculate_total_tokens(nodes: &mut [RawNode]) {
+        if nodes.is_empty() {
+            return;
+        }
+
+        // Process from back to front
+        for i in (0..nodes.len()).rev() {
+            let own_tokens = nodes[i]
+                .token_count
+                .unwrap_or_else(|| estimate_tokens(&nodes[i].content));
+            nodes[i].token_count = Some(own_tokens);
+
+            // Find all children (direct and indirect)
+            let children_tokens: usize = Self::find_all_children_indices(i, nodes)
+                .iter()
+                .map(|&child_idx| nodes[child_idx].total_token_count.unwrap_or(0))
+                .sum();
+
+            nodes[i].total_token_count = Some(own_tokens + children_tokens);
+        }
+    }
+
+    /// Find all children (direct and indirect) of a node.
+    fn find_all_children_indices(parent_idx: usize, nodes: &[RawNode]) -> Vec<usize> {
+        let parent_level = nodes[parent_idx].level;
+        let mut children = Vec::new();
+
+        for i in (parent_idx + 1)..nodes.len() {
+            if nodes[i].level <= parent_level {
+                break;
+            }
+            children.push(i);
+        }
+
+        children
+    }
+
+    /// Find direct children of a node.
+    fn find_direct_children_indices(parent_idx: usize, nodes: &[RawNode]) -> Vec<usize> {
+        let parent_level = nodes[parent_idx].level;
+        let target_level = parent_level + 1;
+        let mut children = Vec::new();
+        let mut i = parent_idx + 1;
+
+        while i < nodes.len() {
+            if nodes[i].level <= parent_level {
+                break;
+            }
+            if nodes[i].level == target_level {
+                children.push(i);
+            }
+            i += 1;
+        }
+
+        children
+    }
+
+    /// Apply thinning to raw nodes before tree construction.
+    ///
+    /// When `merge_content` is true: small nodes are merged into their parent
+    /// by concatenating child content into the parent, then marking children for removal.
+    /// When `merge_content` is false: small nodes are simply marked for removal.
+    fn apply_thinning(nodes: &mut [RawNode], config: &ThinningConfig) -> Vec<bool> {
+        if !config.enabled || nodes.is_empty() {
+            return vec![true; nodes.len()];
+        }
+
+        let mut keep = vec![true; nodes.len()];
+
+        // Process from leaves to root (bottom-up)
+        for i in (0..nodes.len()).rev() {
+            if !keep[i] {
+                continue;
+            }
+            let total_tokens = nodes[i].total_token_count.unwrap_or(0);
+
+            if total_tokens < config.threshold {
+                // Find all children of this node
+                let children_indices = Self::find_all_children_indices(i, nodes);
+
+                if !children_indices.is_empty() && config.merge_content {
+                    // Merge children content into this node
+                    let mut merged_content = nodes[i].content.clone();
+                    for &child_idx in &children_indices {
+                        if !nodes[child_idx].content.trim().is_empty() {
+                            if !merged_content.is_empty() {
+                                merged_content.push_str("\n\n");
+                            }
+                            merged_content.push_str(&nodes[child_idx].content);
+                        }
+                    }
+                    nodes[i].content = merged_content;
+                    nodes[i].token_count = Some(nodes[i].token_count.unwrap_or(0));
+                }
+
+                // Mark children for removal
+                for &child_idx in &children_indices {
+                    keep[child_idx] = false;
+                }
+            }
+        }
+
+        // Ensure each parent keeps at least one child
+        Self::ensure_min_children(nodes, &mut keep);
+
+        keep
+    }
+
+    /// Ensure each parent keeps at least one direct child.
+    fn ensure_min_children(nodes: &[RawNode], keep: &mut [bool]) {
+        for i in 0..nodes.len() {
+            let children = Self::find_direct_children_indices(i, nodes);
+
+            if !children.is_empty() {
+                let has_kept_child = children.iter().any(|&c| keep[c]);
+
+                if !has_kept_child {
+                    // Keep the child with the most content
+                    let best_child = children
+                        .iter()
+                        .max_by_key(|&&c| nodes[c].total_token_count.unwrap_or(0))
+                        .copied();
+
+                    if let Some(idx) = best_child {
+                        keep[idx] = true;
+                    }
+                }
+            }
+        }
+    }
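A worked example of the thinning pass above (a hedged sketch: the `RawNode` outline is described in comments rather than constructed, `ThinningConfig`'s field set is abbreviated to what this file reads, and the private helper is called the way the stage itself does internally):

```rust,ignore
// Outline, with total token counts already computed:
//   0: "Intro"  level 0, total 40   <- below threshold 50
//   1: "A"      level 1, total 10   (child of 0)
//   2: "B"      level 1, total 25   (child of 0)
let config = ThinningConfig { enabled: true, threshold: 50, merge_content: true };
let keep = BuildStage::apply_thinning(&mut nodes, &config);

// "Intro" is under the threshold, so both children's content is merged
// into it ("\n\n"-joined) and both are marked for removal. Then
// ensure_min_children re-keeps the largest direct child ("B", 25 tokens),
// so a parent never loses its entire level.
assert_eq!(keep, vec![true, false, true]);
```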
+    /// Build tree from raw nodes.
+    fn build_tree(&self, raw_nodes: Vec<RawNode>, ctx: &mut IndexContext) -> DocumentTree {
+        let root_title = ctx.name.clone();
+        let root_content = String::new();
+
+        let mut tree = DocumentTree::new(&root_title, &root_content);
+
+        // Stack to track parent nodes at each level
+        let mut level_stack: Vec<Option<NodeId>> = vec![Some(tree.root())];
+
+        for raw in raw_nodes {
+            let level = raw.level;
+
+            // Ensure stack has enough slots
+            while level_stack.len() <= level {
+                level_stack.push(None);
+            }
+
+            // Find parent: closest ancestor with a lower level
+            let parent_id = (0..level)
+                .rev()
+                .find_map(|l| level_stack.get(l).copied().flatten())
+                .unwrap_or(tree.root());
+
+            // Create the node
+            let content = if raw.content.is_empty() {
+                ""
+            } else {
+                &raw.content
+            };
+            let node_id = tree.add_child(parent_id, &raw.title, content);
+
+            // Set line indices
+            tree.set_line_indices(node_id, raw.line_start, raw.line_end);
+
+            // Set page boundaries if available
+            if let Some(page) = raw.page {
+                tree.set_page_boundaries(node_id, page, page);
+            }
+
+            // Set token count if available
+            if let Some(count) = raw.token_count {
+                if count > 0 {
+                    tree.set_token_count(node_id, count);
+                }
+            }
+
+            // Update the stack for this level
+            if level < level_stack.len() {
+                level_stack[level] = Some(node_id);
+            }
+
+            // Clear deeper levels
+            for i in (level + 1)..level_stack.len() {
+                level_stack[i] = None;
+            }
+        }
+
+        tree
+    }
+
+    /// Assign unique node IDs (DFS traversal).
+    fn assign_node_ids(&self, tree: &mut DocumentTree) {
+        let mut counter: usize = 0;
+        self.assign_recursive(tree, tree.root(), &mut counter);
+    }
+
+    fn assign_recursive(&self, tree: &mut DocumentTree, node_id: NodeId, counter: &mut usize) {
+        *counter += 1;
+        let id_str = format!("{:04}", counter);
+        tree.set_node_id(node_id, &id_str);
+
+        let children = tree.children(node_id);
+        for child_id in children {
+            self.assign_recursive(tree, child_id, counter);
+        }
+    }
+}
+
+impl Default for BuildStage {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl IndexStage for BuildStage {
+    fn name(&self) -> &'static str {
+        "build"
+    }
+
+    fn depends_on(&self) -> Vec<&'static str> {
+        vec!["parse"]
+    }
+
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+        let start = Instant::now();
+
+        // Take raw nodes from context
+        let mut raw_nodes = std::mem::take(&mut ctx.raw_nodes);
+
+        if raw_nodes.is_empty() {
+            info!("[build] No raw nodes, skipping");
+            return Ok(StageResult::success("build"));
+        }
+
+        info!(
+            "[build] Starting: {} raw nodes, thinning={}",
+            raw_nodes.len(),
+            ctx.options.thinning.enabled
+        );
+
+        // Step 1: Calculate total tokens
+        Self::calculate_total_tokens(&mut raw_nodes);
+        debug!(
+            "[build] Calculated total tokens for {} nodes",
+            raw_nodes.len()
+        );
+
+        // Step 2: Apply thinning if enabled
+        let _original_count = raw_nodes.len();
+        let keep = Self::apply_thinning(&mut raw_nodes, &ctx.options.thinning);
+
+        let nodes_before_merge = raw_nodes.len();
+        raw_nodes = raw_nodes
+            .into_iter()
+            .zip(keep)
+            .filter_map(|(node, k)| if k { Some(node) } else { None })
+            .collect();
+
+        let skipped = nodes_before_merge - raw_nodes.len();
+        ctx.metrics.nodes_skipped += skipped;
+        if skipped > 0 {
+            debug!(
+                "[build] Thinning removed {} nodes ({} → {})",
+                skipped,
+                nodes_before_merge,
+                raw_nodes.len()
+            );
+        }
+
+        // Step 3: Build tree
+        let mut tree = self.build_tree(raw_nodes, ctx);
+
+        // Step 4: Assign node IDs if configured
+        if ctx.options.generate_ids {
+            self.assign_node_ids(&mut tree);
+        }
+
+        let node_count = tree.node_count();
+
+        // Store tree in context
+        ctx.tree = Some(tree);
+
+        let duration = start.elapsed().as_millis() as u64;
+        ctx.metrics.record_build(duration);
+
+        info!(
+            "[build] Complete: {} nodes (skipped {} via thinning) in {}ms",
+            node_count, skipped, duration
+        );
+
+        let mut stage_result = StageResult::success("build");
+        stage_result.duration_ms = duration;
+        stage_result.metadata.insert(
+            "node_count".to_string(),
+            serde_json::json!(ctx.tree.as_ref().map(|t| t.node_count()).unwrap_or(0)),
+        );
+        stage_result
+            .metadata
+            .insert("nodes_skipped".to_string(), serde_json::json!(skipped));
+
+        Ok(stage_result)
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/stages/concept.rs b/vectorless-core/vectorless-index/src/stages/concept.rs
new file mode 100644
index 00000000..dfca16e4
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/stages/concept.rs
@@ -0,0 +1,238 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Concept extraction stage — extracts key concepts from topics and summaries.
+
+use std::collections::HashMap;
+
+use serde::Deserialize;
+use tracing::{info, warn};
+
+use vectorless_document::Concept;
+use vectorless_error::Result;
+use vectorless_llm::LlmClient;
+
+use super::async_trait;
+use super::{AccessPattern, IndexStage, StageResult};
+use crate::index::pipeline::IndexContext;
+
+/// Maximum number of top keywords to send to the LLM for concept extraction.
+const MAX_TOPICS: usize = 20;
+
+/// Maximum number of concepts to extract.
+const MAX_CONCEPTS: usize = 15;
+
+/// Concept extraction stage.
+///
+/// Takes the reasoning index's topic entries and tree summaries, then uses
+/// a single LLM call to extract structured [`Concept`] values.
+/// Falls back to basic keyword-based concepts when no LLM is available.
+pub struct ConceptExtractionStage {
+    llm_client: Option<LlmClient>,
+}
+
+impl ConceptExtractionStage {
+    /// Create a new stage without LLM support (keyword-based fallback).
+    pub fn new() -> Self {
+        Self { llm_client: None }
+    }
+
+    /// Create a stage with LLM support for rich concept extraction.
+    pub fn with_llm_client(client: LlmClient) -> Self {
+        Self {
+            llm_client: Some(client),
+        }
+    }
+}
+
+#[async_trait]
+impl IndexStage for ConceptExtractionStage {
+    fn name(&self) -> &str {
+        "concept_extraction"
+    }
+
+    fn depends_on(&self) -> Vec<&'static str> {
+        vec!["reasoning_index"]
+    }
+
+    fn is_optional(&self) -> bool {
+        true
+    }
+
+    fn access_pattern(&self) -> AccessPattern {
+        AccessPattern {
+            reads_tree: true,
+            writes_concepts: true,
+            ..AccessPattern::default()
+        }
+    }
+
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+        let concepts = if let Some(ref client) = self.llm_client {
+            extract_with_llm(ctx, client).await
+        } else {
+            extract_from_topics(ctx)
+        };
+
+        let count = concepts.len();
+        ctx.concepts = concepts;
+        info!("[concept_extraction] Extracted {} concepts", count);
+
+        Ok(StageResult::success("concept_extraction"))
+    }
+}
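For reference, the JSON contract between the prompt and the `RawConcept` type defined in `extract_with_llm` just below, as a runnable standalone program (requires the serde and serde_json crates; the sample document values are invented):

```rust
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct RawConcept {
    name: String,
    summary: String,
    #[serde(default)]
    sections: Vec<String>,
}

fn main() {
    // The system prompt asks for a bare JSON array, ordered by importance.
    // `sections` is omitted in the second object: #[serde(default)] fills
    // in an empty Vec instead of failing deserialization.
    let reply = r#"[
        {"name": "error correction", "summary": "Protecting qubits.", "sections": ["Background"]},
        {"name": "qubit routing", "summary": "Mapping logical to physical qubits."}
    ]"#;

    let concepts: Vec<RawConcept> = serde_json::from_str(reply).unwrap();
    assert_eq!(concepts.len(), 2);
    assert!(concepts[1].sections.is_empty());
}
```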
+
+/// Extract concepts using LLM from topics and summaries.
+async fn extract_with_llm(ctx: &mut IndexContext, client: &LlmClient) -> Vec<Concept> {
+    let (topics, section_titles) = gather_source_data(ctx);
+
+    if topics.is_empty() {
+        warn!("[concept_extraction] No topics available for extraction");
+        return Vec::new();
+    }
+
+    let system = "You are a document analysis assistant. Extract the most important concepts \
+                  from the given topics and section titles. For each concept, provide:\n\
+                  - name: a short name (2-4 words)\n\
+                  - summary: a one-sentence explanation\n\
+                  - sections: list of section titles where this concept appears\n\n\
+                  Return ONLY a valid JSON array of objects. No explanation, no markdown. \
+                  Maximum 15 concepts, ordered by importance.";
+
+    let user_prompt = format!(
+        "Document topics (keyword: relevance weight):\n{}\n\n\
+         Section titles:\n{}",
+        topics
+            .iter()
+            .map(|(k, w)| format!("- {} (weight: {:.2})", k, w))
+            .collect::<Vec<_>>()
+            .join("\n"),
+        section_titles.join(", "),
+    );
+
+    #[derive(Debug, Deserialize)]
+    #[serde(rename_all = "snake_case")]
+    struct RawConcept {
+        name: String,
+        summary: String,
+        #[serde(default)]
+        sections: Vec<String>,
+    }
+
+    match client
+        .complete_json::<Vec<RawConcept>>(&system, &user_prompt)
+        .await
+    {
+        Ok(raw) => raw
+            .into_iter()
+            .take(MAX_CONCEPTS)
+            .map(|c| Concept {
+                name: c.name,
+                summary: c.summary,
+                sections: c.sections,
+            })
+            .collect(),
+        Err(e) => {
+            warn!("[concept_extraction] LLM extraction failed: {}, using fallback", e);
+            extract_from_topics(ctx)
+        }
+    }
+}
+
+/// Fallback: derive basic concepts from topic keywords.
+fn extract_from_topics(ctx: &mut IndexContext) -> Vec<Concept> {
+    let (topics, section_titles) = gather_source_data(ctx);
+
+    topics
+        .into_iter()
+        .take(MAX_CONCEPTS)
+        .map(|(name, _)| Concept {
+            name: name.clone(),
+            summary: String::new(),
+            sections: section_titles.clone(),
+        })
+        .collect()
+}
+
+/// Gather top topics and section titles from the pipeline context.
+fn gather_source_data(ctx: &IndexContext) -> (Vec<(String, f32)>, Vec<String>) {
+    // Collect top keywords by weight
+    let mut topics: Vec<(String, f32)> = Vec::new();
+
+    if let Some(ref ri) = ctx.reasoning_index {
+        let mut all: Vec<(String, f32)> = ri
+            .all_topic_entries()
+            .map(|(keyword, entries)| {
+                let max_weight = entries.iter().map(|e| e.weight).fold(0.0_f32, f32::max);
+                (keyword.clone(), max_weight)
+            })
+            .collect();
+        all.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
+        all.truncate(MAX_TOPICS);
+        topics = all;
+    }
+
+    // Collect section titles from the tree
+    let section_titles: Vec<String> = ctx
+        .tree
+        .as_ref()
+        .map(|tree| {
+            tree.traverse()
+                .iter()
+                .filter_map(|&id| {
+                    let node = tree.get(id)?;
+                    if !node.title.is_empty() {
+                        Some(node.title.clone())
+                    } else {
+                        None
+                    }
+                })
+                .collect()
+        })
+        .unwrap_or_default();
+
+    (topics, section_titles)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_from_empty_topics() {
+        let topics = Vec::<(String, f32)>::new();
+        let titles = vec!["Section 1".to_string()];
+        // Basic sanity: empty topics produce empty concepts
+        let concepts: Vec<Concept> = topics
+            .into_iter()
+            .take(MAX_CONCEPTS)
+            .map(|(name, _)| Concept {
+                name,
+                summary: String::new(),
+                sections: titles.clone(),
+            })
+            .collect();
+        assert!(concepts.is_empty());
+    }
+
+    #[test]
+    fn test_extract_from_topics_basic() {
+        let topics: Vec<(String, f32)> = vec![
+            ("quantum".to_string(), 0.95),
+            ("error correction".to_string(), 0.88),
+            ("qubit".to_string(), 0.82),
+        ];
+        let titles = vec!["Research Labs".to_string()];
+        let concepts: Vec<Concept> = topics
+            .into_iter()
+            .take(MAX_CONCEPTS)
+            .map(|(name, _)| Concept {
+                name,
+                summary: String::new(),
+                sections: titles.clone(),
+            })
+            .collect();
+        assert_eq!(concepts.len(), 3);
+        assert_eq!(concepts[0].name, "quantum");
+    }
+}
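How the stage slots into a pipeline, given its `depends_on` contract (a sketch: `ReasoningIndexStage::new()` is an assumed constructor for the stage this patch only mentions by name):

```rust,ignore
let orchestrator = PipelineOrchestrator::new()
    .stage(ParseStage::new())
    .stage(BuildStage::new())
    .stage(ReasoningIndexStage::new())                       // provides "reasoning_index"
    .stage(ConceptExtractionStage::with_llm_client(client)); // optional stage

// concept_extraction reads the tree and writes ctx.concepts, so the
// orchestrator is free to schedule it alongside other reader stages.
```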
diff --git a/vectorless-core/vectorless-index/src/stages/enhance.rs b/vectorless-core/vectorless-index/src/stages/enhance.rs
new file mode 100644
index 00000000..f6af051f
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/stages/enhance.rs
@@ -0,0 +1,449 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Enhance stage - Generate summaries using LLM.
+
+use super::async_trait;
+use futures::StreamExt;
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+use tracing::{debug, info, warn};
+
+use vectorless_document::NodeId;
+use vectorless_error::Result;
+use crate::index::incremental;
+use vectorless_llm::LlmClient;
+use vectorless_llm::memo::{MemoKey, MemoStore};
+use vectorless_utils::fingerprint::Fingerprint;
+
+use super::{IndexStage, StageResult};
+use crate::index::pipeline::{FailurePolicy, IndexContext, StageRetryConfig};
+use crate::index::summary::{LlmSummaryGenerator, SummaryGenerator, SummaryStrategy};
+
+/// A node that needs LLM summary generation.
+struct PendingNode {
+    node_id: NodeId,
+    title: String,
+    content: String,
+    is_leaf: bool,
+}
+
+/// Enhance stage - generates summaries using LLM.
+pub struct EnhanceStage {
+    /// LLM client for summary generation.
+    llm_client: Option<Arc<LlmClient>>,
+    /// Memo store for caching LLM results.
+    memo_store: Option<Arc<MemoStore>>,
+}
+
+impl EnhanceStage {
+    /// Create a new enhance stage.
+    pub fn new() -> Self {
+        Self {
+            llm_client: None,
+            memo_store: None,
+        }
+    }
+
+    /// Create with LLM client.
+    pub fn with_llm_client(client: LlmClient) -> Self {
+        Self {
+            llm_client: Some(Arc::new(client)),
+            memo_store: None,
+        }
+    }
+
+    /// Create with LLM client and memo store.
+    pub fn with_llm_and_memo(client: LlmClient, memo_store: MemoStore) -> Self {
+        Self {
+            llm_client: Some(Arc::new(client)),
+            memo_store: Some(Arc::new(memo_store)),
+        }
+    }
+
+    /// Set memo store for caching.
+    pub fn with_memo_store(mut self, store: MemoStore) -> Self {
+        self.memo_store = Some(Arc::new(store));
+        self
+    }
+
+    /// Parse structured navigation response from LLM.
+    ///
+    /// Expected format:
+    /// ```text
+    /// OVERVIEW: <one-sentence overview>
+    /// QUESTIONS: q1, q2, q3
+    /// TAGS: tag1, tag2, tag3
+    /// ```
+    ///
+    /// Falls back gracefully: if markers are missing, the entire response
+    /// becomes the overview and questions/tags remain empty.
+    fn parse_structured_nav_response(response: &str) -> (String, Vec<String>, Vec<String>) {
+        let mut overview = String::new();
+        let mut questions: Vec<String> = Vec::new();
+        let mut tags: Vec<String> = Vec::new();
+
+        for line in response.lines() {
+            let line = line.trim();
+            if let Some(rest) = line.strip_prefix("OVERVIEW:") {
+                overview = rest.trim().to_string();
+            } else if let Some(rest) = line.strip_prefix("QUESTIONS:") {
+                questions = rest
+                    .split(',')
+                    .map(|s| s.trim().to_string())
+                    .filter(|s| !s.is_empty())
+                    .collect();
+            } else if let Some(rest) = line.strip_prefix("TAGS:") {
+                tags = rest
+                    .split(',')
+                    .map(|s| s.trim().to_string())
+                    .filter(|s| !s.is_empty())
+                    .collect();
+            }
+        }
+
+        // Fallback: if no OVERVIEW marker found, use entire response as overview
+        if overview.is_empty() {
+            overview = response.trim().to_string();
+        }
+
+        (overview, questions, tags)
+    }
+
+    /// Check if summary generation is needed based on strategy.
+    fn needs_summaries(&self, ctx: &IndexContext) -> bool {
+        match &ctx.options.summary_strategy {
+            SummaryStrategy::None => false,
+            SummaryStrategy::Lazy { .. } => false, // Generated on-demand at query time
+            SummaryStrategy::Full { .. } | SummaryStrategy::Selective { .. } => true,
+        }
+    }
+}
+
+impl Default for EnhanceStage {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl IndexStage for EnhanceStage {
+    fn name(&self) -> &'static str {
+        "enhance"
+    }
+
+    fn is_optional(&self) -> bool {
+        true
+    }
+
+    fn depends_on(&self) -> Vec<&'static str> {
+        vec!["build"]
+    }
+
+    fn failure_policy(&self) -> FailurePolicy {
+        // LLM operations benefit from retry with backoff
+        FailurePolicy::retry_with(
+            StageRetryConfig::new()
+                .with_max_attempts(2)
+                .with_initial_delay(Duration::from_millis(500)),
+        )
+    }
+
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+        let start = Instant::now();
+
+        info!(
+            "[enhance] Starting: llm_client={}, strategy={:?}",
+            self.llm_client.is_some(),
+            ctx.options.summary_strategy
+        );
+
+        // Check if we need summaries
+        if !self.needs_summaries(ctx) {
+            info!(
+                "[enhance] Skipped: strategy={:?}",
+                ctx.options.summary_strategy
+            );
+            return Ok(StageResult::success("enhance"));
+        }
+
+        // Get LLM client
+        let llm_client = match &self.llm_client {
+            Some(client) => client,
+            None => {
+                warn!("[enhance] No LLM client, skipping summary generation");
+                return Ok(StageResult::success("enhance"));
+            }
+        };
+
+        // Get tree
+        let tree = match ctx.tree.as_mut() {
+            Some(t) => t,
+            None => {
+                warn!("[enhance] No tree built, skipping");
+                return Ok(StageResult::success("enhance"));
+            }
+        };
+
+        // Create summary generator (shared via Arc for concurrent use)
+        let generator = Arc::new(
+            LlmSummaryGenerator::new((*llm_client).as_ref().clone())
+                .with_max_tokens(ctx.options.indexer.max_summary_tokens)
+                .with_memo_store(
+                    self.memo_store
+                        .as_ref()
+                        .map(|s| (**s).clone())
+                        .unwrap_or_default(),
+                ),
+        );
+
+        // Get all nodes to process
+        let node_ids: Vec<NodeId> = tree.traverse();
+        let total_nodes = node_ids.len();
+
+        // === Incremental: reuse summaries from existing tree for unchanged nodes ===
+        if let Some(ref old_tree) = ctx.existing_tree {
+            let reusable = incremental::compute_reusable_summaries(old_tree, tree);
+            let applied = incremental::apply_reusable_summaries(tree, &reusable);
+            for _ in 0..applied {
+                ctx.metrics.increment_summaries();
+            }
+            info!(
+                "[enhance] Incremental: {} of {} nodes unchanged, reusing summaries",
+                applied, total_nodes,
+            );
+        }
+
+        info!(
+            "[enhance] Processing {} nodes for summary generation",
+            total_nodes
+        );
+
+        // === Phase 1: Collect pending nodes (cache hits applied immediately) ===
+        let strategy = ctx.options.summary_strategy.clone();
+        let mut pending_llm: Vec<PendingNode> = Vec::new();
+        let mut generated = 0;
+        let mut skipped_no_content = 0;
+        let mut skipped_tokens = 0;
+        let mut shortcut_used = 0;
+        let shortcut_threshold = strategy.shortcut_threshold();
+
+        for node_id in node_ids {
+            let node = match tree.get(node_id) {
+                Some(n) => n.clone(),
+                None => continue,
+            };
+
+            // Skip if no content
+            if node.content.is_empty() {
+                skipped_no_content += 1;
+                continue;
+            }
+
+            // Skip if summary already set (incremental: reused from old tree)
+            if !node.summary.is_empty() {
+                continue;
+            }
+
+            // Check if strategy says we should generate
+            let token_count = node.token_count.unwrap_or(0);
+            if !strategy.should_generate(tree, node_id, token_count) {
+                skipped_tokens += 1;
+                continue;
+            }
+
+            // Check memo store (fast path — apply immediately)
+            if let Some(store) = self.memo_store.as_deref() {
+                let content_fp = Fingerprint::from_str(&format!("{}|{}", node.title, node.content));
+                let memo_key = MemoKey::summary(&content_fp);
+                if let Some(cached) = store
+                    .get(&memo_key)
+                    .and_then(|c| c.as_summary().map(|s| s.to_string()))
+                {
+                    if !cached.is_empty() {
+                        tree.set_summary(node_id, &cached);
+                        debug!(
+                            "[enhance] Cache hit: '{}' ({} chars)",
+                            node.title,
+                            cached.len()
+                        );
+                        ctx.metrics.increment_summaries();
+                        generated += 1;
+                        continue;
+                    }
+                }
+            }
+
+            // Shortcut: use original content as summary for short nodes
+            let token_count = node
+                .token_count
+                .unwrap_or_else(|| vectorless_utils::estimate_tokens(&node.content));
+            if shortcut_threshold > 0 && token_count > 0 && token_count <= shortcut_threshold {
+                tree.set_summary(node_id, &node.content);
+                debug!(
+                    "[enhance] Shortcut: '{}' ({} tokens, using original content)",
+                    node.title, token_count
+                );
+                ctx.metrics.increment_summaries();
+                generated += 1;
+                shortcut_used += 1;
+                continue;
+            }
+
+            // Needs LLM call
+            let is_leaf = tree.is_leaf(node_id);
+            pending_llm.push(PendingNode {
+                node_id,
+                title: node.title,
+                content: node.content,
+                is_leaf,
+            });
+        }
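The cache key used in Phase 1, spelled out (a sketch using only the calls that appear above; obtaining `store` and `node` is assumed):

```rust,ignore
// Same title + content => same fingerprint => the summary is reused across
// runs without an LLM call.
let content_fp = Fingerprint::from_str(&format!("{}|{}", node.title, node.content));
let memo_key = MemoKey::summary(&content_fp);

match store.get(&memo_key).and_then(|c| c.as_summary().map(|s| s.to_string())) {
    Some(summary) if !summary.is_empty() => { /* cache hit: write to tree */ }
    _ => { /* miss: node joins pending_llm (unless the shortcut applies) */ }
}
```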
+
+        // === Phase 2: Concurrent LLM calls with buffer_unordered ===
+        let mut failed = 0;
+        let concurrency = ctx.options.concurrency.max_concurrent_requests;
+
+        if !pending_llm.is_empty() {
+            info!(
+                "[enhance] Generating summaries for {} nodes (concurrency: {})",
+                pending_llm.len(),
+                concurrency
+            );
+
+            // Collect results: (NodeId, is_leaf, Result<String, String>)
+            let results: Vec<(NodeId, bool, std::result::Result<String, String>)> =
+                futures::stream::iter(pending_llm)
+                    .map(|pending| {
+                        let generator = Arc::clone(&generator);
+                        async move {
+                            let result = generator
+                                .generate_for_node(
+                                    &pending.title,
+                                    &pending.content,
+                                    pending.is_leaf,
+                                )
+                                .await;
+                            (
+                                pending.node_id,
+                                pending.is_leaf,
+                                result.map_err(|e| e.to_string()),
+                            )
+                        }
+                    })
+                    .buffer_unordered(concurrency)
+                    .collect()
+                    .await;
+
+            // Write results back to tree
+            for (node_id, is_leaf, result) in results {
+                ctx.metrics.increment_llm_calls();
+                match result {
+                    Ok(response) => {
+                        if response.is_empty() {
+                            failed += 1;
+                        } else {
+                            ctx.metrics
+                                .add_tokens_generated(vectorless_utils::estimate_tokens(&response));
+
+                            if is_leaf {
+                                // Leaf node: response is a plain content summary
+                                tree.set_summary(node_id, &response);
+                            } else {
+                                // Non-leaf node: response is structured (OVERVIEW/QUESTIONS/TAGS)
+                                let (overview, questions, tags) =
+                                    Self::parse_structured_nav_response(&response);
+                                tree.set_summary(node_id, &overview);
+
+                                if let Some(node) = tree.get_mut(node_id) {
+                                    node.question_hints = questions;
+                                    node.routing_keywords = tags;
+                                }
+                            }
+                            generated += 1;
+                            ctx.metrics.increment_summaries();
+                        }
+                    }
+                    Err(e) => {
+                        warn!("[enhance] LLM summary failed: {}", e);
+                        failed += 1;
+                    }
+                }
+            }
+        }
+
+        let duration = start.elapsed().as_millis() as u64;
+        ctx.metrics.record_enhance(duration);
+        if failed > 0 {
+            ctx.metrics.add_summaries_failed(failed);
+        }
+
+        info!(
+            "[enhance] Complete: {} summaries ({} shortcut, {} failed, {} no-content, {} skipped-tokens) in {}ms",
+            generated, shortcut_used, failed, skipped_no_content, skipped_tokens, duration
+        );
+
+        let mut stage_result = StageResult::success("enhance");
+        stage_result.duration_ms = duration;
+        stage_result.metadata.insert(
+            "summaries_generated".to_string(),
+            serde_json::json!(generated),
+        );
+        stage_result
+            .metadata
+            .insert("summaries_failed".to_string(), serde_json::json!(failed));
+
+        Ok(stage_result)
+    }
+}
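The Phase 2 fan-out pattern in isolation, as a runnable program (assumes only the `futures` crate and `tokio` with the `rt`, `macros`, and `time` features; the sleep stands in for the LLM call):

```rust
use futures::StreamExt;

#[tokio::main]
async fn main() {
    let pending = vec!["Intro", "Usage", "FAQ"];
    let concurrency = 2;

    // Mirror the enhance stage: map each item to a future, run at most
    // `concurrency` of them at once, and collect results as they complete.
    let results: Vec<(usize, String)> = futures::stream::iter(pending.into_iter().enumerate())
        .map(|(i, title)| async move {
            // Stand-in for generator.generate_for_node(...)
            tokio::time::sleep(std::time::Duration::from_millis(10)).await;
            (i, format!("summary of {title}"))
        })
        .buffer_unordered(concurrency)
        .collect()
        .await;

    // Completion order is not submission order, which is why the real code
    // threads NodeId through the result tuple instead of relying on position.
    assert_eq!(results.len(), 3);
}
```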
+QUESTIONS: How to set up payments?, What currencies are supported?, How to configure invoices?
+TAGS: payments, billing, invoices, currency";
+
+        let (overview, questions, tags) = EnhanceStage::parse_structured_nav_response(response);
+
+        assert!(overview.contains("payment integration"));
+        assert_eq!(questions.len(), 3);
+        assert!(questions[0].contains("set up payments"));
+        assert_eq!(tags.len(), 4);
+        assert_eq!(tags[0], "payments");
+    }
+
+    #[test]
+    fn test_parse_structured_nav_response_partial() {
+        // Only overview, no questions or tags
+        let response = "OVERVIEW: A general introduction to the system.";
+        let (overview, questions, tags) = EnhanceStage::parse_structured_nav_response(response);
+
+        assert!(overview.contains("general introduction"));
+        assert!(questions.is_empty());
+        assert!(tags.is_empty());
+    }
+
+    #[test]
+    fn test_parse_structured_nav_response_fallback() {
+        // No markers at all — fallback to entire response as overview
+        let response = "This is just a plain summary without any markers.";
+        let (overview, questions, tags) = EnhanceStage::parse_structured_nav_response(response);
+
+        assert_eq!(overview, response.trim());
+        assert!(questions.is_empty());
+        assert!(tags.is_empty());
+    }
+
+    #[test]
+    fn test_parse_structured_nav_response_empty() {
+        let (overview, questions, tags) = EnhanceStage::parse_structured_nav_response("");
+        assert!(overview.is_empty());
+        assert!(questions.is_empty());
+        assert!(tags.is_empty());
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/stages/enrich.rs b/vectorless-core/vectorless-index/src/stages/enrich.rs
new file mode 100644
index 00000000..8b743ea8
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/stages/enrich.rs
@@ -0,0 +1,240 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Enrich stage - Add metadata to the tree.
+
+use super::async_trait;
+use std::time::Instant;
+use tracing::{debug, info};
+
+use vectorless_document::{DocumentTree, NodeId, ReferenceExtractor, TocView};
+use vectorless_error::Result;
+
+use super::{AccessPattern, IndexStage, StageResult};
+use crate::index::pipeline::IndexContext;
+
+/// Enrich stage - adds metadata to the tree.
+pub struct EnrichStage;
+
+impl EnrichStage {
+    /// Create a new enrich stage.
+    pub fn new() -> Self {
+        Self
+    }
+
+    /// Calculate page ranges for all nodes.
+    fn calculate_page_ranges(tree: &mut DocumentTree) {
+        // Propagate page ranges up the tree
+        Self::propagate_page_ranges(tree, tree.root());
+    }
+
+    /// Recursively propagate page ranges from children to parent.
+    fn propagate_page_ranges(tree: &mut DocumentTree, node_id: NodeId) {
+        let children = tree.children(node_id);
+
+        if children.is_empty() {
+            return;
+        }
+
+        // First, propagate to all children
+        for child_id in &children {
+            Self::propagate_page_ranges(tree, *child_id);
+        }
+
+        // Then calculate this node's range from children
+        let mut min_page: Option<usize> = None;
+        let mut max_page: Option<usize> = None;
+
+        for child_id in &children {
+            if let Some(child) = tree.get(*child_id) {
+                if let Some(start) = child.start_page {
+                    min_page = Some(min_page.map_or(start, |m| m.min(start)));
+                }
+                if let Some(end) = child.end_page {
+                    max_page = Some(max_page.map_or(end, |m| m.max(end)));
+                }
+            }
+        }
+
+        // Update this node's page range
+        if let (Some(min), Some(max)) = (min_page, max_page) {
+            tree.set_page_boundaries(node_id, min, max);
+        }
+    }
+
+    /// Calculate token statistics.
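+    ///
+    /// Returns `(total_tokens, node_count)`; nodes without a token count
+    /// contribute zero via `token_count.unwrap_or(0)`.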
+    fn calculate_token_stats(tree: &DocumentTree) -> (usize, usize) {
+        let mut total_tokens = 0;
+        let mut node_count = 0;
+
+        for node_id in tree.traverse() {
+            if let Some(node) = tree.get(node_id) {
+                total_tokens += node.token_count.unwrap_or(0);
+                node_count += 1;
+            }
+        }
+
+        (total_tokens, node_count)
+    }
+
+    /// Generate document description from root summary.
+    fn generate_description(&self, ctx: &mut IndexContext) {
+        if !ctx.options.generate_description {
+            return;
+        }
+
+        // Use root summary if available
+        if let Some(tree) = &ctx.tree {
+            if let Some(root) = tree.get(tree.root()) {
+                if !root.summary.is_empty() {
+                    ctx.description = Some(root.summary.clone());
+                    debug!("[enrich] Using root summary as document description");
+                }
+            }
+        }
+    }
+
+    /// Extract and resolve in-document cross-references for all nodes.
+    ///
+    /// Parses content for patterns like "see Section 2.1", "Appendix G", etc.
+    /// and resolves them to actual `NodeId`s in the tree using the retrieval
+    /// index for fast lookup.
+    fn resolve_references(tree: &mut DocumentTree) -> usize {
+        let retrieval_index = tree.build_retrieval_index();
+        let node_ids: Vec<NodeId> = tree.traverse().into_iter().collect();
+        let mut total_resolved = 0;
+
+        for node_id in node_ids {
+            let content = tree
+                .get(node_id)
+                .map(|n| n.content.clone())
+                .unwrap_or_default();
+            if content.is_empty() {
+                continue;
+            }
+
+            // Quick check: skip nodes without any reference-like patterns
+            let content_lower = content.to_lowercase();
+            let has_ref_pattern = content_lower.contains("section")
+                || content_lower.contains("appendix")
+                || content_lower.contains("table")
+                || content_lower.contains("figure")
+                || content_lower.contains("page")
+                || content_lower.contains("equation");
+
+            if !has_ref_pattern {
+                continue;
+            }
+
+            let refs = ReferenceExtractor::extract_and_resolve(&content, tree, &retrieval_index);
+            let resolved = refs.iter().filter(|r| r.is_resolved()).count();
+            if resolved > 0 {
+                total_resolved += resolved;
+            }
+            tree.set_references(node_id, refs);
+        }
+
+        total_resolved
+    }
+}
+
+impl Default for EnrichStage {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl IndexStage for EnrichStage {
+    fn name(&self) -> &'static str {
+        "enrich"
+    }
+
+    fn depends_on(&self) -> Vec<&'static str> {
+        vec!["build"]
+    }
+
+    fn access_pattern(&self) -> AccessPattern {
+        AccessPattern {
+            reads_tree: true,
+            writes_tree: true, // sets page_boundaries
+            writes_description: true,
+            ..Default::default()
+        }
+    }
+
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+        let start = Instant::now();
+
+        let tree = ctx
+            .tree
+            .as_mut()
+            .ok_or_else(|| vectorless_error::Error::IndexBuild("Tree not built".to_string()))?;
+
+        let node_count = tree.node_count();
+        info!("[enrich] Starting: {} nodes", node_count);
+
+        // 1. Calculate page ranges
+        Self::calculate_page_ranges(tree);
+        debug!("[enrich] Calculated page ranges");
+
+        // 2. Generate ToC view (cached in context)
+        let toc_view = TocView::new();
+        let toc = toc_view.generate(tree);
+        let _toc_markdown = toc_view.format_markdown(&toc);
+        debug!("[enrich] Generated ToC ({} children)", toc.children.len());
+
+        // 3. Calculate token statistics
+        let (total_tokens, stat_node_count) = Self::calculate_token_stats(tree);
+        debug!(
+            "[enrich] Token stats: {} total tokens across {} nodes",
+            total_tokens, stat_node_count
+        );
+
+        // 4. Extract and resolve cross-references
+        let resolved_refs = Self::resolve_references(tree);
+        if resolved_refs > 0 {
+            info!("[enrich] Resolved {} cross-references", resolved_refs);
+        }
+
+        // 5. Generate document description
+        self.generate_description(ctx);
+
+        let duration = start.elapsed().as_millis() as u64;
+        ctx.metrics.record_enrich(duration);
+
+        info!(
+            "[enrich] Complete: {} tokens, {} refs resolved in {}ms",
+            total_tokens, resolved_refs, duration
+        );
+
+        let mut stage_result = StageResult::success("enrich");
+        stage_result.duration_ms = duration;
+        stage_result
+            .metadata
+            .insert("total_tokens".to_string(), serde_json::json!(total_tokens));
+        stage_result
+            .metadata
+            .insert("node_count".to_string(), serde_json::json!(node_count));
+        stage_result.metadata.insert(
+            "resolved_references".to_string(),
+            serde_json::json!(resolved_refs),
+        );
+
+        Ok(stage_result)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_resolve_references_no_refs() {
+        let mut tree = DocumentTree::new("Root", "root content");
+        tree.add_child(tree.root(), "Section 1", "No references here.");
+
+        let resolved = EnrichStage::resolve_references(&mut tree);
+        assert_eq!(resolved, 0);
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/stages/mod.rs b/vectorless-core/vectorless-index/src/stages/mod.rs
new file mode 100644
index 00000000..3d8da297
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/stages/mod.rs
@@ -0,0 +1,141 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Index pipeline stages.
+
+mod build;
+mod concept;
+mod enhance;
+mod enrich;
+mod navigation;
+mod optimize;
+mod parse;
+mod reasoning;
+mod split;
+mod validate;
+mod verify_ingest;
+
+pub use build::BuildStage;
+pub use concept::ConceptExtractionStage;
+pub use enhance::EnhanceStage;
+pub use enrich::EnrichStage;
+pub use navigation::NavigationIndexStage;
+pub use optimize::OptimizeStage;
+pub use parse::ParseStage;
+pub use reasoning::ReasoningIndexStage;
+pub use split::SplitStage;
+pub use validate::ValidateStage;
+pub use verify_ingest::VerifyStage;
+
+use super::pipeline::{FailurePolicy, IndexContext, StageResult};
+use vectorless_error::Result;
+pub use async_trait::async_trait;
+
+/// Declares which context fields a stage reads/writes.
+/// Used by the orchestrator to determine safe parallel execution.
+#[derive(Debug, Clone, Default)]
+pub struct AccessPattern {
+    /// Whether this stage reads the tree.
+    pub reads_tree: bool,
+    /// Whether this stage mutates the tree (summaries, structure, etc.).
+    pub writes_tree: bool,
+    /// Whether this stage writes to `reasoning_index`.
+    pub writes_reasoning_index: bool,
+    /// Whether this stage writes to `navigation_index`.
+    pub writes_navigation_index: bool,
+    /// Whether this stage writes to `description`.
+    pub writes_description: bool,
+    /// Whether this stage writes to `concepts`.
+    pub writes_concepts: bool,
+}
+
+/// Index pipeline stage.
+///
+/// Each stage represents a discrete step in the document indexing process.
+/// Stages are executed in dependency order by the [`PipelineOrchestrator`].
+///
+/// # Stage Lifecycle
+///
+/// 1. Stage is registered with the orchestrator
+/// 2. Dependencies are resolved and execution order is determined
+/// 3. `execute()` is called with the shared context
+/// 4. Results are stored in `ctx.stage_results`
+///
+/// # Example
+///
+/// ```rust,ignore
+/// struct MyStage;
+///
+/// #[async_trait]
+/// impl IndexStage for MyStage {
+///     fn name(&self) -> &str { "my_stage" }
+///
+///     fn depends_on(&self) -> Vec<&'static str> {
+///         vec!["parse", "build"]
+///     }
+///
+///     async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+///         // Process the context...
+///         Ok(StageResult::success("my_stage"))
+///     }
+/// }
+/// ```
+#[async_trait]
+pub trait IndexStage: Send + Sync {
+    /// Stage name (must be unique within pipeline).
+    fn name(&self) -> &str;
+
+    /// Execute the stage.
+    ///
+    /// This method receives a mutable reference to the shared context,
+    /// allowing stages to read from and write to it.
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult>;
+
+    /// Whether this stage is optional (can be skipped on failure).
+    ///
+    /// Optional stages that fail will not stop the pipeline.
+    /// Default: `false`
+    fn is_optional(&self) -> bool {
+        false
+    }
+
+    /// Names of stages this stage depends on.
+    ///
+    /// Dependencies are validated during pipeline construction.
+    /// A stage will only execute after all its dependencies have completed.
+    ///
+    /// # Example
+    ///
+    /// ```rust,ignore
+    /// fn depends_on(&self) -> Vec<&'static str> {
+    ///     vec!["parse", "build"]
+    /// }
+    /// ```
+    fn depends_on(&self) -> Vec<&'static str> {
+        Vec::new()
+    }
+
+    /// Failure policy for this stage.
+    ///
+    /// Determines how the pipeline handles failures in this stage:
+    /// - `Fail`: Stop the entire pipeline (default for required stages)
+    /// - `Skip`: Skip this stage, continue pipeline
+    /// - `Retry`: Retry with exponential backoff
+    ///
+    /// Default behavior:
+    /// - If `is_optional()` returns true, defaults to `FailurePolicy::Skip`
+    /// - Otherwise, defaults to `FailurePolicy::Fail`
+    fn failure_policy(&self) -> FailurePolicy {
+        if self.is_optional() {
+            FailurePolicy::skip()
+        } else {
+            FailurePolicy::fail()
+        }
+    }
+
+    /// Declare which context fields this stage accesses.
+    /// Used by the orchestrator for safe parallel execution.
+    fn access_pattern(&self) -> AccessPattern {
+        AccessPattern::default()
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/stages/navigation.rs b/vectorless-core/vectorless-index/src/stages/navigation.rs
new file mode 100644
index 00000000..57abc926
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/stages/navigation.rs
@@ -0,0 +1,563 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Navigation Index Stage — Build the Agent navigation index from the document tree.
+//!
+//! This stage runs after EnrichStage and ReasoningIndexStage. It reads the
+//! enhanced TreeNode fields (summary, description, routing_keywords, leaf_count)
+//! and builds a [`NavigationIndex`] containing compact [`NavEntry`] and
+//! [`ChildRoute`] records for every non-leaf node.
+//!
+//! # No LLM Calls
+//!
+//! This stage performs pure data organization. All LLM-generated content
+//! (summaries, descriptions, keywords) is already on the tree from the
+//! Enhance stage. This stage only reads and restructures that data.
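+//!
+//! # Example
+//!
+//! A minimal sketch of driving the stage by hand (the context construction
+//! mirrors the tests below; orchestrator wiring is assumed):
+//!
+//! ```rust,ignore
+//! let mut stage = NavigationIndexStage::new();
+//! assert_eq!(stage.depends_on(), vec!["enrich"]);
+//!
+//! // ctx.tree must already hold the enriched DocumentTree.
+//! let result = stage.execute(&mut ctx).await?;
+//! assert!(ctx.navigation_index.is_some());
+//! ```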
+ +use std::time::Instant; +use tracing::{debug, info, warn}; + +use vectorless_document::{ChildRoute, DocumentTree, NavEntry, NavigationIndex, NodeId}; +use vectorless_error::Result; + +use super::async_trait; +use super::{AccessPattern, IndexStage, StageResult}; +use crate::index::pipeline::IndexContext; + +/// Navigation Index Stage — builds the Agent navigation index. +/// +/// For every non-leaf node in the tree, this stage creates: +/// - A [`NavEntry`] with overview, question hints, topic tags, leaf count, and level. +/// - A list of [`ChildRoute`] entries, one per child, with title, description, and leaf count. +/// +/// The resulting [`NavigationIndex`] is stored in `ctx.navigation_index` and +/// serialized as part of [`PersistedDocument`](vectorless_storage::persistence::PersistedDocument). +pub struct NavigationIndexStage; + +impl NavigationIndexStage { + /// Create a new navigation index stage. + pub fn new() -> Self { + Self + } + + /// Count the number of leaf nodes in a subtree rooted at `node_id`. + fn count_leaves(tree: &DocumentTree, node_id: NodeId) -> usize { + if tree.is_leaf(node_id) { + return 1; + } + let mut count = 0; + let mut stack = vec![node_id]; + while let Some(id) = stack.pop() { + if tree.is_leaf(id) { + count += 1; + } else { + for child in tree.children_iter(id) { + stack.push(child); + } + } + } + count + } + + /// Build a NavEntry for a non-leaf node. + fn build_nav_entry(tree: &DocumentTree, node_id: NodeId, leaf_count: usize) -> NavEntry { + let node = match tree.get(node_id) { + Some(n) => n, + None => { + return NavEntry { + overview: String::new(), + question_hints: Vec::new(), + topic_tags: Vec::new(), + leaf_count: 0, + level: 0, + }; + } + }; + + // Overview: use summary if available, otherwise title + let overview = if !node.summary.is_empty() { + node.summary.clone() + } else { + node.title.clone() + }; + + NavEntry { + overview, + question_hints: node.question_hints.clone(), + topic_tags: node.routing_keywords.clone(), + leaf_count, + level: node.depth, + } + } + + /// Build a ChildRoute for a single child node. 
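+    ///
+    /// Description fallback order: node summary, then the first 100 chars
+    /// of content, then the node title.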
+    fn build_child_route(tree: &DocumentTree, child_id: NodeId, leaf_count: usize) -> ChildRoute {
+        let node = tree.get(child_id);
+        let title = node.map(|n| n.title.clone()).unwrap_or_default();
+        let description = node
+            .and_then(|n| {
+                // Use summary as description if available; otherwise fall back
+                // to truncated content, then the title
+                if !n.summary.is_empty() {
+                    Some(n.summary.clone())
+                } else if !n.content.is_empty() {
+                    // Truncate content as fallback description
+                    let s: String = n.content.chars().take(100).collect();
+                    Some(s)
+                } else {
+                    None
+                }
+            })
+            .unwrap_or_else(|| title.clone());
+
+        ChildRoute {
+            node_id: child_id,
+            title,
+            description,
+            leaf_count,
+        }
+    }
+}
+
+impl Default for NavigationIndexStage {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl IndexStage for NavigationIndexStage {
+    fn name(&self) -> &'static str {
+        "navigation_index"
+    }
+
+    fn depends_on(&self) -> Vec<&'static str> {
+        vec!["enrich"]
+    }
+
+    fn is_optional(&self) -> bool {
+        true
+    }
+
+    fn access_pattern(&self) -> AccessPattern {
+        AccessPattern {
+            reads_tree: true,
+            writes_navigation_index: true,
+            ..Default::default()
+        }
+    }
+
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+        let start = Instant::now();
+
+        let tree = match ctx.tree.as_ref() {
+            Some(t) => t,
+            None => {
+                warn!("[navigation_index] No tree, cannot build index");
+                return Ok(StageResult::failure("navigation_index", "Tree not built"));
+            }
+        };
+
+        let all_nodes = tree.traverse();
+        let leaf_count = all_nodes.iter().filter(|&&id| tree.is_leaf(id)).count();
+        let non_leaf_count = all_nodes.len() - leaf_count;
+
+        info!(
+            "[navigation_index] Starting: {} total nodes ({} leaves, {} non-leaf)",
+            all_nodes.len(),
+            leaf_count,
+            non_leaf_count,
+        );
+
+        let mut nav_entries_count = 0usize;
+        let mut child_routes_count = 0usize;
+
+        // Phase 1: Pre-compute leaf counts for all nodes.
+        // We compute once per node to avoid repeated traversals.
+        debug!(
+            "[navigation_index] Phase 1: Pre-computing leaf counts for {} nodes",
+            all_nodes.len()
+        );
+        let mut leaf_counts: std::collections::HashMap<NodeId, usize> =
+            std::collections::HashMap::with_capacity(all_nodes.len());
+        for &node_id in &all_nodes {
+            leaf_counts.insert(node_id, Self::count_leaves(tree, node_id));
+        }
+
+        // Phase 2: Build NavEntry + ChildRoutes for each non-leaf node.
+        debug!(
+            "[navigation_index] Phase 2: Building NavEntry + ChildRoutes for {} non-leaf nodes",
+            non_leaf_count
+        );
+        let mut nav_index = NavigationIndex::new();
+
+        for &node_id in &all_nodes {
+            // Skip leaf nodes — they have no children to navigate to
+            if tree.is_leaf(node_id) {
+                continue;
+            }
+
+            let lc = *leaf_counts.get(&node_id).unwrap_or(&0);
+
+            // Build navigation entry for this non-leaf node
+            let nav_entry = Self::build_nav_entry(tree, node_id, lc);
+            nav_index.add_entry(node_id, nav_entry);
+            nav_entries_count += 1;
+
+            // Build child routes for this node's children
+            let child_ids: Vec<NodeId> = tree.children_iter(node_id).collect();
+            let mut routes = Vec::with_capacity(child_ids.len());
+
+            for child_id in child_ids {
+                let child_lc = *leaf_counts.get(&child_id).unwrap_or(&0);
+                let route = Self::build_child_route(tree, child_id, child_lc);
+                routes.push(route);
+                child_routes_count += 1;
+            }
+
+            debug!(
+                "[navigation_index] node '{}' → {} child routes ({} leaves in subtree)",
+                tree.get(node_id).map(|n| n.title.as_str()).unwrap_or("?"),
+                routes.len(),
+                lc,
+            );
+
+            nav_index.add_child_routes(node_id, routes);
+        }
+
+        // Phase 3: Build DocCard from root-level data (already computed, zero LLM).
+        // Provides a compact document summary for multi-document Orchestrator Agent.
+        if let Some(root_entry) = nav_index.get_entry(tree.root()) {
+            let sections: Vec<vectorless_document::SectionCard> = nav_index
+                .get_child_routes(tree.root())
+                .map(|routes| {
+                    routes
+                        .iter()
+                        .map(|r| vectorless_document::SectionCard {
+                            title: r.title.clone(),
+                            description: r.description.clone(),
+                            leaf_count: r.leaf_count,
+                        })
+                        .collect()
+                })
+                .unwrap_or_default();
+
+            let doc_card = vectorless_document::DocCard {
+                title: tree
+                    .get(tree.root())
+                    .map(|n| n.title.clone())
+                    .unwrap_or_default(),
+                overview: root_entry.overview.clone(),
+                question_hints: root_entry.question_hints.clone(),
+                topic_tags: root_entry.topic_tags.clone(),
+                sections,
+                total_leaves: root_entry.leaf_count,
+            };
+            nav_index.set_doc_card(doc_card);
+
+            debug!(
+                "[navigation_index] Phase 3: Built DocCard — {} sections, {} total leaves",
+                nav_index.doc_card().map(|c| c.sections.len()).unwrap_or(0),
+                nav_index.doc_card().map(|c| c.total_leaves).unwrap_or(0),
+            );
+        } else {
+            debug!("[navigation_index] Phase 3: Skipped DocCard (no root entry)");
+        }
+
+        let duration = start.elapsed().as_millis() as u64;
+
+        ctx.metrics
+            .record_navigation_index(duration, nav_entries_count, child_routes_count);
+
+        info!(
+            "[navigation_index] Complete: {} nav entries, {} child routes in {}ms",
+            nav_entries_count, child_routes_count, duration,
+        );
+
+        ctx.navigation_index = Some(nav_index);
+
+        let mut stage_result = StageResult::success("navigation_index");
+        stage_result.duration_ms = duration;
+        stage_result.metadata.insert(
+            "nav_entries".to_string(),
+            serde_json::json!(nav_entries_count),
+        );
+        stage_result.metadata.insert(
+            "child_routes".to_string(),
+            serde_json::json!(child_routes_count),
+        );
+
+        Ok(stage_result)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use vectorless_document::DocumentTree;
+
+    fn build_test_tree() -> DocumentTree {
+        let mut tree = DocumentTree::new("Root", "root content");
+        let root = tree.root();
+
+        let sec1 = tree.add_child(root, "Section 1", "section 1 content");
+        let _sec1_1 = tree.add_child(sec1, "Section 1.1", "s1.1 content");
+        let _sec1_2 = tree.add_child(sec1, "Section 1.2", "s1.2 content");
+
+        let sec2 = tree.add_child(root, "Section 2", "section 2 content");
+        let _sec2_1 = tree.add_child(sec2, "Section 2.1", "s2.1 content");
+
+        // Set some summaries
+        tree.set_summary(root, "A comprehensive guide");
+        tree.set_summary(sec1, "Getting started with setup");
+        tree.set_summary(sec2, "Advanced configuration");
+
+        tree
+    }
+
+    #[test]
+    fn test_count_leaves() {
+        let tree = build_test_tree();
+        let root = tree.root();
+
+        // Root has 3 leaves: 1.1, 1.2, 2.1
+        assert_eq!(NavigationIndexStage::count_leaves(&tree, root), 3);
+    }
+
+    #[test]
+    fn test_count_leaves_single_node() {
+        let tree = DocumentTree::new("Root", "content");
+        let root = tree.root();
+
+        assert_eq!(NavigationIndexStage::count_leaves(&tree, root), 1);
+    }
+
+    #[test]
+    fn test_build_nav_entry_with_summary() {
+        let tree = build_test_tree();
+        let root = tree.root();
+
+        let entry = NavigationIndexStage::build_nav_entry(&tree, root, 3);
+        assert_eq!(entry.overview, "A comprehensive guide");
+        assert_eq!(entry.leaf_count, 3);
+        assert_eq!(entry.level, 0);
+    }
+
+    #[test]
+    fn test_build_nav_entry_without_summary() {
+        let tree = DocumentTree::new("Root", "content");
+        let root = tree.root();
+
+        let entry = NavigationIndexStage::build_nav_entry(&tree, root, 1);
+        assert_eq!(entry.overview, "Root");
+    }
+
+    #[test]
+    fn test_build_child_route() {
+        let tree = build_test_tree();
+        let root = tree.root();
+        let children: Vec<_> = tree.children_iter(root).collect();
+
+        let route = NavigationIndexStage::build_child_route(&tree, children[0], 2);
+        assert_eq!(route.title, "Section 1");
+        assert_eq!(route.leaf_count, 2);
+    }
+
+    #[test]
+    fn test_stage_config() {
+        let stage = NavigationIndexStage::new();
+        assert_eq!(stage.name(), "navigation_index");
+        assert!(stage.is_optional());
+        assert_eq!(stage.depends_on(), vec!["enrich"]);
+
+        let ap = stage.access_pattern();
+        assert!(ap.reads_tree);
+        assert!(ap.writes_navigation_index);
+        assert!(!ap.writes_tree);
+        assert!(!ap.writes_reasoning_index);
+    }
+
+    #[tokio::test]
+    async fn test_execute_end_to_end() {
+        // Build a 3-level tree: Root -> [Sec1 -> [1.1, 1.2], Sec2 -> [2.1]]
+        let mut tree = DocumentTree::new("Root", "root content");
+        let root = tree.root();
+        let sec1 = tree.add_child(root, "Section 1", "s1 content");
+        let _sec1_1 = tree.add_child(sec1, "Section 1.1", "s1.1 content");
+        let _sec1_2 = tree.add_child(sec1, "Section 1.2", "s1.2 content");
+        let sec2 = tree.add_child(root, "Section 2", "s2 content");
+        let _sec2_1 = tree.add_child(sec2, "Section 2.1", "s2.1 content");
+
+        tree.set_summary(root, "A comprehensive guide");
+        tree.set_summary(sec1, "Getting started");
+
+        // Build context with the tree
+        let mut ctx = IndexContext::new(
+            crate::index::pipeline::IndexInput::content("test"),
+            crate::index::config::PipelineOptions::default(),
+        );
+        ctx.tree = Some(tree);
+
+        // Execute the stage
+        let mut stage = NavigationIndexStage::new();
+        let result = stage.execute(&mut ctx).await;
+
+        assert!(result.is_ok());
+        let stage_result = result.unwrap();
+        assert!(stage_result.success);
+        assert_eq!(
+            stage_result.metadata["nav_entries"],
+            serde_json::json!(3) // root, sec1, sec2
+        );
+        assert_eq!(
+            stage_result.metadata["child_routes"],
+            serde_json::json!(5) // root→2 + sec1→2 + sec2→1
+        );
+
+        // Verify the index structure
+        let nav_index = ctx.navigation_index.unwrap();
+        assert_eq!(nav_index.entry_count(), 3); // 3 non-leaf nodes
+        assert_eq!(nav_index.total_child_routes(), 5);
+
+        // Root entry
+        let root_id = ctx.tree.as_ref().unwrap().root();
+        let root_entry = nav_index.get_entry(root_id).unwrap();
+        assert_eq!(root_entry.overview, "A comprehensive guide");
+        assert_eq!(root_entry.leaf_count, 3);
+        assert_eq!(root_entry.level, 0);
+
+        // Root child routes
+        let root_routes = nav_index.get_child_routes(root_id).unwrap();
+        assert_eq!(root_routes.len(), 2);
+        assert_eq!(root_routes[0].title, "Section 1");
+        assert_eq!(root_routes[0].leaf_count, 2);
+        assert_eq!(root_routes[1].title, "Section 2");
+        assert_eq!(root_routes[1].leaf_count, 1);
+    }
+
+    #[tokio::test]
+    async fn test_execute_single_leaf_tree() {
+        // Single node = root is leaf → no non-leaf nodes → empty index
+        let tree = DocumentTree::new("Root", "content");
+
+        let mut ctx = IndexContext::new(
+            crate::index::pipeline::IndexInput::content("test"),
+            crate::index::config::PipelineOptions::default(),
+        );
+        ctx.tree = Some(tree);
+
+        let mut stage = NavigationIndexStage::new();
+        let result = stage.execute(&mut ctx).await;
+
+        assert!(result.is_ok());
+        assert!(stage_result_is_success(&result));
+
+        let nav_index = ctx.navigation_index.unwrap();
+        assert_eq!(nav_index.entry_count(), 0);
+        assert_eq!(nav_index.total_child_routes(), 0);
+    }
+
+    #[tokio::test]
+    async fn test_execute_no_tree() {
+        // ctx.tree stays None, so the stage should fail gracefully
+        let mut ctx = IndexContext::new(
+            crate::index::pipeline::IndexInput::content("test"),
+            crate::index::config::PipelineOptions::default(),
+        );
+
+        let mut stage = NavigationIndexStage::new();
+        let result = stage.execute(&mut ctx).await.unwrap();
+        assert!(!result.success);
+        assert!(ctx.navigation_index.is_none());
+    }
+
+    #[test]
+    fn test_build_child_route_no_summary_has_content() {
+        // Node with content but no summary → description = truncated content
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+        let child = tree.add_child(root, "Child", "this is a long content string that exceeds 100 characters and should be truncated when used as a fallback description for the child route");
+
+        let route = NavigationIndexStage::build_child_route(&tree, child, 1);
+        assert_eq!(route.title, "Child");
+        // description should be truncated content, not the full string
+        assert!(route.description.len() <= 100);
+        assert!(route.description.starts_with("this is a long"));
+    }
+
+    #[test]
+    fn test_build_child_route_no_summary_no_content() {
+        // Node with neither summary nor content → description = title
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+        let child = tree.add_child(root, "Orphan Section", "");
+        // Clear any auto-generated content
+        tree.set_summary(child, "");
+
+        let route = NavigationIndexStage::build_child_route(&tree, child, 1);
+        assert_eq!(route.title, "Orphan Section");
+        // Fallback: description = title when no summary and no content
+        assert_eq!(route.description, "Orphan Section");
+    }
+
+    #[test]
+    fn test_build_child_route_with_summary() {
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+        let child = tree.add_child(root, "Child", "some content");
+        tree.set_summary(child, "A concise summary");
+
+        let route = NavigationIndexStage::build_child_route(&tree, child, 1);
+        assert_eq!(route.description, "A concise summary");
+    }
+
+    #[test]
+    fn test_build_nav_entry_depth_tracking() {
+        // Verify that depth/level is correctly captured from the tree
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+        let sec1 = tree.add_child(root, "S1", "");
+        let sec1_1 = tree.add_child(sec1, "S1.1", "leaf");
+        tree.set_summary(root, "Root overview");
+        tree.set_summary(sec1, "Section overview");
+
+        let root_entry = NavigationIndexStage::build_nav_entry(&tree, root, 3);
+        assert_eq!(root_entry.level, 0);
+
+        let sec1_entry = NavigationIndexStage::build_nav_entry(&tree, sec1, 1);
+        assert_eq!(sec1_entry.level, 1);
+
+        // Leaf node should still return valid NavEntry if called
+        let leaf_entry = NavigationIndexStage::build_nav_entry(&tree, sec1_1, 1);
+        assert_eq!(leaf_entry.level, 2);
+        assert_eq!(leaf_entry.overview, "S1.1"); // no summary → fallback to title
+    }
+
+    #[test]
+    fn test_count_leaves_subtree() {
+        // Verify leaf count is correct for a subtree, not the entire tree
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+        let sec1 = tree.add_child(root, "S1", "");
+        let _s1a = tree.add_child(sec1, "S1.A", "leaf");
+        let _s1b = tree.add_child(sec1, "S1.B", "leaf");
+        let _s1c = tree.add_child(sec1, "S1.C", "leaf");
+        let sec2 = tree.add_child(root, "S2", "");
+        let _s2a = tree.add_child(sec2, "S2.A", "leaf");
+
+        // sec1 subtree has 3 leaves
+        assert_eq!(NavigationIndexStage::count_leaves(&tree, sec1), 3);
+        // sec2 subtree has 1 leaf
+        assert_eq!(NavigationIndexStage::count_leaves(&tree, sec2), 1);
+        // root has 4 leaves total
+        assert_eq!(NavigationIndexStage::count_leaves(&tree, root), 4);
+    }
+
+    /// Helper to check success without destructuring.
+    fn stage_result_is_success(result: &Result<StageResult>) -> bool {
+        result.as_ref().map(|r| r.success).unwrap_or(false)
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/stages/optimize.rs b/vectorless-core/vectorless-index/src/stages/optimize.rs
new file mode 100644
index 00000000..33244ddf
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/stages/optimize.rs
@@ -0,0 +1,455 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Optimize stage - Optimize tree structure.
+
+use super::{AccessPattern, async_trait};
+use std::time::Instant;
+use tracing::{debug, info};
+
+use vectorless_document::NodeId;
+use vectorless_error::Result;
+use crate::index::pipeline::IndexContext;
+
+use super::{IndexStage, StageResult};
+
+/// Optimize stage - optimizes tree structure.
+pub struct OptimizeStage;
+
+impl OptimizeStage {
+    /// Create a new optimize stage.
+    pub fn new() -> Self {
+        Self
+    }
+
+    /// Merge adjacent small leaf nodes that are siblings under the same parent.
+    ///
+    /// Only merges nodes that are both **leaves** (no children of their own).
+    /// Non-leaf nodes (section headings with subsections) are never merged,
+    /// even if their own content is empty.
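+    ///
+    /// # Example
+    ///
+    /// With `min_tokens = 100` (counts here are illustrative):
+    ///
+    /// ```text
+    /// Parent                     Parent
+    /// ├── A (50 tokens)          ├── A (80 tokens, content now ends with "## B …")
+    /// ├── B (30 tokens)    →     ├── [MERGED: B] (emptied)
+    /// └── C (200 tokens)         └── C (200 tokens, unchanged)
+    /// ```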
+    fn merge_small_leaves(
+        tree: &mut vectorless_document::DocumentTree,
+        min_tokens: usize,
+        metrics: &mut crate::index::IndexMetrics,
+    ) -> usize {
+        let mut merged_count = 0;
+
+        // Get all non-leaf nodes (parents whose children may be candidates)
+        let non_leaves: Vec<NodeId> = tree
+            .traverse()
+            .into_iter()
+            .filter(|id| !tree.is_leaf(*id))
+            .collect();
+
+        for parent_id in non_leaves {
+            let children = tree.children(parent_id);
+            if children.len() < 2 {
+                continue;
+            }
+
+            // Collect children info: only leaf nodes are merge candidates
+            let candidates: Vec<(NodeId, usize, bool)> = children
+                .iter()
+                .map(|&id| {
+                    let tokens = tree.get(id).and_then(|n| n.token_count).unwrap_or(0);
+                    let is_leaf = tree.is_leaf(id);
+                    (id, tokens, is_leaf)
+                })
+                .collect();
+
+            // Find pairs of adjacent small leaf siblings
+            let mut i = 0;
+            while i < candidates.len() - 1 {
+                let (curr_id, curr_tokens, curr_is_leaf) = candidates[i];
+                let (next_id, next_tokens, next_is_leaf) = candidates[i + 1];
+
+                // Both must be leaves with actual content, and both must be small
+                if curr_is_leaf
+                    && next_is_leaf
+                    && curr_tokens > 0
+                    && curr_tokens < min_tokens
+                    && next_tokens > 0
+                    && next_tokens < min_tokens
+                {
+                    // Merge next into current
+                    if let Some(next_node) = tree.get(next_id).cloned() {
+                        if let Some(curr) = tree.get_mut(curr_id) {
+                            if !next_node.content.is_empty() {
+                                if !curr.content.is_empty() {
+                                    curr.content.push_str("\n\n");
+                                }
+                                // Prefix with heading to preserve boundary
+                                curr.content.push_str(&format!(
+                                    "## {}\n{}",
+                                    next_node.title, next_node.content
+                                ));
+                            }
+                            curr.token_count = Some(curr.token_count.unwrap_or(0) + next_tokens);
+                        }
+                    }
+
+                    // Mark next as merged
+                    if let Some(node) = tree.get_mut(next_id) {
+                        node.title = format!("[MERGED: {}]", node.title);
+                        node.content.clear();
+                        node.token_count = Some(0);
+                    }
+
+                    merged_count += 1;
+                    metrics.increment_nodes_merged();
+                    i += 2; // Skip merged node
+                } else {
+                    i += 1;
+                }
+            }
+        }
+
+        merged_count
+    }
+
+    /// Remove empty intermediate nodes (skip root).
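+    ///
+    /// A node qualifies when it is not the root, is not a leaf, has exactly
+    /// one child, and has empty content. Qualifying nodes are currently only
+    /// marked with an `[EMPTY: …]` title prefix rather than removed.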
+    fn remove_empty_nodes(tree: &mut vectorless_document::DocumentTree) -> usize {
+        let mut removed_count = 0;
+        let root = tree.root();
+
+        // Find non-root nodes with no content and only one child
+        let candidates: Vec<NodeId> = tree
+            .traverse()
+            .into_iter()
+            .filter(|id| {
+                // Skip root node
+                if *id == root {
+                    return false;
+                }
+                if tree.is_leaf(*id) {
+                    return false;
+                }
+                let children = tree.children(*id);
+                if children.len() != 1 {
+                    return false;
+                }
+                if let Some(node) = tree.get(*id) {
+                    node.content.trim().is_empty()
+                } else {
+                    false
+                }
+            })
+            .collect();
+
+        // Note: actually removing nodes from the arena tree is complex,
+        // so for now we only mark them
+        for node_id in candidates {
+            if let Some(node) = tree.get_mut(node_id) {
+                node.title = format!("[EMPTY: {}]", node.title);
+                removed_count += 1;
+            }
+        }
+
+        removed_count
+    }
+}
+
+impl Default for OptimizeStage {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl IndexStage for OptimizeStage {
+    fn name(&self) -> &'static str {
+        "optimize"
+    }
+
+    fn is_optional(&self) -> bool {
+        true
+    }
+
+    fn depends_on(&self) -> Vec<&'static str> {
+        vec!["enrich", "navigation_index"]
+    }
+
+    fn access_pattern(&self) -> AccessPattern {
+        AccessPattern {
+            reads_tree: true,
+            writes_tree: true, // merges small leaf nodes
+            ..Default::default()
+        }
+    }
+
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+        let start = Instant::now();
+
+        let config = &ctx.options.optimization;
+        if !config.enabled {
+            debug!("[optimize] Disabled, skipping");
+            return Ok(StageResult::success("optimize"));
+        }
+
+        let tree = ctx
+            .tree
+            .as_mut()
+            .ok_or_else(|| vectorless_error::Error::IndexBuild("Tree not built".to_string()))?;
+
+        let node_count = tree.node_count();
+        info!(
+            "[optimize] Starting: {} nodes, merge_threshold={}",
+            node_count, config.merge_leaf_threshold,
+        );
+
+        let mut merged_count = 0;
+
+        // 1. Merge small leaves
+        if config.merge_leaf_threshold > 0 {
+            merged_count =
+                Self::merge_small_leaves(tree, config.merge_leaf_threshold, &mut ctx.metrics);
+            if merged_count > 0 {
+                debug!("[optimize] Merged {} small leaf nodes", merged_count);
+            }
+        }
+
+        // 2. Remove empty intermediate nodes
+        let removed_count = Self::remove_empty_nodes(tree);
+        if removed_count > 0 {
+            debug!(
+                "[optimize] Marked {} empty intermediate nodes",
+                removed_count
+            );
+        }
+
+        let duration = start.elapsed().as_millis() as u64;
+        ctx.metrics.record_optimize(duration);
+
+        info!(
+            "[optimize] Complete: {} merged, {} emptied in {}ms",
+            merged_count, removed_count, duration
+        );
+
+        let mut stage_result = StageResult::success("optimize");
+        stage_result.duration_ms = duration;
+        stage_result
+            .metadata
+            .insert("nodes_merged".to_string(), serde_json::json!(merged_count));
+        stage_result.metadata.insert(
+            "nodes_removed".to_string(),
+            serde_json::json!(removed_count),
+        );
+
+        Ok(stage_result)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use vectorless_document::DocumentTree;
+    use crate::index::PipelineOptions;
+    use crate::index::pipeline::IndexContext;
+    use crate::index::pipeline::IndexInput;
+
+    /// Create a tree with small leaf children under root for merge tests.
+    ///
+    /// ```text
+    /// Root
+    /// ├── Leaf A (50 tokens)
+    /// ├── Leaf B (30 tokens) ← should merge with Leaf A
+    /// ├── Leaf C (200 tokens) ← too large, not merged
+    /// └── Leaf D (40 tokens) ← no adjacent small sibling
+    /// ```
+    fn make_merge_test_tree() -> DocumentTree {
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+
+        let a = tree.add_child(root, "Leaf A", "content A");
+        let b = tree.add_child(root, "Leaf B", "content B");
+        let c = tree.add_child(root, "Leaf C", "content C long");
+        let d = tree.add_child(root, "Leaf D", "content D");
+
+        // Set token counts
+        if let Some(n) = tree.get_mut(a) {
+            n.token_count = Some(50);
+        }
+        if let Some(n) = tree.get_mut(b) {
+            n.token_count = Some(30);
+        }
+        if let Some(n) = tree.get_mut(c) {
+            n.token_count = Some(200);
+        }
+        if let Some(n) = tree.get_mut(d) {
+            n.token_count = Some(40);
+        }
+
+        tree
+    }
+
+    #[test]
+    fn test_merge_small_leaves_merges_adjacent_pair() {
+        let mut tree = make_merge_test_tree();
+        let root = tree.root();
+        let mut metrics = crate::index::pipeline::IndexMetrics::new();
+
+        // Threshold 100: Leaf A (50) and Leaf B (30) should merge
+        let merged = OptimizeStage::merge_small_leaves(&mut tree, 100, &mut metrics);
+
+        assert_eq!(merged, 1);
+        assert_eq!(metrics.nodes_merged, 1);
+
+        // Leaf B should be marked as merged
+        let children = tree.children(root);
+        let leaf_b = children.iter().find(|&&id| {
+            tree.get(id)
+                .map(|n| n.title.starts_with("[MERGED"))
+                .unwrap_or(false)
+        });
+        assert!(leaf_b.is_some(), "Leaf B should be marked as merged");
+    }
+
+    #[test]
+    fn test_merge_small_leaves_nothing_above_threshold() {
+        let mut tree = make_merge_test_tree();
+        let mut metrics = crate::index::pipeline::IndexMetrics::new();
+
+        // Threshold 10: all leaves are above this, nothing merges
+        let merged = OptimizeStage::merge_small_leaves(&mut tree, 10, &mut metrics);
+        assert_eq!(merged, 0);
+    }
+
+    #[test]
+    fn test_merge_small_leaves_preserves_content() {
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+        let a = tree.add_child(root, "A", "hello");
+        let b = tree.add_child(root, "B", "world");
+        if let Some(n) = tree.get_mut(a) {
+            n.token_count = Some(5);
+        }
+        if let Some(n) = tree.get_mut(b) {
+            n.token_count = Some(5);
+        }
+
+        let mut metrics = crate::index::pipeline::IndexMetrics::new();
+        let _ = OptimizeStage::merge_small_leaves(&mut tree, 100, &mut metrics);
+
+        // Leaf A should now contain both contents with heading prefix
+        let a_node = tree.get(a).unwrap();
+        assert!(a_node.content.contains("hello"));
+        assert!(a_node.content.contains("## B"));
+        assert!(a_node.content.contains("world"));
+        assert_eq!(a_node.token_count, Some(10));
+    }
+
+    #[test]
+    fn test_merge_small_leaves_skips_non_leaf() {
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+
+        // Section is a non-leaf (has a child), should not be merged even if small
+        let section = tree.add_child(root, "Section", "section content");
+        let _sub = tree.add_child(section, "Sub", "sub content");
+        let leaf = tree.add_child(root, "Leaf", "leaf content");
+
+        if let Some(n) = tree.get_mut(section) {
+            n.token_count = Some(5);
+        }
+        if let Some(n) = tree.get_mut(leaf) {
+            n.token_count = Some(5);
+        }
+
+        let mut metrics = crate::index::pipeline::IndexMetrics::new();
+        let merged = OptimizeStage::merge_small_leaves(&mut tree, 100, &mut metrics);
+
+        // Section is non-leaf, only Leaf is a leaf — no adjacent pair of leaves
+        assert_eq!(merged, 0);
+    }
+
+    #[test]
+    fn test_remove_empty_nodes_marks_single_child_empty() {
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+
+        // Empty intermediate with single child
+        let section = tree.add_child(root, "Section", "");
+        let _leaf = tree.add_child(section, "Leaf", "content");
+
+        let removed = OptimizeStage::remove_empty_nodes(&mut tree);
+        assert_eq!(removed, 1);
+
+        let section_node = tree.get(section).unwrap();
+        assert!(section_node.title.starts_with("[EMPTY"));
+    }
+
+    #[test]
+    fn test_remove_empty_nodes_skips_root() {
+        let mut tree = DocumentTree::new("Root", "");
+        let _child = tree.add_child(tree.root(), "Child", "content");
+
+        let removed = OptimizeStage::remove_empty_nodes(&mut tree);
+        assert_eq!(removed, 0);
+    }
+
+    #[test]
+    fn test_remove_empty_nodes_skips_leaves() {
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+        let leaf = tree.add_child(root, "Leaf", "");
+
+        let removed = OptimizeStage::remove_empty_nodes(&mut tree);
+        assert_eq!(removed, 0, "Leaves should not be removed");
+
+        // Verify the leaf is indeed a leaf
+        assert!(tree.is_leaf(leaf));
+    }
+
+    #[test]
+    fn test_remove_empty_nodes_skips_multi_child() {
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+        let section = tree.add_child(root, "Section", "");
+        let _c1 = tree.add_child(section, "C1", "a");
+        let _c2 = tree.add_child(section, "C2", "b");
+
+        let removed = OptimizeStage::remove_empty_nodes(&mut tree);
+        assert_eq!(
+            removed, 0,
+            "Nodes with multiple children should not be removed"
+        );
+    }
+
+    #[test]
+    fn test_remove_empty_nodes_skips_non_empty() {
+        let mut tree = DocumentTree::new("Root", "");
+        let root = tree.root();
+        let section = tree.add_child(root, "Section", "has content");
+        let _leaf = tree.add_child(section, "Leaf", "content");
+
+        let removed = OptimizeStage::remove_empty_nodes(&mut tree);
+        assert_eq!(removed, 0);
+    }
+
+    #[tokio::test]
+    async fn test_optimize_disabled_skips() {
+        let mut stage = OptimizeStage::new();
+        assert_eq!(stage.name(), "optimize");
+        assert!(stage.is_optional());
+        assert_eq!(stage.depends_on(), vec!["enrich", "navigation_index"]);
+
+        let mut options = PipelineOptions::default();
+        options.optimization.enabled = false;
+
+        let input = IndexInput::content("# Test\nHello");
+        let mut ctx = IndexContext::new(input, options);
+        ctx.tree = Some(DocumentTree::new("Root", "content"));
+
+        let result = stage.execute(&mut ctx).await.unwrap();
+        assert!(result.success);
+    }
+
+    #[test]
+    fn test_merge_small_leaves_empty_tree() {
+        let mut tree = DocumentTree::new("Root", "");
+        let mut metrics = crate::index::pipeline::IndexMetrics::new();
+
+        let merged = OptimizeStage::merge_small_leaves(&mut tree, 100, &mut metrics);
+        assert_eq!(merged, 0, "Root with no children should merge nothing");
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/stages/parse.rs b/vectorless-core/vectorless-index/src/stages/parse.rs
new file mode 100644
index 00000000..1b48e1da
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/stages/parse.rs
@@ -0,0 +1,166 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Parse stage - Parse documents into raw nodes.
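+//!
+//! Format resolution under `IndexMode::Auto` (a sketch; the extension table
+//! itself lives in `DocumentFormat::from_extension`):
+//!
+//! ```rust,ignore
+//! // Auto mode resolves the format from the file extension, e.g.:
+//! let format = DocumentFormat::from_extension("md"); // assumed: Some(Markdown)
+//! // Unknown extensions surface as Error::Parse("Unknown format: …").
+//! ```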
+
+use super::async_trait;
+use std::time::Instant;
+use tracing::{debug, info};
+
+use vectorless_document::DocumentFormat;
+use vectorless_error::Result;
+
+use super::{IndexStage, StageResult};
+use crate::index::IndexMode;
+use crate::index::pipeline::{IndexContext, IndexInput};
+
+/// Parse stage - extracts raw nodes from documents.
+pub struct ParseStage {
+    /// Optional LLM client for PDF structure extraction.
+    llm_client: Option<vectorless_llm::LlmClient>,
+}
+
+impl ParseStage {
+    /// Create a new parse stage.
+    pub fn new() -> Self {
+        Self { llm_client: None }
+    }
+
+    /// Create a parse stage with an LLM client.
+    pub fn with_llm_client(client: vectorless_llm::LlmClient) -> Self {
+        Self {
+            llm_client: Some(client),
+        }
+    }
+
+    /// Detect document format from path and options.
+    fn detect_format(&self, ctx: &IndexContext) -> Result<DocumentFormat> {
+        match ctx.options.mode {
+            IndexMode::Auto => match &ctx.input {
+                IndexInput::File(path) => {
+                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
+                    DocumentFormat::from_extension(ext)
+                        .ok_or_else(|| vectorless_error::Error::Parse(format!("Unknown format: {}", ext)))
+                }
+                IndexInput::Content { format, .. } => Ok(*format),
+                IndexInput::Bytes { format, .. } => Ok(*format),
+            },
+            IndexMode::Markdown => Ok(DocumentFormat::Markdown),
+            IndexMode::Pdf => Ok(DocumentFormat::Pdf),
+        }
+    }
+}
+
+impl Default for ParseStage {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl IndexStage for ParseStage {
+    fn name(&self) -> &'static str {
+        "parse"
+    }
+
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+        let start = Instant::now();
+
+        // Detect format
+        let format = self.detect_format(ctx)?;
+        ctx.format = format;
+
+        let input_type = match &ctx.input {
+            IndexInput::File(_) => "file",
+            IndexInput::Content { .. } => "content",
+            IndexInput::Bytes { .. } => "bytes",
+        };
+        info!(
+            "[parse] Starting: format={:?}, input={}, llm={}",
+            format,
+            input_type,
+            self.llm_client.is_some()
+        );
+
+        // Parse based on input type
+        let result = match &ctx.input {
+            IndexInput::File(path) => {
+                // Resolve path
+                let path = path.canonicalize().unwrap_or_else(|_| path.clone());
+                ctx.source_path = Some(path.clone());
+
+                // Extract name from file
+                ctx.name = path
+                    .file_stem()
+                    .and_then(|n| n.to_str())
+                    .unwrap_or("document")
+                    .to_string();
+
+                debug!("[parse] Reading file: {:?}", ctx.source_path);
+
+                // Parse directly
+                crate::index::parse::parse_file(&path, format, self.llm_client.clone()).await?
+            }
+            IndexInput::Content {
+                content,
+                name,
+                format,
+            } => {
+                // Set name
+                ctx.name = name.clone();
+
+                debug!("[parse] Parsing inline content ({} chars)", content.len());
+
+                // Parse content directly
+                crate::index::parse::parse_content(content, *format, self.llm_client.clone())
+                    .await?
+            }
+            IndexInput::Bytes { data, name, format } => {
+                // Set name
+                ctx.name = name.clone();
+
+                debug!("[parse] Parsing bytes ({} bytes)", data.len());
+
+                // Parse bytes
+                crate::index::parse::parse_bytes(data, *format, self.llm_client.clone()).await?
+            }
+        };
+
+        // Store results
+        ctx.raw_nodes = result.nodes;
+        ctx.metrics.set_nodes_processed(ctx.raw_nodes.len());
+
+        // Store metadata
+        if let Some(page_count) = result.meta.page_count {
+            ctx.page_count = Some(page_count);
+            debug!("[parse] Document has {} pages", page_count);
+        }
+        ctx.line_count = Some(result.meta.line_count);
+
+        if let Some(desc) = result.meta.description {
+            ctx.description = Some(desc);
+        }
+
+        let duration = start.elapsed().as_millis() as u64;
+        ctx.metrics.record_parse(duration);
+
+        info!(
+            "[parse] Complete: {} nodes from '{}' ({}ms)",
+            ctx.raw_nodes.len(),
+            ctx.name,
+            duration
+        );
+
+        let mut stage_result = StageResult::success("parse");
+        stage_result.duration_ms = duration;
+        stage_result.metadata.insert(
+            "node_count".to_string(),
+            serde_json::json!(ctx.raw_nodes.len()),
+        );
+        stage_result
+            .metadata
+            .insert("format".to_string(), serde_json::json!(format.extension()));
+
+        Ok(stage_result)
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/stages/reasoning.rs b/vectorless-core/vectorless-index/src/stages/reasoning.rs
new file mode 100644
index 00000000..1d41a159
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/stages/reasoning.rs
@@ -0,0 +1,639 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Reasoning Index Stage - Build pre-computed reasoning index.
+//!
+//! This stage runs after EnrichStage (which generates descriptions and
+//! calculates metadata) and before OptimizeStage. It builds a
+//! [`ReasoningIndex`] from the document tree's TOC, summaries, and keywords.
+
+use std::collections::HashMap;
+use std::time::Instant;
+use tracing::{debug, info, warn};
+
+use vectorless_document::{
+    NodeId, ReasoningIndexBuilder, ReasoningIndexConfig, SectionSummary, SummaryShortcut,
+    TopicEntry,
+};
+use vectorless_error::Result;
+use vectorless_llm::LlmClient;
+use vectorless_scoring::extract_keywords;
+
+use super::async_trait;
+use super::{AccessPattern, IndexStage, StageResult};
+use crate::index::pipeline::IndexContext;
+
+/// Reasoning Index Stage - builds a pre-computed reasoning index from the document tree.
+///
+/// This stage creates a [`ReasoningIndex`] containing:
+/// - Topic-to-path mappings from titles and summaries
+/// - Summary shortcuts for high-frequency "overview" queries
+/// - Section map for fast ToC lookup
+pub struct ReasoningIndexStage {
+    config: ReasoningIndexConfig,
+}
+
+impl ReasoningIndexStage {
+    /// Create a new reasoning index stage with default config.
+    pub fn new() -> Self {
+        Self {
+            config: ReasoningIndexConfig::default(),
+        }
+    }
+
+    /// Create with custom config.
+    pub fn with_config(config: ReasoningIndexConfig) -> Self {
+        Self { config }
+    }
+
+    /// Extract keywords from a text, filtering by minimum length.
+    fn extract_node_keywords(text: &str, min_length: usize) -> Vec<String> {
+        extract_keywords(text)
+            .into_iter()
+            .filter(|k: &String| k.len() >= min_length)
+            .collect()
+    }
+
+    /// Build the topic-to-path mapping by extracting keywords from all nodes.
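+    ///
+    /// Keyword sources are weighted before normalization (title 2.0,
+    /// summary 1.5, content 1.0); per-keyword weights are then rescaled
+    /// into the 0.0–1.0 range.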
+    fn build_topic_paths(
+        tree: &vectorless_document::DocumentTree,
+        config: &ReasoningIndexConfig,
+    ) -> (HashMap<String, Vec<TopicEntry>>, usize) {
+        let mut keyword_nodes: HashMap<String, Vec<(NodeId, f32, usize)>> = HashMap::new();
+
+        // Walk all nodes and extract keywords from title, summary, and content
+        for node_id in tree.traverse() {
+            if let Some(node) = tree.get(node_id) {
+                let title_keywords =
+                    Self::extract_node_keywords(&node.title, config.min_keyword_length);
+                let summary_keywords =
+                    Self::extract_node_keywords(&node.summary, config.min_keyword_length);
+                // Always extract from content — keywords can appear anywhere
+                let content_keywords =
+                    Self::extract_node_keywords(&node.content, config.min_keyword_length);
+
+                // Title keywords get higher weight (2.0), summary (1.5), content (1.0)
+                for kw in &title_keywords {
+                    keyword_nodes
+                        .entry(kw.clone())
+                        .or_default()
+                        .push((node_id, 2.0, node.depth));
+                }
+                for kw in &summary_keywords {
+                    keyword_nodes
+                        .entry(kw.clone())
+                        .or_default()
+                        .push((node_id, 1.5, node.depth));
+                }
+                for kw in &content_keywords {
+                    keyword_nodes
+                        .entry(kw.clone())
+                        .or_default()
+                        .push((node_id, 1.0, node.depth));
+                }
+            }
+        }
+
+        // Sort by keyword frequency (most common first) and trim to max_keyword_entries
+        let mut sorted_keywords: Vec<_> = keyword_nodes.into_iter().collect();
+        sorted_keywords.sort_by(|a, b| b.1.len().cmp(&a.1.len()));
+        sorted_keywords.truncate(config.max_keyword_entries);
+
+        let keyword_count = sorted_keywords.len();
+
+        // Build topic_paths: merge duplicate (keyword, node) pairs
+        let mut topic_paths: HashMap<String, Vec<TopicEntry>> = HashMap::new();
+
+        for (keyword, entries) in sorted_keywords {
+            // Merge duplicate node entries by summing weights
+            let mut merged: HashMap<NodeId, (f32, usize)> = HashMap::new();
+            for (node_id, weight, depth) in entries {
+                let entry = merged.entry(node_id).or_insert((0.0, depth));
+                entry.0 += weight;
+            }
+
+            // Normalize weights to 0.0-1.0 range
+            let max_weight = merged.values().map(|(w, _)| *w).fold(0.0_f32, f32::max);
+            let scale = if max_weight > 0.0 {
+                1.0 / max_weight
+            } else {
+                1.0
+            };
+
+            let mut topic_entries: Vec<TopicEntry> = merged
+                .into_iter()
+                .map(|(node_id, (weight, depth))| TopicEntry {
+                    node_id,
+                    weight: weight * scale,
+                    depth,
+                })
+                .collect();
+
+            topic_entries.sort_by(|a, b| {
+                b.weight
+                    .partial_cmp(&a.weight)
+                    .unwrap_or(std::cmp::Ordering::Equal)
+            });
+            topic_entries.truncate(config.max_topic_entries);
+
+            topic_paths.insert(keyword, topic_entries);
+        }
+
+        (topic_paths, keyword_count)
+    }
+
+    /// Build section map from depth-1 nodes.
+    fn build_section_map(tree: &vectorless_document::DocumentTree) -> HashMap<String, NodeId> {
+        let mut section_map = HashMap::new();
+        let root = tree.root();
+        for child_id in tree.children(root) {
+            if let Some(node) = tree.get(child_id) {
+                section_map.insert(node.title.to_lowercase(), child_id);
+                // Also index by structure index (e.g. "1", "2", "3")
+                if !node.structure.is_empty() {
+                    section_map.insert(node.structure.clone(), child_id);
+                }
+            }
+        }
+        section_map
+    }
+
+    /// Expand keywords with LLM-generated synonyms (single batch request).
+    ///
+    /// Sends all keywords to the LLM in one request and maps each to its
+    /// synonyms. Synonym entries inherit the same node mappings but with
+    /// a reduced weight (0.6x) to reflect the indirect match.
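+    ///
+    /// Expected response shape (keys shown are illustrative):
+    ///
+    /// ```text
+    /// {"payments": ["billing", "checkout"], "invoices": ["receipts"]}
+    /// ```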
+    async fn expand_synonyms(
+        topic_paths: &mut HashMap<String, Vec<TopicEntry>>,
+        llm_client: &LlmClient,
+        max_keywords: usize,
+    ) -> usize {
+        use std::collections::HashSet;
+
+        let existing_keys: HashSet<String> = topic_paths.keys().cloned().collect();
+        // Pick top keywords by entry count for synonym expansion
+        let mut ranked: Vec<(String, usize)> = topic_paths
+            .iter()
+            .map(|(k, v): (&String, &Vec<TopicEntry>)| (k.clone(), v.len()))
+            .collect();
+        ranked.sort_by(|a, b| b.1.cmp(&a.1));
+        ranked.truncate(max_keywords);
+
+        let keyword_count = ranked.len();
+        if keyword_count == 0 {
+            return 0;
+        }
+
+        tracing::info!(
+            "[reasoning_index] Expanding synonyms for {} keywords (single request)",
+            keyword_count,
+        );
+
+        // Snapshot the source entries for each keyword.
+        let source_entries: HashMap<String, Vec<TopicEntry>> = ranked
+            .iter()
+            .map(|(kw, _): &(String, usize)| {
+                (kw.clone(), topic_paths.get(kw).cloned().unwrap_or_default())
+            })
+            .collect();
+
+        let keywords: Vec<String> = ranked.into_iter().map(|(kw, _)| kw).collect();
+
+        let system = "You are a thesaurus assistant. For each keyword, provide up to 5 synonyms \
+            or related search terms. Return ONLY a valid JSON object mapping each keyword to an \
+            array of synonym strings. No explanation, no markdown.";
+        let user_prompt = format!(
+            "Keywords: {}\n\nReturn a JSON object: {{\"keyword\": [\"syn1\", \"syn2\"], ...}}",
+            keywords.join(", ")
+        );
+
+        let synonym_map: HashMap<String, Vec<String>> = match llm_client
+            .complete_json::<HashMap<String, Vec<String>>>(system, &user_prompt)
+            .await
+        {
+            Ok(map) => map
+                .into_iter()
+                .map(|(k, v): (String, Vec<String>)| (k.to_lowercase(), v))
+                .collect(),
+            Err(e) => {
+                tracing::warn!("[reasoning_index] Batch synonym expansion failed: {}", e);
+                return 0;
+            }
+        };
+
+        // Write results back
+        let mut synonym_count = 0;
+        for keyword in &keywords {
+            if let Some(synonyms) = synonym_map.get(keyword) {
+                if let Some(entries) = source_entries.get(keyword) {
+                    for syn in synonyms {
+                        let syn_clean = syn.trim().to_lowercase();
+                        if syn_clean.is_empty()
+                            || syn_clean.len() < 2
+                            || existing_keys.contains(&syn_clean)
+                        {
+                            continue;
+                        }
+                        let synonym_entries: Vec<TopicEntry> = entries
+                            .iter()
+                            .map(|e| TopicEntry {
+                                node_id: e.node_id,
+                                weight: e.weight * 0.6,
+                                depth: e.depth,
+                            })
+                            .collect();
+                        topic_paths.insert(syn_clean, synonym_entries);
+                        synonym_count += 1;
+                    }
+                }
+            }
+        }
+
+        synonym_count
+    }
+
+    /// Build summary shortcut from root and depth-1 nodes.
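+    ///
+    /// Uses the root summary as the document summary when present; otherwise
+    /// concatenates the depth-1 `"Title: summary"` pairs, one per line.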
+ fn build_summary_shortcut(tree: &vectorless_document::DocumentTree) -> Option { + let root = tree.root(); + let root_node = tree.get(root)?; + + // Collect document summary from root + let document_summary = if !root_node.summary.is_empty() { + root_node.summary.clone() + } else { + // Fallback: concatenate depth-1 summaries + let mut parts = Vec::new(); + for child_id in tree.children(root) { + if let Some(child) = tree.get(child_id) { + if !child.summary.is_empty() { + parts.push(format!("{}: {}", child.title, child.summary)); + } + } + } + parts.join("\n") + }; + + // Collect section summaries + let mut section_summaries = Vec::new(); + for child_id in tree.children(root) { + if let Some(child) = tree.get(child_id) { + section_summaries.push(SectionSummary { + node_id: child_id, + title: child.title.clone(), + summary: child.summary.clone(), + depth: child.depth, + }); + } + } + + Some(SummaryShortcut { + root_node: root, + section_summaries, + document_summary, + }) + } +} + +impl Default for ReasoningIndexStage { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl IndexStage for ReasoningIndexStage { + fn name(&self) -> &'static str { + "reasoning_index" + } + + fn depends_on(&self) -> Vec<&'static str> { + vec!["enrich"] + } + + fn is_optional(&self) -> bool { + true + } + + fn access_pattern(&self) -> AccessPattern { + AccessPattern { + reads_tree: true, + writes_reasoning_index: true, + ..Default::default() + } + } + + async fn execute(&mut self, ctx: &mut IndexContext) -> Result { + let start = Instant::now(); + + // Check if enabled via pipeline options + if !ctx.options.reasoning_index.enabled { + info!("[reasoning_index] Disabled, skipping"); + return Ok(StageResult::success("reasoning_index")); + } + + // Use stage config, overridden by pipeline options + let config = &ctx.options.reasoning_index; + + let tree = match ctx.tree.as_ref() { + Some(t) => t, + None => { + warn!("[reasoning_index] No tree, cannot build index"); + return Ok(StageResult::failure("reasoning_index", "Tree not built")); + } + }; + + info!( + "[reasoning_index] Starting: synonyms={}, summary_shortcut={}, max_keywords={}", + config.enable_synonym_expansion, + config.build_summary_shortcut, + config.max_keyword_entries, + ); + + // 1. Build topic-to-path mapping + let (mut topic_paths, keyword_count) = Self::build_topic_paths(tree, config); + let topic_count: usize = topic_paths + .values() + .map(|v: &Vec| v.len()) + .sum(); + debug!( + "[reasoning_index] Topic paths: {} keywords, {} entries", + keyword_count, topic_count + ); + + // 1b. Optional: expand keywords with LLM-generated synonyms + let synonym_count = if config.enable_synonym_expansion { + if let Some(ref llm_client) = ctx.llm_client { + let max_kw = (keyword_count / 4).max(20).min(100); + let count = Self::expand_synonyms(&mut topic_paths, llm_client, max_kw).await; + if count > 0 { + info!("[reasoning_index] Expanded {} synonym keywords", count); + } + count + } else { + debug!("[reasoning_index] Synonym expansion enabled but no LLM client"); + 0 + } + } else { + 0 + }; + + // 2. Build section map + let section_map = Self::build_section_map(tree); + debug!( + "[reasoning_index] Section map: {} entries", + section_map.len() + ); + + // 3. Build summary shortcut + let summary_shortcut = if config.build_summary_shortcut { + let shortcut = Self::build_summary_shortcut(tree); + if shortcut.is_some() { + debug!("[reasoning_index] Built summary shortcut"); + } + shortcut + } else { + None + }; + + // 4. 
+        let mut builder = ReasoningIndexBuilder::new();
+        for (keyword, entries) in topic_paths {
+            for entry in entries {
+                builder.add_topic_entry(&keyword, entry);
+            }
+        }
+        for (title, node_id) in section_map {
+            builder.add_section(&title, node_id);
+        }
+        if let Some(shortcut) = summary_shortcut {
+            builder = builder.summary_shortcut(shortcut);
+        }
+        builder.sort_and_trim(config.max_topic_entries);
+
+        let reasoning_index = builder.build();
+
+        let duration = start.elapsed().as_millis() as u64;
+        ctx.metrics
+            .record_reasoning_index(duration, topic_count, keyword_count);
+
+        info!(
+            "[reasoning_index] Complete: {} keywords, {} topics, {} sections, {} synonyms in {}ms",
+            keyword_count,
+            topic_count,
+            reasoning_index.section_count(),
+            synonym_count,
+            duration,
+        );
+
+        ctx.reasoning_index = Some(reasoning_index);
+
+        let mut stage_result = StageResult::success("reasoning_index");
+        stage_result.duration_ms = duration;
+        stage_result.metadata.insert(
+            "keywords_indexed".to_string(),
+            serde_json::json!(keyword_count),
+        );
+        stage_result
+            .metadata
+            .insert("topics_indexed".to_string(), serde_json::json!(topic_count));
+        stage_result.metadata.insert(
+            "synonyms_expanded".to_string(),
+            serde_json::json!(synonym_count),
+        );
+
+        Ok(stage_result)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_node_keywords() {
+        let keywords =
+            ReasoningIndexStage::extract_node_keywords("Introduction to Machine Learning", 2);
+        assert!(keywords.contains(&"introduction".to_string()));
+        assert!(keywords.contains(&"machine".to_string()));
+        assert!(keywords.contains(&"learning".to_string()));
+    }
+
+    #[test]
+    fn test_extract_node_keywords_min_length() {
+        let keywords = ReasoningIndexStage::extract_node_keywords("A B CD", 2);
+        assert!(!keywords.contains(&"a".to_string()));
+        assert!(!keywords.contains(&"b".to_string()));
+        assert!(keywords.contains(&"cd".to_string()));
+    }
+
+    #[test]
+    fn test_stage_config_default() {
+        let stage = ReasoningIndexStage::new();
+        assert!(stage.config.enabled);
+        assert_eq!(stage.name(), "reasoning_index");
+        assert!(stage.is_optional());
+        assert_eq!(stage.depends_on(), vec!["enrich"]);
+    }
+
+    #[test]
+    fn test_build_topic_paths_basic() {
+        use vectorless_document::ReasoningIndexConfig;
+
+        let mut tree = vectorless_document::DocumentTree::new("Root", "");
+        let root = tree.root();
+        let c1 = tree.add_child(root, "Machine Learning Introduction", "");
+        let c2 = tree.add_child(root, "Deep Learning Methods", "");
+
+        // Set summaries for keyword extraction
+        if let Some(n) = tree.get_mut(c1) {
+            n.summary = "An overview of machine learning algorithms".to_string();
+        }
+        if let Some(n) = tree.get_mut(c2) {
+            n.summary = "Advanced deep learning techniques".to_string();
+        }
+
+        let config = ReasoningIndexConfig::default();
+        let (topic_paths, keyword_count) = ReasoningIndexStage::build_topic_paths(&tree, &config);
+
+        assert!(
+            keyword_count > 0,
+            "Should extract keywords from title + summary"
+        );
+        assert!(!topic_paths.is_empty(), "Should build topic paths");
+
+        // "learning" appears in both titles → should be a keyword
+        assert!(
+            topic_paths.contains_key("learning"),
+            "Expected 'learning' in topic paths, got: {:?}",
+            topic_paths.keys().collect::<Vec<_>>()
+        );
+    }
+
+    #[test]
+    fn test_build_topic_paths_weight_normalization() {
+        use vectorless_document::ReasoningIndexConfig;
+
+        let mut tree = vectorless_document::DocumentTree::new("Root", "");
+        let root = tree.root();
+        let _c1 = tree.add_child(root,
"rust ownership", "rust borrowing rules"); + + let config = ReasoningIndexConfig::default(); + let (topic_paths, _) = ReasoningIndexStage::build_topic_paths(&tree, &config); + + // All weights should be in 0.0-1.0 range + for entries in topic_paths.values() { + for entry in entries { + assert!( + entry.weight >= 0.0 && entry.weight <= 1.0, + "Weight {} out of [0, 1] range", + entry.weight + ); + } + } + } + + #[test] + fn test_build_topic_paths_respects_max_keyword_entries() { + use vectorless_document::ReasoningIndexConfig; + + let mut tree = vectorless_document::DocumentTree::new("Root", ""); + let root = tree.root(); + + // Create many children with unique keywords + for i in 0..50 { + let c = tree.add_child(root, &format!("Section {} Alpha Beta Gamma Delta", i), ""); + if let Some(n) = tree.get_mut(c) { + n.summary = format!("keywords unique{} special{} terms{}", i, i, i); + } + } + + let mut config = ReasoningIndexConfig::default(); + config.max_keyword_entries = 5; + let (topic_paths, keyword_count) = ReasoningIndexStage::build_topic_paths(&tree, &config); + + assert!( + keyword_count <= 5, + "Should respect max_keyword_entries, got {}", + keyword_count + ); + assert_eq!(topic_paths.len(), keyword_count); + } + + #[test] + fn test_build_section_map() { + let mut tree = vectorless_document::DocumentTree::new("Root", ""); + let root = tree.root(); + let c1 = tree.add_child(root, "Introduction", "content"); + let c2 = tree.add_child(root, "Methods", "content"); + + // Set structure indices + if let Some(n) = tree.get_mut(c1) { + n.structure = "1".to_string(); + } + if let Some(n) = tree.get_mut(c2) { + n.structure = "2".to_string(); + } + + let section_map = ReasoningIndexStage::build_section_map(&tree); + + // Should index by title (lowercase) and structure index + assert!(section_map.contains_key("introduction")); + assert!(section_map.contains_key("methods")); + assert!(section_map.contains_key("1")); + assert!(section_map.contains_key("2")); + assert_eq!(section_map.len(), 4); + } + + #[test] + fn test_build_summary_shortcut() { + let mut tree = vectorless_document::DocumentTree::new("Root", ""); + let root = tree.root(); + let c1 = tree.add_child(root, "S1", "summary 1"); + let c2 = tree.add_child(root, "S2", "summary 2"); + + // Set root summary (not content — build_summary_shortcut reads summary field) + if let Some(n) = tree.get_mut(root) { + n.summary = "root summary text".to_string(); + } + if let Some(n) = tree.get_mut(c1) { + n.summary = "first section summary".to_string(); + } + if let Some(n) = tree.get_mut(c2) { + n.summary = "second section summary".to_string(); + } + + let shortcut = ReasoningIndexStage::build_summary_shortcut(&tree); + assert!(shortcut.is_some()); + + let sc = shortcut.unwrap(); + assert_eq!(sc.root_node, root); + assert_eq!(sc.document_summary, "root summary text"); + assert_eq!(sc.section_summaries.len(), 2); + } + + #[test] + fn test_build_summary_shortcut_fallback_to_children() { + // Root has no summary → fallback to concatenating children + let mut tree = vectorless_document::DocumentTree::new("Root", ""); + let root = tree.root(); + let c1 = tree.add_child(root, "S1", ""); + let c2 = tree.add_child(root, "S2", ""); + + if let Some(n) = tree.get_mut(c1) { + n.summary = "child summary 1".to_string(); + } + if let Some(n) = tree.get_mut(c2) { + n.summary = "child summary 2".to_string(); + } + + let shortcut = ReasoningIndexStage::build_summary_shortcut(&tree); + assert!(shortcut.is_some()); + + let sc = shortcut.unwrap(); + assert!( + 
            sc.document_summary.contains("child summary 1"),
            "Fallback should include child summaries"
        );
        assert!(sc.document_summary.contains("S1"));
    }
}
diff --git a/vectorless-core/vectorless-index/src/stages/split.rs b/vectorless-core/vectorless-index/src/stages/split.rs
new file mode 100644
index 00000000..68b5077f
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/stages/split.rs
@@ -0,0 +1,347 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Split stage - Break large leaf nodes into smaller ones.
+
+use std::time::Instant;
+use tracing::{debug, info};
+
+use vectorless_document::{DocumentTree, NodeId};
+use vectorless_error::Result;
+use vectorless_utils::estimate_tokens;
+
+use super::{AccessPattern, IndexStage, StageResult, async_trait};
+use crate::index::config::SplitConfig;
+use crate::index::pipeline::IndexContext;
+
+/// Split stage — breaks oversized leaf nodes into smaller children.
+///
+/// When a leaf node exceeds the token limit, the stage searches for natural
+/// split points (headings `\n#`, paragraph boundaries `\n\n`) and creates
+/// child nodes from the resulting chunks.
+///
+/// This stage runs after validate (priority 22) at priority 25.
+pub struct SplitStage;
+
+impl SplitStage {
+    /// Create a new split stage.
+    pub fn new() -> Self {
+        Self
+    }
+
+    /// Find natural split points in content.
+    ///
+    /// Returns byte offsets where the content can be split.
+    /// Prioritizes heading boundaries (`\n#`), then paragraph breaks (`\n\n`).
+    fn find_split_points(content: &str, max_tokens: usize) -> Vec<usize> {
+        let total_tokens = estimate_tokens(content);
+        if total_tokens <= max_tokens {
+            return Vec::new();
+        }
+
+        // Estimate how many parts we need
+        let estimated_parts = (total_tokens + max_tokens - 1) / max_tokens;
+        let target_size = content.len() / estimated_parts.max(1);
+
+        let mut points = Vec::new();
+
+        // First pass: find heading boundaries
+        let mut last_split = 0;
+        for (i, line) in content.lines().enumerate() {
+            let byte_offset = line.as_ptr() as usize - content.as_ptr() as usize;
+            if i > 0 && line.starts_with('#') && byte_offset > last_split {
+                let chunk_tokens = estimate_tokens(&content[last_split..byte_offset]);
+                if chunk_tokens >= max_tokens / 2 {
+                    points.push(byte_offset);
+                    last_split = byte_offset;
+                }
+            }
+        }
+
+        // If heading splits are sufficient, return them
+        if !points.is_empty() {
+            let approx_size = content.len() / (points.len() + 1);
+            if approx_size <= target_size * 2 {
+                return points;
+            }
+        }
+
+        // Second pass: use paragraph boundaries
+        points.clear();
+        let mut pos = 0;
+        for paragraph in content.split("\n\n") {
+            let para_end = pos + paragraph.len();
+            if para_end > 0 && pos > 0 {
+                let chunk_tokens =
+                    estimate_tokens(&content[points.last().copied().unwrap_or(0)..pos]);
+                if chunk_tokens >= max_tokens / 2 {
+                    points.push(pos);
+                }
+            }
+            pos = para_end + 2; // skip "\n\n"
+        }
+
+        // If still not enough split points, use approximate byte boundaries
+        if points.is_empty() {
+            let bytes_per_token = content.len().max(1) / total_tokens.max(1);
+            let target_bytes = max_tokens * bytes_per_token;
+
+            let mut offset = target_bytes;
+            while offset < content.len() {
+                // Find the nearest newline
+                if let Some(nl_pos) = content[offset..].find('\n') {
+                    points.push(offset + nl_pos);
+                } else {
+                    break;
+                }
+                offset += target_bytes;
+            }
+        }
+
+        points
+    }
+
+    /// Split a single leaf node into children.
+    ///
+    /// Returns the number of new children created.
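+    ///
+    /// Illustrative behavior (hypothetical sizes, not a doctest):
+    ///
+    /// ```text
+    /// leaf "Guide" (12k tokens, limit 8k)
+    ///   -> child "Setup"          (chunk opened by a "# Setup" heading)
+    ///   -> child "Guide (part 2)" (chunk with no heading)
+    ///   parent content cleared, parent token count reset to 0
+    /// ```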
+    fn split_leaf(tree: &mut DocumentTree, leaf_id: NodeId, max_tokens: usize) -> usize {
+        let content = match tree.get(leaf_id) {
+            Some(node) => node.content.clone(),
+            None => return 0,
+        };
+
+        let split_points = Self::find_split_points(&content, max_tokens);
+        if split_points.is_empty() {
+            return 0;
+        }
+
+        // Extract title for child naming
+        let parent_title = tree
+            .get(leaf_id)
+            .map(|n| n.title.clone())
+            .unwrap_or_default();
+
+        // Create chunks from split points
+        let mut chunks: Vec<&str> = Vec::new();
+        let mut prev = 0;
+        for &point in &split_points {
+            if point > prev {
+                chunks.push(&content[prev..point]);
+            }
+            prev = point;
+        }
+        if prev < content.len() {
+            chunks.push(&content[prev..]);
+        }
+
+        // Count only the children actually created; empty chunks are skipped.
+        let mut child_count = 0;
+        for (i, chunk) in chunks.into_iter().enumerate() {
+            let chunk_trimmed = chunk.trim();
+            if chunk_trimmed.is_empty() {
+                continue;
+            }
+
+            // Try to extract a title from the first line
+            let title = if chunk_trimmed.starts_with('#') {
+                chunk_trimmed
+                    .lines()
+                    .next()
+                    .unwrap_or("")
+                    .trim_start_matches('#')
+                    .trim()
+                    .to_string()
+            } else {
+                format!("{} (part {})", parent_title, i + 1)
+            };
+
+            let child_id = tree.add_child(leaf_id, &title, chunk_trimmed);
+            let token_count = estimate_tokens(chunk_trimmed);
+            tree.set_token_count(child_id, token_count);
+            child_count += 1;
+        }
+
+        // Clear parent's content (moved to children)
+        tree.set_content(leaf_id, "");
+        tree.set_token_count(leaf_id, 0);
+
+        child_count
+    }
+
+    /// Process all oversized leaf nodes in the tree.
+    fn split_tree(tree: &mut DocumentTree, config: &SplitConfig) -> usize {
+        if !config.enabled {
+            return 0;
+        }
+
+        // Collect leaves first to avoid borrow issues
+        let leaves: Vec<NodeId> = tree.leaves();
+        let mut total_split = 0;
+
+        for leaf_id in leaves {
+            // Check if this leaf exceeds the token limit
+            let token_count = tree.get(leaf_id).and_then(|n| n.token_count).unwrap_or(0);
+
+            // Use estimated tokens if no count set
+            let tokens = if token_count > 0 {
+                token_count
+            } else {
+                tree.get(leaf_id)
+                    .map(|n| estimate_tokens(&n.content))
+                    .unwrap_or(0)
+            };
+
+            if tokens > config.max_tokens_per_node {
+                let split_count = Self::split_leaf(tree, leaf_id, config.max_tokens_per_node);
+                if split_count > 0 {
+                    total_split += 1;
+                }
+            }
+        }
+
+        total_split
+    }
+}
+
+impl Default for SplitStage {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl IndexStage for SplitStage {
+    fn name(&self) -> &'static str {
+        "split"
+    }
+
+    fn depends_on(&self) -> Vec<&'static str> {
+        vec!["build"]
+    }
+
+    fn is_optional(&self) -> bool {
+        true
+    }
+
+    fn access_pattern(&self) -> AccessPattern {
+        AccessPattern {
+            reads_tree: true,
+            writes_tree: true,
+            writes_reasoning_index: false,
+            writes_navigation_index: false,
+            writes_description: false,
+            writes_concepts: false,
+        }
+    }
+
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+        let start = Instant::now();
+
+        let tree = match ctx.tree.as_mut() {
+            Some(t) => t,
+            None => {
+                info!("[split] No tree, skipping");
+                return Ok(StageResult::success("split"));
+            }
+        };
+
+        let config = &ctx.options.split;
+        if !config.enabled {
+            debug!("[split] Disabled, skipping");
+            return Ok(StageResult::success("split"));
+        }
+
+        info!(
+            "[split] Starting: max_tokens_per_node={}",
+            config.max_tokens_per_node
+        );
+
+        let node_count_before = tree.node_count();
+        let split_count = Self::split_tree(tree, config);
+        let node_count_after = tree.node_count();
+
+        let duration = start.elapsed().as_millis() as u64;
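+        // Stage accounting: record the split duration and how many leaves were split.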
+        ctx.metrics.record_split(duration);
+        ctx.metrics.nodes_merged += split_count;
+
+        info!(
+            "[split] Complete: {} nodes split ({} → {} total) in {}ms",
+            split_count, node_count_before, node_count_after, duration
+        );
+
+        let mut stage_result = StageResult::success("split");
+        stage_result.duration_ms = duration;
+        stage_result
+            .metadata
+            .insert("nodes_split".to_string(), serde_json::json!(split_count));
+        stage_result.metadata.insert(
+            "node_count_before".to_string(),
+            serde_json::json!(node_count_before),
+        );
+        stage_result.metadata.insert(
+            "node_count_after".to_string(),
+            serde_json::json!(node_count_after),
+        );
+
+        Ok(stage_result)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_find_split_points_small_content() {
+        let content = "Hello world";
+        let points = SplitStage::find_split_points(content, 8000);
+        assert!(points.is_empty());
+    }
+
+    #[test]
+    fn test_find_split_points_heading_boundaries() {
+        let mut content = String::from("Introduction text that is long enough. ");
+        // Pad to exceed token limit
+        for _ in 0..500 {
+            content.push_str("This is some content. ");
+        }
+        content.push_str("\n## Section One\n");
+        for _ in 0..500 {
+            content.push_str("More content here. ");
+        }
+        content.push_str("\n## Section Two\n");
+        for _ in 0..500 {
+            content.push_str("Final content. ");
+        }
+
+        let points = SplitStage::find_split_points(&content, 200);
+        assert!(!points.is_empty());
+    }
+
+    #[test]
+    fn test_find_split_points_paragraph_boundaries() {
+        let mut content = String::new();
+        for i in 0..10 {
+            for _ in 0..100 {
+                content.push_str(&format!("Paragraph {} content. ", i));
+            }
+            content.push_str("\n\n");
+        }
+
+        let points = SplitStage::find_split_points(&content, 200);
+        assert!(!points.is_empty());
+    }
+
+    #[test]
+    fn test_split_tree_disabled() {
+        let mut tree = DocumentTree::new("Root", "");
+        let child = tree.add_child(
+            tree.root(),
+            "Big",
+            "Very long content here with lots of text that would normally exceed limits",
+        );
+        tree.set_token_count(child, 15000);
+
+        let config = SplitConfig::disabled();
+        let count = SplitStage::split_tree(&mut tree, &config);
+        assert_eq!(count, 0);
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/stages/validate.rs b/vectorless-core/vectorless-index/src/stages/validate.rs
new file mode 100644
index 00000000..b4dbc1d4
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/stages/validate.rs
@@ -0,0 +1,365 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Validate stage - Verify tree integrity after build.
+
+use std::collections::HashSet;
+use std::time::Instant;
+use tracing::{debug, info, warn};
+
+use vectorless_error::Result;
+
+use super::{AccessPattern, IndexStage, StageResult, async_trait};
+use crate::index::pipeline::IndexContext;
+
+/// Maximum allowed tree depth.
+const MAX_DEPTH: usize = 20;
+
+/// Minimum token count ratio for parent vs children consistency check.
+/// A parent's token count should be at least `ratio` of the sum of its children.
+const MIN_PARENT_TOKEN_RATIO: f32 = 0.8;
+
+/// Minimum content length (in bytes) for duplicate detection.
+/// Shorter leaves are skipped; duplicates are exact matches of trimmed content.
+const DUPLICATE_MIN_LENGTH: usize = 50;
+
+/// Validation issue severity.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum Severity {
+    /// Warning — tree is usable but may have quality issues.
+    Warning,
+    /// Error — tree has structural problems.
+    Error,
+}
+
+/// A single validation issue found during tree inspection.
+#[derive(Debug, Clone)]
+struct ValidationIssue {
+    /// Severity level.
+    severity: Severity,
+    /// Human-readable description.
+    message: String,
+}
+
+/// Validate stage — checks tree integrity after build.
+///
+/// Validates:
+/// 1. Tree structural integrity (all nodes reachable from root)
+/// 2. Depth sanity (max depth < 20)
+/// 3. Empty title detection on leaf nodes
+/// 4. Token count consistency (parent holds at least a fixed ratio of its children's tokens)
+/// 5. Content duplication detection
pub struct ValidateStage;
+
+impl ValidateStage {
+    /// Create a new validate stage.
+    pub fn new() -> Self {
+        Self
+    }
+
+    /// Run all validation checks and collect issues.
+    fn validate_tree(&self, ctx: &IndexContext) -> Vec<ValidationIssue> {
+        let tree = match ctx.tree.as_ref() {
+            Some(t) => t,
+            None => {
+                return vec![ValidationIssue {
+                    severity: Severity::Error,
+                    message: "No tree available for validation".to_string(),
+                }];
+            }
+        };
+
+        let mut issues = Vec::new();
+
+        Self::check_depth(tree, &mut issues);
+        Self::check_empty_titles(tree, &mut issues);
+        Self::check_token_consistency(tree, &mut issues);
+        Self::check_content_duplication(tree, &mut issues);
+
+        issues
+    }
+
+    /// Check that tree depth is reasonable.
+    fn check_depth(tree: &vectorless_document::DocumentTree, issues: &mut Vec<ValidationIssue>) {
+        let all_nodes = tree.traverse();
+        let max_depth = all_nodes
+            .iter()
+            .map(|&id| tree.depth(id))
+            .max()
+            .unwrap_or(0);
+
+        if max_depth > MAX_DEPTH {
+            issues.push(ValidationIssue {
+                severity: Severity::Warning,
+                message: format!(
+                    "Tree depth ({}) exceeds recommended maximum ({})",
+                    max_depth, MAX_DEPTH
+                ),
+            });
+        }
+    }
+
+    /// Check for leaf nodes with empty titles.
+    fn check_empty_titles(
+        tree: &vectorless_document::DocumentTree,
+        issues: &mut Vec<ValidationIssue>,
+    ) {
+        let leaves = tree.leaves();
+        let mut empty_count = 0;
+
+        for &leaf_id in &leaves {
+            if let Some(node) = tree.get(leaf_id) {
+                if node.title.trim().is_empty() {
+                    empty_count += 1;
+                }
+            }
+        }
+
+        if empty_count > 0 {
+            issues.push(ValidationIssue {
+                severity: Severity::Warning,
+                message: format!("Found {} leaf nodes with empty titles", empty_count),
+            });
+        }
+    }
+
+    /// Check token count consistency: a parent's token count should be at least
+    /// [`MIN_PARENT_TOKEN_RATIO`] of the sum of its children's.
+    fn check_token_consistency(
+        tree: &vectorless_document::DocumentTree,
+        issues: &mut Vec<ValidationIssue>,
+    ) {
+        let all_nodes = tree.traverse();
+        let mut inconsistent = 0;
+
+        for &node_id in &all_nodes {
+            let children: Vec<_> = tree.children(node_id);
+            if children.is_empty() {
+                continue;
+            }
+
+            let parent_tokens = tree.get(node_id).and_then(|n| n.token_count).unwrap_or(0);
+
+            let children_sum: usize = children
+                .iter()
+                .map(|&c| tree.get(c).and_then(|n| n.token_count).unwrap_or(0))
+                .sum();
+
+            // Parent should have at least some proportion of children's tokens
+            // (parent has its own content plus children, but after thinning this may vary)
+            if parent_tokens > 0
+                && children_sum > 0
+                && (parent_tokens as f32 / children_sum as f32) < MIN_PARENT_TOKEN_RATIO
+            {
+                // Only flag if both are non-trivial
+                if children_sum >= 100 {
+                    inconsistent += 1;
+                }
+            }
+        }
+
+        if inconsistent > 0 {
+            issues.push(ValidationIssue {
+                severity: Severity::Warning,
+                message: format!(
+                    "Found {} nodes with token counts significantly less than their children's sum",
+                    inconsistent
+                ),
+            });
+        }
+    }
+
+    /// Check for content duplication across leaf nodes.
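+    ///
+    /// Leaves shorter than [`DUPLICATE_MIN_LENGTH`] are skipped; longer leaves
+    /// are FNV-1a hashed and flagged on an exact hash match, e.g. (hash values
+    /// illustrative):
+    ///
+    /// ```text
+    /// leaf A: "Shared boilerplate paragraph..." -> 0x9c41  (first seen)
+    /// leaf B: "Shared boilerplate paragraph..." -> 0x9c41  (flagged as duplicate)
+    /// ```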
+    fn check_content_duplication(
+        tree: &vectorless_document::DocumentTree,
+        issues: &mut Vec<ValidationIssue>,
+    ) {
+        let leaves = tree.leaves();
+        let mut seen: HashSet<u64> = HashSet::new();
+        let mut duplicate_count = 0;
+
+        for &leaf_id in &leaves {
+            if let Some(node) = tree.get(leaf_id) {
+                let content = node.content.trim();
+                if content.len() < DUPLICATE_MIN_LENGTH {
+                    continue;
+                }
+
+                // Simple hash of normalized content for duplicate detection
+                let hash = Self::simple_hash(content);
+                if !seen.insert(hash) {
+                    duplicate_count += 1;
+                }
+            }
+        }
+
+        if duplicate_count > 0 {
+            issues.push(ValidationIssue {
+                severity: Severity::Warning,
+                message: format!(
+                    "Found {} leaf nodes with duplicate content",
+                    duplicate_count
+                ),
+            });
+        }
+    }
+
+    /// Simple FNV-1a-like hash for duplicate detection.
+    /// Not cryptographic — just for grouping identical content.
+    fn simple_hash(s: &str) -> u64 {
+        let mut hash: u64 = 0xcbf29ce484222325;
+        for byte in s.bytes() {
+            hash ^= byte as u64;
+            hash = hash.wrapping_mul(0x100000001b3);
+        }
+        hash
+    }
+}
+
+impl Default for ValidateStage {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+#[async_trait]
+impl IndexStage for ValidateStage {
+    fn name(&self) -> &'static str {
+        "validate"
+    }
+
+    fn depends_on(&self) -> Vec<&'static str> {
+        vec!["build"]
+    }
+
+    fn is_optional(&self) -> bool {
+        true
+    }
+
+    fn access_pattern(&self) -> AccessPattern {
+        AccessPattern {
+            reads_tree: true,
+            writes_tree: false,
+            writes_reasoning_index: false,
+            writes_navigation_index: false,
+            writes_description: false,
+            writes_concepts: false,
+        }
+    }
+
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+        let start = Instant::now();
+
+        let node_count = ctx.tree.as_ref().map(|t| t.node_count()).unwrap_or(0);
+        info!("[validate] Starting: {} nodes", node_count);
+
+        let issues = self.validate_tree(ctx);
+
+        let warnings = issues
+            .iter()
+            .filter(|i| i.severity == Severity::Warning)
+            .count();
+        let errors = issues
+            .iter()
+            .filter(|i| i.severity == Severity::Error)
+            .count();
+
+        // Log all issues
+        for issue in &issues {
+            match issue.severity {
+                Severity::Warning => warn!("[validate] {}", issue.message),
+                Severity::Error => warn!("[validate] ERROR: {}", issue.message),
+            }
+        }
+
+        if warnings == 0 && errors == 0 {
+            debug!("[validate] No issues found");
+        }
+
+        let duration = start.elapsed().as_millis() as u64;
+        ctx.metrics.record_validate(duration);
+
+        info!(
+            "[validate] Complete: {} warnings, {} errors in {}ms",
+            warnings, errors, duration
+        );
+
+        let mut stage_result = StageResult::success("validate");
+        stage_result.duration_ms = duration;
+        stage_result
+            .metadata
+            .insert("warnings".to_string(), serde_json::json!(warnings));
+        stage_result
+            .metadata
+            .insert("errors".to_string(), serde_json::json!(errors));
+
+        Ok(stage_result)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use vectorless_document::DocumentTree;
+
+    fn make_context_with_tree(tree: DocumentTree) -> IndexContext {
+        let input = crate::index::IndexInput::content("test");
+        let options = crate::index::config::PipelineOptions::default();
+        let mut ctx = IndexContext::new(input, options);
+        ctx.tree = Some(tree);
+        ctx
+    }
+
+    #[test]
+    fn test_validate_empty_tree() {
+        let tree = DocumentTree::new("Root", "");
+        let ctx = make_context_with_tree(tree);
+
+        let stage = ValidateStage::new();
+        let issues = stage.validate_tree(&ctx);
+
+        // Single root node is valid — no issues expected
+        assert!(issues.is_empty());
+    }
+
+    #[test]
+    fn test_validate_simple_tree() {
+        let mut tree = DocumentTree::new("Root", "");
+        let child = tree.add_child(tree.root(), "Section 1", "Content of section 1");
+        tree.set_token_count(child, 100);
+
+        let ctx = make_context_with_tree(tree);
+
+        let stage = ValidateStage::new();
+        let issues = stage.validate_tree(&ctx);
+
+        assert!(issues.is_empty());
+    }
+
+    #[test]
+    fn test_validate_empty_title_warning() {
+        let mut tree = DocumentTree::new("Root", "");
+        let child = tree.add_child(tree.root(), "", "Some content here");
+        tree.set_token_count(child, 50);
+
+        let ctx = make_context_with_tree(tree);
+
+        let stage = ValidateStage::new();
+        let issues = stage.validate_tree(&ctx);
+
+        let warning_count = issues
+            .iter()
+            .filter(|i| i.message.contains("empty titles"))
+            .count();
+        assert_eq!(warning_count, 1);
+    }
+
+    #[test]
+    fn test_validate_no_tree_error() {
+        let input = crate::index::IndexInput::content("test");
+        let options = crate::index::config::PipelineOptions::default();
+        let ctx = IndexContext::new(input, options);
+
+        let stage = ValidateStage::new();
+        let issues = stage.validate_tree(&ctx);
+
+        assert_eq!(issues.len(), 1);
+        assert_eq!(issues[0].severity, Severity::Error);
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/stages/verify_ingest.rs b/vectorless-core/vectorless-index/src/stages/verify_ingest.rs
new file mode 100644
index 00000000..db5f0051
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/stages/verify_ingest.rs
@@ -0,0 +1,79 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Verify stage — validates ingest output reliability before persist.
+
+use tracing::{info, warn};
+
+use super::{AccessPattern, IndexStage};
+use vectorless_error::{Error, Result};
+use crate::index::pipeline::{IndexContext, StageResult};
+use super::async_trait;
+
+/// Verification stage — ensures ingest produced reliable output.
+///
+/// Checks:
+/// - Tree is non-empty (at least root node)
+/// - Document summary is non-empty
+/// - At least one concept was extracted
+///
+/// A missing or empty tree is a hard error; an empty summary or zero
+/// extracted concepts are logged as warnings and the stage still passes.
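+///
+/// Illustrative outcomes:
+///
+/// ```text
+/// tree missing / zero nodes -> Err(InvalidStructure)
+/// empty document summary    -> warn + pass
+/// zero extracted concepts   -> warn + pass
+/// ```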
+pub struct VerifyStage;
+
+#[async_trait]
+impl IndexStage for VerifyStage {
+    fn name(&self) -> &'static str {
+        "verify"
+    }
+
+    fn depends_on(&self) -> Vec<&'static str> {
+        vec!["concept_extraction"]
+    }
+
+    fn is_optional(&self) -> bool {
+        false
+    }
+
+    fn access_pattern(&self) -> AccessPattern {
+        AccessPattern {
+            reads_tree: true,
+            ..AccessPattern::default()
+        }
+    }
+
+    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
+        // Tree must exist and have nodes
+        let tree = ctx.tree.as_ref().ok_or_else(|| {
+            Error::InvalidStructure("document tree is missing".into())
+        })?;
+        let node_count = tree.node_count();
+        if node_count == 0 {
+            return Err(Error::InvalidStructure(
+                "tree has no nodes".into(),
+            ));
+        }
+
+        // Summary should be non-empty (warning only — non-fatal)
+        let has_summary = ctx
+            .description
+            .as_ref()
+            .is_some_and(|s| !s.trim().is_empty());
+        if !has_summary {
+            warn!("[verify] Document summary is empty");
+        }
+
+        // Concepts should be present (warning only — non-fatal)
+        if ctx.concepts.is_empty() {
+            warn!("[verify] No concepts extracted from document");
+        }
+
+        info!(
+            "[verify] Passed: {} nodes, summary={}, concepts={}",
+            node_count,
+            has_summary,
+            ctx.concepts.len()
+        );
+
+        Ok(StageResult::success("verify"))
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/summary/full.rs b/vectorless-core/vectorless-index/src/summary/full.rs
new file mode 100644
index 00000000..bc8a3a92
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/summary/full.rs
@@ -0,0 +1,65 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Full summary strategy - generate summaries for all nodes.
+
+use vectorless_document::NodeId;
+use vectorless_llm::LlmClient;
+
+use super::{SummaryGenerator, SummaryStrategyConfig};
+
+/// Full summary strategy - generates summaries for all nodes.
+pub struct FullStrategy {
+    /// Summary generator.
+    generator: Box<dyn SummaryGenerator>,
+    /// Configuration.
+    config: SummaryStrategyConfig,
+}
+
+impl FullStrategy {
+    /// Create a new full strategy with LLM client.
+    pub fn new(client: LlmClient) -> Self {
+        Self {
+            generator: Box::new(super::LlmSummaryGenerator::new(client)),
+            config: SummaryStrategyConfig::default(),
+        }
+    }
+
+    /// Create with custom generator.
+    pub fn with_generator(generator: Box<dyn SummaryGenerator>) -> Self {
+        Self {
+            generator,
+            config: SummaryStrategyConfig::default(),
+        }
+    }
+
+    /// Set configuration.
+    pub fn with_config(mut self, config: SummaryStrategyConfig) -> Self {
+        self.config = config;
+        self
+    }
+
+    /// Check if a node should have a summary generated.
+    pub fn should_generate(&self, _node_id: NodeId, content_tokens: usize) -> bool {
+        // In full mode, generate for all nodes with content
+        content_tokens >= self.config.min_content_tokens
+    }
+
+    /// Generate a summary for content.
+    pub async fn generate(&self, title: &str, content: &str) -> vectorless_llm::LlmResult<String> {
+        self.generator.generate(title, content).await
+    }
+
+    /// Get the configuration.
+    pub fn config(&self) -> &SummaryStrategyConfig {
+        &self.config
+    }
+}
+
+impl std::fmt::Debug for FullStrategy {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("FullStrategy")
+            .field("config", &self.config)
+            .finish()
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/summary/lazy.rs b/vectorless-core/vectorless-index/src/summary/lazy.rs
new file mode 100644
index 00000000..29821d5b
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/summary/lazy.rs
@@ -0,0 +1,153 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Lazy summary strategy - generate summaries on-demand at query time.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+use tokio::sync::RwLock;
+
+use vectorless_llm::LlmClient;
+
+use super::{SummaryGenerator, SummaryStrategyConfig};
+
+/// Lazy summary strategy - generates summaries on-demand.
+///
+/// Summaries are generated when first requested and optionally cached
+/// for future use.
+pub struct LazyStrategy {
+    /// Summary generator.
+    generator: Arc<RwLock<Box<dyn SummaryGenerator>>>,
+    /// Cache of generated summaries (node_id -> summary).
+    cache: Arc<RwLock<HashMap<String, String>>>,
+    /// Whether to persist generated summaries.
+    persist: bool,
+    /// Configuration.
+    config: SummaryStrategyConfig,
+}
+
+impl LazyStrategy {
+    /// Create a new lazy strategy with LLM client.
+    pub fn new(client: LlmClient) -> Self {
+        Self {
+            generator: Arc::new(RwLock::new(Box::new(super::LlmSummaryGenerator::new(
+                client,
+            )))),
+            cache: Arc::new(RwLock::new(HashMap::new())),
+            persist: false,
+            config: SummaryStrategyConfig::default(),
+        }
+    }
+
+    /// Create with persistence enabled.
+    pub fn with_persist(client: LlmClient, persist: bool) -> Self {
+        Self {
+            generator: Arc::new(RwLock::new(Box::new(super::LlmSummaryGenerator::new(
+                client,
+            )))),
+            cache: Arc::new(RwLock::new(HashMap::new())),
+            persist,
+            config: SummaryStrategyConfig::default(),
+        }
+    }
+
+    /// Create with custom generator.
+    pub fn with_generator(generator: Box<dyn SummaryGenerator>) -> Self {
+        Self {
+            generator: Arc::new(RwLock::new(generator)),
+            cache: Arc::new(RwLock::new(HashMap::new())),
+            persist: false,
+            config: SummaryStrategyConfig::default(),
+        }
+    }
+
+    /// Set persistence mode.
+    pub fn with_persist_mode(mut self, persist: bool) -> Self {
+        self.persist = persist;
+        self
+    }
+
+    /// Set configuration.
+    pub fn with_config(mut self, config: SummaryStrategyConfig) -> Self {
+        self.config = config;
+        self
+    }
+
+    /// Check if a cached summary exists.
+    pub async fn has_cached(&self, node_id: &str) -> bool {
+        let cache = self.cache.read().await;
+        cache.contains_key(node_id)
+    }
+
+    /// Get a cached summary if available.
+    pub async fn get_cached(&self, node_id: &str) -> Option<String> {
+        let cache = self.cache.read().await;
+        cache.get(node_id).cloned()
+    }
+
+    /// Get or generate a summary.
+    ///
+    /// Returns the cached summary if available, otherwise generates a new one.
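+    ///
+    /// Sketch of the caching behavior with persistence on (names abbreviated,
+    /// not a doctest):
+    ///
+    /// ```text
+    /// let lazy = LazyStrategy::with_persist(client, true);
+    /// let s1 = lazy.get_or_generate("n1", "Intro", text).await?; // LLM call
+    /// let s2 = lazy.get_or_generate("n1", "Intro", text).await?; // cache hit, no LLM call
+    /// ```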
+    pub async fn get_or_generate(
+        &self,
+        node_id: &str,
+        title: &str,
+        content: &str,
+    ) -> vectorless_llm::LlmResult<String> {
+        // Check cache first (the cache is only consulted when persistence is on)
+        if self.persist {
+            if let Some(cached) = self.get_cached(node_id).await {
+                return Ok(cached);
+            }
+        }
+
+        // Generate new summary
+        let generator = self.generator.read().await;
+        let summary = generator.generate(title, content).await?;
+
+        // Cache if persistence is enabled
+        if self.persist {
+            let mut cache = self.cache.write().await;
+            cache.insert(node_id.to_string(), summary.clone());
+        }
+
+        Ok(summary)
+    }
+
+    /// Pre-populate the cache with existing summaries.
+    pub async fn populate_cache(&self, summaries: HashMap<String, String>) {
+        let mut cache = self.cache.write().await;
+        cache.extend(summaries);
+    }
+
+    /// Clear the cache.
+    pub async fn clear_cache(&self) {
+        let mut cache = self.cache.write().await;
+        cache.clear();
+    }
+
+    /// Get cache size.
+    pub async fn cache_size(&self) -> usize {
+        let cache = self.cache.read().await;
+        cache.len()
+    }
+
+    /// Check if persistence is enabled.
+    pub fn is_persist_enabled(&self) -> bool {
+        self.persist
+    }
+
+    /// Get the configuration.
+    pub fn config(&self) -> &SummaryStrategyConfig {
+        &self.config
+    }
+}
+
+impl std::fmt::Debug for LazyStrategy {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("LazyStrategy")
+            .field("persist", &self.persist)
+            .field("config", &self.config)
+            .finish()
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/summary/mod.rs b/vectorless-core/vectorless-index/src/summary/mod.rs
new file mode 100644
index 00000000..f87593d0
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/summary/mod.rs
@@ -0,0 +1,24 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Summary generation strategies.
+//!
+//! This module provides different strategies for generating summaries:
+//! - [`SummaryStrategy`] - Strategy selection for summary generation
+//! - [`SummaryStrategyConfig`] - Configuration options
+//! - [`SummaryGenerator`] - Trait for summary generation
+//! - [`LlmSummaryGenerator`] - LLM-based implementation
+//!
+//! # Strategies
+//!
+//! - **None**: No summary generation
+//! - **Full**: Generate summaries for all nodes (default)
+//! - **Selective**: Generate summaries only for qualifying nodes
+//! - **Lazy**: Generate summaries on-demand at query time
+
+mod full;
+mod lazy;
+mod selective;
+mod strategy;
+
+pub use strategy::{LlmSummaryGenerator, SummaryGenerator, SummaryStrategy, SummaryStrategyConfig};
diff --git a/vectorless-core/vectorless-index/src/summary/selective.rs b/vectorless-core/vectorless-index/src/summary/selective.rs
new file mode 100644
index 00000000..f933d988
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/summary/selective.rs
@@ -0,0 +1,120 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Selective summary strategy - generate summaries only for qualifying nodes.
+
+use vectorless_document::{DocumentTree, NodeId};
+use vectorless_llm::LlmClient;
+
+use super::{SummaryGenerator, SummaryStrategyConfig};
+
+/// Selective summary strategy - generates summaries only for nodes that meet criteria.
+pub struct SelectiveStrategy {
+    /// Summary generator.
+    generator: Box<dyn SummaryGenerator>,
+    /// Minimum token threshold.
+    min_tokens: usize,
+    /// Only generate for branch nodes (non-leaves).
+    branch_only: bool,
+    /// Configuration.
+    config: SummaryStrategyConfig,
+}
+
+impl SelectiveStrategy {
+    /// Create a new selective strategy with default settings.
+    pub fn new(client: LlmClient) -> Self {
+        Self {
+            generator: Box::new(super::LlmSummaryGenerator::new(client)),
+            min_tokens: 100,
+            branch_only: true,
+            config: SummaryStrategyConfig::default(),
+        }
+    }
+
+    /// Create with custom thresholds.
+    pub fn with_thresholds(client: LlmClient, min_tokens: usize, branch_only: bool) -> Self {
+        Self {
+            generator: Box::new(super::LlmSummaryGenerator::new(client)),
+            min_tokens,
+            branch_only,
+            config: SummaryStrategyConfig::default(),
+        }
+    }
+
+    /// Create with custom generator.
+    pub fn with_generator(generator: Box<dyn SummaryGenerator>) -> Self {
+        Self {
+            generator,
+            min_tokens: 100,
+            branch_only: true,
+            config: SummaryStrategyConfig::default(),
+        }
+    }
+
+    /// Set minimum token threshold.
+    pub fn with_min_tokens(mut self, min_tokens: usize) -> Self {
+        self.min_tokens = min_tokens;
+        self
+    }
+
+    /// Set branch-only mode.
+    pub fn with_branch_only(mut self, branch_only: bool) -> Self {
+        self.branch_only = branch_only;
+        self
+    }
+
+    /// Set configuration.
+    pub fn with_config(mut self, config: SummaryStrategyConfig) -> Self {
+        self.config = config;
+        self
+    }
+
+    /// Check if a node should have a summary generated.
+    pub fn should_generate(
+        &self,
+        tree: &DocumentTree,
+        node_id: NodeId,
+        token_count: usize,
+    ) -> bool {
+        // Check token threshold
+        let enough_tokens = token_count >= self.min_tokens;
+
+        // Check if branch-only
+        if self.branch_only {
+            let is_branch = !tree.is_leaf(node_id);
+            is_branch && enough_tokens
+        } else {
+            enough_tokens
+        }
+    }
+
+    /// Generate a summary for content.
+    pub async fn generate(&self, title: &str, content: &str) -> vectorless_llm::LlmResult<String> {
+        self.generator.generate(title, content).await
+    }
+
+    /// Get the minimum token threshold.
+    pub fn min_tokens(&self) -> usize {
+        self.min_tokens
+    }
+
+    /// Check if branch-only mode is enabled.
+    pub fn is_branch_only(&self) -> bool {
+        self.branch_only
+    }
+
+    /// Get the configuration.
+    pub fn config(&self) -> &SummaryStrategyConfig {
+        &self.config
+    }
+}
+
+impl std::fmt::Debug for SelectiveStrategy {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("SelectiveStrategy")
+            .field("min_tokens", &self.min_tokens)
+            .field("branch_only", &self.branch_only)
+            .field("config", &self.config)
+            .finish()
+    }
+}
diff --git a/vectorless-core/vectorless-index/src/summary/strategy.rs b/vectorless-core/vectorless-index/src/summary/strategy.rs
new file mode 100644
index 00000000..03f7a6c1
--- /dev/null
+++ b/vectorless-core/vectorless-index/src/summary/strategy.rs
@@ -0,0 +1,322 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Summary generation strategies.
+
+use async_trait::async_trait;
+
+use vectorless_document::{DocumentTree, NodeId};
+use vectorless_llm::memo::{MemoKey, MemoStore, MemoValue};
+use vectorless_llm::{LlmClient, LlmResult};
+use vectorless_utils::fingerprint::Fingerprint;
+
+/// Configuration for summary strategies.
+#[derive(Debug, Clone)]
+pub struct SummaryStrategyConfig {
+    /// Maximum tokens for a summary.
+    pub max_tokens: usize,
+
+    /// Minimum content tokens to generate summary.
+    pub min_content_tokens: usize,
+
+    /// Whether to persist lazy-generated summaries.
+    pub persist_lazy: bool,
+
+    /// Token threshold below which the original content is used as summary
+    /// instead of calling LLM. Saves API cost for short, self-contained nodes.
+    /// Set to 0 to always call LLM.
+    pub shortcut_threshold: usize,
+}
+
+impl Default for SummaryStrategyConfig {
+    fn default() -> Self {
+        Self {
+            max_tokens: 200,
+            min_content_tokens: 50,
+            persist_lazy: false,
+            shortcut_threshold: 50,
+        }
+    }
+}
+
+/// Strategy for generating summaries.
+#[derive(Debug, Clone)]
+pub enum SummaryStrategy {
+    /// No summary generation.
+    None,
+
+    /// Generate for all nodes.
+    Full {
+        /// Strategy configuration.
+        config: SummaryStrategyConfig,
+    },
+
+    /// Generate selectively.
+    Selective {
+        /// Minimum tokens threshold.
+        min_tokens: usize,
+
+        /// Only generate for branch nodes (non-leaves).
+        branch_only: bool,
+
+        /// Strategy configuration.
+        config: SummaryStrategyConfig,
+    },
+
+    /// Generate on-demand at query time.
+    Lazy {
+        /// Whether to persist generated summaries.
+        persist: bool,
+
+        /// Strategy configuration.
+        config: SummaryStrategyConfig,
+    },
+}
+
+impl Default for SummaryStrategy {
+    fn default() -> Self {
+        Self::Full {
+            config: SummaryStrategyConfig::default(),
+        }
+    }
+}
+
+impl SummaryStrategy {
+    /// Create a "none" strategy.
+    pub fn none() -> Self {
+        Self::None
+    }
+
+    /// Create a "full" strategy.
+    pub fn full() -> Self {
+        Self::Full {
+            config: SummaryStrategyConfig::default(),
+        }
+    }
+
+    /// Create a "selective" strategy.
+    pub fn selective(min_tokens: usize, branch_only: bool) -> Self {
+        Self::Selective {
+            min_tokens,
+            branch_only,
+            config: SummaryStrategyConfig::default(),
+        }
+    }
+
+    /// Create a "lazy" strategy.
+    pub fn lazy(persist: bool) -> Self {
+        Self::Lazy {
+            persist,
+            config: SummaryStrategyConfig::default(),
+        }
+    }
+
+    /// Check if we should generate a summary for a node.
+    pub fn should_generate(
+        &self,
+        tree: &DocumentTree,
+        node_id: NodeId,
+        token_count: usize,
+    ) -> bool {
+        match self {
+            Self::None => false,
+            Self::Full { .. } => token_count > 0,
+            Self::Selective {
+                min_tokens,
+                branch_only,
+                ..
+            } => {
+                let is_branch = !tree.is_leaf(node_id);
+                let enough_tokens = token_count >= *min_tokens;
+
+                if *branch_only {
+                    is_branch && enough_tokens
+                } else {
+                    enough_tokens
+                }
+            }
+            Self::Lazy { .. } => false, // Generated on-demand
+        }
+    }
+
+    /// Check if lazy strategy is enabled.
+    pub fn is_lazy(&self) -> bool {
+        matches!(self, Self::Lazy { .. })
+    }
+
+    /// Get the config.
+    pub fn config(&self) -> SummaryStrategyConfig {
+        match self {
+            Self::None => SummaryStrategyConfig::default(),
+            Self::Full { config } => config.clone(),
+            Self::Selective { config, .. } => config.clone(),
+            Self::Lazy { config, .. } => config.clone(),
+        }
+    }
+
+    /// Get the shortcut threshold (tokens below which content is used as-is).
+    pub fn shortcut_threshold(&self) -> usize {
+        self.config().shortcut_threshold
+    }
+}
+
+/// Summary generator trait.
+#[async_trait]
+pub trait SummaryGenerator: Send + Sync {
+    /// Generate a summary for the given content.
+    async fn generate(&self, title: &str, content: &str) -> LlmResult<String>;
+
+    /// Generate a summary with leaf/non-leaf context.
+    /// Non-leaf nodes get a navigation-oriented prompt ("what does this section cover"),
+    /// leaf nodes get a content-oriented prompt ("what does this section say").
+    async fn generate_for_node(
+        &self,
+        title: &str,
+        content: &str,
+        is_leaf: bool,
+    ) -> LlmResult<String> {
+        let _ = is_leaf;
+        self.generate(title, content).await
+    }
+}
+
+/// LLM-based summary generator.
+pub struct LlmSummaryGenerator {
+    client: LlmClient,
+    max_tokens: usize,
+    /// Optional memo store for caching results.
+    memo_store: Option<MemoStore>,
+}
+
+impl LlmSummaryGenerator {
+    /// Create a new summary generator.
+    pub fn new(client: LlmClient) -> Self {
+        Self {
+            client,
+            max_tokens: 200,
+            memo_store: None,
+        }
+    }
+
+    /// Set max tokens.
+    pub fn with_max_tokens(mut self, max_tokens: usize) -> Self {
+        self.max_tokens = max_tokens;
+        self
+    }
+
+    /// Set memo store for caching.
+    pub fn with_memo_store(mut self, store: MemoStore) -> Self {
+        self.memo_store = Some(store);
+        self
+    }
+}
+
+#[async_trait]
+impl SummaryGenerator for LlmSummaryGenerator {
+    async fn generate(&self, title: &str, content: &str) -> LlmResult<String> {
+        // Compute content fingerprint for cache key
+        let content_fp = Fingerprint::from_str(&format!("{}|{}", title, content));
+        let memo_key = MemoKey::summary(&content_fp);
+
+        // Check memo store first
+        if let Some(ref store) = self.memo_store {
+            if let Some(cached) = store.get(&memo_key) {
+                if let Some(summary) = cached.as_summary() {
+                    tracing::debug!("Memo cache hit for summary: {}", title);
+                    return Ok(summary.to_string());
+                }
+            }
+        }
+
+        // Generate with LLM
+        let system_prompt = "You are a document summarization assistant. \
+            Generate a concise summary (2-3 sentences) of the given section. \
+            Focus on the main topics and key information. \
+            Respond with only the summary, no additional text.";
+
+        let user_prompt = format!("Title: {}\n\nContent:\n{}", title, content);
+
+        let summary = self
+            .client
+            .complete_with_max_tokens(&system_prompt, &user_prompt, self.max_tokens as u16)
+            .await?;
+
+        // Cache the result
+        if let Some(ref store) = self.memo_store {
+            // Estimate tokens saved (roughly: input + output tokens)
+            let tokens_saved = (title.len() + content.len() + summary.len()) / 4;
+            store.put_with_tokens(
+                memo_key,
+                MemoValue::Summary(summary.clone()),
+                tokens_saved as u64,
+            );
+            tracing::debug!("Memo cache stored for summary: {}", title);
+        }
+
+        Ok(summary)
+    }
+
+    async fn generate_for_node(
+        &self,
+        title: &str,
+        content: &str,
+        is_leaf: bool,
+    ) -> LlmResult<String> {
+        // Compute content fingerprint for cache key (include leaf flag)
+        let content_fp = Fingerprint::from_str(&format!("{}|{}|leaf={}", title, content, is_leaf));
+        let memo_key = MemoKey::summary(&content_fp);
+
+        // Check memo store first
+        if let Some(ref store) = self.memo_store {
+            if let Some(cached) = store.get(&memo_key) {
+                if let Some(summary) = cached.as_summary() {
+                    tracing::debug!("Memo cache hit for summary: {}", title);
+                    return Ok(summary.to_string());
+                }
+            }
+        }
+
+        // Choose prompt based on node type
+        let system_prompt = if is_leaf {
+            // Leaf nodes: content-oriented — "what does this section say"
+            "You are a document summarization assistant. \
+            Generate a concise summary (2-3 sentences) of the given section's content. \
+            Focus on the key information and facts presented. \
+            Respond with only the summary, no additional text."
+        } else {
+            // Non-leaf (branch) nodes: navigation-oriented with structured output.
+            // Produces OVERVIEW, QUESTIONS, and TAGS sections that EnhanceStage parses.
+            "You are a document navigation assistant. \
+            Generate a structured overview of this section for navigation purposes. \
+            Respond in EXACTLY this format (one section per line):\n\
+            OVERVIEW: <2-3 sentence description of what topics this section covers>\n\
+            QUESTIONS: <2-3 questions this section can answer>\n\
+            TAGS: <comma-separated topic keywords>"
+        };
+
+        let user_prompt = if is_leaf {
+            format!("Title: {}\n\nContent:\n{}", title, content)
+        } else {
+            // For non-leaf nodes the caller is expected to pass content that
+            // already includes children info for better routing summaries.
+            format!("Title: {}\n\nContent:\n{}", title, content)
+        };
+
+        let summary = self
+            .client
+            .complete_with_max_tokens(&system_prompt, &user_prompt, self.max_tokens as u16)
+            .await?;
+
+        // Cache the result
+        if let Some(ref store) = self.memo_store {
+            let tokens_saved = (title.len() + content.len() + summary.len()) / 4;
+            store.put_with_tokens(
+                memo_key,
+                MemoValue::Summary(summary.clone()),
+                tokens_saved as u64,
+            );
+            tracing::debug!("Memo cache stored for summary: {}", title);
+        }
+
+        Ok(summary)
+    }
+}
diff --git a/vectorless-core/vectorless-llm/Cargo.toml b/vectorless-core/vectorless-llm/Cargo.toml
new file mode 100644
index 00000000..a5f5b3a0
--- /dev/null
+++ b/vectorless-core/vectorless-llm/Cargo.toml
@@ -0,0 +1,32 @@
+[package]
+name = "vectorless-llm"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+
+[dependencies]
+vectorless-config = { path = "../vectorless-config" }
+vectorless-error = { path = "../vectorless-error" }
+vectorless-metrics = { path = "../vectorless-metrics" }
+vectorless-utils = { path = "../vectorless-utils" }
+async-openai = { workspace = true }
+tokio = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+tracing = { workspace = true }
+thiserror = { workspace = true }
+chrono = { workspace = true }
+governor = { workspace = true }
+nonzero_ext = { workspace = true }
+lru = { workspace = true }
+parking_lot = { workspace = true }
+uuid = { workspace = true }
+rand = { workspace = true }
+base64 = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/vectorless-core/vectorless-llm/src/client.rs b/vectorless-core/vectorless-llm/src/client.rs
new file mode 100644
index 00000000..56dcae7a
--- /dev/null
+++ b/vectorless-core/vectorless-llm/src/client.rs
@@ -0,0 +1,378 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Unified LLM client with retry and concurrency support.
+
+use serde::de::DeserializeOwned;
+use std::borrow::Cow;
+use std::sync::Arc;
+use tracing::{debug, instrument};
+
+use super::config::LlmConfig;
+use super::error::{LlmError, LlmResult};
+use super::executor::LlmExecutor;
+use super::fallback::FallbackChain;
+use super::throttle::ConcurrencyController;
+
+/// Unified LLM client.
+///
+/// This client provides:
+/// - Unified interface for all LLM operations
+/// - Automatic retry with exponential backoff
+/// - Rate limiting and concurrency control
+/// - JSON response parsing
+/// - Error classification
+/// - Graceful fallback on errors
+///
+/// # Example
+///
+/// ```rust,no_run
+/// use vectorless::llm::{LlmClient, LlmConfig};
+///
+/// # #[tokio::main]
+/// # async fn main() -> vectorless::llm::LlmResult<()> {
+/// let config = LlmConfig::new("gpt-4o-mini");
+/// let client = LlmClient::new(config);
+///
+/// // Simple completion
+/// let response = client.complete("You are helpful.", "Hello!").await?;
+/// println!("Response: {}", response);
+///
+/// // JSON completion
+/// #[derive(serde::Deserialize)]
+/// struct Answer {
+///     answer: String,
+/// }
+/// let answer: Answer = client.complete_json(
+///     "You answer questions in JSON.",
+///     "What is 2+2?"
+/// ).await?;
+/// # Ok(())
+/// # }
+/// ```
+#[derive(Clone)]
+pub struct LlmClient {
+    executor: LlmExecutor,
+}
+
+impl std::fmt::Debug for LlmClient {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("LlmClient")
+            .field("model", &self.executor.config().model)
+            .field("endpoint", &self.executor.config().endpoint)
+            .field(
+                "concurrency",
+                &self.executor.throttle().map(|c| format!("{:?}", c)),
+            )
+            .field("fallback_enabled", &self.executor.fallback().is_some())
+            .finish()
+    }
+}
+
+impl LlmClient {
+    /// Create a new LLM client with the given configuration.
+    pub fn new(config: LlmConfig) -> Self {
+        Self {
+            executor: LlmExecutor::new(config),
+        }
+    }
+
+    /// Create a client with default configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(LlmConfig::default())
+    }
+
+    /// Create a client for a specific model.
+    pub fn for_model(model: impl Into<String>) -> Self {
+        Self::new(LlmConfig::new(model))
+    }
+
+    /// Add concurrency control to the client.
+    ///
+    /// # Example
+    ///
+    /// ```rust,no_run
+    /// use vectorless::llm::LlmClient;
+    /// use vectorless::throttle::{ConcurrencyController, ConcurrencyConfig};
+    ///
+    /// let config = ConcurrencyConfig::new()
+    ///     .with_max_concurrent_requests(10)
+    ///     .with_requests_per_minute(500);
+    ///
+    /// let client = LlmClient::for_model("gpt-4o-mini")
+    ///     .with_concurrency(ConcurrencyController::new(config));
+    /// ```
+    pub fn with_concurrency(mut self, controller: ConcurrencyController) -> Self {
+        self.executor = self.executor.with_throttle(controller);
+        self
+    }
+
+    /// Add concurrency control from an existing Arc.
+    pub fn with_shared_concurrency(mut self, controller: Arc<ConcurrencyController>) -> Self {
+        self.executor = self.executor.with_shared_throttle(controller);
+        self
+    }
+
+    /// Replace the async-openai client with a shared instance (reuses connection pool).
+    pub fn with_shared_openai_client(
+        mut self,
+        client: Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
+    ) -> Self {
+        self.executor = self.executor.with_openai_client(client);
+        self
+    }
+
+    /// Add fallback chain for error recovery.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// use vectorless::llm::{LlmClient, FallbackChain, FallbackConfig};
+    ///
+    /// let fallback = FallbackConfig::default();
+    /// let client = LlmClient::for_model("gpt-4o")
+    ///     .with_fallback(FallbackChain::new(fallback));
+    ///
+    /// assert!(client.fallback().is_some());
+    /// ```
+    pub fn with_fallback(mut self, chain: FallbackChain) -> Self {
+        self.executor = self.executor.with_fallback(chain);
+        self
+    }
+
+    /// Add fallback chain from an existing Arc.
+    pub fn with_shared_fallback(mut self, chain: Arc<FallbackChain>) -> Self {
+        self.executor = self.executor.with_shared_fallback(chain);
+        self
+    }
+
+    /// Add metrics hub for recording LLM call statistics.
+    pub fn with_shared_metrics(mut self, hub: Arc<vectorless_metrics::MetricsHub>) -> Self {
+        self.executor = self.executor.with_shared_metrics(hub);
+        self
+    }
+
+    /// Get the configuration.
+    pub fn config(&self) -> &LlmConfig {
+        self.executor.config()
+    }
+
+    /// Get the concurrency controller (if any).
+    pub fn concurrency(&self) -> Option<&ConcurrencyController> {
+        self.executor.throttle()
+    }
+
+    /// Get the fallback chain (if any).
+    pub fn fallback(&self) -> Option<&FallbackChain> {
+        self.executor.fallback()
+    }
+
+    /// Get the underlying executor (for advanced usage).
+    pub fn executor(&self) -> &LlmExecutor {
+        &self.executor
+    }
+
+    /// Complete a prompt with system and user messages.
+    ///
+    /// This method includes:
+    /// - Automatic rate limiting (if configured)
+    /// - Automatic retry with exponential backoff
+    /// - Automatic fallback on persistent errors (if configured)
+    #[instrument(skip(self, system, user), fields(model = %self.executor.config().model))]
+    pub async fn complete(&self, system: &str, user: &str) -> LlmResult<String> {
+        debug!(
+            system_len = system.len(),
+            user_len = user.len(),
+            "Starting LLM completion"
+        );
+        self.executor.complete(system, user).await
+    }
+
+    /// Complete a prompt with custom max tokens.
+    pub async fn complete_with_max_tokens(
+        &self,
+        system: &str,
+        user: &str,
+        max_tokens: u16,
+    ) -> LlmResult<String> {
+        debug!(
+            system_len = system.len(),
+            user_len = user.len(),
+            max_tokens = max_tokens,
+            "Starting LLM completion with max tokens"
+        );
+        self.executor
+            .complete_with_max_tokens(system, user, max_tokens)
+            .await
+    }
+
+    /// Complete a prompt and parse the response as JSON.
+    ///
+    /// This method handles:
+    /// - JSON extraction from markdown code blocks
+    /// - Bracket matching for nested JSON
+    ///
+    /// # Example
+    ///
+    /// ```rust,no_run
+    /// # use vectorless::llm::{LlmClient, LlmConfig};
+    /// # #[tokio::main]
+    /// # async fn main() -> vectorless::llm::LlmResult<()> {
+    /// #[derive(serde::Deserialize)]
+    /// struct TocEntry {
+    ///     title: String,
+    ///     page: usize,
+    /// }
+    ///
+    /// let client = LlmClient::for_model("gpt-4o-mini");
+    /// let entries: Vec<TocEntry> = client.complete_json(
+    ///     "Extract TOC entries as JSON array.",
+    ///     "Chapter 1: Introduction ... 5"
+    /// ).await?;
+    /// # Ok(())
+    /// # }
+    /// ```
+    pub async fn complete_json<T: DeserializeOwned>(
+        &self,
+        system: &str,
+        user: &str,
+    ) -> LlmResult<T> {
+        let response = self.complete(system, user).await?;
+        self.parse_json(&response)
+    }
+
+    /// Complete a prompt and parse the response as JSON with custom max tokens.
+    pub async fn complete_json_with_max_tokens<T: DeserializeOwned>(
+        &self,
+        system: &str,
+        user: &str,
+        max_tokens: u16,
+    ) -> LlmResult<T> {
+        let response = self
+            .complete_with_max_tokens(system, user, max_tokens)
+            .await?;
+        self.parse_json(&response)
+    }
+
+    /// Parse JSON from LLM response.
+    fn parse_json<T: DeserializeOwned>(&self, text: &str) -> LlmResult<T> {
+        let json_text = self.extract_json(text);
+        serde_json::from_str(&json_text).map_err(|e| {
+            LlmError::Parse(format!("Failed to parse JSON: {}. Response: {}", e, text))
+        })
+    }
+
+    /// Extract JSON from text (handles markdown code blocks).
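+    ///
+    /// Behavior sketch (mirrors the unit tests below):
+    ///
+    /// ```text
+    /// a fenced json code block            -> inner JSON between the fences
+    /// "{\"outer\": {\"inner\": 1}} extra" -> "{\"outer\": {\"inner\": 1}}" (balanced prefix)
+    /// "Sure! [1, 2, 3]"                   -> unchanged (no leading bracket)
+    /// ```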
+    fn extract_json<'a>(&self, text: &'a str) -> Cow<'a, str> {
+        let text = text.trim();
+
+        // Try markdown code block first
+        if text.starts_with("```") {
+            // Find the end of the first line (language identifier)
+            if let Some(start) = text.find('\n') {
+                let rest = &text[start + 1..];
+                if let Some(end) = rest.find("```") {
+                    return Cow::Borrowed(rest[..end].trim());
+                }
+            }
+        }
+
+        // Try to find JSON array or object
+        if text.starts_with('[') || text.starts_with('{') {
+            let open = text.chars().next().unwrap();
+            let close = if open == '[' { ']' } else { '}' };
+
+            let mut depth = 0;
+            for (i, ch) in text.char_indices() {
+                match ch {
+                    c if c == open => depth += 1,
+                    c if c == close => {
+                        depth -= 1;
+                        if depth == 0 {
+                            return Cow::Borrowed(&text[..=i]);
+                        }
+                    }
+                    _ => {}
+                }
+            }
+        }
+
+        Cow::Borrowed(text)
+    }
+}
+
+impl Default for LlmClient {
+    fn default() -> Self {
+        Self::with_defaults()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_json_plain() {
+        let client = LlmClient::with_defaults();
+
+        let json = client.extract_json(r#"{"key": "value"}"#);
+        assert_eq!(json, r#"{"key": "value"}"#);
+    }
+
+    #[test]
+    fn test_extract_json_code_block() {
+        let client = LlmClient::with_defaults();
+
+        let json = client.extract_json(
+            r#"```json
+{"key": "value"}
+```"#,
+        );
+        assert_eq!(json, r#"{"key": "value"}"#);
+    }
+
+    #[test]
+    fn test_extract_json_array() {
+        let client = LlmClient::with_defaults();
+
+        let json = client.extract_json(r#"[1, 2, 3]"#);
+        assert_eq!(json, r#"[1, 2, 3]"#);
+    }
+
+    #[test]
+    fn test_extract_json_nested() {
+        let client = LlmClient::with_defaults();
+
+        let json = client.extract_json(r#"{"outer": {"inner": 1}}"#);
+        assert_eq!(json, r#"{"outer": {"inner": 1}}"#);
+    }
+
+    #[test]
+    fn test_client_creation() {
+        let client = LlmClient::for_model("gpt-4o");
+        assert_eq!(client.config().model, "gpt-4o");
+    }
+
+    #[test]
+    fn test_client_with_concurrency() {
+        use crate::llm::throttle::ConcurrencyConfig;
+
+        let controller = ConcurrencyController::new(ConcurrencyConfig::conservative());
+        let client = LlmClient::for_model("gpt-4o-mini").with_concurrency(controller);
+
+        assert!(client.concurrency().is_some());
+    }
+
+    #[test]
+    fn test_client_with_shared_metrics() {
+        use vectorless_metrics::MetricsHub;
+
+        let hub = MetricsHub::shared();
+        let client = LlmClient::for_model("gpt-4o").with_shared_metrics(hub.clone());
+
+        // Client should still function normally
+        assert_eq!(client.config().model, "gpt-4o");
+        assert!(client.fallback().is_none()); // no fallback added
+        assert!(client.concurrency().is_none()); // no concurrency added
+    }
+}
diff --git a/vectorless-core/vectorless-llm/src/config.rs b/vectorless-core/vectorless-llm/src/config.rs
new file mode 100644
index 00000000..b85d9831
--- /dev/null
+++ b/vectorless-core/vectorless-llm/src/config.rs
@@ -0,0 +1,260 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Runtime LLM configuration types.
+
+use serde::{Deserialize, Serialize};
+use std::time::Duration;
+
+/// Runtime LLM client configuration.
+///
+/// This is the runtime representation used by [`LlmClient`](super::LlmClient).
+/// Typically created from the config-layer [`LlmConfig`](vectorless_config::LlmConfig)
+/// during pool construction, though the builder methods below also allow
+/// constructing it directly.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LlmConfig {
+    /// Model name (e.g., "gpt-4o-mini", "gpt-4o").
+    #[serde(default)]
+    pub model: String,
+
+    /// API endpoint URL.
+    #[serde(default)]
+    pub endpoint: String,
+
+    /// API key.
+    #[serde(default)]
+    pub api_key: Option<String>,
+
+    /// Maximum tokens for response.
+    #[serde(default = "default_max_tokens")]
+    pub max_tokens: usize,
+
+    /// Temperature for generation.
+    #[serde(default = "default_temperature")]
+    pub temperature: f32,
+
+    /// Retry configuration.
+    #[serde(default)]
+    pub retry: RetryConfig,
+
+    /// Per-request timeout. 0 means no timeout (wait indefinitely).
+    #[serde(default)]
+    pub request_timeout_secs: u64,
+}
+
+fn default_max_tokens() -> usize {
+    2000
+}
+
+fn default_temperature() -> f32 {
+    0.0
+}
+
+impl Default for LlmConfig {
+    fn default() -> Self {
+        Self {
+            model: String::new(),
+            endpoint: String::new(),
+            api_key: None,
+            max_tokens: default_max_tokens(),
+            temperature: default_temperature(),
+            retry: RetryConfig::default(),
+            request_timeout_secs: 0,
+        }
+    }
+}
+
+impl LlmConfig {
+    /// Create a new config with a specific model.
+    pub fn new(model: impl Into<String>) -> Self {
+        Self {
+            model: model.into(),
+            ..Self::default()
+        }
+    }
+
+    /// Set the model.
+    pub fn with_model(mut self, model: impl Into<String>) -> Self {
+        self.model = model.into();
+        self
+    }
+
+    /// Set the endpoint.
+    pub fn with_endpoint(mut self, endpoint: impl Into<String>) -> Self {
+        self.endpoint = endpoint.into();
+        self
+    }
+
+    /// Set the API key.
+    pub fn with_api_key(mut self, api_key: impl Into<String>) -> Self {
+        self.api_key = Some(api_key.into());
+        self
+    }
+
+    /// Set the max tokens.
+    pub fn with_max_tokens(mut self, max_tokens: usize) -> Self {
+        self.max_tokens = max_tokens;
+        self
+    }
+
+    /// Set the temperature.
+    pub fn with_temperature(mut self, temperature: f32) -> Self {
+        self.temperature = temperature;
+        self
+    }
+
+    /// Set the retry configuration.
+    pub fn with_retry(mut self, retry: RetryConfig) -> Self {
+        self.retry = retry;
+        self
+    }
+}
+
+/// Runtime retry configuration for LLM calls.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RetryConfig {
+    /// Maximum number of retry attempts (including initial call).
+    #[serde(default = "default_max_attempts")]
+    pub max_attempts: usize,
+
+    /// Initial delay before first retry (milliseconds).
+    #[serde(default = "default_initial_delay_ms")]
+    pub initial_delay_ms: u64,
+
+    /// Maximum delay between retries (milliseconds).
+    #[serde(default = "default_max_delay_ms")]
+    pub max_delay_ms: u64,
+
+    /// Multiplier for exponential backoff.
+    #[serde(default = "default_multiplier")]
+    pub multiplier: f64,
+
+    /// Whether to retry on rate limit errors.
+    #[serde(default = "default_true")]
+    pub retry_on_rate_limit: bool,
+}
+
+fn default_max_attempts() -> usize {
+    3
+}
+fn default_initial_delay_ms() -> u64 {
+    500
+}
+fn default_max_delay_ms() -> u64 {
+    30000
+}
+fn default_multiplier() -> f64 {
+    2.0
+}
+fn default_true() -> bool {
+    true
+}
+
+impl From<&vectorless_config::RetryConfig> for RetryConfig {
+    fn from(c: &vectorless_config::RetryConfig) -> Self {
+        Self {
+            max_attempts: c.max_attempts,
+            initial_delay_ms: c.initial_delay_ms,
+            max_delay_ms: c.max_delay_ms,
+            multiplier: c.multiplier,
+            retry_on_rate_limit: c.retry_on_rate_limit,
+        }
+    }
+}
+
+impl Default for RetryConfig {
+    fn default() -> Self {
+        Self {
+            max_attempts: default_max_attempts(),
+            initial_delay_ms: default_initial_delay_ms(),
+            max_delay_ms: default_max_delay_ms(),
+            multiplier: default_multiplier(),
+            retry_on_rate_limit: default_true(),
+        }
+    }
+}
+
+impl RetryConfig {
+    /// Create a new retry config with defaults.
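+    ///
+    /// # Example
+    ///
+    /// A minimal builder sketch (illustrative; marked `ignore` since the
+    /// public re-export path for `RetryConfig` is not fixed here):
+    ///
+    /// ```rust,ignore
+    /// let retry = RetryConfig::new()
+    ///     .with_max_attempts(5)
+    ///     .with_initial_delay(250)
+    ///     .with_multiplier(2.0);
+    /// // 250ms * 2^1 for the second attempt:
+    /// assert_eq!(retry.delay_for_attempt(1), Duration::from_millis(500));
+    /// ```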
+ pub fn new() -> Self { + Self::default() + } + + /// Set the maximum number of attempts. + pub fn with_max_attempts(mut self, max_attempts: usize) -> Self { + self.max_attempts = max_attempts; + self + } + + /// Set the initial delay (milliseconds). + pub fn with_initial_delay(mut self, delay_ms: u64) -> Self { + self.initial_delay_ms = delay_ms; + self + } + + /// Set the maximum delay (milliseconds). + pub fn with_max_delay(mut self, delay_ms: u64) -> Self { + self.max_delay_ms = delay_ms; + self + } + + /// Set the backoff multiplier. + pub fn with_multiplier(mut self, multiplier: f64) -> Self { + self.multiplier = multiplier; + self + } + + /// Set whether to retry on rate limit. + pub fn with_retry_on_rate_limit(mut self, retry: bool) -> Self { + self.retry_on_rate_limit = retry; + self + } + + /// Calculate delay for a given attempt (0-indexed). + pub fn delay_for_attempt(&self, attempt: usize) -> Duration { + let delay_ms = (self.initial_delay_ms as f64) * self.multiplier.powf(attempt as f64); + let delay_ms = delay_ms.min(self.max_delay_ms as f64); + Duration::from_millis(delay_ms as u64) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_retry_delay_calculation() { + let config = RetryConfig::default(); + + // Initial delay is 500ms + assert_eq!(config.delay_for_attempt(0), Duration::from_millis(500)); + + // Second attempt: 500 * 2 = 1000ms + assert_eq!(config.delay_for_attempt(1), Duration::from_millis(1000)); + + // Third attempt: 500 * 4 = 2000ms + assert_eq!(config.delay_for_attempt(2), Duration::from_millis(2000)); + } + + #[test] + fn test_retry_delay_max_cap() { + let config = RetryConfig { + max_delay_ms: 1500, + ..RetryConfig::default() + }; + + // Should cap at max_delay_ms + assert_eq!(config.delay_for_attempt(5), Duration::from_millis(1500)); + } + + #[test] + fn test_llm_config_builder() { + let config = LlmConfig::new("gpt-4o") + .with_max_tokens(1000) + .with_temperature(0.5); + + assert_eq!(config.model, "gpt-4o"); + assert_eq!(config.max_tokens, 1000); + assert!((config.temperature - 0.5).abs() < 0.001); + } +} diff --git a/vectorless-core/vectorless-llm/src/error.rs b/vectorless-core/vectorless-llm/src/error.rs new file mode 100644 index 00000000..94598345 --- /dev/null +++ b/vectorless-core/vectorless-llm/src/error.rs @@ -0,0 +1,135 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Unified LLM error types. + +use thiserror::Error; + +/// LLM error types. +#[derive(Debug, Clone, Error)] +pub enum LlmError { + /// API error from the LLM provider. + #[error("LLM API error: {0}")] + Api(String), + + /// Request construction error. + #[error("Request error: {0}")] + Request(String), + + /// Configuration error. + #[error("Configuration error: {0}")] + Config(String), + + /// Response parsing error. + #[error("Failed to parse response: {0}")] + Parse(String), + + /// Rate limit exceeded. + #[error("Rate limit exceeded: {0}")] + RateLimit(String), + + /// Request timeout. + #[error("Request timeout: {0}")] + Timeout(String), + + /// No content returned. + #[error("LLM returned no content")] + NoContent, + + /// Retry exhausted. + #[error("Retry exhausted after {attempts} attempts: {last_error}")] + RetryExhausted { + /// Number of attempts made. + attempts: usize, + /// The last error encountered. + last_error: String, + }, +} + +impl LlmError { + /// Check if the error is retryable. 
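+    ///
+    /// # Example
+    ///
+    /// A small sketch of the classification (illustrative, `ignore`d):
+    ///
+    /// ```rust,ignore
+    /// assert!(LlmError::Timeout("slow upstream".into()).is_retryable());
+    /// assert!(LlmError::Api("503 service unavailable".into()).is_retryable());
+    /// assert!(!LlmError::Parse("bad json".into()).is_retryable());
+    /// ```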
+    pub fn is_retryable(&self) -> bool {
+        match self {
+            LlmError::Api(msg) => {
+                // Rate limits and temporary failures are retryable
+                let msg_lower = msg.to_lowercase();
+                msg_lower.contains("rate limit")
+                    || msg_lower.contains("429")
+                    || msg_lower.contains("503")
+                    || msg_lower.contains("502")
+                    || msg_lower.contains("timeout")
+                    || msg_lower.contains("overloaded")
+            }
+            LlmError::Timeout(_) => true,
+            LlmError::RateLimit(_) => true,
+            _ => false,
+        }
+    }
+
+    /// Classify an API error message into the appropriate error type.
+    pub fn from_api_message(msg: &str) -> Self {
+        let msg_lower = msg.to_lowercase();
+
+        if msg_lower.contains("rate limit") || msg_lower.contains("429") {
+            LlmError::RateLimit(msg.to_string())
+        } else if msg_lower.contains("timeout") {
+            LlmError::Timeout(msg.to_string())
+        } else {
+            LlmError::Api(msg.to_string())
+        }
+    }
+}
+
+impl From<async_openai::error::OpenAIError> for LlmError {
+    fn from(e: async_openai::error::OpenAIError) -> Self {
+        let msg = e.to_string();
+        LlmError::from_api_message(&msg)
+    }
+}
+
+impl From<serde_json::Error> for LlmError {
+    fn from(e: serde_json::Error) -> Self {
+        LlmError::Parse(e.to_string())
+    }
+}
+
+impl From<LlmError> for vectorless_error::Error {
+    fn from(e: LlmError) -> Self {
+        vectorless_error::Error::Llm(e.to_string())
+    }
+}
+
+impl From<LlmError> for String {
+    fn from(e: LlmError) -> Self {
+        e.to_string()
+    }
+}
+
+/// Specialized result type for LLM operations.
+pub type LlmResult<T> = std::result::Result<T, LlmError>;
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_is_retryable() {
+        assert!(LlmError::RateLimit("test".to_string()).is_retryable());
+        assert!(LlmError::Timeout("test".to_string()).is_retryable());
+        assert!(LlmError::Api("rate limit exceeded".to_string()).is_retryable());
+        assert!(!LlmError::Config("test".to_string()).is_retryable());
+        assert!(!LlmError::Parse("test".to_string()).is_retryable());
+    }
+
+    #[test]
+    fn test_from_api_message() {
+        let err = LlmError::from_api_message("Rate limit exceeded");
+        assert!(matches!(err, LlmError::RateLimit(_)));
+
+        let err = LlmError::from_api_message("Request timeout");
+        assert!(matches!(err, LlmError::Timeout(_)));
+
+        let err = LlmError::from_api_message("Internal server error");
+        assert!(matches!(err, LlmError::Api(_)));
+    }
+}
diff --git a/vectorless-core/vectorless-llm/src/executor.rs b/vectorless-core/vectorless-llm/src/executor.rs
new file mode 100644
index 00000000..d8bf02c9
--- /dev/null
+++ b/vectorless-core/vectorless-llm/src/executor.rs
@@ -0,0 +1,568 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Unified executor coordinating throttle, retry, and fallback.
+//!
+//! This module provides the `LlmExecutor` which coordinates:
+//! - **Throttle** — Rate limiting and concurrency control
+//! - **Retry** — Exponential backoff on transient errors
+//! - **Fallback** — Model/endpoint degradation on persistent failures
+//!
+//! # Architecture
+//!
+//! ```text
+//! ┌───────────────────────────────────────────────────────────────┐
+//! │                          LlmExecutor                          │
+//! │                                                               │
+//! │  execute() ──▶ [Throttle] ──▶ [API Call] ──▶ [Success/Error]  │
+//! │                    │              │                           │
+//! │              acquire permit   do request                      │
+//! │                                   │                           │
+//! │                        ┌──────────┴──────────┐                │
+//! │                        ▼                     ▼                │
+//! │                     [Retry]             [Fallback]            │
+//! │                        │                     │                │
+//! │                  exponential          model/endpoint          │
+//! │                    backoff              degradation           │
+//! │                                                               │
+//! └───────────────────────────────────────────────────────────────┘
+//! ```
+//!
+//! # Example
+//!
+//! ```rust,no_run
+//! use vectorless::llm::{LlmExecutor, LlmConfig, FallbackChain, FallbackConfig};
+//! use vectorless::llm::throttle::{ConcurrencyController, ConcurrencyConfig};
+//!
+//! # #[tokio::main]
+//! # async fn main() -> vectorless::llm::LlmResult<()> {
+//! let config = LlmConfig::new("gpt-4o");
+//! let throttle = ConcurrencyController::new(ConcurrencyConfig::default());
+//! let fallback = FallbackChain::new(FallbackConfig::default());
+//!
+//! let executor = LlmExecutor::new(config)
+//!     .with_throttle(throttle)
+//!     .with_fallback(fallback);
+//!
+//! let result = executor.complete("You are helpful.", "Hello!").await?;
+//! # Ok(())
+//! # }
+//! ```
+
+use std::sync::Arc;
+use std::time::Duration;
+use tracing::{debug, info, warn};
+
+use async_openai::types::chat::{
+    ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage,
+    CreateChatCompletionRequestArgs,
+};
+
+use super::config::LlmConfig;
+use super::error::{LlmError, LlmResult};
+use super::fallback::{FallbackChain, FallbackStep};
+use super::throttle::ConcurrencyController;
+use vectorless_metrics::MetricsHub;
+
+/// Unified executor for LLM operations.
+///
+/// Coordinates throttle, retry, and fallback mechanisms.
+#[derive(Clone)]
+pub struct LlmExecutor {
+    /// LLM configuration.
+    config: LlmConfig,
+    /// Reusable async-openai client (created once, shared via Arc).
+    openai_client: Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
+    /// Throttle controller (optional).
+    throttle: Option<Arc<ConcurrencyController>>,
+    /// Fallback chain (optional).
+    fallback: Option<Arc<FallbackChain>>,
+    /// Metrics hub for recording LLM call statistics (optional).
+    metrics: Option<Arc<MetricsHub>>,
+}
+
+impl std::fmt::Debug for LlmExecutor {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("LlmExecutor")
+            .field("model", &self.config.model)
+            .field("endpoint", &self.config.endpoint)
+            .field("has_throttle", &self.throttle.is_some())
+            .field("has_fallback", &self.fallback.is_some())
+            .field("has_openai_client", &true)
+            .field("has_metrics", &self.metrics.is_some())
+            .finish()
+    }
+}
+
+impl LlmExecutor {
+    /// Create a new executor with the given configuration.
+    pub fn new(config: LlmConfig) -> Self {
+        let openai_client = Self::build_openai_client(&config);
+        Self {
+            config,
+            openai_client: Arc::new(openai_client),
+            throttle: None,
+            fallback: None,
+            metrics: None,
+        }
+    }
+
+    /// Build the async-openai client from config.
+    fn build_openai_client(
+        config: &LlmConfig,
+    ) -> async_openai::Client<async_openai::config::OpenAIConfig> {
+        let api_key = config.api_key.clone().unwrap_or_default();
+        let endpoint = if config.endpoint.is_empty() {
+            "https://api.openai.com/v1".to_string()
+        } else {
+            config.endpoint.clone()
+        };
+        let openai_config = async_openai::config::OpenAIConfig::new()
+            .with_api_key(api_key)
+            .with_api_base(endpoint);
+        async_openai::Client::with_config(openai_config)
+    }
+
+    /// Create an executor with default configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(LlmConfig::default())
+    }
+
+    /// Create an executor for a specific model.
+    pub fn for_model(model: impl Into<String>) -> Self {
+        Self::new(LlmConfig::new(model))
+    }
+
+    /// Add throttle control.
+    pub fn with_throttle(mut self, controller: ConcurrencyController) -> Self {
+        self.throttle = Some(Arc::new(controller));
+        self
+    }
+
+    /// Add throttle control from an existing Arc.
+    pub fn with_shared_throttle(mut self, controller: Arc<ConcurrencyController>) -> Self {
+        self.throttle = Some(controller);
+        self
+    }
+
+    /// Add fallback chain.
+    pub fn with_fallback(mut self, chain: FallbackChain) -> Self {
+        self.fallback = Some(Arc::new(chain));
+        self
+    }
+
+    /// Add fallback chain from an existing Arc.
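+    ///
+    /// A sketch of sharing one chain across several executors (illustrative,
+    /// `ignore`d):
+    ///
+    /// ```rust,ignore
+    /// let chain = Arc::new(FallbackChain::new(FallbackConfig::default()));
+    /// let primary = LlmExecutor::for_model("gpt-4o").with_shared_fallback(chain.clone());
+    /// let cheap = LlmExecutor::for_model("gpt-4o-mini").with_shared_fallback(chain);
+    /// ```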
+    pub fn with_shared_fallback(mut self, chain: Arc<FallbackChain>) -> Self {
+        self.fallback = Some(chain);
+        self
+    }
+
+    /// Add metrics hub for recording LLM call statistics.
+    pub fn with_shared_metrics(mut self, hub: Arc<MetricsHub>) -> Self {
+        self.metrics = Some(hub);
+        self
+    }
+
+    /// Replace the async-openai client (used when pool reconfigures clients).
+    pub fn with_openai_client(
+        mut self,
+        client: Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
+    ) -> Self {
+        self.openai_client = client;
+        self
+    }
+
+    /// Get the configuration.
+    pub fn config(&self) -> &LlmConfig {
+        &self.config
+    }
+
+    /// Get the throttle controller (if any).
+    pub fn throttle(&self) -> Option<&ConcurrencyController> {
+        self.throttle.as_deref()
+    }
+
+    /// Get the fallback chain (if any).
+    pub fn fallback(&self) -> Option<&FallbackChain> {
+        self.fallback.as_deref()
+    }
+
+    /// Execute a completion with unified coordination.
+    ///
+    /// This method coordinates:
+    /// 1. Throttle: Acquire permit before API call
+    /// 2. Retry: Exponential backoff on transient errors
+    /// 3. Fallback: Model/endpoint degradation on persistent failures
+    pub async fn complete(&self, system: &str, user: &str) -> LlmResult<String> {
+        self.execute_with_context(system, user, None).await
+    }
+
+    /// Execute a completion with custom max tokens.
+    pub async fn complete_with_max_tokens(
+        &self,
+        system: &str,
+        user: &str,
+        max_tokens: u16,
+    ) -> LlmResult<String> {
+        self.execute_with_context(system, user, Some(max_tokens))
+            .await
+    }
+
+    /// Internal execution with full coordination.
+    async fn execute_with_context(
+        &self,
+        system: &str,
+        user: &str,
+        max_tokens: Option<u16>,
+    ) -> LlmResult<String> {
+        let mut attempts = 0;
+        let mut current_model = self.config.model.clone();
+        let mut fallback_history: Vec<FallbackStep> = vec![];
+        let mut total_attempts_including_fallback = 0;
+
+        loop {
+            attempts += 1;
+            total_attempts_including_fallback += 1;
+
+            // Safety check: prevent infinite loops
+            const MAX_TOTAL_ATTEMPTS: usize = 20;
+            if total_attempts_including_fallback > MAX_TOTAL_ATTEMPTS {
+                warn!(
+                    total_attempts = total_attempts_including_fallback,
+                    "Exceeded maximum total attempts, aborting"
+                );
+                return Err(LlmError::RetryExhausted {
+                    attempts: total_attempts_including_fallback,
+                    last_error: "Exceeded maximum total attempts including fallbacks".to_string(),
+                });
+            }
+
+            // Step 1: Acquire throttle permit
+            let _permit = self.acquire_throttle_permit().await;
+
+            debug!(
+                attempt = attempts,
+                model = %current_model,
+                "Executing LLM request"
+            );
+
+            // Step 2: Execute the request (with optional timeout)
+            let request_future = self.do_request(&current_model, system, user, max_tokens);
+            let result = if self.config.request_timeout_secs > 0 {
+                let timeout = Duration::from_secs(self.config.request_timeout_secs);
+                match tokio::time::timeout(timeout, request_future).await {
+                    Ok(r) => r,
+                    Err(_) => {
+                        warn!(
+                            timeout_secs = self.config.request_timeout_secs,
+                            model = %current_model,
+                            "LLM request timed out"
+                        );
+                        if let Some(ref metrics) = self.metrics {
+                            metrics.record_llm_timeout();
+                        }
+                        Err(LlmError::Timeout(format!(
+                            "Request timed out after {}s",
+                            self.config.request_timeout_secs
+                        )))
+                    }
+                }
+            } else {
+                request_future.await
+            };
+
+            match result {
+                Ok(response) => {
+                    if fallback_history.is_empty() {
+                        debug!(
+                            attempts = attempts,
+                            "LLM request succeeded without fallback"
+                        );
+                    } else {
+                        info!(
+                            attempts = attempts,
+                            fallback_steps = fallback_history.len(),
+                            "LLM request succeeded after fallback"
+                        );
+                    }
+                    return Ok(response);
+                }
+                Err(error) => {
+                    // Record specific error events
+                    if let Some(ref metrics) = self.metrics {
+                        match &error {
+                            LlmError::RateLimit(_) => metrics.record_llm_rate_limit(),
+                            LlmError::Timeout(_) => metrics.record_llm_timeout(),
+                            _ => {}
+                        }
+                    }
+
+                    // Step 3: Check if we should retry
+                    if self.should_retry(&error, attempts) {
+                        let delay = self.retry_delay(attempts);
+                        warn!(
+                            attempt = attempts,
+                            max_attempts = self.config.retry.max_attempts,
+                            delay_ms = delay.as_millis() as u64,
+                            error = %error,
+                            "LLM call failed, retrying..."
+                        );
+                        tokio::time::sleep(delay).await;
+                        continue;
+                    }
+
+                    // Step 4: Check if we should fallback
+                    if let Some(ref fallback) = self.fallback {
+                        if fallback.should_fallback(&error) {
+                            let mut fell_back = false;
+
+                            // Try next model
+                            if let Some(next_model) = fallback.next_model(&current_model) {
+                                info!(
+                                    from_model = %current_model,
+                                    to_model = %next_model,
+                                    "Falling back to next model"
+                                );
+                                if let Some(ref metrics) = self.metrics {
+                                    metrics.record_llm_fallback();
+                                }
+                                fallback.record_fallback(
+                                    &mut fallback_history,
+                                    current_model.clone(),
+                                    Some(next_model.clone()),
+                                    self.config.endpoint.clone(),
+                                    None,
+                                    error.to_string(),
+                                );
+                                current_model = next_model;
+                                attempts = 0; // Reset retry counter for new model
+                                fell_back = true;
+                            }
+
+                            if fell_back {
+                                continue;
+                            }
+                        }
+                    }
+
+                    // Step 5: No more retries or fallbacks, return error
+                    warn!(
+                        attempts = attempts,
+                        fallback_steps = fallback_history.len(),
+                        error = %error,
+                        "LLM call failed, no more retries or fallbacks available"
+                    );
+                    return Err(error);
+                }
+            }
+        }
+    }
+
+    /// Acquire throttle permit (if configured).
+    async fn acquire_throttle_permit(&self) -> Option<tokio::sync::SemaphorePermit<'_>> {
+        if let Some(ref throttle) = self.throttle {
+            throttle.acquire().await
+        } else {
+            None
+        }
+    }
+
+    /// Check if we should retry based on error and attempt count.
+    fn should_retry(&self, error: &LlmError, attempts: usize) -> bool {
+        if attempts >= self.config.retry.max_attempts {
+            return false;
+        }
+
+        // Use unified retryable check, with rate-limit override
+        if matches!(error, LlmError::RateLimit(_)) {
+            self.config.retry.retry_on_rate_limit
+        } else {
+            error.is_retryable()
+        }
+    }
+
+    /// Calculate retry delay for a given attempt.
+    fn retry_delay(&self, attempt: usize) -> Duration {
+        self.config.retry.delay_for_attempt(attempt - 1)
+    }
+
+    /// Execute the actual API request.
+    async fn do_request(
+        &self,
+        model: &str,
+        system: &str,
+        user: &str,
+        max_tokens: Option<u16>,
+    ) -> LlmResult<String> {
+        // Build request — only set max_tokens when explicitly provided,
+        // letting the API use its own default otherwise.
+        let mut request = CreateChatCompletionRequestArgs::default()
+            .model(model)
+            .messages([
+                ChatCompletionRequestSystemMessage::from(system).into(),
+                ChatCompletionRequestUserMessage::from(user).into(),
+            ])
+            .temperature(self.config.temperature)
+            .build()
+            .map_err(|e| LlmError::Request(format!("Failed to build request: {}", e)))?;
+
+        if let Some(mt) = max_tokens {
+            request.max_tokens = Some(mt as u32);
+        }
+
+        info!(
+            "LLM request → endpoint: {}, model: {}, system: {} chars, user: {} chars",
+            self.config.endpoint,
+            model,
+            system.len(),
+            user.len()
+        );
+
+        let request_start = std::time::Instant::now();
+        let response = match self.openai_client.chat().create(request).await {
+            Ok(r) => r,
+            Err(e) => {
+                let elapsed = request_start.elapsed();
+                if let Some(ref metrics) = self.metrics {
+                    metrics.record_llm_call(0, 0, elapsed.as_millis() as u64, false);
+                }
+                let msg = e.to_string();
+                return Err(LlmError::from_api_message(&msg));
+            }
+        };
+        let request_elapsed = request_start.elapsed();
+
+        let usage = response.usage.as_ref();
+        let prompt_tokens = usage.map(|u| u.prompt_tokens).unwrap_or(0);
+        let completion_tokens = usage.map(|u| u.completion_tokens).unwrap_or(0);
+
+        let first_choice = response.choices.first();
+
+        if first_choice.is_none() {
+            if let Some(ref metrics) = self.metrics {
+                metrics.record_llm_call(
+                    prompt_tokens as u64,
+                    completion_tokens as u64,
+                    request_elapsed.as_millis() as u64,
+                    false,
+                );
+            }
+            return Err(LlmError::NoContent);
+        }
+
+        let choice = first_choice.unwrap();
+        let content = choice.message.content.clone().unwrap_or_default();
+
+        if let Some(ref metrics) = self.metrics {
+            metrics.record_llm_call(
+                prompt_tokens as u64,
+                completion_tokens as u64,
+                request_elapsed.as_millis() as u64,
+                true,
+            );
+        }
+
+        if content.is_empty() {
+            let has_tool_calls = choice
+                .message
+                .tool_calls
+                .as_ref()
+                .map_or(false, |t| !t.is_empty());
+            let finish_reason = format!("{:?}", choice.finish_reason);
+            warn!(
+                elapsed_ms = request_elapsed.as_millis(),
+                prompt_tokens,
+                completion_tokens,
+                has_tool_calls,
+                finish_reason,
+                "LLM returned empty content"
+            );
+        } else {
+            info!(
+                "LLM response ← {}ms, tokens: {} prompt + {} completion, content: {} chars",
+                request_elapsed.as_millis(),
+                prompt_tokens,
+                completion_tokens,
+                content.len()
+            );
+        }
+
+        Ok(content)
+    }
+}
+
+impl Default for LlmExecutor {
+    fn default() -> Self {
+        Self::with_defaults()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_executor_creation() {
+        let executor = LlmExecutor::for_model("gpt-4o");
+        assert_eq!(executor.config().model, "gpt-4o");
+        assert!(executor.throttle().is_none());
+        assert!(executor.fallback().is_none());
+    }
+
+    #[test]
+    fn test_executor_with_throttle() {
+        use crate::throttle::ConcurrencyConfig;
+
+        let controller = ConcurrencyController::new(ConcurrencyConfig::conservative());
+        let executor = LlmExecutor::for_model("gpt-4o-mini").with_throttle(controller);
+
+        assert!(executor.throttle().is_some());
+    }
+
+    #[test]
+    fn test_should_retry() {
+        let executor = LlmExecutor::with_defaults();
+
+        // Should retry on timeout
+        assert!(executor.should_retry(&LlmError::Timeout("test".to_string()), 1));
+
+        // Should retry on rate limit (if configured)
+        assert!(executor.should_retry(&LlmError::RateLimit("test".to_string()), 1));
+
+        // Should not retry on config error
+        assert!(!executor.should_retry(&LlmError::Config("test".to_string()), 1));
+
+        // Should not retry after max attempts
+        assert!(!executor.should_retry(&LlmError::Timeout("test".to_string()), 100));
+    }
+
+    #[test]
+    fn test_retry_delay() {
+        let executor = LlmExecutor::with_defaults();
+
+        // First retry attempt (attempt 1 -> delay_for_attempt(0))
+        let delay = executor.retry_delay(1);
+        assert_eq!(delay, Duration::from_millis(500));
+    }
+
+    #[test]
+    fn test_executor_with_metrics() {
+        let hub = MetricsHub::shared();
+        let executor = LlmExecutor::for_model("gpt-4o").with_shared_metrics(hub);
+
+        assert!(executor.metrics.is_some());
+    }
+
+    #[test]
+    fn test_executor_without_metrics() {
+        let executor = LlmExecutor::for_model("gpt-4o");
+        assert!(executor.metrics.is_none());
+    }
+}
diff --git a/vectorless-core/vectorless-llm/src/fallback.rs b/vectorless-core/vectorless-llm/src/fallback.rs
new file mode 100644
index 00000000..d9a84c0e
--- /dev/null
+++ b/vectorless-core/vectorless-llm/src/fallback.rs
@@ -0,0 +1,378 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Fallback and error recovery for LLM calls.
+//!
+//! This module provides graceful degradation when LLM API calls fail:
+//! - Automatic model switching (e.g., gpt-4o → gpt-4o-mini)
+//! - Automatic endpoint switching
+//! - Configurable retry and fallback behaviors
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use vectorless::llm::fallback::{FallbackChain, FallbackConfig};
+//!
+//! let config = FallbackConfig::default();
+//! let chain = FallbackChain::new(config);
+//!
+//! // Check if fallback is enabled
+//! assert!(chain.is_enabled());
+//! ```
+
+use serde::{Deserialize, Serialize};
+use tracing::{debug, info, warn};
+
+use super::error::LlmError;
+use vectorless_config::{
+    FallbackBehavior, FallbackConfig as ConfigFallbackConfig, OnAllFailedBehavior,
+};
+
+/// Result from a fallback-aware LLM call.
+#[derive(Debug, Clone)]
+pub struct FallbackResult<T> {
+    /// The actual result.
+    pub result: T,
+    /// Whether the result came from a fallback model/endpoint.
+    pub degraded: bool,
+    /// The model that was ultimately used.
+    pub model: String,
+    /// The endpoint that was ultimately used.
+    pub endpoint: String,
+    /// History of fallback attempts (for debugging).
+    pub fallback_history: Vec<FallbackStep>,
+}
+
+impl<T> FallbackResult<T> {
+    /// Create a successful result without fallback.
+    pub fn success(result: T, model: String, endpoint: String) -> Self {
+        Self {
+            result,
+            degraded: false,
+            model,
+            endpoint,
+            fallback_history: Vec::new(),
+        }
+    }
+
+    /// Create a result from a fallback.
+    pub fn from_fallback(
+        result: T,
+        model: String,
+        endpoint: String,
+        history: Vec<FallbackStep>,
+    ) -> Self {
+        Self {
+            result,
+            degraded: true,
+            model,
+            endpoint,
+            fallback_history: history,
+        }
+    }
+}
+
+/// A single step in the fallback chain.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct FallbackStep {
+    /// The model we tried.
+    pub from_model: String,
+    /// The model we fell back to (if any).
+    pub to_model: Option<String>,
+    /// The endpoint we tried.
+    pub from_endpoint: String,
+    /// The endpoint we fell back to (if any).
+    pub to_endpoint: Option<String>,
+    /// The reason for fallback.
+    pub reason: String,
+}
+
+/// Fallback chain manager.
+#[derive(Debug, Clone)]
+pub struct FallbackChain {
+    config: FallbackConfig,
+}
+
+/// Runtime fallback configuration (converted from config::FallbackConfig).
+#[derive(Debug, Clone)]
+pub struct FallbackConfig {
+    /// Whether fallback is enabled.
+    pub enabled: bool,
+    /// Fallback models in priority order.
+    pub models: Vec<String>,
+    /// Fallback endpoints in priority order.
+    pub endpoints: Vec<String>,
+    /// Behavior on rate limit error.
+    pub on_rate_limit: FallbackBehavior,
+    /// Behavior on timeout error.
+    pub on_timeout: FallbackBehavior,
+    /// Behavior when all attempts fail.
+    pub on_all_failed: OnAllFailedBehavior,
+}
+
+impl Default for FallbackConfig {
+    fn default() -> Self {
+        Self {
+            enabled: true,
+            models: vec!["gpt-4o-mini".to_string(), "glm-4-flash".to_string()],
+            endpoints: vec![],
+            on_rate_limit: FallbackBehavior::RetryThenFallback,
+            on_timeout: FallbackBehavior::RetryThenFallback,
+            on_all_failed: OnAllFailedBehavior::ReturnError,
+        }
+    }
+}
+
+impl From<ConfigFallbackConfig> for FallbackConfig {
+    fn from(config: ConfigFallbackConfig) -> Self {
+        Self {
+            enabled: config.enabled,
+            models: config.models,
+            endpoints: config.endpoints,
+            on_rate_limit: config.on_rate_limit,
+            on_timeout: config.on_timeout,
+            on_all_failed: config.on_all_failed,
+        }
+    }
+}
+
+impl FallbackConfig {
+    /// Create a new fallback config.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Disable fallback.
+    pub fn disabled() -> Self {
+        Self {
+            enabled: false,
+            ..Self::default()
+        }
+    }
+}
+
+impl FallbackChain {
+    /// Create a new fallback chain with the given configuration.
+    pub fn new(config: FallbackConfig) -> Self {
+        Self { config }
+    }
+
+    /// Create a disabled fallback chain (no fallback).
+    pub fn disabled() -> Self {
+        Self::new(FallbackConfig::disabled())
+    }
+
+    /// Get the configuration.
+    pub fn config(&self) -> &FallbackConfig {
+        &self.config
+    }
+
+    /// Check if fallback is enabled.
+    pub fn is_enabled(&self) -> bool {
+        self.config.enabled
+    }
+
+    /// Determine the appropriate behavior for an error.
+    pub fn behavior_for_error(&self, error: &LlmError) -> FallbackBehavior {
+        match error {
+            LlmError::RateLimit(_) => self.config.on_rate_limit,
+            LlmError::Timeout(_) => self.config.on_timeout,
+            _ => FallbackBehavior::Fail,
+        }
+    }
+
+    /// Check if an error should trigger fallback.
+    pub fn should_fallback(&self, error: &LlmError) -> bool {
+        if !self.config.enabled {
+            return false;
+        }
+
+        match self.behavior_for_error(error) {
+            FallbackBehavior::Fallback | FallbackBehavior::RetryThenFallback => true,
+            FallbackBehavior::Retry | FallbackBehavior::Fail => false,
+        }
+    }
+
+    /// Check if an error should trigger retry.
+    pub fn should_retry(&self, error: &LlmError) -> bool {
+        if !self.config.enabled {
+            return false;
+        }
+
+        match self.behavior_for_error(error) {
+            FallbackBehavior::Retry | FallbackBehavior::RetryThenFallback => true,
+            FallbackBehavior::Fallback | FallbackBehavior::Fail => false,
+        }
+    }
+
+    /// Get the next fallback model.
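+    ///
+    /// # Example
+    ///
+    /// A sketch of walking the chain (illustrative, `ignore`d):
+    ///
+    /// ```rust,ignore
+    /// let chain = FallbackChain::new(FallbackConfig {
+    ///     models: vec!["gpt-4o".into(), "gpt-4o-mini".into()],
+    ///     ..FallbackConfig::default()
+    /// });
+    /// assert_eq!(chain.next_model("gpt-4o").as_deref(), Some("gpt-4o-mini"));
+    /// assert_eq!(chain.next_model("gpt-4o-mini"), None);
+    /// ```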
+    pub fn next_model(&self, current: &str) -> Option<String> {
+        let models = &self.config.models;
+        let current_idx = models.iter().position(|m| m == current);
+
+        match current_idx {
+            // Current model is in the list, try next one
+            Some(idx) if idx + 1 < models.len() => {
+                let next = models[idx + 1].clone();
+                info!(from = current, to = %next, "Falling back to next model");
+                Some(next)
+            }
+            // Current model is the last in the list, no more fallbacks
+            Some(_) => {
+                warn!(
+                    model = current,
+                    "Already at last fallback model, no more available"
+                );
+                None
+            }
+            // Current model not in fallback list, try first fallback
+            None => {
+                if !models.is_empty() && models[0] != current {
+                    let next = models[0].clone();
+                    info!(from = current, to = %next, "Falling back to first fallback model");
+                    Some(next)
+                } else {
+                    warn!(model = current, "No more fallback models available");
+                    None
+                }
+            }
+        }
+    }
+
+    /// Get the next fallback endpoint.
+    pub fn next_endpoint(&self, current: &str) -> Option<String> {
+        let endpoints = &self.config.endpoints;
+        let current_idx = endpoints.iter().position(|e| e == current);
+
+        match current_idx {
+            // Current endpoint is in the list, try next one
+            Some(idx) if idx + 1 < endpoints.len() => {
+                let next = endpoints[idx + 1].clone();
+                info!(from = current, to = %next, "Falling back to next endpoint");
+                Some(next)
+            }
+            // Current endpoint is the last in the list, no more fallbacks
+            Some(_) => {
+                warn!(
+                    endpoint = current,
+                    "Already at last fallback endpoint, no more available"
+                );
+                None
+            }
+            // Current endpoint not in fallback list, try first fallback
+            None => {
+                if !endpoints.is_empty() && endpoints[0] != current {
+                    let next = endpoints[0].clone();
+                    info!(from = current, to = %next, "Falling back to first fallback endpoint");
+                    Some(next)
+                } else {
+                    debug!(endpoint = current, "No more fallback endpoints available");
+                    None
+                }
+            }
+        }
+    }
+
+    /// Record a fallback step.
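+    ///
+    /// A sketch of recording one degradation step (illustrative, `ignore`d):
+    ///
+    /// ```rust,ignore
+    /// let mut history: Vec<FallbackStep> = vec![];
+    /// chain.record_fallback(
+    ///     &mut history,
+    ///     "gpt-4o".into(),
+    ///     Some("gpt-4o-mini".into()),
+    ///     "https://api.openai.com/v1".into(),
+    ///     None,
+    ///     "rate limited".into(),
+    /// );
+    /// assert_eq!(history.len(), 1);
+    /// ```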
+    pub fn record_fallback(
+        &self,
+        history: &mut Vec<FallbackStep>,
+        from_model: String,
+        to_model: Option<String>,
+        from_endpoint: String,
+        to_endpoint: Option<String>,
+        reason: String,
+    ) {
+        let step = FallbackStep {
+            from_model,
+            to_model,
+            from_endpoint,
+            to_endpoint,
+            reason,
+        };
+        debug!(?step, "Recording fallback step");
+        history.push(step);
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_fallback_config_default() {
+        let config = FallbackConfig::default();
+        assert!(config.enabled);
+        assert!(!config.models.is_empty());
+    }
+
+    #[test]
+    fn test_fallback_chain_disabled() {
+        let chain = FallbackChain::disabled();
+        assert!(!chain.is_enabled());
+    }
+
+    #[test]
+    fn test_next_model() {
+        let config = FallbackConfig {
+            models: vec![
+                "gpt-4o".to_string(),
+                "gpt-4o-mini".to_string(),
+                "glm-4-flash".to_string(),
+            ],
+            ..FallbackConfig::default()
+        };
+        let chain = FallbackChain::new(config);
+
+        // Should get next model in chain
+        assert_eq!(chain.next_model("gpt-4o"), Some("gpt-4o-mini".to_string()));
+        assert_eq!(
+            chain.next_model("gpt-4o-mini"),
+            Some("glm-4-flash".to_string())
+        );
+        assert_eq!(chain.next_model("glm-4-flash"), None);
+    }
+
+    #[test]
+    fn test_next_model_not_in_list() {
+        let config = FallbackConfig {
+            models: vec!["gpt-4o-mini".to_string()],
+            ..FallbackConfig::default()
+        };
+        let chain = FallbackChain::new(config);
+
+        // Should fall back to first model in list
+        assert_eq!(
+            chain.next_model("unknown-model"),
+            Some("gpt-4o-mini".to_string())
+        );
+    }
+
+    #[test]
+    fn test_behavior_for_rate_limit() {
+        let config = FallbackConfig {
+            on_rate_limit: FallbackBehavior::Fallback,
+            ..FallbackConfig::default()
+        };
+        let chain = FallbackChain::new(config);
+
+        let error = LlmError::RateLimit("Rate limited".to_string());
+        assert_eq!(chain.behavior_for_error(&error), FallbackBehavior::Fallback);
+    }
+
+    #[test]
+    fn test_should_fallback() {
+        let config = FallbackConfig {
+            enabled: true,
+            on_rate_limit: FallbackBehavior::RetryThenFallback,
+            ..FallbackConfig::default()
+        };
+        let chain = FallbackChain::new(config);
+
+        let error = LlmError::RateLimit("Rate limited".to_string());
+        assert!(chain.should_fallback(&error));
+
+        let chain_disabled = FallbackChain::disabled();
+        assert!(!chain_disabled.should_fallback(&error));
+    }
+}
diff --git a/vectorless-core/vectorless-llm/src/lib.rs b/vectorless-core/vectorless-llm/src/lib.rs
new file mode 100644
index 00000000..e61c6eb7
--- /dev/null
+++ b/vectorless-core/vectorless-llm/src/lib.rs
@@ -0,0 +1,45 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Unified LLM client module.
+//!
+//! This module provides a unified interface for all LLM operations across the codebase:
+//! - **Index** — Document indexing and summarization
+//! - **Retrieval** — Document tree navigation
+//! - **Pilot** — Navigation guidance
+//!
+//! # Architecture
+//!
+//! ```text
+//! ┌───────────────────────────────────────────────────────┐
+//! │                        LlmPool                        │
+//! │                                                       │
+//! │  ┌─────────────┐   ┌─────────────┐   ┌─────────────┐  │
+//! │  │    index    │   │  retrieval  │   │    pilot    │  │
+//! │  │  LlmClient  │   │  LlmClient  │   │  LlmClient  │  │
+//! │  └──────┬──────┘   └──────┬──────┘   └──────┬──────┘  │
+//! │         │                 │                 │         │
+//! │         └─────────────────┼─────────────────┘         │
+//! │                           │                           │
+//! │                           ▼                           │
+//! │                ┌─────────────────────┐                │
+//! │                │    async-openai     │                │
+//! │                └─────────────────────┘                │
+//! └───────────────────────────────────────────────────────┘
+//! ```
+
+mod client;
+pub(crate) mod config;
+mod error;
+mod executor;
+mod fallback;
+pub(crate) mod memo;
+mod pool;
+pub(crate) mod throttle;
+
+pub use client::LlmClient;
+pub use error::LlmResult;
+pub use pool::LlmPool;
+
+// Re-export vectorless_error types for internal use
+pub(crate) use vectorless_error::{Error, Result};
diff --git a/vectorless-core/vectorless-llm/src/memo/mod.rs b/vectorless-core/vectorless-llm/src/memo/mod.rs
new file mode 100644
index 00000000..79c9ae78
--- /dev/null
+++ b/vectorless-core/vectorless-llm/src/memo/mod.rs
@@ -0,0 +1,14 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! LLM memoization system for caching expensive LLM calls.
+//!
+//! Provides a caching layer for LLM-generated content, avoiding
+//! redundant API calls via a content-addressed LRU cache with TTL
+//! and optional disk persistence.
+
+mod store;
+mod types;
+
+pub use store::MemoStore;
+pub use types::{MemoKey, MemoValue};
diff --git a/vectorless-core/vectorless-llm/src/memo/store.rs b/vectorless-core/vectorless-llm/src/memo/store.rs
new file mode 100644
index 00000000..0c681ab0
--- /dev/null
+++ b/vectorless-core/vectorless-llm/src/memo/store.rs
@@ -0,0 +1,679 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Memoization store implementation.
+//!
+//! Provides an in-memory LRU cache with optional disk persistence.
+
+use std::collections::HashMap;
+use std::future::Future;
+use std::path::Path;
+use std::sync::Arc;
+use std::sync::atomic::{AtomicU64, Ordering};
+
+use chrono::Duration;
+use lru::LruCache;
+use parking_lot::RwLock;
+use serde::{Deserialize, Serialize};
+use tracing::{debug, info};
+
+use super::types::{MemoEntry, MemoKey, MemoOpType, MemoStats, MemoValue};
+use vectorless_error::Result;
+use vectorless_utils::fingerprint::Fingerprint;
+
+/// Default TTL for cache entries (7 days).
+const DEFAULT_TTL: Duration = Duration::days(7);
+
+/// Default maximum cache size.
+const DEFAULT_MAX_SIZE: usize = 10_000;
+
+/// Serializable format for memo store persistence.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+struct MemoStoreData {
+    /// Format version.
+    version: u32,
+
+    /// Cache entries.
+    entries: HashMap<String, MemoEntry>,
+
+    /// Statistics.
+    stats: MemoStats,
+}
+
+/// Lock-free atomic statistics for concurrent access.
+#[derive(Debug)]
+struct AtomicStats {
+    hits: AtomicU64,
+    misses: AtomicU64,
+    tokens_saved: AtomicU64,
+}
+
+impl AtomicStats {
+    fn new() -> Self {
+        Self {
+            hits: AtomicU64::new(0),
+            misses: AtomicU64::new(0),
+            tokens_saved: AtomicU64::new(0),
+        }
+    }
+
+    fn record_hit(&self) {
+        self.hits.fetch_add(1, Ordering::Relaxed);
+    }
+
+    fn record_miss(&self) {
+        self.misses.fetch_add(1, Ordering::Relaxed);
+    }
+
+    fn add_tokens_saved(&self, tokens: u64) {
+        self.tokens_saved.fetch_add(tokens, Ordering::Relaxed);
+    }
+
+    fn snapshot(&self) -> (u64, u64, u64) {
+        (
+            self.hits.load(Ordering::Relaxed),
+            self.misses.load(Ordering::Relaxed),
+            self.tokens_saved.load(Ordering::Relaxed),
+        )
+    }
+
+    fn load_from(&self, hits: u64, misses: u64, tokens_saved: u64) {
+        self.hits.store(hits, Ordering::Relaxed);
+        self.misses.store(misses, Ordering::Relaxed);
+        self.tokens_saved.store(tokens_saved, Ordering::Relaxed);
+    }
+}
+
+/// LLM memoization store.
+///
+/// Provides caching for expensive LLM operations with:
+/// - LRU eviction policy
+/// - TTL-based expiration
+/// - Optional disk persistence
+/// - Thread-safe access
+///
+/// # Example
+///
+/// ```rust,ignore
+/// let store = MemoStore::new();
+///
+/// let summary = store.get_or_compute(
+///     MemoKey::summary(&content_fp),
+///     || async {
+///         llm.generate_summary(content).await
+///     }
+/// ).await?;
+/// ```
+pub struct MemoStore {
+    /// LRU cache for entries.
+    cache: Arc<RwLock<LruCache<String, MemoEntry>>>,
+
+    /// Lock-free statistics.
+    stats: Arc<AtomicStats>,
+
+    /// TTL for entries.
+    ttl: Duration,
+
+    /// Model identifier for cache keys.
+    model_id: Option<String>,
+
+    /// Version for cache invalidation.
+    version: u32,
+}
+
+impl std::fmt::Debug for MemoStore {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("MemoStore")
+            .field("ttl", &self.ttl)
+            .field("model_id", &self.model_id)
+            .field("version", &self.version)
+            .field("cache_len", &self.cache.read().len())
+            .finish()
+    }
+}
+
+impl Clone for MemoStore {
+    fn clone(&self) -> Self {
+        Self {
+            cache: Arc::clone(&self.cache),
+            stats: Arc::clone(&self.stats),
+            ttl: self.ttl,
+            model_id: self.model_id.clone(),
+            version: self.version,
+        }
+    }
+}
+
+impl MemoStore {
+    /// Create a new memo store with default size.
+    pub fn new() -> Self {
+        Self::with_capacity(DEFAULT_MAX_SIZE)
+    }
+
+    /// Create a new memo store with specified capacity.
+    pub fn with_capacity(capacity: usize) -> Self {
+        Self {
+            cache: Arc::new(RwLock::new(LruCache::new(
+                std::num::NonZeroUsize::new(capacity)
+                    .unwrap_or(std::num::NonZeroUsize::new(1000).unwrap()),
+            ))),
+            stats: Arc::new(AtomicStats::new()),
+            ttl: DEFAULT_TTL,
+            model_id: None,
+            version: 1,
+        }
+    }
+
+    /// Set the TTL for cache entries.
+    pub fn with_ttl(mut self, ttl: Duration) -> Self {
+        self.ttl = ttl;
+        self
+    }
+
+    /// Set the model identifier.
+    pub fn with_model(mut self, model_id: &str) -> Self {
+        self.model_id = Some(model_id.to_string());
+        self
+    }
+
+    /// Set the version.
+    pub fn with_version(mut self, version: u32) -> Self {
+        self.version = version;
+        self
+    }
+
+    /// Get a cached value if present and not expired.
+    pub fn get(&self, key: &MemoKey) -> Option<MemoValue> {
+        let full_key = self.make_key(key);
+        let mut cache = self.cache.write();
+
+        if let Some(entry) = cache.get_mut(&full_key) {
+            if entry.is_expired(self.ttl) {
+                cache.pop(&full_key);
+                return None;
+            }
+            entry.record_hit();
+            debug!("Memo cache hit for {:?}", key.op_type);
+            return Some(entry.value.clone());
+        }
+
+        None
+    }
+
+    /// Put a value in the cache.
+    pub fn put(&self, key: MemoKey, value: MemoValue) {
+        self.put_with_tokens(key, value, 0);
+    }
+
+    /// Put a value in the cache with token count.
+    pub fn put_with_tokens(&self, key: MemoKey, value: MemoValue, tokens_saved: u64) {
+        let full_key = self.make_key(&key);
+        let entry = MemoEntry::with_tokens(value, tokens_saved);
+
+        let mut cache = self.cache.write();
+        cache.put(full_key, entry);
+
+        debug!("Memo cache put for {:?}", key.op_type);
+    }
+
+    /// Get a value or compute it if not present.
+    ///
+    /// This is the primary method for using the memo store.
+    /// It will return the cached value if present, or call the
+    /// provided compute function and cache the result.
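+    ///
+    /// # Example
+    ///
+    /// A sketch of the closure contract: the future must resolve to
+    /// `(value, tokens_spent)`; `call_llm` is a hypothetical helper
+    /// (illustrative, `ignore`d):
+    ///
+    /// ```rust,ignore
+    /// let value = store
+    ///     .get_or_compute(MemoKey::summary(&fp), || async {
+    ///         let (text, tokens) = call_llm().await?; // hypothetical
+    ///         Ok((MemoValue::Summary(text), tokens))
+    ///     })
+    ///     .await?;
+    /// ```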
+    pub async fn get_or_compute<F, Fut>(&self, key: MemoKey, compute: F) -> Result<MemoValue>
+    where
+        F: FnOnce() -> Fut,
+        Fut: Future<Output = Result<(MemoValue, u64)>>, // (value, tokens)
+    {
+        // Check cache first (synchronous)
+        if let Some(value) = self.get(&key) {
+            self.stats.record_hit();
+            return Ok(value);
+        }
+
+        // Record miss
+        self.stats.record_miss();
+
+        // Compute
+        let (value, tokens) = compute().await?;
+
+        // Cache result
+        self.put_with_tokens(key.clone(), value.clone(), tokens);
+
+        // Update tokens saved
+        self.stats.add_tokens_saved(tokens);
+
+        Ok(value)
+    }
+
+    /// Check if a key exists in the cache.
+    pub fn contains(&self, key: &MemoKey) -> bool {
+        let full_key = self.make_key(key);
+        let cache = self.cache.read();
+        cache.contains(&full_key)
+    }
+
+    /// Remove a key from the cache.
+    pub fn remove(&self, key: &MemoKey) -> Option<MemoValue> {
+        let full_key = self.make_key(key);
+        let mut cache = self.cache.write();
+        cache.pop(&full_key).map(|e| e.value)
+    }
+
+    /// Clear all entries from the cache.
+    pub fn clear(&self) {
+        let mut cache = self.cache.write();
+        cache.clear();
+        debug!("Memo cache cleared");
+    }
+
+    /// Get the number of entries in the cache.
+    pub fn len(&self) -> usize {
+        let cache = self.cache.read();
+        cache.len()
+    }
+
+    /// Check if the cache is empty.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Get cache statistics (synchronous, lock-free).
+    pub fn stats(&self) -> MemoStats {
+        let (hits, misses, tokens_saved) = self.stats.snapshot();
+        MemoStats {
+            entries: self.len(),
+            hits,
+            misses,
+            tokens_saved,
+            cost_saved: 0.0,
+        }
+    }
+
+    /// Invalidate all entries of a specific operation type.
+    ///
+    /// Useful when the algorithm for a specific operation changes.
+    pub fn invalidate_by_op_type(&self, op_type: MemoOpType) -> usize {
+        let mut cache = self.cache.write();
+        let before = cache.len();
+
+        let keys_to_remove: Vec<String> = cache
+            .iter()
+            .filter_map(|(key, entry)| {
+                let matches = match (&entry.value, op_type) {
+                    (MemoValue::Summary(_), MemoOpType::Summary) => true,
+                    (MemoValue::PilotDecision(_), MemoOpType::PilotDecision) => true,
+                    (MemoValue::QueryAnalysis(_), MemoOpType::QueryAnalysis) => true,
+                    (MemoValue::Extraction(_), MemoOpType::Extraction) => true,
+                    _ => false,
+                };
+                if matches { Some(key.clone()) } else { None }
+            })
+            .collect();
+
+        for key in keys_to_remove {
+            cache.pop(&key);
+        }
+
+        let removed = before - cache.len();
+        if removed > 0 {
+            debug!("Invalidated {} entries for op_type {:?}", removed, op_type);
+        }
+        removed
+    }
+
+    /// Invalidate all entries matching a model ID prefix.
+    ///
+    /// Useful when switching models or when a model's behavior changes.
+    pub fn invalidate_by_model_prefix(&self, prefix: &str) -> usize {
+        let mut cache = self.cache.write();
+        let before = cache.len();
+
+        let should_clear = self
+            .model_id
+            .as_ref()
+            .map(|m| m.starts_with(prefix))
+            .unwrap_or(false);
+
+        if should_clear {
+            cache.clear();
+            let removed = before;
+            debug!(
+                "Invalidated all {} entries (model prefix '{}')",
+                removed, prefix
+            );
+            return removed;
+        }
+
+        0
+    }
+
+    /// Remove expired entries.
+    pub fn prune_expired(&self) -> usize {
+        let mut cache = self.cache.write();
+        let before = cache.len();
+
+        let expired: Vec<String> = cache
+            .iter()
+            .filter(|(_, entry)| entry.is_expired(self.ttl))
+            .map(|(k, _)| k.clone())
+            .collect();
+
+        for key in expired {
+            cache.pop(&key);
+        }
+
+        let removed = before - cache.len();
+        if removed > 0 {
+            debug!("Pruned {} expired memo entries", removed);
+        }
+        removed
+    }
+
+    /// Save the cache to disk.
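+    ///
+    /// The write is atomic: the store is serialized to a sibling `.tmp`
+    /// file first and then renamed over `path`, so a crash mid-write never
+    /// leaves a truncated store behind.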
+    pub async fn save(&self, path: &Path) -> Result<()> {
+        // Prune expired entries before persisting
+        self.prune_expired();
+
+        // Snapshot stats before taking the cache lock (stats() reads the
+        // cache itself, and parking_lot read locks are not reentrant).
+        let stats = self.stats();
+        let cache = self.cache.read();
+
+        let entries: HashMap<String, MemoEntry> =
+            cache.iter().map(|(k, v)| (k.clone(), v.clone())).collect();
+
+        let data = MemoStoreData {
+            version: 1,
+            entries,
+            stats,
+        };
+
+        let parent = path
+            .parent()
+            .ok_or_else(|| vectorless_error::Error::Parse("Invalid path for memo store".to_string()))?;
+        tokio::fs::create_dir_all(parent).await?;
+
+        let temp_path = path.with_extension("tmp");
+        let json = serde_json::to_vec_pretty(&data)
+            .map_err(|e| vectorless_error::Error::Parse(format!("Failed to serialize memo store: {}", e)))?;
+        tokio::fs::write(&temp_path, &json).await?;
+        tokio::fs::rename(&temp_path, path).await?;
+
+        info!(
+            "Saved memo store with {} entries to {:?}",
+            data.entries.len(),
+            path
+        );
+        Ok(())
+    }
+
+    /// Load the cache from disk.
+    pub async fn load(&self, path: &Path) -> Result<()> {
+        if !path.exists() {
+            return Ok(());
+        }
+
+        let bytes = tokio::fs::read(path).await?;
+        let data: MemoStoreData = serde_json::from_slice(&bytes)
+            .map_err(|e| vectorless_error::Error::Parse(format!("Failed to deserialize memo store: {}", e)))?;
+
+        let mut cache = self.cache.write();
+
+        for (key, entry) in data.entries {
+            if !entry.is_expired(self.ttl) {
+                cache.put(key, entry);
+            }
+        }
+
+        // Restore stats
+        self.stats
+            .load_from(data.stats.hits, data.stats.misses, data.stats.tokens_saved);
+
+        info!(
+            "Loaded memo store with {} entries from {:?}",
+            cache.len(),
+            path
+        );
+        Ok(())
+    }
+
+    /// Make a full cache key from a MemoKey.
+    fn make_key(&self, key: &MemoKey) -> String {
+        let mut key_with_context = key.clone();
+        if key_with_context.model_id.is_none() {
+            key_with_context.model_id = self.model_id.clone();
+        }
+        if key_with_context.version == 0 {
+            key_with_context.version = self.version;
+        }
+        key_with_context.fingerprint().to_string()
+    }
+}
+
+impl Default for MemoStore {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// A helper for building memo keys with context.
+pub struct MemoKeyBuilder {
+    model_id: Option<String>,
+    version: u32,
+}
+
+impl MemoKeyBuilder {
+    /// Create a new key builder.
+    pub fn new() -> Self {
+        Self {
+            model_id: None,
+            version: 1,
+        }
+    }
+
+    /// Set the model identifier.
+    pub fn with_model(mut self, model_id: &str) -> Self {
+        self.model_id = Some(model_id.to_string());
+        self
+    }
+
+    /// Set the version.
+    pub fn with_version(mut self, version: u32) -> Self {
+        self.version = version;
+        self
+    }
+
+    /// Build a summary key.
+    pub fn summary_key(&self, content_fp: &Fingerprint) -> MemoKey {
+        MemoKey {
+            op_type: super::types::MemoOpType::Summary,
+            input_fp: *content_fp,
+            model_id: self.model_id.clone(),
+            version: self.version,
+            context_fp: Fingerprint::zero(),
+        }
+    }
+
+    /// Build a pilot decision key.
+    pub fn pilot_key(&self, context_fp: &Fingerprint, query_fp: &Fingerprint) -> MemoKey {
+        MemoKey {
+            op_type: super::types::MemoOpType::PilotDecision,
+            input_fp: *query_fp,
+            model_id: self.model_id.clone(),
+            version: self.version,
+            context_fp: *context_fp,
+        }
+    }
+
+    /// Build a query analysis key.
+    pub fn query_analysis_key(&self, query_fp: &Fingerprint) -> MemoKey {
+        MemoKey {
+            op_type: super::types::MemoOpType::QueryAnalysis,
+            input_fp: *query_fp,
+            model_id: self.model_id.clone(),
+            version: self.version,
+            context_fp: Fingerprint::zero(),
+        }
+    }
+
+    /// Build an extraction key.
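+    ///
+    /// Keys built by one builder share `model_id` and `version`, so a model
+    /// switch or an algorithm bump invalidates them together. A sketch
+    /// (illustrative, `ignore`d):
+    ///
+    /// ```rust,ignore
+    /// let builder = MemoKeyBuilder::new().with_model("gpt-4o").with_version(2);
+    /// let key = builder.extraction_key(&Fingerprint::from_str("raw text"));
+    /// assert_eq!(key.version, 2);
+    /// ```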
+ pub fn extraction_key(&self, content_fp: &Fingerprint) -> MemoKey { + MemoKey { + op_type: super::types::MemoOpType::Extraction, + input_fp: *content_fp, + model_id: self.model_id.clone(), + version: self.version, + context_fp: Fingerprint::zero(), + } + } +} + +impl Default for MemoKeyBuilder { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + fn make_test_key() -> MemoKey { + let fp = Fingerprint::from_str("test content"); + MemoKey::summary(&fp) + } + + #[test] + fn test_memo_store_basic() { + let store = MemoStore::new(); + let key = make_test_key(); + + assert!(!store.contains(&key)); + + store.put(key.clone(), MemoValue::Summary("Test summary".to_string())); + + assert!(store.contains(&key)); + + let value = store.get(&key); + assert!(value.is_some()); + assert_eq!(value.unwrap().as_summary(), Some("Test summary")); + } + + #[test] + fn test_memo_store_lru_eviction() { + let store = MemoStore::with_capacity(3); + + for i in 0..5 { + let fp = Fingerprint::from_str(&format!("content {}", i)); + let key = MemoKey::summary(&fp); + store.put(key, MemoValue::Summary(format!("Summary {}", i))); + } + + assert_eq!(store.len(), 3); + } + + #[tokio::test] + async fn test_memo_store_get_or_compute() { + let store = MemoStore::new(); + let key = make_test_key(); + + let call_count = Arc::new(std::sync::atomic::AtomicU64::new(0)); + let count_clone = call_count.clone(); + + // First call should compute + let result = store + .get_or_compute(key.clone(), || { + let c = count_clone.clone(); + async move { + c.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + Ok((MemoValue::Summary("Computed".to_string()), 100)) + } + }) + .await + .unwrap(); + + assert_eq!(result.as_summary(), Some("Computed")); + assert_eq!(call_count.load(std::sync::atomic::Ordering::SeqCst), 1); + + // Second call should use cache + let result2 = store + .get_or_compute(key.clone(), || { + let c = count_clone.clone(); + async move { + c.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + Ok((MemoValue::Summary("Should not be called".to_string()), 100)) + } + }) + .await + .unwrap(); + + assert_eq!(result2.as_summary(), Some("Computed")); + assert_eq!(call_count.load(std::sync::atomic::Ordering::SeqCst), 1); + } + + #[tokio::test] + async fn test_memo_store_persistence() { + let temp = TempDir::new().unwrap(); + let path = temp.path().join("memo.json"); + + let store = MemoStore::new(); + let key = make_test_key(); + + store.put_with_tokens( + key.clone(), + MemoValue::Summary("Test summary".to_string()), + 100, + ); + + // Save + store.save(&path).await.unwrap(); + assert!(path.exists()); + + // Load into new store + let store2 = MemoStore::new(); + store2.load(&path).await.unwrap(); + + assert!(store2.contains(&key)); + let value = store2.get(&key); + assert_eq!(value.unwrap().as_summary(), Some("Test summary")); + } + + #[tokio::test] + async fn test_memo_store_stats() { + let store = MemoStore::new(); + let key = make_test_key(); + + // Miss + store + .get_or_compute(key.clone(), || async { + Ok((MemoValue::Summary("Test".to_string()), 100)) + }) + .await + .unwrap(); + + // Hit + store + .get_or_compute(key.clone(), || async { + Ok((MemoValue::Summary("Should not be called".to_string()), 0)) + }) + .await + .unwrap(); + + let stats = store.stats(); + assert_eq!(stats.misses, 1); + assert_eq!(stats.hits, 1); + assert_eq!(stats.tokens_saved, 100); + } + + #[test] + fn test_memo_key_builder() { + let builder = 
MemoKeyBuilder::new().with_model("gpt-4").with_version(2);
+
+        let fp = Fingerprint::from_str("content");
+        let key = builder.summary_key(&fp);
+
+        assert_eq!(key.model_id, Some("gpt-4".to_string()));
+        assert_eq!(key.version, 2);
+    }
+}
diff --git a/vectorless-core/vectorless-llm/src/memo/types.rs b/vectorless-core/vectorless-llm/src/memo/types.rs
new file mode 100644
index 00000000..9e3cb86d
--- /dev/null
+++ b/vectorless-core/vectorless-llm/src/memo/types.rs
@@ -0,0 +1,414 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Types for the memoization system.
+
+use chrono::{DateTime, Utc};
+use serde::{Deserialize, Serialize};
+
+use vectorless_utils::fingerprint::Fingerprint;
+
+/// Types of operations that can be memoized.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum MemoOpType {
+    /// Node summary generation.
+    Summary,
+
+    /// Pilot navigation decision.
+    PilotDecision,
+
+    /// Query analysis result.
+    QueryAnalysis,
+
+    /// Content extraction result.
+    Extraction,
+
+    /// LLM node evaluation during retrieval.
+    NodeEvaluation,
+
+    /// Sufficiency check result.
+    SufficiencyCheck,
+
+    /// Query complexity detection.
+    ComplexityDetection,
+
+    /// Query decomposition.
+    QueryDecomposition,
+
+    /// Custom operation type.
+    Custom(u8),
+}
+
+impl MemoOpType {
+    /// Get a unique byte identifier for this operation type.
+    pub fn as_byte(&self) -> u8 {
+        match self {
+            MemoOpType::Summary => 0,
+            MemoOpType::PilotDecision => 1,
+            MemoOpType::QueryAnalysis => 2,
+            MemoOpType::Extraction => 3,
+            MemoOpType::NodeEvaluation => 4,
+            MemoOpType::SufficiencyCheck => 5,
+            MemoOpType::ComplexityDetection => 6,
+            MemoOpType::QueryDecomposition => 7,
+            MemoOpType::Custom(n) => 100 + n,
+        }
+    }
+}
+
+/// Key for memoization lookup.
+///
+/// Keys are content-addressed using fingerprints, ensuring that
+/// cache hits only occur when the input is semantically identical.
+#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub struct MemoKey {
+    /// Type of operation being memoized.
+    pub op_type: MemoOpType,
+
+    /// Fingerprint of the input content.
+    pub input_fp: Fingerprint,
+
+    /// Optional model identifier for cache invalidation when model changes.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub model_id: Option<String>,
+
+    /// Optional version for cache invalidation when algorithm changes.
+    #[serde(default)]
+    pub version: u32,
+
+    /// Additional context fingerprint (e.g., query context for pilot decisions).
+    #[serde(default, skip_serializing_if = "Fingerprint::is_zero")]
+    pub context_fp: Fingerprint,
+}
+
+impl MemoKey {
+    /// Create a key for summary generation.
+    pub fn summary(content_fp: &Fingerprint) -> Self {
+        Self {
+            op_type: MemoOpType::Summary,
+            input_fp: *content_fp,
+            model_id: None,
+            version: 1,
+            context_fp: Fingerprint::zero(),
+        }
+    }
+
+    /// Create a key for summary generation with model and version.
+    pub fn summary_with_model(content_fp: &Fingerprint, model_id: &str, version: u32) -> Self {
+        Self {
+            op_type: MemoOpType::Summary,
+            input_fp: *content_fp,
+            model_id: Some(model_id.to_string()),
+            version,
+            context_fp: Fingerprint::zero(),
+        }
+    }
+
+    /// Create a key for pilot decision.
+    pub fn pilot_decision(context_fp: &Fingerprint, query_fp: &Fingerprint) -> Self {
+        Self {
+            op_type: MemoOpType::PilotDecision,
+            input_fp: *query_fp,
+            model_id: None,
+            version: 1,
+            context_fp: *context_fp,
+        }
+    }
+
+    /// Create a key for query analysis.
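+    ///
+    /// A sketch (illustrative, `ignore`d):
+    ///
+    /// ```rust,ignore
+    /// let key = MemoKey::query_analysis(&Fingerprint::from_str("what changed?"));
+    /// assert_eq!(key.op_type, MemoOpType::QueryAnalysis);
+    /// ```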
+    pub fn query_analysis(query_fp: &Fingerprint) -> Self {
+        Self {
+            op_type: MemoOpType::QueryAnalysis,
+            input_fp: *query_fp,
+            model_id: None,
+            version: 1,
+            context_fp: Fingerprint::zero(),
+        }
+    }
+
+    /// Create a key for content extraction.
+    pub fn extraction(content_fp: &Fingerprint) -> Self {
+        Self {
+            op_type: MemoOpType::Extraction,
+            input_fp: *content_fp,
+            model_id: None,
+            version: 1,
+            context_fp: Fingerprint::zero(),
+        }
+    }
+
+    /// Set the model identifier.
+    pub fn with_model(mut self, model_id: &str) -> Self {
+        self.model_id = Some(model_id.to_string());
+        self
+    }
+
+    /// Set the version.
+    pub fn with_version(mut self, version: u32) -> Self {
+        self.version = version;
+        self
+    }
+
+    /// Set the context fingerprint.
+    pub fn with_context(mut self, context_fp: &Fingerprint) -> Self {
+        self.context_fp = *context_fp;
+        self
+    }
+
+    /// Compute a fingerprint of this key for storage.
+    pub fn fingerprint(&self) -> Fingerprint {
+        use vectorless_utils::fingerprint::Fingerprinter;
+
+        let mut fp = Fingerprinter::new();
+        fp.write_u64(self.op_type.as_byte() as u64);
+        fp.write_fingerprint(&self.input_fp);
+        fp.write_option_str(self.model_id.as_deref());
+        fp.write_u64(self.version as u64);
+        if !self.context_fp.is_zero() {
+            fp.write_fingerprint(&self.context_fp);
+        }
+        fp.into_fingerprint()
+    }
+}
+
+/// Cached value from an LLM operation.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub enum MemoValue {
+    /// Generated summary text.
+    Summary(String),
+
+    /// Pilot navigation decision.
+    PilotDecision(PilotDecisionValue),
+
+    /// Query analysis result.
+    QueryAnalysis(QueryAnalysisValue),
+
+    /// Extracted content.
+    Extraction(serde_json::Value),
+
+    /// Raw text (for custom operations).
+    Text(String),
+
+    /// JSON value (for structured outputs).
+    Json(serde_json::Value),
+}
+
+/// Serializable pilot decision value.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PilotDecisionValue {
+    /// Selected candidate index.
+    pub selected_idx: usize,
+
+    /// Confidence score (0.0 to 1.0).
+    pub confidence: f32,
+
+    /// Reasoning text.
+    pub reasoning: String,
+}
+
+/// Serializable query analysis value.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct QueryAnalysisValue {
+    /// Query complexity score.
+    pub complexity: f32,
+
+    /// Detected intent.
+    pub intent: String,
+
+    /// Suggested strategy.
+    pub strategy: String,
+}
+
+impl MemoValue {
+    /// Get the value as a string summary.
+    pub fn as_summary(&self) -> Option<&str> {
+        match self {
+            MemoValue::Summary(s) => Some(s),
+            _ => None,
+        }
+    }
+
+    /// Get the value as text.
+    pub fn as_text(&self) -> Option<&str> {
+        match self {
+            MemoValue::Text(s) => Some(s),
+            MemoValue::Summary(s) => Some(s),
+            _ => None,
+        }
+    }
+
+    /// Check if this is a summary value.
+    pub fn is_summary(&self) -> bool {
+        matches!(self, MemoValue::Summary(_))
+    }
+}
+
+/// A cached entry in the memo store.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct MemoEntry {
+    /// The cached value.
+    pub value: MemoValue,
+
+    /// When this entry was created.
+    pub created_at: DateTime<Utc>,
+
+    /// When this entry was last accessed.
+    pub last_accessed: DateTime<Utc>,
+
+    /// Number of cache hits.
+    pub hits: u64,
+
+    /// Token cost saved by this cache entry.
+    pub tokens_saved: u64,
+}
+
+impl MemoEntry {
+    /// Create a new entry.
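+    ///
+    /// A sketch (illustrative, `ignore`d):
+    ///
+    /// ```rust,ignore
+    /// let entry = MemoEntry::new(MemoValue::Summary("cached".into()));
+    /// assert_eq!(entry.hits, 0);
+    /// ```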
+ pub fn new(value: MemoValue) -> Self { + let now = Utc::now(); + Self { + value, + created_at: now, + last_accessed: now, + hits: 0, + tokens_saved: 0, + } + } + + /// Create a new entry with token count. + pub fn with_tokens(value: MemoValue, tokens_saved: u64) -> Self { + Self { + tokens_saved, + ..Self::new(value) + } + } + + /// Record a cache hit. + pub fn record_hit(&mut self) { + self.hits += 1; + self.last_accessed = Utc::now(); + } + + /// Check if this entry has expired. + pub fn is_expired(&self, ttl: chrono::Duration) -> bool { + let now = Utc::now(); + now - self.created_at > ttl + } + + /// Get the age of this entry. + pub fn age(&self) -> chrono::Duration { + Utc::now() - self.created_at + } +} + +/// Statistics for the memo store. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct MemoStats { + /// Total number of cache entries. + pub entries: usize, + + /// Total cache hits. + pub hits: u64, + + /// Total cache misses. + pub misses: u64, + + /// Total tokens saved by cache hits. + pub tokens_saved: u64, + + /// Estimated cost saved (in USD). + pub cost_saved: f64, +} + +impl MemoStats { + /// Calculate the cache hit rate. + pub fn hit_rate(&self) -> f64 { + let total = self.hits + self.misses; + if total == 0 { + 0.0 + } else { + self.hits as f64 / total as f64 + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_memo_key_summary() { + let fp = Fingerprint::from_str("test content"); + let key = MemoKey::summary(&fp); + + assert_eq!(key.op_type, MemoOpType::Summary); + assert_eq!(key.input_fp, fp); + assert!(key.model_id.is_none()); + } + + #[test] + fn test_memo_key_with_model() { + let fp = Fingerprint::from_str("test content"); + let key = MemoKey::summary(&fp).with_model("gpt-4").with_version(2); + + assert_eq!(key.model_id, Some("gpt-4".to_string())); + assert_eq!(key.version, 2); + } + + #[test] + fn test_memo_key_fingerprint() { + let fp = Fingerprint::from_str("test content"); + let key1 = MemoKey::summary(&fp); + let key2 = MemoKey::summary(&fp); + + assert_eq!(key1.fingerprint(), key2.fingerprint()); + + let key3 = MemoKey::summary_with_model(&fp, "gpt-4", 1); + assert_ne!(key1.fingerprint(), key3.fingerprint()); + } + + #[test] + fn test_memo_entry() { + let entry = MemoEntry::new(MemoValue::Summary("Test summary".to_string())); + + assert_eq!(entry.hits, 0); + assert!(entry.value.as_summary().is_some()); + } + + #[test] + fn test_memo_entry_hit() { + let mut entry = MemoEntry::new(MemoValue::Summary("Test summary".to_string())); + entry.record_hit(); + entry.record_hit(); + + assert_eq!(entry.hits, 2); + } + + #[test] + fn test_memo_stats_hit_rate() { + let mut stats = MemoStats::default(); + stats.hits = 80; + stats.misses = 20; + + assert!((stats.hit_rate() - 0.8).abs() < 0.001); + } + + #[test] + fn test_memo_key_serialization() { + let fp = Fingerprint::from_str("test content"); + let key = MemoKey::summary_with_model(&fp, "gpt-4", 1); + + let json = serde_json::to_string(&key).unwrap(); + let decoded: MemoKey = serde_json::from_str(&json).unwrap(); + + assert_eq!(key, decoded); + } + + #[test] + fn test_memo_value_serialization() { + let value = MemoValue::Summary("Test summary".to_string()); + let json = serde_json::to_string(&value).unwrap(); + let decoded: MemoValue = serde_json::from_str(&json).unwrap(); + assert_eq!(value.as_summary(), decoded.as_summary()); + } +} diff --git a/vectorless-core/vectorless-llm/src/pool.rs b/vectorless-core/vectorless-llm/src/pool.rs new file mode 100644 index 
00000000..45d3b4be
--- /dev/null
+++ b/vectorless-core/vectorless-llm/src/pool.rs
@@ -0,0 +1,176 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! LLM client pool for managing multiple clients.
+
+use std::sync::Arc;
+
+use super::client::LlmClient;
+use super::config::LlmConfig;
+use super::fallback::{FallbackChain, FallbackConfig};
+use super::throttle::ConcurrencyController;
+use vectorless_metrics::MetricsHub;
+
+/// Pool of LLM clients for different purposes.
+///
+/// This provides a centralized way to access LLM clients
+/// configured for specific tasks:
+/// - **Index** — Document indexing/summarization (fast, cheap model)
+/// - **Retrieval** — Document navigation (capable model)
+///
+/// # Construction
+///
+/// The pool is built from a [`config::LlmConfig`](vectorless_config::LlmConfig)
+/// which defines the global credentials and per-slot overrides.
+///
+/// ```rust,ignore
+/// use vectorless::llm::LlmPool;
+///
+/// let pool = LlmPool::from_config(&config.llm, None);
+///
+/// // Use the index client for summarization
+/// let summary = pool.index().complete(
+///     "You summarize text concisely.",
+///     "Long text to summarize..."
+/// ).await?;
+/// ```
+#[derive(Debug, Clone)]
+pub struct LlmPool {
+    index: Arc<LlmClient>,
+    retrieval: Arc<LlmClient>,
+}
+
+impl LlmPool {
+    /// Create a pool from the unified LLM configuration.
+    ///
+    /// Resolves per-slot model overrides and creates individual
+    /// [`LlmClient`] instances with the appropriate settings.
+    /// When `metrics` is provided, all clients share the same hub
+    /// for unified LLM call statistics.
+    pub fn from_config(
+        config: &vectorless_config::LlmConfig,
+        metrics: Option<Arc<MetricsHub>>,
+    ) -> Self {
+        let api_key = config.api_key.clone();
+        let endpoint = config.endpoint.clone().unwrap_or_default();
+        let retry = super::config::RetryConfig::from(&config.retry);
+
+        let make_config = |slot: &vectorless_config::SlotConfig| -> LlmConfig {
+            LlmConfig {
+                model: config.resolve_model(slot),
+                endpoint: endpoint.clone(),
+                api_key: api_key.clone(),
+                max_tokens: slot.max_tokens,
+                temperature: slot.temperature,
+                retry: retry.clone(),
+                request_timeout_secs: 0,
+            }
+        };
+
+        // Build a single shared async-openai client (reuses the connection pool)
+        let openai_base = if endpoint.is_empty() {
+            "https://api.openai.com/v1".to_string()
+        } else {
+            endpoint.clone()
+        };
+        let openai_client = Arc::new(async_openai::Client::with_config(
+            async_openai::config::OpenAIConfig::new()
+                .with_api_key(api_key.clone().unwrap_or_default())
+                .with_api_base(openai_base),
+        ));
+
+        // Attach the shared throttle controller from config
+        let concurrency_config = super::throttle::ConcurrencyConfig::from(&config.throttle);
+        let controller = Arc::new(ConcurrencyController::new(concurrency_config));
+
+        // Attach the shared fallback chain from config
+        let fallback_config: FallbackConfig = config.fallback.clone().into();
+        let fallback_chain = Arc::new(FallbackChain::new(fallback_config));
+
+        let build_client = |slot_config: &vectorless_config::SlotConfig| {
+            let mut client = LlmClient::new(make_config(slot_config))
+                .with_shared_concurrency(controller.clone())
+                .with_shared_openai_client(openai_client.clone())
+                .with_shared_fallback(fallback_chain.clone());
+            if let Some(ref hub) = metrics {
+                client = client.with_shared_metrics(hub.clone());
+            }
+            Arc::new(client)
+        };
+
+        Self {
+            index: build_client(&config.index),
+            retrieval: build_client(&config.retrieval),
+        }
+    }
+
+    /// Create a pool with default configurations.
+    pub fn from_defaults() -> Self {
+        Self::from_config(&vectorless_config::LlmConfig::default(), None)
+    }
+
+    /// Get the index client.
+    pub fn index(&self) -> &LlmClient {
+        &self.index
+    }
+
+    /// Get the retrieval client.
+    pub fn retrieval(&self) -> &LlmClient {
+        &self.retrieval
+    }
+}
+
+impl Default for LlmPool {
+    fn default() -> Self {
+        Self::from_defaults()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_pool_from_config() {
+        let config = vectorless_config::LlmConfig::new("gpt-4o")
+            .with_api_key("sk-test")
+            .with_endpoint("https://api.openai.com/v1")
+            .with_index(vectorless_config::SlotConfig::fast().with_model("gpt-4o-mini"));
+
+        let pool = LlmPool::from_config(&config, None);
+
+        assert_eq!(pool.index().config().model, "gpt-4o-mini");
+        assert_eq!(pool.retrieval().config().model, "gpt-4o");
+        assert_eq!(pool.index().config().max_tokens, 100);
+    }
+
+    #[test]
+    fn test_pool_from_config_with_metrics() {
+        let config = vectorless_config::LlmConfig::new("gpt-4o")
+            .with_api_key("sk-test")
+            .with_endpoint("https://api.openai.com/v1");
+
+        let hub = MetricsHub::shared();
+        let pool = LlmPool::from_config(&config, Some(hub.clone()));
+
+        assert!(pool.index().fallback().is_some());
+        assert!(pool.retrieval().fallback().is_some());
+
+        assert_eq!(pool.index().config().model, "gpt-4o");
+        assert_eq!(pool.retrieval().config().model, "gpt-4o");
+    }
+
+    #[test]
+    fn test_pool_shared_metrics_hub() {
+        let config = vectorless_config::LlmConfig::new("gpt-4o")
+            .with_api_key("sk-test")
+            .with_endpoint("https://api.openai.com/v1");
+
+        let hub = MetricsHub::shared();
+        let _pool = LlmPool::from_config(&config, Some(hub.clone()));
+
+        // Hub is shared with both clients — Arc refcount > 1
+        assert!(Arc::strong_count(&hub) > 1);
+    }
+}
diff --git a/vectorless-core/vectorless-llm/src/throttle.rs b/vectorless-core/vectorless-llm/src/throttle.rs
new file mode 100644
index 00000000..2a0d27fb
--- /dev/null
+++ b/vectorless-core/vectorless-llm/src/throttle.rs
@@ -0,0 +1,270 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Concurrency control for LLM API calls.
+//!
+//! Combines a semaphore (concurrency limit) with a token-bucket rate limiter (RPM).
+
+use std::num::NonZeroU32;
+use std::sync::Arc;
+
+use governor::{
+    Quota, RateLimiter as GovernorLimiter,
+    clock::{Clock, DefaultClock},
+    state::{InMemoryState, NotKeyed},
+};
+use serde::{Deserialize, Serialize};
+use tokio::sync::{Semaphore, SemaphorePermit};
+use tracing::{debug, trace};
+
+// ============================================================
+// ConcurrencyConfig
+// ============================================================
+
+/// Concurrency control configuration.
+///
+/// Controls how LLM requests are rate-limited and throttled
+/// to avoid overwhelming the API.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ConcurrencyConfig {
+    /// Maximum concurrent LLM API calls.
+    #[serde(default = "default_max_concurrent_requests")]
+    pub max_concurrent_requests: usize,
+
+    /// Rate limit: requests per minute (token bucket).
+    #[serde(default = "default_requests_per_minute")]
+    pub requests_per_minute: usize,
+
+    /// Whether rate limiting is enabled.
+    #[serde(default = "default_true")]
+    pub enabled: bool,
+
+    /// Whether semaphore-based concurrency limiting is enabled.
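+    ///
+    /// When this is `false`, `ConcurrencyController::acquire` returns `None`
+    /// and in-flight concurrency is bounded only by the rate limiter.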
+    #[serde(default = "default_true")]
+    pub semaphore_enabled: bool,
+}
+
+impl From<&vectorless_config::ThrottleConfig> for ConcurrencyConfig {
+    fn from(c: &vectorless_config::ThrottleConfig) -> Self {
+        Self {
+            max_concurrent_requests: c.max_concurrent_requests,
+            requests_per_minute: c.requests_per_minute,
+            enabled: c.enabled,
+            semaphore_enabled: c.semaphore_enabled,
+        }
+    }
+}
+
+fn default_max_concurrent_requests() -> usize {
+    10
+}
+fn default_requests_per_minute() -> usize {
+    500
+}
+fn default_true() -> bool {
+    true
+}
+
+impl Default for ConcurrencyConfig {
+    fn default() -> Self {
+        Self {
+            max_concurrent_requests: default_max_concurrent_requests(),
+            requests_per_minute: default_requests_per_minute(),
+            enabled: true,
+            semaphore_enabled: true,
+        }
+    }
+}
+
+impl ConcurrencyConfig {
+    /// Create a new config with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the maximum concurrent requests.
+    pub fn with_max_concurrent_requests(mut self, max: usize) -> Self {
+        self.max_concurrent_requests = max;
+        self
+    }
+
+    /// Set the requests-per-minute rate limit.
+    pub fn with_requests_per_minute(mut self, rpm: usize) -> Self {
+        self.requests_per_minute = rpm;
+        self
+    }
+
+    /// Enable or disable rate limiting.
+    pub fn with_enabled(mut self, enabled: bool) -> Self {
+        self.enabled = enabled;
+        self
+    }
+
+    /// Create a config for conservative scenarios.
+    pub fn conservative() -> Self {
+        Self {
+            max_concurrent_requests: 5,
+            requests_per_minute: 100,
+            enabled: true,
+            semaphore_enabled: true,
+        }
+    }
+
+    /// Create a config that disables all limits.
+    pub fn unlimited() -> Self {
+        Self {
+            max_concurrent_requests: usize::MAX,
+            requests_per_minute: usize::MAX,
+            enabled: false,
+            semaphore_enabled: false,
+        }
+    }
+}
+
+// ============================================================
+// ConcurrencyController
+// ============================================================
+
+/// Concurrency controller for LLM API calls.
+///
+/// Combines:
+/// - **Rate Limiter** — Token bucket to limit requests per time period
+/// - **Semaphore** — Limit concurrent requests
+///
+/// The only operation needed by business code is [`acquire()`](ConcurrencyController::acquire).
+#[derive(Clone)]
+pub struct ConcurrencyController {
+    semaphore: Arc<Semaphore>,
+    rate_limiter: Option<Arc<GovernorLimiter<NotKeyed, InMemoryState, DefaultClock>>>,
+    semaphore_enabled: bool,
+}
+
+impl ConcurrencyController {
+    /// Create a new concurrency controller with the given configuration.
+    pub fn new(config: ConcurrencyConfig) -> Self {
+        // Clamp to Tokio's permit ceiling so `unlimited()` (usize::MAX) cannot panic.
+        let permits = config.max_concurrent_requests.min(Semaphore::MAX_PERMITS);
+        let semaphore = Arc::new(Semaphore::new(permits));
+        let rate_limiter = if config.enabled {
+            let rpm = NonZeroU32::new(config.requests_per_minute as u32)
+                .unwrap_or_else(|| NonZeroU32::new(1).unwrap());
+            Some(Arc::new(GovernorLimiter::direct(Quota::per_minute(rpm))))
+        } else {
+            None
+        };
+
+        Self {
+            semaphore,
+            rate_limiter,
+            semaphore_enabled: config.semaphore_enabled,
+        }
+    }
+
+    /// Create a controller with default configuration.
+    pub fn with_defaults() -> Self {
+        Self::new(ConcurrencyConfig::default())
+    }
+
+    /// Acquire a permit for making an LLM request.
+    ///
+    /// This will:
+    /// 1. Wait for the rate limiter (if enabled)
+    /// 2. Acquire a semaphore permit (if enabled)
+    ///
+    /// The permit is automatically released when dropped.
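+    ///
+    /// Call-site sketch; hold the permit for the duration of the request:
+    ///
+    /// ```rust,ignore
+    /// let controller = ConcurrencyController::with_defaults();
+    /// let _permit = controller.acquire().await; // `None` when the semaphore is disabled
+    /// // ... perform the LLM request while `_permit` is alive ...
+    /// ```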
+    pub async fn acquire(&self) -> Option<SemaphorePermit<'_>> {
+        // Step 1: Wait for the rate limiter
+        if let Some(ref limiter) = self.rate_limiter {
+            let clock = DefaultClock::default();
+            loop {
+                match limiter.check() {
+                    Ok(_) => {
+                        trace!("Rate limiter: token acquired");
+                        break;
+                    }
+                    Err(negative) => {
+                        let wait_duration = negative.wait_time_from(clock.now());
+                        trace!(
+                            wait_ms = wait_duration.as_millis() as u64,
+                            "Rate limiter: waiting for token"
+                        );
+                        tokio::time::sleep(wait_duration).await;
+                    }
+                }
+            }
+            debug!("Rate limiter: token acquired");
+        }
+
+        // Step 2: Acquire a semaphore permit
+        if self.semaphore_enabled {
+            trace!("Waiting for semaphore permit");
+            let permit = self
+                .semaphore
+                .acquire()
+                .await
+                .expect("semaphore should not be closed");
+            debug!(
+                "Semaphore: permit acquired (available: {})",
+                self.semaphore.available_permits()
+            );
+            Some(permit)
+        } else {
+            None
+        }
+    }
+}
+
+impl std::fmt::Debug for ConcurrencyController {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ConcurrencyController")
+            .field("available_permits", &self.semaphore.available_permits())
+            .field("has_rate_limiter", &self.rate_limiter.is_some())
+            .field("semaphore_enabled", &self.semaphore_enabled)
+            .finish()
+    }
+}
+
+impl Default for ConcurrencyController {
+    fn default() -> Self {
+        Self::with_defaults()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[tokio::test]
+    async fn test_controller_acquire() {
+        let controller = ConcurrencyController::new(ConcurrencyConfig {
+            max_concurrent_requests: 2,
+            requests_per_minute: 100,
+            enabled: false,
+            semaphore_enabled: true,
+        });
+
+        let permit1 = controller.acquire().await;
+        assert!(permit1.is_some());
+
+        let permit2 = controller.acquire().await;
+        assert!(permit2.is_some());
+
+        drop(permit1);
+    }
+
+    #[test]
+    fn test_controller_creation() {
+        let controller = ConcurrencyController::with_defaults();
+        assert!(controller.semaphore.available_permits() > 0);
+    }
+
+    #[test]
+    fn test_rate_limiter_creation() {
+        let config = ConcurrencyConfig {
+            max_concurrent_requests: 10,
+            requests_per_minute: 100,
+            enabled: true,
+            semaphore_enabled: true,
+        };
+        let controller = ConcurrencyController::new(config);
+        assert!(controller.rate_limiter.is_some());
+    }
+}
diff --git a/vectorless-core/vectorless-metrics/Cargo.toml b/vectorless-core/vectorless-metrics/Cargo.toml
new file mode 100644
index 00000000..93237dcc
--- /dev/null
+++ b/vectorless-core/vectorless-metrics/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "vectorless-metrics"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+
+[dependencies]
+vectorless-config = { path = "../vectorless-config" }
+vectorless-error = { path = "../vectorless-error" }
+serde = { workspace = true }
+tracing = { workspace = true }
+parking_lot = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/vectorless-core/vectorless-metrics/src/hub.rs b/vectorless-core/vectorless-metrics/src/hub.rs
new file mode 100644
index 00000000..ff4f553b
--- /dev/null
+++ b/vectorless-core/vectorless-metrics/src/hub.rs
@@ -0,0 +1,324 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Central metrics hub for unified collection.
+
+use std::sync::Arc;
+
+use super::llm::{LlmMetrics, LlmMetricsReport};
+use super::retrieval::{RetrievalMetrics, RetrievalMetricsReport};
+use vectorless_config::MetricsConfig;
+
+/// Central metrics hub for unified collection.
+///
+/// Provides a single point for all metrics collection across:
+/// - LLM operations (tokens, latency, cost)
+/// - Retrieval operations (paths, scores, cache)
+///
+/// # Thread Safety
+///
+/// All metrics use atomic operations and are safe to use from multiple threads.
+///
+/// # Example
+///
+/// ```rust,ignore
+/// use vectorless::metrics::{MetricsHub, MetricsConfig};
+///
+/// let config = MetricsConfig::default();
+/// let hub = MetricsHub::new(config);
+///
+/// // Record an LLM call
+/// hub.record_llm_call(100, 50, 150, true);
+///
+/// // Get a report
+/// let report = hub.generate_report();
+/// ```
+#[derive(Debug)]
+pub struct MetricsHub {
+    config: MetricsConfig,
+    llm: LlmMetrics,
+    retrieval: RetrievalMetrics,
+}
+
+impl MetricsHub {
+    /// Create a new metrics hub.
+    pub fn new(config: MetricsConfig) -> Self {
+        Self {
+            config,
+            llm: LlmMetrics::new(),
+            retrieval: RetrievalMetrics::new(),
+        }
+    }
+
+    /// Create a new metrics hub with defaults.
+    pub fn with_defaults() -> Self {
+        Self::new(MetricsConfig::default())
+    }
+
+    /// Create an Arc-wrapped metrics hub.
+    pub fn shared() -> Arc<Self> {
+        Arc::new(Self::with_defaults())
+    }
+
+    /// Create an Arc-wrapped metrics hub with config.
+    pub fn shared_with_config(config: MetricsConfig) -> Arc<Self> {
+        Arc::new(Self::new(config))
+    }
+
+    /// Check if metrics are enabled.
+    pub fn is_enabled(&self) -> bool {
+        self.config.enabled
+    }
+
+    /// Get the configuration.
+    pub fn config(&self) -> &MetricsConfig {
+        &self.config
+    }
+
+    // ========================================================================
+    // LLM Metrics
+    // ========================================================================
+
+    /// Record an LLM call.
+    pub fn record_llm_call(
+        &self,
+        input_tokens: u64,
+        output_tokens: u64,
+        latency_ms: u64,
+        success: bool,
+    ) {
+        if !self.config.enabled || !self.config.llm.track_tokens {
+            return;
+        }
+        self.llm.record_call(
+            input_tokens,
+            output_tokens,
+            latency_ms,
+            success,
+            &self.config.llm,
+        );
+    }
+
+    /// Record an LLM rate limit error.
+    pub fn record_llm_rate_limit(&self) {
+        if self.config.enabled {
+            self.llm.record_rate_limit();
+        }
+    }
+
+    /// Record an LLM timeout error.
+    pub fn record_llm_timeout(&self) {
+        if self.config.enabled {
+            self.llm.record_timeout();
+        }
+    }
+
+    /// Record an LLM fallback trigger.
+    pub fn record_llm_fallback(&self) {
+        if self.config.enabled {
+            self.llm.record_fallback();
+        }
+    }
+
+    /// Get the LLM metrics report.
+    pub fn llm_report(&self) -> LlmMetricsReport {
+        self.llm.generate_report()
+    }
+
+    // ========================================================================
+    // Retrieval Metrics
+    // ========================================================================
+
+    /// Record a retrieval query.
+    pub fn record_retrieval_query(&self, iterations: u64, nodes_visited: u64, latency_ms: u64) {
+        if !self.config.enabled {
+            return;
+        }
+        self.retrieval.record_query(
+            iterations,
+            nodes_visited,
+            latency_ms,
+            &self.config.retrieval,
+        );
+    }
+
+    /// Record a found path.
+    pub fn record_retrieval_path(&self, length: u64, score: f64) {
+        if !self.config.enabled {
+            return;
+        }
+        self.retrieval
+            .record_path(length, score, &self.config.retrieval);
+    }
+
+    /// Record a cache hit.
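+    ///
+    /// No-op unless metrics and `retrieval.track_cache` are both enabled.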
+ pub fn record_cache_hit(&self) { + if !self.config.enabled || !self.config.retrieval.track_cache { + return; + } + self.retrieval.record_cache_hit(&self.config.retrieval); + } + + /// Record a cache miss. + pub fn record_cache_miss(&self) { + if !self.config.enabled || !self.config.retrieval.track_cache { + return; + } + self.retrieval.record_cache_miss(&self.config.retrieval); + } + + /// Record a backtrack. + pub fn record_backtrack(&self) { + if self.config.enabled { + self.retrieval.record_backtrack(); + } + } + + /// Record a sufficiency check. + pub fn record_sufficiency_check(&self, was_sufficient: bool) { + if self.config.enabled { + self.retrieval.record_sufficiency_check(was_sufficient); + } + } + + /// Get retrieval metrics report. + pub fn retrieval_report(&self) -> RetrievalMetricsReport { + self.retrieval.generate_report() + } + + // ======================================================================== + // General Operations + // ======================================================================== + + /// Reset all metrics. + pub fn reset(&self) { + self.llm.reset(); + self.retrieval.reset(); + } + + /// Generate a complete report. + pub fn generate_report(&self) -> MetricsReport { + MetricsReport { + llm: self.llm_report(), + retrieval: self.retrieval_report(), + } + } +} + +impl Default for MetricsHub { + fn default() -> Self { + Self::with_defaults() + } +} + +/// Complete metrics report. +#[derive(Debug, Clone)] +pub struct MetricsReport { + /// LLM metrics. + pub llm: LlmMetricsReport, + /// Retrieval metrics. + pub retrieval: RetrievalMetricsReport, +} + +impl MetricsReport { + /// Calculate total estimated cost in USD. + pub fn total_cost_usd(&self) -> f64 { + self.llm.estimated_cost_usd + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_metrics_hub_recording() { + let hub = MetricsHub::with_defaults(); + + hub.record_llm_call(100, 50, 150, true); + hub.record_retrieval_query(5, 10, 100); + + let report = hub.generate_report(); + + assert_eq!(report.llm.total_calls, 1); + assert_eq!(report.retrieval.total_queries, 1); + } + + #[test] + fn test_metrics_hub_disabled() { + let config = MetricsConfig::disabled(); + let hub = MetricsHub::new(config); + + hub.record_llm_call(100, 50, 150, true); + + let report = hub.generate_report(); + + assert_eq!(report.llm.total_calls, 0); + } + + #[test] + fn test_metrics_hub_reset() { + let hub = MetricsHub::with_defaults(); + + hub.record_llm_call(100, 50, 150, true); + hub.reset(); + + let report = hub.generate_report(); + assert_eq!(report.llm.total_calls, 0); + } + + #[test] + fn test_llm_metrics_success_and_failure() { + let hub = MetricsHub::with_defaults(); + + hub.record_llm_call(100, 50, 150, true); + hub.record_llm_call(200, 100, 300, true); + hub.record_llm_call(0, 0, 50, false); + + let report = hub.llm_report(); + assert_eq!(report.total_calls, 3); + assert_eq!(report.successful_calls, 2); + assert_eq!(report.failed_calls, 1); + assert!((report.success_rate - 0.666).abs() < 0.01); + assert_eq!(report.total_input_tokens, 300); + assert_eq!(report.total_output_tokens, 150); + } + + #[test] + fn test_llm_error_events() { + let hub = MetricsHub::with_defaults(); + + hub.record_llm_rate_limit(); + hub.record_llm_rate_limit(); + hub.record_llm_timeout(); + hub.record_llm_fallback(); + + let report = hub.llm_report(); + assert_eq!(report.rate_limit_errors, 2); + assert_eq!(report.timeout_errors, 1); + assert_eq!(report.fallback_triggers, 1); + } + + #[test] + fn test_shared_arc_metrics() { + let 
hub = MetricsHub::shared(); + + let hub2 = hub.clone(); + hub.record_llm_call(100, 50, 100, true); + hub2.record_llm_call(200, 100, 200, true); + + let report = hub.generate_report(); + assert_eq!(report.llm.total_calls, 2); + assert_eq!(report.llm.total_input_tokens, 300); + } + + #[test] + fn test_metrics_report_cost() { + let hub = MetricsHub::with_defaults(); + + hub.record_llm_call(1000, 500, 200, true); + + let report = hub.generate_report(); + assert!(report.total_cost_usd() >= 0.0); + } +} diff --git a/vectorless-core/vectorless-metrics/src/index.rs b/vectorless-core/vectorless-metrics/src/index.rs new file mode 100644 index 00000000..3d1e5569 --- /dev/null +++ b/vectorless-core/vectorless-metrics/src/index.rs @@ -0,0 +1,199 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Indexing pipeline metrics. + +use serde::{Deserialize, Serialize}; + +/// Performance metrics for the indexing pipeline. +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct IndexMetrics { + /// Parse stage duration (ms). + #[serde(default)] + pub parse_time_ms: u64, + + /// Build stage duration (ms). + #[serde(default)] + pub build_time_ms: u64, + + /// Enhance stage duration (ms). + #[serde(default)] + pub enhance_time_ms: u64, + + /// Enrich stage duration (ms). + #[serde(default)] + pub enrich_time_ms: u64, + + /// Optimize stage duration (ms). + #[serde(default)] + pub optimize_time_ms: u64, + + /// Validate stage duration (ms). + #[serde(default)] + pub validate_time_ms: u64, + + /// Split stage duration (ms). + #[serde(default)] + pub split_time_ms: u64, + + /// Reasoning index build duration (ms). + #[serde(default)] + pub reasoning_index_time_ms: u64, + + /// Navigation index build duration (ms). + #[serde(default)] + pub navigation_index_time_ms: u64, + + /// Number of nav entries in navigation index. + #[serde(default)] + pub nav_entries_indexed: usize, + + /// Number of child routes in navigation index. + #[serde(default)] + pub child_routes_indexed: usize, + + /// Number of topics indexed in reasoning index. + #[serde(default)] + pub topics_indexed: usize, + + /// Number of keywords indexed in reasoning index. + #[serde(default)] + pub keywords_indexed: usize, + + /// Total tokens generated (summaries). + #[serde(default)] + pub total_tokens_generated: usize, + + /// Number of LLM calls. + #[serde(default)] + pub llm_calls: usize, + + /// Number of nodes processed. + #[serde(default)] + pub nodes_processed: usize, + + /// Number of summaries generated. + #[serde(default)] + pub summaries_generated: usize, + + /// Number of summaries that failed to generate (LLM error, rate limit, etc.). + #[serde(default)] + pub summaries_failed: usize, + + /// Number of nodes skipped (thinning). + #[serde(default)] + pub nodes_skipped: usize, + + /// Number of nodes merged. + #[serde(default)] + pub nodes_merged: usize, +} + +impl IndexMetrics { + /// Create new metrics with start time. + pub fn new() -> Self { + Self::default() + } + + /// Record parse stage time. + pub fn record_parse(&mut self, duration_ms: u64) { + self.parse_time_ms = duration_ms; + } + + /// Record build stage time. + pub fn record_build(&mut self, duration_ms: u64) { + self.build_time_ms = duration_ms; + } + + /// Record enhance stage time. + pub fn record_enhance(&mut self, duration_ms: u64) { + self.enhance_time_ms = duration_ms; + } + + /// Record enrich stage time. 
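+    ///
+    /// Stage timings are typically captured with `std::time::Instant`
+    /// (a sketch; `run_enrich` is a hypothetical stage function):
+    ///
+    /// ```rust,ignore
+    /// let start = std::time::Instant::now();
+    /// run_enrich(&mut tree)?;
+    /// metrics.record_enrich(start.elapsed().as_millis() as u64);
+    /// ```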
+    pub fn record_enrich(&mut self, duration_ms: u64) {
+        self.enrich_time_ms = duration_ms;
+    }
+
+    /// Record optimize stage time.
+    pub fn record_optimize(&mut self, duration_ms: u64) {
+        self.optimize_time_ms = duration_ms;
+    }
+
+    /// Record validate stage time.
+    pub fn record_validate(&mut self, duration_ms: u64) {
+        self.validate_time_ms = duration_ms;
+    }
+
+    /// Record split stage time.
+    pub fn record_split(&mut self, duration_ms: u64) {
+        self.split_time_ms = duration_ms;
+    }
+
+    /// Record reasoning index build time.
+    pub fn record_reasoning_index(&mut self, duration_ms: u64, topics: usize, keywords: usize) {
+        self.reasoning_index_time_ms = duration_ms;
+        self.topics_indexed = topics;
+        self.keywords_indexed = keywords;
+    }
+
+    /// Record navigation index build time.
+    pub fn record_navigation_index(
+        &mut self,
+        duration_ms: u64,
+        nav_entries: usize,
+        child_routes: usize,
+    ) {
+        self.navigation_index_time_ms = duration_ms;
+        self.nav_entries_indexed = nav_entries;
+        self.child_routes_indexed = child_routes;
+    }
+
+    /// Increment LLM calls.
+    pub fn increment_llm_calls(&mut self) {
+        self.llm_calls += 1;
+    }
+
+    /// Add to tokens generated.
+    pub fn add_tokens_generated(&mut self, tokens: usize) {
+        self.total_tokens_generated += tokens;
+    }
+
+    /// Set nodes processed.
+    pub fn set_nodes_processed(&mut self, count: usize) {
+        self.nodes_processed = count;
+    }
+
+    /// Increment summaries generated.
+    pub fn increment_summaries(&mut self) {
+        self.summaries_generated += 1;
+    }
+
+    /// Add to the summaries-failed count.
+    pub fn add_summaries_failed(&mut self, count: usize) {
+        self.summaries_failed += count;
+    }
+
+    /// Increment nodes skipped.
+    pub fn increment_nodes_skipped(&mut self) {
+        self.nodes_skipped += 1;
+    }
+
+    /// Increment nodes merged.
+    pub fn increment_nodes_merged(&mut self) {
+        self.nodes_merged += 1;
+    }
+
+    /// Get the total time across all stages.
+    pub fn total_time_ms(&self) -> u64 {
+        self.parse_time_ms
+            + self.build_time_ms
+            + self.validate_time_ms
+            + self.split_time_ms
+            + self.enhance_time_ms
+            + self.enrich_time_ms
+            + self.reasoning_index_time_ms
+            + self.navigation_index_time_ms
+            + self.optimize_time_ms
+    }
+}
diff --git a/vectorless-core/vectorless-metrics/src/lib.rs b/vectorless-core/vectorless-metrics/src/lib.rs
new file mode 100644
index 00000000..26ab6411
--- /dev/null
+++ b/vectorless-core/vectorless-metrics/src/lib.rs
@@ -0,0 +1,56 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Unified metrics collection for Vectorless.
+//!
+//! This module provides centralized metrics collection across all components:
+//! - **LLM Metrics** — Token usage, latency, cost
+//! - **Retrieval Metrics** — Paths, scores, iterations, cache
+//!
+//! # Architecture
+//!
+//! ```text
+//! ┌──────────────────────────────────────────────────────────┐
+//! │                       MetricsHub                         │
+//! │                                                          │
+//! │  ┌─────────────┐        ┌──────────────────┐             │
+//! │  │ LlmMetrics  │        │ RetrievalMetrics │             │
+//! │  │             │        │                  │             │
+//! │  │ - tokens    │        │ - paths          │             │
+//! │  │ - latency   │        │ - scores         │             │
+//! │  │ - cost      │        │ - cache          │             │
+//! │  └─────────────┘        └──────────────────┘             │
+//! │                                                          │
+//! │  ┌────────────────────────────────────────────────────┐  │
+//! │  │                   MetricsReport                    │  │
+//! │  │                                                    │  │
+//! │  │  Aggregated report with all metrics and statistics │  │
+//! │  └────────────────────────────────────────────────────┘  │
+//! └──────────────────────────────────────────────────────────┘
+//! ```
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use vectorless::metrics::{MetricsHub, MetricsConfig};
+//!
+//! let config = MetricsConfig::default();
+//! let hub = MetricsHub::new(config);
+//!
+//! // Record an LLM call
+//! hub.record_llm_call(100, 50, 150, true);
+//!
+//! // Generate a report
+//! let report = hub.generate_report();
+//! println!("Total cost: ${:.4}", report.llm.estimated_cost_usd);
+//! ```
+
+mod hub;
+mod index;
+mod llm;
+mod retrieval;
+
+pub use hub::{MetricsHub, MetricsReport};
+pub use index::IndexMetrics;
+pub use llm::LlmMetricsReport;
+pub use retrieval::RetrievalMetricsReport;
diff --git a/vectorless-core/vectorless-metrics/src/llm.rs b/vectorless-core/vectorless-metrics/src/llm.rs
new file mode 100644
index 00000000..09d1546a
--- /dev/null
+++ b/vectorless-core/vectorless-metrics/src/llm.rs
@@ -0,0 +1,207 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! LLM metrics collection.
+
+use std::sync::atomic::{AtomicU64, Ordering};
+
+use vectorless_config::LlmMetricsConfig;
+
+/// LLM metrics tracker.
+#[derive(Debug, Default)]
+pub struct LlmMetrics {
+    /// Total number of LLM calls.
+    pub total_calls: AtomicU64,
+    /// Number of successful calls.
+    pub successful_calls: AtomicU64,
+    /// Number of failed calls.
+    pub failed_calls: AtomicU64,
+    /// Total input tokens.
+    pub total_input_tokens: AtomicU64,
+    /// Total output tokens.
+    pub total_output_tokens: AtomicU64,
+    /// Total latency in milliseconds.
+    pub total_latency_ms: AtomicU64,
+    /// Estimated cost in micro-dollars.
+    pub estimated_cost_micros: AtomicU64,
+    /// Number of rate limit errors.
+    pub rate_limit_errors: AtomicU64,
+    /// Number of timeout errors.
+    pub timeout_errors: AtomicU64,
+    /// Number of fallback triggers.
+    pub fallback_triggers: AtomicU64,
+}
+
+impl LlmMetrics {
+    /// Create new LLM metrics.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Record an LLM call.
+    pub fn record_call(
+        &self,
+        input_tokens: u64,
+        output_tokens: u64,
+        latency_ms: u64,
+        success: bool,
+        config: &LlmMetricsConfig,
+    ) {
+        self.total_calls.fetch_add(1, Ordering::Relaxed);
+
+        if success {
+            self.successful_calls.fetch_add(1, Ordering::Relaxed);
+        } else {
+            self.failed_calls.fetch_add(1, Ordering::Relaxed);
+        }
+
+        if config.track_tokens {
+            self.total_input_tokens
+                .fetch_add(input_tokens, Ordering::Relaxed);
+            self.total_output_tokens
+                .fetch_add(output_tokens, Ordering::Relaxed);
+        }
+
+        if config.track_latency {
+            self.total_latency_ms
+                .fetch_add(latency_ms, Ordering::Relaxed);
+        }
+
+        if config.track_cost {
+            let cost = config.calculate_cost(input_tokens, output_tokens);
+            // Store in micro-dollars for precision
+            let cost_micros = (cost * 1_000_000.0) as u64;
+            self.estimated_cost_micros
+                .fetch_add(cost_micros, Ordering::Relaxed);
+        }
+    }
+
+    /// Record a rate limit error.
+    pub fn record_rate_limit(&self) {
+        self.rate_limit_errors.fetch_add(1, Ordering::Relaxed);
+    }
+
+    /// Record a timeout error.
+    pub fn record_timeout(&self) {
+        self.timeout_errors.fetch_add(1, Ordering::Relaxed);
+    }
+
+    /// Record a fallback trigger.
+    pub fn record_fallback(&self) {
+        self.fallback_triggers.fetch_add(1, Ordering::Relaxed);
+    }
+
+    /// Reset all metrics.
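+    ///
+    /// Counters are cleared one at a time with relaxed stores, so a reset is
+    /// not an atomic snapshot; calls recorded concurrently may be partially kept.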
+ pub fn reset(&self) { + self.total_calls.store(0, Ordering::Relaxed); + self.successful_calls.store(0, Ordering::Relaxed); + self.failed_calls.store(0, Ordering::Relaxed); + self.total_input_tokens.store(0, Ordering::Relaxed); + self.total_output_tokens.store(0, Ordering::Relaxed); + self.total_latency_ms.store(0, Ordering::Relaxed); + self.estimated_cost_micros.store(0, Ordering::Relaxed); + self.rate_limit_errors.store(0, Ordering::Relaxed); + self.timeout_errors.store(0, Ordering::Relaxed); + self.fallback_triggers.store(0, Ordering::Relaxed); + } + + /// Generate a report snapshot. + pub fn generate_report(&self) -> LlmMetricsReport { + let total_calls = self.total_calls.load(Ordering::Relaxed); + let successful = self.successful_calls.load(Ordering::Relaxed); + let failed = self.failed_calls.load(Ordering::Relaxed); + let total_latency = self.total_latency_ms.load(Ordering::Relaxed); + + LlmMetricsReport { + total_calls, + successful_calls: successful, + failed_calls: failed, + success_rate: if total_calls > 0 { + successful as f64 / total_calls as f64 + } else { + 0.0 + }, + total_input_tokens: self.total_input_tokens.load(Ordering::Relaxed), + total_output_tokens: self.total_output_tokens.load(Ordering::Relaxed), + total_tokens: self.total_input_tokens.load(Ordering::Relaxed) + + self.total_output_tokens.load(Ordering::Relaxed), + avg_latency_ms: if total_calls > 0 { + total_latency as f64 / total_calls as f64 + } else { + 0.0 + }, + total_latency_ms: total_latency, + estimated_cost_usd: self.estimated_cost_micros.load(Ordering::Relaxed) as f64 + / 1_000_000.0, + rate_limit_errors: self.rate_limit_errors.load(Ordering::Relaxed), + timeout_errors: self.timeout_errors.load(Ordering::Relaxed), + fallback_triggers: self.fallback_triggers.load(Ordering::Relaxed), + } + } +} + +/// LLM metrics report. +#[derive(Debug, Clone)] +pub struct LlmMetricsReport { + /// Total number of LLM calls. + pub total_calls: u64, + /// Number of successful calls. + pub successful_calls: u64, + /// Number of failed calls. + pub failed_calls: u64, + /// Success rate (0.0 - 1.0). + pub success_rate: f64, + /// Total input tokens. + pub total_input_tokens: u64, + /// Total output tokens. + pub total_output_tokens: u64, + /// Total tokens (input + output). + pub total_tokens: u64, + /// Average latency in milliseconds. + pub avg_latency_ms: f64, + /// Total latency in milliseconds. + pub total_latency_ms: u64, + /// Estimated cost in USD. + pub estimated_cost_usd: f64, + /// Number of rate limit errors. + pub rate_limit_errors: u64, + /// Number of timeout errors. + pub timeout_errors: u64, + /// Number of fallback triggers. 
+ pub fallback_triggers: u64, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_llm_metrics_recording() { + let config = LlmMetricsConfig::default(); + let metrics = LlmMetrics::new(); + + metrics.record_call(100, 50, 150, true, &config); + metrics.record_call(200, 100, 300, true, &config); + metrics.record_call(100, 0, 0, false, &config); + + let report = metrics.generate_report(); + assert_eq!(report.total_calls, 3); + assert_eq!(report.successful_calls, 2); + assert_eq!(report.failed_calls, 1); + assert!((report.success_rate - 0.666666).abs() < 0.01); + assert_eq!(report.total_input_tokens, 400); + assert_eq!(report.total_output_tokens, 150); + } + + #[test] + fn test_llm_metrics_reset() { + let config = LlmMetricsConfig::default(); + let metrics = LlmMetrics::new(); + + metrics.record_call(100, 50, 150, true, &config); + metrics.reset(); + + let report = metrics.generate_report(); + assert_eq!(report.total_calls, 0); + } +} diff --git a/vectorless-core/vectorless-metrics/src/retrieval.rs b/vectorless-core/vectorless-metrics/src/retrieval.rs new file mode 100644 index 00000000..ce15941e --- /dev/null +++ b/vectorless-core/vectorless-metrics/src/retrieval.rs @@ -0,0 +1,263 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Retrieval metrics collection. + +use std::sync::atomic::{AtomicU64, Ordering}; + +use vectorless_config::RetrievalMetricsConfig; + +/// Retrieval metrics tracker. +#[derive(Debug, Default)] +pub struct RetrievalMetrics { + /// Total number of queries. + pub total_queries: AtomicU64, + /// Total number of search iterations. + pub total_iterations: AtomicU64, + /// Sum of iterations (for average). + pub iterations_sum: AtomicU64, + /// Total number of nodes visited. + pub nodes_visited: AtomicU64, + /// Total number of paths found. + pub paths_found: AtomicU64, + /// Sum of path lengths (for average). + pub path_length_sum: AtomicU64, + /// Sum of path scores stored as scaled integer (multiply by 1_000_000 for actual value). + pub path_score_sum_scaled: AtomicU64, + /// Number of paths with score >= 0.5. + pub high_score_paths: AtomicU64, + /// Number of paths with score < 0.3. + pub low_score_paths: AtomicU64, + /// Number of cache hits. + pub cache_hits: AtomicU64, + /// Number of cache misses. + pub cache_misses: AtomicU64, + /// Total latency in milliseconds. + pub total_latency_ms: AtomicU64, + /// Number of backtracks. + pub backtracks: AtomicU64, + /// Number of sufficiency checks. + pub sufficiency_checks: AtomicU64, + /// Number of times content was sufficient. + pub sufficient_results: AtomicU64, +} + +impl RetrievalMetrics { + /// Create new retrieval metrics. + pub fn new() -> Self { + Self::default() + } + + /// Record a query. + pub fn record_query( + &self, + iterations: u64, + nodes: u64, + latency_ms: u64, + config: &RetrievalMetricsConfig, + ) { + self.total_queries.fetch_add(1, Ordering::Relaxed); + + if config.track_iterations { + self.total_iterations + .fetch_add(iterations, Ordering::Relaxed); + self.iterations_sum.fetch_add(iterations, Ordering::Relaxed); + } + + if config.track_paths { + self.nodes_visited.fetch_add(nodes, Ordering::Relaxed); + } + + self.total_latency_ms + .fetch_add(latency_ms, Ordering::Relaxed); + } + + /// Record a found path. 
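+    ///
+    /// Scores are bucketed at `>= 0.5` (high) and `< 0.3` (low); a sketch:
+    ///
+    /// ```rust,ignore
+    /// metrics.record_path(3, 0.82, &config); // counts as a high-score path
+    /// metrics.record_path(2, 0.12, &config); // counts as a low-score path
+    /// ```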
+ pub fn record_path(&self, length: u64, score: f64, config: &RetrievalMetricsConfig) { + if !config.track_paths { + return; + } + + self.paths_found.fetch_add(1, Ordering::Relaxed); + self.path_length_sum.fetch_add(length, Ordering::Relaxed); + + if config.track_scores { + let scaled_score = (score * 1_000_000.0) as u64; + self.path_score_sum_scaled + .fetch_add(scaled_score, Ordering::Relaxed); + + if score >= 0.5 { + self.high_score_paths.fetch_add(1, Ordering::Relaxed); + } else if score < 0.3 { + self.low_score_paths.fetch_add(1, Ordering::Relaxed); + } + } + } + + /// Record a cache hit. + pub fn record_cache_hit(&self, config: &RetrievalMetricsConfig) { + if config.track_cache { + self.cache_hits.fetch_add(1, Ordering::Relaxed); + } + } + + /// Record a cache miss. + pub fn record_cache_miss(&self, config: &RetrievalMetricsConfig) { + if config.track_cache { + self.cache_misses.fetch_add(1, Ordering::Relaxed); + } + } + + /// Record a backtrack. + pub fn record_backtrack(&self) { + self.backtracks.fetch_add(1, Ordering::Relaxed); + } + + /// Record a sufficiency check. + pub fn record_sufficiency_check(&self, was_sufficient: bool) { + self.sufficiency_checks.fetch_add(1, Ordering::Relaxed); + if was_sufficient { + self.sufficient_results.fetch_add(1, Ordering::Relaxed); + } + } + + /// Reset all metrics. + pub fn reset(&self) { + self.total_queries.store(0, Ordering::Relaxed); + self.total_iterations.store(0, Ordering::Relaxed); + self.iterations_sum.store(0, Ordering::Relaxed); + self.nodes_visited.store(0, Ordering::Relaxed); + self.paths_found.store(0, Ordering::Relaxed); + self.path_length_sum.store(0, Ordering::Relaxed); + self.path_score_sum_scaled.store(0, Ordering::Relaxed); + self.high_score_paths.store(0, Ordering::Relaxed); + self.low_score_paths.store(0, Ordering::Relaxed); + self.cache_hits.store(0, Ordering::Relaxed); + self.cache_misses.store(0, Ordering::Relaxed); + self.total_latency_ms.store(0, Ordering::Relaxed); + self.backtracks.store(0, Ordering::Relaxed); + self.sufficiency_checks.store(0, Ordering::Relaxed); + self.sufficient_results.store(0, Ordering::Relaxed); + } + + /// Generate a report snapshot. 
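+    ///
+    /// Score averages are reconstructed from the scaled integer sum, i.e.
+    /// `avg_path_score = (path_score_sum_scaled / 1_000_000) / paths_found`.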
+ pub fn generate_report(&self) -> RetrievalMetricsReport { + let total_queries = self.total_queries.load(Ordering::Relaxed); + let paths_found = self.paths_found.load(Ordering::Relaxed); + let cache_hits = self.cache_hits.load(Ordering::Relaxed); + let cache_misses = self.cache_misses.load(Ordering::Relaxed); + let total_cache = cache_hits + cache_misses; + let sufficiency_checks = self.sufficiency_checks.load(Ordering::Relaxed); + + RetrievalMetricsReport { + total_queries, + total_iterations: self.total_iterations.load(Ordering::Relaxed), + avg_iterations: if total_queries > 0 { + self.iterations_sum.load(Ordering::Relaxed) as f64 / total_queries as f64 + } else { + 0.0 + }, + nodes_visited: self.nodes_visited.load(Ordering::Relaxed), + paths_found, + avg_path_length: if paths_found > 0 { + self.path_length_sum.load(Ordering::Relaxed) as f64 / paths_found as f64 + } else { + 0.0 + }, + avg_path_score: if paths_found > 0 { + (self.path_score_sum_scaled.load(Ordering::Relaxed) as f64 / 1_000_000.0) + / paths_found as f64 + } else { + 0.0 + }, + high_score_paths: self.high_score_paths.load(Ordering::Relaxed), + low_score_paths: self.low_score_paths.load(Ordering::Relaxed), + cache_hits, + cache_misses, + cache_hit_rate: if total_cache > 0 { + cache_hits as f64 / total_cache as f64 + } else { + 0.0 + }, + total_latency_ms: self.total_latency_ms.load(Ordering::Relaxed), + avg_latency_ms: if total_queries > 0 { + self.total_latency_ms.load(Ordering::Relaxed) as f64 / total_queries as f64 + } else { + 0.0 + }, + backtracks: self.backtracks.load(Ordering::Relaxed), + sufficiency_checks, + sufficiency_rate: if sufficiency_checks > 0 { + self.sufficient_results.load(Ordering::Relaxed) as f64 / sufficiency_checks as f64 + } else { + 0.0 + }, + } + } +} + +/// Retrieval metrics report. +#[derive(Debug, Clone)] +pub struct RetrievalMetricsReport { + /// Total number of queries. + pub total_queries: u64, + /// Total number of iterations. + pub total_iterations: u64, + /// Average iterations per query. + pub avg_iterations: f64, + /// Total nodes visited. + pub nodes_visited: u64, + /// Total paths found. + pub paths_found: u64, + /// Average path length. + pub avg_path_length: f64, + /// Average path score. + pub avg_path_score: f64, + /// Number of high-score paths (>= 0.5). + pub high_score_paths: u64, + /// Number of low-score paths (< 0.3). + pub low_score_paths: u64, + /// Number of cache hits. + pub cache_hits: u64, + /// Number of cache misses. + pub cache_misses: u64, + /// Cache hit rate. + pub cache_hit_rate: f64, + /// Total latency in milliseconds. + pub total_latency_ms: u64, + /// Average latency per query in milliseconds. + pub avg_latency_ms: f64, + /// Number of backtracks. + pub backtracks: u64, + /// Number of sufficiency checks. + pub sufficiency_checks: u64, + /// Sufficiency rate. 
+    pub sufficiency_rate: f64,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_retrieval_metrics_recording() {
+        let config = RetrievalMetricsConfig::default();
+        let metrics = RetrievalMetrics::new();
+
+        metrics.record_query(5, 10, 100, &config);
+        metrics.record_query(3, 8, 80, &config);
+
+        metrics.record_path(3, 0.8, &config);
+        metrics.record_path(2, 0.2, &config);
+
+        metrics.record_cache_hit(&config);
+        metrics.record_cache_hit(&config);
+        metrics.record_cache_miss(&config);
+
+        let report = metrics.generate_report();
+        assert_eq!(report.total_queries, 2);
+        assert_eq!(report.total_iterations, 8);
+        assert_eq!(report.paths_found, 2);
+        assert!((report.cache_hit_rate - 0.666).abs() < 0.01);
+    }
+}
diff --git a/vectorless-core/vectorless-py/Cargo.toml b/vectorless-core/vectorless-py/Cargo.toml
index a5236730..967540ce 100644
--- a/vectorless-core/vectorless-py/Cargo.toml
+++ b/vectorless-core/vectorless-py/Cargo.toml
@@ -16,7 +16,7 @@ crate-type = ["cdylib"]
 pyo3 = { workspace = true }
 pyo3-async-runtimes = { workspace = true }
 tokio = { version = "1", features = ["rt-multi-thread"] }
-vectorless = { path = "../vectorless" }
+vectorless-engine = { path = "../vectorless-engine" }
 
 [lints]
 workspace = true
diff --git a/vectorless-core/vectorless-query/Cargo.toml b/vectorless-core/vectorless-query/Cargo.toml
new file mode 100644
index 00000000..353e94dc
--- /dev/null
+++ b/vectorless-core/vectorless-query/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "vectorless-query"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+
+[dependencies]
+vectorless-error = { path = "../vectorless-error" }
+vectorless-llm = { path = "../vectorless-llm" }
+vectorless-scoring = { path = "../vectorless-scoring" }
+serde = { workspace = true }
+serde_json = { workspace = true }
+tracing = { workspace = true }
+tokio = { workspace = true }
+chrono = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/vectorless-core/vectorless-query/src/lib.rs b/vectorless-core/vectorless-query/src/lib.rs
new file mode 100644
index 00000000..38ecb116
--- /dev/null
+++ b/vectorless-core/vectorless-query/src/lib.rs
@@ -0,0 +1,45 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Query understanding and planning.
+//!
+//! Analyzes a user's raw query and produces a structured [`QueryPlan`]
+//! for downstream modules (Orchestrator, Worker).
+//!
+//! # Pipeline
+//!
+//! ```text
+//! raw query string
+//!   → extract keywords (from scoring/bm25)
+//!   → LLM query understanding (intent, concepts, complexity)
+//!   → QueryPlan
+//! ```
+//!
+//! LLM understanding is required — this is a pure reasoning engine.
+//! Errors are propagated, not silently degraded.
+
+mod types;
+mod understand;
+
+pub use types::{Complexity, QueryIntent, QueryPlan, SubQuery};
+
+use vectorless_llm::LlmClient;
+use vectorless_scoring::bm25::extract_keywords;
+
+/// Query understanding pipeline.
+///
+/// Produces a [`QueryPlan`] from a raw query string via LLM analysis.
+pub struct QueryPipeline;
+
+impl QueryPipeline {
+    /// Analyze a query and produce a structured plan.
+    ///
+    /// 1. Extract keywords (zero-cost, no LLM)
+    /// 2. LLM deep understanding (intent, concepts, complexity, strategy)
+    ///
+    /// Errors propagate — the caller handles retries or failure.
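+    ///
+    /// A minimal call sketch (LLM client construction elided):
+    ///
+    /// ```rust,ignore
+    /// let plan = QueryPipeline::understand("compare market vs operational risk", &llm).await?;
+    /// println!("{} / {}: {:?}", plan.intent, plan.complexity, plan.key_concepts);
+    /// ```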
+    pub async fn understand(query: &str, llm: &LlmClient) -> vectorless_error::Result<QueryPlan> {
+        let keywords = extract_keywords(query);
+        understand::understand(query, &keywords, llm).await
+    }
+}
diff --git a/vectorless-core/vectorless-query/src/types.rs b/vectorless-core/vectorless-query/src/types.rs
new file mode 100644
index 00000000..f8e025e8
--- /dev/null
+++ b/vectorless-core/vectorless-query/src/types.rs
@@ -0,0 +1,114 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Core types for query understanding.
+
+use serde::{Deserialize, Serialize};
+
+/// Query intent classification.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum QueryIntent {
+    /// Factual: "What is the Q3 2024 revenue?"
+    Factual,
+    /// Analytical: "Compare market risk vs operational risk"
+    Analytical,
+    /// Navigational: "Find the section on compliance policy"
+    Navigational,
+    /// Summary: "Summarize the main points of this document"
+    Summary,
+}
+
+impl Default for QueryIntent {
+    fn default() -> Self {
+        Self::Factual
+    }
+}
+
+impl std::fmt::Display for QueryIntent {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            QueryIntent::Factual => write!(f, "factual"),
+            QueryIntent::Analytical => write!(f, "analytical"),
+            QueryIntent::Navigational => write!(f, "navigational"),
+            QueryIntent::Summary => write!(f, "summary"),
+        }
+    }
+}
+
+/// Query complexity estimation.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum Complexity {
+    /// Single keyword, simple factoid.
+    Simple,
+    /// Multi-concept, requires synthesis.
+    Moderate,
+    /// Cross-document, comparative, or multi-faceted.
+    Complex,
+}
+
+impl Default for Complexity {
+    fn default() -> Self {
+        Self::Simple
+    }
+}
+
+impl std::fmt::Display for Complexity {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            Complexity::Simple => write!(f, "simple"),
+            Complexity::Moderate => write!(f, "moderate"),
+            Complexity::Complex => write!(f, "complex"),
+        }
+    }
+}
+
+/// A sub-query produced by decomposition.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SubQuery {
+    /// The sub-query text.
+    pub query: String,
+    /// Intent of this sub-query.
+    pub intent: QueryIntent,
+    /// Pre-identified target documents (if any).
+    pub target_docs: Option<Vec<String>>,
+}
+
+/// A structured query plan — the output of the query understanding pipeline.
+///
+/// Produced by `QueryPipeline::understand()`. Consumed by the Orchestrator
+/// and Worker agents for strategy selection.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct QueryPlan {
+    /// The original raw query string.
+    pub original: String,
+    /// Detected intent.
+    pub intent: QueryIntent,
+    /// Extracted keywords.
+    pub keywords: Vec<String>,
+    /// Key concepts identified by the LLM (distinct from keywords).
+    pub key_concepts: Vec<String>,
+    /// Strategy hint for navigation agents.
+    pub strategy_hint: String,
+    /// Estimated complexity.
+    pub complexity: Complexity,
+    /// Rewritten queries (produced by the LLM for better matching).
+    pub rewritten: Vec<String>,
+    /// Decomposed sub-queries (for complex/multi-faceted queries).
+    pub sub_queries: Vec<SubQuery>,
+}
+
+impl QueryPlan {
+    /// Produce a minimal fallback plan for when LLM understanding is unavailable.
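+    ///
+    /// A sketch of the resulting plan:
+    ///
+    /// ```rust,ignore
+    /// let plan = QueryPlan::default_for("q3 revenue", vec!["revenue".to_string()]);
+    /// assert_eq!(plan.intent, QueryIntent::Factual);
+    /// assert_eq!(plan.complexity, Complexity::Simple);
+    /// ```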
+    pub fn default_for(query: &str, keywords: Vec<String>) -> Self {
+        Self {
+            original: query.to_string(),
+            intent: QueryIntent::Factual,
+            keywords,
+            key_concepts: Vec::new(),
+            strategy_hint: "focused".to_string(),
+            complexity: Complexity::Simple,
+            rewritten: Vec::new(),
+            sub_queries: Vec::new(),
+        }
+    }
+}
diff --git a/vectorless-core/vectorless-query/src/understand.rs b/vectorless-core/vectorless-query/src/understand.rs
new file mode 100644
index 00000000..c124395a
--- /dev/null
+++ b/vectorless-core/vectorless-query/src/understand.rs
@@ -0,0 +1,246 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! LLM-driven query understanding.
+//!
+//! Uses an LLM call to analyze the query and produce a structured [`QueryPlan`].
+//! Errors are propagated to the caller; there is no silent keyword-only fallback.
+
+use serde::Deserialize;
+use tracing::{info, warn};
+
+use vectorless_llm::LlmClient;
+
+use super::types::{Complexity, QueryIntent, QueryPlan, SubQuery};
+
+/// Structured analysis returned by the LLM.
+#[derive(Deserialize)]
+struct QueryAnalysis {
+    intent: String,
+    key_concepts: Vec<String>,
+    strategy_hint: String,
+    complexity: String,
+    rewritten: Option<String>,
+    sub_queries: Vec<String>,
+}
+
+/// Use the LLM to understand the query and produce a QueryPlan.
+///
+/// Propagates LLM errors — no silent degradation. The caller decides
+/// how to handle failure (retry, abort, etc.).
+pub async fn understand(
+    query: &str,
+    keywords: &[String],
+    llm: &LlmClient,
+) -> vectorless_error::Result<QueryPlan> {
+    let (system, user) = understand_prompt(query, keywords);
+    info!("Query understanding: calling LLM...");
+    let response = llm.complete(&system, &user).await?;
+
+    if response.trim().is_empty() {
+        warn!("Query understanding: LLM returned empty response");
+        return Err(vectorless_error::Error::Config(
+            "Query understanding failed: LLM returned an empty response. \
+             Check your API key, model, and endpoint configuration."
+                .to_string(),
+        ));
+    }
+
+    let analysis = parse_analysis(&response).ok_or_else(|| {
+        // Take a char-boundary-safe preview (byte slicing could panic on UTF-8).
+        let preview: String = response.chars().take(300).collect();
+        vectorless_error::Error::Config(format!(
+            "Query understanding returned unparseable response ({} bytes): {}",
+            response.len(),
+            preview
+        ))
+    })?;
+
+    info!(
+        intent = %analysis.intent,
+        complexity = %analysis.complexity,
+        concepts = ?analysis.key_concepts,
+        strategy = %analysis.strategy_hint,
+        rewritten = ?analysis.rewritten,
+        "Query understanding complete"
+    );
+    Ok(analysis.into_plan(query, keywords))
+}
+
+/// Parse the LLM's JSON response into a QueryAnalysis.
+fn parse_analysis(response: &str) -> Option<QueryAnalysis> {
+    let trimmed = response.trim();
+
+    // Try to extract JSON from the response (the LLM may wrap it in markdown)
+    let json_str = if trimmed.starts_with("```") {
+        // Find the first newline after the opening fence (skips the language tag)
+        let after_fence = if let Some(nl) = trimmed.find('\n') {
+            &trimmed[nl + 1..]
+ } else { + trimmed + }; + // Strip the closing fence + let without_end = if let Some(end) = after_fence.rfind("```") { + &after_fence[..end] + } else { + after_fence + }; + without_end.trim() + } else { + trimmed + }; + + match serde_json::from_str(json_str) { + Ok(analysis) => Some(analysis), + Err(e) => { + warn!( + error = %e, + json_len = json_str.len(), + "Query understanding: JSON parse failed" + ); + None + } + } +} + +impl QueryAnalysis { + fn into_plan(self, query: &str, keywords: &[String]) -> QueryPlan { + QueryPlan { + original: query.to_string(), + intent: parse_intent(&self.intent), + keywords: keywords.to_vec(), + key_concepts: self.key_concepts, + strategy_hint: self.strategy_hint, + complexity: parse_complexity(&self.complexity), + rewritten: self.rewritten.into_iter().collect(), + sub_queries: self + .sub_queries + .into_iter() + .map(|sq| SubQuery { + query: sq, + intent: QueryIntent::Factual, + target_docs: None, + }) + .collect(), + } + } +} + +fn parse_intent(s: &str) -> QueryIntent { + match s.to_lowercase().as_str() { + "analytical" | "analysis" | "compare" | "comparison" => QueryIntent::Analytical, + "navigational" | "navigation" | "find" | "locate" => QueryIntent::Navigational, + "summary" | "summarize" | "overview" => QueryIntent::Summary, + _ => QueryIntent::Factual, + } +} + +fn parse_complexity(s: &str) -> Complexity { + match s.to_lowercase().as_str() { + "complex" | "high" => Complexity::Complex, + "moderate" | "medium" => Complexity::Moderate, + _ => Complexity::Simple, + } +} + +/// Build the LLM prompt for query understanding. +fn understand_prompt(query: &str, keywords: &[String]) -> (String, String) { + let system = r#"You are a query analysis engine. Analyze the user's query and respond with a JSON object containing: + +- "intent": one of "factual", "analytical", "navigational", "summary" +- "key_concepts": array of the main concepts/entities in the query (distinct from keywords) +- "strategy_hint": one of "focused" (single-topic), "exploratory" (broad scan), "comparative" (cross-reference), or "summary" (aggregate) +- "complexity": one of "simple", "moderate", "complex" +- "rewritten": optional rewritten version of the query for better retrieval (null if not needed) +- "sub_queries": array of sub-query strings if the query can be decomposed (empty array if not) + +Respond with ONLY the JSON object, no additional text."#; + + let user = format!( + "Query: {}\nExtracted keywords: [{}]", + query, + keywords.join(", ") + ); + + (system.to_string(), user) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_intent() { + assert_eq!(parse_intent("factual"), QueryIntent::Factual); + assert_eq!(parse_intent("analytical"), QueryIntent::Analytical); + assert_eq!(parse_intent("analysis"), QueryIntent::Analytical); + assert_eq!(parse_intent("navigational"), QueryIntent::Navigational); + assert_eq!(parse_intent("summary"), QueryIntent::Summary); + assert_eq!(parse_intent("unknown"), QueryIntent::Factual); + } + + #[test] + fn test_parse_complexity() { + assert_eq!(parse_complexity("simple"), Complexity::Simple); + assert_eq!(parse_complexity("moderate"), Complexity::Moderate); + assert_eq!(parse_complexity("complex"), Complexity::Complex); + assert_eq!(parse_complexity("high"), Complexity::Complex); + assert_eq!(parse_complexity("unknown"), Complexity::Simple); + } + + #[test] + fn test_parse_analysis_json() { + let response = 
r#"{"intent":"factual","key_concepts":["revenue","Q3"],"strategy_hint":"focused","complexity":"simple","rewritten":null,"sub_queries":[]}"#;
+        let analysis = parse_analysis(response).unwrap();
+        assert_eq!(analysis.intent, "factual");
+        assert_eq!(analysis.key_concepts.len(), 2);
+        assert!(analysis.rewritten.is_none());
+    }
+
+    #[test]
+    fn test_parse_analysis_markdown_wrapped() {
+        let response = "```json\n{\"intent\":\"analytical\",\"key_concepts\":[\"risk\"],\"strategy_hint\":\"comparative\",\"complexity\":\"moderate\",\"rewritten\":\"compare risks\",\"sub_queries\":[]}\n```";
+        let analysis = parse_analysis(response).unwrap();
+        assert_eq!(analysis.intent, "analytical");
+    }
+
+    #[test]
+    fn test_parse_analysis_invalid() {
+        assert!(parse_analysis("not json").is_none());
+    }
+
+    #[test]
+    fn test_parse_analysis_code_fence_language_tag() {
+        // The language tag sits on the fence line; the first newline after the
+        // opening fence must skip it.
+        let response = "```json\n{\"intent\":\"factual\",\"key_concepts\":[\"test\"],\"strategy_hint\":\"focused\",\"complexity\":\"simple\",\"rewritten\":null,\"sub_queries\":[]}\n```";
+        let analysis = parse_analysis(response).unwrap();
+        assert_eq!(analysis.intent, "factual");
+    }
+
+    #[test]
+    fn test_parse_analysis_code_fence_no_closing() {
+        // LLM sometimes omits the closing fence
+        let response = "```json\n{\"intent\":\"summary\",\"key_concepts\":[\"overview\"],\"strategy_hint\":\"summary\",\"complexity\":\"simple\",\"rewritten\":null,\"sub_queries\":[]}";
+        let analysis = parse_analysis(response).unwrap();
+        assert_eq!(analysis.intent, "summary");
+    }
+
+    #[test]
+    fn test_parse_analysis_keys_starting_with_fence_letters() {
+        // The old `trim_start_matches(['j', 's', 'o', 'n'])` approach would eat
+        // JSON keys starting with those letters. Verify this works correctly.
+        let response = r#"{"intent":"navigational","key_concepts":["journal","offset","node"],"strategy_hint":"focused","complexity":"moderate","rewritten":null,"sub_queries":[]}"#;
+        let analysis = parse_analysis(response).unwrap();
+        assert_eq!(analysis.intent, "navigational");
+        assert_eq!(analysis.key_concepts, vec!["journal", "offset", "node"]);
+    }
+
+    #[test]
+    fn test_default_plan() {
+        let plan = QueryPlan::default_for("test query", vec!["test".to_string()]);
+        assert_eq!(plan.original, "test query");
+        assert_eq!(plan.intent, QueryIntent::Factual);
+        assert_eq!(plan.keywords.len(), 1);
+        assert!(plan.key_concepts.is_empty());
+        assert!(plan.sub_queries.is_empty());
+    }
+}
diff --git a/vectorless-core/vectorless-rerank/Cargo.toml b/vectorless-core/vectorless-rerank/Cargo.toml
new file mode 100644
index 00000000..da09414d
--- /dev/null
+++ b/vectorless-core/vectorless-rerank/Cargo.toml
@@ -0,0 +1,19 @@
+[package]
+name = "vectorless-rerank"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+
+[dependencies]
+serde = { workspace = true }
+tracing = { workspace = true }
+vectorless-agent = { path = "../vectorless-agent" }
+vectorless-error = { path = "../vectorless-error" }
+vectorless-query = { path = "../vectorless-query" }
+
+[lints]
+workspace = true
diff --git a/vectorless-core/vectorless-rerank/src/dedup.rs b/vectorless-core/vectorless-rerank/src/dedup.rs
new file mode 100644
index 00000000..0b09e6d1
--- /dev/null
+++ b/vectorless-core/vectorless-rerank/src/dedup.rs
@@ -0,0 +1,216 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! 
Evidence deduplication and quality filtering.
+
+use std::collections::HashSet;
+
+use vectorless_agent::Evidence;
+
+/// Minimum characters for an evidence item to be considered meaningful.
+const MIN_EVIDENCE_CHARS: usize = 50;
+
+/// Jaccard similarity threshold for content dedup.
+const SIMILARITY_THRESHOLD: f64 = 0.8;
+
+/// Filter low-quality and duplicate evidence.
+///
+/// Steps:
+/// 1. Drop evidence with no meaningful content (< MIN_EVIDENCE_CHARS)
+/// 2. Deduplicate by source overlap (same path in same doc)
+/// 3. Deduplicate by content similarity (Jaccard on token sets)
+pub fn dedup(evidence: &[Evidence]) -> Vec<Evidence> {
+    // Step 1: Quality filter
+    let quality: Vec<&Evidence> = evidence
+        .iter()
+        .filter(|e| e.content.len() >= MIN_EVIDENCE_CHARS)
+        .collect();
+
+    // Step 2: Deduplicate by source overlap
+    let mut seen_sources: HashSet<String> = HashSet::new();
+    let source_deduped: Vec<&Evidence> = quality
+        .into_iter()
+        .filter(|e| {
+            let doc_key = e.doc_name.as_deref().unwrap_or("_unknown");
+            let key = format!("{}:{}", doc_key, e.source_path);
+            seen_sources.insert(key)
+        })
+        .collect();
+
+    // Step 3: Deduplicate by content similarity
+    let mut deduped: Vec<Evidence> = Vec::new();
+    for ev in source_deduped {
+        let tokens = tokenize(&ev.content);
+        let dominated = deduped
+            .iter()
+            .any(|existing| jaccard(&tokens, &tokenize(&existing.content)) >= SIMILARITY_THRESHOLD);
+        if !dominated {
+            deduped.push(ev.clone());
+        }
+    }
+
+    deduped
+}
+
+/// Tokenize text into a set of lowercase words.
+fn tokenize(text: &str) -> HashSet<String> {
+    text.to_lowercase()
+        .split_whitespace()
+        .map(|s| s.to_string())
+        .collect()
+}
+
+/// Compute Jaccard similarity between two sets.
+fn jaccard(a: &HashSet<String>, b: &HashSet<String>) -> f64 {
+    if a.is_empty() && b.is_empty() {
+        return 1.0;
+    }
+    let intersection = a.intersection(b).count() as f64;
+    let union = a.union(b).count() as f64;
+    intersection / union
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn make_evidence(title: &str, content: &str) -> Evidence {
+        Evidence {
+            source_path: format!("root/{}", title),
+            node_title: title.to_string(),
+            content: content.to_string(),
+            doc_name: Some("doc".to_string()),
+        }
+    }
+
+    #[test]
+    fn test_quality_filter() {
+        let evidence = vec![
+            make_evidence("A", "short"),         // < 50 chars, filtered
+            make_evidence("B", &"x".repeat(60)), // kept
+        ];
+        let result = dedup(&evidence);
+        assert_eq!(result.len(), 1);
+        assert_eq!(result[0].node_title, "B");
+    }
+
+    #[test]
+    fn test_source_dedup() {
+        let evidence = vec![
+            make_evidence(
+                "A",
+                "content A with enough text to pass the quality filter threshold",
+            ),
+            make_evidence(
+                "A",
+                "different content A but same source path that is long enough",
+            ),
+        ];
+        let result = dedup(&evidence);
+        assert_eq!(result.len(), 1);
+    }
+
+    #[test]
+    fn test_content_similarity_dedup() {
+        let base = "This is a piece of evidence about machine learning algorithms and their applications in real world scenarios";
+        let similar = "This is a piece of evidence about machine learning algorithms and their applications in real world";
+        let different =
+            "Completely unrelated content about quantum physics and particle accelerators at CERN";
+        let evidence = vec![
+            make_evidence("A", base),
+            make_evidence("B", similar),   // high similarity, should be deduped
+            make_evidence("C", different), // different, kept
+        ];
+        let result = dedup(&evidence);
+        assert!(result.len() >= 2); // at least A and C
+    }
+
+    #[test]
+    fn 
test_empty_input() { + let result = dedup(&[]); + assert!(result.is_empty()); + } + + #[test] + fn test_jaccard_identical() { + let a = tokenize("hello world foo"); + let b = tokenize("hello world foo"); + assert!((jaccard(&a, &b) - 1.0).abs() < 0.001); + } + + #[test] + fn test_jaccard_disjoint() { + let a = tokenize("aaa bbb"); + let b = tokenize("ccc ddd"); + assert!((jaccard(&a, &b)).abs() < 0.001); + } + + #[test] + fn test_source_dedup_none_doc_name() { + // Evidence with doc_name: None should use "_unknown" as doc key, + // so same source_path with None doc_name still deduplicates correctly. + let evidence = vec![ + Evidence { + source_path: "root/section_a".to_string(), + node_title: "A".to_string(), + content: "content A with enough text to pass the quality filter threshold" + .to_string(), + doc_name: None, + }, + Evidence { + source_path: "root/section_a".to_string(), + node_title: "A2".to_string(), + content: "different content but same source path that should be deduped" + .to_string(), + doc_name: None, + }, + ]; + let result = dedup(&evidence); + assert_eq!(result.len(), 1); + } + + #[test] + fn test_source_dedup_mixed_doc_name() { + // Same source_path but different doc_name should produce different dedup keys, + // so both survive source dedup. Content must be sufficiently different too. + let evidence = vec![ + Evidence { + source_path: "root/section".to_string(), + node_title: "A".to_string(), + content: "Revenue for Q4 was twelve million dollars driven by SaaS growth in the enterprise segment".to_string(), + doc_name: Some("doc_a".to_string()), + }, + Evidence { + source_path: "root/section".to_string(), + node_title: "B".to_string(), + content: "The encryption module uses AES-256 for data at rest and TLS 1.3 for all network communication".to_string(), + doc_name: Some("doc_b".to_string()), + }, + ]; + let result = dedup(&evidence); + assert_eq!(result.len(), 2); + } + + #[test] + fn test_source_dedup_none_vs_some_doc_name() { + // None doc_name ("_unknown") and Some doc_name produce different keys, + // so both survive source dedup. Content must be sufficiently different too. + let evidence = vec![ + Evidence { + source_path: "root/section".to_string(), + node_title: "A".to_string(), + content: "The database uses a log-structured merge tree with write-ahead logging for durability".to_string(), + doc_name: None, + }, + Evidence { + source_path: "root/section".to_string(), + node_title: "B".to_string(), + content: "Authentication requires Bearer tokens with automatic refresh after twenty-four hours".to_string(), + doc_name: Some("doc_x".to_string()), + }, + ]; + let result = dedup(&evidence); + assert_eq!(result.len(), 2); + } +} diff --git a/vectorless-core/vectorless-rerank/src/lib.rs b/vectorless-core/vectorless-rerank/src/lib.rs new file mode 100644 index 00000000..64e4474a --- /dev/null +++ b/vectorless-core/vectorless-rerank/src/lib.rs @@ -0,0 +1,104 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Result reranking — dedup + format. +//! +//! Post-processing pipeline that runs after the agent collects raw evidence: +//! +//! ```text +//! agent (collect evidence) +//! → rerank::process() +//! → dedup (quality filter + dedup) +//! → format as answer (no LLM — return original text) +//! → Output with final answer +//! ``` +//! +//! This is a document retrieval engine. The answer IS the evidence. +//! No LLM synthesis, no rewriting. Find what you find, return what you find. 
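+//!
+//! A minimal call sketch (the `evidence` slice and the confidence value are
+//! placeholders for real agent output, not pipeline constants):
+//!
+//! ```rust,ignore
+//! let out = vectorless_rerank::process("q", &evidence, false, QueryIntent::Factual, 0.8).await?;
+//! assert_eq!(out.llm_calls, 0); // formatting only — no synthesis
+//! ```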
+
+pub mod dedup;
+pub mod types;
+
+use tracing::info;
+
+use vectorless_agent::Evidence;
+use vectorless_query::QueryIntent;
+
+use crate::types::RerankOutput;
+
+/// Process agent output through the rerank pipeline.
+///
+/// Deduplicates evidence, then returns the original text as the answer.
+/// No LLM calls — the Worker already retrieved the exact passages.
+pub async fn process(
+    _query: &str,
+    evidence: &[Evidence],
+    _multi_doc: bool,
+    intent: QueryIntent,
+    confidence: f32,
+) -> vectorless_error::Result<RerankOutput> {
+    let deduped = dedup::dedup(evidence);
+    if deduped.is_empty() {
+        info!("No evidence after dedup");
+        return Ok(RerankOutput {
+            answer: String::new(),
+            llm_calls: 0,
+            confidence: 0.0,
+        });
+    }
+
+    info!(
+        evidence = deduped.len(),
+        intent = %intent,
+        "Evidence after dedup"
+    );
+
+    let answer = match intent {
+        QueryIntent::Navigational => format_locations(&deduped),
+        _ => format_evidence_as_answer(&deduped),
+    };
+
+    info!(
+        evidence = deduped.len(),
+        answer_len = answer.len(),
+        confidence,
+        "Rerank complete"
+    );
+
+    Ok(RerankOutput {
+        answer,
+        llm_calls: 0,
+        confidence,
+    })
+}
+
+/// Format evidence as a location listing for navigational queries.
+fn format_locations(evidence: &[Evidence]) -> String {
+    if evidence.is_empty() {
+        return "No matching locations found.".to_string();
+    }
+    let mut result = "Found at:\n".to_string();
+    for e in evidence {
+        let doc = e.doc_name.as_deref().unwrap_or("unknown");
+        result.push_str(&format!(
+            "- **{}** in {} at {}\n",
+            e.node_title, doc, e.source_path
+        ));
+    }
+    result
+}
+
+/// Format collected evidence directly as the answer.
+fn format_evidence_as_answer(evidence: &[Evidence]) -> String {
+    evidence
+        .iter()
+        .map(|e| {
+            let doc = e.doc_name.as_deref().unwrap_or("");
+            if doc.is_empty() {
+                format!("[{}]\n{}", e.node_title, e.content)
+            } else {
+                format!("[{} — {}]\n{}", e.node_title, doc, e.content)
+            }
+        })
+        .collect::<Vec<_>>()
+        .join("\n\n")
}
diff --git a/vectorless-core/vectorless-rerank/src/types.rs b/vectorless-core/vectorless-rerank/src/types.rs
new file mode 100644
index 00000000..4b42f351
--- /dev/null
+++ b/vectorless-core/vectorless-rerank/src/types.rs
@@ -0,0 +1,14 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Rerank result types.
+
+/// Output from the rerank pipeline.
+pub struct RerankOutput {
+    /// Formatted answer — deduplicated evidence passed through verbatim.
+    pub answer: String,
+    /// Number of LLM calls used during post-processing (always 0 today).
+    pub llm_calls: u32,
+    /// Confidence score (0.0–1.0) — propagated from the agent's evaluation.
+    pub confidence: f32,
+}
diff --git a/vectorless-core/vectorless-retrieval/Cargo.toml b/vectorless-core/vectorless-retrieval/Cargo.toml
new file mode 100644
index 00000000..69a66bfd
--- /dev/null
+++ b/vectorless-core/vectorless-retrieval/Cargo.toml
@@ -0,0 +1,27 @@
+[package]
+name = "vectorless-retrieval"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+
+[dependencies]
+vectorless-agent = { path = "../vectorless-agent" }
+vectorless-document = { path = "../vectorless-document" }
+# assumed sibling path — postprocessor.rs imports vectorless_engine
+vectorless-engine = { path = "../vectorless-engine" }
+vectorless-error = { path = "../vectorless-error" }
+vectorless-llm = { path = "../vectorless-llm" }
+vectorless-query = { path = "../vectorless-query" }
+vectorless-storage = { path = "../vectorless-storage" }
+vectorless-utils = { path = "../vectorless-utils" }
+tokio = { workspace = true }
+serde = { workspace = true }
+serde_json = { workspace = true }
+tracing = { workspace = true }
+futures = { workspace = true }
+parking_lot = { workspace = true }
+
+[dev-dependencies]
+# cache.rs tests construct NodeId from an indextree arena directly
+indextree = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/vectorless-core/vectorless-retrieval/src/cache.rs b/vectorless-core/vectorless-retrieval/src/cache.rs
new file mode 100644
index 00000000..c924732c
--- /dev/null
+++ b/vectorless-core/vectorless-retrieval/src/cache.rs
@@ -0,0 +1,577 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Tiered reasoning cache for the retrieval pipeline.
+//!
+//! Provides three levels of caching to avoid redundant computation:
+//!
+//! - **L1 (Exact)**: Cache full retrieval results keyed by exact query fingerprint.
+//!   Identical queries return instantly.
+//!
+//! - **L2 (Path Pattern)**: Cache navigation decisions for tree paths. If a previous
+//!   query navigated through Section 3.2, a new query about the same section can
+//!   reuse those path cues even when the full query differs.
+//!
+//! - **L3 (Strategy Score)**: Cache node scores from keyword/BM25 strategies.
+//!   Node scores are independent of the query, so they can be shared across
+//!   different queries on the same document.
+
+use std::collections::{HashMap, VecDeque};
+use std::sync::RwLock;
+use std::time::Instant;
+
+use tracing::warn;
+
+use vectorless_document::NodeId;
+use vectorless_utils::fingerprint::Fingerprint;
+
+/// A tiered reasoning cache for the retrieval pipeline.
+///
+/// Thread-safe via `RwLock`. Each tier has an independent size limit
+/// with insertion-order eviction.
+pub struct ReasoningCache {
+    /// L1: Exact query → cached candidate list.
+    l1: RwLock<L1Store>,
+    /// L2: Node path pattern → cached navigation cue score.
+    l2: RwLock<L2Store>,
+    /// L3: Node content fingerprint → cached strategy score.
+    l3: RwLock<L3Store>,
+    /// Configuration.
+    config: ReasoningCacheConfig,
+}
+
+/// Configuration for the reasoning cache.
+#[derive(Debug, Clone)]
+pub struct ReasoningCacheConfig {
+    /// Maximum L1 entries (exact query results).
+    pub l1_max: usize,
+    /// Maximum L2 entries (path patterns).
+    pub l2_max: usize,
+    /// Maximum L3 entries (strategy scores).
+    pub l3_max: usize,
+}
+
+impl Default for ReasoningCacheConfig {
+    fn default() -> Self {
+        Self {
+            l1_max: 200,
+            l2_max: 1000,
+            l3_max: 5000,
+        }
+    }
+}
+
+// ---- L1: Exact Query Cache ----
+
+#[derive(Debug, Clone)]
+struct L1Entry {
+    /// Fingerprint of the workspace + document set used for this query.
+    scope_fp: Fingerprint,
+    /// Cached candidate nodes (pre-sorted by score).
+    candidates: Vec<CachedCandidate>,
+    /// Strategy used.
+    strategy: String,
+    /// When cached.
+    created_at: Instant,
+}
+
+/// A cached candidate from a previous retrieval.
+#[derive(Debug, Clone)]
+pub struct CachedCandidate {
+    /// Node ID.
+    pub node_id: NodeId,
+    /// Relevance score.
+    pub score: f32,
+    /// Depth in tree.
+    pub depth: usize,
+}
+
+struct L1Store {
+    entries: HashMap<Fingerprint, L1Entry>,
+    order: VecDeque<Fingerprint>, // Insertion-order (FIFO) eviction — O(1) pop_front
+}
+
+// ---- L2: Path Pattern Cache ----
+
+#[derive(Debug, Clone)]
+struct L2Entry {
+    /// Score for this navigation cue.
+    confidence: f32,
+    /// How many times this path was relevant.
+    hit_count: usize,
+    created_at: Instant,
+}
+
+struct L2Store {
+    entries: HashMap<String, L2Entry>, // Key: "doc_fp:node_path"
+    order: VecDeque<String>,
+}
+
+// ---- L3: Strategy Score Cache ----
+
+#[derive(Debug, Clone)]
+struct L3Entry {
+    /// BM25/Keyword score.
+    score: f32,
+    /// Which strategy produced this score.
+    strategy: String,
+    created_at: Instant,
+}
+
+struct L3Store {
+    entries: HashMap<Fingerprint, L3Entry>, // Key: node content fingerprint
+    order: VecDeque<Fingerprint>,
+}
+
+// ---- Public API ----
+
+impl ReasoningCache {
+    /// Create a new reasoning cache with default configuration.
+    pub fn new() -> Self {
+        Self::with_config(ReasoningCacheConfig::default())
+    }
+
+    /// Create with custom configuration.
+    pub fn with_config(config: ReasoningCacheConfig) -> Self {
+        Self {
+            l1: RwLock::new(L1Store {
+                entries: HashMap::new(),
+                order: VecDeque::new(),
+            }),
+            l2: RwLock::new(L2Store {
+                entries: HashMap::new(),
+                order: VecDeque::new(),
+            }),
+            l3: RwLock::new(L3Store {
+                entries: HashMap::new(),
+                order: VecDeque::new(),
+            }),
+            config,
+        }
+    }
+
+    // ============ L1: Exact Query ============
+
+    /// Look up an exact query result.
+    ///
+    /// Returns cached candidates if the same query was executed before
+    /// on the same document scope.
+    pub fn l1_get(&self, query: &str, scope_fp: &Fingerprint) -> Option<Vec<CachedCandidate>> {
+        let query_fp = Fingerprint::from_str(query);
+        let l1 = read_lock(&self.l1)?;
+        let entry = l1.entries.get(&query_fp)?;
+        // Scope must match (same document set)
+        if &entry.scope_fp != scope_fp {
+            return None;
+        }
+        Some(entry.candidates.clone())
+    }
+
+    /// Store an L1 result.
+    pub fn l1_store(
+        &self,
+        query: &str,
+        scope_fp: Fingerprint,
+        candidates: Vec<CachedCandidate>,
+        strategy: String,
+    ) {
+        let query_fp = Fingerprint::from_str(query);
+        if let Ok(mut l1) = self.l1.write() {
+            if l1.entries.len() >= self.config.l1_max {
+                Self::evict_lru_fingerprint(&mut l1);
+            }
+            l1.entries.insert(
+                query_fp,
+                L1Entry {
+                    scope_fp,
+                    candidates,
+                    strategy,
+                    created_at: Instant::now(),
+                },
+            );
+            l1.order.push_back(query_fp);
+        }
+    }
+
+    // ============ L2: Path Pattern ============
+
+    /// Look up a cached navigation confidence for a document + node path.
+    ///
+    /// If a previous query successfully navigated through this path,
+    /// return the confidence score.
+    pub fn l2_get(&self, doc_key: &str, node_path: &str) -> Option<f32> {
+        let key = format!("{}:{}", doc_key, node_path);
+        let l2 = read_lock(&self.l2)?;
+        let entry = l2.entries.get(&key)?;
+        Some(entry.confidence)
+    }
+
+    /// Record a successful navigation through a path.
+    ///
+    /// Call this after retrieval confirms a path was relevant.
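+    ///
+    /// A minimal sketch of the intended call pattern (the doc key and path
+    /// are illustrative; the running average matches the unit tests below):
+    ///
+    /// ```rust,ignore
+    /// let cache = ReasoningCache::new();
+    /// cache.l2_record("doc1", "3.2", 0.8); // first hit: confidence = 0.8
+    /// cache.l2_record("doc1", "3.2", 0.6); // running average: 0.7
+    /// assert!((cache.l2_get("doc1", "3.2").unwrap() - 0.7).abs() < 0.01);
+    /// ```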
+ pub fn l2_record(&self, doc_key: &str, node_path: &str, confidence: f32) { + let key = format!("{}:{}", doc_key, node_path); + if let Ok(mut l2) = self.l2.write() { + if let Some(entry) = l2.entries.get_mut(&key) { + // Update running average + entry.hit_count += 1; + entry.confidence = + entry.confidence + (confidence - entry.confidence) / entry.hit_count as f32; + } else { + if l2.entries.len() >= self.config.l2_max { + Self::evict_lru_string(&mut l2); + } + l2.entries.insert( + key.clone(), + L2Entry { + confidence, + hit_count: 1, + created_at: Instant::now(), + }, + ); + l2.order.push_back(key); + } + } + } + + /// Get top-N path hints for a document, sorted by confidence. + /// + /// Useful for bootstrapping new queries on a known document. + pub fn l2_top_paths(&self, doc_key: &str, n: usize) -> Vec<(String, f32)> { + let prefix = format!("{}:", doc_key); + let l2 = match read_lock(&self.l2) { + Some(guard) => guard, + None => return Vec::new(), + }; + + let mut paths: Vec<(String, f32)> = l2 + .entries + .iter() + .filter(|(k, _)| k.starts_with(&prefix)) + .map(|(k, v)| (k[prefix.len()..].to_string(), v.confidence)) + .collect(); + paths.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + paths.truncate(n); + paths + } + + // ============ L3: Strategy Score ============ + + /// Look up a cached strategy score for a node. + /// + /// Node scores from keyword/BM25 are content-dependent but + /// query-independent, so they can be shared across queries. + pub fn l3_get(&self, node_content_fp: &Fingerprint) -> Option<(f32, String)> { + let l3 = read_lock(&self.l3)?; + let entry = l3.entries.get(node_content_fp)?; + Some((entry.score, entry.strategy.clone())) + } + + /// Store a strategy score for a node. + pub fn l3_store(&self, node_content_fp: Fingerprint, score: f32, strategy: String) { + if let Ok(mut l3) = self.l3.write() { + if l3.entries.len() >= self.config.l3_max { + Self::evict_lru_fingerprint_l3(&mut l3); + } + l3.entries.insert( + node_content_fp, + L3Entry { + score, + strategy, + created_at: Instant::now(), + }, + ); + l3.order.push_back(node_content_fp); + } + } + + // ============ Stats ============ + + /// Get cache statistics. + pub fn stats(&self) -> ReasoningCacheStats { + let (l1_count, l2_count, l3_count) = ( + read_lock(&self.l1).map(|g| g.entries.len()).unwrap_or(0), + read_lock(&self.l2).map(|g| g.entries.len()).unwrap_or(0), + read_lock(&self.l3).map(|g| g.entries.len()).unwrap_or(0), + ); + ReasoningCacheStats { + l1_entries: l1_count, + l2_entries: l2_count, + l3_entries: l3_count, + } + } + + /// Clear all cache tiers. + pub fn clear(&self) { + if let Ok(mut l1) = self.l1.write() { + l1.entries.clear(); + l1.order.clear(); + } + if let Ok(mut l2) = self.l2.write() { + l2.entries.clear(); + l2.order.clear(); + } + if let Ok(mut l3) = self.l3.write() { + l3.entries.clear(); + l3.order.clear(); + } + } + + // ============ Eviction helpers ============ + + fn evict_lru_fingerprint(l1: &mut L1Store) { + if let Some(old) = l1.order.pop_front() { + l1.entries.remove(&old); + } + } + + fn evict_lru_string(l2: &mut L2Store) { + if let Some(old) = l2.order.pop_front() { + l2.entries.remove(&old); + } + } + + fn evict_lru_fingerprint_l3(l3: &mut L3Store) { + if let Some(old) = l3.order.pop_front() { + l3.entries.remove(&old); + } + } +} + +impl Default for ReasoningCache { + fn default() -> Self { + Self::new() + } +} + +/// Read from a RwLock, recovering from poison by taking the guard anyway. 
+///
+/// A poisoned lock means another thread panicked while holding it — the data
+/// is still valid, just potentially in an inconsistent state. For a cache,
+/// returning possibly stale data is preferable to panicking or dropping the
+/// whole request.
+fn read_lock<T>(lock: &RwLock<T>) -> Option<std::sync::RwLockReadGuard<'_, T>> {
+    match lock.read() {
+        Ok(guard) => Some(guard),
+        Err(poisoned) => {
+            warn!("ReasoningCache: recovering from poisoned lock");
+            Some(poisoned.into_inner())
+        }
+    }
+}
+
+/// Cache statistics.
+#[derive(Debug, Clone)]
+pub struct ReasoningCacheStats {
+    /// L1 entries (exact query results).
+    pub l1_entries: usize,
+    /// L2 entries (path patterns).
+    pub l2_entries: usize,
+    /// L3 entries (strategy scores).
+    pub l3_entries: usize,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn make_node_id(n: usize) -> NodeId {
+        let mut arena = indextree::Arena::new();
+        NodeId(arena.new_node(n))
+    }
+
+    #[test]
+    fn test_l1_store_and_retrieve() {
+        let cache = ReasoningCache::new();
+        let scope = Fingerprint::from_str("doc1");
+
+        let candidates = vec![CachedCandidate {
+            node_id: make_node_id(1),
+            score: 0.9,
+            depth: 2,
+        }];
+
+        cache.l1_store("what is rust?", scope, candidates.clone(), "keyword".into());
+        let result = cache.l1_get("what is rust?", &scope);
+        assert!(result.is_some());
+        assert_eq!(result.unwrap().len(), 1);
+    }
+
+    #[test]
+    fn test_l1_miss_different_scope() {
+        let cache = ReasoningCache::new();
+        let scope1 = Fingerprint::from_str("doc1");
+        let scope2 = Fingerprint::from_str("doc2");
+
+        let candidates = vec![CachedCandidate {
+            node_id: make_node_id(1),
+            score: 0.9,
+            depth: 2,
+        }];
+
+        cache.l1_store("query", scope1, candidates, "keyword".into());
+        assert!(cache.l1_get("query", &scope2).is_none());
+    }
+
+    #[test]
+    fn test_l2_record_and_get() {
+        let cache = ReasoningCache::new();
+
+        cache.l2_record("doc1", "3.2", 0.8);
+        let score = cache.l2_get("doc1", "3.2");
+        assert!(score.is_some());
+        assert!((score.unwrap() - 0.8).abs() < 0.01);
+    }
+
+    #[test]
+    fn test_l2_running_average() {
+        let cache = ReasoningCache::new();
+
+        cache.l2_record("doc1", "3.2", 0.8);
+        cache.l2_record("doc1", "3.2", 0.6);
+        let score = cache.l2_get("doc1", "3.2").unwrap();
+        // Running average: 0.8 + (0.6 - 0.8) / 2 = 0.7
+        assert!((score - 0.7).abs() < 0.01);
+    }
+
+    #[test]
+    fn test_l2_top_paths() {
+        let cache = ReasoningCache::new();
+
+        cache.l2_record("doc1", "3.1", 0.5);
+        cache.l2_record("doc1", "3.2", 0.9);
+        cache.l2_record("doc1", "2.1", 0.7);
+
+        let top = cache.l2_top_paths("doc1", 2);
+        assert_eq!(top.len(), 2);
+        assert!((top[0].1 - 0.9).abs() < 0.01); // 3.2 is highest
+    }
+
+    #[test]
+    fn test_l3_store_and_retrieve() {
+        let cache = ReasoningCache::new();
+        let fp = Fingerprint::from_str("some node content");
+
+        cache.l3_store(fp, 0.85, "bm25".into());
+        let (score, strategy) = cache.l3_get(&fp).unwrap();
+        assert!((score - 0.85).abs() < 0.01);
+        assert_eq!(strategy, "bm25");
+    }
+
+    #[test]
+    fn test_clear() {
+        let cache = ReasoningCache::new();
+        let scope = Fingerprint::from_str("doc1");
+
+        cache.l1_store("q", scope, vec![], "kw".into());
+        cache.l2_record("doc1", "1", 0.5);
+        cache.l3_store(Fingerprint::from_str("c"), 0.5, "kw".into());
+
+        cache.clear();
+
+        let stats = cache.stats();
+        assert_eq!(stats.l1_entries, 0);
+        assert_eq!(stats.l2_entries, 0);
+        assert_eq!(stats.l3_entries, 0);
+    }
+
+    #[test]
+    fn test_l1_lru_eviction() {
+        let config = ReasoningCacheConfig {
+            l1_max: 2,
+            ..Default::default()
+        };
+        let cache = ReasoningCache::with_config(config);
+        let scope = 
Fingerprint::from_str("doc"); + + cache.l1_store("q1", scope, vec![], "kw".into()); + cache.l1_store("q2", scope, vec![], "kw".into()); + cache.l1_store("q3", scope, vec![], "kw".into()); // evicts q1 + + assert!(cache.l1_get("q1", &scope).is_none()); + assert!(cache.l1_get("q2", &scope).is_some()); + assert!(cache.l1_get("q3", &scope).is_some()); + } + + #[test] + fn test_l2_lru_eviction() { + let config = ReasoningCacheConfig { + l2_max: 2, + ..Default::default() + }; + let cache = ReasoningCache::with_config(config); + + cache.l2_record("doc", "1", 0.5); + cache.l2_record("doc", "2", 0.6); + cache.l2_record("doc", "3", 0.7); // evicts "doc:1" + + assert!(cache.l2_get("doc", "1").is_none()); + assert!(cache.l2_get("doc", "2").is_some()); + assert!(cache.l2_get("doc", "3").is_some()); + } + + #[test] + fn test_l3_lru_eviction() { + let config = ReasoningCacheConfig { + l3_max: 2, + ..Default::default() + }; + let cache = ReasoningCache::with_config(config); + + let fp1 = Fingerprint::from_str("content_a"); + let fp2 = Fingerprint::from_str("content_b"); + let fp3 = Fingerprint::from_str("content_c"); + + cache.l3_store(fp1, 0.5, "kw".into()); + cache.l3_store(fp2, 0.6, "kw".into()); + cache.l3_store(fp3, 0.7, "kw".into()); // evicts fp1 + + assert!(cache.l3_get(&fp1).is_none()); + assert!(cache.l3_get(&fp2).is_some()); + assert!(cache.l3_get(&fp3).is_some()); + } + + #[test] + fn test_poisoned_lock_recovery() { + let cache = ReasoningCache::new(); + + // Verify normal operation: store and retrieve still works + let scope = Fingerprint::from_str("doc"); + cache.l1_store("query", scope, vec![], "kw".into()); + + let scope2 = Fingerprint::from_str("doc2"); + cache.l1_store("q2", scope2, vec![], "kw".into()); + assert!(cache.l1_get("q2", &scope2).is_some()); + + // Verify stats still works (internally uses read_lock) + let stats = cache.stats(); + assert!(stats.l1_entries >= 1); + } + + #[test] + fn test_poisoned_lock_read_recovery() { + use std::sync::Arc; + use std::thread; + + // Create a cache and populate it + let cache = Arc::new(ReasoningCache::new()); + let scope = Fingerprint::from_str("doc"); + cache.l1_store("query", scope, vec![], "kw".into()); + + // Poison the lock from another thread + let cache_clone = Arc::clone(&cache); + let handle = thread::spawn(move || { + // This will poison the L1 lock + let _guard = cache_clone.l1.write().unwrap(); + panic!("intentional panic to poison lock"); + }); + + // Wait for the panicking thread to finish + let _ = handle.join(); + + // The lock is now poisoned. Our read_lock() should recover from it. + // l1_get uses read_lock internally + let result = cache.l1_get("query", &scope); + // Should still return data (recovered from poison) + assert!(result.is_some()); + } +} diff --git a/vectorless-core/vectorless-retrieval/src/dispatcher.rs b/vectorless-core/vectorless-retrieval/src/dispatcher.rs new file mode 100644 index 00000000..b2c45096 --- /dev/null +++ b/vectorless-core/vectorless-retrieval/src/dispatcher.rs @@ -0,0 +1,78 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Retrieval dispatcher — the single entry point for all query operations. +//! +//! All queries go through the Orchestrator. There is no separate Worker path. +//! The Orchestrator internally decides whether to run the full analysis phase +//! based on user intent: +//! +//! - **User specified doc_ids** → Orchestrator skips analysis, spawns N Workers +//! directly (N=1 is a normal case, not special). +//! 
- **User unspecified (workspace)** → Orchestrator analyzes DocCards, selects
+//!   relevant docs, then spawns Workers.
+//!
+//! Post-processing (synthesis, dedup, rerank) is always unified through the
+//! Orchestrator's output — never duplicated in Worker.
+
+use tracing::info;
+
+use vectorless_agent::config::{AgentConfig, Scope, WorkspaceContext};
+use vectorless_agent::orchestrator::Orchestrator;
+use vectorless_agent::{Agent, EventEmitter, Output};
+use vectorless_error::{Error, Result};
+use vectorless_llm::LlmClient;
+use vectorless_query::QueryPipeline;
+
+/// Dispatch a query to the Orchestrator.
+///
+/// This is the single entry point from the client layer into the retrieval system.
+/// It always goes through the Orchestrator — never directly to Worker.
+///
+/// Flow:
+/// 1. Query understanding via LLM (produces [`QueryPlan`])
+/// 2. Orchestrator dispatch (uses QueryPlan for strategy)
+///
+/// - `Scope::Specified(docs)` → Orchestrator skips analysis, dispatches all docs directly.
+/// - `Scope::Workspace(ws)` → Orchestrator runs full flow (analyze → dispatch → fuse → synthesize).
+pub async fn dispatch(
+    query: &str,
+    scope: Scope<'_>,
+    config: &AgentConfig,
+    llm: &LlmClient,
+    emitter: &EventEmitter,
+) -> Result<Output> {
+    let (ws, skip_analysis) = match scope {
+        Scope::Specified(docs) => {
+            info!(
+                docs = docs.len(),
+                "Dispatch (user-specified, skip analysis)"
+            );
+            (WorkspaceContext::new(docs), true)
+        }
+        Scope::Workspace(ws) => {
+            info!(docs = ws.doc_count(), "Dispatch (workspace, full flow)");
+            (ws, false)
+        }
+    };
+
+    // Step 1: Query understanding — LLM analyzes intent, concepts, complexity.
+    // This is required. "Model fails, we fail." — errors propagate.
+    info!("Starting query understanding...");
+    let query_plan = QueryPipeline::understand(query, llm).await?;
+
+    // Step 2: Dispatch to Orchestrator with the query plan.
+    let orchestrator = Orchestrator::new(
+        query,
+        &ws,
+        config.clone(),
+        llm.clone(),
+        emitter.clone(),
+        skip_analysis,
+        query_plan,
+    );
+    orchestrator
+        .run()
+        .await
+        .map_err(|e| Error::Retrieval(e.to_string()))
+}
diff --git a/vectorless-core/vectorless-retrieval/src/lib.rs b/vectorless-core/vectorless-retrieval/src/lib.rs
new file mode 100644
index 00000000..bab04971
--- /dev/null
+++ b/vectorless-core/vectorless-retrieval/src/lib.rs
@@ -0,0 +1,28 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Retrieval dispatch layer — the entry point for all query operations.
+//!
+//! This module sits between the client API and the agent execution layer.
+//! It is responsible for:
+//!
+//! - **Dispatching** queries into the Orchestrator (the single entry path)
+//! - **Post-processing** agent output into client-facing results
+//! - **Caching** query results (L1 exact, L2 path patterns, L3 strategy scores)
+//! - **Streaming** retrieval events for async progress monitoring
+//!
+//! Call flow:
+//! ```text
+//! client → retrieval::dispatch()
+//!          ├── User specified doc_ids → Orchestrator (skip analysis) → N × Worker
+//!          └── Workspace scope → Orchestrator (analyze → spawn → fusion)
+//! 
```
+
+mod cache;
+pub mod dispatcher;
+pub mod postprocessor;
+pub mod stream;
+mod types;
+
+pub use stream::{RetrieveEvent, RetrieveEventReceiver};
+pub use types::{ReasoningChain, RetrieveResponse, SufficiencyLevel};
diff --git a/vectorless-core/vectorless-retrieval/src/postprocessor.rs b/vectorless-core/vectorless-retrieval/src/postprocessor.rs
new file mode 100644
index 00000000..77a95a87
--- /dev/null
+++ b/vectorless-core/vectorless-retrieval/src/postprocessor.rs
@@ -0,0 +1,130 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Post-processing of agent output into client-facing results.
+//!
+//! Converts raw agent [`Output`] into one or more [`QueryResultItem`]s.
+//! When evidence comes from multiple documents (distinct `doc_name` values),
+//! results are split by document so the caller can see per-doc attribution.
+
+use std::collections::BTreeMap;
+
+use vectorless_agent::config::{Evidence, Metrics, Output};
+use vectorless_engine::{Confidence, EvidenceItem, QueryMetrics, QueryResultItem};
+
+/// Convert agent output to query result items, split by document.
+///
+/// Groups evidence by `doc_name` and creates one `QueryResultItem` per document.
+/// For single-document queries (all evidence has the same or no `doc_name`),
+/// returns a single item with the given `doc_id`.
+///
+/// The synthesized answer is shared across all items (it was produced from
+/// cross-document evidence). Each item gets its own subset of evidence.
+pub fn to_results(output: &Output, doc_id: &str) -> Vec<QueryResultItem> {
+    if output.evidence.is_empty() {
+        return vec![empty_item(doc_id, &output.answer, output.confidence)];
+    }
+
+    // Group evidence by doc_name
+    let groups = group_by_doc(&output.evidence);
+
+    if groups.len() <= 1 {
+        // Single doc — return one item
+        return vec![build_item(
+            doc_id,
+            &output.answer,
+            output.confidence,
+            &output.evidence,
+            &output.metrics,
+        )];
+    }
+
+    // Multi-doc — one item per document
+    groups
+        .into_iter()
+        .map(|(name, refs)| {
+            let did = name.as_deref().unwrap_or(doc_id);
+            let evidence: Vec<Evidence> = refs.iter().map(|e| (*e).clone()).collect();
+            build_item(
+                did,
+                &output.answer,
+                output.confidence,
+                &evidence,
+                &output.metrics,
+            )
+        })
+        .collect()
+}
+
+/// Group evidence by `doc_name` (keys sorted; per-group order preserved).
+fn group_by_doc(evidence: &[Evidence]) -> BTreeMap<Option<String>, Vec<&Evidence>> {
+    let mut groups: BTreeMap<Option<String>, Vec<&Evidence>> = BTreeMap::new();
+    for ev in evidence {
+        groups.entry(ev.doc_name.clone()).or_default().push(ev);
+    }
+    groups
+}
+
+/// Build a single enriched result item.
+fn build_item(
+    doc_id: &str,
+    answer: &str,
+    confidence: Confidence,
+    evidence: &[Evidence],
+    metrics: &Metrics,
+) -> QueryResultItem {
+    let node_ids: Vec<String> = evidence.iter().map(|e| e.source_path.clone()).collect();
+    let evidence_items: Vec<EvidenceItem> = evidence
+        .iter()
+        .map(|e| EvidenceItem {
+            title: e.node_title.clone(),
+            path: e.source_path.clone(),
+            content: e.content.clone(),
+            doc_name: e.doc_name.clone(),
+        })
+        .collect();
+
+    let content = if answer.is_empty() {
+        evidence
+            .iter()
+            .map(|e| format!("## {}\n{}", e.node_title, e.content))
+            .collect::<Vec<_>>()
+            .join("\n\n---\n\n")
+    } else {
+        answer.to_string()
+    };
+
+    let evidence_count = evidence.len();
+
+    QueryResultItem {
+        doc_id: doc_id.to_string(),
+        node_ids,
+        content,
+        evidence: evidence_items,
+        metrics: Some(QueryMetrics {
+            llm_calls: metrics.llm_calls,
+            rounds_used: metrics.rounds_used,
+            nodes_visited: metrics.nodes_visited,
+            evidence_count,
+            evidence_chars: metrics.evidence_chars,
+        }),
+        confidence,
+    }
+}
+
+/// Build an empty result item (no evidence).
+fn empty_item(doc_id: &str, answer: &str, confidence: Confidence) -> QueryResultItem {
+    let content = if answer.is_empty() {
+        String::new()
+    } else {
+        answer.to_string()
+    };
+    QueryResultItem {
+        doc_id: doc_id.to_string(),
+        node_ids: Vec::new(),
+        content,
+        evidence: Vec::new(),
+        metrics: None,
+        confidence,
+    }
+}
diff --git a/vectorless-core/vectorless-retrieval/src/stream.rs b/vectorless-core/vectorless-retrieval/src/stream.rs
new file mode 100644
index 00000000..33aa75b7
--- /dev/null
+++ b/vectorless-core/vectorless-retrieval/src/stream.rs
@@ -0,0 +1,128 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Streaming retrieval events.
+//!
+//! When `RetrieveOptions::streaming` is enabled, retrieval emits
+//! [`RetrieveEvent`]s incrementally as the pipeline progresses through
+//! its stages (Analyze → Plan → Search → Evaluate).
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! let options = RetrieveOptions::new().with_streaming(true);
+//! let mut rx = client.query_stream(&tree, "query", &options).await?;
+//!
+//! while let Some(event) = rx.recv().await {
+//!     match event {
+//!         RetrieveEvent::Started { query, .. } => println!("Started: {query}"),
+//!         RetrieveEvent::StageCompleted { stage, .. } => println!("Done: {stage}"),
+//!         RetrieveEvent::Completed { response } => {
+//!             println!("Confidence: {}", response.confidence);
+//!             break;
+//!         }
+//!         RetrieveEvent::Error { message } => {
+//!             eprintln!("Error: {message}");
+//!             break;
+//!         }
+//!         _ => {}
+//!     }
+//! }
+//! ```
+
+use tokio::sync::mpsc;
+
+use super::types::{RetrieveResponse, SufficiencyLevel};
+
+/// Events emitted during streaming retrieval.
+///
+/// Each event represents a meaningful milestone in the retrieval pipeline.
+/// The stream always terminates with either [`Completed`](RetrieveEvent::Completed)
+/// or [`Error`](RetrieveEvent::Error).
+#[derive(Debug, Clone)]
+pub enum RetrieveEvent {
+    /// Retrieval pipeline started.
+    Started {
+        /// The query string.
+        query: String,
+        /// Planned retrieval strategy name.
+        strategy: String,
+    },
+
+    /// A pipeline stage completed.
+    StageCompleted {
+        /// Stage name (analyze, plan, search, evaluate).
+        stage: String,
+        /// Time spent in this stage (ms).
+        elapsed_ms: u64,
+    },
+
+    /// A node was visited during tree traversal.
+    NodeVisited {
+        /// Node ID.
+        node_id: String,
+        /// Node title.
+        title: String,
+        /// Relevance score (0.0 - 1.0).
+        score: f32,
+    },
+
+    /// Relevant content was found.
+    ContentFound {
+        /// Node ID.
+        node_id: String,
+        /// Node title.
+        title: String,
+        /// Short preview of the content.
+        preview: String,
+        /// Relevance score.
+        score: f32,
+    },
+
+    /// Pipeline is backtracking to an earlier stage.
+    Backtracking {
+        /// Stage backtracking from.
+        from: String,
+        /// Stage backtracking to.
+        to: String,
+        /// Reason for backtracking.
+        reason: String,
+    },
+
+    /// Sufficiency check result.
+    SufficiencyCheck {
+        /// Sufficiency level.
+        level: SufficiencyLevel,
+        /// Total tokens collected so far.
+        tokens: usize,
+    },
+
+    /// Retrieval completed successfully with final results.
+    Completed {
+        /// The full retrieval response.
+        response: RetrieveResponse,
+    },
+
+    /// An error occurred during retrieval.
+    Error {
+        /// Error message.
+        message: String,
+    },
+}
+
+/// Sender half for streaming retrieval events.
+pub(crate) type RetrieveEventSender = mpsc::Sender<RetrieveEvent>;
+
+/// Receiver half for streaming retrieval events.
+pub type RetrieveEventReceiver = mpsc::Receiver<RetrieveEvent>;
+
+/// Create a bounded channel for streaming retrieval events.
+///
+/// The bound defaults to [`DEFAULT_STREAM_BOUND`]. The sender applies
+/// backpressure when the receiver cannot keep up, preventing unbounded
+/// memory growth.
+pub(crate) fn channel(bound: usize) -> (RetrieveEventSender, RetrieveEventReceiver) {
+    mpsc::channel(bound)
+}
+
+/// Default channel bound for streaming events.
+pub const DEFAULT_STREAM_BOUND: usize = 64;
diff --git a/vectorless-core/vectorless-retrieval/src/types.rs b/vectorless-core/vectorless-retrieval/src/types.rs
new file mode 100644
index 00000000..3fee208e
--- /dev/null
+++ b/vectorless-core/vectorless-retrieval/src/types.rs
@@ -0,0 +1,193 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Core types for the retrieval system.
+
+use serde::{Deserialize, Serialize};
+
+/// Re-export [`SufficiencyLevel`] from the document module.
+pub use vectorless_document::SufficiencyLevel;
+
+/// Complete retrieval response.
+#[derive(Debug, Clone)]
+pub struct RetrieveResponse {
+    /// Retrieved results.
+    pub results: Vec<RetrievalResult>,
+
+    /// Aggregated content.
+    pub content: String,
+
+    /// Overall confidence score.
+    pub confidence: f32,
+
+    /// Whether information is sufficient.
+    pub is_sufficient: bool,
+
+    /// Strategy that was used.
+    pub strategy_used: String,
+
+    /// Reasoning chain explaining how results were found.
+    pub reasoning_chain: ReasoningChain,
+
+    /// Total tokens used.
+    pub tokens_used: usize,
+}
+
+impl Default for RetrieveResponse {
+    fn default() -> Self {
+        Self {
+            results: Vec::new(),
+            content: String::new(),
+            confidence: 0.0,
+            is_sufficient: false,
+            strategy_used: String::new(),
+            reasoning_chain: ReasoningChain::default(),
+            tokens_used: 0,
+        }
+    }
+}
+
+impl RetrieveResponse {
+    /// Create a new empty response.
+    #[must_use]
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Check if there are any results.
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        self.results.is_empty()
+    }
+
+    /// Get the number of results.
+    #[must_use]
+    pub fn len(&self) -> usize {
+        self.results.len()
+    }
+}
+
+/// A single retrieval result.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct RetrievalResult {
+    /// Node ID in the tree.
+    pub node_id: Option<String>,
+
+    /// Node title.
+    pub title: String,
+
+    /// Node content (if included).
+    pub content: Option<String>,
+
+    /// Node summary (if included).
+    pub summary: Option<String>,
+
+    /// Relevance score (0.0 - 1.0).
+    pub score: f32,
+
+    /// Depth in the tree.
+    pub depth: usize,
+
+    /// Page range (for PDFs).
+    pub page_range: Option<(usize, usize)>,
+}
+
+impl RetrievalResult {
+    /// Create a new retrieval result.
+    #[must_use]
+    pub fn new(title: impl Into<String>) -> Self {
+        Self {
+            node_id: None,
+            title: title.into(),
+            content: None,
+            summary: None,
+            score: 1.0,
+            depth: 0,
+            page_range: None,
+        }
+    }
+
+    /// Set the node ID.
+    #[must_use]
+    pub fn with_node_id(mut self, id: impl Into<String>) -> Self {
+        self.node_id = Some(id.into());
+        self
+    }
+
+    /// Set the content.
+    #[must_use]
+    pub fn with_content(mut self, content: impl Into<String>) -> Self {
+        self.content = Some(content.into());
+        self
+    }
+
+    /// Set the summary.
+    #[must_use]
+    pub fn with_summary(mut self, summary: impl Into<String>) -> Self {
+        self.summary = Some(summary.into());
+        self
+    }
+
+    /// Set the score.
+    #[must_use]
+    pub fn with_score(mut self, score: f32) -> Self {
+        self.score = score;
+        self
+    }
+
+    /// Set the depth.
+    #[must_use]
+    pub fn with_depth(mut self, depth: usize) -> Self {
+        self.depth = depth;
+        self
+    }
+
+    /// Set the page range.
+    #[must_use]
+    pub fn with_page_range(mut self, start: usize, end: usize) -> Self {
+        self.page_range = Some((start, end));
+        self
+    }
+}
+
+/// Complete reasoning chain for a retrieval operation.
+///
+/// Provides an ordered, auditable trace of every decision the engine made
+/// from query analysis through final evaluation.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct ReasoningChain {
+    /// Ordered reasoning steps.
+    pub steps: Vec<ReasoningStep>,
+}
+
+impl ReasoningChain {
+    /// Create an empty reasoning chain.
+    #[must_use]
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Append a reasoning step.
+    pub fn push(&mut self, step: ReasoningStep) {
+        self.steps.push(step);
+    }
+
+    /// Number of reasoning steps.
+    #[must_use]
+    pub fn len(&self) -> usize {
+        self.steps.len()
+    }
+
+    /// Whether the chain is empty.
+    #[must_use]
+    pub fn is_empty(&self) -> bool {
+        self.steps.is_empty()
+    }
+}
+
+/// A single step in the reasoning chain.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct ReasoningStep {
+    /// Human-readable explanation of the decision.
+    pub reasoning: String,
+}
diff --git a/vectorless-core/vectorless-scoring/Cargo.toml b/vectorless-core/vectorless-scoring/Cargo.toml
new file mode 100644
index 00000000..f1c169d2
--- /dev/null
+++ b/vectorless-core/vectorless-scoring/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "vectorless-scoring"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+
+[dependencies]
+bm25 = { workspace = true }
+regex = { workspace = true }
+async-trait = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/vectorless-core/vectorless-scoring/src/bm25.rs b/vectorless-core/vectorless-scoring/src/bm25.rs
new file mode 100644
index 00000000..8bc20085
--- /dev/null
+++ b/vectorless-core/vectorless-scoring/src/bm25.rs
@@ -0,0 +1,690 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! BM25 scoring module using the `bm25` crate.
+//!
+//! This module provides:
+//! - Per-field weighting for document scoring
+//! - Configurable length normalization
+//! - IDF caching for efficient scoring
+//! 
- Query expansion support
+
+use std::hash::Hash;
+
+use bm25::{
+    DefaultTokenizer, Embedder, EmbedderBuilder, Language, ScoredDocument, Scorer, Tokenizer,
+};
+
+/// Field weights for BM25 scoring.
+///
+/// Different document fields can have different importance.
+/// For example, title matches are typically more important than content matches.
+#[derive(Debug, Clone, Copy)]
+pub struct FieldWeights {
+    /// Weight for title field matches.
+    pub title: f32,
+    /// Weight for summary field matches.
+    pub summary: f32,
+    /// Weight for content field matches.
+    pub content: f32,
+}
+
+impl Default for FieldWeights {
+    fn default() -> Self {
+        Self {
+            title: 2.0,
+            summary: 1.5,
+            content: 1.0,
+        }
+    }
+}
+
+/// BM25 parameters for fine-tuning.
+#[derive(Debug, Clone, Copy)]
+pub struct Bm25Params {
+    /// Term frequency saturation parameter (k1).
+    /// Controls how quickly term frequency saturates.
+    /// Typical value: 1.2
+    pub k1: f32,
+    /// Length normalization parameter (b).
+    /// Controls how much document length affects scoring.
+    /// - 0.0: No length normalization
+    /// - 1.0: Full length normalization
+    /// Typical value: 0.75
+    pub b: f32,
+    /// Average document length.
+    /// If not known, can be estimated or set to 1.0 with b=0.
+    pub avgdl: f32,
+}
+
+impl Default for Bm25Params {
+    fn default() -> Self {
+        Self {
+            k1: 1.2,
+            b: 0.75,
+            avgdl: 100.0,
+        }
+    }
+}
+
+/// A document with multiple fields for scoring.
+#[derive(Debug, Clone)]
+pub struct FieldDocument<K> {
+    /// Document identifier.
+    pub id: K,
+    /// Title field.
+    pub title: String,
+    /// Summary field.
+    pub summary: String,
+    /// Content field.
+    pub content: String,
+}
+
+impl<K> FieldDocument<K> {
+    /// Create a new field document.
+    pub fn new(id: K, title: String, summary: String, content: String) -> Self {
+        Self {
+            id,
+            title,
+            summary,
+            content,
+        }
+    }
+
+    /// Get combined text for embedding.
+    fn combined_text(&self) -> String {
+        format!("{} {} {}", self.title, self.summary, self.content)
+    }
+}
+
+/// Key for field-specific document storage.
+#[derive(Debug, Clone, Hash, Eq, PartialEq)]
+struct FieldKey<K> {
+    doc_id: K,
+    field: Field,
+}
+
+#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)]
+enum Field {
+    Title,
+    Summary,
+    Content,
+}
+
+/// BM25 engine with per-field weighting support.
+///
+/// This wraps the `bm25` crate's Embedder and Scorer to provide:
+/// - Per-field weighting
+/// - Configurable parameters
+/// - IDF caching (handled internally by Scorer)
+pub struct Bm25Engine<K: Hash + Eq> {
+    /// The embedder for creating sparse vectors.
+    embedder: Embedder,
+    /// The scorer for scoring documents (combined text).
+    scorer: Scorer<K>,
+    /// Field-specific scorers for weighted scoring.
+    title_scorer: Scorer<K>,
+    summary_scorer: Scorer<K>,
+    content_scorer: Scorer<K>,
+    /// Field weights.
+    weights: FieldWeights,
+    /// Document count.
+    doc_count: usize,
+    /// Whether the engine has been fitted to a corpus.
+    fitted: bool,
+}
+
+impl<K: Hash + Eq + Clone> Bm25Engine<K> {
+    /// Create a new BM25 engine with default parameters.
+    pub fn new() -> Self {
+        Self::with_params(Bm25Params::default())
+    }
+
+    /// Create a BM25 engine with custom parameters.
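+    ///
+    /// A minimal sketch, assuming the default [`FieldWeights`] are kept
+    /// (the parameter values are illustrative, not tuned):
+    ///
+    /// ```rust,ignore
+    /// let engine: Bm25Engine<u32> = Bm25Engine::with_params(Bm25Params {
+    ///     k1: 1.5,    // let term frequency saturate more slowly
+    ///     b: 0.5,     // soften length normalization
+    ///     avgdl: 80.0,
+    /// });
+    /// ```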
+    pub fn with_params(params: Bm25Params) -> Self {
+        let embedder = EmbedderBuilder::with_avgdl(params.avgdl)
+            .k1(params.k1)
+            .b(params.b)
+            .language_mode(Language::English)
+            .build();
+
+        Self {
+            embedder,
+            scorer: Scorer::new(),
+            title_scorer: Scorer::new(),
+            summary_scorer: Scorer::new(),
+            content_scorer: Scorer::new(),
+            weights: FieldWeights::default(),
+            doc_count: 0,
+            fitted: false,
+        }
+    }
+
+    /// Create a BM25 engine fitted to a corpus.
+    ///
+    /// This calculates the true average document length from the corpus.
+    pub fn fit_to_corpus(documents: &[FieldDocument<K>]) -> Self {
+        // Collect owned strings first
+        let corpus: Vec<String> = documents.iter().map(|d| d.combined_text()).collect();
+        let corpus_refs: Vec<&str> = corpus.iter().map(|s| s.as_str()).collect();
+
+        let embedder = EmbedderBuilder::with_fit_to_corpus(Language::English, &corpus_refs).build();
+
+        let mut engine = Self {
+            embedder,
+            scorer: Scorer::new(),
+            title_scorer: Scorer::new(),
+            summary_scorer: Scorer::new(),
+            content_scorer: Scorer::new(),
+            weights: FieldWeights::default(),
+            doc_count: 0,
+            fitted: true,
+        };
+
+        // Index all documents
+        for doc in documents {
+            engine.upsert(doc);
+        }
+
+        engine
+    }
+
+    /// Set field weights.
+    pub fn with_weights(mut self, weights: FieldWeights) -> Self {
+        self.weights = weights;
+        self
+    }
+
+    /// Set language for tokenization.
+    pub fn with_language(mut self, language: Language) -> Self {
+        self.embedder = EmbedderBuilder::with_avgdl(self.embedder.avgdl())
+            .language_mode(language)
+            .build();
+        self
+    }
+
+    /// Get the average document length.
+    pub fn avgdl(&self) -> f32 {
+        self.embedder.avgdl()
+    }
+
+    /// Check if the engine has been fitted to a corpus.
+    pub fn is_fitted(&self) -> bool {
+        self.fitted
+    }
+
+    /// Upsert a document into the index.
+    ///
+    /// This stores embeddings for each field separately for weighted scoring.
+    pub fn upsert(&mut self, document: &FieldDocument<K>) {
+        let id = &document.id;
+
+        // Embed and store each field separately
+        let title_emb = self.embedder.embed(&document.title);
+        let summary_emb = self.embedder.embed(&document.summary);
+        let content_emb = self.embedder.embed(&document.content);
+
+        self.title_scorer.upsert(id, title_emb);
+        self.summary_scorer.upsert(id, summary_emb);
+        self.content_scorer.upsert(id, content_emb);
+
+        // Also store combined embedding for basic search
+        let combined = self.embedder.embed(&document.combined_text());
+        self.scorer.upsert(id, combined);
+
+        self.doc_count += 1;
+    }
+
+    /// Remove a document from the index.
+    pub fn remove(&mut self, id: &K) {
+        self.scorer.remove(id);
+        self.title_scorer.remove(id);
+        self.summary_scorer.remove(id);
+        self.content_scorer.remove(id);
+        self.doc_count = self.doc_count.saturating_sub(1);
+    }
+
+    /// Get the number of indexed documents.
+    pub fn len(&self) -> usize {
+        self.doc_count
+    }
+
+    /// Check if the index is empty.
+    pub fn is_empty(&self) -> bool {
+        self.doc_count == 0
+    }
+
+    /// Score a single document against a query.
+    ///
+    /// Returns None if the document is not in the index.
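+    ///
+    /// Sketch of the expected shape (mirrors the unit tests below; the id
+    /// and query are illustrative):
+    ///
+    /// ```rust,ignore
+    /// let score = engine.score(&1u32, "rust programming");
+    /// assert!(score.is_some_and(|s| s > 0.0));
+    /// ```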
+    pub fn score(&self, id: &K, query: &str) -> Option<f32> {
+        let query_emb = self.embedder.embed(query);
+
+        // Score each field
+        let title_score = self.title_scorer.score(id, &query_emb)?;
+        let summary_score = self.summary_scorer.score(id, &query_emb)?;
+        let content_score = self.content_scorer.score(id, &query_emb)?;
+
+        // Weighted combination
+        let total_weight = self.weights.title + self.weights.summary + self.weights.content;
+        let weighted_score = (title_score * self.weights.title
+            + summary_score * self.weights.summary
+            + content_score * self.weights.content)
+            / total_weight;
+
+        Some(weighted_score)
+    }
+
+    /// Search for documents matching a query.
+    ///
+    /// Returns documents sorted by score (descending).
+    pub fn search(&self, query: &str, limit: usize) -> Vec<ScoredDocument<K>> {
+        let query_emb = self.embedder.embed(query);
+        self.scorer
+            .matches(&query_emb)
+            .into_iter()
+            .take(limit)
+            .collect()
+    }
+
+    /// Search with per-field weighting.
+    ///
+    /// This is slower but provides more accurate weighted scores.
+    pub fn search_weighted(&self, query: &str, limit: usize) -> Vec<(K, f32)> {
+        let query_emb = self.embedder.embed(query);
+
+        // Get all document IDs from the main scorer
+        let all_results = self.scorer.matches(&query_emb);
+
+        let mut scored: Vec<(K, f32)> = all_results
+            .into_iter()
+            .filter_map(|scored_doc| {
+                let id = scored_doc.id;
+
+                // Get per-field scores
+                let title_score = self.title_scorer.score(&id, &query_emb)?;
+                let summary_score = self.summary_scorer.score(&id, &query_emb)?;
+                let content_score = self.content_scorer.score(&id, &query_emb)?;
+
+                let total_weight = self.weights.title + self.weights.summary + self.weights.content;
+                let weighted_score = (title_score * self.weights.title
+                    + summary_score * self.weights.summary
+                    + content_score * self.weights.content)
+                    / total_weight;
+
+                Some((id, weighted_score))
+            })
+            .collect();
+
+        scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
+        scored.truncate(limit);
+        scored
+    }
+
+    /// Extract keywords from a query (tokenize and filter).
+    pub fn tokenize(&self, text: &str) -> Vec<String> {
+        let tokenizer = DefaultTokenizer::builder()
+            .language_mode(Language::English)
+            .normalization(true)
+            .stopwords(true)
+            .stemming(true)
+            .build();
+        tokenizer.tokenize(text)
+    }
+
+    /// Get the underlying embedder.
+    pub fn embedder(&self) -> &Embedder {
+        &self.embedder
+    }
+
+    /// Get mutable access to the embedder.
+    pub fn embedder_mut(&mut self) -> &mut Embedder {
+        &mut self.embedder
+    }
+}
+
+impl<K: Hash + Eq + Clone> Default for Bm25Engine<K> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Query expansion result from LLM.
+#[derive(Debug, Clone)]
+pub struct ExpandedQuery {
+    /// Original query.
+    pub original: String,
+    /// Expanded terms.
+    pub expansions: Vec<String>,
+    /// Combined query (original + expansions).
+    pub combined: String,
+}
+
+impl ExpandedQuery {
+    /// Create a new expanded query.
+    pub fn new(original: String, expansions: Vec<String>) -> Self {
+        let combined = format!("{} {}", original, expansions.join(" "));
+        Self {
+            original,
+            expansions,
+            combined,
+        }
+    }
+}
+
+/// Query expander trait for LLM-based expansion.
+#[async_trait::async_trait]
+pub trait QueryExpander: Send + Sync {
+    /// Expand a query with related terms.
+    async fn expand(&self, query: &str) -> ExpandedQuery;
+}
+
+/// Common English stop words for keyword filtering.
+#[rustfmt::skip]
+pub const STOPWORDS: &[&str] = &[
+    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
+    "have", "has", "had", "do", "does", "did", "will", "would", "could",
+    "should", "may", "might", "must", "shall", "can", "need", "dare",
+    "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
+    "from", "as", "into", "through", "during", "before", "after", "above",
+    "below", "between", "under", "again", "further", "then", "once", "here",
+    "there", "when", "where", "why", "how", "all", "each", "few", "more",
+    "most", "other", "some", "such", "no", "nor", "not", "only", "own",
+    "same", "so", "than", "too", "very", "just", "and", "but", "if", "or",
+    "because", "until", "while", "about", "what", "which", "who", "whom",
+    "this", "that", "these", "those", "i", "me", "my", "myself", "we",
+    "our", "ours", "ourselves", "you", "your", "yours", "yourself",
+    "yourselves", "he", "him", "his", "himself", "she", "her", "hers",
+    "herself", "it", "its", "itself", "they", "them", "their", "theirs",
+    "themselves",
+];
+
+/// Extract keywords from a query string, filtering stop words.
+///
+/// This is a simple keyword extraction that:
+/// - Converts to lowercase
+/// - Splits on non-alphanumeric characters
+/// - Filters out stop words
+/// - Requires minimum length of 2 characters
+#[must_use]
+pub fn extract_keywords(query: &str) -> Vec<String> {
+    query
+        .to_lowercase()
+        .split(|c: char| !c.is_alphanumeric())
+        .filter(|s| {
+            let s = *s;
+            s.len() > 1 && !STOPWORDS.contains(&s)
+        })
+        .map(String::from)
+        .collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_bm25_engine_creation() {
+        let engine: Bm25Engine<u32> = Bm25Engine::new();
+        assert!(engine.is_empty());
+        assert!(!engine.is_fitted());
+    }
+
+    #[test]
+    fn test_bm25_engine_fit_to_corpus() {
+        let docs = vec![
+            FieldDocument::new(
+                1u32,
+                "Rust Programming".to_string(),
+                "About Rust".to_string(),
+                "Rust is a systems programming language.".to_string(),
+            ),
+            FieldDocument::new(
+                2u32,
+                "Python Guide".to_string(),
+                "About Python".to_string(),
+                "Python is a scripting language.".to_string(),
+            ),
+        ];
+
+        let engine = Bm25Engine::fit_to_corpus(&docs);
+        assert!(engine.is_fitted());
+        assert_eq!(engine.len(), 2);
+    }
+
+    #[test]
+    fn test_bm25_search() {
+        let docs = vec![
+            FieldDocument::new(
+                1u32,
+                "Rust Programming".to_string(),
+                "About Rust".to_string(),
+                "Rust is a systems programming language with memory safety.".to_string(),
+            ),
+            FieldDocument::new(
+                2u32,
+                "Python Guide".to_string(),
+                "About Python".to_string(),
+                "Python is a scripting language for data science.".to_string(),
+            ),
+            FieldDocument::new(
+                3u32,
+                "Rust Memory Safety".to_string(),
+                "Memory in Rust".to_string(),
+                "Rust provides guaranteed memory safety without garbage collection.".to_string(),
+            ),
+        ];
+
+        let engine = Bm25Engine::fit_to_corpus(&docs);
+        let results = engine.search("rust memory", 10);
+
+        assert!(!results.is_empty());
+        // Documents about Rust should rank higher
+        assert!(results.iter().any(|r| r.id == 1 || r.id == 3));
+    }
+
+    #[test]
+    fn test_bm25_weighted_search() {
+        let docs = vec![
+            FieldDocument::new(
+                1u32,
+                "Rust Programming".to_string(),
+                "About memory safety".to_string(),
+                "Content about other things.".to_string(),
+            ),
+            FieldDocument::new(
+                2u32,
+ "Other Language".to_string(), + "About other things".to_string(), + "Rust memory safety is important.".to_string(), + ), + ]; + + let engine = Bm25Engine::fit_to_corpus(&docs).with_weights(FieldWeights { + title: 3.0, + summary: 2.0, + content: 1.0, + }); + + let results = engine.search_weighted("rust", 10); + + // Doc 1 has "Rust" in title, should rank higher + assert_eq!(results.first().map(|(id, _)| *id), Some(1u32)); + } + + #[test] + fn test_bm25_score() { + let docs = vec![FieldDocument::new( + 1u32, + "Rust Programming".to_string(), + "About Rust".to_string(), + "Rust is a systems programming language.".to_string(), + )]; + + let engine = Bm25Engine::fit_to_corpus(&docs); + let score = engine.score(&1u32, "rust programming"); + + assert!(score.is_some()); + assert!(score.unwrap() > 0.0); + } + + #[test] + fn test_bm25_tokenize() { + let engine: Bm25Engine = Bm25Engine::new(); + let tokens = engine.tokenize("What is the Rust programming language?"); + + // Should filter stop words and stem + assert!(tokens.contains(&"rust".to_string())); + assert!(tokens.contains(&"program".to_string())); // stemmed + assert!(!tokens.contains(&"what".to_string())); // stop word + assert!(!tokens.contains(&"the".to_string())); // stop word + } + + #[test] + fn test_bm25_remove() { + let docs = vec![FieldDocument::new( + 1u32, + "Rust".to_string(), + "About Rust".to_string(), + "Rust content.".to_string(), + )]; + + let mut engine = Bm25Engine::fit_to_corpus(&docs); + assert_eq!(engine.len(), 1); + + engine.remove(&1u32); + assert!(engine.is_empty()); + } + + #[test] + fn test_field_weights_default() { + let weights = FieldWeights::default(); + assert!((weights.title - 2.0).abs() < f32::EPSILON); + assert!((weights.summary - 1.5).abs() < f32::EPSILON); + assert!((weights.content - 1.0).abs() < f32::EPSILON); + } + + #[test] + fn test_bm25_params_default() { + let params = Bm25Params::default(); + assert!((params.k1 - 1.2).abs() < f32::EPSILON); + assert!((params.b - 0.75).abs() < f32::EPSILON); + assert!((params.avgdl - 100.0).abs() < f32::EPSILON); + } + + #[test] + fn test_expanded_query() { + let expanded = ExpandedQuery::new( + "rust".to_string(), + vec!["programming".to_string(), "language".to_string()], + ); + + assert_eq!(expanded.original, "rust"); + assert_eq!(expanded.expansions.len(), 2); + assert_eq!(expanded.combined, "rust programming language"); + } +} diff --git a/vectorless-core/vectorless-scoring/src/lib.rs b/vectorless-core/vectorless-scoring/src/lib.rs new file mode 100644 index 00000000..eac4e435 --- /dev/null +++ b/vectorless-core/vectorless-scoring/src/lib.rs @@ -0,0 +1,8 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Scoring utilities — keyword extraction via BM25. 
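+//!
+//! # Example
+//!
+//! A minimal sketch of keyword extraction (lowercased, stop words and
+//! single-character tokens dropped), mirroring the crate's own tests:
+//!
+//! ```rust,ignore
+//! use vectorless_scoring::extract_keywords;
+//!
+//! let keywords = extract_keywords("What is the Rust programming language?");
+//! assert!(keywords.contains(&"rust".to_string()));
+//! assert!(!keywords.contains(&"the".to_string())); // stop word removed
+//! ```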
+ +pub mod bm25; + +pub use bm25::extract_keywords; diff --git a/vectorless-core/vectorless-storage/Cargo.toml b/vectorless-core/vectorless-storage/Cargo.toml new file mode 100644 index 00000000..e15a0c50 --- /dev/null +++ b/vectorless-core/vectorless-storage/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "vectorless-storage" +version.workspace = true +edition.workspace = true +authors.workspace = true +description.workspace = true +license.workspace = true +repository.workspace = true +homepage.workspace = true + +[dependencies] +vectorless-config = { path = "../vectorless-config" } +vectorless-document = { path = "../vectorless-document" } +vectorless-error = { path = "../vectorless-error" } +vectorless-utils = { path = "../vectorless-utils" } +vectorless-graph = { path = "../vectorless-graph" } +tokio = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +flate2 = { workspace = true } +lru = { workspace = true } +tracing = { workspace = true } +chrono = { workspace = true } +uuid = { workspace = true } +sha2 = { workspace = true } +base64 = { workspace = true } +parking_lot = { workspace = true } +regex = { workspace = true } +thiserror = { workspace = true } + +[target.'cfg(unix)'.dependencies] +libc = { workspace = true } + +[lints] +workspace = true diff --git a/vectorless-core/vectorless-storage/src/backend/file.rs b/vectorless-core/vectorless-storage/src/backend/file.rs new file mode 100644 index 00000000..454ca5e4 --- /dev/null +++ b/vectorless-core/vectorless-storage/src/backend/file.rs @@ -0,0 +1,293 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! File system storage backend. + +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::RwLock; + +use tracing::debug; + +use super::StorageBackend; +use vectorless_error::Error; +use vectorless_error::Result; + +/// File system storage backend. +/// +/// Stores each key-value pair as a separate file in a directory. +/// The key is used as the filename (with `.bin` extension). +/// +/// # Structure +/// +/// ```text +/// workspace/ +/// ├── doc-1.bin # Document 1 +/// ├── doc-2.bin # Document 2 +/// ├── meta.bin # Metadata index +/// └── .workspace.lock # Lock file +/// ``` +/// +/// # Thread Safety +/// +/// Uses `RwLock` for thread-safe operations on the directory listing cache. +#[derive(Debug)] +pub struct FileBackend { + /// Root directory for storage. + root: PathBuf, + /// Cached directory listing (refreshed on miss). + cache: RwLock>>, +} + +impl FileBackend { + /// Create a new file backend at the given path. + /// + /// Creates the directory if it doesn't exist. + pub fn new(path: impl Into) -> Result { + let root = path.into(); + fs::create_dir_all(&root).map_err(Error::Io)?; + + Ok(Self { + root, + cache: RwLock::new(None), + }) + } + + /// Open an existing file backend. + /// + /// Creates the directory if it doesn't exist. + pub fn open(path: impl Into) -> Result { + Self::new(path) + } + + /// Get the root path. + pub fn root(&self) -> &Path { + &self.root + } + + /// Convert a key to a file path. + fn key_to_path(&self, key: &str) -> PathBuf { + // Sanitize key to prevent path traversal + let sanitized = key.replace("..", "_").replace(['/', '\\', ':'], "_"); + self.root.join(format!("{}.bin", sanitized)) + } + + /// Refresh the directory listing cache. + fn refresh_cache(&self) -> Result> { + let entries: Vec = fs::read_dir(&self.root) + .map_err(Error::Io)? 
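+            // Ignore unreadable entries; keep only the stems of `.bin` files.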
+ .filter_map(|entry| entry.ok()) + .filter_map(|entry| { + let path = entry.path(); + if path.extension()?.to_str()? == "bin" { + path.file_stem()?.to_str().map(|s| s.to_string()) + } else { + None + } + }) + .collect(); + + // Update cache + if let Ok(mut cache) = self.cache.write() { + *cache = Some(entries.clone()); + } + + Ok(entries) + } + + /// Get cached keys or refresh cache. + fn get_keys(&self) -> Result> { + // Try to read from cache first + if let Ok(cache) = self.cache.read() { + if let Some(ref keys) = *cache { + return Ok(keys.clone()); + } + } + + // Refresh cache + self.refresh_cache() + } + + /// Invalidate the cache. + pub fn invalidate_cache(&self) { + if let Ok(mut cache) = self.cache.write() { + *cache = None; + } + } +} + +impl StorageBackend for FileBackend { + fn get(&self, key: &str) -> Result>> { + let path = self.key_to_path(key); + + if !path.exists() { + return Ok(None); + } + + let data = fs::read(&path).map_err(Error::Io)?; + debug!("Read {} bytes from {}", data.len(), key); + + Ok(Some(data)) + } + + fn put(&self, key: &str, value: &[u8]) -> Result<()> { + let path = self.key_to_path(key); + + // Use atomic write (temp file + rename) + let temp_path = path.with_extension("tmp"); + + fs::write(&temp_path, value).map_err(Error::Io)?; + fs::rename(&temp_path, &path).map_err(Error::Io)?; + + // Invalidate cache + self.invalidate_cache(); + + debug!("Wrote {} bytes to {}", value.len(), key); + Ok(()) + } + + fn delete(&self, key: &str) -> Result { + let path = self.key_to_path(key); + + if !path.exists() { + return Ok(false); + } + + fs::remove_file(&path).map_err(Error::Io)?; + + // Invalidate cache + self.invalidate_cache(); + + debug!("Deleted {}", key); + Ok(true) + } + + fn exists(&self, key: &str) -> Result { + let path = self.key_to_path(key); + Ok(path.exists()) + } + + fn keys(&self) -> Result> { + self.get_keys() + } + + fn len(&self) -> Result { + Ok(self.get_keys()?.len()) + } + + fn clear(&self) -> Result<()> { + let keys = self.get_keys()?; + + for key in &keys { + let path = self.key_to_path(key); + if path.exists() { + fs::remove_file(&path).map_err(Error::Io)?; + } + } + + // Clear cache + if let Ok(mut cache) = self.cache.write() { + *cache = None; + } + + debug!("Cleared {} entries", keys.len()); + Ok(()) + } + + fn backend_name(&self) -> &'static str { + "file" + } + + fn batch_put(&self, items: &[(&str, &[u8])]) -> Result<()> { + for (key, value) in items { + self.put(key, value)?; + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_file_backend_basic() { + let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + // Put and get + backend.put("key1", b"value1").unwrap(); + let value = backend.get("key1").unwrap(); + assert_eq!(value, Some(b"value1".to_vec())); + + // Exists + assert!(backend.exists("key1").unwrap()); + assert!(!backend.exists("key2").unwrap()); + + // Delete + assert!(backend.delete("key1").unwrap()); + assert!(!backend.exists("key1").unwrap()); + assert!(!backend.delete("key1").unwrap()); // Already deleted + } + + #[test] + fn test_file_backend_keys() { + let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + backend.put("key1", b"v1").unwrap(); + backend.put("key2", b"v2").unwrap(); + backend.put("key3", b"v3").unwrap(); + + let keys = backend.keys().unwrap(); + assert_eq!(keys.len(), 3); + assert!(keys.contains(&"key1".to_string())); + } + + #[test] + fn test_file_backend_clear() { + 
let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + backend.put("key1", b"v1").unwrap(); + backend.put("key2", b"v2").unwrap(); + + backend.clear().unwrap(); + + assert!(backend.is_empty().unwrap()); + } + + #[test] + fn test_file_backend_batch() { + let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + let items: Vec<(&str, &[u8])> = vec![ + ("k1", b"v1".as_slice()), + ("k2", b"v2".as_slice()), + ("k3", b"v3".as_slice()), + ]; + + backend.batch_put(&items).unwrap(); + + let results = backend.batch_get(&["k1", "k2", "k3", "k4"]).unwrap(); + assert_eq!(results.len(), 4); + assert!(results[0].is_some()); + assert!(results[3].is_none()); + } + + #[test] + fn test_file_backend_key_sanitization() { + let temp = TempDir::new().unwrap(); + let backend = FileBackend::new(temp.path()).unwrap(); + + // Keys with special characters should be sanitized + backend.put("../etc/passwd", b"malicious").unwrap(); + backend.put("path/to/file", b"nested").unwrap(); + + // Both should be stored safely + assert!(backend.exists("../etc/passwd").unwrap()); + assert!(backend.exists("path/to/file").unwrap()); + } +} diff --git a/vectorless-core/vectorless-storage/src/backend/memory.rs b/vectorless-core/vectorless-storage/src/backend/memory.rs new file mode 100644 index 00000000..3d9b3be2 --- /dev/null +++ b/vectorless-core/vectorless-storage/src/backend/memory.rs @@ -0,0 +1,181 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! In-memory storage backend (for testing). + +use std::collections::HashMap; +use std::sync::RwLock; + +use super::StorageBackend; +use vectorless_error::Result; + +/// In-memory storage backend. +/// +/// Stores all data in a `HashMap`. Useful for testing and scenarios +/// where persistence is not required. +/// +/// # Thread Safety +/// +/// Uses `RwLock` for thread-safe access to the internal map. +#[derive(Debug, Default)] +pub struct MemoryBackend { + /// Internal storage. + data: RwLock>>, +} + +impl MemoryBackend { + /// Create a new in-memory backend. + pub fn new() -> Self { + Self::default() + } + + /// Create a new in-memory backend with pre-seeded data. 
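+    ///
+    /// # Example
+    ///
+    /// A small sketch with illustrative keys and values:
+    ///
+    /// ```rust,ignore
+    /// use std::collections::HashMap;
+    /// use vectorless_storage::backend::{MemoryBackend, StorageBackend};
+    ///
+    /// let mut seed = HashMap::new();
+    /// seed.insert("doc-1".to_string(), b"data".to_vec());
+    /// let backend = MemoryBackend::with_data(seed);
+    /// assert_eq!(backend.len().unwrap(), 1);
+    /// ```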
+ pub fn with_data(data: HashMap>) -> Self { + Self { + data: RwLock::new(data), + } + } +} + +impl StorageBackend for MemoryBackend { + fn get(&self, key: &str) -> Result>> { + let data = self + .data + .read() + .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + Ok(data.get(key).cloned()) + } + + fn put(&self, key: &str, value: &[u8]) -> Result<()> { + let mut data = self + .data + .write() + .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + data.insert(key.to_string(), value.to_vec()); + Ok(()) + } + + fn delete(&self, key: &str) -> Result { + let mut data = self + .data + .write() + .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + Ok(data.remove(key).is_some()) + } + + fn exists(&self, key: &str) -> Result { + let data = self + .data + .read() + .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + Ok(data.contains_key(key)) + } + + fn keys(&self) -> Result> { + let data = self + .data + .read() + .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + Ok(data.keys().cloned().collect()) + } + + fn len(&self) -> Result { + let data = self + .data + .read() + .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + Ok(data.len()) + } + + fn clear(&self) -> Result<()> { + let mut data = self + .data + .write() + .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + data.clear(); + Ok(()) + } + + fn batch_put(&self, items: &[(&str, &[u8])]) -> Result<()> { + let mut data = self + .data + .write() + .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + for (key, value) in items { + data.insert(key.to_string(), value.to_vec()); + } + Ok(()) + } + + fn backend_name(&self) -> &'static str { + "memory" + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_memory_backend_basic() { + let backend = MemoryBackend::new(); + + // Put and get + backend.put("key1", b"value1").unwrap(); + let value = backend.get("key1").unwrap(); + assert_eq!(value, Some(b"value1".to_vec())); + + // Non-existent key + let missing = backend.get("missing").unwrap(); + assert!(missing.is_none()); + } + + #[test] + fn test_memory_backend_delete() { + let backend = MemoryBackend::new(); + + backend.put("key1", b"value1").unwrap(); + assert!(backend.exists("key1").unwrap()); + + let deleted = backend.delete("key1").unwrap(); + assert!(deleted); + assert!(!backend.exists("key1").unwrap()); + + // Delete non-existent + let not_deleted = backend.delete("missing").unwrap(); + assert!(!not_deleted); + } + + #[test] + fn test_memory_backend_keys() { + let backend = MemoryBackend::new(); + + backend.put("key1", b"v1").unwrap(); + backend.put("key2", b"v2").unwrap(); + backend.put("key3", b"v3").unwrap(); + + let keys = backend.keys().unwrap(); + assert_eq!(keys.len(), 3); + } + + #[test] + fn test_memory_backend_clear() { + let backend = MemoryBackend::new(); + + backend.put("key1", b"v1").unwrap(); + backend.put("key2", b"v2").unwrap(); + + backend.clear().unwrap(); + assert!(backend.is_empty().unwrap()); + } + + #[test] + fn test_memory_backend_with_data() { + let mut initial = HashMap::new(); + initial.insert("k1".to_string(), b"v1".to_vec()); + initial.insert("k2".to_string(), b"v2".to_vec()); + + let backend = MemoryBackend::with_data(initial); + assert_eq!(backend.len().unwrap(), 2); + } +} 
diff --git a/vectorless-core/vectorless-storage/src/backend/mod.rs b/vectorless-core/vectorless-storage/src/backend/mod.rs
new file mode 100644
index 00000000..a8bc8053
--- /dev/null
+++ b/vectorless-core/vectorless-storage/src/backend/mod.rs
@@ -0,0 +1,35 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Storage backend abstraction.
+//!
+//! This module provides a trait-based abstraction for different storage backends,
+//! allowing the workspace to work with various storage systems:
+//!
+//! - **FileBackend**: File system storage (default)
+//! - **MemoryBackend**: In-memory storage (for testing)
+//!
+//! # Example
+//!
+//! ```rust,ignore
+//! use vectorless::storage::backend::{StorageBackend, FileBackend};
+//!
+//! let backend = FileBackend::new("./workspace")?;
+//!
+//! // Store data
+//! backend.put("doc-1", b"document data")?;
+//!
+//! // Retrieve data
+//! let data = backend.get("doc-1")?;
+//!
+//! // List all keys
+//! let keys = backend.keys()?;
+//! ```
+
+mod file;
+mod memory;
+mod trait_def;
+
+pub use file::FileBackend;
+pub use memory::MemoryBackend;
+pub use trait_def::StorageBackend;
diff --git a/vectorless-core/vectorless-storage/src/backend/trait_def.rs b/vectorless-core/vectorless-storage/src/backend/trait_def.rs
new file mode 100644
index 00000000..9d74f232
--- /dev/null
+++ b/vectorless-core/vectorless-storage/src/backend/trait_def.rs
@@ -0,0 +1,125 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Storage backend trait definition.
+
+use std::fmt::Debug;
+
+use vectorless_error::Result;
+
+/// Storage backend trait for abstracting different storage systems.
+///
+/// This trait provides a simple key-value interface for document storage.
+/// Implementations can use different underlying storage systems:
+///
+/// - File system
+/// - In-memory (for testing)
+/// - Database (SQLite, RocksDB, etc.)
+/// - Cloud storage (S3, etc.)
+///
+/// # Thread Safety
+///
+/// All implementations must be `Send + Sync` to support concurrent access.
+pub trait StorageBackend: Debug + Send + Sync {
+    /// Get a value by key.
+    ///
+    /// Returns `None` if the key doesn't exist.
+    fn get(&self, key: &str) -> Result<Option<Vec<u8>>>;
+
+    /// Store a value with the given key.
+    ///
+    /// Overwrites any existing value.
+    fn put(&self, key: &str, value: &[u8]) -> Result<()>;
+
+    /// Delete a value by key.
+    ///
+    /// Returns `true` if the value was deleted, `false` if it didn't exist.
+    fn delete(&self, key: &str) -> Result<bool>;
+
+    /// Check if a key exists.
+    fn exists(&self, key: &str) -> Result<bool>;
+
+    /// List all keys in the storage.
+    fn keys(&self) -> Result<Vec<String>>;
+
+    /// Get the number of entries in storage.
+    fn len(&self) -> Result<usize>;
+
+    /// Check if storage is empty.
+    fn is_empty(&self) -> Result<bool> {
+        Ok(self.len()? == 0)
+    }
+
+    /// Clear all entries from storage.
+    fn clear(&self) -> Result<()>;
+
+    // ========================================================================
+    // Batch operations (optional, default implementations)
+    // ========================================================================
+
+    /// Get multiple values by keys.
+    ///
+    /// Returns a vector of options, one for each key.
+    fn batch_get(&self, keys: &[&str]) -> Result<Vec<Option<Vec<u8>>>> {
+        keys.iter().map(|k| self.get(k)).collect()
+    }
+
+    /// Store multiple key-value pairs.
+    ///
+    /// Default implementation calls `put` for each item.
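+    ///
+    /// # Example
+    ///
+    /// A sketch of a batched write against any `backend` implementing this trait
+    /// (keys and values are illustrative):
+    ///
+    /// ```rust,ignore
+    /// let items: Vec<(&str, &[u8])> = vec![
+    ///     ("doc-1", b"first".as_slice()),
+    ///     ("doc-2", b"second".as_slice()),
+    /// ];
+    /// backend.batch_put(&items)?;
+    /// ```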
+    fn batch_put(&self, items: &[(&str, &[u8])]) -> Result<()> {
+        for (key, value) in items {
+            self.put(key, value)?;
+        }
+        Ok(())
+    }
+
+    /// Delete multiple keys.
+    ///
+    /// Returns the number of keys that were actually deleted.
+    fn batch_delete(&self, keys: &[&str]) -> Result<usize> {
+        let mut count = 0;
+        for key in keys {
+            if self.delete(key)? {
+                count += 1;
+            }
+        }
+        Ok(count)
+    }
+
+    // ========================================================================
+    // Metadata operations
+    // ========================================================================
+
+    /// Get storage backend name.
+    fn backend_name(&self) -> &'static str;
+
+    /// Get storage statistics.
+    fn stats(&self) -> StorageStats {
+        StorageStats {
+            backend: self.backend_name().to_string(),
+            entries: self.len().unwrap_or(0),
+        }
+    }
+}
+
+/// Storage statistics.
+#[derive(Debug, Clone)]
+pub struct StorageStats {
+    /// Backend name.
+    pub backend: String,
+    /// Number of entries.
+    pub entries: usize,
+}
diff --git a/vectorless-core/vectorless-storage/src/cache.rs b/vectorless-core/vectorless-storage/src/cache.rs
new file mode 100644
index 00000000..57ca2b24
--- /dev/null
+++ b/vectorless-core/vectorless-storage/src/cache.rs
@@ -0,0 +1,391 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document cache with LRU eviction policy.
+//!
+//! This module provides a thread-safe LRU cache for loaded documents,
+//! allowing efficient reuse of loaded document data while limiting memory usage.
+//!
+//! # Metrics
+//!
+//! The cache tracks:
+//! - Hits: Number of successful cache lookups
+//! - Misses: Number of failed cache lookups
+//! - Evictions: Number of entries evicted due to capacity
+//! - Utilization: Current usage as percentage of capacity
+
+use std::num::NonZeroUsize;
+use std::sync::Mutex;
+use std::sync::atomic::{AtomicU64, Ordering};
+
+use lru::LruCache;
+
+use super::persistence::PersistedDocument;
+use vectorless_error::Error;
+use vectorless_error::Result;
+
+/// Default cache size (number of documents).
+const DEFAULT_CACHE_SIZE: usize = 100;
+
+/// A thread-safe LRU cache for documents.
+///
+/// Uses interior mutability via `Mutex` for safe concurrent access.
+/// The cache automatically evicts least-recently-used entries when full.
+///
+/// # Metrics
+///
+/// The cache maintains atomic counters for:
+/// - **hits**: Successful cache lookups
+/// - **misses**: Failed cache lookups (document not in cache)
+/// - **evictions**: Entries removed due to capacity limits
+#[derive(Debug)]
+pub struct DocumentCache {
+    /// Inner cache protected by Mutex.
+    inner: Mutex<LruCache<String, PersistedDocument>>,
+    /// Maximum capacity.
+    capacity: usize,
+    /// Number of cache hits.
+    hits: AtomicU64,
+    /// Number of cache misses.
+    misses: AtomicU64,
+    /// Number of cache evictions.
+    evictions: AtomicU64,
+}
+
+impl DocumentCache {
+    /// Create a new cache with default capacity (100 documents).
+    #[must_use]
+    pub fn new() -> Self {
+        Self::with_capacity(DEFAULT_CACHE_SIZE)
+    }
+
+    /// Create a new cache with custom capacity.
+    ///
+    /// Capacities below 1 are normalized to 1, so this never panics.
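+    ///
+    /// # Example
+    ///
+    /// ```rust,ignore
+    /// let cache = DocumentCache::with_capacity(10);
+    /// assert_eq!(cache.capacity(), 10);
+    /// assert!(cache.is_empty());
+    /// ```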
+    #[must_use]
+    pub fn with_capacity(capacity: usize) -> Self {
+        let capacity = capacity.max(1);
+        let non_zero =
+            NonZeroUsize::new(capacity).expect("capacity is normalized to at least 1");
+
+        Self {
+            inner: Mutex::new(LruCache::new(non_zero)),
+            capacity,
+            hits: AtomicU64::new(0),
+            misses: AtomicU64::new(0),
+            evictions: AtomicU64::new(0),
+        }
+    }
+
+    /// Get a document from the cache.
+    ///
+    /// Returns `None` if the document is not in the cache.
+    /// Updates the access order (moves to most-recently-used).
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the cache lock is poisoned.
+    pub fn get(&self, id: &str) -> Result<Option<PersistedDocument>> {
+        let mut cache = self.lock()?;
+        let result = cache.get(id).cloned();
+
+        // Update metrics
+        if result.is_some() {
+            self.hits.fetch_add(1, Ordering::Relaxed);
+        } else {
+            self.misses.fetch_add(1, Ordering::Relaxed);
+        }
+
+        Ok(result)
+    }
+
+    /// Check if a document is in the cache.
+    pub fn contains(&self, id: &str) -> bool {
+        self.lock().map(|cache| cache.contains(id)).unwrap_or(false)
+    }
+
+    /// Put a document into the cache.
+    ///
+    /// If the cache is full and `id` is a new key, the least-recently-used
+    /// entry is evicted. Returns the value previously stored under `id`, if any.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the cache lock is poisoned.
+    pub fn put(&self, id: String, doc: PersistedDocument) -> Result<Option<PersistedDocument>> {
+        let mut cache = self.lock()?;
+
+        // An eviction happens only when the cache is full and a new key
+        // displaces the LRU entry; replacing an existing key evicts nothing.
+        let is_replacement = cache.contains(&id);
+        let was_full = cache.len() >= self.capacity;
+
+        let previous = cache.put(id, doc);
+
+        if was_full && !is_replacement {
+            self.evictions.fetch_add(1, Ordering::Relaxed);
+        }
+
+        Ok(previous)
+    }
+
+    /// Remove a document from the cache.
+    ///
+    /// Returns the removed document if it was in the cache.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the cache lock is poisoned.
+    pub fn remove(&self, id: &str) -> Result<Option<PersistedDocument>> {
+        let mut cache = self.lock()?;
+        Ok(cache.pop(id))
+    }
+
+    /// Clear all entries from the cache.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the cache lock is poisoned.
+    pub fn clear(&self) -> Result<()> {
+        let mut cache = self.lock()?;
+        cache.clear();
+        Ok(())
+    }
+
+    /// Get the number of entries currently in the cache.
+    pub fn len(&self) -> usize {
+        self.lock().map(|cache| cache.len()).unwrap_or(0)
+    }
+
+    /// Check if the cache is empty.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Get the maximum capacity of the cache.
+    pub fn capacity(&self) -> usize {
+        self.capacity
+    }
+
+    /// Get cache utilization (0.0 to 1.0).
+    pub fn utilization(&self) -> f64 {
+        let len = self.len();
+        if self.capacity == 0 {
+            return 0.0;
+        }
+        len as f64 / self.capacity as f64
+    }
+
+    /// Get all document IDs currently in the cache.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error if the cache lock is poisoned.
+    pub fn keys(&self) -> Result<Vec<String>> {
+        let cache = self.lock()?;
+        Ok(cache.iter().map(|(k, _)| k.clone()).collect())
+    }
+
+    /// Get cache statistics including metrics.
+    pub fn stats(&self) -> CacheStats {
+        CacheStats {
+            len: self.len(),
+            capacity: self.capacity,
+            utilization: self.utilization(),
+            hits: self.hits.load(Ordering::Relaxed),
+            misses: self.misses.load(Ordering::Relaxed),
+            evictions: self.evictions.load(Ordering::Relaxed),
+        }
+    }
+
+    /// Get the number of cache hits.
+    pub fn hits(&self) -> u64 {
+        self.hits.load(Ordering::Relaxed)
+    }
+
+    /// Get the number of cache misses.
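+    ///
+    /// A miss means the document was not in the cache and had to be
+    /// loaded from the underlying storage.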
+    pub fn misses(&self) -> u64 {
+        self.misses.load(Ordering::Relaxed)
+    }
+
+    /// Get the number of cache evictions.
+    pub fn evictions(&self) -> u64 {
+        self.evictions.load(Ordering::Relaxed)
+    }
+
+    /// Get the cache hit rate (0.0 to 1.0).
+    pub fn hit_rate(&self) -> f64 {
+        let hits = self.hits.load(Ordering::Relaxed);
+        let misses = self.misses.load(Ordering::Relaxed);
+        let total = hits + misses;
+        if total == 0 {
+            0.0
+        } else {
+            hits as f64 / total as f64
+        }
+    }
+
+    /// Reset all metrics counters to zero.
+    pub fn reset_metrics(&self) {
+        self.hits.store(0, Ordering::Relaxed);
+        self.misses.store(0, Ordering::Relaxed);
+        self.evictions.store(0, Ordering::Relaxed);
+    }
+
+    /// Lock the inner cache.
+    fn lock(&self) -> Result<std::sync::MutexGuard<'_, LruCache<String, PersistedDocument>>> {
+        self.inner
+            .lock()
+            .map_err(|_| Error::Cache("Cache lock poisoned".to_string()))
+    }
+}
+
+impl Default for DocumentCache {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Cache statistics including metrics.
+#[derive(Debug, Clone, Copy)]
+pub struct CacheStats {
+    /// Number of entries in cache.
+    pub len: usize,
+    /// Maximum capacity.
+    pub capacity: usize,
+    /// Utilization (0.0 to 1.0).
+    pub utilization: f64,
+    /// Number of cache hits.
+    pub hits: u64,
+    /// Number of cache misses.
+    pub misses: u64,
+    /// Number of cache evictions.
+    pub evictions: u64,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::{DocumentMeta, PersistedDocument};
+    use vectorless_document::DocumentTree;
+
+    fn create_test_doc(id: &str) -> PersistedDocument {
+        let meta = DocumentMeta::new(id, "Test Doc", "md");
+        let tree = DocumentTree::new("Root", "Content");
+        PersistedDocument::new(meta, tree)
+    }
+
+    #[test]
+    fn test_cache_basic() {
+        let cache = DocumentCache::with_capacity(3);
+
+        // Add documents
+        let doc1 = create_test_doc("doc1");
+        let doc2 = create_test_doc("doc2");
+
+        cache.put("doc1".to_string(), doc1.clone()).unwrap();
+        cache.put("doc2".to_string(), doc2.clone()).unwrap();
+
+        assert_eq!(cache.len(), 2);
+        assert!(cache.contains("doc1"));
+        assert!(cache.contains("doc2"));
+    }
+
+    #[test]
+    fn test_cache_get() {
+        let cache = DocumentCache::with_capacity(3);
+        let doc = create_test_doc("doc1");
+
+        cache.put("doc1".to_string(), doc).unwrap();
+
+        let retrieved = cache.get("doc1").unwrap();
+        assert!(retrieved.is_some());
+        assert_eq!(retrieved.unwrap().meta.id, "doc1");
+
+        let missing = cache.get("missing").unwrap();
+        assert!(missing.is_none());
+    }
+
+    #[test]
+    fn test_cache_eviction() {
+        let cache = DocumentCache::with_capacity(2);
+
+        cache
+            .put("doc1".to_string(), create_test_doc("doc1"))
+            .unwrap();
+        cache
+            .put("doc2".to_string(), create_test_doc("doc2"))
+            .unwrap();
+        cache
+            .put("doc3".to_string(), create_test_doc("doc3"))
+            .unwrap();
+
+        // doc1 should be evicted (least recently used)
+        assert!(!cache.contains("doc1"));
+        assert!(cache.contains("doc2"));
+        assert!(cache.contains("doc3"));
+    }
+
+    #[test]
+    fn test_cache_remove() {
+        let cache = DocumentCache::new();
+
+        cache
+            .put("doc1".to_string(), create_test_doc("doc1"))
+            .unwrap();
+        assert!(cache.contains("doc1"));
+
+        let removed = cache.remove("doc1").unwrap();
+        assert!(removed.is_some());
+        assert!(!cache.contains("doc1"));
+
+        let not_found = cache.remove("missing").unwrap();
+        assert!(not_found.is_none());
+    }
+
+    #[test]
+    fn test_cache_clear() {
+        let cache = DocumentCache::new();
+
+        cache
+            .put("doc1".to_string(), create_test_doc("doc1"))
+            .unwrap();
+        cache
+            .put("doc2".to_string(), create_test_doc("doc2"))
+            .unwrap();
+
+        
assert_eq!(cache.len(), 2); + + cache.clear().unwrap(); + + assert!(cache.is_empty()); + } + + #[test] + fn test_cache_utilization() { + let cache = DocumentCache::with_capacity(10); + + assert_eq!(cache.utilization(), 0.0); + + cache + .put("doc1".to_string(), create_test_doc("doc1")) + .unwrap(); + assert!((cache.utilization() - 0.1).abs() < 0.01); + + cache + .put("doc2".to_string(), create_test_doc("doc2")) + .unwrap(); + assert!((cache.utilization() - 0.2).abs() < 0.01); + } +} diff --git a/vectorless-core/vectorless-storage/src/codec.rs b/vectorless-core/vectorless-storage/src/codec.rs new file mode 100644 index 00000000..ce750222 --- /dev/null +++ b/vectorless-core/vectorless-storage/src/codec.rs @@ -0,0 +1,245 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Codec abstraction for compression and decompression. +//! +//! This module provides a codec trait for compressing/decompressing data, +//! with implementations for: +//! +//! - **Identity**: No compression (pass-through) +//! - **Gzip**: Standard gzip compression +//! +//! # Example +//! +//! ```rust,ignore +//! use vectorless::storage::codec::{Codec, GzipCodec}; +//! +//! let codec = GzipCodec::new(6); +//! +//! let data = b"some data to compress"; +//! let compressed = codec.encode(data)?; +//! let decompressed = codec.decode(&compressed)?; +//! +//! assert_eq!(data.as_slice(), decompressed.as_slice()); +//! ``` + +use std::fmt::Debug; +use std::io::{Read, Write}; + +use flate2::Compression; +use flate2::read::GzDecoder; +use flate2::write::GzEncoder; + +use vectorless_error::Error; +use vectorless_error::Result; + +/// Codec trait for compression/decompression. +pub trait Codec: Debug + Send + Sync { + /// Encode (compress) data. + fn encode(&self, data: &[u8]) -> Result>; + + /// Decode (decompress) data. + fn decode(&self, data: &[u8]) -> Result>; + + /// Get the codec name. + fn name(&self) -> &'static str; +} + +/// Identity codec (no compression). +/// +/// Passes data through unchanged. +#[derive(Debug, Clone, Copy, Default)] +pub struct IdentityCodec; + +impl IdentityCodec { + /// Create a new identity codec. + pub fn new() -> Self { + Self::default() + } +} + +impl Codec for IdentityCodec { + fn encode(&self, data: &[u8]) -> Result> { + Ok(data.to_vec()) + } + + fn decode(&self, data: &[u8]) -> Result> { + Ok(data.to_vec()) + } + + fn name(&self) -> &'static str { + "identity" + } +} + +/// Gzip codec. +/// +/// Uses the `flate2` crate for gzip compression. +#[derive(Debug, Clone)] +pub struct GzipCodec { + /// Compression level (0-9). + level: u32, +} + +impl GzipCodec { + /// Create a new gzip codec with the given compression level. + /// + /// Level is clamped to 0-9: + /// - 0: No compression + /// - 1: Fastest compression + /// - 6: Default (good balance) + /// - 9: Best compression (slowest) + pub fn new(level: u32) -> Self { + Self { + level: level.clamp(0, 9), + } + } + + /// Create a codec with fast compression (level 1). + pub fn fast() -> Self { + Self::new(1) + } + + /// Create a codec with default compression (level 6). + pub fn default_level() -> Self { + Self::new(6) + } + + /// Create a codec with best compression (level 9). 
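+    ///
+    /// # Example
+    ///
+    /// A round-trip sketch (the payload is illustrative):
+    ///
+    /// ```rust,ignore
+    /// let codec = GzipCodec::best();
+    /// let compressed = codec.encode(b"some data to compress")?;
+    /// assert_eq!(codec.decode(&compressed)?, b"some data to compress".to_vec());
+    /// ```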
+ pub fn best() -> Self { + Self::new(9) + } +} + +impl Default for GzipCodec { + fn default() -> Self { + Self::default_level() + } +} + +impl Codec for GzipCodec { + fn encode(&self, data: &[u8]) -> Result> { + let mut encoder = GzEncoder::new(Vec::new(), Compression::new(self.level)); + encoder + .write_all(data) + .map_err(|e| Error::Parse(format!("Gzip encode error: {}", e)))?; + encoder + .finish() + .map_err(|e| Error::Parse(format!("Gzip finish error: {}", e))) + } + + fn decode(&self, data: &[u8]) -> Result> { + let mut decoder = GzDecoder::new(data); + let mut decoded = Vec::new(); + decoder + .read_to_end(&mut decoded) + .map_err(|e| Error::Parse(format!("Gzip decode error: {}", e)))?; + Ok(decoded) + } + + fn name(&self) -> &'static str { + "gzip" + } +} + +/// Create a codec from configuration. +pub fn codec_from_config( + enabled: bool, + algorithm: vectorless_config::CompressionAlgorithm, + level: u32, +) -> Box { + if !enabled { + return Box::new(IdentityCodec::new()); + } + + match algorithm { + vectorless_config::CompressionAlgorithm::Gzip => Box::new(GzipCodec::new(level)), + vectorless_config::CompressionAlgorithm::Zstd => { + // Zstd not implemented yet, fallback to gzip + // TODO: Add zstd support when needed + Box::new(GzipCodec::new(level)) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_identity_codec() { + let codec = IdentityCodec::new(); + let data = b"test data"; + + let encoded = codec.encode(data).unwrap(); + let decoded = codec.decode(&encoded).unwrap(); + + assert_eq!(data.as_slice(), decoded.as_slice()); + assert_eq!(codec.name(), "identity"); + } + + #[test] + fn test_gzip_codec_basic() { + let codec = GzipCodec::default(); + let data = b"Hello, World! This is a test string for compression."; + + let encoded = codec.encode(data).unwrap(); + let decoded = codec.decode(&encoded).unwrap(); + + assert_eq!(data.as_slice(), decoded.as_slice()); + assert_eq!(codec.name(), "gzip"); + + // Compressed should be smaller for repetitive data + // Note: For very small data, gzip overhead might make it larger + let repetitive = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"; + let compressed = codec.encode(repetitive).unwrap(); + assert!(compressed.len() < repetitive.len()); + } + + #[test] + fn test_gzip_codec_levels() { + let data = b"This is test data that should compress well. 
".repeat(100); + let data = data.into_iter().map(|b| b as u8).collect::>(); + + let codec_fast = GzipCodec::fast(); + let codec_best = GzipCodec::best(); + + let compressed_fast = codec_fast.encode(&data).unwrap(); + let compressed_best = codec_best.encode(&data).unwrap(); + + // Both should decompress to the same data + assert_eq!(codec_fast.decode(&compressed_fast).unwrap(), data); + assert_eq!(codec_best.decode(&compressed_best).unwrap(), data); + + // Best compression should be smaller or equal + assert!(compressed_best.len() <= compressed_fast.len()); + } + + #[test] + fn test_gzip_empty_data() { + let codec = GzipCodec::default(); + let data = b""; + + let encoded = codec.encode(data).unwrap(); + let decoded = codec.decode(&encoded).unwrap(); + + assert!(decoded.is_empty()); + } + + #[test] + fn test_codec_from_config() { + use vectorless_config::CompressionAlgorithm; + + // Disabled compression + let codec = codec_from_config(false, CompressionAlgorithm::Gzip, 6); + let data = b"test"; + let encoded = codec.encode(data).unwrap(); + assert_eq!(encoded, data); + + // Enabled compression + let codec = codec_from_config(true, CompressionAlgorithm::Gzip, 6); + let encoded = codec.encode(data).unwrap(); + let decoded = codec.decode(&encoded).unwrap(); + assert_eq!(decoded, data); + } +} diff --git a/vectorless-core/vectorless-storage/src/lib.rs b/vectorless-core/vectorless-storage/src/lib.rs new file mode 100644 index 00000000..ca7c27f3 --- /dev/null +++ b/vectorless-core/vectorless-storage/src/lib.rs @@ -0,0 +1,46 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Storage module for persisting document indices. +//! +//! This module provides: +//! - **Workspace** — An async directory-based document collection manager with LRU cache +//! - **Persistence** — Save/load document trees and metadata with atomic writes +//! - **Cache** — LRU cache for loaded documents +//! - **Lock** — File locking for multi-process safety +//! - **Backend** — Storage backend abstraction (file, memory, etc.) +//! +//! # Example +//! +//! ```rust,no_run +//! use vectorless::storage::{Workspace, PersistedDocument, DocumentMeta}; +//! use vectorless::document::DocumentTree; +//! +//! # #[tokio::main] +//! # async fn main() -> vectorless::error::Result<()> { +//! // Create a workspace +//! let workspace = Workspace::new("./my_workspace").await?; +//! +//! // Add a document +//! let meta = DocumentMeta::new("doc-1", "My Document", "md"); +//! let tree = DocumentTree::new("Root", "Content"); +//! let doc = PersistedDocument::new(meta, tree); +//! workspace.add(&doc).await?; +//! +//! // Load it back (uses LRU cache) +//! let loaded = workspace.load_and_cache("doc-1").await?.unwrap(); +//! # Ok(()) +//! # } +//! ``` + +pub mod backend; +pub mod cache; +pub mod codec; +pub mod lock; +pub mod migration; +mod persistence; +pub mod workspace; + +// Re-export main types +pub use persistence::{DocumentMeta, PageContent, PersistedDocument}; +pub use workspace::Workspace; diff --git a/vectorless-core/vectorless-storage/src/lock.rs b/vectorless-core/vectorless-storage/src/lock.rs new file mode 100644 index 00000000..3783d51f --- /dev/null +++ b/vectorless-core/vectorless-storage/src/lock.rs @@ -0,0 +1,280 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! File locking for workspace safety. +//! +//! Provides cross-process file locking to prevent data corruption +//! when multiple processes access the same workspace. 
+
+// File locking inherently requires unsafe FFI calls.
+#![allow(unsafe_code)]
+
+use std::fs::{File, OpenOptions};
+use std::path::Path;
+
+use vectorless_error::Error;
+use vectorless_error::Result;
+
+/// A file lock that is automatically released when dropped.
+///
+/// Uses `flock` on Unix and `LockFileEx` on Windows.
+#[derive(Debug)]
+pub struct FileLock {
+    /// The locked file handle.
+    file: Option<File>,
+    /// Path to the lock file (for debugging).
+    path: std::path::PathBuf,
+    /// Whether the lock is held exclusively.
+    exclusive: bool,
+}
+
+impl FileLock {
+    /// Try to acquire a file lock.
+    ///
+    /// # Arguments
+    ///
+    /// * `path` - Path to the lock file (will be created if it doesn't exist)
+    /// * `exclusive` - If true, acquires an exclusive (write) lock; otherwise a shared (read) lock
+    ///
+    /// # Errors
+    ///
+    /// Returns `Error::WorkspaceLocked` if the lock is held by another process.
+    pub fn try_lock(path: impl Into<std::path::PathBuf>, exclusive: bool) -> Result<Self> {
+        let path = path.into();
+
+        // Ensure parent directory exists
+        if let Some(parent) = path.parent() {
+            std::fs::create_dir_all(parent).map_err(Error::Io)?;
+        }
+
+        // Open or create the lock file
+        let file = OpenOptions::new()
+            .read(true)
+            .write(true)
+            .create(true)
+            .truncate(false)
+            .open(&path)
+            .map_err(Error::Io)?;
+
+        // Try to acquire the lock
+        #[cfg(unix)]
+        {
+            let fd = std::os::unix::io::AsRawFd::as_raw_fd(&file);
+
+            let result = if exclusive {
+                unsafe { libc::flock(fd, libc::LOCK_EX | libc::LOCK_NB) }
+            } else {
+                unsafe { libc::flock(fd, libc::LOCK_SH | libc::LOCK_NB) }
+            };
+
+            if result != 0 {
+                return Err(Error::WorkspaceLocked);
+            }
+
+            Ok(Self {
+                file: Some(file),
+                path,
+                exclusive,
+            })
+        }
+
+        #[cfg(windows)]
+        {
+            use windows_sys::Win32::Storage::FileSystem::{
+                LOCKFILE_EXCLUSIVE_LOCK, LOCKFILE_FAIL_IMMEDIATELY, LockFileEx,
+            };
+
+            let handle = std::os::windows::io::AsRawHandle::as_raw_handle(&file);
+
+            let mut overlapped = std::mem::MaybeUninit::zeroed();
+            let result = unsafe {
+                LockFileEx(
+                    handle,
+                    if exclusive {
+                        LOCKFILE_EXCLUSIVE_LOCK
+                    } else {
+                        0
+                    } | LOCKFILE_FAIL_IMMEDIATELY,
+                    0,
+                    0xFFFFFFFF,
+                    0xFFFFFFFF,
+                    overlapped.as_mut_ptr(),
+                )
+            };
+
+            if result == 0 {
+                return Err(Error::WorkspaceLocked);
+            }
+
+            Ok(Self {
+                file: Some(file),
+                path,
+                exclusive,
+            })
+        }
+
+        #[cfg(not(any(unix, windows)))]
+        {
+            // Fallback: No file locking available
+            // Just keep the file open, which provides some protection
+            Ok(Self {
+                file: Some(file),
+                path,
+                exclusive,
+            })
+        }
+    }
+
+    /// Try to acquire a lock without blocking.
+    ///
+    /// Returns `Ok(Some(FileLock))` if the lock was acquired, or `Ok(None)` if it
+    /// is already held elsewhere.
+    pub fn try_lock_no_wait(
+        path: impl Into<std::path::PathBuf>,
+        exclusive: bool,
+    ) -> Result<Option<Self>> {
+        match Self::try_lock(path, exclusive) {
+            Ok(lock) => Ok(Some(lock)),
+            Err(Error::WorkspaceLocked) => Ok(None),
+            Err(e) => Err(e),
+        }
+    }
+
+    /// Check if the lock file is locked by another process.
+    ///
+    /// This is useful for checking without acquiring a lock.
+    pub fn is_locked(path: impl Into<std::path::PathBuf>) -> bool {
+        Self::try_lock(path, false).is_err()
+    }
+
+    /// Release the lock.
+    pub fn unlock(mut self) {
+        if let Some(file) = self.file.take() {
+            // File will be unlocked when dropped
+            drop(file);
+        }
+    }
+
+    /// Get the lock file path.
+ pub fn path(&self) -> &Path { + &self.path + } + + /// Check if this is an exclusive lock. + pub fn is_exclusive(&self) -> bool { + self.exclusive + } +} + +impl Drop for FileLock { + fn drop(&mut self) { + if let Some(file) = self.file.take() { + // File descriptor closed, lock automatically released + drop(file); + } + } +} + +/// A scoped lock guard that releases the lock when dropped. +/// +/// This is useful for ensuring the lock is released even on panic. +pub struct ScopedLock { + lock: Option, +} + +impl ScopedLock { + /// Acquire a scoped lock. + pub fn new(path: impl Into, exclusive: bool) -> Result { + let lock = FileLock::try_lock(path, exclusive)?; + Ok(Self { lock: Some(lock) }) + } + + /// Release the lock early. + pub fn release(mut self) { + if let Some(lock) = self.lock.take() { + lock.unlock(); + } + } +} + +impl Drop for ScopedLock { + fn drop(&mut self) { + // Lock automatically released when FileLock is dropped + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_file_lock_acquire_release() { + let temp = TempDir::new().unwrap(); + let lock_path = temp.path().join("test.lock"); + + let lock = FileLock::try_lock(&lock_path, true).unwrap(); + assert!(lock.is_exclusive()); + + // Should be able to unlock + lock.unlock(); + } + + #[test] + fn test_file_lock_conflict() { + let temp = TempDir::new().unwrap(); + let lock_path = temp.path().join("conflict.lock"); + + // Acquire exclusive lock + let _lock1 = FileLock::try_lock(&lock_path, true).unwrap(); + + // Try to acquire another exclusive lock - should fail + let result = FileLock::try_lock(&lock_path, true); + assert!(matches!(result, Err(Error::WorkspaceLocked))); + } + + #[test] + fn test_file_lock_shared() { + let temp = TempDir::new().unwrap(); + let lock_path = temp.path().join("shared.lock"); + + // Acquire shared lock + let lock1 = FileLock::try_lock(&lock_path, false).unwrap(); + assert!(!lock1.is_exclusive()); + + // Should be able to acquire another shared lock + let lock2 = FileLock::try_lock(&lock_path, false).unwrap(); + assert!(!lock2.is_exclusive()); + + // But exclusive lock should fail + let result = FileLock::try_lock(&lock_path, true); + assert!(matches!(result, Err(Error::WorkspaceLocked))); + + lock1.unlock(); + lock2.unlock(); + } + + #[test] + fn test_scoped_lock() { + let temp = TempDir::new().unwrap(); + let lock_path = temp.path().join("scoped.lock"); + + { + let _scoped = ScopedLock::new(&lock_path, true).unwrap(); + // Lock held here + + // Another lock should fail + let result = FileLock::try_lock(&lock_path, true); + assert!(matches!(result, Err(Error::WorkspaceLocked))); + } + // Lock released here + + // Now should succeed + let _lock = FileLock::try_lock(&lock_path, true).unwrap(); + } +} diff --git a/vectorless-core/vectorless-storage/src/migration.rs b/vectorless-core/vectorless-storage/src/migration.rs new file mode 100644 index 00000000..3cc35512 --- /dev/null +++ b/vectorless-core/vectorless-storage/src/migration.rs @@ -0,0 +1,385 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Version migration system for persisted data. +//! +//! This module provides a framework for migrating data between format versions. +//! When the data format changes, migrations can automatically upgrade older data. +//! +//! # Example +//! +//! ```rust,ignore +//! use vectorless::storage::migration::{Migration, Migrator, MigrationContext}; +//! +//! // Define a migration from v1 to v2 +//! struct V1ToV2; +//! +//! 
impl Migration for V1ToV2 { +//! fn from_version(&self) -> u32 { 1 } +//! fn to_version(&self) -> u32 { 2 } +//! fn migrate(&self, data: &[u8], ctx: &MigrationContext) -> Result> { +//! // Transform data from v1 to v2 format +//! // ... +//! } +//! } +//! +//! // Register migrations +//! let mut migrator = Migrator::new(); +//! migrator.register(Box::new(V1ToV2)); +//! +//! // Migrate data +//! let migrated = migrator.migrate(data, 1, 2)?; +//! ``` + +use std::collections::HashMap; + +use tracing::{debug, info, warn}; + +use vectorless_error::Error; +use vectorless_error::Result; + +/// Current data format version. +pub const CURRENT_VERSION: u32 = 1; + +/// Migration context providing additional information for migrations. +#[derive(Debug, Clone)] +pub struct MigrationContext { + /// Source version. + pub from_version: u32, + /// Target version. + pub to_version: u32, + /// Additional metadata. + pub metadata: HashMap, +} + +impl MigrationContext { + /// Create a new migration context. + pub fn new(from_version: u32, to_version: u32) -> Self { + Self { + from_version, + to_version, + metadata: HashMap::new(), + } + } + + /// Add metadata. + pub fn with_metadata(mut self, key: impl Into, value: impl Into) -> Self { + self.metadata.insert(key.into(), value.into()); + self + } +} + +/// Trait for data migrations. +/// +/// A migration transforms data from one version to the next. +pub trait Migration: Send + Sync { + /// Get the source version this migration applies to. + fn from_version(&self) -> u32; + + /// Get the target version this migration produces. + fn to_version(&self) -> u32; + + /// Get a human-readable description of this migration. + fn description(&self) -> &str; + + /// Perform the migration. + /// + /// # Arguments + /// + /// * `data` - The data to migrate + /// * `ctx` - Migration context with additional information + /// + /// # Returns + /// + /// The migrated data in the new format. + fn migrate(&self, data: &[u8], ctx: &MigrationContext) -> Result>; + + /// Check if this migration can be applied to the given data. + /// + /// Default implementation always returns true. + fn can_migrate(&self, _data: &[u8]) -> bool { + true + } +} + +/// Migration registry and executor. +pub struct Migrator { + /// Registered migrations, keyed by (from_version, to_version). + migrations: HashMap<(u32, u32), Box>, +} + +impl Default for Migrator { + fn default() -> Self { + Self::new() + } +} + +impl std::fmt::Debug for Migrator { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Migrator") + .field("migration_count", &self.migrations.len()) + .finish() + } +} + +impl Migrator { + /// Create a new migrator. + pub fn new() -> Self { + Self { + migrations: HashMap::new(), + } + } + + /// Register a migration. + pub fn register(&mut self, migration: Box) { + let key = (migration.from_version(), migration.to_version()); + debug!("Registering migration: v{} -> v{}", key.0, key.1); + self.migrations.insert(key, migration); + } + + /// Check if a migration path exists between two versions. + pub fn can_migrate(&self, from_version: u32, to_version: u32) -> bool { + if from_version == to_version { + return true; + } + + // Check if we have a direct migration + if self.migrations.contains_key(&(from_version, to_version)) { + return true; + } + + // Check if we have a path through intermediate versions + self.find_migration_path(from_version, to_version).is_some() + } + + /// Find a migration path between two versions. 
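+    /// Treats versions as graph nodes and registered migrations as edges,
+    /// then finds a route with a breadth-first search.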
+ /// + /// Returns a sequence of version numbers to migrate through. + fn find_migration_path(&self, from_version: u32, to_version: u32) -> Option> { + if from_version == to_version { + return Some(vec![from_version]); + } + + // Simple BFS to find a path + use std::collections::{HashSet, VecDeque}; + + let mut visited: HashSet = HashSet::new(); + let mut queue: VecDeque = VecDeque::new(); + let mut parent: HashMap = HashMap::new(); + + queue.push_back(from_version); + visited.insert(from_version); + + while let Some(current) = queue.pop_front() { + // Find all migrations from current version + for ((from, to), _) in &self.migrations { + if *from == current && !visited.contains(to) { + visited.insert(*to); + parent.insert(*to, current); + queue.push_back(*to); + + if *to == to_version { + // Reconstruct path + let mut path = vec![to_version]; + let mut v = to_version; + while let Some(&p) = parent.get(&v) { + if p == from_version { + path.push(p); + break; + } + path.push(p); + v = p; + } + path.reverse(); + return Some(path); + } + } + } + } + + None + } + + /// Migrate data from one version to another. + /// + /// If a direct migration exists, it will be used. + /// Otherwise, the migrator will try to find a path through intermediate versions. + pub fn migrate(&self, data: &[u8], from_version: u32, to_version: u32) -> Result> { + if from_version == to_version { + return Ok(data.to_vec()); + } + + // Find migration path + let path = self + .find_migration_path(from_version, to_version) + .ok_or_else(|| { + Error::VersionMismatch(format!( + "No migration path from v{} to v{}", + from_version, to_version + )) + })?; + + if path.len() < 2 { + return Ok(data.to_vec()); + } + + info!( + "Migrating data from v{} to v{} via path: {:?}", + from_version, to_version, path + ); + + let mut current_data = data.to_vec(); + let mut current_version = from_version; + + for next_version in path.iter().skip(1) { + let key = (current_version, *next_version); + let migration = self.migrations.get(&key).ok_or_else(|| { + Error::VersionMismatch(format!( + "Missing migration from v{} to v{}", + current_version, next_version + )) + })?; + + let ctx = MigrationContext::new(current_version, *next_version); + + debug!( + "Applying migration: v{} -> v{} ({})", + current_version, + next_version, + migration.description() + ); + + current_data = migration.migrate(¤t_data, &ctx)?; + current_version = *next_version; + } + + Ok(current_data) + } + + /// Get the list of registered migrations. + pub fn list_migrations(&self) -> Vec<(u32, u32, &str)> { + self.migrations + .values() + .map(|m| (m.from_version(), m.to_version(), m.description())) + .collect() + } +} + +// ============================================================================ +// Built-in migrations +// ============================================================================ + +/// Placeholder migration for future versions. +/// This is a template that can be copied for actual migrations. +#[derive(Debug)] +pub struct PlaceholderMigration { + from: u32, + to: u32, +} + +impl PlaceholderMigration { + /// Create a new placeholder migration. 
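+    ///
+    /// # Example
+    ///
+    /// A no-op migration registered as a template:
+    ///
+    /// ```rust,ignore
+    /// let mut migrator = Migrator::new();
+    /// migrator.register(Box::new(PlaceholderMigration::new(1, 2)));
+    /// assert!(migrator.can_migrate(1, 2));
+    /// ```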
+ pub fn new(from: u32, to: u32) -> Self { + Self { from, to } + } +} + +impl Migration for PlaceholderMigration { + fn from_version(&self) -> u32 { + self.from + } + + fn to_version(&self) -> u32 { + self.to + } + + fn description(&self) -> &str { + "Placeholder migration (no-op)" + } + + fn migrate(&self, data: &[u8], _ctx: &MigrationContext) -> Result> { + warn!( + "Using placeholder migration from v{} to v{} - no changes made", + self.from, self.to + ); + Ok(data.to_vec()) + } +} + +/// Create a default migrator with all built-in migrations registered. +pub fn default_migrator() -> Migrator { + Migrator::new() + // Add migrations as needed when versions change + // migrator.register(Box::new(V1ToV2::new())); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_migration_context() { + let ctx = MigrationContext::new(1, 2).with_metadata("key", "value"); + + assert_eq!(ctx.from_version, 1); + assert_eq!(ctx.to_version, 2); + assert_eq!(ctx.metadata.get("key"), Some(&"value".to_string())); + } + + #[test] + fn test_migrator_no_migration_needed() { + let migrator = Migrator::new(); + let data = b"test data"; + + let result = migrator.migrate(data, 1, 1).unwrap(); + assert_eq!(result, data); + } + + #[test] + fn test_migrator_no_path() { + let migrator = Migrator::new(); + let data = b"test data"; + + let result = migrator.migrate(data, 1, 2); + assert!(result.is_err()); + } + + #[test] + fn test_migrator_with_placeholder() { + let mut migrator = Migrator::new(); + migrator.register(Box::new(PlaceholderMigration::new(1, 2))); + + assert!(migrator.can_migrate(1, 2)); + assert!(!migrator.can_migrate(1, 3)); + + let data = b"test data"; + let result = migrator.migrate(data, 1, 2).unwrap(); + assert_eq!(result, data); + } + + #[test] + fn test_migrator_path_finding() { + let mut migrator = Migrator::new(); + migrator.register(Box::new(PlaceholderMigration::new(1, 2))); + migrator.register(Box::new(PlaceholderMigration::new(2, 3))); + + assert!(migrator.can_migrate(1, 3)); + + let path = migrator.find_migration_path(1, 3).unwrap(); + assert_eq!(path, vec![1, 2, 3]); + + let data = b"test data"; + let result = migrator.migrate(data, 1, 3).unwrap(); + assert_eq!(result, data); + } + + #[test] + fn test_list_migrations() { + let mut migrator = Migrator::new(); + migrator.register(Box::new(PlaceholderMigration::new(1, 2))); + migrator.register(Box::new(PlaceholderMigration::new(2, 3))); + + let list = migrator.list_migrations(); + assert_eq!(list.len(), 2); + } +} diff --git a/vectorless-core/vectorless-storage/src/persistence.rs b/vectorless-core/vectorless-storage/src/persistence.rs new file mode 100644 index 00000000..c9a823f6 --- /dev/null +++ b/vectorless-core/vectorless-storage/src/persistence.rs @@ -0,0 +1,877 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Persistence utilities for saving and loading document indices. +//! +//! # Features +//! +//! - **Atomic writes**: Write to temp file, then rename for crash safety +//! - **Checksum verification**: SHA-256 checksums for data integrity +//! - **Version header**: Format version for future migrations + +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; +use std::fs::File; +use std::io::{BufReader, BufWriter, Write}; +use std::path::{Path, PathBuf}; + +use vectorless_error::Error; +use vectorless_document::{DocumentTree, NavigationIndex, ReasoningIndex}; +use vectorless_error::Result; + +/// Current format version for persisted documents. 
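+///
+/// Stored in the on-disk wrapper alongside the payload checksum.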
+const FORMAT_VERSION: u32 = 1; + +/// Current schema version for `PersistedDocument`. +/// +/// Increment this when the document structure changes in a +/// backward-incompatible way (e.g. field renames, new required fields). +/// Old documents will be detected and logged as stale on load. +const SCHEMA_VERSION: u32 = 1; + +/// Metadata for a persisted document. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DocumentMeta { + /// Unique document identifier. + pub id: String, + + /// Document name/title. + pub name: String, + + /// Document format (md, pdf, etc.). + pub format: String, + + /// Source file path. + pub source_path: Option, + + /// Document description. + pub description: Option, + + /// Page count (for PDFs). + pub page_count: Option, + + /// Line count (for text files). + pub line_count: Option, + + /// Creation timestamp. + pub created_at: chrono::DateTime, + + /// Last modified timestamp. + pub modified_at: chrono::DateTime, + + // === Processing State (for incremental updates) === + /// Content fingerprint for change detection. + #[serde( + default, + skip_serializing_if = "vectorless_utils::fingerprint::Fingerprint::is_zero" + )] + pub content_fingerprint: vectorless_utils::fingerprint::Fingerprint, + + /// Logic fingerprint (hash of pipeline configuration used to produce this document). + /// If the pipeline config changes, a full reprocess is needed even if content didn't change. + #[serde( + default, + skip_serializing_if = "vectorless_utils::fingerprint::Fingerprint::is_zero" + )] + pub logic_fingerprint: vectorless_utils::fingerprint::Fingerprint, + + /// Processing version (incremented when algorithm changes). + #[serde(default)] + pub processing_version: u32, + + /// Node count in the tree. + #[serde(default)] + pub node_count: usize, + + /// Total tokens in summaries. + #[serde(default)] + pub total_summary_tokens: usize, + + /// LLM model used for processing. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub processing_model: Option, + + /// Last processing duration in milliseconds. + #[serde(default)] + pub processing_duration_ms: u64, +} + +impl DocumentMeta { + /// Create new document metadata. + pub fn new(id: impl Into, name: impl Into, format: impl Into) -> Self { + let now = chrono::Utc::now(); + Self { + id: id.into(), + name: name.into(), + format: format.into(), + source_path: None, + description: None, + page_count: None, + line_count: None, + created_at: now, + modified_at: now, + content_fingerprint: vectorless_utils::fingerprint::Fingerprint::zero(), + logic_fingerprint: vectorless_utils::fingerprint::Fingerprint::zero(), + processing_version: 0, + node_count: 0, + total_summary_tokens: 0, + processing_model: None, + processing_duration_ms: 0, + } + } + + /// Set the source path. + pub fn with_source_path(mut self, path: impl Into) -> Self { + self.source_path = Some(path.into()); + self + } + + /// Set the description. + pub fn with_description(mut self, desc: impl Into) -> Self { + self.description = Some(desc.into()); + self + } + + /// Set the content fingerprint. + pub fn with_fingerprint(mut self, fp: vectorless_utils::fingerprint::Fingerprint) -> Self { + self.content_fingerprint = fp; + self + } + + /// Set the logic fingerprint. + pub fn with_logic_fingerprint(mut self, fp: vectorless_utils::fingerprint::Fingerprint) -> Self { + self.logic_fingerprint = fp; + self + } + + /// Set the processing version. 
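+    ///
+    /// # Example
+    ///
+    /// A sketch of the builder chain (values are illustrative):
+    ///
+    /// ```rust,ignore
+    /// let meta = DocumentMeta::new("doc-1", "My Document", "md")
+    ///     .with_source_path("./docs/my_document.md")
+    ///     .with_processing_version(1);
+    /// ```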
+    pub fn with_processing_version(mut self, version: u32) -> Self {
+        self.processing_version = version;
+        self
+    }
+
+    /// Set the processing model.
+    pub fn with_processing_model(mut self, model: impl Into<String>) -> Self {
+        self.processing_model = Some(model.into());
+        self
+    }
+
+    /// Update processing statistics.
+    pub fn update_processing_stats(
+        &mut self,
+        node_count: usize,
+        summary_tokens: usize,
+        duration_ms: u64,
+    ) {
+        self.node_count = node_count;
+        self.total_summary_tokens = summary_tokens;
+        self.processing_duration_ms = duration_ms;
+        self.modified_at = chrono::Utc::now();
+    }
+
+    /// Mark as processed with given fingerprint and version.
+    pub fn mark_processed(
+        &mut self,
+        fp: vectorless_utils::fingerprint::Fingerprint,
+        version: u32,
+        model: Option<&str>,
+    ) {
+        self.content_fingerprint = fp;
+        self.processing_version = version;
+        self.processing_model = model.map(|s| s.to_string());
+        self.modified_at = chrono::Utc::now();
+    }
+
+    /// Check if the document needs reprocessing.
+    pub fn needs_reprocessing(
+        &self,
+        current_fp: &vectorless_utils::fingerprint::Fingerprint,
+        current_version: u32,
+    ) -> bool {
+        // Never processed
+        if self.processing_version == 0 {
+            return true;
+        }
+
+        // Algorithm version changed
+        if self.processing_version < current_version {
+            return true;
+        }
+
+        // Content changed
+        if &self.content_fingerprint != current_fp {
+            return true;
+        }
+
+        false
+    }
+}
+
+/// A persisted document index containing tree and metadata.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PersistedDocument {
+    /// Schema version — incremented on backward-incompatible changes.
+    /// Old documents default to `0` via serde when the field is absent.
+    #[serde(default)]
+    pub schema_version: u32,
+
+    /// Document metadata.
+    pub meta: DocumentMeta,
+
+    /// The document tree structure.
+    pub tree: DocumentTree,
+
+    /// Per-page content (for PDFs).
+    #[serde(default)]
+    pub pages: Vec<PageContent>,
+
+    /// Pre-computed reasoning index for retrieval acceleration.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub reasoning_index: Option<ReasoningIndex>,
+
+    /// Navigation index for Agent-based retrieval.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub navigation_index: Option<NavigationIndex>,
+
+    /// Key concepts extracted from the document.
+    #[serde(default, skip_serializing_if = "Vec::is_empty")]
+    pub concepts: Vec<Concept>,
+}
+
+impl PersistedDocument {
+    /// Create a new persisted document.
+    pub fn new(meta: DocumentMeta, tree: DocumentTree) -> Self {
+        Self {
+            schema_version: SCHEMA_VERSION,
+            meta,
+            tree,
+            pages: Vec::new(),
+            reasoning_index: None,
+            navigation_index: None,
+            concepts: Vec::new(),
+        }
+    }
+
+    /// Add page content.
+    pub fn add_page(&mut self, page: usize, content: impl Into<String>) {
+        self.pages.push(PageContent {
+            page,
+            content: content.into(),
+        });
+    }
+}
+
+/// Content for a single page.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct PageContent {
+    /// Page number (1-based).
+    pub page: usize,
+
+    /// Page text content.
+    pub content: String,
+}
+
+/// Wrapper for persisted data with checksum.
+#[derive(Debug, Serialize, Deserialize)]
+struct PersistedWrapper {
+    /// Format version.
+    version: u32,
+    /// SHA-256 checksum of the payload.
+    checksum: String,
+    /// The actual data as raw JSON value (avoids re-serialization drift).
+    payload: serde_json::Value,
+}
+
+/// Options for save/load operations.
+#[derive(Debug, Clone)]
+pub struct PersistenceOptions {
+    /// Use atomic writes (temp file + rename).
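+    ///
+    /// Disabling this skips the temp-file + rename step; a crash mid-write
+    /// can then leave a truncated file behind.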
+    pub atomic_writes: bool,
+    /// Verify checksums on load.
+    pub verify_checksum: bool,
+}
+
+impl Default for PersistenceOptions {
+    fn default() -> Self {
+        Self {
+            atomic_writes: true,
+            verify_checksum: true,
+        }
+    }
+}
+
+impl PersistenceOptions {
+    /// Create new options with defaults.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set atomic writes option.
+    pub fn with_atomic_writes(mut self, enabled: bool) -> Self {
+        self.atomic_writes = enabled;
+        self
+    }
+
+    /// Set checksum verification option.
+    pub fn with_verify_checksum(mut self, enabled: bool) -> Self {
+        self.verify_checksum = enabled;
+        self
+    }
+}
+
+/// Calculate SHA-256 checksum of data.
+fn calculate_checksum(data: &[u8]) -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(data);
+    format!("{:x}", hasher.finalize())
+}
+
+/// Save a document to a JSON file with atomic write and checksum.
+///
+/// # Atomic Write
+///
+/// When `atomic_writes` is enabled (default), this function:
+/// 1. Writes to a temporary file (`.tmp` suffix)
+/// 2. Renames temp file to target (atomic on most filesystems)
+///
+/// This prevents data corruption if the process crashes during write.
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - Serialization fails
+/// - Cannot create temp file
+/// - Write fails
+/// - Rename fails
+pub fn save_document(path: &Path, doc: &PersistedDocument) -> Result<()> {
+    save_document_with_options(path, doc, &PersistenceOptions::default())
+}
+
+/// Save a document with custom options.
+pub fn save_document_with_options(
+    path: &Path,
+    doc: &PersistedDocument,
+    options: &PersistenceOptions,
+) -> Result<()> {
+    // Serialize to serde_json::Value first (avoids HashMap key ordering drift)
+    let payload_value =
+        serde_json::to_value(doc).map_err(|e| Error::Serialization(e.to_string()))?;
+
+    // Calculate checksum on the Value's canonical bytes
+    let payload_bytes =
+        serde_json::to_vec(&payload_value).map_err(|e| Error::Serialization(e.to_string()))?;
+    let checksum = calculate_checksum(&payload_bytes);
+
+    // Create wrapper
+    let wrapper = PersistedWrapper {
+        version: FORMAT_VERSION,
+        checksum,
+        payload: payload_value,
+    };
+
+    // Serialize wrapper
+    let json =
+        serde_json::to_string_pretty(&wrapper).map_err(|e| Error::Serialization(e.to_string()))?;
+
+    if options.atomic_writes {
+        // Atomic write: write to temp file, then rename
+        let temp_path = path.with_extension("tmp");
+
+        // Ensure parent directory exists
+        if let Some(parent) = path.parent() {
+            std::fs::create_dir_all(parent).map_err(Error::Io)?;
+        }
+
+        // Write to temp file
+        {
+            let file = File::create(&temp_path).map_err(Error::Io)?;
+            let mut writer = BufWriter::new(file);
+            writer.write_all(json.as_bytes()).map_err(Error::Io)?;
+            writer.flush().map_err(Error::Io)?;
+        }
+
+        // Atomic rename
+        std::fs::rename(&temp_path, path).map_err(Error::Io)?;
+    } else {
+        // Direct write (not atomic)
+        std::fs::write(path, json).map_err(Error::Io)?;
+    }
+
+    Ok(())
+}
+
+/// Load a document from a JSON file with checksum verification.
+///
+/// # Checksum Verification
+///
+/// When `verify_checksum` is enabled (default), this function:
+/// 1. Reads the file
+/// 2. Parses the wrapper
+/// 3. Re-serializes the payload
+/// 4. Verifies the checksum matches
+///
+/// # Errors
+///
+/// Returns an error if:
+/// - File doesn't exist
+/// - Parse fails
+/// - Checksum mismatch
+/// - Version mismatch (future: migration)
+pub fn load_document(path: &Path) -> Result<PersistedDocument> {
+    load_document_with_options(path, &PersistenceOptions::default())
+}
+
+/// Load a document with custom options.
+pub fn load_document_with_options(
+    path: &Path,
+    options: &PersistenceOptions,
+) -> Result<PersistedDocument> {
+    if !path.exists() {
+        return Err(Error::DocumentNotFound(path.display().to_string()));
+    }
+
+    let file = File::open(path).map_err(Error::Io)?;
+    let reader = BufReader::new(file);
+
+    // Parse wrapper (payload is serde_json::Value)
+    let wrapper: PersistedWrapper = serde_json::from_reader(reader)
+        .map_err(|e| Error::Parse(format!("Failed to parse document: {}", e)))?;
+
+    // Check version
+    if wrapper.version != FORMAT_VERSION {
+        return Err(Error::Parse(format!(
+            "Unsupported format version: {} (expected {})",
+            wrapper.version, FORMAT_VERSION
+        )));
+    }
+
+    // Verify checksum if enabled
+    if options.verify_checksum {
+        let payload_bytes = serde_json::to_vec(&wrapper.payload)
+            .map_err(|e| Error::Serialization(e.to_string()))?;
+
+        let expected_checksum = calculate_checksum(&payload_bytes);
+
+        if wrapper.checksum != expected_checksum {
+            return Err(Error::Parse(format!(
+                "Checksum mismatch: expected {}, got {}",
+                expected_checksum, wrapper.checksum
+            )));
+        }
+    }
+
+    // Deserialize Value to target type
+    let doc: PersistedDocument = serde_json::from_value(wrapper.payload)
+        .map_err(|e| Error::Parse(format!("Failed to deserialize document: {}", e)))?;
+
+    // Check schema version — warn on stale documents, fail on future versions
+    if doc.schema_version == 0 {
+        tracing::warn!(
+            doc_id = %doc.meta.id,
+            "Document was created before schema versioning — consider re-indexing"
+        );
+    } else if doc.schema_version > SCHEMA_VERSION {
+        return Err(Error::Parse(format!(
+            "Document schema version {} is newer than supported {} — please upgrade vectorless",
+            doc.schema_version, SCHEMA_VERSION
+        )));
+    }
+
+    Ok(doc)
+}
+
+/// Save the workspace index (metadata for all documents).
+pub fn save_index(path: &Path, entries: &[DocumentMeta]) -> Result<()> {
+    save_index_with_options(path, entries, &PersistenceOptions::default())
+}
+
+/// Save the workspace index with custom options.
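+///
+/// # Example
+///
+/// A minimal sketch (the path and entries are illustrative):
+///
+/// ```rust,ignore
+/// use std::path::Path;
+///
+/// let entries = vec![DocumentMeta::new("doc-1", "Doc 1", "md")];
+/// let options = PersistenceOptions::new().with_atomic_writes(false);
+/// save_index_with_options(Path::new("workspace/index.json"), &entries, &options)?;
+/// ```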
+pub fn save_index_with_options(
+    path: &Path,
+    entries: &[DocumentMeta],
+    options: &PersistenceOptions,
+) -> Result<()> {
+    // Serialize to serde_json::Value first
+    let payload_value =
+        serde_json::to_value(entries).map_err(|e| Error::Serialization(e.to_string()))?;
+
+    let payload_bytes =
+        serde_json::to_vec(&payload_value).map_err(|e| Error::Serialization(e.to_string()))?;
+
+    let checksum = calculate_checksum(&payload_bytes);
+
+    let wrapper = PersistedWrapper {
+        version: FORMAT_VERSION,
+        checksum,
+        payload: payload_value,
+    };
+
+    let json =
+        serde_json::to_string_pretty(&wrapper).map_err(|e| Error::Serialization(e.to_string()))?;
+
+    if options.atomic_writes {
+        let temp_path = path.with_extension("tmp");
+
+        // Ensure parent directory exists
+        if let Some(parent) = path.parent() {
+            std::fs::create_dir_all(parent).map_err(Error::Io)?;
+        }
+
+        // Write to temp file
+        {
+            let file = File::create(&temp_path).map_err(Error::Io)?;
+            let mut writer = BufWriter::new(file);
+            writer.write_all(json.as_bytes()).map_err(Error::Io)?;
+            writer.flush().map_err(Error::Io)?;
+        }
+
+        // Atomic rename
+        std::fs::rename(&temp_path, path).map_err(Error::Io)?;
+    } else {
+        std::fs::write(path, json).map_err(Error::Io)?;
+    }
+
+    Ok(())
+}
+
+/// Load the workspace index.
+pub fn load_index(path: &Path) -> Result<Vec<DocumentMeta>> {
+    load_index_with_options(path, &PersistenceOptions::default())
+}
+
+/// Load the workspace index with custom options.
+pub fn load_index_with_options(
+    path: &Path,
+    options: &PersistenceOptions,
+) -> Result<Vec<DocumentMeta>> {
+    if !path.exists() {
+        return Ok(Vec::new());
+    }
+
+    let file = File::open(path).map_err(Error::Io)?;
+    let reader = BufReader::new(file);
+
+    let wrapper: PersistedWrapper = serde_json::from_reader(reader)
+        .map_err(|e| Error::Parse(format!("Failed to parse index: {}", e)))?;
+
+    // Check version
+    if wrapper.version != FORMAT_VERSION {
+        return Err(Error::Parse(format!(
+            "Unsupported format version: {} (expected {})",
+            wrapper.version, FORMAT_VERSION
+        )));
+    }
+
+    // Verify checksum if enabled
+    if options.verify_checksum {
+        let payload_bytes = serde_json::to_vec(&wrapper.payload)
+            .map_err(|e| Error::Serialization(e.to_string()))?;
+
+        let expected_checksum = calculate_checksum(&payload_bytes);
+
+        if wrapper.checksum != expected_checksum {
+            return Err(Error::Parse(format!(
+                "Checksum mismatch: expected {}, got {}",
+                expected_checksum, wrapper.checksum
+            )));
+        }
+    }
+
+    // Deserialize Value to target type
+    let entries: Vec<DocumentMeta> = serde_json::from_value(wrapper.payload)
+        .map_err(|e| Error::Parse(format!("Failed to deserialize index: {}", e)))?;
+
+    Ok(entries)
+}
+
+// ============================================================================
+// Bytes-based serialization (for StorageBackend integration)
+// ============================================================================
+
+/// Serialize a document to bytes (JSON with checksum wrapper).
+///
+/// This is useful for storage backends that work with byte arrays.
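+///
+/// # Example
+///
+/// A round-trip sketch (the `doc` value is assumed to exist):
+///
+/// ```rust,ignore
+/// let bytes = save_document_to_bytes(&doc)?;
+/// let restored = load_document_from_bytes(&bytes)?;
+/// assert_eq!(restored.meta.id, doc.meta.id);
+/// ```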
+pub fn save_document_to_bytes(doc: &PersistedDocument) -> Result<Vec<u8>> {
+    // Serialize to serde_json::Value first
+    let payload_value =
+        serde_json::to_value(doc).map_err(|e| Error::Serialization(e.to_string()))?;
+
+    // Calculate checksum on the Value's canonical bytes
+    let payload_bytes =
+        serde_json::to_vec(&payload_value).map_err(|e| Error::Serialization(e.to_string()))?;
+    let checksum = calculate_checksum(&payload_bytes);
+
+    // Create wrapper
+    let wrapper = PersistedWrapper {
+        version: FORMAT_VERSION,
+        checksum,
+        payload: payload_value,
+    };
+
+    // Serialize wrapper
+    serde_json::to_vec(&wrapper).map_err(|e| Error::Serialization(e.to_string()))
+}
+
+/// Deserialize a document from bytes.
+///
+/// Verifies checksum by default.
+pub fn load_document_from_bytes(data: &[u8]) -> Result<PersistedDocument> {
+    load_document_from_bytes_with_options(data, true)
+}
+
+/// Deserialize a document from bytes with optional checksum verification.
+pub fn load_document_from_bytes_with_options(
+    data: &[u8],
+    verify_checksum: bool,
+) -> Result<PersistedDocument> {
+    // Parse wrapper (payload is serde_json::Value)
+    let wrapper: PersistedWrapper = serde_json::from_slice(data)
+        .map_err(|e| Error::Parse(format!("Failed to parse document: {}", e)))?;
+
+    // Check version
+    if wrapper.version != FORMAT_VERSION {
+        return Err(Error::VersionMismatch(format!(
+            "Expected version {}, got {}",
+            FORMAT_VERSION, wrapper.version
+        )));
+    }
+
+    // Verify checksum if enabled
+    if verify_checksum {
+        let payload_bytes = serde_json::to_vec(&wrapper.payload)
+            .map_err(|e| Error::Serialization(e.to_string()))?;
+
+        let expected_checksum = calculate_checksum(&payload_bytes);
+
+        if wrapper.checksum != expected_checksum {
+            return Err(Error::ChecksumMismatch(format!(
+                "Expected {}, got {}",
+                expected_checksum, wrapper.checksum
+            )));
+        }
+    }
+
+    // Deserialize Value to target type
+    let doc: PersistedDocument = serde_json::from_value(wrapper.payload)
+        .map_err(|e| Error::Parse(format!("Failed to deserialize document: {}", e)))?;
+
+    // Check schema version
+    if doc.schema_version == 0 {
+        tracing::warn!(
+            doc_id = %doc.meta.id,
+            "Document was created before schema versioning — consider re-indexing"
+        );
+    } else if doc.schema_version > SCHEMA_VERSION {
+        return Err(Error::Parse(format!(
+            "Document schema version {} is newer than supported {} — please upgrade vectorless",
+            doc.schema_version, SCHEMA_VERSION
+        )));
+    }
+
+    Ok(doc)
+}
+
+/// Serialize an index to bytes.
+pub fn save_index_to_bytes(entries: &[DocumentMeta]) -> Result<Vec<u8>> {
+    let payload_value =
+        serde_json::to_value(entries).map_err(|e| Error::Serialization(e.to_string()))?;
+
+    let payload_bytes =
+        serde_json::to_vec(&payload_value).map_err(|e| Error::Serialization(e.to_string()))?;
+    let checksum = calculate_checksum(&payload_bytes);
+
+    let wrapper = PersistedWrapper {
+        version: FORMAT_VERSION,
+        checksum,
+        payload: payload_value,
+    };
+
+    serde_json::to_vec(&wrapper).map_err(|e| Error::Serialization(e.to_string()))
+}
+
+/// Deserialize an index from bytes.
+pub fn load_index_from_bytes(data: &[u8]) -> Result<Vec<DocumentMeta>> {
+    load_index_from_bytes_with_options(data, true)
+}
+
+/// Deserialize an index from bytes with optional checksum verification.
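+///
+/// # Example
+///
+/// Skipping verification for bytes just produced in-process (an illustrative
+/// choice, not a recommendation; `entries` is assumed to exist):
+///
+/// ```rust,ignore
+/// let bytes = save_index_to_bytes(&entries)?;
+/// let loaded = load_index_from_bytes_with_options(&bytes, false)?;
+/// assert_eq!(loaded.len(), entries.len());
+/// ```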
+pub fn load_index_from_bytes_with_options(
+    data: &[u8],
+    verify_checksum: bool,
+) -> Result<Vec<DocumentMeta>> {
+    let wrapper: PersistedWrapper = serde_json::from_slice(data)
+        .map_err(|e| Error::Parse(format!("Failed to parse index: {}", e)))?;
+
+    // Check version
+    if wrapper.version != FORMAT_VERSION {
+        return Err(Error::VersionMismatch(format!(
+            "Expected version {}, got {}",
+            FORMAT_VERSION, wrapper.version
+        )));
+    }
+
+    // Verify checksum if enabled
+    if verify_checksum {
+        let payload_bytes = serde_json::to_vec(&wrapper.payload)
+            .map_err(|e| Error::Serialization(e.to_string()))?;
+
+        let expected_checksum = calculate_checksum(&payload_bytes);
+
+        if wrapper.checksum != expected_checksum {
+            return Err(Error::ChecksumMismatch(format!(
+                "Expected {}, got {}",
+                expected_checksum, wrapper.checksum
+            )));
+        }
+    }
+
+    // Deserialize Value to target type
+    let entries: Vec<DocumentMeta> = serde_json::from_value(wrapper.payload)
+        .map_err(|e| Error::Parse(format!("Failed to deserialize index: {}", e)))?;
+
+    Ok(entries)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tempfile::TempDir;
+
+    fn create_test_doc(id: &str) -> PersistedDocument {
+        let meta = DocumentMeta::new(id, "Test Doc", "md");
+        let tree = DocumentTree::new("Root", "Content");
+        PersistedDocument::new(meta, tree)
+    }
+
+    #[test]
+    fn test_save_and_load_document() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("test.json");
+
+        let doc = create_test_doc("doc-1");
+        save_document(&path, &doc).unwrap();
+
+        let loaded = load_document(&path).unwrap();
+        assert_eq!(loaded.meta.id, "doc-1");
+        assert_eq!(loaded.meta.name, "Test Doc");
+    }
+
+    #[test]
+    fn test_atomic_write() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("atomic.json");
+
+        let doc = create_test_doc("doc-atomic");
+        let options = PersistenceOptions::new().with_atomic_writes(true);
+        save_document_with_options(&path, &doc, &options).unwrap();
+
+        // Temp file should not exist after save
+        assert!(!path.with_extension("tmp").exists());
+
+        let loaded = load_document(&path).unwrap();
+        assert_eq!(loaded.meta.id, "doc-atomic");
+    }
+
+    #[test]
+    fn test_checksum_verification() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("checksum.json");
+
+        let doc = create_test_doc("doc-checksum");
+        save_document(&path, &doc).unwrap();
+
+        // Corrupt the file
+        let content = std::fs::read_to_string(&path).unwrap();
+        let corrupted = content.replace("doc-checksum", "doc-corrupted");
+        std::fs::write(&path, corrupted).unwrap();
+
+        // Load should fail with checksum error
+        let result = load_document(&path);
+        assert!(result.is_err());
+        let err = result.unwrap_err();
+        assert!(matches!(err, Error::Parse(_)));
+    }
+
+    #[test]
+    fn test_checksum_disabled() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("no-checksum.json");
+
+        let doc = create_test_doc("doc-no-check");
+        save_document(&path, &doc).unwrap();
+
+        // Load with checksum disabled should succeed
+        let options = PersistenceOptions::new().with_verify_checksum(false);
+        let result = load_document_with_options(&path, &options);
+        assert!(result.is_ok());
+        let loaded = result.unwrap();
+        assert_eq!(loaded.meta.id, "doc-no-check");
+
+        // Now corrupt the checksum field specifically
+        let content = std::fs::read_to_string(&path).unwrap();
+        // Change the checksum value but keep the payload intact
+        let payload_value = serde_json::to_value(&doc).unwrap();
+        let corrupted = content.replace(
+            &calculate_checksum(&serde_json::to_vec(&payload_value).unwrap()),
+            "0000000000000000000000000000000000000000000000000000000000000000",
+        );
+        std::fs::write(&path, corrupted).unwrap();
+
+        // Load with checksum disabled should still succeed
+        let result = load_document_with_options(&path, &options);
+        assert!(result.is_ok());
+
+        // Load with checksum enabled should fail
+        let options_enabled = PersistenceOptions::new().with_verify_checksum(true);
+        let result = load_document_with_options(&path, &options_enabled);
+        assert!(result.is_err());
+    }
+
+    #[test]
+    fn test_load_nonexistent() {
+        let result = load_document(Path::new("/nonexistent/path.json"));
+        assert!(result.is_err());
+        assert!(result.unwrap_err().is_not_found());
+    }
+
+    #[test]
+    fn test_save_and_load_index() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("meta.bin");
+
+        let entries = vec![
+            DocumentMeta::new("doc-1", "Doc 1", "md"),
+            DocumentMeta::new("doc-2", "Doc 2", "pdf"),
+        ];
+
+        save_index(&path, &entries).unwrap();
+
+        let loaded = load_index(&path).unwrap();
+        assert_eq!(loaded.len(), 2);
+        assert_eq!(loaded[0].id, "doc-1");
+        assert_eq!(loaded[1].format, "pdf");
+    }
+
+    #[test]
+    fn test_load_empty_index() {
+        let temp = TempDir::new().unwrap();
+        let path = temp.path().join("nonexistent.json");
+
+        let loaded = load_index(&path).unwrap();
+        assert!(loaded.is_empty());
+    }
+
+    #[test]
+    fn test_checksum_calculation() {
+        let data1 = b"test data";
+        let data2 = b"test data";
+        let data3 = b"different data";
+
+        let checksum1 = calculate_checksum(data1);
+        let checksum2 = calculate_checksum(data2);
+        let checksum3 = calculate_checksum(data3);
+
+        assert_eq!(checksum1, checksum2);
+        assert_ne!(checksum1, checksum3);
+        assert_eq!(checksum1.len(), 64); // SHA-256 produces 64 hex chars
+    }
+}
diff --git a/vectorless-core/vectorless-utils/Cargo.toml b/vectorless-core/vectorless-utils/Cargo.toml
new file mode 100644
index 00000000..4ef59c91
--- /dev/null
+++ b/vectorless-core/vectorless-utils/Cargo.toml
@@ -0,0 +1,22 @@
+[package]
+name = "vectorless-utils"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+description.workspace = true
+license.workspace = true
+repository.workspace = true
+homepage.workspace = true
+
+[dependencies]
+vectorless-error = { path = "../vectorless-error" }
+vectorless-document = { path = "../vectorless-document" }
+serde = { workspace = true }
+sha2 = { workspace = true }
+blake2 = { workspace = true }
+tiktoken-rs = { workspace = true }
+base64 = { workspace = true }
+thiserror = { workspace = true }
+
+[lints]
+workspace = true
diff --git a/vectorless-core/vectorless-utils/src/fingerprint.rs b/vectorless-core/vectorless-utils/src/fingerprint.rs
new file mode 100644
index 00000000..d7b8a988
--- /dev/null
+++ b/vectorless-core/vectorless-utils/src/fingerprint.rs
@@ -0,0 +1,496 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Fingerprint system for content and subtree identification.
+//!
+//! This module provides a robust fingerprinting system for content identification,
+//! enabling precise change detection at both content and subtree levels.
+//!
+//! # Key Features
+//!
+//! - **Content Fingerprint**: Hash of node content (title + text)
+//! - **Subtree Fingerprint**: Recursive hash including all descendants
+//! - **Stable Serialization**: Type-tagged hashing for consistent results
+//!
+//! # Usage
+//!
+//! ```rust,ignore
+//! use vectorless_utils::fingerprint::{Fingerprint, Fingerprinter};
+//!
+//! // Create a fingerprint from content
+//! let fp = Fingerprinter::new()
+//!     .with_str("Hello, world!")
+//!     .into_fingerprint();
+//!
+//! // Compare fingerprints
+//! if old_fp == new_fp {
+//!     // Content unchanged
+//! }
+//! ```

use base64::prelude::*;
+use blake2::digest::typenum;
+use blake2::{Blake2b, Digest};
+use serde::{Deserialize, Serialize};
+use std::hash::{Hash, Hasher};
+
+/// A 128-bit fingerprint for content identification.
+///
+/// Uses BLAKE2b-128 for fast, collision-resistant hashing.
+/// Displayed as base64 for compact representation.
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub struct Fingerprint(pub [u8; 16]);
+
+impl Fingerprint {
+    /// Create a fingerprint from raw bytes.
+    pub fn new(bytes: [u8; 16]) -> Self {
+        Self(bytes)
+    }
+
+    /// Create a fingerprint from a byte slice (hashes the slice).
+    pub fn from_bytes(data: &[u8]) -> Self {
+        let mut hasher = Blake2b::<typenum::U16>::default();
+        hasher.update(data);
+        Self(hasher.finalize().into())
+    }
+
+    /// Create a fingerprint from a string.
+    pub fn from_str(s: &str) -> Self {
+        Self::from_bytes(s.as_bytes())
+    }
+
+    /// Encode fingerprint to base64 string.
+    pub fn to_base64(self) -> String {
+        BASE64_STANDARD.encode(self.0)
+    }
+
+    /// Decode fingerprint from base64 string.
+    pub fn from_base64(s: &str) -> Result<Self, FingerprintError> {
+        let bytes = BASE64_STANDARD
+            .decode(s)
+            .map_err(|e| FingerprintError::InvalidBase64(e.to_string()))?;
+        let bytes: [u8; 16] = bytes
+            .try_into()
+            .map_err(|e: Vec<u8>| FingerprintError::InvalidLength(e.len()))?;
+        Ok(Self(bytes))
+    }
+
+    /// Get the raw bytes.
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+
+    /// Check if this is a zero/null fingerprint.
+    pub fn is_zero(&self) -> bool {
+        self.0 == [0u8; 16]
+    }
+
+    /// Create a zero/null fingerprint (for uninitialized state).
+    pub fn zero() -> Self {
+        Self([0u8; 16])
+    }
+}
+
+impl std::fmt::Display for Fingerprint {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        for byte in &self.0 {
+            write!(f, "{:02x}", byte)?;
+        }
+        Ok(())
+    }
+}
+
+impl std::fmt::Debug for Fingerprint {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Fingerprint({})", self)
+    }
+}
+
+impl Hash for Fingerprint {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        // Fingerprint is already evenly distributed, use first 8 bytes
+        state.write(&self.0[..8]);
+    }
+}
+
+impl Serialize for Fingerprint {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        serializer.serialize_str(&self.to_base64())
+    }
+}
+
+impl<'de> Deserialize<'de> for Fingerprint {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        let s = String::deserialize(deserializer)?;
+        Self::from_base64(&s).map_err(serde::de::Error::custom)
+    }
+}
+
+impl Default for Fingerprint {
+    fn default() -> Self {
+        Self::zero()
+    }
+}
+
+/// Error type for fingerprint operations.
+#[derive(Debug, thiserror::Error)]
+pub enum FingerprintError {
+    /// Invalid base64 encoding.
+    #[error("Invalid base64: {0}")]
+    InvalidBase64(String),
+
+    /// Invalid fingerprint length.
+    #[error("Invalid fingerprint length: {0}")]
+    InvalidLength(usize),
+
+    /// Serialization error.
+    #[error("Serialization error: {0}")]
+    Serialization(String),
+}
+
+/// Builder for creating fingerprints.
+///
+/// Provides a fluent API for incrementally building fingerprints
+/// from multiple values.
+///
+/// # Example
+///
+/// ```rust,ignore
+/// let fp = Fingerprinter::new()
+///     .with_str("title")
+///     .with_str("content")
+///     .with_usize(42)
+///     .into_fingerprint();
+/// ```
+#[derive(Clone)]
+pub struct Fingerprinter {
+    hasher: Blake2b<typenum::U16>,
+}
+
+impl Fingerprinter {
+    /// Create a new fingerprinter.
+    pub fn new() -> Self {
+        Self {
+            hasher: Blake2b::<typenum::U16>::default(),
+        }
+    }
+
+    /// Finalize and produce the fingerprint.
+    pub fn into_fingerprint(self) -> Fingerprint {
+        Fingerprint(self.hasher.finalize().into())
+    }
+
+    /// Add a string to the hash.
+    pub fn with_str(mut self, s: &str) -> Self {
+        self.write_str(s);
+        self
+    }
+
+    /// Add a string to the hash (mutable).
+    pub fn write_str(&mut self, s: &str) {
+        self.write_type_tag("s");
+        self.write_varlen_bytes(s.as_bytes());
+    }
+
+    /// Add bytes to the hash.
+    pub fn with_bytes(mut self, bytes: &[u8]) -> Self {
+        self.write_bytes(bytes);
+        self
+    }
+
+    /// Add bytes to the hash (mutable).
+    pub fn write_bytes(&mut self, bytes: &[u8]) {
+        self.write_type_tag("b");
+        self.write_varlen_bytes(bytes);
+    }
+
+    /// Add a usize to the hash.
+    pub fn with_usize(mut self, n: usize) -> Self {
+        self.write_usize(n);
+        self
+    }
+
+    /// Add a usize to the hash (mutable).
+    pub fn write_usize(&mut self, n: usize) {
+        self.write_type_tag("u");
+        self.hasher.update((n as u64).to_le_bytes());
+    }
+
+    /// Add a u64 to the hash.
+    pub fn with_u64(mut self, n: u64) -> Self {
+        self.write_u64(n);
+        self
+    }
+
+    /// Add a u64 to the hash (mutable).
+    pub fn write_u64(&mut self, n: u64) {
+        self.write_type_tag("u8");
+        self.hasher.update(n.to_le_bytes());
+    }
+
+    /// Add an i64 to the hash.
+    pub fn with_i64(mut self, n: i64) -> Self {
+        self.write_i64(n);
+        self
+    }
+
+    /// Add an i64 to the hash (mutable).
+    pub fn write_i64(&mut self, n: i64) {
+        self.write_type_tag("i8");
+        self.hasher.update(n.to_le_bytes());
+    }
+
+    /// Add a bool to the hash.
+    pub fn with_bool(mut self, b: bool) -> Self {
+        self.write_bool(b);
+        self
+    }
+
+    /// Add a bool to the hash (mutable).
+    pub fn write_bool(&mut self, b: bool) {
+        self.write_type_tag(if b { "t" } else { "f" });
+    }
+
+    /// Add an optional string to the hash.
+    pub fn with_option_str(mut self, opt: Option<&str>) -> Self {
+        self.write_option_str(opt);
+        self
+    }
+
+    /// Add an optional string to the hash (mutable).
+    pub fn write_option_str(&mut self, opt: Option<&str>) {
+        match opt {
+            Some(s) => {
+                self.write_type_tag("some");
+                self.write_str(s);
+            }
+            None => {
+                self.write_type_tag("none");
+            }
+        }
+    }
+
+    /// Add another fingerprint to the hash.
+    pub fn with_fingerprint(mut self, fp: &Fingerprint) -> Self {
+        self.write_fingerprint(fp);
+        self
+    }
+
+    /// Add another fingerprint to the hash (mutable).
+    pub fn write_fingerprint(&mut self, fp: &Fingerprint) {
+        self.write_type_tag("fp");
+        self.hasher.update(&fp.0);
+    }
+
+    /// Add raw bytes directly (no type tag).
+    pub fn write_raw(&mut self, bytes: &[u8]) {
+        self.hasher.update(bytes);
+    }
+
+    // Internal helpers
+
+    fn write_type_tag(&mut self, tag: &str) {
+        self.hasher.update(tag.as_bytes());
+        self.hasher.update(b";");
+    }
+
+    fn write_varlen_bytes(&mut self, bytes: &[u8]) {
+        self.hasher.update((bytes.len() as u32).to_le_bytes());
+        self.hasher.update(bytes);
+    }
+}
+
+/// Node fingerprint containing both content and subtree fingerprints.
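+///
+/// Intended for incremental re-indexing: if a node's `subtree` fingerprint
+/// is unchanged, the whole branch below it can be skipped.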
+/// +/// This enables precise change detection: +/// - If `content_fp` changes, the node's content was modified +/// - If `subtree_fp` changes, the node or its descendants were modified +/// - If `content_fp` is same but `subtree_fp` changed, only descendants changed +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +pub struct NodeFingerprint { + /// Fingerprint of this node's content (title + text). + pub content: Fingerprint, + + /// Fingerprint of the entire subtree (including this node). + /// Computed recursively from all descendants. + pub subtree: Fingerprint, +} + +impl NodeFingerprint { + /// Create a new node fingerprint. + pub fn new(content: Fingerprint, subtree: Fingerprint) -> Self { + Self { content, subtree } + } + + /// Create a fingerprint for a leaf node (content == subtree). + pub fn leaf(content: Fingerprint) -> Self { + Self { + content, + subtree: content, + } + } + + /// Create a zero/null fingerprint. + pub fn zero() -> Self { + Self { + content: Fingerprint::zero(), + subtree: Fingerprint::zero(), + } + } + + /// Check if this is a zero fingerprint. + pub fn is_zero(&self) -> bool { + self.content.is_zero() && self.subtree.is_zero() + } + + /// Check if content changed compared to another fingerprint. + pub fn content_changed(&self, other: &Self) -> bool { + self.content != other.content + } + + /// Check if subtree changed compared to another fingerprint. + pub fn subtree_changed(&self, other: &Self) -> bool { + self.subtree != other.subtree + } + + /// Check if only descendants changed (content same, subtree different). + pub fn only_descendants_changed(&self, other: &Self) -> bool { + self.content == other.content && self.subtree != other.subtree + } +} + +impl Default for NodeFingerprint { + fn default() -> Self { + Self::zero() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_fingerprint_from_str() { + let fp1 = Fingerprint::from_str("hello"); + let fp2 = Fingerprint::from_str("hello"); + let fp3 = Fingerprint::from_str("world"); + + assert_eq!(fp1, fp2); + assert_ne!(fp1, fp3); + } + + #[test] + fn test_fingerprint_base64_roundtrip() { + let fp = Fingerprint::from_str("test content"); + let encoded = fp.to_base64(); + let decoded = Fingerprint::from_base64(&encoded).unwrap(); + assert_eq!(fp, decoded); + } + + #[test] + fn test_fingerprinter_chaining() { + let fp1 = Fingerprinter::new() + .with_str("title") + .with_str("content") + .into_fingerprint(); + + let fp2 = Fingerprinter::new() + .with_str("title") + .with_str("content") + .into_fingerprint(); + + let fp3 = Fingerprinter::new() + .with_str("title") + .with_str("different") + .into_fingerprint(); + + assert_eq!(fp1, fp2); + assert_ne!(fp1, fp3); + } + + #[test] + fn test_fingerprinter_types() { + let fp1 = Fingerprinter::new() + .with_str("test") + .with_usize(42) + .with_bool(true) + .into_fingerprint(); + + let fp2 = Fingerprinter::new() + .with_str("test") + .with_usize(42) + .with_bool(true) + .into_fingerprint(); + + let fp3 = Fingerprinter::new() + .with_str("test") + .with_usize(43) // different number + .with_bool(true) + .into_fingerprint(); + + assert_eq!(fp1, fp2); + assert_ne!(fp1, fp3); + } + + #[test] + fn test_node_fingerprint() { + let content = Fingerprint::from_str("content"); + let subtree = Fingerprint::from_str("subtree"); + + let fp = NodeFingerprint::new(content, subtree); + + assert!(!fp.is_zero()); + assert_eq!(fp.content, content); + assert_eq!(fp.subtree, subtree); + } + + #[test] + fn test_node_fingerprint_change_detection() { 
+        let old = NodeFingerprint::new(
+            Fingerprint::from_str("content"),
+            Fingerprint::from_str("subtree"),
+        );
+
+        // Same content, different subtree
+        let new1 = NodeFingerprint::new(
+            Fingerprint::from_str("content"),
+            Fingerprint::from_str("different"),
+        );
+        assert!(new1.only_descendants_changed(&old));
+        assert!(!new1.content_changed(&old));
+        assert!(new1.subtree_changed(&old));
+
+        // Different content
+        let new2 = NodeFingerprint::new(
+            Fingerprint::from_str("different"),
+            Fingerprint::from_str("subtree"),
+        );
+        assert!(!new2.only_descendants_changed(&old));
+        assert!(new2.content_changed(&old));
+    }
+
+    #[test]
+    fn test_fingerprint_serialization() {
+        let fp = Fingerprint::from_str("test serialization");
+        let json = serde_json::to_string(&fp).unwrap();
+        let decoded: Fingerprint = serde_json::from_str(&json).unwrap();
+        assert_eq!(fp, decoded);
+    }
+
+    #[test]
+    fn test_node_fingerprint_serialization() {
+        let fp = NodeFingerprint::new(
+            Fingerprint::from_str("content"),
+            Fingerprint::from_str("subtree"),
+        );
+        let json = serde_json::to_string(&fp).unwrap();
+        let decoded: NodeFingerprint = serde_json::from_str(&json).unwrap();
+        assert_eq!(fp, decoded);
+    }
+}
diff --git a/vectorless-core/vectorless-utils/src/lib.rs b/vectorless-core/vectorless-utils/src/lib.rs
new file mode 100644
index 00000000..472bed71
--- /dev/null
+++ b/vectorless-core/vectorless-utils/src/lib.rs
@@ -0,0 +1,17 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Utility functions and helpers.
+//!
+//! This module provides common utilities used across the codebase:
+//!
+//! - **Token estimation** — Fast and accurate token counting (tiktoken-based)
+//! - **Fingerprint** — BLAKE2b content hashing for change detection
+//! - **Validation** — Pre-index source validation (file, content, bytes)
+
+pub mod fingerprint;
+mod token;
+pub mod validation;
+
+pub use token::estimate_tokens;
+pub use validation::{validate_bytes, validate_content, validate_file};
diff --git a/vectorless-core/vectorless-utils/src/token.rs b/vectorless-core/vectorless-utils/src/token.rs
new file mode 100644
index 00000000..9e23ea85
--- /dev/null
+++ b/vectorless-core/vectorless-utils/src/token.rs
@@ -0,0 +1,64 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Unified token estimation module.
+//!
+//! Provides accurate token counting using tiktoken's cl100k_base encoding;
+//! counts for models that use other tokenizers should be treated as estimates.
+
+use std::sync::OnceLock;
+use tiktoken_rs::CoreBPE;
+
+/// Global BPE encoder instance (cl100k_base is used by GPT-4, GPT-3.5-turbo, text-embedding-ada-002)
+static BPE: OnceLock<CoreBPE> = OnceLock::new();
+
+/// Get or initialize the BPE encoder.
+fn get_bpe() -> &'static CoreBPE {
+    BPE.get_or_init(|| {
+        tiktoken_rs::cl100k_base().expect("Failed to initialize cl100k_base tokenizer")
+    })
+}
+
+/// Estimate token count for a text using tiktoken.
+///
+/// This uses the cl100k_base encoding, which is shared by:
+/// - GPT-4
+/// - GPT-3.5-turbo
+/// - GPT-4o
+/// - GPT-4o-mini
+/// - text-embedding-ada-002
+/// - text-embedding-3-small/large
+///
+/// # Example
+///
+/// ```
+/// use vectorless_utils::estimate_tokens;
+///
+/// assert_eq!(estimate_tokens(""), 0);
+/// assert!(estimate_tokens("hello world") > 0);
+/// ```
+pub fn estimate_tokens(text: &str) -> usize {
+    if text.is_empty() {
+        return 0;
+    }
+
+    // Use tiktoken for accurate counting
+    get_bpe().encode_with_special_tokens(text).len()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_estimate_tokens_empty() {
+        assert_eq!(estimate_tokens(""), 0);
+    }
+
+    #[test]
+    fn test_estimate_tokens_simple() {
+        // "hello world" should be 2 tokens with tiktoken
+        let count = estimate_tokens("hello world");
+        assert!(count >= 2, "Expected at least 2 tokens, got {}", count);
+    }
+}
diff --git a/vectorless-core/vectorless-utils/src/validation.rs b/vectorless-core/vectorless-utils/src/validation.rs
new file mode 100644
index 00000000..3133175d
--- /dev/null
+++ b/vectorless-core/vectorless-utils/src/validation.rs
@@ -0,0 +1,195 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Source validation utilities for indexing.
+
+use std::path::Path;
+
+use vectorless_document::DocumentFormat;
+use vectorless_error::{Error, Result};
+
+/// Maximum file size before emitting a warning (100 MB).
+const LARGE_FILE_THRESHOLD: usize = 100 * 1024 * 1024;
+
+/// Result of validating a source before indexing.
+#[derive(Debug, Clone)]
+pub struct SourceValidation {
+    /// Whether the source is valid for indexing.
+    pub valid: bool,
+
+    /// Validation errors (prevent indexing).
+    pub errors: Vec<String>,
+
+    /// Validation warnings (non-blocking).
+    pub warnings: Vec<String>,
+}
+
+impl SourceValidation {
+    fn valid() -> Self {
+        Self {
+            valid: true,
+            errors: vec![],
+            warnings: vec![],
+        }
+    }
+
+    fn invalid(errors: Vec<String>) -> Self {
+        Self {
+            valid: false,
+            errors,
+            warnings: vec![],
+        }
+    }
+
+    fn with_warnings(mut self, warnings: Vec<String>) -> Self {
+        self.warnings = warnings;
+        self
+    }
+}
+
+/// Validate a file path for indexing.
+///
+/// Checks: exists, readable, supported format, size.
+pub fn validate_file(path: &Path) -> Result<SourceValidation> {
+    if !path.exists() {
+        return Ok(SourceValidation::invalid(vec![format!(
+            "File not found: {}",
+            path.display()
+        )]));
+    }
+
+    let metadata = std::fs::metadata(path)
+        .map_err(|e| Error::Parse(format!("Cannot read file metadata: {}", e)))?;
+
+    let size = metadata.len() as usize;
+    let mut warnings = Vec::new();
+
+    if size > LARGE_FILE_THRESHOLD {
+        warnings.push(format!(
+            "Large file ({}MB) may take longer to index",
+            size / (1024 * 1024)
+        ));
+    }
+
+    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
+    if DocumentFormat::from_extension(ext).is_none() {
+        return Ok(
+            SourceValidation::invalid(vec![format!("Unsupported format: .{}", ext)])
+                .with_warnings(warnings),
+        );
+    }
+
+    Ok(SourceValidation::valid().with_warnings(warnings))
+}
+
+/// Validate content string for indexing.
+///
+/// Checks: non-empty.
+pub fn validate_content(content: &str, _format: DocumentFormat) -> SourceValidation {
+    let mut errors = Vec::new();
+
+    if content.trim().is_empty() {
+        errors.push("Content is empty".to_string());
+    }
+
+    if errors.is_empty() {
+        SourceValidation::valid()
+    } else {
+        SourceValidation::invalid(errors)
+    }
+}
+
+/// Validate binary data for indexing.
+///
+/// Checks: non-empty, PDF magic number.
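+///
+/// # Example
+///
+/// A quick sketch of the PDF header check:
+///
+/// ```rust,ignore
+/// use vectorless_document::DocumentFormat;
+///
+/// let check = validate_bytes(b"%PDF-1.7 ...", DocumentFormat::Pdf);
+/// assert!(check.valid);
+///
+/// let bad = validate_bytes(b"not a pdf", DocumentFormat::Pdf);
+/// assert!(!bad.valid);
+/// ```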
+pub fn validate_bytes(data: &[u8], format: DocumentFormat) -> SourceValidation {
+    let mut errors = Vec::new();
+
+    if data.is_empty() {
+        errors.push("Byte data is empty".to_string());
+    }
+
+    // PDF magic number check
+    if format == DocumentFormat::Pdf && !data.is_empty() && !data.starts_with(b"%PDF") {
+        errors.push("Data does not appear to be a valid PDF (missing %PDF header)".to_string());
+    }
+
+    if errors.is_empty() {
+        SourceValidation::valid()
+    } else {
+        SourceValidation::invalid(errors)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_validate_file_missing() {
+        let result = validate_file(Path::new("./nonexistent.md")).unwrap();
+        assert!(!result.valid);
+        assert!(result.errors[0].contains("not found"));
+    }
+
+    #[test]
+    fn test_validate_file_unsupported_format() {
+        let tmp = std::env::temp_dir().join("vectorless_test_validate.dat");
+        std::fs::write(&tmp, b"data").unwrap();
+        let result = validate_file(&tmp).unwrap();
+        assert!(!result.valid);
+        assert!(result.errors[0].contains("Unsupported"));
+        let _ = std::fs::remove_file(&tmp);
+    }
+
+    #[test]
+    fn test_validate_file_valid() {
+        let tmp = std::env::temp_dir().join("vectorless_test_validate.md");
+        std::fs::write(&tmp, b"# Hello").unwrap();
+        let result = validate_file(&tmp).unwrap();
+        assert!(result.valid);
+        assert!(result.errors.is_empty());
+        let _ = std::fs::remove_file(&tmp);
+    }
+
+    #[test]
+    fn test_validate_content_empty() {
+        let result = validate_content("  \n  ", DocumentFormat::Markdown);
+        assert!(!result.valid);
+        assert!(result.errors[0].contains("empty"));
+    }
+
+    #[test]
+    fn test_validate_content_valid() {
+        let result = validate_content("# Hello", DocumentFormat::Markdown);
+        assert!(result.valid);
+    }
+
+    #[test]
+    fn test_validate_bytes_empty() {
+        let result = validate_bytes(&[], DocumentFormat::Pdf);
+        assert!(!result.valid);
+        assert!(result.errors[0].contains("empty"));
+    }
+
+    #[test]
+    fn test_validate_bytes_invalid_pdf() {
+        let result = validate_bytes(b"not a pdf", DocumentFormat::Pdf);
+        assert!(!result.valid);
+        assert!(result.errors[0].contains("PDF"));
+    }
+
+    #[test]
+    fn test_validate_bytes_valid_pdf() {
+        let result = validate_bytes(b"%PDF-1.4 some content", DocumentFormat::Pdf);
+        assert!(result.valid);
+    }
+
+    #[test]
+    fn test_validate_bytes_valid_markdown() {
+        let result = validate_bytes(b"# Hello", DocumentFormat::Markdown);
+        assert!(result.valid);
+    }
+}
diff --git a/vectorless-core/vectorless/src/client/index_context.rs b/vectorless-core/vectorless/src/client/index_context.rs
index 30cb2502..aa042fe3 100644
--- a/vectorless-core/vectorless/src/client/index_context.rs
+++ b/vectorless-core/vectorless/src/client/index_context.rs
@@ -38,7 +38,7 @@
 
 use std::path::PathBuf;
 
-use crate::index::parse::DocumentFormat;
+use crate::document::DocumentFormat;
 
 use super::types::{IndexMode, IndexOptions};
 
diff --git a/vectorless-core/vectorless/src/client/indexed_document.rs b/vectorless-core/vectorless/src/client/indexed_document.rs
index b2416392..24ec0d6f 100644
--- a/vectorless-core/vectorless/src/client/indexed_document.rs
+++ b/vectorless-core/vectorless/src/client/indexed_document.rs
@@ -9,8 +9,8 @@
 
 use std::path::PathBuf;
 
+use crate::document::DocumentFormat;
 use crate::document::DocumentTree;
-use crate::index::parse::DocumentFormat;
 use crate::metrics::IndexMetrics;
 use crate::storage::PageContent;
 
diff --git a/vectorless-core/vectorless/src/client/indexer.rs b/vectorless-core/vectorless/src/client/indexer.rs
index 5b5e558b..ba4ce979 100644
--- a/vectorless-core/vectorless/src/client/indexer.rs
+++ b/vectorless-core/vectorless/src/client/indexer.rs
@@ -26,8 +26,8 @@ use std::sync::Arc;
 use tracing::info;
 use uuid::Uuid;
 
+use crate::document::DocumentFormat;
 use crate::error::{Error, Result};
-use crate::index::parse::DocumentFormat;
 use crate::index::{IndexInput, IndexMode, PipelineExecutor, PipelineOptions};
 use crate::llm::LlmClient;
 use crate::storage::{DocumentMeta, PersistedDocument};
diff --git a/vectorless-core/vectorless/src/client/mod.rs b/vectorless-core/vectorless/src/client/mod.rs
index fc33a594..a851b3df 100644
--- a/vectorless-core/vectorless/src/client/mod.rs
+++ b/vectorless-core/vectorless/src/client/mod.rs
@@ -103,4 +103,4 @@ pub use types::{
 // Parser Types (needed for IndexContext::from_content)
 // ============================================================
 
-pub use crate::index::parse::DocumentFormat;
+pub use crate::document::DocumentFormat;
diff --git a/vectorless-core/vectorless/src/client/types.rs b/vectorless-core/vectorless/src/client/types.rs
index 8995f6f2..f9977d0a 100644
--- a/vectorless-core/vectorless/src/client/types.rs
+++ b/vectorless-core/vectorless/src/client/types.rs
@@ -7,7 +7,7 @@
 
 use serde::{Deserialize, Serialize};
 
-use crate::index::parse::DocumentFormat;
+use crate::document::DocumentFormat;
 use crate::metrics::IndexMetrics;
 
 // ============================================================
diff --git a/vectorless-core/vectorless/src/document/format.rs b/vectorless-core/vectorless/src/document/format.rs
new file mode 100644
index 00000000..78f6e52e
--- /dev/null
+++ b/vectorless-core/vectorless/src/document/format.rs
@@ -0,0 +1,62 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! Document format and sufficiency types.
+//!
+//! These types are used across multiple modules and are defined here
+//! to avoid circular dependencies between crates.
+
+use serde::{Deserialize, Serialize};
+
+/// Supported document formats.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
+pub enum DocumentFormat {
+    /// Markdown files (.md, .markdown)
+    Markdown,
+    /// PDF files (.pdf)
+    Pdf,
+}
+
+impl DocumentFormat {
+    /// Detect format from file extension.
+    pub fn from_extension(ext: &str) -> Option<Self> {
+        match ext.to_lowercase().as_str() {
+            "md" | "markdown" => Some(Self::Markdown),
+            "pdf" => Some(Self::Pdf),
+            _ => None,
+        }
+    }
+
+    /// Get the file extension for this format.
+    pub fn extension(&self) -> &'static str {
+        match self {
+            Self::Markdown => "md",
+            Self::Pdf => "pdf",
+        }
+    }
+
+    /// All supported file extensions (lowercase).
+    ///
+    /// Single source of truth — used by directory scanning to
+    /// discover indexable files.
+    pub const SUPPORTED_EXTENSIONS: &'static [&'static str] = &["md", "pdf"];
+}
+
+/// Sufficiency level for incremental retrieval.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum SufficiencyLevel {
+    /// Information is sufficient, stop retrieving.
+    Sufficient,
+
+    /// Partial information, can continue if needed.
+    PartialSufficient,
+
+    /// Information is insufficient, continue retrieving.
+    Insufficient,
+}
+
+impl Default for SufficiencyLevel {
+    fn default() -> Self {
+        Self::Insufficient
+    }
+}
diff --git a/vectorless-core/vectorless/src/document/mod.rs b/vectorless-core/vectorless/src/document/mod.rs
index 18f77ce0..85ea1ff3 100644
--- a/vectorless-core/vectorless/src/document/mod.rs
+++ b/vectorless-core/vectorless/src/document/mod.rs
@@ -16,6 +16,7 @@
 //! - [`NodeReference`] - In-document reference (e.g., "see Appendix G")
 //! - [`RefType`] - Type of reference (Section, Appendix, Table, etc.)
 
+mod format;
 mod navigation;
 mod node;
 mod reasoning;
@@ -26,6 +27,7 @@ mod toc;
 mod tree;
 pub mod understanding;
 
+pub use format::{DocumentFormat, SufficiencyLevel};
 pub use navigation::{ChildRoute, DocCard, NavEntry, NavigationIndex, SectionCard};
 pub use node::{NodeId, TreeNode};
 pub use reasoning::{
diff --git a/vectorless-core/vectorless/src/document/understanding.rs b/vectorless-core/vectorless/src/document/understanding.rs
index 94eaeac4..1505d796 100644
--- a/vectorless-core/vectorless/src/document/understanding.rs
+++ b/vectorless-core/vectorless/src/document/understanding.rs
@@ -268,7 +268,7 @@ pub enum IngestInput {
         /// Raw document bytes.
         data: Vec<u8>,
         /// Document format.
-        format: super::super::index::parse::DocumentFormat,
+        format: super::format::DocumentFormat,
     },
     /// Index from a text string.
     Text {
diff --git a/vectorless-core/vectorless/src/events/types.rs b/vectorless-core/vectorless/src/events/types.rs
index 05ca0754..30903a41 100644
--- a/vectorless-core/vectorless/src/events/types.rs
+++ b/vectorless-core/vectorless/src/events/types.rs
@@ -6,8 +6,8 @@
 //! Provides enums for indexing, query, and workspace events
 //! that can be observed via [`EventEmitter`](super::EventEmitter).
 
-use crate::index::parse::DocumentFormat;
-use crate::retrieval::SufficiencyLevel;
+use crate::document::DocumentFormat;
+use crate::document::SufficiencyLevel;
 
 /// Indexing operation events.
 #[derive(Debug, Clone)]
diff --git a/vectorless-core/vectorless/src/index/incremental/resolver.rs b/vectorless-core/vectorless/src/index/incremental/resolver.rs
index a8087fd4..ffddf2a7 100644
--- a/vectorless-core/vectorless/src/index/incremental/resolver.rs
+++ b/vectorless-core/vectorless/src/index/incremental/resolver.rs
@@ -10,9 +10,9 @@
 
 use tracing::info;
 
+use crate::document::DocumentFormat;
 use crate::document::DocumentTree;
 use crate::index::config::PipelineOptions;
-use crate::index::parse::DocumentFormat;
 use crate::storage::PersistedDocument;
 use crate::utils::fingerprint::Fingerprint;
 
diff --git a/vectorless-core/vectorless/src/index/parse/types.rs b/vectorless-core/vectorless/src/index/parse/types.rs
index baaa8224..56f9f987 100644
--- a/vectorless-core/vectorless/src/index/parse/types.rs
+++ b/vectorless-core/vectorless/src/index/parse/types.rs
@@ -6,43 +6,12 @@
 //! This module defines the types used for document parsing:
 //! - [`RawNode`] - A raw node extracted from a document before tree construction
 //! - [`DocumentMeta`] - Metadata about a document
-//! - [`DocumentFormat`] - Supported document formats
+//! - [`DocumentFormat`] - Supported document formats (re-exported from document module)
 
 use serde::{Deserialize, Serialize};
 
-/// Supported document formats.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
-pub enum DocumentFormat {
-    /// Markdown files (.md, .markdown)
-    Markdown,
-    /// PDF files (.pdf)
-    Pdf,
-}
-
-impl DocumentFormat {
-    /// Detect format from file extension.
-    pub fn from_extension(ext: &str) -> Option<Self> {
-        match ext.to_lowercase().as_str() {
-            "md" | "markdown" => Some(Self::Markdown),
-            "pdf" => Some(Self::Pdf),
-            _ => None,
-        }
-    }
-
-    /// Get the file extension for this format.
-    pub fn extension(&self) -> &'static str {
-        match self {
-            Self::Markdown => "md",
-            Self::Pdf => "pdf",
-        }
-    }
-
-    /// All supported file extensions (lowercase).
- /// - /// Single source of truth — used by directory scanning to - /// discover indexable files. - pub const SUPPORTED_EXTENSIONS: &'static [&'static str] = &["md", "pdf"]; -} +/// Re-export [`DocumentFormat`] from the document module. +pub use crate::document::DocumentFormat; /// A raw node extracted from a document. /// diff --git a/vectorless-core/vectorless/src/index/stages/parse.rs b/vectorless-core/vectorless/src/index/stages/parse.rs index b0e542f6..2eb8b6ae 100644 --- a/vectorless-core/vectorless/src/index/stages/parse.rs +++ b/vectorless-core/vectorless/src/index/stages/parse.rs @@ -7,8 +7,8 @@ use super::async_trait; use std::time::Instant; use tracing::{debug, info}; +use crate::document::DocumentFormat; use crate::error::Result; -use crate::index::parse::DocumentFormat; use super::{IndexStage, StageResult}; use crate::index::IndexMode; diff --git a/vectorless-core/vectorless/src/retrieval/types.rs b/vectorless-core/vectorless/src/retrieval/types.rs index 3d1e41e5..f654f319 100644 --- a/vectorless-core/vectorless/src/retrieval/types.rs +++ b/vectorless-core/vectorless/src/retrieval/types.rs @@ -5,24 +5,8 @@ use serde::{Deserialize, Serialize}; -/// Sufficiency level for incremental retrieval. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum SufficiencyLevel { - /// Information is sufficient, stop retrieving. - Sufficient, - - /// Partial information, can continue if needed. - PartialSufficient, - - /// Information is insufficient, continue retrieving. - Insufficient, -} - -impl Default for SufficiencyLevel { - fn default() -> Self { - Self::Insufficient - } -} +/// Re-export [`SufficiencyLevel`] from the document module. +pub use crate::document::SufficiencyLevel; /// Complete retrieval response. #[derive(Debug, Clone)] diff --git a/vectorless-core/vectorless/src/utils/validation.rs b/vectorless-core/vectorless/src/utils/validation.rs index ae541c16..e5b1b64b 100644 --- a/vectorless-core/vectorless/src/utils/validation.rs +++ b/vectorless-core/vectorless/src/utils/validation.rs @@ -5,8 +5,8 @@ use std::path::Path; +use crate::document::DocumentFormat; use crate::error::{Error, Result}; -use crate::index::parse::DocumentFormat; /// Maximum file size before emitting a warning (100 MB). const LARGE_FILE_THRESHOLD: usize = 100 * 1024 * 1024; From 5abf1d868282f0c7813e836a619d200f21e0aa30 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Thu, 23 Apr 2026 09:05:13 +0800 Subject: [PATCH 14/28] feat: add vectorless-rerank module and reorganize query types - Add vectorless-rerank dependency to vectorless-agent - Introduce Evidence type in vectorless-rerank and re-export it from vectorless-agent instead of defining locally - Move query-related types (EvidenceItem, QueryMetrics, QueryResultItem, Confidence) from vectorless-engine to vectorless-retrieval - Update imports across multiple modules to use correct paths after refactoring - Add necessary dependencies (regex, serde_json) and remove vectorless-agent dependency from vectorless-rerank - Update module visibility for config, memo, and throttle in vectorless-llm This change centralizes query result types in vectorless-retrieval module and introduces proper re-ranking capabilities through the new vectorless-rerank module. BREAKING CHANGE: Evidence type is now re-exported from vectorless-rerank::types instead of being defined in vectorless-agent. 
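
Migration sketch (assuming the re-exported type keeps the same field set):

    // before
    use vectorless_agent::config::Evidence;

    // after — the old path still resolves via the re-export, but the type
    // is now defined in vectorless-rerank:
    use vectorless_rerank::types::Evidence;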
---
 vectorless-core/vectorless-agent/Cargo.toml   |  2 +
 .../vectorless-agent/src/config.rs            | 14 +--
 .../src/orchestrator/analyze.rs               |  2 +-
 .../src/tools/orchestrator.rs                 |  8 +-
 .../vectorless-agent/src/tools/worker/cat.rs  |  6 +-
 .../vectorless-agent/src/tools/worker/cd.rs   |  6 +-
 .../vectorless-agent/src/tools/worker/find.rs |  4 +-
 .../vectorless-agent/src/tools/worker/grep.rs | 10 +--
 .../vectorless-agent/src/tools/worker/head.rs | 10 +--
 .../vectorless-agent/src/tools/worker/ls.rs   |  4 +-
 .../vectorless-agent/src/tools/worker/pwd.rs  |  6 +-
 .../vectorless-agent/src/tools/worker/wc.rs   | 10 +--
 .../vectorless-agent/src/worker/navigation.rs |  4 +-
 .../vectorless-agent/src/worker/planning.rs   |  6 +-
 .../vectorless-engine/src/types.rs            | 58 +------------
 vectorless-core/vectorless-index/Cargo.toml   |  2 +
 .../src/incremental/resolver.rs               |  2 +-
 .../src/incremental/updater.rs                |  2 +-
 .../src/parse/markdown/parser.rs              |  2 +-
 .../vectorless-index/src/parse/mod.rs         | 87 +++++++++++++++++++
 .../vectorless-index/src/parse/pdf/parser.rs  |  8 +-
 .../src/parse/toc/assigner.rs                 |  2 +-
 .../src/parse/toc/detector.rs                 |  2 +-
 .../src/parse/toc/processor.rs                |  2 +-
 .../src/parse/toc/repairer.rs                 |  2 +-
 .../src/parse/toc/structure_extractor.rs      |  2 +-
 .../src/parse/toc/verifier.rs                 |  2 +-
 .../src/pipeline/checkpoint.rs                |  2 +-
 .../vectorless-index/src/pipeline/context.rs  |  2 +-
 .../src/pipeline/orchestrator.rs              |  2 +-
 .../vectorless-index/src/stages/build.rs      |  6 +-
 .../vectorless-index/src/stages/concept.rs    |  2 +-
 .../vectorless-index/src/stages/enhance.rs    |  6 +-
 .../vectorless-index/src/stages/enrich.rs     |  2 +-
 .../vectorless-index/src/stages/navigation.rs | 14 +--
 .../vectorless-index/src/stages/optimize.rs   | 20 ++---
 .../vectorless-index/src/stages/parse.rs      | 10 +--
 .../vectorless-index/src/stages/reasoning.rs  |  2 +-
 .../vectorless-index/src/stages/split.rs      |  4 +-
 .../vectorless-index/src/stages/validate.rs   | 10 +--
 .../src/stages/verify_ingest.rs               |  2 +-
 vectorless-core/vectorless-llm/src/lib.rs     |  6 +-
 vectorless-core/vectorless-rerank/Cargo.toml  |  2 +-
 .../vectorless-rerank/src/dedup.rs            |  2 +-
 vectorless-core/vectorless-rerank/src/lib.rs  |  3 +-
 .../vectorless-rerank/src/types.rs            | 15 ++++
 .../vectorless-retrieval/src/lib.rs           |  2 +-
 .../vectorless-retrieval/src/postprocessor.rs |  2 +-
 .../vectorless-retrieval/src/types.rs         | 52 +++++++++++
 49 files changed, 264 insertions(+), 169 deletions(-)

diff --git a/vectorless-core/vectorless-agent/Cargo.toml b/vectorless-core/vectorless-agent/Cargo.toml
index 7ecfbaa5..0d7f7936 100644
--- a/vectorless-core/vectorless-agent/Cargo.toml
+++ b/vectorless-core/vectorless-agent/Cargo.toml
@@ -13,6 +13,7 @@ vectorless-document = { path = "../vectorless-document" }
 vectorless-error = { path = "../vectorless-error" }
 vectorless-llm = { path = "../vectorless-llm" }
 vectorless-query = { path = "../vectorless-query" }
+vectorless-rerank = { path = "../vectorless-rerank" }
 vectorless-scoring = { path = "../vectorless-scoring" }
 tokio = { workspace = true }
 async-trait = { workspace = true }
@@ -22,6 +23,7 @@ tracing = { workspace = true }
 futures = { workspace = true }
 chrono = { workspace = true }
 thiserror = { workspace = true }
+regex = { workspace = true }
 
 [lints]
 workspace = true
diff --git a/vectorless-core/vectorless-agent/src/config.rs b/vectorless-core/vectorless-agent/src/config.rs
index 1628d5a8..d3b784f1 100644
--- a/vectorless-core/vectorless-agent/src/config.rs
+++ b/vectorless-core/vectorless-agent/src/config.rs
@@ -102,17 +102,9 @@ impl Output {
 }
 
 /// A single piece of evidence collected during navigation.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Evidence {
-    /// Navigation path where this evidence was found (e.g., "Root/API Reference/Auth").
-    pub source_path: String,
-    /// Title of the node.
-    pub node_title: String,
-    /// Content of the node.
-    pub content: String,
-    /// Source document name (set by Orchestrator in multi-doc scenarios).
-    pub doc_name: Option<String>,
-}
+///
+/// Re-exported from [`vectorless_rerank::types::Evidence`].
+pub use vectorless_rerank::types::Evidence;
 
 /// Agent execution metrics.
 #[derive(Debug, Clone, Default, Serialize, Deserialize)]
diff --git a/vectorless-core/vectorless-agent/src/orchestrator/analyze.rs b/vectorless-core/vectorless-agent/src/orchestrator/analyze.rs
index 4af33e29..2edd9612 100644
--- a/vectorless-core/vectorless-agent/src/orchestrator/analyze.rs
+++ b/vectorless-core/vectorless-agent/src/orchestrator/analyze.rs
@@ -43,7 +43,7 @@ pub async fn analyze(
     query: &str,
     ws: &WorkspaceContext<'_>,
     state: &mut OrchestratorState,
-    emitter: &crate::agent::EventEmitter,
+    emitter: &crate::EventEmitter,
     skip_analysis: bool,
     query_plan: &QueryPlan,
     llm: &LlmClient,
diff --git a/vectorless-core/vectorless-agent/src/tools/orchestrator.rs b/vectorless-core/vectorless-agent/src/tools/orchestrator.rs
index 4f9e053e..4b3d72ac 100644
--- a/vectorless-core/vectorless-agent/src/tools/orchestrator.rs
+++ b/vectorless-core/vectorless-agent/src/tools/orchestrator.rs
@@ -4,7 +4,7 @@
 //! Orchestrator tools: ls_docs, find_cross, dispatch.
 
 use super::ToolResult;
-use crate::agent::config::WorkspaceContext;
+use crate::config::WorkspaceContext;
 
 /// Execute `ls_docs` — list all document cards.
 ///
@@ -160,13 +160,13 @@ mod tests {
     fn test_ls_docs_shows_cards() {
         let (trees, navs, ridxs) = build_workspace();
         let docs = vec![
-            crate::agent::config::DocContext {
+            crate::config::DocContext {
                 tree: &trees[0],
                 nav_index: &navs[0],
                 reasoning_index: &ridxs[0],
                 doc_name: "2024",
             },
-            crate::agent::config::DocContext {
+            crate::config::DocContext {
                 tree: &trees[1],
                 nav_index: &navs[1],
                 reasoning_index: &ridxs[1],
@@ -188,7 +188,7 @@ mod tests {
         let tree = vectorless_document::DocumentTree::new("Empty", "");
         let nav = NavigationIndex::new();
         let ridx = ReasoningIndex::default();
-        let docs = vec![crate::agent::config::DocContext {
+        let docs = vec![crate::config::DocContext {
             tree: &tree,
             nav_index: &nav,
             reasoning_index: &ridx,
diff --git a/vectorless-core/vectorless-agent/src/tools/worker/cat.rs b/vectorless-core/vectorless-agent/src/tools/worker/cat.rs
index e675bfa0..e4aeb055 100644
--- a/vectorless-core/vectorless-agent/src/tools/worker/cat.rs
+++ b/vectorless-core/vectorless-agent/src/tools/worker/cat.rs
@@ -3,9 +3,9 @@
 
 //! `cat` — read node content and collect as evidence.
 
-use crate::agent::command;
-use crate::agent::config::{DocContext, Evidence};
-use crate::agent::state::WorkerState;
+use crate::command;
+use crate::config::{DocContext, Evidence};
+use crate::state::WorkerState;
 
 use super::super::ToolResult;
 
diff --git a/vectorless-core/vectorless-agent/src/tools/worker/cd.rs b/vectorless-core/vectorless-agent/src/tools/worker/cd.rs
index be6f382f..14972abe 100644
--- a/vectorless-core/vectorless-agent/src/tools/worker/cd.rs
+++ b/vectorless-core/vectorless-agent/src/tools/worker/cd.rs
@@ -3,9 +3,9 @@
 
 //! `cd`, `cd_absolute`, `cd_up` — navigation commands.
-use crate::agent::command; -use crate::agent::config::DocContext; -use crate::agent::state::WorkerState; +use crate::command; +use crate::config::DocContext; +use crate::state::WorkerState; use super::super::ToolResult; diff --git a/vectorless-core/vectorless-agent/src/tools/worker/find.rs b/vectorless-core/vectorless-agent/src/tools/worker/find.rs index 0db5dfd2..b48b4189 100644 --- a/vectorless-core/vectorless-agent/src/tools/worker/find.rs +++ b/vectorless-core/vectorless-agent/src/tools/worker/find.rs @@ -3,7 +3,7 @@ //! `find_tree` — search for nodes by title pattern across the entire tree. -use crate::agent::config::DocContext; +use crate::config::DocContext; use super::super::ToolResult; @@ -43,7 +43,7 @@ pub fn find_tree(pattern: &str, ctx: &DocContext) -> ToolResult { #[cfg(test)] mod tests { use super::*; - use crate::agent::config::DocContext; + use crate::config::DocContext; use vectorless_document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; fn build_rich_tree() -> (DocumentTree, NavigationIndex, NodeId) { diff --git a/vectorless-core/vectorless-agent/src/tools/worker/grep.rs b/vectorless-core/vectorless-agent/src/tools/worker/grep.rs index be609c7d..b1555fd7 100644 --- a/vectorless-core/vectorless-agent/src/tools/worker/grep.rs +++ b/vectorless-core/vectorless-agent/src/tools/worker/grep.rs @@ -3,8 +3,8 @@ //! `grep` — regex search across all node content in the current subtree. -use crate::agent::config::DocContext; -use crate::agent::state::WorkerState; +use crate::config::DocContext; +use crate::state::WorkerState; use super::super::ToolResult; use super::collect_subtree; @@ -61,8 +61,8 @@ pub fn grep(pattern: &str, ctx: &DocContext, state: &WorkerState) -> ToolResult #[cfg(test)] mod tests { use super::*; - use crate::agent::config::DocContext; - use crate::agent::state::WorkerState; + use crate::config::DocContext; + use crate::state::WorkerState; use vectorless_document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; fn build_rich_tree() -> (DocumentTree, NavigationIndex, NodeId) { @@ -167,7 +167,7 @@ mod tests { let ctx = rich_ctx!(tree, nav); let mut state = WorkerState::new(root, 15); - crate::agent::tools::worker::cd::cd("Expenses", &ctx, &mut state); + crate::tools::worker::cd::cd("Expenses", &ctx, &mut state); let result = grep("revenue", &ctx, &state); assert!(result.success); assert!(result.feedback.contains("No matches")); diff --git a/vectorless-core/vectorless-agent/src/tools/worker/head.rs b/vectorless-core/vectorless-agent/src/tools/worker/head.rs index 000f0c10..764cba7a 100644 --- a/vectorless-core/vectorless-agent/src/tools/worker/head.rs +++ b/vectorless-core/vectorless-agent/src/tools/worker/head.rs @@ -3,9 +3,9 @@ //! `head` — preview first N lines of a node without collecting evidence. 
-use crate::agent::command; -use crate::agent::config::DocContext; -use crate::agent::state::WorkerState; +use crate::command; +use crate::config::DocContext; +use crate::state::WorkerState; use super::super::ToolResult; @@ -53,8 +53,8 @@ pub fn head(target: &str, lines: usize, ctx: &DocContext, state: &WorkerState) - #[cfg(test)] mod tests { use super::*; - use crate::agent::config::DocContext; - use crate::agent::state::WorkerState; + use crate::config::DocContext; + use crate::state::WorkerState; use vectorless_document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; fn build_rich_tree() -> (DocumentTree, NavigationIndex, NodeId) { diff --git a/vectorless-core/vectorless-agent/src/tools/worker/ls.rs b/vectorless-core/vectorless-agent/src/tools/worker/ls.rs index c06914ea..3c85bc18 100644 --- a/vectorless-core/vectorless-agent/src/tools/worker/ls.rs +++ b/vectorless-core/vectorless-agent/src/tools/worker/ls.rs @@ -3,8 +3,8 @@ //! `ls` — list children of the current node. -use crate::agent::config::DocContext; -use crate::agent::state::WorkerState; +use crate::config::DocContext; +use crate::state::WorkerState; use super::super::ToolResult; diff --git a/vectorless-core/vectorless-agent/src/tools/worker/pwd.rs b/vectorless-core/vectorless-agent/src/tools/worker/pwd.rs index eb28cc2e..c5ff06b9 100644 --- a/vectorless-core/vectorless-agent/src/tools/worker/pwd.rs +++ b/vectorless-core/vectorless-agent/src/tools/worker/pwd.rs @@ -3,7 +3,7 @@ //! `pwd` — show current navigation path. -use crate::agent::state::WorkerState; +use crate::state::WorkerState; use super::super::ToolResult; @@ -15,8 +15,8 @@ pub fn pwd(state: &WorkerState) -> ToolResult { #[cfg(test)] mod tests { use super::*; - use crate::agent::config::DocContext; - use crate::agent::tools::worker::cd::cd; + use crate::config::DocContext; + use crate::tools::worker::cd::cd; use vectorless_document::{ChildRoute, DocumentTree, NavigationIndex}; fn build_test_tree() -> (DocumentTree, NavigationIndex) { diff --git a/vectorless-core/vectorless-agent/src/tools/worker/wc.rs b/vectorless-core/vectorless-agent/src/tools/worker/wc.rs index 3dc19782..adc05cff 100644 --- a/vectorless-core/vectorless-agent/src/tools/worker/wc.rs +++ b/vectorless-core/vectorless-agent/src/tools/worker/wc.rs @@ -3,9 +3,9 @@ //! `wc` — show node content statistics. 
-use crate::agent::command; -use crate::agent::config::DocContext; -use crate::agent::state::WorkerState; +use crate::command; +use crate::config::DocContext; +use crate::state::WorkerState; use super::super::ToolResult; @@ -42,8 +42,8 @@ pub fn wc(target: &str, ctx: &DocContext, state: &WorkerState) -> ToolResult { #[cfg(test)] mod tests { use super::*; - use crate::agent::config::DocContext; - use crate::agent::state::WorkerState; + use crate::config::DocContext; + use crate::state::WorkerState; use vectorless_document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; fn build_rich_tree() -> (DocumentTree, NavigationIndex, NodeId) { diff --git a/vectorless-core/vectorless-agent/src/worker/navigation.rs b/vectorless-core/vectorless-agent/src/worker/navigation.rs index bb6a5812..cb6b06ba 100644 --- a/vectorless-core/vectorless-agent/src/worker/navigation.rs +++ b/vectorless-core/vectorless-agent/src/worker/navigation.rs @@ -277,8 +277,8 @@ async fn handle_replan( #[cfg(test)] mod tests { use super::*; - use crate::agent::config::DocContext; - use crate::agent::state::WorkerState; + use crate::config::DocContext; + use crate::state::WorkerState; use vectorless_document::{DocumentTree, NodeId}; fn test_ctx() -> (DocumentTree, NodeId) { diff --git a/vectorless-core/vectorless-agent/src/worker/planning.rs b/vectorless-core/vectorless-agent/src/worker/planning.rs index 37998071..80149e7a 100644 --- a/vectorless-core/vectorless-agent/src/worker/planning.rs +++ b/vectorless-core/vectorless-agent/src/worker/planning.rs @@ -517,9 +517,9 @@ fn build_sibling_hints(state: &WorkerState, ctx: &DocContext<'_>) -> String { #[cfg(test)] mod tests { use super::*; - use crate::agent::config::DocContext; - use crate::agent::config::Evidence; - use crate::agent::state::WorkerState; + use crate::config::DocContext; + use crate::config::Evidence; + use crate::state::WorkerState; use vectorless_document::{ChildRoute, NavEntry, NodeId}; use vectorless_scoring::bm25::extract_keywords; diff --git a/vectorless-core/vectorless-engine/src/types.rs b/vectorless-core/vectorless-engine/src/types.rs index df5c30f4..7cd421f5 100644 --- a/vectorless-core/vectorless-engine/src/types.rs +++ b/vectorless-core/vectorless-engine/src/types.rs @@ -253,64 +253,10 @@ impl IndexItem { } // ============================================================ -// Query Types +// Query Types — re-exported from retrieval crate // ============================================================ -/// A single piece of evidence with source attribution. -#[derive(Debug, Clone)] -pub struct EvidenceItem { - /// Section title where this evidence was found. - pub title: String, - /// Navigation path (e.g., "Root/Chapter 1/Section 1.2"). - pub path: String, - /// Raw evidence content. - pub content: String, - /// Source document name (set in multi-doc scenarios). - pub doc_name: Option, -} - -/// Query execution metrics. -#[derive(Debug, Clone, Default)] -pub struct QueryMetrics { - /// Number of LLM calls made. - pub llm_calls: u32, - /// Number of navigation rounds used. - pub rounds_used: u32, - /// Number of distinct nodes visited. - pub nodes_visited: usize, - /// Number of evidence items collected. - pub evidence_count: usize, - /// Total characters of collected evidence. - pub evidence_chars: usize, -} - -/// Confidence score of the query result (0.0–1.0). -/// -/// Derived from LLM evaluate() — whether evidence was deemed sufficient -/// and how many replan rounds were needed. -pub type Confidence = f32; - -/// A single document's query result. 
-#[derive(Debug, Clone)] -pub struct QueryResultItem { - /// The document ID. - pub doc_id: String, - - /// Matching node IDs (navigation paths). - pub node_ids: Vec, - - /// Synthesized answer or raw evidence content. - pub content: String, - - /// Evidence items that contributed to this result, with source attribution. - pub evidence: Vec, - - /// Execution metrics for this query. - pub metrics: Option, - - /// Confidence score (0.0–1.0) — derived from LLM evaluation. - pub confidence: Confidence, -} +pub use vectorless_retrieval::{Confidence, EvidenceItem, QueryMetrics, QueryResultItem}; /// Result of a document query. /// diff --git a/vectorless-core/vectorless-index/Cargo.toml b/vectorless-core/vectorless-index/Cargo.toml index 132a3b10..587b54b6 100644 --- a/vectorless-core/vectorless-index/Cargo.toml +++ b/vectorless-core/vectorless-index/Cargo.toml @@ -31,6 +31,8 @@ chrono = { workspace = true } rand = { workspace = true } futures = { workspace = true } base64 = { workspace = true } +sha2 = { workspace = true } +tempfile = { workspace = true } [lints] workspace = true diff --git a/vectorless-core/vectorless-index/src/incremental/resolver.rs b/vectorless-core/vectorless-index/src/incremental/resolver.rs index c87f571f..674c5f2c 100644 --- a/vectorless-core/vectorless-index/src/incremental/resolver.rs +++ b/vectorless-core/vectorless-index/src/incremental/resolver.rs @@ -12,7 +12,7 @@ use tracing::info; use vectorless_document::DocumentFormat; use vectorless_document::DocumentTree; -use crate::index::config::PipelineOptions; +use crate::config::PipelineOptions; use vectorless_storage::PersistedDocument; use vectorless_utils::fingerprint::Fingerprint; diff --git a/vectorless-core/vectorless-index/src/incremental/updater.rs b/vectorless-core/vectorless-index/src/incremental/updater.rs index 3cb838de..8914c394 100644 --- a/vectorless-core/vectorless-index/src/incremental/updater.rs +++ b/vectorless-core/vectorless-index/src/incremental/updater.rs @@ -7,7 +7,7 @@ use tracing::info; use vectorless_document::{DocumentTree, NodeId}; use vectorless_error::Result; -use crate::index::parse::RawNode; +use crate::parse::RawNode; use super::detector::ChangeDetector; diff --git a/vectorless-core/vectorless-index/src/parse/markdown/parser.rs b/vectorless-core/vectorless-index/src/parse/markdown/parser.rs index a511e50c..faffe369 100644 --- a/vectorless-core/vectorless-index/src/parse/markdown/parser.rs +++ b/vectorless-core/vectorless-index/src/parse/markdown/parser.rs @@ -7,7 +7,7 @@ use pulldown_cmark::Options; use std::path::Path; use vectorless_error::Result; -use crate::index::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode}; +use crate::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode}; use vectorless_utils::estimate_tokens; use super::config::MarkdownConfig; diff --git a/vectorless-core/vectorless-index/src/parse/mod.rs b/vectorless-core/vectorless-index/src/parse/mod.rs index e69de29b..34d39f5c 100644 --- a/vectorless-core/vectorless-index/src/parse/mod.rs +++ b/vectorless-core/vectorless-index/src/parse/mod.rs @@ -0,0 +1,87 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Document parsing for the index pipeline. +//! +//! Supports Markdown and PDF formats. Parsing is dispatched directly +//! via `match` — no trait objects or registry needed. 
+
+pub mod markdown;
+pub mod pdf;
+pub mod toc;
+pub mod types;
+
+// Re-export core types at module level
+pub use types::{DocumentFormat, DocumentMeta, ParseResult, RawNode};
+
+use std::path::Path;
+
+use vectorless_error::Result;
+use crate::parse::markdown::MarkdownParser;
+use vectorless_llm::LlmClient;
+
+/// Parse a string content document.
+pub async fn parse_content(
+    content: &str,
+    format: DocumentFormat,
+    _llm_client: Option<LlmClient>,
+) -> Result<ParseResult> {
+    match format {
+        DocumentFormat::Markdown => {
+            let parser = MarkdownParser::new();
+            parser.parse(content).await
+        }
+        DocumentFormat::Pdf => Err(vectorless_error::Error::Parse(
+            "PDF requires bytes, not string content".to_string(),
+        )),
+    }
+}
+
+/// Parse a file.
+pub async fn parse_file(
+    path: &Path,
+    format: DocumentFormat,
+    llm_client: Option<LlmClient>,
+) -> Result<ParseResult> {
+    match format {
+        DocumentFormat::Markdown => {
+            let parser = MarkdownParser::new();
+            parser.parse_file(path).await
+        }
+        DocumentFormat::Pdf => {
+            let parser = match llm_client {
+                Some(client) => pdf::PdfParser::with_llm_client(client),
+                None => pdf::PdfParser::new(),
+            };
+            parser.parse_file(path).await
+        }
+    }
+}
+
+/// Parse binary data.
+pub async fn parse_bytes(
+    bytes: &[u8],
+    format: DocumentFormat,
+    llm_client: Option<LlmClient>,
+) -> Result<ParseResult> {
+    match format {
+        DocumentFormat::Markdown => {
+            let content = std::str::from_utf8(bytes)
+                .map_err(|e| vectorless_error::Error::Parse(format!("Invalid UTF-8 content: {}", e)))?;
+            let parser = MarkdownParser::new();
+            parser.parse(content).await
+        }
+        DocumentFormat::Pdf => {
+            let parser = match llm_client {
+                Some(client) => pdf::PdfParser::with_llm_client(client),
+                None => pdf::PdfParser::new(),
+            };
+            parser.parse_bytes_async(bytes, None).await
+        }
+    }
+}
+
+/// Detect document format from a file extension.
+pub fn format_from_extension(ext: &str) -> Option<DocumentFormat> {
+    DocumentFormat::from_extension(ext)
+}
diff --git a/vectorless-core/vectorless-index/src/parse/pdf/parser.rs b/vectorless-core/vectorless-index/src/parse/pdf/parser.rs
index af5f2478..72b9fc6f 100644
--- a/vectorless-core/vectorless-index/src/parse/pdf/parser.rs
+++ b/vectorless-core/vectorless-index/src/parse/pdf/parser.rs
@@ -14,11 +14,11 @@ use tracing::{info, warn};
 use vectorless_error::Error;
 use vectorless_error::Result;
-use crate::index::parse::toc::TocProcessor;
+use crate::parse::toc::TocProcessor;
 use vectorless_llm::LlmClient;
 use super::types::{PdfMetadata, PdfPage, PdfParseResult};
-use crate::index::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode};
+use crate::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode};
 /// PDF document parser.
 pub struct PdfParser {
@@ -192,7 +192,7 @@ impl PdfParser {
     /// Convert TOC entries to RawNodes.
     fn toc_entries_to_raw_nodes(
         &self,
-        entries: &[crate::index::parse::toc::TocEntry],
+        entries: &[crate::parse::toc::TocEntry],
         pages: &[PdfPage],
     ) -> Vec<RawNode> {
         let mut nodes = Vec::new();
@@ -217,7 +217,7 @@ impl PdfParser {
     /// Get content for a TOC entry from pages.
fn get_content_for_entry( &self, - entry: &crate::index::parse::toc::TocEntry, + entry: &crate::parse::toc::TocEntry, pages: &[PdfPage], ) -> String { let start_page = entry.physical_page.unwrap_or(1); diff --git a/vectorless-core/vectorless-index/src/parse/toc/assigner.rs b/vectorless-core/vectorless-index/src/parse/toc/assigner.rs index 5a298031..172ffec2 100644 --- a/vectorless-core/vectorless-index/src/parse/toc/assigner.rs +++ b/vectorless-core/vectorless-index/src/parse/toc/assigner.rs @@ -8,7 +8,7 @@ use std::collections::HashMap; use tracing::{debug, info}; use vectorless_error::Result; -use crate::index::parse::pdf::PdfPage; +use crate::parse::pdf::PdfPage; use vectorless_llm::config::LlmConfig; use super::types::{PageOffset, TocEntry}; diff --git a/vectorless-core/vectorless-index/src/parse/toc/detector.rs b/vectorless-core/vectorless-index/src/parse/toc/detector.rs index 95e84431..c9960253 100644 --- a/vectorless-core/vectorless-index/src/parse/toc/detector.rs +++ b/vectorless-core/vectorless-index/src/parse/toc/detector.rs @@ -10,7 +10,7 @@ use vectorless_error::Result; use vectorless_llm::config::LlmConfig; use super::types::TocDetection; -use crate::index::parse::pdf::PdfPage; +use crate::parse::pdf::PdfPage; use vectorless_llm::LlmClient; /// TOC detector configuration. diff --git a/vectorless-core/vectorless-index/src/parse/toc/processor.rs b/vectorless-core/vectorless-index/src/parse/toc/processor.rs index 1cc43d6c..e4d93fb1 100644 --- a/vectorless-core/vectorless-index/src/parse/toc/processor.rs +++ b/vectorless-core/vectorless-index/src/parse/toc/processor.rs @@ -11,7 +11,7 @@ use futures::stream::{self, StreamExt}; use tracing::{debug, info, warn}; use vectorless_error::Result; -use crate::index::parse::pdf::PdfPage; +use crate::parse::pdf::PdfPage; use vectorless_llm::LlmClient; use super::assigner::{PageAssigner, PageAssignerConfig}; diff --git a/vectorless-core/vectorless-index/src/parse/toc/repairer.rs b/vectorless-core/vectorless-index/src/parse/toc/repairer.rs index f8016657..5a5a7a92 100644 --- a/vectorless-core/vectorless-index/src/parse/toc/repairer.rs +++ b/vectorless-core/vectorless-index/src/parse/toc/repairer.rs @@ -7,7 +7,7 @@ use futures::stream::{self, StreamExt}; use tracing::{debug, info}; use vectorless_error::Result; -use crate::index::parse::pdf::PdfPage; +use crate::parse::pdf::PdfPage; use vectorless_llm::config::LlmConfig; use super::types::{TocEntry, VerificationError, VerificationReport}; diff --git a/vectorless-core/vectorless-index/src/parse/toc/structure_extractor.rs b/vectorless-core/vectorless-index/src/parse/toc/structure_extractor.rs index aedd9b36..374f01ca 100644 --- a/vectorless-core/vectorless-index/src/parse/toc/structure_extractor.rs +++ b/vectorless-core/vectorless-index/src/parse/toc/structure_extractor.rs @@ -11,7 +11,7 @@ use futures::stream::{self, StreamExt}; use tracing::{debug, info, warn}; use vectorless_error::Result; -use crate::index::parse::pdf::PdfPage; +use crate::parse::pdf::PdfPage; use vectorless_llm::config::LlmConfig; use super::types::TocEntry; diff --git a/vectorless-core/vectorless-index/src/parse/toc/verifier.rs b/vectorless-core/vectorless-index/src/parse/toc/verifier.rs index 41d6ce29..11981fd5 100644 --- a/vectorless-core/vectorless-index/src/parse/toc/verifier.rs +++ b/vectorless-core/vectorless-index/src/parse/toc/verifier.rs @@ -8,7 +8,7 @@ use rand::seq::SliceRandom; use tracing::{debug, info}; use vectorless_error::Result; -use crate::index::parse::pdf::PdfPage; +use crate::parse::pdf::PdfPage; use 
vectorless_llm::config::LlmConfig; use super::types::{ErrorType, TocEntry, VerificationError, VerificationReport}; diff --git a/vectorless-core/vectorless-index/src/pipeline/checkpoint.rs b/vectorless-core/vectorless-index/src/pipeline/checkpoint.rs index d192679f..e2d9133b 100644 --- a/vectorless-core/vectorless-index/src/pipeline/checkpoint.rs +++ b/vectorless-core/vectorless-index/src/pipeline/checkpoint.rs @@ -14,7 +14,7 @@ use serde::{Deserialize, Serialize}; use tracing::{info, warn}; use vectorless_document::DocumentTree; -use crate::index::parse::RawNode; +use crate::parse::RawNode; use super::metrics::IndexMetrics; diff --git a/vectorless-core/vectorless-index/src/pipeline/context.rs b/vectorless-core/vectorless-index/src/pipeline/context.rs index 36360d81..eb7058e1 100644 --- a/vectorless-core/vectorless-index/src/pipeline/context.rs +++ b/vectorless-core/vectorless-index/src/pipeline/context.rs @@ -7,7 +7,7 @@ use std::collections::HashMap; use std::path::PathBuf; use vectorless_document::{Concept, DocumentTree, NavigationIndex, NodeId, ReasoningIndex}; -use crate::index::parse::{DocumentFormat, RawNode}; +use crate::parse::{DocumentFormat, RawNode}; use vectorless_llm::LlmClient; use super::super::{PipelineOptions, SummaryStrategy}; diff --git a/vectorless-core/vectorless-index/src/pipeline/orchestrator.rs b/vectorless-core/vectorless-index/src/pipeline/orchestrator.rs index 3d830748..9421d2c9 100644 --- a/vectorless-core/vectorless-index/src/pipeline/orchestrator.rs +++ b/vectorless-core/vectorless-index/src/pipeline/orchestrator.rs @@ -869,7 +869,7 @@ struct ParallelEntry { /// Failure policy (captured before swap). policy: FailurePolicy, /// Access pattern (captured before swap). - access: crate::index::stages::AccessPattern, + access: crate::stages::AccessPattern, } /// Builder for creating custom stage configurations. diff --git a/vectorless-core/vectorless-index/src/stages/build.rs b/vectorless-core/vectorless-index/src/stages/build.rs index 3557da8d..98549ee2 100644 --- a/vectorless-core/vectorless-index/src/stages/build.rs +++ b/vectorless-core/vectorless-index/src/stages/build.rs @@ -9,12 +9,12 @@ use tracing::{debug, info}; use vectorless_document::{DocumentTree, NodeId}; use vectorless_error::Result; -use crate::index::parse::RawNode; +use crate::parse::RawNode; use vectorless_utils::estimate_tokens; use super::{IndexStage, StageResult}; -use crate::index::ThinningConfig; -use crate::index::pipeline::IndexContext; +use crate::ThinningConfig; +use crate::pipeline::IndexContext; /// Build stage - constructs a tree from raw nodes. pub struct BuildStage; diff --git a/vectorless-core/vectorless-index/src/stages/concept.rs b/vectorless-core/vectorless-index/src/stages/concept.rs index dfca16e4..35bcc270 100644 --- a/vectorless-core/vectorless-index/src/stages/concept.rs +++ b/vectorless-core/vectorless-index/src/stages/concept.rs @@ -14,7 +14,7 @@ use vectorless_llm::LlmClient; use super::async_trait; use super::{AccessPattern, IndexStage, StageResult}; -use crate::index::pipeline::IndexContext; +use crate::pipeline::IndexContext; /// Maximum number of top keywords to send to the LLM for concept extraction. 
const MAX_TOPICS: usize = 20; diff --git a/vectorless-core/vectorless-index/src/stages/enhance.rs b/vectorless-core/vectorless-index/src/stages/enhance.rs index f6af051f..2d70e7a9 100644 --- a/vectorless-core/vectorless-index/src/stages/enhance.rs +++ b/vectorless-core/vectorless-index/src/stages/enhance.rs @@ -11,14 +11,14 @@ use tracing::{debug, info, warn}; use vectorless_document::NodeId; use vectorless_error::Result; -use crate::index::incremental; +use crate::incremental; use vectorless_llm::LlmClient; use vectorless_llm::memo::{MemoKey, MemoStore}; use vectorless_utils::fingerprint::Fingerprint; use super::{IndexStage, StageResult}; -use crate::index::pipeline::{FailurePolicy, IndexContext, StageRetryConfig}; -use crate::index::summary::{LlmSummaryGenerator, SummaryGenerator, SummaryStrategy}; +use crate::pipeline::{FailurePolicy, IndexContext, StageRetryConfig}; +use crate::summary::{LlmSummaryGenerator, SummaryGenerator, SummaryStrategy}; /// A node that needs LLM summary generation. struct PendingNode { diff --git a/vectorless-core/vectorless-index/src/stages/enrich.rs b/vectorless-core/vectorless-index/src/stages/enrich.rs index 8b743ea8..e14611e2 100644 --- a/vectorless-core/vectorless-index/src/stages/enrich.rs +++ b/vectorless-core/vectorless-index/src/stages/enrich.rs @@ -11,7 +11,7 @@ use vectorless_document::{DocumentTree, NodeId, ReferenceExtractor, TocView}; use vectorless_error::Result; use super::{AccessPattern, IndexStage, StageResult}; -use crate::index::pipeline::IndexContext; +use crate::pipeline::IndexContext; /// Enrich stage - adds metadata to the tree. pub struct EnrichStage; diff --git a/vectorless-core/vectorless-index/src/stages/navigation.rs b/vectorless-core/vectorless-index/src/stages/navigation.rs index 57abc926..8c25a411 100644 --- a/vectorless-core/vectorless-index/src/stages/navigation.rs +++ b/vectorless-core/vectorless-index/src/stages/navigation.rs @@ -22,7 +22,7 @@ use vectorless_error::Result; use super::async_trait; use super::{AccessPattern, IndexStage, StageResult}; -use crate::index::pipeline::IndexContext; +use crate::pipeline::IndexContext; /// Navigation Index Stage — builds the Agent navigation index. 
/// @@ -392,8 +392,8 @@ mod tests { // Build context with the tree let mut ctx = IndexContext::new( - crate::index::pipeline::IndexInput::content("test"), - crate::index::config::PipelineOptions::default(), + crate::pipeline::IndexInput::content("test"), + crate::config::PipelineOptions::default(), ); ctx.tree = Some(tree); @@ -440,8 +440,8 @@ mod tests { let tree = DocumentTree::new("Root", "content"); let mut ctx = IndexContext::new( - crate::index::pipeline::IndexInput::content("test"), - crate::index::config::PipelineOptions::default(), + crate::pipeline::IndexInput::content("test"), + crate::config::PipelineOptions::default(), ); ctx.tree = Some(tree); @@ -459,8 +459,8 @@ mod tests { #[tokio::test] async fn test_execute_no_tree() { let ctx = IndexContext::new( - crate::index::pipeline::IndexInput::content("test"), - crate::index::config::PipelineOptions::default(), + crate::pipeline::IndexInput::content("test"), + crate::config::PipelineOptions::default(), ); // ctx.tree is None diff --git a/vectorless-core/vectorless-index/src/stages/optimize.rs b/vectorless-core/vectorless-index/src/stages/optimize.rs index 33244ddf..4b430bca 100644 --- a/vectorless-core/vectorless-index/src/stages/optimize.rs +++ b/vectorless-core/vectorless-index/src/stages/optimize.rs @@ -9,7 +9,7 @@ use tracing::{debug, info}; use vectorless_document::NodeId; use vectorless_error::Result; -use crate::index::pipeline::IndexContext; +use crate::pipeline::IndexContext; use super::{IndexStage, StageResult}; @@ -30,7 +30,7 @@ impl OptimizeStage { fn merge_small_leaves( tree: &mut vectorless_document::DocumentTree, min_tokens: usize, - metrics: &mut crate::index::IndexMetrics, + metrics: &mut crate::IndexMetrics, ) -> usize { let mut merged_count = 0; @@ -243,9 +243,9 @@ impl IndexStage for OptimizeStage { mod tests { use super::*; use vectorless_document::DocumentTree; - use crate::index::PipelineOptions; - use crate::index::pipeline::IndexContext; - use crate::index::pipeline::IndexInput; + use crate::PipelineOptions; + use crate::pipeline::IndexContext; + use crate::pipeline::IndexInput; /// Create a tree with small leaf children under root for merge tests. 
/// @@ -286,7 +286,7 @@ mod tests { fn test_merge_small_leaves_merges_adjacent_pair() { let mut tree = make_merge_test_tree(); let root = tree.root(); - let mut metrics = crate::index::pipeline::IndexMetrics::new(); + let mut metrics = crate::pipeline::IndexMetrics::new(); // Threshold 100: Leaf A (50) and Leaf B (30) should merge let merged = OptimizeStage::merge_small_leaves(&mut tree, 100, &mut metrics); @@ -307,7 +307,7 @@ mod tests { #[test] fn test_merge_small_leaves_nothing_above_threshold() { let mut tree = make_merge_test_tree(); - let mut metrics = crate::index::pipeline::IndexMetrics::new(); + let mut metrics = crate::pipeline::IndexMetrics::new(); // Threshold 10: all leaves are above this, nothing merges let merged = OptimizeStage::merge_small_leaves(&mut tree, 10, &mut metrics); @@ -327,7 +327,7 @@ mod tests { n.token_count = Some(5); } - let mut metrics = crate::index::pipeline::IndexMetrics::new(); + let mut metrics = crate::pipeline::IndexMetrics::new(); let _ = OptimizeStage::merge_small_leaves(&mut tree, 100, &mut metrics); // Leaf A should now contain both contents with heading prefix @@ -355,7 +355,7 @@ mod tests { n.token_count = Some(5); } - let mut metrics = crate::index::pipeline::IndexMetrics::new(); + let mut metrics = crate::pipeline::IndexMetrics::new(); let merged = OptimizeStage::merge_small_leaves(&mut tree, 100, &mut metrics); // Section is non-leaf, only Leaf is a leaf — no adjacent pair of leaves @@ -447,7 +447,7 @@ mod tests { #[test] fn test_merge_small_leaves_empty_tree() { let mut tree = DocumentTree::new("Root", ""); - let mut metrics = crate::index::pipeline::IndexMetrics::new(); + let mut metrics = crate::pipeline::IndexMetrics::new(); let merged = OptimizeStage::merge_small_leaves(&mut tree, 100, &mut metrics); assert_eq!(merged, 0, "Root with no children should merge nothing"); diff --git a/vectorless-core/vectorless-index/src/stages/parse.rs b/vectorless-core/vectorless-index/src/stages/parse.rs index 1b48e1da..bba8c8b5 100644 --- a/vectorless-core/vectorless-index/src/stages/parse.rs +++ b/vectorless-core/vectorless-index/src/stages/parse.rs @@ -11,8 +11,8 @@ use vectorless_document::DocumentFormat; use vectorless_error::Result; use super::{IndexStage, StageResult}; -use crate::index::IndexMode; -use crate::index::pipeline::{IndexContext, IndexInput}; +use crate::IndexMode; +use crate::pipeline::{IndexContext, IndexInput}; /// Parse stage - extracts raw nodes from documents. pub struct ParseStage { @@ -99,7 +99,7 @@ impl IndexStage for ParseStage { debug!("[parse] Reading file: {:?}", ctx.source_path); // Parse directly - crate::index::parse::parse_file(&path, format, self.llm_client.clone()).await? + crate::parse::parse_file(&path, format, self.llm_client.clone()).await? } IndexInput::Content { content, @@ -112,7 +112,7 @@ impl IndexStage for ParseStage { debug!("[parse] Parsing inline content ({} chars)", content.len()); // Parse content directly - crate::index::parse::parse_content(content, *format, self.llm_client.clone()) + crate::parse::parse_content(content, *format, self.llm_client.clone()) .await? } IndexInput::Bytes { data, name, format } => { @@ -122,7 +122,7 @@ impl IndexStage for ParseStage { debug!("[parse] Parsing bytes ({} bytes)", data.len()); // Parse bytes - crate::index::parse::parse_bytes(data, *format, self.llm_client.clone()).await? + crate::parse::parse_bytes(data, *format, self.llm_client.clone()).await? 
} }; diff --git a/vectorless-core/vectorless-index/src/stages/reasoning.rs b/vectorless-core/vectorless-index/src/stages/reasoning.rs index 1d41a159..da11d008 100644 --- a/vectorless-core/vectorless-index/src/stages/reasoning.rs +++ b/vectorless-core/vectorless-index/src/stages/reasoning.rs @@ -21,7 +21,7 @@ use vectorless_scoring::extract_keywords; use super::async_trait; use super::{AccessPattern, IndexStage, StageResult}; -use crate::index::pipeline::IndexContext; +use crate::pipeline::IndexContext; /// Reasoning Index Stage - builds a pre-computed reasoning index from the document tree. /// diff --git a/vectorless-core/vectorless-index/src/stages/split.rs b/vectorless-core/vectorless-index/src/stages/split.rs index 68b5077f..c729214b 100644 --- a/vectorless-core/vectorless-index/src/stages/split.rs +++ b/vectorless-core/vectorless-index/src/stages/split.rs @@ -11,8 +11,8 @@ use vectorless_error::Result; use vectorless_utils::estimate_tokens; use super::{AccessPattern, IndexStage, StageResult, async_trait}; -use crate::index::config::SplitConfig; -use crate::index::pipeline::IndexContext; +use crate::config::SplitConfig; +use crate::pipeline::IndexContext; /// Split stage — breaks oversized leaf nodes into smaller children. /// diff --git a/vectorless-core/vectorless-index/src/stages/validate.rs b/vectorless-core/vectorless-index/src/stages/validate.rs index b4dbc1d4..82df7106 100644 --- a/vectorless-core/vectorless-index/src/stages/validate.rs +++ b/vectorless-core/vectorless-index/src/stages/validate.rs @@ -10,7 +10,7 @@ use tracing::{debug, info, warn}; use vectorless_error::Result; use super::{AccessPattern, IndexStage, StageResult, async_trait}; -use crate::index::pipeline::IndexContext; +use crate::pipeline::IndexContext; /// Maximum allowed tree depth. const MAX_DEPTH: usize = 20; @@ -299,8 +299,8 @@ mod tests { use vectorless_document::DocumentTree; fn make_context_with_tree(tree: DocumentTree) -> IndexContext { - let input = crate::index::IndexInput::content("test"); - let options = crate::index::config::PipelineOptions::default(); + let input = crate::IndexInput::content("test"); + let options = crate::config::PipelineOptions::default(); let mut ctx = IndexContext::new(input, options); ctx.tree = Some(tree); ctx @@ -352,8 +352,8 @@ mod tests { #[test] fn test_validate_no_tree_error() { - let input = crate::index::IndexInput::content("test"); - let options = crate::index::config::PipelineOptions::default(); + let input = crate::IndexInput::content("test"); + let options = crate::config::PipelineOptions::default(); let ctx = IndexContext::new(input, options); let stage = ValidateStage::new(); diff --git a/vectorless-core/vectorless-index/src/stages/verify_ingest.rs b/vectorless-core/vectorless-index/src/stages/verify_ingest.rs index db5f0051..b20a25fd 100644 --- a/vectorless-core/vectorless-index/src/stages/verify_ingest.rs +++ b/vectorless-core/vectorless-index/src/stages/verify_ingest.rs @@ -7,7 +7,7 @@ use tracing::{info, warn}; use super::{AccessPattern, IndexStage}; use vectorless_error::{Error, Result}; -use crate::index::pipeline::{IndexContext, StageResult}; +use crate::pipeline::{IndexContext, StageResult}; use super::async_trait; /// Verification stage — ensures ingest produced reliable output. diff --git a/vectorless-core/vectorless-llm/src/lib.rs b/vectorless-core/vectorless-llm/src/lib.rs index e61c6eb7..8bb01c3b 100644 --- a/vectorless-core/vectorless-llm/src/lib.rs +++ b/vectorless-core/vectorless-llm/src/lib.rs @@ -29,13 +29,13 @@ //! 
```
 mod client;
-pub(crate) mod config;
+pub mod config;
 mod error;
 mod executor;
 mod fallback;
-pub(crate) mod memo;
+pub mod memo;
 mod pool;
-pub(crate) mod throttle;
+pub mod throttle;
 pub use client::LlmClient;
 pub use error::LlmResult;
diff --git a/vectorless-core/vectorless-rerank/Cargo.toml b/vectorless-core/vectorless-rerank/Cargo.toml
index da09414d..f9a9112c 100644
--- a/vectorless-core/vectorless-rerank/Cargo.toml
+++ b/vectorless-core/vectorless-rerank/Cargo.toml
@@ -10,8 +10,8 @@ homepage.workspace = true
 [dependencies]
 serde = { workspace = true }
+serde_json = { workspace = true }
 tracing = { workspace = true }
-vectorless-agent = { path = "../vectorless-agent" }
 vectorless-error = { path = "../vectorless-error" }
 vectorless-query = { path = "../vectorless-query" }
diff --git a/vectorless-core/vectorless-rerank/src/dedup.rs b/vectorless-core/vectorless-rerank/src/dedup.rs
index 0b09e6d1..713c6088 100644
--- a/vectorless-core/vectorless-rerank/src/dedup.rs
+++ b/vectorless-core/vectorless-rerank/src/dedup.rs
@@ -5,7 +5,7 @@ use std::collections::HashSet;
-use vectorless_agent::Evidence;
+use crate::types::Evidence;
 /// Minimum characters for an evidence item to be considered meaningful.
 const MIN_EVIDENCE_CHARS: usize = 50;
diff --git a/vectorless-core/vectorless-rerank/src/lib.rs b/vectorless-core/vectorless-rerank/src/lib.rs
index 64e4474a..efab49ee 100644
--- a/vectorless-core/vectorless-rerank/src/lib.rs
+++ b/vectorless-core/vectorless-rerank/src/lib.rs
@@ -21,9 +21,8 @@ pub mod types;
 use tracing::info;
-use vectorless_agent::Evidence;
 use vectorless_query::QueryIntent;
-use types::RerankOutput;
+use types::{Evidence, RerankOutput};
 /// Process agent output through the rerank pipeline.
 ///
diff --git a/vectorless-core/vectorless-rerank/src/types.rs b/vectorless-core/vectorless-rerank/src/types.rs
index 4b42f351..73d19ce9 100644
--- a/vectorless-core/vectorless-rerank/src/types.rs
+++ b/vectorless-core/vectorless-rerank/src/types.rs
@@ -3,6 +3,21 @@
 //! Rerank result types.
+use serde::{Deserialize, Serialize};
+
+/// A single piece of evidence collected during navigation.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct Evidence {
+    /// Navigation path where this evidence was found (e.g., "Root/API Reference/Auth").
+    pub source_path: String,
+    /// Title of the node.
+    pub node_title: String,
+    /// Content of the node.
+    pub content: String,
+    /// Source document name (set by Orchestrator in multi-doc scenarios).
+    pub doc_name: Option<String>,
+}
+
 /// Output from the rerank pipeline.
 pub struct RerankOutput {
     /// Synthesized answer.
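Note: with `Evidence` now owned by vectorless-rerank and deriving `Serialize`/`Deserialize` (serde_json is added above as a dependency), the struct can round-trip through JSON. A minimal sketch — the field values are illustrative, and `doc_name` is taken to be `Option<String>` per the struct's doc comment:

```rust
use vectorless_rerank::types::Evidence;

fn main() -> Result<(), serde_json::Error> {
    // Illustrative values; the field names match the struct introduced above.
    let ev = Evidence {
        source_path: "Root/API Reference/Auth".to_string(),
        node_title: "Auth".to_string(),
        content: "Tokens are validated on every request.".to_string(),
        doc_name: Some("api-guide".to_string()),
    };
    let json = serde_json::to_string(&ev)?;
    let back: Evidence = serde_json::from_str(&json)?;
    assert_eq!(back.source_path, ev.source_path);
    Ok(())
}
```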
diff --git a/vectorless-core/vectorless-retrieval/src/lib.rs b/vectorless-core/vectorless-retrieval/src/lib.rs
index bab04971..55d7c936 100644
--- a/vectorless-core/vectorless-retrieval/src/lib.rs
+++ b/vectorless-core/vectorless-retrieval/src/lib.rs
@@ -25,4 +25,4 @@ pub mod stream;
 mod types;
 pub use stream::{RetrieveEvent, RetrieveEventReceiver};
-pub use types::{ReasoningChain, RetrieveResponse, SufficiencyLevel};
+pub use types::{Confidence, EvidenceItem, QueryMetrics, QueryResultItem, ReasoningChain, RetrieveResponse, SufficiencyLevel};
diff --git a/vectorless-core/vectorless-retrieval/src/postprocessor.rs b/vectorless-core/vectorless-retrieval/src/postprocessor.rs
index 77a95a87..08d55644 100644
--- a/vectorless-core/vectorless-retrieval/src/postprocessor.rs
+++ b/vectorless-core/vectorless-retrieval/src/postprocessor.rs
@@ -10,7 +10,7 @@ use std::collections::BTreeMap;
 use vectorless_agent::config::{Evidence, Metrics, Output};
-use vectorless_engine::{Confidence, EvidenceItem, QueryMetrics, QueryResultItem};
+use crate::types::{Confidence, EvidenceItem, QueryMetrics, QueryResultItem};
 /// Convert agent output to query result items, split by document.
 ///
diff --git a/vectorless-core/vectorless-retrieval/src/types.rs b/vectorless-core/vectorless-retrieval/src/types.rs
index 3fee208e..ca15ee55 100644
--- a/vectorless-core/vectorless-retrieval/src/types.rs
+++ b/vectorless-core/vectorless-retrieval/src/types.rs
@@ -191,3 +191,55 @@ pub struct ReasoningStep {
     /// Human-readable explanation of the decision.
     pub reasoning: String,
 }
+
+// ============================================================
+// Query result types (used by engine)
+// ============================================================
+
+/// Confidence score of the query result (0.0–1.0).
+pub type Confidence = f32;
+
+/// A single piece of evidence with source attribution.
+#[derive(Debug, Clone)]
+pub struct EvidenceItem {
+    /// Section title where this evidence was found.
+    pub title: String,
+    /// Navigation path (e.g., "Root/Chapter 1/Section 1.2").
+    pub path: String,
+    /// Raw evidence content.
+    pub content: String,
+    /// Source document name (set in multi-doc scenarios).
+    pub doc_name: Option<String>,
+}
+
+/// Query execution metrics.
+#[derive(Debug, Clone, Default)]
+pub struct QueryMetrics {
+    /// Number of LLM calls made.
+    pub llm_calls: u32,
+    /// Number of navigation rounds used.
+    pub rounds_used: u32,
+    /// Number of distinct nodes visited.
+    pub nodes_visited: usize,
+    /// Number of evidence items collected.
+    pub evidence_count: usize,
+    /// Total characters of collected evidence.
+    pub evidence_chars: usize,
+}
+
+/// A single document's query result.
+#[derive(Debug, Clone)]
+pub struct QueryResultItem {
+    /// The document ID.
+    pub doc_id: String,
+    /// Matching node IDs (navigation paths).
+    pub node_ids: Vec<String>,
+    /// Synthesized answer or raw evidence content.
+    pub content: String,
+    /// Evidence items that contributed to this result, with source attribution.
+    pub evidence: Vec<EvidenceItem>,
+    /// Execution metrics for this query.
+    pub metrics: Option<QueryMetrics>,
+    /// Confidence score (0.0–1.0) — derived from LLM evaluation.
+    pub confidence: Confidence,
+}

From a1f837321479760c1719431e455dfaa542cc37ad Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Thu, 23 Apr 2026 09:13:35 +0800
Subject: [PATCH 15/28] refactor(vectorless-llm): update import paths and add dev dependency

- Move tempfile to dev-dependencies in Cargo.toml
- Update import path from crate::llm::throttle to crate::throttle in client.rs and executor.rs test modules
- Fixes incorrect module path references in test code

---
 vectorless-core/vectorless-llm/Cargo.toml      | 3 +++
 vectorless-core/vectorless-llm/src/client.rs   | 2 +-
 vectorless-core/vectorless-llm/src/executor.rs | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/vectorless-core/vectorless-llm/Cargo.toml b/vectorless-core/vectorless-llm/Cargo.toml
index a5f5b3a0..8aa02162 100644
--- a/vectorless-core/vectorless-llm/Cargo.toml
+++ b/vectorless-core/vectorless-llm/Cargo.toml
@@ -28,5 +28,8 @@ uuid = { workspace = true }
 rand = { workspace = true }
 base64 = { workspace = true }
+[dev-dependencies]
+tempfile = { workspace = true }
+
 [lints]
 workspace = true
diff --git a/vectorless-core/vectorless-llm/src/client.rs b/vectorless-core/vectorless-llm/src/client.rs
index 56dcae7a..0356f7bb 100644
--- a/vectorless-core/vectorless-llm/src/client.rs
+++ b/vectorless-core/vectorless-llm/src/client.rs
@@ -355,7 +355,7 @@ mod tests {
     #[test]
     fn test_client_with_concurrency() {
-        use crate::llm::throttle::ConcurrencyConfig;
+        use crate::throttle::ConcurrencyConfig;
         let controller = ConcurrencyController::new(ConcurrencyConfig::conservative());
         let client = LlmClient::for_model("gpt-4o-mini").with_concurrency(controller);
diff --git a/vectorless-core/vectorless-llm/src/executor.rs b/vectorless-core/vectorless-llm/src/executor.rs
index d8bf02c9..37790fc8 100644
--- a/vectorless-core/vectorless-llm/src/executor.rs
+++ b/vectorless-core/vectorless-llm/src/executor.rs
@@ -518,7 +518,7 @@ mod tests {
     #[test]
     fn test_executor_with_throttle() {
-        use crate::llm::throttle::ConcurrencyConfig;
+        use crate::throttle::ConcurrencyConfig;
         let controller = ConcurrencyController::new(ConcurrencyConfig::conservative());
         let executor = LlmExecutor::for_model("gpt-4o-mini").with_throttle(controller);

From 9a7cb06155285844af60a845f6a8ade04106579d Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Thu, 23 Apr 2026 09:17:07 +0800
Subject: [PATCH 16/28] refactor: update DocumentTree import path in test modules

Change import from crate::document::DocumentTree to crate::tree::DocumentTree
across multiple test modules to align with updated module structure.

BREAKING CHANGE: This change updates the internal module structure and
import paths for DocumentTree.
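For downstream test code the change is mechanical: only the `use` path moves, call sites are unchanged. A minimal sketch — the constructor calls are the ones the updated tests below already use; the test itself is hypothetical:

```rust
// Old path, removed in this patch:
// use crate::document::DocumentTree;

// New path, matching the flattened module layout:
use crate::tree::DocumentTree;

#[test]
fn tree_import_resolves() {
    // Smoke test only — nothing about the tree API itself changed.
    let mut tree = DocumentTree::new("Root", "root content");
    let _child = tree.add_child(tree.root(), "Section 1", "s1 content");
}
```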
--- vectorless-core/vectorless-document/src/navigation.rs | 2 +- vectorless-core/vectorless-document/src/reasoning.rs | 8 ++++---- .../vectorless-document/src/serde_helpers.rs | 2 +- vectorless-core/vectorless-document/src/tree.rs | 2 +- vectorless-core/vectorless-engine/Cargo.toml | 3 +++ vectorless-core/vectorless-engine/src/builder.rs | 11 +++++++---- 6 files changed, 17 insertions(+), 11 deletions(-) diff --git a/vectorless-core/vectorless-document/src/navigation.rs b/vectorless-core/vectorless-document/src/navigation.rs index dbfeadd4..348fef43 100644 --- a/vectorless-core/vectorless-document/src/navigation.rs +++ b/vectorless-core/vectorless-document/src/navigation.rs @@ -234,7 +234,7 @@ pub struct SectionCard { #[cfg(test)] mod tests { use super::*; - use crate::document::DocumentTree; + use crate::tree::DocumentTree; fn build_small_tree() -> DocumentTree { // Root -> [Child1 (leaf), Child2 -> [Grandchild (leaf)]] diff --git a/vectorless-core/vectorless-document/src/reasoning.rs b/vectorless-core/vectorless-document/src/reasoning.rs index 2c4ab01b..533244de 100644 --- a/vectorless-core/vectorless-document/src/reasoning.rs +++ b/vectorless-core/vectorless-document/src/reasoning.rs @@ -339,7 +339,7 @@ mod tests { #[test] fn test_builder_basic() { // Create a simple tree to get valid NodeIds - let mut tree = crate::document::DocumentTree::new("Root", "root content"); + let mut tree = crate::tree::DocumentTree::new("Root", "root content"); let child1 = tree.add_child(tree.root(), "Introduction", "intro content"); let child2 = tree.add_child(tree.root(), "Methods", "methods content"); @@ -370,7 +370,7 @@ mod tests { #[test] fn test_serialization_roundtrip_empty() { - let mut tree = crate::document::DocumentTree::new("Root", "content"); + let mut tree = crate::tree::DocumentTree::new("Root", "content"); let child = tree.add_child(tree.root(), "Section 1", "s1 content"); let mut builder = ReasoningIndexBuilder::new(); @@ -395,7 +395,7 @@ mod tests { #[test] fn test_serialization_roundtrip_with_hot_nodes() { - let mut tree = crate::document::DocumentTree::new("Root", ""); + let mut tree = crate::tree::DocumentTree::new("Root", ""); let root = tree.root(); let c1 = tree.add_child(root, "S1", "content 1"); let c2 = tree.add_child(root, "S2", "content 2"); @@ -426,7 +426,7 @@ mod tests { #[test] fn test_backward_compat_hot_nodes_empty_object() { // Simulate old JSON where hot_nodes was serialized as {} by derive. - let mut tree = crate::document::DocumentTree::new("Root", ""); + let mut tree = crate::tree::DocumentTree::new("Root", ""); let child = tree.add_child(tree.root(), "S1", "c"); let mut builder = ReasoningIndexBuilder::new(); diff --git a/vectorless-core/vectorless-document/src/serde_helpers.rs b/vectorless-core/vectorless-document/src/serde_helpers.rs index cb658c35..00495da7 100644 --- a/vectorless-core/vectorless-document/src/serde_helpers.rs +++ b/vectorless-core/vectorless-document/src/serde_helpers.rs @@ -93,7 +93,7 @@ where #[cfg(test)] mod tests { use super::*; - use crate::document::DocumentTree; + use crate::tree::DocumentTree; /// Wrapper struct to test `#[serde(with)]` through serde_json round-trip. 
#[derive(Serialize, Deserialize, Debug)] diff --git a/vectorless-core/vectorless-document/src/tree.rs b/vectorless-core/vectorless-document/src/tree.rs index 1659471b..4080c00f 100644 --- a/vectorless-core/vectorless-document/src/tree.rs +++ b/vectorless-core/vectorless-document/src/tree.rs @@ -825,7 +825,7 @@ impl Default for DocumentTree { #[cfg(test)] mod tests { use super::*; - use crate::document::reference::{NodeReference, RefType}; + use crate::reference::{NodeReference, RefType}; #[test] fn test_children_with_refs_no_references() { diff --git a/vectorless-core/vectorless-engine/Cargo.toml b/vectorless-core/vectorless-engine/Cargo.toml index b5d91b07..acf7c7c6 100644 --- a/vectorless-core/vectorless-engine/Cargo.toml +++ b/vectorless-core/vectorless-engine/Cargo.toml @@ -10,15 +10,18 @@ homepage.workspace = true [dependencies] vectorless-agent = { path = "../vectorless-agent" } +vectorless-config = { path = "../vectorless-config" } vectorless-document = { path = "../vectorless-document" } vectorless-error = { path = "../vectorless-error" } vectorless-events = { path = "../vectorless-events" } +vectorless-graph = { path = "../vectorless-graph" } vectorless-index = { path = "../vectorless-index" } vectorless-llm = { path = "../vectorless-llm" } vectorless-metrics = { path = "../vectorless-metrics" } vectorless-retrieval = { path = "../vectorless-retrieval" } vectorless-rerank = { path = "../vectorless-rerank" } vectorless-storage = { path = "../vectorless-storage" } +vectorless-utils = { path = "../vectorless-utils" } tokio = { workspace = true } tracing = { workspace = true } serde = { workspace = true } diff --git a/vectorless-core/vectorless-engine/src/builder.rs b/vectorless-core/vectorless-engine/src/builder.rs index d32550f5..65e65bb5 100644 --- a/vectorless-core/vectorless-engine/src/builder.rs +++ b/vectorless-core/vectorless-engine/src/builder.rs @@ -6,10 +6,13 @@ //! This module provides [`EngineBuilder`] for configuring and building //! [`Engine`] instances with sensible defaults. -use crate::{ - client::engine::Engine, client::retriever::RetrieverClient, config::Config, - events::EventEmitter, metrics::MetricsHub, storage::Workspace, -}; +use vectorless_config::Config; +use vectorless_events::EventEmitter; +use vectorless_metrics::MetricsHub; +use vectorless_storage::Workspace; + +use super::engine::Engine; +use super::retriever::RetrieverClient; /// Builder for creating a [`Engine`] client. 
/// From db2396d3319ab306f64e9834c920b20909758cae Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Thu, 23 Apr 2026 09:18:32 +0800 Subject: [PATCH 17/28] refactor(builder): update indexer client import path --- .../vectorless-engine/src/builder.rs | 2 +- .../vectorless-engine/src/engine.rs | 24 +++++++++---------- vectorless-core/vectorless-utils/Cargo.toml | 3 +++ 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/vectorless-core/vectorless-engine/src/builder.rs b/vectorless-core/vectorless-engine/src/builder.rs index 65e65bb5..5fd2d6b9 100644 --- a/vectorless-core/vectorless-engine/src/builder.rs +++ b/vectorless-core/vectorless-engine/src/builder.rs @@ -196,7 +196,7 @@ impl EngineBuilder { let pool = vectorless_llm::LlmPool::from_config(&config.llm, Some(metrics_hub.clone())); // Indexer uses pool.index() - let indexer = crate::client::indexer::IndexerClient::with_llm(pool.index().clone()); + let indexer = super::indexer::IndexerClient::with_llm(pool.index().clone()); // Retriever uses pool.retrieval() via agent system let retriever = RetrieverClient::new(pool.retrieval().clone()); diff --git a/vectorless-core/vectorless-engine/src/engine.rs b/vectorless-core/vectorless-engine/src/engine.rs index 0507c82e..cadd925c 100644 --- a/vectorless-core/vectorless-engine/src/engine.rs +++ b/vectorless-core/vectorless-engine/src/engine.rs @@ -49,19 +49,19 @@ use std::{collections::HashMap, sync::Arc}; use futures::StreamExt; use tracing::{info, warn}; -use crate::{ - Answer, Document as UnderstandingDocument, DocumentTree, Error, Evidence, IngestInput, +use vectorless_config::Config; +use vectorless_document::{ + Answer, Document as UnderstandingDocument, DocumentTree, Evidence, IngestInput, ReasoningTrace, - config::Config, - error::Result, - events::EventEmitter, - index::{ - PipelineOptions, - incremental::{self, IndexAction}, - }, - metrics::MetricsHub, - storage::{PersistedDocument, Workspace}, }; +use vectorless_error::{Error, Result}; +use vectorless_events::EventEmitter; +use vectorless_index::{ + PipelineOptions, + incremental::{self, IndexAction}, +}; +use vectorless_metrics::MetricsHub; +use vectorless_storage::{PersistedDocument, Workspace}; use super::{ index_context::{IndexContext, IndexSource}, @@ -574,7 +574,7 @@ impl Engine { doc_id: persisted.meta.id, name: persisted.meta.name, format: persisted.meta.format, - source_path: persisted.meta.source_path.map(|p| p.to_string_lossy().to_string()), + source_path: persisted.meta.source_path.as_ref().map(|p: &std::path::PathBuf| p.to_string_lossy().to_string()), tree, nav_index, reasoning_index, diff --git a/vectorless-core/vectorless-utils/Cargo.toml b/vectorless-core/vectorless-utils/Cargo.toml index 4ef59c91..d883a53a 100644 --- a/vectorless-core/vectorless-utils/Cargo.toml +++ b/vectorless-core/vectorless-utils/Cargo.toml @@ -18,5 +18,8 @@ tiktoken-rs = { workspace = true } base64 = { workspace = true } thiserror = { workspace = true } +[dev-dependencies] +serde_json = { workspace = true } + [lints] workspace = true From 060345c492fc8879db6bc0c87e008ce4f3329a35 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Thu, 23 Apr 2026 09:20:16 +0800 Subject: [PATCH 18/28] refactor(engine): remove explicit type annotation in source_path mapping --- vectorless-core/vectorless-engine/src/engine.rs | 2 +- vectorless-core/vectorless-storage/Cargo.toml | 3 +++ vectorless-core/vectorless-storage/src/cache.rs | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git 
a/vectorless-core/vectorless-engine/src/engine.rs b/vectorless-core/vectorless-engine/src/engine.rs index cadd925c..3fcff5b9 100644 --- a/vectorless-core/vectorless-engine/src/engine.rs +++ b/vectorless-core/vectorless-engine/src/engine.rs @@ -574,7 +574,7 @@ impl Engine { doc_id: persisted.meta.id, name: persisted.meta.name, format: persisted.meta.format, - source_path: persisted.meta.source_path.as_ref().map(|p: &std::path::PathBuf| p.to_string_lossy().to_string()), + source_path: persisted.meta.source_path.as_ref().map(|p| p.to_string_lossy().to_string()), tree, nav_index, reasoning_index, diff --git a/vectorless-core/vectorless-storage/Cargo.toml b/vectorless-core/vectorless-storage/Cargo.toml index e15a0c50..87ce252e 100644 --- a/vectorless-core/vectorless-storage/Cargo.toml +++ b/vectorless-core/vectorless-storage/Cargo.toml @@ -28,6 +28,9 @@ parking_lot = { workspace = true } regex = { workspace = true } thiserror = { workspace = true } +[dev-dependencies] +tempfile = { workspace = true } + [target.'cfg(unix)'.dependencies] libc = { workspace = true } diff --git a/vectorless-core/vectorless-storage/src/cache.rs b/vectorless-core/vectorless-storage/src/cache.rs index 57ca2b24..39a66080 100644 --- a/vectorless-core/vectorless-storage/src/cache.rs +++ b/vectorless-core/vectorless-storage/src/cache.rs @@ -268,7 +268,7 @@ pub struct CacheStats { mod tests { use super::*; use vectorless_document::DocumentTree; - use crate::storage::{DocumentMeta, PersistedDocument}; + use crate::{DocumentMeta, PersistedDocument}; fn create_test_doc(id: &str) -> PersistedDocument { let meta = DocumentMeta::new(id, "Test Doc", "md"); From c94953a195077743a91bdb86f05e184f24f827ec Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Thu, 23 Apr 2026 09:21:33 +0800 Subject: [PATCH 19/28] refactor(engine): update DocumentTree type reference in indexer Change the type annotation from crate::DocumentTree to vectorless_document::DocumentTree for consistency with module structure. feat(retriever): import additional types and update module paths Import DocContext, Scope, and WorkspaceContext from vectorless_agent config module and update QueryResult import path from crate::client to super::types. refactor(retriever): remove redundant module prefix in type usage Replace agent::DocContext with DocContext and update agent::Scope and agent::WorkspaceContext to their respective unqualified imports. chore(retrieval): add indextree as dev dependency Add indextree to dev-dependencies section of Cargo.toml for workspace configuration. 
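A sketch of the scope selection as it reads after this change (it mirrors the logic in retriever.rs below; the lifetime parameter is an assumption, inferred from the borrowed tree and index fields `DocContext` carries in these patches):

```rust
use vectorless_agent::config::{DocContext, Scope, WorkspaceContext};

/// When the caller pins specific documents, analysis is skipped and the
/// scope is exactly those documents; otherwise the whole workspace is
/// handed over for document selection.
fn choose_scope<'a>(doc_contexts: Vec<DocContext<'a>>, skip_analysis: bool) -> Scope<'a> {
    if skip_analysis {
        Scope::Specified(doc_contexts)
    } else {
        Scope::Workspace(WorkspaceContext::new(doc_contexts))
    }
}
```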
--- vectorless-core/vectorless-engine/src/indexer.rs | 2 +- vectorless-core/vectorless-engine/src/retriever.rs | 12 ++++++------ .../vectorless-engine/src/test_support.rs | 6 +++--- vectorless-core/vectorless-retrieval/Cargo.toml | 3 +++ 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/vectorless-core/vectorless-engine/src/indexer.rs b/vectorless-core/vectorless-engine/src/indexer.rs index 559ccef7..20490f6d 100644 --- a/vectorless-core/vectorless-engine/src/indexer.rs +++ b/vectorless-core/vectorless-engine/src/indexer.rs @@ -95,7 +95,7 @@ impl IndexerClient { source: &IndexSource, name: Option<&str>, mut pipeline_options: PipelineOptions, - existing_tree: Option<&crate::DocumentTree>, + existing_tree: Option<&vectorless_document::DocumentTree>, ) -> Result { pipeline_options.existing_tree = existing_tree.cloned(); match source { diff --git a/vectorless-core/vectorless-engine/src/retriever.rs b/vectorless-core/vectorless-engine/src/retriever.rs index f1b12a89..59d2746d 100644 --- a/vectorless-core/vectorless-engine/src/retriever.rs +++ b/vectorless-core/vectorless-engine/src/retriever.rs @@ -8,8 +8,8 @@ use tracing::info; -use vectorless_agent::{self, config::AgentConfig, events::EventEmitter as AgentEventEmitter}; -use crate::client::types::QueryResult; +use vectorless_agent::{self, config::AgentConfig, config::DocContext, config::Scope, config::WorkspaceContext, events::EventEmitter as AgentEventEmitter}; +use super::types::QueryResult; use vectorless_document::{DocumentTree, NavigationIndex, ReasoningIndex}; use vectorless_error::Result; use vectorless_events::{EventEmitter, QueryEvent}; @@ -82,9 +82,9 @@ impl RetrieverClient { skip_analysis, "Querying: {:?}", question ); - let doc_contexts: Vec = documents + let doc_contexts: Vec = documents .iter() - .map(|(tree, nav, ridx, id)| agent::DocContext { + .map(|(tree, nav, ridx, id)| DocContext { tree, nav_index: nav, reasoning_index: ridx, @@ -93,9 +93,9 @@ impl RetrieverClient { .collect(); let scope = if skip_analysis { - agent::Scope::Specified(doc_contexts) + Scope::Specified(doc_contexts) } else { - agent::Scope::Workspace(agent::WorkspaceContext::new(doc_contexts)) + Scope::Workspace(WorkspaceContext::new(doc_contexts)) }; let emitter = AgentEventEmitter::noop(); diff --git a/vectorless-core/vectorless-engine/src/test_support.rs b/vectorless-core/vectorless-engine/src/test_support.rs index 7747ddbe..76b174e4 100644 --- a/vectorless-core/vectorless-engine/src/test_support.rs +++ b/vectorless-core/vectorless-engine/src/test_support.rs @@ -8,9 +8,9 @@ use std::sync::Arc; -use crate::client::engine::Engine; -use crate::client::indexer::IndexerClient; -use crate::client::retriever::RetrieverClient; +use crate::engine::Engine; +use crate::indexer::IndexerClient; +use crate::retriever::RetrieverClient; use vectorless_config::Config; use vectorless_events::EventEmitter; use vectorless_index::PipelineExecutor; diff --git a/vectorless-core/vectorless-retrieval/Cargo.toml b/vectorless-core/vectorless-retrieval/Cargo.toml index 69a66bfd..b364f762 100644 --- a/vectorless-core/vectorless-retrieval/Cargo.toml +++ b/vectorless-core/vectorless-retrieval/Cargo.toml @@ -23,5 +23,8 @@ tracing = { workspace = true } futures = { workspace = true } parking_lot = { workspace = true } +[dev-dependencies] +indextree = { workspace = true } + [lints] workspace = true From 5d0f7d112a6e5c576882012de0815c98dcb0b988 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Thu, 23 Apr 2026 09:22:59 +0800 Subject: [PATCH 20/28] refactor: 
reorganize imports and reorder statements across modules - Move some import statements to improve code readability and maintain consistent ordering - Reorder some field declarations and function calls to follow standard Rust formatting conventions - Remove unused pub(crate) mod test_support from vectorless-engine - Remove unused test_support.rs file as it's no longer needed - Adjust some long lines to fit within 100 character limit - Move DocumentGraphConfig export to proper location in types module - Reorder some struct field initializations for better readability --- .../vectorless-agent/src/orchestrator/mod.rs | 3 +- vectorless-core/vectorless-config/src/lib.rs | 5 +- .../vectorless-config/src/types/mod.rs | 8 ++- .../vectorless-engine/src/engine.rs | 45 +++++++++------ vectorless-core/vectorless-engine/src/lib.rs | 1 - .../vectorless-engine/src/retriever.rs | 5 +- .../vectorless-engine/src/test_support.rs | 54 ------------------ .../vectorless-index/src/incremental/mod.rs | 2 +- .../src/incremental/resolver.rs | 2 +- .../src/incremental/updater.rs | 2 +- vectorless-core/vectorless-index/src/lib.rs | 2 +- .../src/parse/markdown/parser.rs | 2 +- .../vectorless-index/src/parse/mod.rs | 7 ++- .../vectorless-index/src/parse/pdf/parser.rs | 2 +- .../vectorless-index/src/parse/pdf/types.rs | 2 +- .../src/parse/toc/assigner.rs | 2 +- .../src/parse/toc/processor.rs | 2 +- .../src/parse/toc/repairer.rs | 2 +- .../src/parse/toc/structure_extractor.rs | 2 +- .../src/parse/toc/verifier.rs | 2 +- .../src/pipeline/checkpoint.rs | 2 +- .../vectorless-index/src/pipeline/context.rs | 2 +- .../vectorless-index/src/stages/build.rs | 2 +- .../vectorless-index/src/stages/concept.rs | 5 +- .../vectorless-index/src/stages/enhance.rs | 2 +- .../vectorless-index/src/stages/mod.rs | 2 +- .../vectorless-index/src/stages/optimize.rs | 4 +- .../vectorless-index/src/stages/parse.rs | 8 +-- .../vectorless-index/src/stages/validate.rs | 5 +- .../src/stages/verify_ingest.rs | 15 +++-- .../vectorless-llm/src/memo/store.rs | 16 +++--- vectorless-core/vectorless-py/src/engine.rs | 6 +- vectorless-core/vectorless-rerank/src/lib.rs | 2 +- .../vectorless-retrieval/src/lib.rs | 5 +- .../vectorless-retrieval/src/postprocessor.rs | 2 +- .../vectorless-storage/src/backend/memory.rs | 56 ++++++++----------- .../vectorless-storage/src/cache.rs | 2 +- .../vectorless-storage/src/persistence.rs | 7 ++- 38 files changed, 134 insertions(+), 161 deletions(-) delete mode 100644 vectorless-core/vectorless-engine/src/test_support.rs diff --git a/vectorless-core/vectorless-agent/src/orchestrator/mod.rs b/vectorless-core/vectorless-agent/src/orchestrator/mod.rs index 17a3009f..e17d1f39 100644 --- a/vectorless-core/vectorless-agent/src/orchestrator/mod.rs +++ b/vectorless-core/vectorless-agent/src/orchestrator/mod.rs @@ -195,7 +195,8 @@ pub async fn finalize_output( confidence: f32, ) -> vectorless_error::Result { let rerank_result = - vectorless_rerank::process(query, &state.all_evidence, multi_doc, intent, confidence).await?; + vectorless_rerank::process(query, &state.all_evidence, multi_doc, intent, confidence) + .await?; let total_llm_calls = orch_llm_calls + rerank_result.llm_calls; if !rerank_result.answer.is_empty() { diff --git a/vectorless-core/vectorless-config/src/lib.rs b/vectorless-core/vectorless-config/src/lib.rs index feaa5fe5..30217490 100644 --- a/vectorless-core/vectorless-config/src/lib.rs +++ b/vectorless-core/vectorless-config/src/lib.rs @@ -11,11 +11,10 @@ mod validator; pub use types::Config; pub use 
types::DocumentGraphConfig; -pub use types::MetricsConfig; pub use types::LlmMetricsConfig; +pub use types::MetricsConfig; pub use types::RetrievalMetricsConfig; pub use types::{ CompressionAlgorithm, FallbackBehavior, FallbackConfig, IndexerConfig, LlmConfig, - OnAllFailedBehavior, RetrievalConfig, RetryConfig, SlotConfig, StorageConfig, - ThrottleConfig, + OnAllFailedBehavior, RetrievalConfig, RetryConfig, SlotConfig, StorageConfig, ThrottleConfig, }; diff --git a/vectorless-core/vectorless-config/src/types/mod.rs b/vectorless-core/vectorless-config/src/types/mod.rs index 377842c6..47b2e2d1 100644 --- a/vectorless-core/vectorless-config/src/types/mod.rs +++ b/vectorless-core/vectorless-config/src/types/mod.rs @@ -12,6 +12,7 @@ mod storage; use serde::{Deserialize, Serialize}; +pub use graph::DocumentGraphConfig; pub use indexer::IndexerConfig; pub use llm_pool::{ FallbackBehavior, FallbackConfig, LlmConfig, OnAllFailedBehavior, RetryConfig, SlotConfig, @@ -20,7 +21,6 @@ pub use llm_pool::{ pub use metrics::{LlmMetricsConfig, MetricsConfig, RetrievalMetricsConfig}; pub use retrieval::RetrievalConfig; pub use storage::{CompressionAlgorithm, StorageConfig}; -pub use graph::DocumentGraphConfig; /// Main configuration for vectorless. /// @@ -216,7 +216,11 @@ pub struct ConfigValidationError { impl std::fmt::Display for ConfigValidationError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "Configuration validation failed with {} error(s)", self.errors.len()) + write!( + f, + "Configuration validation failed with {} error(s)", + self.errors.len() + ) } } diff --git a/vectorless-core/vectorless-engine/src/engine.rs b/vectorless-core/vectorless-engine/src/engine.rs index 3fcff5b9..f2c61173 100644 --- a/vectorless-core/vectorless-engine/src/engine.rs +++ b/vectorless-core/vectorless-engine/src/engine.rs @@ -51,8 +51,7 @@ use tracing::{info, warn}; use vectorless_config::Config; use vectorless_document::{ - Answer, Document as UnderstandingDocument, DocumentTree, Evidence, IngestInput, - ReasoningTrace, + Answer, Document as UnderstandingDocument, DocumentTree, Evidence, IngestInput, ReasoningTrace, }; use vectorless_error::{Error, Result}; use vectorless_events::EventEmitter; @@ -422,10 +421,13 @@ impl Engine { pub async fn ingest(&self, input: IngestInput) -> Result { let ctx = match &input { IngestInput::Path(path) => IndexContext::from_path(path), - IngestInput::Bytes { data, format, .. } => IndexContext::from_bytes(data.clone(), *format), - IngestInput::Text { content, .. } => { - IndexContext::from_content(content, vectorless_index::parse::DocumentFormat::Markdown) + IngestInput::Bytes { data, format, .. } => { + IndexContext::from_bytes(data.clone(), *format) } + IngestInput::Text { content, .. 
} => IndexContext::from_content( + content, + vectorless_index::parse::DocumentFormat::Markdown, + ), }; let result = self.ingest_pipeline(ctx).await?; @@ -475,23 +477,24 @@ impl Engine { } // Build DocContexts from Documents and dispatch - let doc_contexts: Vec = documents - .iter() - .map(|doc| doc.as_context()) - .collect(); + let doc_contexts: Vec = + documents.iter().map(|doc| doc.as_context()).collect(); let skip_analysis = !ids.is_empty(); let scope = if skip_analysis { vectorless_agent::Scope::Specified(doc_contexts) } else { - vectorless_agent::Scope::Workspace(vectorless_agent::WorkspaceContext::new(doc_contexts)) + vectorless_agent::Scope::Workspace(vectorless_agent::WorkspaceContext::new( + doc_contexts, + )) }; let emitter = vectorless_agent::EventEmitter::noop(); let config = self.retriever.config().clone(); let llm = self.retriever.llm().clone(); let output = - vectorless_retrieval::dispatcher::dispatch(input, scope, &config, &llm, &emitter).await?; + vectorless_retrieval::dispatcher::dispatch(input, scope, &config, &llm, &emitter) + .await?; // Convert Output -> Answer Ok(Self::output_to_answer(&output)) @@ -574,7 +577,11 @@ impl Engine { doc_id: persisted.meta.id, name: persisted.meta.name, format: persisted.meta.format, - source_path: persisted.meta.source_path.as_ref().map(|p| p.to_string_lossy().to_string()), + source_path: persisted + .meta + .source_path + .as_ref() + .map(|p| p.to_string_lossy().to_string()), tree, nav_index, reasoning_index, @@ -749,8 +756,9 @@ impl Engine { None => return Ok(IndexAction::FullIndex { existing_id: None }), }; - let format = vectorless_index::parse::DocumentFormat::from_extension(&stored_doc.meta.format) - .unwrap_or(vectorless_index::parse::DocumentFormat::Markdown); + let format = + vectorless_index::parse::DocumentFormat::from_extension(&stored_doc.meta.format) + .unwrap_or(vectorless_index::parse::DocumentFormat::Markdown); let pipeline_options = self.build_pipeline_options(options, source); // If logic fingerprint changed, remove old doc before full reprocess @@ -873,7 +881,7 @@ impl std::fmt::Debug for Engine { #[cfg(test)] mod tests { use super::*; - use crate::client::types::IndexMode; + use crate::types::IndexMode; // -- resolve_index_action Default mode ---------------------------------------------- @@ -890,7 +898,7 @@ mod tests { // -- build_index_item ---------------------------------------------------------------- // Build_index_item only transforms data -- no I/O. 
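Taken together, the `IngestInput` dispatch and the `ask` path reworked in this file form the engine's public round trip. A minimal sketch follows, assuming `ingest` returns the `DocumentInfo` carrying `doc_id` and that `ask` takes a question plus a slice of document ids, as the Python binding later in this series suggests:

```rust
use vectorless_engine::{Engine, IngestInput};

// Hedged sketch: construction of `Engine` and the exact signatures are
// assumed from context, not taken verbatim from this patch.
async fn demo(engine: &Engine) -> vectorless_engine::Result<()> {
    // Path inputs are format-detected by extension; inline text defaults
    // to Markdown, per the match above.
    let info = engine
        .ingest(IngestInput::Path("docs/guide.md".into()))
        .await?;
    // Passing ids selects Scope::Specified; an empty list falls back to
    // workspace-wide analysis, per the `skip_analysis` branch in `ask`.
    let ids = vec![info.doc_id.clone()];
    let answer = engine.ask("What does the guide cover?", &ids).await?;
    println!("{} (confidence {:.2})", answer.content, answer.confidence);
    Ok(())
}
```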
- use crate::client::indexed_document::IndexedDocument; + use crate::indexed_document::IndexedDocument; fn make_doc() -> IndexedDocument { IndexedDocument::new("test-id", vectorless_index::parse::DocumentFormat::Markdown) @@ -906,7 +914,10 @@ mod tests { assert_eq!(item.doc_id, "test-id"); assert_eq!(item.name, "test.md"); - assert_eq!(item.format, vectorless_index::parse::DocumentFormat::Markdown); + assert_eq!( + item.format, + vectorless_index::parse::DocumentFormat::Markdown + ); assert_eq!(item.description, Some("test doc".to_string())); assert_eq!(item.source_path, Some("/tmp/test.md".to_string())); assert!(item.metrics.is_none()); diff --git a/vectorless-core/vectorless-engine/src/lib.rs b/vectorless-core/vectorless-engine/src/lib.rs index 9976c037..3e3b81bd 100644 --- a/vectorless-core/vectorless-engine/src/lib.rs +++ b/vectorless-core/vectorless-engine/src/lib.rs @@ -72,7 +72,6 @@ mod indexed_document; mod indexer; mod query_context; mod retriever; -pub(crate) mod test_support; mod types; mod workspace; diff --git a/vectorless-core/vectorless-engine/src/retriever.rs b/vectorless-core/vectorless-engine/src/retriever.rs index 59d2746d..217e182a 100644 --- a/vectorless-core/vectorless-engine/src/retriever.rs +++ b/vectorless-core/vectorless-engine/src/retriever.rs @@ -8,8 +8,11 @@ use tracing::info; -use vectorless_agent::{self, config::AgentConfig, config::DocContext, config::Scope, config::WorkspaceContext, events::EventEmitter as AgentEventEmitter}; use super::types::QueryResult; +use vectorless_agent::{ + self, config::AgentConfig, config::DocContext, config::Scope, config::WorkspaceContext, + events::EventEmitter as AgentEventEmitter, +}; use vectorless_document::{DocumentTree, NavigationIndex, ReasoningIndex}; use vectorless_error::Result; use vectorless_events::{EventEmitter, QueryEvent}; diff --git a/vectorless-core/vectorless-engine/src/test_support.rs b/vectorless-core/vectorless-engine/src/test_support.rs deleted file mode 100644 index 76b174e4..00000000 --- a/vectorless-core/vectorless-engine/src/test_support.rs +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Test-only helpers for constructing Engine instances without a real LLM. -//! -//! This module is exposed via `vectorless::__test_support` and should **only** -//! be used in integration tests. - -use std::sync::Arc; - -use crate::engine::Engine; -use crate::indexer::IndexerClient; -use crate::retriever::RetrieverClient; -use vectorless_config::Config; -use vectorless_events::EventEmitter; -use vectorless_index::PipelineExecutor; -use vectorless_llm::LlmClient; -use vectorless_llm::config::LlmConfig; -use vectorless_metrics::MetricsHub; -use vectorless_storage::Workspace; - -/// Build an `Engine` with a no-LLM pipeline for integration testing. -/// -/// The pipeline skips enhance/summary stages but exercises: -/// parse → build → validate → split → enrich → optimize. 
-/// -/// # Example -/// -/// ```rust,ignore -/// let tmp = tempfile::tempdir().unwrap(); -/// let engine = vectorless::__test_support::build_test_engine(tmp.path()).await; -/// ``` -pub async fn build_test_engine(workspace_dir: &std::path::Path) -> Engine { - let config = Config::default(); - - // No-LLM indexer: pipeline without enhance stage - let executor_factory: Arc PipelineExecutor + Send + Sync> = - Arc::new(|| PipelineExecutor::new()); - let indexer = IndexerClient::with_factory(executor_factory); - - let workspace = Workspace::new(workspace_dir).await.unwrap(); - let retriever = RetrieverClient::new(LlmClient::new(LlmConfig::default())); - - Engine::with_components( - config, - workspace, - retriever, - indexer, - EventEmitter::new(), - Arc::new(MetricsHub::with_defaults()), - ) - .await - .unwrap() -} diff --git a/vectorless-core/vectorless-index/src/incremental/mod.rs b/vectorless-core/vectorless-index/src/incremental/mod.rs index 71f28fbe..4a5efddb 100644 --- a/vectorless-core/vectorless-index/src/incremental/mod.rs +++ b/vectorless-core/vectorless-index/src/incremental/mod.rs @@ -18,10 +18,10 @@ mod detector; mod resolver; mod updater; -use vectorless_document::DocumentTree; pub use detector::ChangeDetector; pub use resolver::{IndexAction, SkipInfo, resolve_action}; use std::collections::HashMap; +use vectorless_document::DocumentTree; /// Reuse summaries from old tree for unchanged nodes in the new tree. /// diff --git a/vectorless-core/vectorless-index/src/incremental/resolver.rs b/vectorless-core/vectorless-index/src/incremental/resolver.rs index 674c5f2c..d6022f31 100644 --- a/vectorless-core/vectorless-index/src/incremental/resolver.rs +++ b/vectorless-core/vectorless-index/src/incremental/resolver.rs @@ -10,9 +10,9 @@ use tracing::info; +use crate::config::PipelineOptions; use vectorless_document::DocumentFormat; use vectorless_document::DocumentTree; -use crate::config::PipelineOptions; use vectorless_storage::PersistedDocument; use vectorless_utils::fingerprint::Fingerprint; diff --git a/vectorless-core/vectorless-index/src/incremental/updater.rs b/vectorless-core/vectorless-index/src/incremental/updater.rs index 8914c394..70525d9b 100644 --- a/vectorless-core/vectorless-index/src/incremental/updater.rs +++ b/vectorless-core/vectorless-index/src/incremental/updater.rs @@ -5,9 +5,9 @@ use tracing::info; +use crate::parse::RawNode; use vectorless_document::{DocumentTree, NodeId}; use vectorless_error::Result; -use crate::parse::RawNode; use super::detector::ChangeDetector; diff --git a/vectorless-core/vectorless-index/src/lib.rs b/vectorless-core/vectorless-index/src/lib.rs index 8ea71cab..048158ca 100644 --- a/vectorless-core/vectorless-index/src/lib.rs +++ b/vectorless-core/vectorless-index/src/lib.rs @@ -66,8 +66,8 @@ pub mod summary; pub use pipeline::{IndexInput, IndexMetrics, PipelineExecutor, PipelineResult}; // Re-export config types -pub use vectorless_document::ReasoningIndexConfig; pub use config::{IndexMode, PipelineOptions, ThinningConfig}; +pub use vectorless_document::ReasoningIndexConfig; // Re-export summary pub use summary::SummaryStrategy; diff --git a/vectorless-core/vectorless-index/src/parse/markdown/parser.rs b/vectorless-core/vectorless-index/src/parse/markdown/parser.rs index faffe369..b8980f74 100644 --- a/vectorless-core/vectorless-index/src/parse/markdown/parser.rs +++ b/vectorless-core/vectorless-index/src/parse/markdown/parser.rs @@ -6,8 +6,8 @@ use pulldown_cmark::Options; use std::path::Path; -use vectorless_error::Result; use 
crate::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode}; +use vectorless_error::Result; use vectorless_utils::estimate_tokens; use super::config::MarkdownConfig; diff --git a/vectorless-core/vectorless-index/src/parse/mod.rs b/vectorless-core/vectorless-index/src/parse/mod.rs index 34d39f5c..d9bde2bf 100644 --- a/vectorless-core/vectorless-index/src/parse/mod.rs +++ b/vectorless-core/vectorless-index/src/parse/mod.rs @@ -16,8 +16,8 @@ pub use types::{DocumentFormat, DocumentMeta, ParseResult, RawNode}; use std::path::Path; -use vectorless_error::Result; use crate::parse::markdown::MarkdownParser; +use vectorless_error::Result; use vectorless_llm::LlmClient; /// Parse a string content document. @@ -66,8 +66,9 @@ pub async fn parse_bytes( ) -> Result { match format { DocumentFormat::Markdown => { - let content = std::str::from_utf8(bytes) - .map_err(|e| vectorless_error::Error::Parse(format!("Invalid UTF-8 content: {}", e)))?; + let content = std::str::from_utf8(bytes).map_err(|e| { + vectorless_error::Error::Parse(format!("Invalid UTF-8 content: {}", e)) + })?; let parser = MarkdownParser::new(); parser.parse(content).await } diff --git a/vectorless-core/vectorless-index/src/parse/pdf/parser.rs b/vectorless-core/vectorless-index/src/parse/pdf/parser.rs index 72b9fc6f..61e05787 100644 --- a/vectorless-core/vectorless-index/src/parse/pdf/parser.rs +++ b/vectorless-core/vectorless-index/src/parse/pdf/parser.rs @@ -12,9 +12,9 @@ use std::path::Path; use lopdf::Document as LopdfDocument; use tracing::{info, warn}; +use crate::parse::toc::TocProcessor; use vectorless_error::Error; use vectorless_error::Result; -use crate::parse::toc::TocProcessor; use vectorless_llm::LlmClient; use super::types::{PdfMetadata, PdfPage, PdfParseResult}; diff --git a/vectorless-core/vectorless-index/src/parse/pdf/types.rs b/vectorless-core/vectorless-index/src/parse/pdf/types.rs index 6bcfd6bf..c666d011 100644 --- a/vectorless-core/vectorless-index/src/parse/pdf/types.rs +++ b/vectorless-core/vectorless-index/src/parse/pdf/types.rs @@ -3,8 +3,8 @@ //! PDF document types. -use vectorless_utils::estimate_tokens; use serde::{Deserialize, Serialize}; +use vectorless_utils::estimate_tokens; /// A single page from a PDF document. 
#[derive(Debug, Clone, Serialize, Deserialize)] diff --git a/vectorless-core/vectorless-index/src/parse/toc/assigner.rs b/vectorless-core/vectorless-index/src/parse/toc/assigner.rs index 172ffec2..7c0404fa 100644 --- a/vectorless-core/vectorless-index/src/parse/toc/assigner.rs +++ b/vectorless-core/vectorless-index/src/parse/toc/assigner.rs @@ -7,8 +7,8 @@ use futures::stream::{self, StreamExt}; use std::collections::HashMap; use tracing::{debug, info}; -use vectorless_error::Result; use crate::parse::pdf::PdfPage; +use vectorless_error::Result; use vectorless_llm::config::LlmConfig; use super::types::{PageOffset, TocEntry}; diff --git a/vectorless-core/vectorless-index/src/parse/toc/processor.rs b/vectorless-core/vectorless-index/src/parse/toc/processor.rs index e4d93fb1..bc8d52af 100644 --- a/vectorless-core/vectorless-index/src/parse/toc/processor.rs +++ b/vectorless-core/vectorless-index/src/parse/toc/processor.rs @@ -10,8 +10,8 @@ use futures::stream::{self, StreamExt}; use tracing::{debug, info, warn}; -use vectorless_error::Result; use crate::parse::pdf::PdfPage; +use vectorless_error::Result; use vectorless_llm::LlmClient; use super::assigner::{PageAssigner, PageAssignerConfig}; diff --git a/vectorless-core/vectorless-index/src/parse/toc/repairer.rs b/vectorless-core/vectorless-index/src/parse/toc/repairer.rs index 5a5a7a92..977f8635 100644 --- a/vectorless-core/vectorless-index/src/parse/toc/repairer.rs +++ b/vectorless-core/vectorless-index/src/parse/toc/repairer.rs @@ -6,8 +6,8 @@ use futures::stream::{self, StreamExt}; use tracing::{debug, info}; -use vectorless_error::Result; use crate::parse::pdf::PdfPage; +use vectorless_error::Result; use vectorless_llm::config::LlmConfig; use super::types::{TocEntry, VerificationError, VerificationReport}; diff --git a/vectorless-core/vectorless-index/src/parse/toc/structure_extractor.rs b/vectorless-core/vectorless-index/src/parse/toc/structure_extractor.rs index 374f01ca..c9f29ddb 100644 --- a/vectorless-core/vectorless-index/src/parse/toc/structure_extractor.rs +++ b/vectorless-core/vectorless-index/src/parse/toc/structure_extractor.rs @@ -10,8 +10,8 @@ use futures::stream::{self, StreamExt}; use tracing::{debug, info, warn}; -use vectorless_error::Result; use crate::parse::pdf::PdfPage; +use vectorless_error::Result; use vectorless_llm::config::LlmConfig; use super::types::TocEntry; diff --git a/vectorless-core/vectorless-index/src/parse/toc/verifier.rs b/vectorless-core/vectorless-index/src/parse/toc/verifier.rs index 11981fd5..460f39be 100644 --- a/vectorless-core/vectorless-index/src/parse/toc/verifier.rs +++ b/vectorless-core/vectorless-index/src/parse/toc/verifier.rs @@ -7,8 +7,8 @@ use futures::stream::{self, StreamExt}; use rand::seq::SliceRandom; use tracing::{debug, info}; -use vectorless_error::Result; use crate::parse::pdf::PdfPage; +use vectorless_error::Result; use vectorless_llm::config::LlmConfig; use super::types::{ErrorType, TocEntry, VerificationError, VerificationReport}; diff --git a/vectorless-core/vectorless-index/src/pipeline/checkpoint.rs b/vectorless-core/vectorless-index/src/pipeline/checkpoint.rs index e2d9133b..ad607214 100644 --- a/vectorless-core/vectorless-index/src/pipeline/checkpoint.rs +++ b/vectorless-core/vectorless-index/src/pipeline/checkpoint.rs @@ -13,8 +13,8 @@ use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use tracing::{info, warn}; -use vectorless_document::DocumentTree; use crate::parse::RawNode; +use vectorless_document::DocumentTree; use super::metrics::IndexMetrics; diff 
--git a/vectorless-core/vectorless-index/src/pipeline/context.rs b/vectorless-core/vectorless-index/src/pipeline/context.rs index eb7058e1..9bc01101 100644 --- a/vectorless-core/vectorless-index/src/pipeline/context.rs +++ b/vectorless-core/vectorless-index/src/pipeline/context.rs @@ -6,8 +6,8 @@ use std::collections::HashMap; use std::path::PathBuf; -use vectorless_document::{Concept, DocumentTree, NavigationIndex, NodeId, ReasoningIndex}; use crate::parse::{DocumentFormat, RawNode}; +use vectorless_document::{Concept, DocumentTree, NavigationIndex, NodeId, ReasoningIndex}; use vectorless_llm::LlmClient; use super::super::{PipelineOptions, SummaryStrategy}; diff --git a/vectorless-core/vectorless-index/src/stages/build.rs b/vectorless-core/vectorless-index/src/stages/build.rs index 98549ee2..29eb687b 100644 --- a/vectorless-core/vectorless-index/src/stages/build.rs +++ b/vectorless-core/vectorless-index/src/stages/build.rs @@ -7,9 +7,9 @@ use super::async_trait; use std::time::Instant; use tracing::{debug, info}; +use crate::parse::RawNode; use vectorless_document::{DocumentTree, NodeId}; use vectorless_error::Result; -use crate::parse::RawNode; use vectorless_utils::estimate_tokens; use super::{IndexStage, StageResult}; diff --git a/vectorless-core/vectorless-index/src/stages/concept.rs b/vectorless-core/vectorless-index/src/stages/concept.rs index 35bcc270..d6a1f52b 100644 --- a/vectorless-core/vectorless-index/src/stages/concept.rs +++ b/vectorless-core/vectorless-index/src/stages/concept.rs @@ -133,7 +133,10 @@ async fn extract_with_llm(ctx: &mut IndexContext, client: &LlmClient) -> Vec { - warn!("[concept_extraction] LLM extraction failed: {}, using fallback", e); + warn!( + "[concept_extraction] LLM extraction failed: {}, using fallback", + e + ); extract_from_topics(ctx) } } diff --git a/vectorless-core/vectorless-index/src/stages/enhance.rs b/vectorless-core/vectorless-index/src/stages/enhance.rs index 2d70e7a9..9613674f 100644 --- a/vectorless-core/vectorless-index/src/stages/enhance.rs +++ b/vectorless-core/vectorless-index/src/stages/enhance.rs @@ -9,9 +9,9 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use tracing::{debug, info, warn}; +use crate::incremental; use vectorless_document::NodeId; use vectorless_error::Result; -use crate::incremental; use vectorless_llm::LlmClient; use vectorless_llm::memo::{MemoKey, MemoStore}; use vectorless_utils::fingerprint::Fingerprint; diff --git a/vectorless-core/vectorless-index/src/stages/mod.rs b/vectorless-core/vectorless-index/src/stages/mod.rs index 3d8da297..a5bab452 100644 --- a/vectorless-core/vectorless-index/src/stages/mod.rs +++ b/vectorless-core/vectorless-index/src/stages/mod.rs @@ -28,8 +28,8 @@ pub use validate::ValidateStage; pub use verify_ingest::VerifyStage; use super::pipeline::{FailurePolicy, IndexContext, StageResult}; -use vectorless_error::Result; pub use async_trait::async_trait; +use vectorless_error::Result; /// Declares which context fields a stage reads/writes. /// Used by the orchestrator to determine safe parallel execution. 
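The `AccessPattern` doc comment above states the scheduling rule only in prose. The idea is a plain read/write-set disjointness check, sketched below with illustrative stand-in types, since the crate's actual `AccessPattern` shape is not visible in this diff:

```rust
use std::collections::HashSet;

// Stand-in for the access sets one stage declares over context fields.
#[derive(Default)]
struct Access {
    reads: HashSet<&'static str>,
    writes: HashSet<&'static str>,
}

// Two stages may run concurrently only when neither writes a context field
// the other reads or writes (read-write and write-write conflicts barred).
fn can_parallelize(a: &Access, b: &Access) -> bool {
    a.writes.is_disjoint(&b.reads)
        && b.writes.is_disjoint(&a.reads)
        && a.writes.is_disjoint(&b.writes)
}
```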
diff --git a/vectorless-core/vectorless-index/src/stages/optimize.rs b/vectorless-core/vectorless-index/src/stages/optimize.rs index 4b430bca..61ee4706 100644 --- a/vectorless-core/vectorless-index/src/stages/optimize.rs +++ b/vectorless-core/vectorless-index/src/stages/optimize.rs @@ -7,9 +7,9 @@ use super::{AccessPattern, async_trait}; use std::time::Instant; use tracing::{debug, info}; +use crate::pipeline::IndexContext; use vectorless_document::NodeId; use vectorless_error::Result; -use crate::pipeline::IndexContext; use super::{IndexStage, StageResult}; @@ -242,10 +242,10 @@ impl IndexStage for OptimizeStage { #[cfg(test)] mod tests { use super::*; - use vectorless_document::DocumentTree; use crate::PipelineOptions; use crate::pipeline::IndexContext; use crate::pipeline::IndexInput; + use vectorless_document::DocumentTree; /// Create a tree with small leaf children under root for merge tests. /// diff --git a/vectorless-core/vectorless-index/src/stages/parse.rs b/vectorless-core/vectorless-index/src/stages/parse.rs index bba8c8b5..7dbaa076 100644 --- a/vectorless-core/vectorless-index/src/stages/parse.rs +++ b/vectorless-core/vectorless-index/src/stages/parse.rs @@ -39,8 +39,9 @@ impl ParseStage { IndexMode::Auto => match &ctx.input { IndexInput::File(path) => { let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); - DocumentFormat::from_extension(ext) - .ok_or_else(|| vectorless_error::Error::Parse(format!("Unknown format: {}", ext))) + DocumentFormat::from_extension(ext).ok_or_else(|| { + vectorless_error::Error::Parse(format!("Unknown format: {}", ext)) + }) } IndexInput::Content { format, .. } => Ok(*format), IndexInput::Bytes { format, .. } => Ok(*format), @@ -112,8 +113,7 @@ impl IndexStage for ParseStage { debug!("[parse] Parsing inline content ({} chars)", content.len()); // Parse content directly - crate::parse::parse_content(content, *format, self.llm_client.clone()) - .await? + crate::parse::parse_content(content, *format, self.llm_client.clone()).await? } IndexInput::Bytes { data, name, format } => { // Set name diff --git a/vectorless-core/vectorless-index/src/stages/validate.rs b/vectorless-core/vectorless-index/src/stages/validate.rs index 82df7106..5b165a2d 100644 --- a/vectorless-core/vectorless-index/src/stages/validate.rs +++ b/vectorless-core/vectorless-index/src/stages/validate.rs @@ -100,7 +100,10 @@ impl ValidateStage { } /// Check for leaf nodes with empty titles. - fn check_empty_titles(tree: &vectorless_document::DocumentTree, issues: &mut Vec) { + fn check_empty_titles( + tree: &vectorless_document::DocumentTree, + issues: &mut Vec, + ) { let leaves = tree.leaves(); let mut empty_count = 0; diff --git a/vectorless-core/vectorless-index/src/stages/verify_ingest.rs b/vectorless-core/vectorless-index/src/stages/verify_ingest.rs index b20a25fd..33119d83 100644 --- a/vectorless-core/vectorless-index/src/stages/verify_ingest.rs +++ b/vectorless-core/vectorless-index/src/stages/verify_ingest.rs @@ -5,10 +5,10 @@ use tracing::{info, warn}; +use super::async_trait; use super::{AccessPattern, IndexStage}; -use vectorless_error::{Error, Result}; use crate::pipeline::{IndexContext, StageResult}; -use super::async_trait; +use vectorless_error::{Error, Result}; /// Verification stage — ensures ingest produced reliable output. 
/// @@ -43,14 +43,13 @@ impl IndexStage for VerifyStage { async fn execute(&mut self, ctx: &mut IndexContext) -> Result { // Tree must exist and have nodes - let tree = ctx.tree.as_ref().ok_or_else(|| { - Error::InvalidStructure("document tree is empty".into()) - })?; + let tree = ctx + .tree + .as_ref() + .ok_or_else(|| Error::InvalidStructure("document tree is empty".into()))?; let node_count = tree.node_count(); if node_count == 0 { - return Err(Error::InvalidStructure( - "tree has no nodes".into(), - )); + return Err(Error::InvalidStructure("tree has no nodes".into())); } // Summary must be non-empty diff --git a/vectorless-core/vectorless-llm/src/memo/store.rs b/vectorless-core/vectorless-llm/src/memo/store.rs index 0c681ab0..5a87bd61 100644 --- a/vectorless-core/vectorless-llm/src/memo/store.rs +++ b/vectorless-core/vectorless-llm/src/memo/store.rs @@ -388,14 +388,15 @@ impl MemoStore { stats, }; - let parent = path - .parent() - .ok_or_else(|| vectorless_error::Error::Parse("Invalid path for memo store".to_string()))?; + let parent = path.parent().ok_or_else(|| { + vectorless_error::Error::Parse("Invalid path for memo store".to_string()) + })?; tokio::fs::create_dir_all(parent).await?; let temp_path = path.with_extension("tmp"); - let json = serde_json::to_vec_pretty(&data) - .map_err(|e| vectorless_error::Error::Parse(format!("Failed to serialize memo store: {}", e)))?; + let json = serde_json::to_vec_pretty(&data).map_err(|e| { + vectorless_error::Error::Parse(format!("Failed to serialize memo store: {}", e)) + })?; tokio::fs::write(&temp_path, &json).await?; tokio::fs::rename(&temp_path, path).await?; @@ -414,8 +415,9 @@ impl MemoStore { } let bytes = tokio::fs::read(path).await?; - let data: MemoStoreData = serde_json::from_slice(&bytes) - .map_err(|e| vectorless_error::Error::Parse(format!("Failed to deserialize memo store: {}", e)))?; + let data: MemoStoreData = serde_json::from_slice(&bytes).map_err(|e| { + vectorless_error::Error::Parse(format!("Failed to deserialize memo store: {}", e)) + })?; let mut cache = self.cache.write(); diff --git a/vectorless-core/vectorless-py/src/engine.rs b/vectorless-core/vectorless-py/src/engine.rs index b7572d9c..2d09b3e7 100644 --- a/vectorless-core/vectorless-py/src/engine.rs +++ b/vectorless-core/vectorless-py/src/engine.rs @@ -26,7 +26,11 @@ async fn run_ingest(engine: Arc, input: IngestInput) -> PyResult, question: String, doc_ids: Vec) -> PyResult { +async fn run_ask( + engine: Arc, + question: String, + doc_ids: Vec, +) -> PyResult { let answer = engine.ask(&question, &doc_ids).await.map_err(to_py_err)?; Ok(PyAnswer { inner: answer }) } diff --git a/vectorless-core/vectorless-rerank/src/lib.rs b/vectorless-core/vectorless-rerank/src/lib.rs index efab49ee..875c1024 100644 --- a/vectorless-core/vectorless-rerank/src/lib.rs +++ b/vectorless-core/vectorless-rerank/src/lib.rs @@ -21,8 +21,8 @@ pub mod types; use tracing::info; -use vectorless_query::QueryIntent; use types::{Evidence, RerankOutput}; +use vectorless_query::QueryIntent; /// Process agent output through the rerank pipeline. 
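The `MemoStore` persistence hunk above keeps the usual temp-file-then-rename pattern through the reformat. Distilled, with illustrative names rather than the store's API:

```rust
use std::path::Path;

// The rename is atomic on a single filesystem, so a concurrent reader sees
// either the old file or the new one, never a torn write.
async fn atomic_write(path: &Path, bytes: &[u8]) -> std::io::Result<()> {
    if let Some(parent) = path.parent() {
        tokio::fs::create_dir_all(parent).await?;
    }
    let tmp = path.with_extension("tmp");
    tokio::fs::write(&tmp, bytes).await?;
    tokio::fs::rename(&tmp, path).await
}
```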
/// diff --git a/vectorless-core/vectorless-retrieval/src/lib.rs b/vectorless-core/vectorless-retrieval/src/lib.rs index 55d7c936..96d6f1db 100644 --- a/vectorless-core/vectorless-retrieval/src/lib.rs +++ b/vectorless-core/vectorless-retrieval/src/lib.rs @@ -25,4 +25,7 @@ pub mod stream; mod types; pub use stream::{RetrieveEvent, RetrieveEventReceiver}; -pub use types::{Confidence, EvidenceItem, QueryMetrics, QueryResultItem, ReasoningChain, RetrieveResponse, SufficiencyLevel}; +pub use types::{ + Confidence, EvidenceItem, QueryMetrics, QueryResultItem, ReasoningChain, RetrieveResponse, + SufficiencyLevel, +}; diff --git a/vectorless-core/vectorless-retrieval/src/postprocessor.rs b/vectorless-core/vectorless-retrieval/src/postprocessor.rs index 08d55644..79da0ebf 100644 --- a/vectorless-core/vectorless-retrieval/src/postprocessor.rs +++ b/vectorless-core/vectorless-retrieval/src/postprocessor.rs @@ -9,8 +9,8 @@ use std::collections::BTreeMap; -use vectorless_agent::config::{Evidence, Metrics, Output}; use crate::types::{Confidence, EvidenceItem, QueryMetrics, QueryResultItem}; +use vectorless_agent::config::{Evidence, Metrics, Output}; /// Convert agent output to query result items, split by document. /// diff --git a/vectorless-core/vectorless-storage/src/backend/memory.rs b/vectorless-core/vectorless-storage/src/backend/memory.rs index 3d9b3be2..197ddace 100644 --- a/vectorless-core/vectorless-storage/src/backend/memory.rs +++ b/vectorless-core/vectorless-storage/src/backend/memory.rs @@ -39,68 +39,60 @@ impl MemoryBackend { impl StorageBackend for MemoryBackend { fn get(&self, key: &str) -> Result>> { - let data = self - .data - .read() - .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + let data = self.data.read().map_err(|_| { + vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()) + })?; Ok(data.get(key).cloned()) } fn put(&self, key: &str, value: &[u8]) -> Result<()> { - let mut data = self - .data - .write() - .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + let mut data = self.data.write().map_err(|_| { + vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()) + })?; data.insert(key.to_string(), value.to_vec()); Ok(()) } fn delete(&self, key: &str) -> Result { - let mut data = self - .data - .write() - .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + let mut data = self.data.write().map_err(|_| { + vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()) + })?; Ok(data.remove(key).is_some()) } fn exists(&self, key: &str) -> Result { - let data = self - .data - .read() - .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + let data = self.data.read().map_err(|_| { + vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()) + })?; Ok(data.contains_key(key)) } fn keys(&self) -> Result> { - let data = self - .data - .read() - .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + let data = self.data.read().map_err(|_| { + vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()) + })?; Ok(data.keys().cloned().collect()) } fn len(&self) -> Result { - let data = self - .data - .read() - .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + let data = self.data.read().map_err(|_| { + vectorless_error::Error::Cache("Memory backend lock 
poisoned".to_string()) + })?; Ok(data.len()) } fn clear(&self) -> Result<()> { - let mut data = self - .data - .write() - .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + let mut data = self.data.write().map_err(|_| { + vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()) + })?; data.clear(); Ok(()) } fn batch_put(&self, items: &[(&str, &[u8])]) -> Result<()> { - let mut data = self - .data - .write() - .map_err(|_| vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()))?; + let mut data = self.data.write().map_err(|_| { + vectorless_error::Error::Cache("Memory backend lock poisoned".to_string()) + })?; for (key, value) in items { data.insert(key.to_string(), value.to_vec()); } diff --git a/vectorless-core/vectorless-storage/src/cache.rs b/vectorless-core/vectorless-storage/src/cache.rs index 39a66080..2b06c240 100644 --- a/vectorless-core/vectorless-storage/src/cache.rs +++ b/vectorless-core/vectorless-storage/src/cache.rs @@ -267,8 +267,8 @@ pub struct CacheStats { #[cfg(test)] mod tests { use super::*; - use vectorless_document::DocumentTree; use crate::{DocumentMeta, PersistedDocument}; + use vectorless_document::DocumentTree; fn create_test_doc(id: &str) -> PersistedDocument { let meta = DocumentMeta::new(id, "Test Doc", "md"); diff --git a/vectorless-core/vectorless-storage/src/persistence.rs b/vectorless-core/vectorless-storage/src/persistence.rs index c9a823f6..e0a83763 100644 --- a/vectorless-core/vectorless-storage/src/persistence.rs +++ b/vectorless-core/vectorless-storage/src/persistence.rs @@ -15,8 +15,8 @@ use std::fs::File; use std::io::{BufReader, BufWriter, Write}; use std::path::{Path, PathBuf}; -use vectorless_error::Error; use vectorless_document::{DocumentTree, NavigationIndex, ReasoningIndex}; +use vectorless_error::Error; use vectorless_error::Result; /// Current format version for persisted documents. @@ -139,7 +139,10 @@ impl DocumentMeta { } /// Set the logic fingerprint. 
- pub fn with_logic_fingerprint(mut self, fp: vectorless_utils::fingerprint::Fingerprint) -> Self { + pub fn with_logic_fingerprint( + mut self, + fp: vectorless_utils::fingerprint::Fingerprint, + ) -> Self { self.logic_fingerprint = fp; self } From 60c8fab9639cd1f65044b141127efb0d0f8e09a3 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Thu, 23 Apr 2026 09:38:31 +0800 Subject: [PATCH 21/28] refactor(engine): update DocContext creation and improve module imports - Replace direct doc.as_context() call with explicit DocContext construction using individual fields (tree, nav_index, reasoning_index, doc_name) - Update concurrency configuration to use proper type conversion from throttle config refactor(graph): consolidate configuration in vectorless-config - Remove local DocumentGraphConfig implementation - Add vectorless-config dependency to vectorless-graph - Re-export DocumentGraphConfig from vectorless_config as single source of truth refactor(python): update module imports to use vectorless_engine - Replace ::vectorless imports with ::vectorless_engine in python bindings for Answer, Config, DocumentInfo, Engine, Error, Graph, and Metrics types - This ensures consistent usage of the engine module across Python API --- .../vectorless-engine/src/engine.rs | 13 +++-- vectorless-core/vectorless-graph/Cargo.toml | 3 +- .../vectorless-graph/src/config.rs | 48 +------------------ vectorless-core/vectorless-py/src/answer.rs | 2 +- vectorless-core/vectorless-py/src/config.rs | 4 +- vectorless-core/vectorless-py/src/document.rs | 2 +- vectorless-core/vectorless-py/src/engine.rs | 2 +- vectorless-core/vectorless-py/src/error.rs | 2 +- vectorless-core/vectorless-py/src/graph.rs | 2 +- vectorless-core/vectorless-py/src/metrics.rs | 2 +- 10 files changed, 22 insertions(+), 58 deletions(-) diff --git a/vectorless-core/vectorless-engine/src/engine.rs b/vectorless-core/vectorless-engine/src/engine.rs index f2c61173..7ca039c1 100644 --- a/vectorless-core/vectorless-engine/src/engine.rs +++ b/vectorless-core/vectorless-engine/src/engine.rs @@ -477,8 +477,15 @@ impl Engine { } // Build DocContexts from Documents and dispatch - let doc_contexts: Vec = - documents.iter().map(|doc| doc.as_context()).collect(); + let doc_contexts: Vec = documents + .iter() + .map(|doc| vectorless_agent::DocContext { + tree: &doc.tree, + nav_index: &doc.nav_index, + reasoning_index: &doc.reasoning_index, + doc_name: &doc.name, + }) + .collect(); let skip_analysis = !ids.is_empty(); let scope = if skip_analysis { @@ -698,7 +705,7 @@ impl Engine { enable_synonym_expansion: options.enable_synonym_expansion, ..ReasoningIndexConfig::default() }, - concurrency: self.config.llm.throttle.to_runtime_config(), + concurrency: vectorless_llm::throttle::ConcurrencyConfig::from(&self.config.llm.throttle), ..Default::default() } } diff --git a/vectorless-core/vectorless-graph/Cargo.toml b/vectorless-core/vectorless-graph/Cargo.toml index eb42881e..d527ea12 100644 --- a/vectorless-core/vectorless-graph/Cargo.toml +++ b/vectorless-core/vectorless-graph/Cargo.toml @@ -9,8 +9,9 @@ repository.workspace = true homepage.workspace = true [dependencies] -tracing = { workspace = true } +vectorless-config = { path = "../vectorless-config" } vectorless-document = { path = "../vectorless-document" } +tracing = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/vectorless-core/vectorless-graph/src/config.rs b/vectorless-core/vectorless-graph/src/config.rs index 40b1d888..c0f051ea 100644 --- 
a/vectorless-core/vectorless-graph/src/config.rs +++ b/vectorless-core/vectorless-graph/src/config.rs @@ -3,49 +3,5 @@ //! Configuration for document graph building and retrieval. -use serde::{Deserialize, Serialize}; - -/// Configuration for building the document graph. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DocumentGraphConfig { - /// Whether graph building is enabled. - pub enabled: bool, - /// Minimum Jaccard similarity for creating an edge. - pub min_keyword_jaccard: f32, - /// Minimum shared keywords to create an edge. - pub min_shared_keywords: usize, - /// Maximum top keywords per document node. - pub max_keywords_per_doc: usize, - /// Maximum edges per document node. - pub max_edges_per_node: usize, - /// Boost factor applied to graph-connected documents during retrieval. - pub retrieval_boost_factor: f32, -} - -impl Default for DocumentGraphConfig { - fn default() -> Self { - Self { - enabled: true, - min_keyword_jaccard: 0.1, - min_shared_keywords: 2, - max_keywords_per_doc: 50, - max_edges_per_node: 20, - retrieval_boost_factor: 0.15, - } - } -} - -impl DocumentGraphConfig { - /// Create a new config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Create a disabled config. - pub fn disabled() -> Self { - Self { - enabled: false, - ..Self::default() - } - } -} +// Re-export from vectorless_config — single source of truth. +pub use vectorless_config::DocumentGraphConfig; diff --git a/vectorless-core/vectorless-py/src/answer.rs b/vectorless-core/vectorless-py/src/answer.rs index d1c9eba0..92131392 100644 --- a/vectorless-core/vectorless-py/src/answer.rs +++ b/vectorless-core/vectorless-py/src/answer.rs @@ -5,7 +5,7 @@ use pyo3::prelude::*; -use ::vectorless::Answer; +use ::vectorless_engine::Answer; /// A reasoned answer with evidence and trace. #[pyclass(name = "Answer")] diff --git a/vectorless-core/vectorless-py/src/config.rs b/vectorless-core/vectorless-py/src/config.rs index ce601311..6b043ea6 100644 --- a/vectorless-core/vectorless-py/src/config.rs +++ b/vectorless-core/vectorless-py/src/config.rs @@ -24,7 +24,7 @@ use pyo3::prelude::*; /// ``` #[pyclass(name = "Config")] pub struct PyConfig { - pub(crate) inner: vectorless::Config, + pub(crate) inner: vectorless_engine::Config, } #[pymethods] @@ -33,7 +33,7 @@ impl PyConfig { #[new] fn new() -> Self { Self { - inner: vectorless::Config::default(), + inner: vectorless_engine::Config::default(), } } diff --git a/vectorless-core/vectorless-py/src/document.rs b/vectorless-core/vectorless-py/src/document.rs index 56dd1570..af200d02 100644 --- a/vectorless-core/vectorless-py/src/document.rs +++ b/vectorless-core/vectorless-py/src/document.rs @@ -5,7 +5,7 @@ use pyo3::prelude::*; -use ::vectorless::DocumentInfo; +use ::vectorless_engine::DocumentInfo; /// Information about an understood document. 
#[pyclass(name = "DocumentInfo")] diff --git a/vectorless-core/vectorless-py/src/engine.rs b/vectorless-core/vectorless-py/src/engine.rs index 2d09b3e7..66c4e30c 100644 --- a/vectorless-core/vectorless-py/src/engine.rs +++ b/vectorless-core/vectorless-py/src/engine.rs @@ -8,7 +8,7 @@ use pyo3_async_runtimes::tokio::future_into_py; use std::sync::Arc; use tokio::runtime::Runtime; -use ::vectorless::{Engine, EngineBuilder, IngestInput}; +use ::vectorless_engine::{Engine, EngineBuilder, IngestInput}; use super::answer::PyAnswer; use super::document::PyDocumentInfo; diff --git a/vectorless-core/vectorless-py/src/error.rs b/vectorless-core/vectorless-py/src/error.rs index e4a977b8..c4715614 100644 --- a/vectorless-core/vectorless-py/src/error.rs +++ b/vectorless-core/vectorless-py/src/error.rs @@ -6,7 +6,7 @@ use pyo3::exceptions::PyException; use pyo3::prelude::*; -use ::vectorless::Error as RustError; +use ::vectorless_engine::Error as RustError; /// Python exception for vectorless errors. #[pyclass(extends = PyException, subclass)] diff --git a/vectorless-core/vectorless-py/src/graph.rs b/vectorless-core/vectorless-py/src/graph.rs index a424316f..556731fc 100644 --- a/vectorless-core/vectorless-py/src/graph.rs +++ b/vectorless-core/vectorless-py/src/graph.rs @@ -5,7 +5,7 @@ use pyo3::prelude::*; -use ::vectorless::{DocumentGraph, DocumentGraphNode, EdgeEvidence, GraphEdge, WeightedKeyword}; +use ::vectorless_engine::{DocumentGraph, DocumentGraphNode, EdgeEvidence, GraphEdge, WeightedKeyword}; /// A keyword with weight from document analysis. #[pyclass(name = "WeightedKeyword")] diff --git a/vectorless-core/vectorless-py/src/metrics.rs b/vectorless-core/vectorless-py/src/metrics.rs index f194cd82..19f94623 100644 --- a/vectorless-core/vectorless-py/src/metrics.rs +++ b/vectorless-core/vectorless-py/src/metrics.rs @@ -5,7 +5,7 @@ use pyo3::prelude::*; -use ::vectorless::{LlmMetricsReport, MetricsReport, RetrievalMetricsReport}; +use ::vectorless_engine::{LlmMetricsReport, MetricsReport, RetrievalMetricsReport}; /// LLM usage metrics report. #[pyclass(name = "LlmMetricsReport")] From 968936c4b42b943366e96bba6c3de39fb7b18f0a Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Thu, 23 Apr 2026 09:41:12 +0800 Subject: [PATCH 22/28] refactor(engine): re-export types from sub-crates for better API access - Add re-exports of Config from vectorless_config - Add re-exports of core document types (Answer, Concept, DocumentInfo, etc.) - Add re-exports of error handling types (Error, Result) - Add re-exports of event types (EventEmitter, IndexEvent, QueryEvent, etc.) - Add re-exports of graph types (DocumentGraph, DocumentGraphNode, etc.) - Add re-exports of metrics types (LlmMetricsReport, MetricsReport, etc.) 
- Add re-export of DocumentTree from vectorless_document --- vectorless-core/vectorless-engine/src/lib.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vectorless-core/vectorless-engine/src/lib.rs b/vectorless-core/vectorless-engine/src/lib.rs index 3e3b81bd..80abbe6d 100644 --- a/vectorless-core/vectorless-engine/src/lib.rs +++ b/vectorless-core/vectorless-engine/src/lib.rs @@ -103,3 +103,19 @@ pub use types::{ // ============================================================ pub use vectorless_document::DocumentFormat; + +// ============================================================ +// Re-exports from sub-crates (for downstream consumers) +// ============================================================ + +pub use vectorless_config::Config; +pub use vectorless_document::{ + Answer, Concept, DocumentInfo, Evidence, IngestInput, ReasoningTrace, TraceStep, +}; +pub use vectorless_error::{Error, Result}; +pub use vectorless_events::{EventEmitter, IndexEvent, QueryEvent, WorkspaceEvent}; +pub use vectorless_graph::{ + DocumentGraph, DocumentGraphNode, EdgeEvidence, GraphEdge, WeightedKeyword, +}; +pub use vectorless_metrics::{LlmMetricsReport, MetricsReport, RetrievalMetricsReport}; +pub use vectorless_document::DocumentTree; From 275c5a3fb4d00350f6c49b36f69fb46449d08431 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Thu, 23 Apr 2026 09:45:38 +0800 Subject: [PATCH 23/28] refactor(vectorless-config): move DocumentGraphConfig to vectorless-graph crate - Remove tracing and tokio dependencies from vectorless-config - Add vectorless-graph as dependency instead - Remove graph module from types and update import to use vectorless-graph - Move DocumentGraphConfig re-export to use vectorless_graph crate refactor(vectorless-graph): move DocumentGraphConfig implementation to graph crate - Remove vectorless-config dependency from vectorless-graph - Implement DocumentGraphConfig directly in vectorless-graph crate - Include all configuration fields and methods for document graph settings - Maintain same API interface while moving implementation to correct location --- vectorless-core/vectorless-config/Cargo.toml | 3 +- .../vectorless-config/src/types/graph.rs | 51 ------------------- .../vectorless-config/src/types/mod.rs | 3 +- vectorless-core/vectorless-graph/Cargo.toml | 1 - .../vectorless-graph/src/config.rs | 48 ++++++++++++++++- 5 files changed, 48 insertions(+), 58 deletions(-) delete mode 100644 vectorless-core/vectorless-config/src/types/graph.rs diff --git a/vectorless-core/vectorless-config/Cargo.toml b/vectorless-core/vectorless-config/Cargo.toml index c94f8713..c42bda8b 100644 --- a/vectorless-core/vectorless-config/Cargo.toml +++ b/vectorless-core/vectorless-config/Cargo.toml @@ -9,8 +9,7 @@ repository.workspace = true homepage.workspace = true [dependencies] -tracing = { workspace = true } -tokio = { workspace = true } +vectorless-graph = { path = "../vectorless-graph" } serde = { workspace = true } serde_json = { workspace = true } diff --git a/vectorless-core/vectorless-config/src/types/graph.rs b/vectorless-core/vectorless-config/src/types/graph.rs deleted file mode 100644 index 40b1d888..00000000 --- a/vectorless-core/vectorless-config/src/types/graph.rs +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Configuration for document graph building and retrieval. - -use serde::{Deserialize, Serialize}; - -/// Configuration for building the document graph. 
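One consequence of the re-export facade added in PATCH 22 above: downstream consumers, the Python bindings included, can import through a single crate instead of depending on each sub-crate directly. A small illustration using only names re-exported there:

```rust
use vectorless_engine::{Answer, Config, Error, IngestInput, Result};

// No direct dependency on vectorless-config, vectorless-document, or
// vectorless-error is needed for these names anymore.
fn describe(answer: &Answer) -> String {
    format!("confidence {:.2}", answer.confidence)
}
```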
-#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DocumentGraphConfig { - /// Whether graph building is enabled. - pub enabled: bool, - /// Minimum Jaccard similarity for creating an edge. - pub min_keyword_jaccard: f32, - /// Minimum shared keywords to create an edge. - pub min_shared_keywords: usize, - /// Maximum top keywords per document node. - pub max_keywords_per_doc: usize, - /// Maximum edges per document node. - pub max_edges_per_node: usize, - /// Boost factor applied to graph-connected documents during retrieval. - pub retrieval_boost_factor: f32, -} - -impl Default for DocumentGraphConfig { - fn default() -> Self { - Self { - enabled: true, - min_keyword_jaccard: 0.1, - min_shared_keywords: 2, - max_keywords_per_doc: 50, - max_edges_per_node: 20, - retrieval_boost_factor: 0.15, - } - } -} - -impl DocumentGraphConfig { - /// Create a new config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Create a disabled config. - pub fn disabled() -> Self { - Self { - enabled: false, - ..Self::default() - } - } -} diff --git a/vectorless-core/vectorless-config/src/types/mod.rs b/vectorless-core/vectorless-config/src/types/mod.rs index 47b2e2d1..3a840489 100644 --- a/vectorless-core/vectorless-config/src/types/mod.rs +++ b/vectorless-core/vectorless-config/src/types/mod.rs @@ -3,7 +3,6 @@ //! Configuration type definitions. -mod graph; mod indexer; mod llm_pool; mod metrics; @@ -12,7 +11,7 @@ mod storage; use serde::{Deserialize, Serialize}; -pub use graph::DocumentGraphConfig; +pub use vectorless_graph::DocumentGraphConfig; pub use indexer::IndexerConfig; pub use llm_pool::{ FallbackBehavior, FallbackConfig, LlmConfig, OnAllFailedBehavior, RetryConfig, SlotConfig, diff --git a/vectorless-core/vectorless-graph/Cargo.toml b/vectorless-core/vectorless-graph/Cargo.toml index d527ea12..a441bfd6 100644 --- a/vectorless-core/vectorless-graph/Cargo.toml +++ b/vectorless-core/vectorless-graph/Cargo.toml @@ -9,7 +9,6 @@ repository.workspace = true homepage.workspace = true [dependencies] -vectorless-config = { path = "../vectorless-config" } vectorless-document = { path = "../vectorless-document" } tracing = { workspace = true } serde = { workspace = true } diff --git a/vectorless-core/vectorless-graph/src/config.rs b/vectorless-core/vectorless-graph/src/config.rs index c0f051ea..40b1d888 100644 --- a/vectorless-core/vectorless-graph/src/config.rs +++ b/vectorless-core/vectorless-graph/src/config.rs @@ -3,5 +3,49 @@ //! Configuration for document graph building and retrieval. -// Re-export from vectorless_config — single source of truth. -pub use vectorless_config::DocumentGraphConfig; +use serde::{Deserialize, Serialize}; + +/// Configuration for building the document graph. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct DocumentGraphConfig { + /// Whether graph building is enabled. + pub enabled: bool, + /// Minimum Jaccard similarity for creating an edge. + pub min_keyword_jaccard: f32, + /// Minimum shared keywords to create an edge. + pub min_shared_keywords: usize, + /// Maximum top keywords per document node. + pub max_keywords_per_doc: usize, + /// Maximum edges per document node. + pub max_edges_per_node: usize, + /// Boost factor applied to graph-connected documents during retrieval. 
+ pub retrieval_boost_factor: f32, +} + +impl Default for DocumentGraphConfig { + fn default() -> Self { + Self { + enabled: true, + min_keyword_jaccard: 0.1, + min_shared_keywords: 2, + max_keywords_per_doc: 50, + max_edges_per_node: 20, + retrieval_boost_factor: 0.15, + } + } +} + +impl DocumentGraphConfig { + /// Create a new config with defaults. + pub fn new() -> Self { + Self::default() + } + + /// Create a disabled config. + pub fn disabled() -> Self { + Self { + enabled: false, + ..Self::default() + } + } +} From 2757aa68fe35630aad80c1abd6708637270c2f00 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Thu, 23 Apr 2026 10:22:30 +0800 Subject: [PATCH 24/28] feat: remove vectorless core module and related functionality BREAKING CHANGE: Remove the entire vectorless core module including: - Cargo.toml configuration and dependencies - Single document challenge example that tested deep reasoning - Agent command parsing system with navigation commands (ls, cd, cat, find, grep, etc.) - Target resolution logic for document tree navigation - All associated tests and implementations This removes the core vectorless functionality that enabled AI-powered document navigation and reasoning capabilities. --- vectorless-core/vectorless/Cargo.toml | 93 -- .../examples/single_doc_challenge.rs | 250 ---- .../vectorless/src/agent/command.rs | 629 ---------- .../vectorless/src/agent/config.rs | 248 ---- .../vectorless/src/agent/context.rs | 120 -- .../vectorless/src/agent/events.rs | 537 --------- vectorless-core/vectorless/src/agent/mod.rs | 55 - .../src/agent/orchestrator/analyze.rs | 159 --- .../src/agent/orchestrator/dispatch.rs | 92 -- .../src/agent/orchestrator/evaluate.rs | 128 -- .../vectorless/src/agent/orchestrator/mod.rs | 223 ---- .../src/agent/orchestrator/replan.rs | 249 ---- .../src/agent/orchestrator/supervisor.rs | 159 --- .../vectorless/src/agent/prompts.rs | 569 --------- vectorless-core/vectorless/src/agent/state.rs | 312 ----- .../vectorless/src/agent/tools/common.rs | 69 -- .../vectorless/src/agent/tools/mod.rs | 101 -- .../src/agent/tools/orchestrator.rs | 203 ---- .../vectorless/src/agent/tools/worker/cat.rs | 115 -- .../vectorless/src/agent/tools/worker/cd.rs | 262 ----- .../vectorless/src/agent/tools/worker/find.rs | 128 -- .../vectorless/src/agent/tools/worker/grep.rs | 175 --- .../vectorless/src/agent/tools/worker/head.rs | 119 -- .../vectorless/src/agent/tools/worker/ls.rs | 124 -- .../vectorless/src/agent/tools/worker/mod.rs | 39 - .../vectorless/src/agent/tools/worker/pwd.rs | 58 - .../vectorless/src/agent/tools/worker/wc.rs | 109 -- .../vectorless/src/agent/worker/execute.rs | 278 ----- .../vectorless/src/agent/worker/format.rs | 20 - .../vectorless/src/agent/worker/mod.rs | 236 ---- .../vectorless/src/agent/worker/navigation.rs | 448 ------- .../vectorless/src/agent/worker/planning.rs | 708 ------------ .../vectorless/src/client/builder.rs | 268 ----- .../vectorless/src/client/engine.rs | 923 --------------- .../vectorless/src/client/index_context.rs | 363 ------ .../vectorless/src/client/indexed_document.rs | 130 --- .../vectorless/src/client/indexer.rs | 387 ------- vectorless-core/vectorless/src/client/mod.rs | 106 -- .../vectorless/src/client/query_context.rs | 179 --- .../vectorless/src/client/retriever.rs | 140 --- .../vectorless/src/client/test_support.rs | 54 - .../vectorless/src/client/types.rs | 536 --------- .../vectorless/src/client/workspace.rs | 243 ---- vectorless-core/vectorless/src/config/mod.rs | 16 - 
.../vectorless/src/config/types/indexer.rs | 108 -- .../vectorless/src/config/types/llm_pool.rs | 633 ---------- .../vectorless/src/config/types/metrics.rs | 181 --- .../vectorless/src/config/types/mod.rs | 362 ------ .../vectorless/src/config/types/retrieval.rs | 170 --- .../vectorless/src/config/types/storage.rs | 742 ------------ .../vectorless/src/config/validator.rs | 323 ------ .../vectorless/src/document/format.rs | 62 - .../vectorless/src/document/mod.rs | 43 - .../vectorless/src/document/navigation.rs | 626 ---------- .../vectorless/src/document/node.rs | 144 --- .../vectorless/src/document/reasoning.rs | 444 ------- .../vectorless/src/document/reference.rs | 559 --------- .../vectorless/src/document/serde_helpers.rs | 241 ---- .../vectorless/src/document/structure.rs | 65 -- .../vectorless/src/document/toc.rs | 343 ------ .../vectorless/src/document/tree.rs | 883 -------------- .../vectorless/src/document/understanding.rs | 318 ----- vectorless-core/vectorless/src/error.rs | 329 ------ .../vectorless/src/events/emitter.rs | 256 ---- vectorless-core/vectorless/src/events/mod.rs | 31 - .../vectorless/src/events/types.rs | 138 --- .../vectorless/src/graph/builder.rs | 400 ------- .../vectorless/src/graph/config.rs | 51 - vectorless-core/vectorless/src/graph/mod.rs | 38 - vectorless-core/vectorless/src/graph/types.rs | 310 ----- .../vectorless/src/index/config.rs | 389 ------- .../src/index/incremental/detector.rs | 654 ----------- .../vectorless/src/index/incremental/mod.rs | 81 -- .../src/index/incremental/resolver.rs | 105 -- .../src/index/incremental/updater.rs | 177 --- vectorless-core/vectorless/src/index/mod.rs | 73 -- .../src/index/parse/markdown/config.rs | 219 ---- .../src/index/parse/markdown/frontmatter.rs | 219 ---- .../src/index/parse/markdown/mod.rs | 30 - .../src/index/parse/markdown/parser.rs | 601 ---------- .../vectorless/src/index/parse/mod.rs | 96 -- .../vectorless/src/index/parse/pdf/mod.rs | 32 - .../vectorless/src/index/parse/pdf/parser.rs | 366 ------ .../vectorless/src/index/parse/pdf/types.rs | 171 --- .../src/index/parse/toc/assigner.rs | 395 ------- .../src/index/parse/toc/detector.rs | 349 ------ .../vectorless/src/index/parse/toc/mod.rs | 28 - .../vectorless/src/index/parse/toc/parser.rs | 279 ----- .../src/index/parse/toc/processor.rs | 573 --------- .../src/index/parse/toc/repairer.rs | 247 ---- .../index/parse/toc/structure_extractor.rs | 481 -------- .../vectorless/src/index/parse/toc/types.rs | 350 ------ .../src/index/parse/toc/verifier.rs | 281 ----- .../vectorless/src/index/parse/types.rs | 173 --- .../src/index/pipeline/checkpoint.rs | 329 ------ .../vectorless/src/index/pipeline/context.rs | 465 -------- .../vectorless/src/index/pipeline/executor.rs | 198 ---- .../vectorless/src/index/pipeline/metrics.rs | 6 - .../vectorless/src/index/pipeline/mod.rs | 24 - .../src/index/pipeline/orchestrator.rs | 1028 ----------------- .../vectorless/src/index/pipeline/policy.rs | 222 ---- .../vectorless/src/index/stages/build.rs | 334 ------ .../vectorless/src/index/stages/concept.rs | 238 ---- .../vectorless/src/index/stages/enhance.rs | 449 ------- .../vectorless/src/index/stages/enrich.rs | 240 ---- .../vectorless/src/index/stages/mod.rs | 141 --- .../vectorless/src/index/stages/navigation.rs | 563 --------- .../vectorless/src/index/stages/optimize.rs | 455 -------- .../vectorless/src/index/stages/parse.rs | 166 --- .../vectorless/src/index/stages/reasoning.rs | 639 ---------- .../vectorless/src/index/stages/split.rs | 347 ------ 
.../vectorless/src/index/stages/validate.rs | 365 ------ .../src/index/stages/verify_ingest.rs | 79 -- .../vectorless/src/index/summary/full.rs | 65 -- .../vectorless/src/index/summary/lazy.rs | 153 --- .../vectorless/src/index/summary/mod.rs | 24 - .../vectorless/src/index/summary/selective.rs | 120 -- .../vectorless/src/index/summary/strategy.rs | 322 ------ vectorless-core/vectorless/src/lib.rs | 93 -- vectorless-core/vectorless/src/llm/client.rs | 378 ------ vectorless-core/vectorless/src/llm/config.rs | 248 ---- vectorless-core/vectorless/src/llm/error.rs | 135 --- .../vectorless/src/llm/executor.rs | 568 --------- .../vectorless/src/llm/fallback.rs | 378 ------ .../vectorless/src/llm/memo/mod.rs | 14 - .../vectorless/src/llm/memo/store.rs | 679 ----------- .../vectorless/src/llm/memo/types.rs | 414 ------- vectorless-core/vectorless/src/llm/mod.rs | 42 - vectorless-core/vectorless/src/llm/pool.rs | 176 --- .../vectorless/src/llm/throttle.rs | 259 ----- vectorless-core/vectorless/src/metrics/hub.rs | 324 ------ .../vectorless/src/metrics/index.rs | 199 ---- vectorless-core/vectorless/src/metrics/llm.rs | 207 ---- vectorless-core/vectorless/src/metrics/mod.rs | 56 - .../vectorless/src/metrics/retrieval.rs | 263 ----- vectorless-core/vectorless/src/query/mod.rs | 45 - vectorless-core/vectorless/src/query/types.rs | 114 -- .../vectorless/src/query/understand.rs | 246 ---- .../vectorless/src/rerank/dedup.rs | 216 ---- vectorless-core/vectorless/src/rerank/mod.rs | 104 -- .../vectorless/src/rerank/types.rs | 14 - .../vectorless/src/retrieval/cache.rs | 577 --------- .../vectorless/src/retrieval/dispatcher.rs | 78 -- .../vectorless/src/retrieval/mod.rs | 28 - .../vectorless/src/retrieval/postprocessor.rs | 130 --- .../vectorless/src/retrieval/stream.rs | 128 -- .../vectorless/src/retrieval/types.rs | 193 ---- .../vectorless/src/scoring/bm25.rs | 690 ----------- vectorless-core/vectorless/src/scoring/mod.rs | 8 - .../vectorless/src/storage/backend/file.rs | 293 ----- .../vectorless/src/storage/backend/memory.rs | 181 --- .../vectorless/src/storage/backend/mod.rs | 34 - .../src/storage/backend/trait_def.rs | 113 -- .../vectorless/src/storage/cache.rs | 381 ------ .../vectorless/src/storage/codec.rs | 245 ---- .../vectorless/src/storage/lock.rs | 280 ----- .../vectorless/src/storage/migration.rs | 385 ------ vectorless-core/vectorless/src/storage/mod.rs | 46 - .../vectorless/src/storage/persistence.rs | 877 -------------- .../vectorless/src/storage/workspace.rs | 666 ----------- .../vectorless/src/utils/fingerprint.rs | 496 -------- vectorless-core/vectorless/src/utils/mod.rs | 17 - vectorless-core/vectorless/src/utils/token.rs | 64 - .../vectorless/src/utils/validation.rs | 195 ---- 164 files changed, 42778 deletions(-) delete mode 100644 vectorless-core/vectorless/Cargo.toml delete mode 100644 vectorless-core/vectorless/examples/single_doc_challenge.rs delete mode 100644 vectorless-core/vectorless/src/agent/command.rs delete mode 100644 vectorless-core/vectorless/src/agent/config.rs delete mode 100644 vectorless-core/vectorless/src/agent/context.rs delete mode 100644 vectorless-core/vectorless/src/agent/events.rs delete mode 100644 vectorless-core/vectorless/src/agent/mod.rs delete mode 100644 vectorless-core/vectorless/src/agent/orchestrator/analyze.rs delete mode 100644 vectorless-core/vectorless/src/agent/orchestrator/dispatch.rs delete mode 100644 vectorless-core/vectorless/src/agent/orchestrator/evaluate.rs delete mode 100644 vectorless-core/vectorless/src/agent/orchestrator/mod.rs 
delete mode 100644 vectorless-core/vectorless/src/agent/orchestrator/replan.rs delete mode 100644 vectorless-core/vectorless/src/agent/orchestrator/supervisor.rs delete mode 100644 vectorless-core/vectorless/src/agent/prompts.rs delete mode 100644 vectorless-core/vectorless/src/agent/state.rs delete mode 100644 vectorless-core/vectorless/src/agent/tools/common.rs delete mode 100644 vectorless-core/vectorless/src/agent/tools/mod.rs delete mode 100644 vectorless-core/vectorless/src/agent/tools/orchestrator.rs delete mode 100644 vectorless-core/vectorless/src/agent/tools/worker/cat.rs delete mode 100644 vectorless-core/vectorless/src/agent/tools/worker/cd.rs delete mode 100644 vectorless-core/vectorless/src/agent/tools/worker/find.rs delete mode 100644 vectorless-core/vectorless/src/agent/tools/worker/grep.rs delete mode 100644 vectorless-core/vectorless/src/agent/tools/worker/head.rs delete mode 100644 vectorless-core/vectorless/src/agent/tools/worker/ls.rs delete mode 100644 vectorless-core/vectorless/src/agent/tools/worker/mod.rs delete mode 100644 vectorless-core/vectorless/src/agent/tools/worker/pwd.rs delete mode 100644 vectorless-core/vectorless/src/agent/tools/worker/wc.rs delete mode 100644 vectorless-core/vectorless/src/agent/worker/execute.rs delete mode 100644 vectorless-core/vectorless/src/agent/worker/format.rs delete mode 100644 vectorless-core/vectorless/src/agent/worker/mod.rs delete mode 100644 vectorless-core/vectorless/src/agent/worker/navigation.rs delete mode 100644 vectorless-core/vectorless/src/agent/worker/planning.rs delete mode 100644 vectorless-core/vectorless/src/client/builder.rs delete mode 100644 vectorless-core/vectorless/src/client/engine.rs delete mode 100644 vectorless-core/vectorless/src/client/index_context.rs delete mode 100644 vectorless-core/vectorless/src/client/indexed_document.rs delete mode 100644 vectorless-core/vectorless/src/client/indexer.rs delete mode 100644 vectorless-core/vectorless/src/client/mod.rs delete mode 100644 vectorless-core/vectorless/src/client/query_context.rs delete mode 100644 vectorless-core/vectorless/src/client/retriever.rs delete mode 100644 vectorless-core/vectorless/src/client/test_support.rs delete mode 100644 vectorless-core/vectorless/src/client/types.rs delete mode 100644 vectorless-core/vectorless/src/client/workspace.rs delete mode 100644 vectorless-core/vectorless/src/config/mod.rs delete mode 100644 vectorless-core/vectorless/src/config/types/indexer.rs delete mode 100644 vectorless-core/vectorless/src/config/types/llm_pool.rs delete mode 100644 vectorless-core/vectorless/src/config/types/metrics.rs delete mode 100644 vectorless-core/vectorless/src/config/types/mod.rs delete mode 100644 vectorless-core/vectorless/src/config/types/retrieval.rs delete mode 100644 vectorless-core/vectorless/src/config/types/storage.rs delete mode 100644 vectorless-core/vectorless/src/config/validator.rs delete mode 100644 vectorless-core/vectorless/src/document/format.rs delete mode 100644 vectorless-core/vectorless/src/document/mod.rs delete mode 100644 vectorless-core/vectorless/src/document/navigation.rs delete mode 100644 vectorless-core/vectorless/src/document/node.rs delete mode 100644 vectorless-core/vectorless/src/document/reasoning.rs delete mode 100644 vectorless-core/vectorless/src/document/reference.rs delete mode 100644 vectorless-core/vectorless/src/document/serde_helpers.rs delete mode 100644 vectorless-core/vectorless/src/document/structure.rs delete mode 100644 vectorless-core/vectorless/src/document/toc.rs delete 
mode 100644 vectorless-core/vectorless/src/document/tree.rs delete mode 100644 vectorless-core/vectorless/src/document/understanding.rs delete mode 100644 vectorless-core/vectorless/src/error.rs delete mode 100644 vectorless-core/vectorless/src/events/emitter.rs delete mode 100644 vectorless-core/vectorless/src/events/mod.rs delete mode 100644 vectorless-core/vectorless/src/events/types.rs delete mode 100644 vectorless-core/vectorless/src/graph/builder.rs delete mode 100644 vectorless-core/vectorless/src/graph/config.rs delete mode 100644 vectorless-core/vectorless/src/graph/mod.rs delete mode 100644 vectorless-core/vectorless/src/graph/types.rs delete mode 100644 vectorless-core/vectorless/src/index/config.rs delete mode 100644 vectorless-core/vectorless/src/index/incremental/detector.rs delete mode 100644 vectorless-core/vectorless/src/index/incremental/mod.rs delete mode 100644 vectorless-core/vectorless/src/index/incremental/resolver.rs delete mode 100644 vectorless-core/vectorless/src/index/incremental/updater.rs delete mode 100644 vectorless-core/vectorless/src/index/mod.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/markdown/config.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/markdown/frontmatter.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/markdown/mod.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/markdown/parser.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/mod.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/pdf/mod.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/pdf/parser.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/pdf/types.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/toc/assigner.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/toc/detector.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/toc/mod.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/toc/parser.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/toc/processor.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/toc/repairer.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/toc/structure_extractor.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/toc/types.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/toc/verifier.rs delete mode 100644 vectorless-core/vectorless/src/index/parse/types.rs delete mode 100644 vectorless-core/vectorless/src/index/pipeline/checkpoint.rs delete mode 100644 vectorless-core/vectorless/src/index/pipeline/context.rs delete mode 100644 vectorless-core/vectorless/src/index/pipeline/executor.rs delete mode 100644 vectorless-core/vectorless/src/index/pipeline/metrics.rs delete mode 100644 vectorless-core/vectorless/src/index/pipeline/mod.rs delete mode 100644 vectorless-core/vectorless/src/index/pipeline/orchestrator.rs delete mode 100644 vectorless-core/vectorless/src/index/pipeline/policy.rs delete mode 100644 vectorless-core/vectorless/src/index/stages/build.rs delete mode 100644 vectorless-core/vectorless/src/index/stages/concept.rs delete mode 100644 vectorless-core/vectorless/src/index/stages/enhance.rs delete mode 100644 vectorless-core/vectorless/src/index/stages/enrich.rs delete mode 100644 vectorless-core/vectorless/src/index/stages/mod.rs delete mode 100644 vectorless-core/vectorless/src/index/stages/navigation.rs delete mode 100644 vectorless-core/vectorless/src/index/stages/optimize.rs delete mode 
100644 vectorless-core/vectorless/src/index/stages/parse.rs delete mode 100644 vectorless-core/vectorless/src/index/stages/reasoning.rs delete mode 100644 vectorless-core/vectorless/src/index/stages/split.rs delete mode 100644 vectorless-core/vectorless/src/index/stages/validate.rs delete mode 100644 vectorless-core/vectorless/src/index/stages/verify_ingest.rs delete mode 100644 vectorless-core/vectorless/src/index/summary/full.rs delete mode 100644 vectorless-core/vectorless/src/index/summary/lazy.rs delete mode 100644 vectorless-core/vectorless/src/index/summary/mod.rs delete mode 100644 vectorless-core/vectorless/src/index/summary/selective.rs delete mode 100644 vectorless-core/vectorless/src/index/summary/strategy.rs delete mode 100644 vectorless-core/vectorless/src/lib.rs delete mode 100644 vectorless-core/vectorless/src/llm/client.rs delete mode 100644 vectorless-core/vectorless/src/llm/config.rs delete mode 100644 vectorless-core/vectorless/src/llm/error.rs delete mode 100644 vectorless-core/vectorless/src/llm/executor.rs delete mode 100644 vectorless-core/vectorless/src/llm/fallback.rs delete mode 100644 vectorless-core/vectorless/src/llm/memo/mod.rs delete mode 100644 vectorless-core/vectorless/src/llm/memo/store.rs delete mode 100644 vectorless-core/vectorless/src/llm/memo/types.rs delete mode 100644 vectorless-core/vectorless/src/llm/mod.rs delete mode 100644 vectorless-core/vectorless/src/llm/pool.rs delete mode 100644 vectorless-core/vectorless/src/llm/throttle.rs delete mode 100644 vectorless-core/vectorless/src/metrics/hub.rs delete mode 100644 vectorless-core/vectorless/src/metrics/index.rs delete mode 100644 vectorless-core/vectorless/src/metrics/llm.rs delete mode 100644 vectorless-core/vectorless/src/metrics/mod.rs delete mode 100644 vectorless-core/vectorless/src/metrics/retrieval.rs delete mode 100644 vectorless-core/vectorless/src/query/mod.rs delete mode 100644 vectorless-core/vectorless/src/query/types.rs delete mode 100644 vectorless-core/vectorless/src/query/understand.rs delete mode 100644 vectorless-core/vectorless/src/rerank/dedup.rs delete mode 100644 vectorless-core/vectorless/src/rerank/mod.rs delete mode 100644 vectorless-core/vectorless/src/rerank/types.rs delete mode 100644 vectorless-core/vectorless/src/retrieval/cache.rs delete mode 100644 vectorless-core/vectorless/src/retrieval/dispatcher.rs delete mode 100644 vectorless-core/vectorless/src/retrieval/mod.rs delete mode 100644 vectorless-core/vectorless/src/retrieval/postprocessor.rs delete mode 100644 vectorless-core/vectorless/src/retrieval/stream.rs delete mode 100644 vectorless-core/vectorless/src/retrieval/types.rs delete mode 100644 vectorless-core/vectorless/src/scoring/bm25.rs delete mode 100644 vectorless-core/vectorless/src/scoring/mod.rs delete mode 100644 vectorless-core/vectorless/src/storage/backend/file.rs delete mode 100644 vectorless-core/vectorless/src/storage/backend/memory.rs delete mode 100644 vectorless-core/vectorless/src/storage/backend/mod.rs delete mode 100644 vectorless-core/vectorless/src/storage/backend/trait_def.rs delete mode 100644 vectorless-core/vectorless/src/storage/cache.rs delete mode 100644 vectorless-core/vectorless/src/storage/codec.rs delete mode 100644 vectorless-core/vectorless/src/storage/lock.rs delete mode 100644 vectorless-core/vectorless/src/storage/migration.rs delete mode 100644 vectorless-core/vectorless/src/storage/mod.rs delete mode 100644 vectorless-core/vectorless/src/storage/persistence.rs delete mode 100644 
vectorless-core/vectorless/src/storage/workspace.rs delete mode 100644 vectorless-core/vectorless/src/utils/fingerprint.rs delete mode 100644 vectorless-core/vectorless/src/utils/mod.rs delete mode 100644 vectorless-core/vectorless/src/utils/token.rs delete mode 100644 vectorless-core/vectorless/src/utils/validation.rs diff --git a/vectorless-core/vectorless/Cargo.toml b/vectorless-core/vectorless/Cargo.toml deleted file mode 100644 index a8de0234..00000000 --- a/vectorless-core/vectorless/Cargo.toml +++ /dev/null @@ -1,93 +0,0 @@ -[package] -name = "vectorless" -version.workspace = true -edition.workspace = true -authors.workspace = true -description.workspace = true -license.workspace = true -repository.workspace = true -homepage.workspace = true -documentation = "https://docs.rs/vectorless" -keywords = ["rag", "document", "retrieval", "indexing", "llm"] -categories = ["text-processing", "data-structures", "algorithms"] -readme = "../../README.md" - -[dependencies] -# Async runtime -tokio = { workspace = true } -async-trait = { workspace = true } -futures = { workspace = true } - -# Serialization -serde = { workspace = true } -serde_json = { workspace = true } - -# Error handling -thiserror = { workspace = true } -anyhow = { workspace = true, optional = true } - -# OpenAI-compatible API client -async-openai = { workspace = true } - -# UUID -uuid = { workspace = true } - -# Time -chrono = { workspace = true } - -# Logging -tracing = { workspace = true } - -# Rate limiting -governor = { workspace = true } -nonzero_ext = { workspace = true } - -# Token counting -tiktoken-rs = { workspace = true } - -# Text processing -regex = { workspace = true } - -# Markdown parsing -pulldown-cmark = { workspace = true } - -# Tree data structure -indextree = { workspace = true } - -# LRU cache -lru = { workspace = true } - -# Checksum -sha2 = { workspace = true } - -# BLAKE2b hashing -blake2 = { workspace = true } -base64 = { workspace = true } - -# Synchronization primitives -parking_lot = { workspace = true } - -# Compression -flate2 = { workspace = true } - -# File locking (Unix) -[target.'cfg(unix)'.dependencies] -libc = { workspace = true } - -# PDF processing -pdf-extract = { workspace = true } -lopdf = { workspace = true } - -# Random number generation -rand = { workspace = true } - -# BM25 scoring -bm25 = { workspace = true } - -[dev-dependencies] -tempfile = { workspace = true } -tokio-test = { workspace = true } -tracing-subscriber = { workspace = true } - -[lints] -workspace = true diff --git a/vectorless-core/vectorless/examples/single_doc_challenge.rs b/vectorless-core/vectorless/examples/single_doc_challenge.rs deleted file mode 100644 index 7a0f44da..00000000 --- a/vectorless-core/vectorless/examples/single_doc_challenge.rs +++ /dev/null @@ -1,250 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Single-document reasoning challenge. -//! -//! Indexes a realistic technical document and asks questions that require -//! the engine to navigate deep into the tree, cross-reference details -//! across distant sections, and extract information buried in nested -//! structures — not surface-level keyword matches. -//! -//! ```bash -//! LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o \ -//! LLM_ENDPOINT=https://api.openai.com/v1 \ -//! cargo run --example single_doc_challenge -//! ``` - -use vectorless::{EngineBuilder, IngestInput}; - -/// A research report with information scattered across sections. 
-/// The answers to the challenge questions require connecting dots -/// from different parts of the document, not simple keyword lookup. -const REPORT: &str = r#" -# Quantum Computing Division — Annual Research Report 2025 - -## Executive Summary - -The Quantum Computing Division achieved several milestones in fiscal year 2025. -Total division revenue reached $47.2M, representing 23% year-over-year growth. -The division employed 312 staff across four research labs as of December 2025. -Headcount grew by 18% during the year, with the majority of new hires in the -error correction and cryogenics teams. - -The board approved a $200M capital investment program spanning 2025-2028. -Phase 1 ($52M) was fully deployed in 2025, primarily in dilution refrigerator -procurement and cleanroom expansion at the Zurich facility. - -## Research Labs - -### Lab A — Superconducting Qubits (Zurich) - -Lab A focuses on transmon qubit design and fabrication. The lab operates -two dilution refrigerators: FR-01 (purchased 2023, 20mK base temperature) -and FR-02 (commissioned Q3 2025, 15mK base temperature). FR-02 was the -single largest capital expenditure in 2025 at $8.7M. - -Current qubit specifications: -- Qubit count: 127 (FR-01: 64, FR-02: 63) -- Average T1 coherence time: 142 microseconds (up from 98μs in 2024) -- Average T2 coherence time: 89 microseconds -- Single-qubit gate fidelity: 99.92% -- Two-qubit gate fidelity: 99.67% -- Readout fidelity: 99.81% - -The 2025 coherence improvement was primarily driven by the transition from -aluminum to tantalum transmon junctions, which reduced two-level system (TLS) -defect density by 40%. - -### Lab B — Topological Qubits (Tokyo) - -Lab B pursues Majorana-based topological qubits using semiconductor-superconductor -nanowires. The team fabricated 12 nanowire devices during 2025, of which 3 -demonstrated measurable topological gap. This is a significant improvement -over 2024 when only 1 device out of 8 showed the gap. - -The topological gap measurement protocol requires the device temperature to -remain below 20mK throughout the 48-hour characterization cycle. Only FR-02 -in Zurich meets this requirement, so Lab B ships devices to Zurich for final -characterization — creating a logistical dependency between the two labs. - -Key metric: topological gap size averaged 0.35meV across successful devices, -compared to the theoretical target of 0.5meV. The gap-to-target ratio improved -from 48% in 2024 to 70% in 2025. - -### Lab C — Quantum Error Correction (Cambridge) - -Lab C develops surface code error correction protocols. In 2025, the team -achieved a critical milestone: below-threshold error correction on a 17-qubit -surface code patch, reducing logical error rate from 2.1×10⁻² to 3.4×10⁻³ -per correction cycle. - -The threshold simulations used Lab A's measured gate fidelities as input -parameters. The below-threshold result was only possible after Lab A's T1 -coherence improvement from 98μs to 142μs — the simulation models showed -that the 98μs regime was above the error correction threshold for the 17-qubit -code, making the Lab A / Lab C dependency critical. - -Lab C also developed a new decoder algorithm called "Cascade" that reduces -classical processing latency from 1.2μs to 0.4μs per syndrome extraction cycle. -This decoder runs on an FPGA co-processor board that was custom-designed by -Lab D. - -### Lab D — Control Systems (Boston) - -Lab D designs and manufactures the classical control electronics for all qubit -types. 
The flagship product is the QCS-4 control system, capable of driving -up to 256 qubit channels with 14-bit DAC resolution and sub-nanosecond timing -precision. - -In 2025, Lab D delivered 4 QCS-4 units to Lab A and 2 units to Lab B. -Lab C received a modified QCS-4 variant with the integrated FPGA decoder -co-processor. The FPGA decoder board is a custom design: Xilinx Ultrascale+ -XCU26 FPGA, 400k logic cells, running at 350MHz. Lab D is the sole source -for this board — there is no commercial equivalent. - -A notable incident occurred in August 2025 when a firmware bug in the QCS-4 -DAC calibration routine caused systematic phase errors in two-qubit gate -operations. The bug was traced to an integer overflow in the calibration LUT -when operating above 4.2 GHz. The issue affected Lab A's FR-01 for 11 days -before a patched firmware was deployed. During this period, Lab A's measured -two-qubit gate fidelity temporarily dropped to 97.31%. - -## Financial Summary - -| Category | 2024 | 2025 | Change | -|----------|------|------|--------| -| Revenue | $38.4M | $47.2M | +23% | -| R&D Expense | $31.6M | $38.9M | +23% | -| Capital Expenditure | $18.2M | $52.0M | +186% | -| Staff Count (Dec) | 264 | 312 | +18% | -| Patents Filed | 14 | 19 | +36% | - -Revenue breakdown by source: -- Government contracts: $19.8M (42%) -- Enterprise partnerships: $15.3M (32%) -- IP licensing: $8.6M (18%) -- Consulting services: $3.5M (8%) - -The $52M capital expenditure in 2025 included: -- FR-02 dilution refrigerator (Zurich): $8.7M -- Cleanroom expansion (Zurich): $14.2M -- Nanowire fabrication equipment (Tokyo): $6.1M -- FPGA development and QCS-4 production (Boston): $9.4M -- General infrastructure and IT: $13.6M - -## Outlook for 2026 - -Priority goals for 2026: -1. Scale to 256 superconducting qubits by Q3 (requires a third dilution - refrigerator, procurement estimated at $9-11M) -2. Achieve topological gap above 0.45meV (requires device process improvement) -3. Demonstrate below-threshold error correction on a 49-qubit surface code - (requires both 256-qubit hardware AND the Cascade decoder scaling to - larger code distances) -4. File 25+ patents -5. Grow revenue to $60M -"#; - -/// Challenge questions designed to test deep reasoning. -/// None of these can be answered by simple keyword search — -/// each requires connecting information from multiple sections. 
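To see why these questions require multi-hop reasoning rather than keyword lookup, take the fourth one as a worked example: Lab B's section states the achieved topological gap (0.35meV averaged across successful devices), while the 0.45meV goal appears only in the Outlook section — so an agent has to connect two distant sections and compute 0.45 − 0.35 = 0.10meV. No single passage contains the answer.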
-const CHALLENGE_QUESTIONS: &[&str] = &[ - // Requires: cross-reference Lab B's device characterization needs with - // Lab A's FR-02 specs, then connect to the CapEx table for FR-02 cost - "How much did the only refrigerator capable of characterizing Lab B's devices cost, and where is it located?", - // Requires: trace Lab C's below-threshold result → depends on Lab A's T1 - // improvement → depends on tantalum junction transition - "What specific materials change in another lab made Lab C's error correction milestone possible?", - // Requires: find the firmware bug in Lab D section, then look at the - // Lab A FR-01 qubit count, then compute the impact window - "How many qubits were affected by the firmware bug, and for how many days?", - // Requires: Lab B gap/target ratio (70%) × theoretical target (0.5meV) - // → actual gap = 0.35meV, compare with 2026 goal of 0.45meV - "What is the gap between Lab B's current topological gap achievement and the 2026 target, in meV?", - // Requires: trace the dependency chain: 256-qubit goal → need FR-03 → - // cost $9-11M → government contracts are largest revenue source at $19.8M - "If the 2026 qubit scaling goal requires a new refrigerator, can the largest revenue source category alone cover its estimated cost?", -]; - -#[tokio::main] -async fn main() -> vectorless::Result<()> { - tracing_subscriber::fmt() - .compact() - .with_env_filter( - tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")), - ) - .init(); - - println!("=== Single-Document Reasoning Challenge ===\n"); - - let api_key = std::env::var("LLM_API_KEY").unwrap_or_else(|_| "sk-...".to_string()); - let model = std::env::var("LLM_MODEL").unwrap_or_else(|_| "gpt-4o".to_string()); - let endpoint = - std::env::var("LLM_ENDPOINT").unwrap_or_else(|_| "https://api.openai.com/v1".to_string()); - - let engine = EngineBuilder::new() - .with_key(&api_key) - .with_model(&model) - .with_endpoint(&endpoint) - .build() - .await - .map_err(|e| vectorless::Error::Config(e.to_string()))?; - - // Ingest (skip if already indexed — we're testing reasoning, not indexing) - let doc_name = "qc_report_2025"; - let doc_id = { - let existing = engine.list_documents().await?; - if let Some(doc) = existing.iter().find(|d| d.name == doc_name) { - println!("Document already understood, reusing: {}\n", doc.doc_id); - doc.doc_id.clone() - } else { - println!("Understanding research report..."); - let doc = engine - .ingest(IngestInput::Text { - name: doc_name.to_string(), - content: REPORT.to_string(), - }) - .await?; - println!(" doc_id: {}", doc.doc_id); - println!(" summary: {}\n", doc.summary); - doc.doc_id - } - }; - - // Challenge questions - for (i, question) in CHALLENGE_QUESTIONS.iter().enumerate() { - println!("Q{}: {}", i + 1, question); - - match engine.ask(question, &[doc_id.clone()]).await { - Ok(answer) => { - if answer.content.is_empty() { - println!(" (no answer found)\n"); - } else { - for line in answer.content.lines().take(3) { - println!(" {}", line); - } - let remaining = answer.content.lines().count().saturating_sub(3); - if remaining > 0 { - println!(" ... 
({} more lines)", remaining); - } - println!( - " confidence: {:.2}, evidence: {}, trace_steps: {}\n", - answer.confidence, - answer.evidence.len(), - answer.trace.steps.len() - ); - } - } - Err(e) => { - println!(" error: {}\n", e); - } - } - } - - // Uncomment to forget the document after testing: - // engine.forget(&doc_id).await?; - // println!("Cleaned up."); - - Ok(()) -} diff --git a/vectorless-core/vectorless/src/agent/command.rs b/vectorless-core/vectorless/src/agent/command.rs deleted file mode 100644 index 5507a1d1..00000000 --- a/vectorless-core/vectorless/src/agent/command.rs +++ /dev/null @@ -1,629 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Command parsing for the agent navigation loop. -//! -//! LLM output is parsed into `Command` variants. The parser is intentionally -//! simple and forgiving — unknown input falls back to `Ls` so the agent can -//! re-observe its surroundings. - -use crate::document::{NavigationIndex, NodeId}; - -/// Parsed command from LLM output. -#[derive(Debug, Clone, PartialEq)] -pub enum Command { - /// List children of the current node. - Ls, - /// Navigate into a child node by name. - Cd { target: String }, - /// Navigate back to parent. - CdUp, - /// Read node content (collects as evidence). - Cat { target: String }, - /// Search for a keyword in the ReasoningIndex. - Find { keyword: String }, - /// Regex search across node content in the current subtree. - Grep { pattern: String }, - /// Preview first N lines of a node without collecting evidence. - Head { target: String, lines: usize }, - /// Search for nodes by title pattern in the tree. - FindTree { pattern: String }, - /// Show node content size (lines, chars). - Wc { target: String }, - /// Show current navigation path. - Pwd, - /// Evaluate evidence sufficiency. - Check, - /// End navigation. - Done, -} - -/// Strip surrounding quotes from a target string. -/// -/// Handles straight quotes (`"`, `'`) and Unicode smart quotes (U+201C/U+201D, U+2018/U+2019). -fn strip_quotes(s: &str) -> String { - let trimmed = s.trim(); - let chars: Vec = trimmed.chars().collect(); - if chars.len() < 2 { - return trimmed.to_string(); - } - let (first, last) = (chars[0], chars[chars.len() - 1]); - let matching = (first == '"' && last == '"') - || (first == '\'' && last == '\'') - || (first == '\u{201c}' && last == '\u{201d}') - || (first == '\u{2018}' && last == '\u{2019}'); - if matching { - trimmed[chars[0].len_utf8()..trimmed.len() - chars[chars.len() - 1].len_utf8()].to_string() - } else { - trimmed.to_string() - } -} - -/// Parse the first non-empty line of LLM output into a Command. -pub fn parse_command(llm_output: &str) -> Command { - let line = llm_output - .lines() - .find(|l| !l.trim().is_empty()) - .unwrap_or("") - .trim(); - - // Remove common wrapping (markdown code blocks, etc.) - let line = line.trim_start_matches('`').trim_end_matches('`').trim(); - - let parts: Vec<&str> = line.split_whitespace().collect(); - - match parts.as_slice() { - ["ls"] => Command::Ls, - ["cat"] => Command::Cat { - target: ".".to_string(), - }, - ["cd", ".."] => Command::CdUp, - ["cd", target] => Command::Cd { - target: strip_quotes(target), - }, - ["cd", _target, ..] => Command::Cd { - // Handle "cd some name" by joining remaining parts - target: strip_quotes(&parts[1..].join(" ")), - }, - ["cat", target] => Command::Cat { - target: strip_quotes(target), - }, - ["cat", _target, ..] 
-/// Parse the first non-empty line of LLM output into a Command.
-pub fn parse_command(llm_output: &str) -> Command {
-    let line = llm_output
-        .lines()
-        .find(|l| !l.trim().is_empty())
-        .unwrap_or("")
-        .trim();
-
-    // Remove common wrapping (markdown code blocks, etc.)
-    let line = line.trim_start_matches('`').trim_end_matches('`').trim();
-
-    let parts: Vec<&str> = line.split_whitespace().collect();
-
-    match parts.as_slice() {
-        ["ls"] => Command::Ls,
-        ["cat"] => Command::Cat {
-            target: ".".to_string(),
-        },
-        ["cd", ".."] => Command::CdUp,
-        ["cd", target] => Command::Cd {
-            target: strip_quotes(target),
-        },
-        ["cd", _target, ..] => Command::Cd {
-            // Handle "cd some name" by joining remaining parts
-            target: strip_quotes(&parts[1..].join(" ")),
-        },
-        ["cat", target] => Command::Cat {
-            target: strip_quotes(target),
-        },
-        ["cat", _target, ..] => Command::Cat {
-            target: strip_quotes(&parts[1..].join(" ")),
-        },
-        ["find", keyword] => Command::Find {
-            keyword: strip_quotes(keyword),
-        },
-        ["find", _keyword, ..] => Command::Find {
-            keyword: strip_quotes(&parts[1..].join(" ")),
-        },
-        ["grep", pattern] => Command::Grep {
-            pattern: strip_quotes(pattern),
-        },
-        ["grep", _pattern, ..] => Command::Grep {
-            pattern: strip_quotes(&parts[1..].join(" ")),
-        },
-        ["head", target] => Command::Head {
-            target: strip_quotes(target),
-            lines: 20, // default
-        },
-        ["head", "-n", n, target @ ..] => Command::Head {
-            target: strip_quotes(&target.join(" ")),
-            lines: n.parse().unwrap_or(20),
-        },
-        ["head", _target, ..] => Command::Head {
-            target: strip_quotes(&parts[1..].join(" ")),
-            lines: 20,
-        },
-        ["findtree", pattern] => Command::FindTree {
-            pattern: strip_quotes(pattern),
-        },
-        ["findtree", _pattern, ..] => Command::FindTree {
-            pattern: strip_quotes(&parts[1..].join(" ")),
-        },
-        ["wc", target] => Command::Wc {
-            target: strip_quotes(target),
-        },
-        ["wc", _target, ..] => Command::Wc {
-            target: strip_quotes(&parts[1..].join(" ")),
-        },
-        ["pwd"] => Command::Pwd,
-        ["check"] => Command::Check,
-        ["done"] => Command::Done,
-        _ => Command::Ls, // fallback: re-observe
-    }
-}
-
-/// Resolve a cd/cat target string to a NodeId using multi-level matching.
-///
-/// Matching priority:
-/// 1. Exact title match
-/// 2. Case-insensitive title match
-/// 3. Substring (contains) match
-/// 4. Numeric index match ("1" → first child, "2" → second, etc.)
-pub fn resolve_target(
-    target: &str,
-    nav_index: &NavigationIndex,
-    current_node: NodeId,
-) -> Option<NodeId> {
-    let target = strip_quotes(target);
-    let routes = nav_index.get_child_routes(current_node)?;
-
-    // 1. Exact match
-    if let Some(r) = routes.iter().find(|r| r.title == target) {
-        return Some(r.node_id);
-    }
-
-    // 2. Case-insensitive match
-    let target_lower = target.to_lowercase();
-    if let Some(r) = routes
-        .iter()
-        .find(|r| r.title.to_lowercase() == target_lower)
-    {
-        return Some(r.node_id);
-    }
-
-    // 3. Substring (contains) match
-    if let Some(r) = routes
-        .iter()
-        .find(|r| r.title.to_lowercase().contains(&target_lower))
-    {
-        return Some(r.node_id);
-    }
-
-    // 4. Numeric index match ("1" → first child)
-    if let Ok(idx) = target.parse::<usize>() {
-        if idx > 0 && idx <= routes.len() {
-            return Some(routes[idx - 1].node_id);
-        }
-    }
-
-    None
-}
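For orientation, here is a minimal sketch of how the two halves fit together — parse the raw LLM output, then resolve the parsed target against the navigation index. The setup is an assumption modeled on the unit tests further down, and the in-crate module path is assumed; this is a sketch, not part of the removed module:

```rust
// Sketch only — tree/route setup mirrors the unit tests below.
use crate::agent::command::{parse_command, resolve_target, Command};
use crate::document::{ChildRoute, DocumentTree, NavigationIndex};

fn resolve_llm_reply() {
    let mut tree = DocumentTree::new("Root", "");
    let root = tree.root();
    let intro = tree.add_child(root, "Introduction", "intro content");

    let mut nav = NavigationIndex::new();
    nav.add_child_routes(
        root,
        vec![ChildRoute {
            node_id: intro,
            title: "Introduction".to_string(),
            description: "Opening section".to_string(),
            leaf_count: 1,
        }],
    );

    // `cd 'introduction'` → quotes stripped, then case-insensitive match.
    if let Command::Cd { target } = parse_command("cd 'introduction'") {
        assert_eq!(resolve_target(&target, &nav, root), Some(intro));
    }
}
```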
-/// Resolve a cd/cat target with additional context from the tree node titles.
-///
-/// Matching priority:
-/// 1. Direct children via NavigationIndex (exact, case-insensitive, substring, numeric)
-/// 2. Direct children via TreeNode titles (case-insensitive contains)
-/// 3. Deep descendant search (BFS, up to depth 4) — enables `cd "Research Labs"` from
-///    root when "Research Labs" is a grandchild behind an intermediate wrapper node.
-pub fn resolve_target_extended(
-    target: &str,
-    nav_index: &NavigationIndex,
-    current_node: NodeId,
-    tree: &crate::document::DocumentTree,
-) -> Option<NodeId> {
-    let target = strip_quotes(target);
-    // Try the primary resolver first
-    if let Some(id) = resolve_target(&target, nav_index, current_node) {
-        return Some(id);
-    }
-
-    let target_lower = target.to_lowercase();
-
-    // Extended: check all direct children by their TreeNode titles
-    let children: Vec<NodeId> = tree.children_iter(current_node).collect();
-    for child_id in &children {
-        if let Some(node) = tree.get(*child_id) {
-            if node.title.to_lowercase().contains(&target_lower) {
-                return Some(*child_id);
-            }
-        }
-    }
-
-    // Deep search: BFS through descendants up to depth 4.
-    // Returns the shallowest match so `cd "Research Labs"` from root finds it
-    // at depth 1 even if another "Research Labs" exists deeper.
-    search_descendants(&target_lower, current_node, tree, 4)
-}
-
-/// BFS search through descendants, returning the shallowest matching NodeId.
-fn search_descendants(
-    target_lower: &str,
-    start: NodeId,
-    tree: &crate::document::DocumentTree,
-    max_depth: usize,
-) -> Option<NodeId> {
-    // FIFO queue so descendants are visited level by level (true BFS); a LIFO
-    // stack would explore depth-first and could return a deeper match.
-    let mut queue: std::collections::VecDeque<(NodeId, usize)> =
-        std::collections::VecDeque::from(vec![(start, 0)]);
-
-    while let Some((node_id, depth)) = queue.pop_front() {
-        if depth >= max_depth {
-            continue;
-        }
-        for child_id in tree.children_iter(node_id) {
-            if let Some(node) = tree.get(child_id) {
-                if node.title.to_lowercase().contains(target_lower) {
-                    return Some(child_id);
-                }
-            }
-            queue.push_back((child_id, depth + 1));
-        }
-    }
-
-    None
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_parse_ls() {
-        assert_eq!(parse_command("ls"), Command::Ls);
-        assert_eq!(parse_command("  ls  "), Command::Ls);
-    }
-
-    #[test]
-    fn test_parse_cd() {
-        assert_eq!(parse_command("cd .."), Command::CdUp);
-        assert_eq!(
-            parse_command("cd Getting Started"),
-            Command::Cd {
-                target: "Getting Started".to_string()
-            }
-        );
-        assert_eq!(
-            parse_command("cd some long name"),
-            Command::Cd {
-                target: "some long name".to_string()
-            }
-        );
-        // Quoted multi-word targets should have quotes stripped
-        assert_eq!(
-            parse_command("cd \"Vectorless Architecture Guide\""),
-            Command::Cd {
-                target: "Vectorless Architecture Guide".to_string()
-            }
-        );
-        assert_eq!(
-            parse_command("cd 'Vectorless Architecture Guide'"),
-            Command::Cd {
-                target: "Vectorless Architecture Guide".to_string()
-            }
-        );
-        // Smart quotes
-        assert_eq!(
-            parse_command("\u{201c}Vectorless Architecture Guide\u{201d}"),
-            Command::Ls // doesn't start with a command keyword
-        );
-    }
-
-    #[test]
-    fn test_strip_quotes_straight() {
-        assert_eq!(strip_quotes("\"hello\""), "hello");
-        assert_eq!(strip_quotes("'hello'"), "hello");
-        assert_eq!(strip_quotes("hello"), "hello");
-        assert_eq!(strip_quotes("\"only left"), "\"only left");
-    }
-
-    #[test]
-    fn test_strip_quotes_smart() {
-        assert_eq!(strip_quotes("\u{201c}hello\u{201d}"), "hello");
-        assert_eq!(strip_quotes("\u{2018}hello\u{2019}"), "hello");
-    }
-
-    #[test]
-    fn test_resolve_target_quoted() {
-        use crate::document::{ChildRoute, DocumentTree};
-
-        let mut tree = DocumentTree::new("Root", "");
-        let root = tree.root();
-        let c1 = tree.add_child(root, "Vectorless Architecture Guide", "content");
-
-        let mut nav_index = NavigationIndex::new();
-        nav_index.add_child_routes(
-            root,
-            vec![ChildRoute {
-                node_id: c1,
-                title: "Vectorless Architecture Guide".to_string(),
-                description: "Main guide".to_string(),
-                leaf_count: 5,
-            }],
-        );
-
-        // Quoted target should still resolve
-        assert_eq!(
resolve_target("\"Vectorless Architecture Guide\"", &nav_index, root), - Some(c1) - ); - assert_eq!( - resolve_target("'Vectorless Architecture Guide'", &nav_index, root), - Some(c1) - ); - } - - #[test] - fn test_parse_cat() { - assert_eq!( - parse_command("cat Installation"), - Command::Cat { - target: "Installation".to_string() - } - ); - assert_eq!( - parse_command("cat API Reference"), - Command::Cat { - target: "API Reference".to_string() - } - ); - } - - #[test] - fn test_parse_find() { - assert_eq!( - parse_command("find authentication"), - Command::Find { - keyword: "authentication".to_string() - } - ); - } - - #[test] - fn test_parse_misc() { - assert_eq!(parse_command("pwd"), Command::Pwd); - assert_eq!(parse_command("check"), Command::Check); - assert_eq!(parse_command("done"), Command::Done); - } - - #[test] - fn test_parse_fallback() { - assert_eq!(parse_command(""), Command::Ls); - assert_eq!(parse_command("unknown command"), Command::Ls); - assert_eq!(parse_command("blah blah"), Command::Ls); - } - - #[test] - fn test_parse_with_wrapping() { - assert_eq!(parse_command("`ls`"), Command::Ls); - assert_eq!(parse_command("```ls```"), Command::Ls); - } - - #[test] - fn test_parse_multiline() { - // Should parse the first non-empty line - assert_eq!(parse_command("\n\nls\n\n// listing children"), Command::Ls); - } - - #[test] - fn test_resolve_target_numeric() { - use crate::document::{ChildRoute, DocumentTree}; - - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - let c1 = tree.add_child(root, "Getting Started", "content"); - let c2 = tree.add_child(root, "API Reference", "content"); - - let mut nav_index = NavigationIndex::new(); - nav_index.add_child_routes( - root, - vec![ - ChildRoute { - node_id: c1, - title: "Getting Started".to_string(), - description: "Setup guide".to_string(), - leaf_count: 3, - }, - ChildRoute { - node_id: c2, - title: "API Reference".to_string(), - description: "API docs".to_string(), - leaf_count: 7, - }, - ], - ); - - assert_eq!(resolve_target("1", &nav_index, root), Some(c1)); - assert_eq!(resolve_target("2", &nav_index, root), Some(c2)); - assert_eq!(resolve_target("3", &nav_index, root), None); - } - - #[test] - fn test_resolve_target_exact() { - use crate::document::{ChildRoute, DocumentTree}; - - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - let c1 = tree.add_child(root, "Getting Started", "content"); - - let mut nav_index = NavigationIndex::new(); - nav_index.add_child_routes( - root, - vec![ChildRoute { - node_id: c1, - title: "Getting Started".to_string(), - description: "Setup".to_string(), - leaf_count: 3, - }], - ); - - assert_eq!( - resolve_target("Getting Started", &nav_index, root), - Some(c1) - ); - } - - #[test] - fn test_resolve_target_case_insensitive() { - use crate::document::{ChildRoute, DocumentTree}; - - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - let c1 = tree.add_child(root, "Getting Started", "content"); - - let mut nav_index = NavigationIndex::new(); - nav_index.add_child_routes( - root, - vec![ChildRoute { - node_id: c1, - title: "Getting Started".to_string(), - description: "Setup".to_string(), - leaf_count: 3, - }], - ); - - assert_eq!( - resolve_target("getting started", &nav_index, root), - Some(c1) - ); - assert_eq!( - resolve_target("GETTING STARTED", &nav_index, root), - Some(c1) - ); - } - - #[test] - fn test_resolve_target_contains() { - use crate::document::{ChildRoute, DocumentTree}; - - let mut tree = DocumentTree::new("Root", 
""); - let root = tree.root(); - let c1 = tree.add_child(root, "API Reference", "content"); - - let mut nav_index = NavigationIndex::new(); - nav_index.add_child_routes( - root, - vec![ChildRoute { - node_id: c1, - title: "API Reference".to_string(), - description: "API docs".to_string(), - leaf_count: 7, - }], - ); - - assert_eq!(resolve_target("api", &nav_index, root), Some(c1)); - assert_eq!(resolve_target("reference", &nav_index, root), Some(c1)); - } - - #[test] - fn test_resolve_target_no_routes() { - let nav_index = NavigationIndex::new(); - let tree = crate::document::DocumentTree::new("Root", ""); - assert!(resolve_target("anything", &nav_index, tree.root()).is_none()); - } - - #[test] - fn test_resolve_target_extended_deep_search() { - use crate::document::{ChildRoute, DocumentTree}; - - // root → "Wrapper" → "Research Labs" → "Lab B" - let mut tree = DocumentTree::new("Root", "root content"); - let root = tree.root(); - let wrapper = tree.add_child(root, "Quantum Computing Division", "wrapper"); - let labs = tree.add_child(wrapper, "Research Labs", "labs content"); - let lab_b = tree.add_child(labs, "Lab B", "lab b content"); - - let mut nav = NavigationIndex::new(); - nav.add_child_routes( - root, - vec![ChildRoute { - node_id: wrapper, - title: "Quantum Computing Division".to_string(), - description: "Division".to_string(), - leaf_count: 7, - }], - ); - nav.add_child_routes( - wrapper, - vec![ChildRoute { - node_id: labs, - title: "Research Labs".to_string(), - description: "Labs".to_string(), - leaf_count: 4, - }], - ); - nav.add_child_routes( - labs, - vec![ChildRoute { - node_id: lab_b, - title: "Lab B".to_string(), - description: "Topological".to_string(), - leaf_count: 1, - }], - ); - - // "Research Labs" is a grandchild of root — deep search should find it - assert_eq!( - resolve_target_extended("Research Labs", &nav, root, &tree), - Some(labs) - ); - - // "Lab B" is a great-grandchild — deep search should find it - assert_eq!( - resolve_target_extended("Lab B", &nav, root, &tree), - Some(lab_b) - ); - - // Direct children should still work via primary resolver - assert_eq!( - resolve_target_extended("Quantum Computing Division", &nav, root, &tree), - Some(wrapper) - ); - } - - #[test] - fn test_parse_grep() { - assert_eq!( - parse_command("grep EBITDA"), - Command::Grep { - pattern: "EBITDA".to_string() - } - ); - assert_eq!( - parse_command("grep revenue.*2024"), - Command::Grep { - pattern: "revenue.*2024".to_string() - } - ); - } - - #[test] - fn test_parse_head() { - assert_eq!( - parse_command("head Installation"), - Command::Head { - target: "Installation".to_string(), - lines: 20 - } - ); - assert_eq!( - parse_command("head -n 5 API Reference"), - Command::Head { - target: "API Reference".to_string(), - lines: 5 - } - ); - } - - #[test] - fn test_parse_findtree() { - assert_eq!( - parse_command("findtree revenue"), - Command::FindTree { - pattern: "revenue".to_string() - } - ); - assert_eq!( - parse_command("findtree API Reference"), - Command::FindTree { - pattern: "API Reference".to_string() - } - ); - } - - #[test] - fn test_parse_wc() { - assert_eq!( - parse_command("wc Installation"), - Command::Wc { - target: "Installation".to_string() - } - ); - assert_eq!( - parse_command("wc API Reference"), - Command::Wc { - target: "API Reference".to_string() - } - ); - } -} diff --git a/vectorless-core/vectorless/src/agent/config.rs b/vectorless-core/vectorless/src/agent/config.rs deleted file mode 100644 index 54b6897e..00000000 --- 
a/vectorless-core/vectorless/src/agent/config.rs
+++ /dev/null
@@ -1,248 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Configuration and output types for the retrieval agent.
-
-use serde::{Deserialize, Serialize};
-
-// ---------------------------------------------------------------------------
-// Worker configuration
-// ---------------------------------------------------------------------------
-
-/// Worker configuration — navigation budget settings.
-#[derive(Debug, Clone)]
-pub struct WorkerConfig {
-    /// Maximum navigation rounds per Worker loop (ls/cd/cat/grep/head/find etc.).
-    /// `check` does NOT count against this budget.
-    pub max_rounds: u32,
-    /// Hard cap on total LLM calls per Worker (planning + nav + check).
-    /// Prevents runaway costs regardless of max_rounds. 0 = no limit.
-    pub max_llm_calls: u32,
-}
-
-impl Default for WorkerConfig {
-    fn default() -> Self {
-        Self {
-            max_rounds: 100,
-            max_llm_calls: 200,
-        }
-    }
-}
-
-impl WorkerConfig {
-    pub fn new() -> Self {
-        Self::default()
-    }
-}
-
-// ---------------------------------------------------------------------------
-// Answer pipeline configuration
-// ---------------------------------------------------------------------------
-
-/// Answer pipeline configuration — synthesis settings.
-#[derive(Debug, Clone)]
-pub struct AnswerConfig {
-    /// Maximum number of evidence items to feed into synthesis.
-    pub evidence_cap: usize,
-}
-
-impl Default for AnswerConfig {
-    fn default() -> Self {
-        Self { evidence_cap: 20 }
-    }
-}
-
-// ---------------------------------------------------------------------------
-// Aggregated agent configuration
-// ---------------------------------------------------------------------------
-
-/// Aggregated configuration for the entire retrieval agent system.
-#[derive(Debug, Clone, Default)]
-pub struct AgentConfig {
-    pub worker: WorkerConfig,
-    pub answer: AnswerConfig,
-}
-
-impl AgentConfig {
-    pub fn new() -> Self {
-        Self::default()
-    }
-}
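The defaults above are generous (100 rounds and 200 LLM calls per Worker). A caller wanting tighter cost control would override them wholesale — a minimal sketch with illustrative values and an assumed in-crate module path:

```rust
// Hedged sketch: tighten the navigation budget for cheaper models.
// Field names come from the structs above; the values are illustrative.
use crate::agent::config::{AgentConfig, AnswerConfig, WorkerConfig};

fn budget_config() -> AgentConfig {
    AgentConfig {
        worker: WorkerConfig {
            max_rounds: 24,    // fewer ls/cd/cat rounds per document
            max_llm_calls: 60, // hard cost cap (planning + nav + check)
        },
        answer: AnswerConfig { evidence_cap: 8 }, // smaller synthesis context
    }
}
```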
-// ---------------------------------------------------------------------------
-// Output types
-// ---------------------------------------------------------------------------
-
-/// Agent output — the final result of a retrieval operation.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Output {
-    /// Final synthesized answer.
-    pub answer: String,
-    /// Collected evidence from navigation.
-    pub evidence: Vec<Evidence>,
-    /// Agent execution metrics.
-    pub metrics: Metrics,
-    /// Confidence score (0.0–1.0) — derived from LLM evaluate() result.
-    pub confidence: f32,
-    /// Reasoning trace steps collected during agent navigation.
-    pub trace_steps: Vec<TraceStep>,
-}
-
-impl Output {
-    /// Create an empty output (no evidence found).
-    pub fn empty() -> Self {
-        Self {
-            answer: String::new(),
-            evidence: Vec::new(),
-            metrics: Metrics::default(),
-            confidence: 0.0,
-            trace_steps: Vec::new(),
-        }
-    }
-}
-
-/// A single piece of evidence collected during navigation.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Evidence {
-    /// Navigation path where this evidence was found (e.g., "Root/API Reference/Auth").
-    pub source_path: String,
-    /// Title of the node.
-    pub node_title: String,
-    /// Content of the node.
-    pub content: String,
-    /// Source document name (set by Orchestrator in multi-doc scenarios).
-    pub doc_name: Option<String>,
-}
-
-/// Agent execution metrics.
-#[derive(Debug, Clone, Default, Serialize, Deserialize)]
-pub struct Metrics {
-    pub rounds_used: u32,
-    pub llm_calls: u32,
-    pub nodes_visited: usize,
-    pub budget_exhausted: bool,
-    pub plan_generated: bool,
-    pub check_count: u32,
-    pub evidence_chars: usize,
-}
-
-/// Step result from the navigation loop.
-#[derive(Debug, Clone, PartialEq)]
-pub enum Step {
-    /// Continue to next round with the given feedback.
-    Continue,
-    /// Navigation is done, proceed to synthesis.
-    Done,
-    /// Forced done due to budget exhaustion or error.
-    ForceDone(String),
-}
-
-// ---------------------------------------------------------------------------
-// Worker output (evidence only, no answer)
-// ---------------------------------------------------------------------------
-
-/// Output from a single Worker — pure evidence, no answer synthesis.
-/// Rerank handles all answer generation.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct WorkerOutput {
-    /// Collected evidence from document navigation.
-    pub evidence: Vec<Evidence>,
-    /// Worker execution metrics.
-    pub metrics: WorkerMetrics,
-    /// Document name this Worker was assigned to.
-    pub doc_name: String,
-    /// Reasoning trace steps from this Worker.
-    pub trace_steps: Vec<TraceStep>,
-}
-
-/// Metrics specific to a single Worker's execution.
-#[derive(Debug, Clone, Default, Serialize, Deserialize)]
-pub struct WorkerMetrics {
-    /// Number of navigation rounds used.
-    pub rounds_used: u32,
-    /// Number of LLM calls made.
-    pub llm_calls: u32,
-    /// Number of distinct nodes visited.
-    pub nodes_visited: usize,
-    /// Whether the LLM call budget was exhausted.
-    pub budget_exhausted: bool,
-    /// Whether a navigation plan was generated.
-    pub plan_generated: bool,
-    /// Number of times `check` was called.
-    pub check_count: u32,
-    /// Total characters of collected evidence.
-    pub evidence_chars: usize,
-}
-
-impl From<WorkerOutput> for Output {
-    fn from(wo: WorkerOutput) -> Self {
-        Output {
-            answer: String::new(),
-            evidence: wo.evidence,
-            metrics: Metrics {
-                rounds_used: wo.metrics.rounds_used,
-                llm_calls: wo.metrics.llm_calls,
-                nodes_visited: wo.metrics.nodes_visited,
-                budget_exhausted: wo.metrics.budget_exhausted,
-                plan_generated: wo.metrics.plan_generated,
-                check_count: wo.metrics.check_count,
-                evidence_chars: wo.metrics.evidence_chars,
-            },
-            confidence: 0.0,
-            trace_steps: wo.trace_steps,
-        }
-    }
-}
-
-// ---------------------------------------------------------------------------
-// Scope types
-// ---------------------------------------------------------------------------
-
-/// Scope context — determines which path the dispatcher takes.
-///
-/// Both variants go through the Orchestrator. The difference is:
-/// - `Specified`: user chose specific documents → skip Orchestrator analysis phase
-/// - `Workspace`: user didn't specify → Orchestrator analyzes DocCards to select docs
-pub enum Scope<'a> {
-    /// User specified one or more documents (by doc_id).
-    /// Orchestrator skips analysis, spawns Workers directly.
-    Specified(Vec<DocContext<'a>>),
-    /// Workspace scope — user didn't specify documents.
-    /// Orchestrator analyzes DocCards and selects relevant ones.
-    Workspace(WorkspaceContext<'a>),
-}
-
-/// Read-only access to a single document's compile artifacts.
-pub struct DocContext<'a> {
-    /// Document content tree.
-    pub tree: &'a crate::document::DocumentTree,
-    /// Navigation index (includes DocCard).
-    pub nav_index: &'a crate::document::NavigationIndex,
-    /// Reasoning index (keyword/topic lookup).
-    pub reasoning_index: &'a crate::document::ReasoningIndex,
-    /// Document name (for evidence source attribution).
-    pub doc_name: &'a str,
-}
-
-/// Read-only access to multiple documents' compile artifacts.
-pub struct WorkspaceContext<'a> {
-    /// All available documents.
-    pub docs: Vec<DocContext<'a>>,
-}
-
-impl<'a> WorkspaceContext<'a> {
-    /// Create a workspace from a list of DocContexts.
-    pub fn new(docs: Vec<DocContext<'a>>) -> Self {
-        Self { docs }
-    }
-
-    /// Number of documents in the workspace.
-    pub fn doc_count(&self) -> usize {
-        self.docs.len()
-    }
-
-    /// Whether the workspace has only one document.
-    pub fn is_single(&self) -> bool {
-        self.docs.len() == 1
-    }
-}
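How these scope types were meant to be wired up, as a hedged sketch — the artifact references and document name are placeholders, and only the field names come from the definitions above:

```rust
// Hedged sketch (same-module context assumed): building a dispatch scope
// from one document's compiled artifacts.
fn scope_for<'a>(
    tree: &'a crate::document::DocumentTree,
    nav_index: &'a crate::document::NavigationIndex,
    reasoning_index: &'a crate::document::ReasoningIndex,
) -> Scope<'a> {
    let doc = DocContext {
        tree,
        nav_index,
        reasoning_index,
        doc_name: "report.md", // placeholder name
    };
    // Explicitly chosen document → Orchestrator skips its analysis phase.
    Scope::Specified(vec![doc])
}
```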
diff --git a/vectorless-core/vectorless/src/agent/context.rs b/vectorless-core/vectorless/src/agent/context.rs
deleted file mode 100644
index c4e542bf..00000000
--- a/vectorless-core/vectorless/src/agent/context.rs
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Read-only data access wrappers over compile artifacts.
-//!
-//! These types provide the agent with structured access to the document's
-//! navigation index, content tree, and reasoning index — all read-only.
-
-use crate::document::{ChildRoute, NodeId, TopicEntry};
-
-// Re-export from config for convenience
-pub use super::config::{DocContext, WorkspaceContext};
-
-/// A single hit from a keyword search.
-#[derive(Debug, Clone)]
-pub struct FindHit {
-    /// The matched keyword.
-    pub keyword: String,
-    /// Topic entries matching the keyword.
-    pub entries: Vec<TopicEntry>,
-}
-
-impl<'a> DocContext<'a> {
-    /// List child routes for a given node.
-    pub fn ls(&self, node: NodeId) -> Option<&[ChildRoute]> {
-        self.nav_index.get_child_routes(node)
-    }
-
-    /// Read the full content of a node.
-    pub fn cat(&self, node: NodeId) -> Option<&str> {
-        self.tree.get(node).map(|n| n.content.as_str())
-    }
-
-    /// Get the title of a node.
-    pub fn node_title(&self, node: NodeId) -> Option<&str> {
-        self.tree.get(node).map(|n| n.title.as_str())
-    }
-
-    /// Search for a keyword in the reasoning index.
-    pub fn find(&self, keyword: &str) -> Option<FindHit> {
-        self.reasoning_index
-            .topic_entries(keyword)
-            .map(|entries| FindHit {
-                keyword: keyword.to_string(),
-                entries: entries.to_vec(),
-            })
-    }
-
-    /// Search for multiple keywords, collecting all hits.
-    pub fn find_all(&self, keywords: &[String]) -> Vec<FindHit> {
-        keywords.iter().filter_map(|kw| self.find(kw)).collect()
-    }
-
-    /// Get the root node ID.
-    pub fn root(&self) -> NodeId {
-        self.tree.root()
-    }
-
-    /// Get the document's DocCard, if available.
-    pub fn doc_card(&self) -> Option<&crate::document::DocCard> {
-        self.nav_index.doc_card()
-    }
-
-    /// Get the navigation entry for a node (overview, hints, tags).
-    pub fn nav_entry(&self, node: NodeId) -> Option<&crate::document::NavEntry> {
-        self.nav_index.get_entry(node)
-    }
-
-    /// Get the summary shortcut (pre-computed overview), if available.
-    pub fn summary_shortcut(&self) -> Option<&crate::document::SummaryShortcut> {
-        self.reasoning_index.summary_shortcut()
-    }
-
-    /// Find a top-level section by its title, returning its NodeId.
-    pub fn find_section(&self, title: &str) -> Option<NodeId> {
-        self.reasoning_index.find_section(title)
-    }
-
-    /// Get the parent of a node (by searching the tree).
-    pub fn parent(&self, node: NodeId) -> Option<NodeId> {
-        self.tree.parent(node)
-    }
-}
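A small sketch of one manual "navigation round" built from these helpers — roughly what the Worker's ls → cd → cat loop does. The function name is hypothetical and a same-module context is assumed:

```rust
// Hedged sketch: read the first child of the document root.
fn first_child_content(doc: &DocContext<'_>) -> Option<String> {
    let root = doc.root();
    let routes = doc.ls(root)?; // `ls`: list child routes of the root
    let first = routes.first()?; // pick the first route
    doc.cat(first.node_id).map(str::to_string) // `cat`: read its content
}
```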
-
-impl<'a> WorkspaceContext<'a> {
-    /// Search for a keyword across all documents.
-    pub fn find_cross(&self, keyword: &str) -> Vec<(usize, FindHit)> {
-        self.docs
-            .iter()
-            .enumerate()
-            .filter_map(|(idx, doc)| doc.find(keyword).map(|hit| (idx, hit)))
-            .collect()
-    }
-
-    /// Search multiple keywords across all documents.
-    pub fn find_cross_all(&self, keywords: &[String]) -> Vec<(usize, Vec<FindHit>)> {
-        let mut results: Vec<(usize, Vec<FindHit>)> = Vec::new();
-        for (idx, doc) in self.docs.iter().enumerate() {
-            let hits = doc.find_all(keywords);
-            if !hits.is_empty() {
-                results.push((idx, hits));
-            }
-        }
-        results
-    }
-
-    /// Get all DocCards for documents that have them.
-    pub fn doc_cards(&self) -> Vec<(usize, &crate::document::DocCard)> {
-        self.docs
-            .iter()
-            .enumerate()
-            .filter_map(|(idx, doc)| doc.doc_card().map(|card| (idx, card)))
-            .collect()
-    }
-
-    /// Get a specific document context by index.
-    pub fn doc(&self, idx: usize) -> Option<&DocContext<'a>> {
-        self.docs.get(idx)
-    }
-}
diff --git a/vectorless-core/vectorless/src/agent/events.rs b/vectorless-core/vectorless/src/agent/events.rs
deleted file mode 100644
index e4575c93..00000000
--- a/vectorless-core/vectorless/src/agent/events.rs
+++ /dev/null
@@ -1,537 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Agent events — rich, structured visibility into the entire retrieval pipeline.
-//!
-//! Events are organized by pipeline stage:
-//! 1. **Query Understanding** — intent analysis, keyword extraction
-//! 2. **Orchestrator** — document selection, dispatch, evaluation, replan
-//! 3. **Worker** — navigation, evidence collection, budget management
-//! 4. **Answer** — synthesis and fusion
-//!
-//! The stream terminates with `Completed` or `Error`.
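On the consuming side, a client would drain the receiver until one of the terminal events arrives. A hedged sketch using the channel helpers and event variants defined just below (in-crate, same-module context assumed; the emitted event is a stand-in for real pipeline activity):

```rust
// Hedged sketch: attach a logger to the agent event stream.
async fn log_agent_events() {
    let (tx, mut rx) = channel(DEFAULT_AGENT_EVENT_BOUND);
    let emitter = EventEmitter::new(tx);
    // In real use, `emitter` would be handed to the orchestrator/workers.
    emitter.emit_completed(0, 0, 0); // stand-in for pipeline activity
    drop(emitter); // close the channel so the loop below terminates

    while let Some(event) = rx.recv().await {
        match event {
            AgentEvent::EvidenceCollected { node_title, .. } => {
                tracing::info!("evidence collected: {node_title}");
            }
            AgentEvent::Completed { .. } | AgentEvent::Error { .. } => break,
            other => tracing::debug!(?other, "agent event"),
        }
    }
}
```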
-
-use serde::Serialize;
-
-/// An event emitted during agent-based retrieval.
-///
-/// Each variant carries the data a client needs to understand what happened,
-/// not just that something happened. All events are `Clone + Serialize` so
-/// they can be broadcast or persisted.
-#[derive(Debug, Clone, Serialize)]
-pub enum AgentEvent {
-    // ── Query Understanding ──────────────────────────────────────────
-    /// Query understanding started.
-    QueryUnderstandingStarted { query: String },
-
-    /// Query understanding completed (intent, keywords, strategy decided).
-    QueryUnderstandingCompleted {
-        query: String,
-        intent: String,
-        keywords: Vec<String>,
-        strategy_hint: String,
-        complexity: String,
-    },
-
-    // ── Orchestrator ─────────────────────────────────────────────────
-    /// Orchestrator started.
-    OrchestratorStarted {
-        query: String,
-        doc_count: usize,
-        skip_analysis: bool,
-    },
-
-    /// Orchestrator is analyzing documents to select which to dispatch.
-    OrchestratorAnalyzing {
-        doc_count: usize,
-        keywords: Vec<String>,
-    },
-
-    /// A Worker was dispatched to a document.
-    WorkerDispatched {
-        doc_idx: usize,
-        doc_name: String,
-        task: String,
-        focus_keywords: Vec<String>,
-    },
-
-    /// A Worker finished its task.
-    WorkerCompleted {
-        doc_idx: usize,
-        doc_name: String,
-        evidence_count: usize,
-        rounds_used: u32,
-        llm_calls: u32,
-        success: bool,
-    },
-
-    /// Cross-doc sufficiency evaluation result.
-    OrchestratorEvaluated {
-        sufficient: bool,
-        evidence_count: usize,
-        missing_info: Option<String>,
-    },
-
-    /// Orchestrator is replanning after insufficient evidence.
-    OrchestratorReplanning {
-        reason: String,
-        evidence_count: usize,
-    },
-
-    /// Orchestrator completed.
-    OrchestratorCompleted {
-        evidence_count: usize,
-        total_llm_calls: u32,
-        dispatch_rounds: u32,
-    },
-
-    // ── Worker (per-document navigation) ─────────────────────────────
-    /// Worker started on a document.
-    WorkerStarted {
-        doc_name: String,
-        task: Option<String>,
-        max_rounds: u32,
-    },
-
-    /// Worker generated a navigation plan.
-    WorkerPlanGenerated { doc_name: String, plan_len: usize },
-
-    /// A navigation round completed.
-    WorkerRound {
-        doc_name: String,
-        round: u32,
-        command: String,
-        success: bool,
-        elapsed_ms: u64,
-    },
-
-    /// Evidence was collected from a node.
-    EvidenceCollected {
-        doc_name: String,
-        node_title: String,
-        source_path: String,
-        content_len: usize,
-        total_evidence: usize,
-    },
-
-    /// Worker sufficiency check result.
-    WorkerSufficiencyCheck {
-        doc_name: String,
-        sufficient: bool,
-        evidence_count: usize,
-        missing_info: Option<String>,
-    },
-
-    /// Worker re-planned after insufficient check.
-    WorkerReplan {
-        doc_name: String,
-        missing_info: String,
-        plan_len: usize,
-    },
-
-    /// Worker budget warning (stuck or half-budget).
-    WorkerBudgetWarning {
-        doc_name: String,
-        warning_type: String,
-        round: u32,
-    },
-
-    /// Worker completed.
-    WorkerDone {
-        doc_name: String,
-        evidence_count: usize,
-        rounds_used: u32,
-        llm_calls: u32,
-        budget_exhausted: bool,
-        plan_generated: bool,
-    },
-
-    // ── Answer Pipeline ──────────────────────────────────────────────
-    /// Answer synthesis started.
-    AnswerStarted {
-        evidence_count: usize,
-        multi_doc: bool,
-    },
-
-    /// Answer synthesis completed.
-    AnswerCompleted {
-        answer_len: usize,
-        confidence: String,
-    },
-
-    // ── Terminal ─────────────────────────────────────────────────────
-    /// Entire retrieval pipeline completed.
-    Completed {
-        evidence_count: usize,
-        llm_calls: u32,
-        answer_len: usize,
-    },
-
-    /// An error occurred.
-    Error { stage: String, message: String },
-}
-
-// ---------------------------------------------------------------------------
-// Channel + EventEmitter
-// ---------------------------------------------------------------------------
-
-/// Sender for agent events.
-pub(crate) type AgentEventSender = tokio::sync::mpsc::Sender<AgentEvent>;
-
-/// Receiver for agent events.
-pub type AgentEventReceiver = tokio::sync::mpsc::Receiver<AgentEvent>;
-
-/// Create a bounded channel for agent events.
-pub(crate) fn channel(bound: usize) -> (AgentEventSender, AgentEventReceiver) {
-    tokio::sync::mpsc::channel(bound)
-}
-
-/// Default channel bound for agent events.
-pub const DEFAULT_AGENT_EVENT_BOUND: usize = 256;
-
-/// A handle for emitting agent events.
-///
-/// Wraps an `mpsc::Sender` and silently drops events if the channel is full
-/// or the receiver is closed (no panic on send failure). Cheaply clonable.
-#[derive(Clone)]
-pub struct EventEmitter {
-    tx: Option<AgentEventSender>,
-}
-
-impl EventEmitter {
-    /// Create a new emitter with the given sender.
-    pub fn new(tx: AgentEventSender) -> Self {
-        Self { tx: Some(tx) }
-    }
-
-    /// Create a noop emitter that discards all events.
-    pub fn noop() -> Self {
-        Self { tx: None }
-    }
-
-    /// Emit an event. Silently dropped if the channel is full or the receiver is closed.
- pub fn emit(&self, event: AgentEvent) { - if let Some(ref tx) = self.tx { - let _ = tx.try_send(event); - } - } - - // ── Query Understanding ── - - pub fn emit_query_understanding_started(&self, query: &str) { - self.emit(AgentEvent::QueryUnderstandingStarted { - query: query.to_string(), - }); - } - - pub fn emit_query_understanding_completed( - &self, - query: &str, - intent: &str, - keywords: &[String], - strategy_hint: &str, - complexity: &str, - ) { - self.emit(AgentEvent::QueryUnderstandingCompleted { - query: query.to_string(), - intent: intent.to_string(), - keywords: keywords.to_vec(), - strategy_hint: strategy_hint.to_string(), - complexity: complexity.to_string(), - }); - } - - // ── Orchestrator ── - - pub fn emit_orchestrator_started(&self, query: &str, doc_count: usize, skip_analysis: bool) { - self.emit(AgentEvent::OrchestratorStarted { - query: query.to_string(), - doc_count, - skip_analysis, - }); - } - - pub fn emit_orchestrator_analyzing(&self, doc_count: usize, keywords: &[String]) { - self.emit(AgentEvent::OrchestratorAnalyzing { - doc_count, - keywords: keywords.to_vec(), - }); - } - - pub fn emit_worker_dispatched( - &self, - doc_idx: usize, - doc_name: &str, - task: &str, - focus_keywords: &[String], - ) { - self.emit(AgentEvent::WorkerDispatched { - doc_idx, - doc_name: doc_name.to_string(), - task: task.to_string(), - focus_keywords: focus_keywords.to_vec(), - }); - } - - pub fn emit_worker_completed( - &self, - doc_idx: usize, - doc_name: &str, - evidence_count: usize, - rounds_used: u32, - llm_calls: u32, - success: bool, - ) { - self.emit(AgentEvent::WorkerCompleted { - doc_idx, - doc_name: doc_name.to_string(), - evidence_count, - rounds_used, - llm_calls, - success, - }); - } - - pub fn emit_orchestrator_evaluated( - &self, - sufficient: bool, - evidence_count: usize, - missing_info: Option<&str>, - ) { - self.emit(AgentEvent::OrchestratorEvaluated { - sufficient, - evidence_count, - missing_info: missing_info.map(|s| s.to_string()), - }); - } - - pub fn emit_orchestrator_replanning(&self, reason: &str, evidence_count: usize) { - self.emit(AgentEvent::OrchestratorReplanning { - reason: reason.to_string(), - evidence_count, - }); - } - - pub fn emit_orchestrator_completed( - &self, - evidence_count: usize, - total_llm_calls: u32, - dispatch_rounds: u32, - ) { - self.emit(AgentEvent::OrchestratorCompleted { - evidence_count, - total_llm_calls, - dispatch_rounds, - }); - } - - // ── Worker ── - - pub fn emit_worker_started(&self, doc_name: &str, task: Option<&str>, max_rounds: u32) { - self.emit(AgentEvent::WorkerStarted { - doc_name: doc_name.to_string(), - task: task.map(|s| s.to_string()), - max_rounds, - }); - } - - pub fn emit_worker_plan_generated(&self, doc_name: &str, plan_len: usize) { - self.emit(AgentEvent::WorkerPlanGenerated { - doc_name: doc_name.to_string(), - plan_len, - }); - } - - pub fn emit_worker_round( - &self, - doc_name: &str, - round: u32, - command: &str, - success: bool, - elapsed_ms: u64, - ) { - self.emit(AgentEvent::WorkerRound { - doc_name: doc_name.to_string(), - round, - command: command.to_string(), - success, - elapsed_ms, - }); - } - - pub fn emit_evidence( - &self, - doc_name: &str, - node_title: &str, - source_path: &str, - content_len: usize, - total: usize, - ) { - self.emit(AgentEvent::EvidenceCollected { - doc_name: doc_name.to_string(), - node_title: node_title.to_string(), - source_path: source_path.to_string(), - content_len, - total_evidence: total, - }); - } - - pub fn emit_worker_sufficiency_check( - &self, - 
doc_name: &str,
-        sufficient: bool,
-        evidence_count: usize,
-        missing_info: Option<&str>,
-    ) {
-        self.emit(AgentEvent::WorkerSufficiencyCheck {
-            doc_name: doc_name.to_string(),
-            sufficient,
-            evidence_count,
-            missing_info: missing_info.map(|s| s.to_string()),
-        });
-    }
-
-    pub fn emit_worker_replan(&self, doc_name: &str, missing_info: &str, plan_len: usize) {
-        self.emit(AgentEvent::WorkerReplan {
-            doc_name: doc_name.to_string(),
-            missing_info: missing_info.to_string(),
-            plan_len,
-        });
-    }
-
-    pub fn emit_worker_budget_warning(&self, doc_name: &str, warning_type: &str, round: u32) {
-        self.emit(AgentEvent::WorkerBudgetWarning {
-            doc_name: doc_name.to_string(),
-            warning_type: warning_type.to_string(),
-            round,
-        });
-    }
-
-    pub fn emit_worker_done(
-        &self,
-        doc_name: &str,
-        evidence_count: usize,
-        rounds_used: u32,
-        llm_calls: u32,
-        budget_exhausted: bool,
-        plan_generated: bool,
-    ) {
-        self.emit(AgentEvent::WorkerDone {
-            doc_name: doc_name.to_string(),
-            evidence_count,
-            rounds_used,
-            llm_calls,
-            budget_exhausted,
-            plan_generated,
-        });
-    }
-
-    // ── Answer ──
-
-    pub fn emit_answer_started(&self, evidence_count: usize, multi_doc: bool) {
-        self.emit(AgentEvent::AnswerStarted {
-            evidence_count,
-            multi_doc,
-        });
-    }
-
-    pub fn emit_answer_completed(&self, answer_len: usize, confidence: &str) {
-        self.emit(AgentEvent::AnswerCompleted {
-            answer_len,
-            confidence: confidence.to_string(),
-        });
-    }
-
-    // ── Terminal ──
-
-    pub fn emit_completed(&self, evidence_count: usize, llm_calls: u32, answer_len: usize) {
-        self.emit(AgentEvent::Completed {
-            evidence_count,
-            llm_calls,
-            answer_len,
-        });
-    }
-
-    pub fn emit_error(&self, stage: &str, message: &str) {
-        self.emit(AgentEvent::Error {
-            stage: stage.to_string(),
-            message: message.to_string(),
-        });
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_noop_emitter() {
-        let emitter = EventEmitter::noop();
-        emitter.emit_orchestrator_started("test", 1, false);
-        emitter.emit_worker_started("doc.md", None, 8);
-        emitter.emit_worker_round("doc.md", 1, "ls", true, 50);
-        emitter.emit_worker_done("doc.md", 0, 1, 1, false, false);
-        emitter.emit_completed(0, 1, 0);
-        // No panic — events silently dropped
-    }
-
-    #[test]
-    fn test_event_roundtrip() {
-        let (tx, mut rx) = channel(DEFAULT_AGENT_EVENT_BOUND);
-        let emitter = EventEmitter::new(tx);
-
-        emitter.emit_orchestrator_started("what is X?", 1, true);
-        emitter.emit_worker_started("doc.md", None, 8);
-        emitter.emit_evidence("doc.md", "Intro", "root/Intro", 100, 1);
-        emitter.emit_worker_sufficiency_check("doc.md", true, 1, None);
-        emitter.emit_worker_done("doc.md", 1, 3, 5, false, true);
-        emitter.emit_completed(1, 6, 42);
-
-        let events: Vec<AgentEvent> = (0..6).map(|_| rx.blocking_recv().unwrap()).collect();
-
-        assert!(
-            matches!(&events[0], AgentEvent::OrchestratorStarted { query, .. } if query == "what is X?")
-        );
-        assert!(
-            matches!(&events[1], AgentEvent::WorkerStarted { doc_name, .. } if doc_name == "doc.md")
-        );
-        assert!(
-            matches!(&events[2], AgentEvent::EvidenceCollected { node_title, .. } if node_title == "Intro")
-        );
-        assert!(matches!(
-            &events[3],
-            AgentEvent::WorkerSufficiencyCheck {
-                sufficient: true,
-                ..
-            }
-        ));
-        assert!(matches!(
-            &events[4],
-            AgentEvent::WorkerDone {
-                evidence_count: 1,
-                plan_generated: true,
-                ..
-            }
-        ));
-        assert!(matches!(
-            &events[5],
-            AgentEvent::Completed {
-                evidence_count: 1,
-                answer_len: 42,
-                ..
-            }
-        ));
-    }
-
-    #[test]
-    fn test_serialization() {
-        let event = AgentEvent::OrchestratorStarted {
-            query: "test".to_string(),
-            doc_count: 3,
-            skip_analysis: false,
-        };
-        let json = serde_json::to_string(&event).unwrap();
-        assert!(json.contains("OrchestratorStarted"));
-        assert!(json.contains("test"));
-    }
-}
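For orientation, a consumer of this events API looked roughly like the sketch below. It is a minimal illustration, not part of the removed file: `channel` is crate-private, so this assumes crate-internal use, and the surrounding tokio runtime setup is taken for granted.

async fn watch_events() {
    let (tx, mut rx) = channel(DEFAULT_AGENT_EVENT_BOUND);
    let emitter = EventEmitter::new(tx);

    // The retrieval pipeline would hold `emitter` and call the emit_* helpers.
    emitter.emit_orchestrator_started("what is X?", 2, false);
    emitter.emit_completed(1, 6, 42);
    drop(emitter); // dropping the last sender terminates the stream

    while let Some(event) = rx.recv().await {
        match event {
            AgentEvent::WorkerDispatched { doc_name, task, .. } => {
                println!("dispatched {doc_name}: {task}");
            }
            AgentEvent::Completed { evidence_count, .. } => {
                println!("done — {evidence_count} evidence items");
            }
            other => println!("{other:?}"),
        }
    }
}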
diff --git a/vectorless-core/vectorless/src/agent/mod.rs b/vectorless-core/vectorless/src/agent/mod.rs
deleted file mode 100644
index f471258a..00000000
--- a/vectorless-core/vectorless/src/agent/mod.rs
+++ /dev/null
@@ -1,55 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Retrieval agent — struct-based document intelligence.
-//!
-//! # Architecture
-//!
-//! The retrieval dispatcher always goes through the Orchestrator.
-//! Based on [`Scope`]:
-//!
-//! - **User specified doc_ids** → Orchestrator skips analysis, spawns Workers directly.
-//! - **Workspace / unspecified** → Orchestrator analyzes DocCards, selects docs, spawns Workers.
-//!
-//! Both paths produce the same [`Output`] type and share the same synthesis logic.
-//!
-//! ```text
-//! dispatch(query, scope)
-//!   └── Orchestrator (always)
-//!         ├── Scope::Specified(docs) → skip analysis → N × Worker → synthesis
-//!         └── Scope::Workspace(ws)   → analysis → N × Worker → fusion → synthesis
-//! ```
-//!
-//! # Agent trait
-//!
-//! All retrieval agents implement [`Agent`] with `async fn run(self)` (Edition 2024).
-//! The trait uses native async functions — no `async-trait` crate needed.
-
-pub mod command;
-pub mod config;
-pub mod context;
-pub mod events;
-pub mod state;
-pub mod tools;
-
-pub mod orchestrator;
-pub mod prompts;
-pub mod worker;
-
-pub use config::{DocContext, Evidence, Output, Scope, WorkspaceContext};
-pub use events::{AgentEvent, EventEmitter};
-
-/// Agent trait — async, consuming-self execution.
-///
-/// Each agent struct holds its own configuration and context.
-/// Calling `run(self)` consumes the agent and produces output.
-///
-/// Uses Edition 2024 native `async fn` in trait — no `async-trait` crate.
-pub trait Agent {
-    /// The output type produced by this agent.
-    type Output;
-    /// Agent name for logging and events.
-    fn name(&self) -> &str;
-    /// Execute the agent, consuming self.
-    async fn run(self) -> crate::error::Result<Self::Output>;
-}
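To illustrate the consuming-self contract, a toy implementation might look like this sketch (EchoAgent is hypothetical and not part of the crate):

struct EchoAgent {
    message: String,
}

impl Agent for EchoAgent {
    type Output = String;

    fn name(&self) -> &str {
        "echo"
    }

    // Edition 2024 native async fn in trait — no async-trait macro needed.
    async fn run(self) -> crate::error::Result<Self::Output> {
        Ok(self.message) // `self` is consumed: an agent cannot be run twice
    }
}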
diff --git a/vectorless-core/vectorless/src/agent/orchestrator/analyze.rs b/vectorless-core/vectorless/src/agent/orchestrator/analyze.rs
deleted file mode 100644
index 47dd58f1..00000000
--- a/vectorless-core/vectorless/src/agent/orchestrator/analyze.rs
+++ /dev/null
@@ -1,159 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Phase 1: Analyze documents and produce a dispatch plan.
-//!
-//! Uses the [`QueryPlan`] from query understanding to inform document selection.
-//! LLM errors propagate — no silent degradation.
-
-use tracing::{debug, info};
-
-use crate::error::Error;
-use crate::llm::LlmClient;
-use crate::query::QueryPlan;
-use crate::scoring::bm25::extract_keywords;
-
-use super::super::config::WorkspaceContext;
-use super::super::prompts::{DispatchEntry, orchestrator_analysis, parse_dispatch_plan};
-use super::super::state::OrchestratorState;
-use super::super::tools::orchestrator as orch_tools;
-
-/// Outcome of the analyze phase.
-pub enum AnalyzeOutcome {
-    /// Produce dispatch entries for Phase 2.
-    Proceed {
-        dispatches: Vec<DispatchEntry>,
-        llm_calls: u32,
-    },
-    /// Cross-doc search already answered the query.
-    AlreadyAnswered { llm_calls: u32 },
-    /// No relevant documents found.
-    NoResults { llm_calls: u32 },
-}
-
-/// Analyze documents and produce a dispatch plan.
-///
-/// Uses the [`QueryPlan`] for intent-aware analysis:
-/// - Intent and key concepts inform the LLM about what to look for
-/// - Complexity hints at how many documents may be needed
-/// - Strategy hint guides the analysis approach
-///
-/// LLM failures propagate as [`Error::LlmReasoning`] — no fallback.
-pub async fn analyze(
-    query: &str,
-    ws: &WorkspaceContext<'_>,
-    state: &mut OrchestratorState,
-    emitter: &crate::agent::EventEmitter,
-    skip_analysis: bool,
-    query_plan: &QueryPlan,
-    llm: &LlmClient,
-) -> crate::error::Result<AnalyzeOutcome> {
-    if skip_analysis {
-        debug!("Phase 1: skipping (user-specified documents)");
-        let dispatches = (0..ws.doc_count())
-            .map(|idx| DispatchEntry {
-                doc_idx: idx,
-                reason: "User-specified document".to_string(),
-                task: query.to_string(),
-            })
-            .collect();
-        return Ok(AnalyzeOutcome::Proceed {
-            dispatches,
-            llm_calls: 0,
-        });
-    }
-
-    debug!(
-        intent = %query_plan.intent,
-        complexity = %query_plan.complexity,
-        strategy = query_plan.strategy_hint,
-        "Phase 1: analyzing doc cards with query understanding"
-    );
-
-    let doc_cards_text = orch_tools::ls_docs(ws).feedback;
-    let keywords = extract_keywords(query);
-    let find_text = if keywords.is_empty() {
-        "(no keywords extracted)".to_string()
-    } else {
-        orch_tools::find_cross(&keywords, ws).feedback
-    };
-
-    info!(keywords = ?keywords, "Phase 1: analyzing");
-    debug!(
-        doc_cards_len = doc_cards_text.len(),
-        find_results_len = find_text.len(),
-        "Phase 1: analysis input"
-    );
-
-    // Build analysis prompt enriched with query understanding
-    let concepts_text = if query_plan.key_concepts.is_empty() {
-        String::new()
-    } else {
-        format!("\nKey concepts: {}", query_plan.key_concepts.join(", "))
-    };
-
-    let strategy_text = if query_plan.strategy_hint.is_empty() {
-        String::new()
-    } else {
-        format!("\nRetrieval strategy: {}", query_plan.strategy_hint)
-    };
-
-    let rewritten_text = if query_plan.rewritten.is_empty() {
-        String::new()
-    } else {
-        format!(
-            "\nRewritten queries for matching: {}",
-            query_plan.rewritten.join("; ")
-        )
-    };
-
-    let intent_context = format!(
-        "\nQuery intent: {} (complexity: {}){concepts_text}{strategy_text}{rewritten_text}",
-        query_plan.intent, query_plan.complexity,
-    );
-
-    let (system, user) =
-        orchestrator_analysis(&super::super::prompts::OrchestratorAnalysisParams {
-            query,
-            doc_cards: &doc_cards_text,
-            find_results: &find_text,
-            intent_context: &intent_context,
-        });
-
-    let analysis_output = llm.complete(&system, &user).await.map_err(|e| {
-        emitter.emit_error("orchestrator/analysis", &e.to_string());
-        Error::LlmReasoning {
-            stage: "orchestrator/analysis".to_string(),
-            detail: format!("LLM call failed: {e}"),
-        }
-    })?;
-
-    info!(
-        response_len = analysis_output.len(),
-        response = %if analysis_output.len() > 500 { &analysis_output[..500] } else { &analysis_output },
-        "Phase 1: analysis LLM response"
-    );
-
-    let dispatches = match parse_dispatch_plan(&analysis_output, ws.doc_count()) {
-        Some(entries) => entries,
-        None => {
-            info!("Orchestrator: analysis indicates already answered");
-            return Ok(AnalyzeOutcome::AlreadyAnswered { llm_calls: 1 });
-        }
-    };
-
-    info!(
-        dispatches = dispatches.len(),
-        "Phase 1: parsed dispatch plan"
-    );
-
-    if dispatches.is_empty() {
-        return Ok(AnalyzeOutcome::NoResults { llm_calls: 1 });
-    }
-
-    state.analyze_done = true;
-    Ok(AnalyzeOutcome::Proceed {
-        dispatches,
-        llm_calls: 1,
-    })
-}
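A caller branches on the three outcomes roughly as follows — a crate-internal sketch mirroring what the Orchestrator's run() does below, with event emission elided:

async fn plan_dispatches(
    query: &str,
    ws: &WorkspaceContext<'_>,
    query_plan: &QueryPlan,
    llm: &LlmClient,
) -> crate::error::Result<Vec<DispatchEntry>> {
    let mut state = OrchestratorState::new();
    let emitter = crate::agent::EventEmitter::noop();
    match analyze(query, ws, &mut state, &emitter, false, query_plan, llm).await? {
        // Phase 2 will spawn one Worker per entry.
        AnalyzeOutcome::Proceed { dispatches, .. } => Ok(dispatches),
        // Cross-doc search already answered, or nothing matched.
        AnalyzeOutcome::AlreadyAnswered { .. } | AnalyzeOutcome::NoResults { .. } => {
            Ok(Vec::new())
        }
    }
}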
diff --git a/vectorless-core/vectorless/src/agent/orchestrator/dispatch.rs b/vectorless-core/vectorless/src/agent/orchestrator/dispatch.rs
deleted file mode 100644
index f599ac1d..00000000
--- a/vectorless-core/vectorless/src/agent/orchestrator/dispatch.rs
+++ /dev/null
@@ -1,92 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Phase 2: Dispatch Workers and collect results.
-
-use tracing::{info, warn};
-
-use crate::llm::LlmClient;
-
-use super::super::Agent;
-use super::super::config::{AgentConfig, WorkspaceContext};
-use super::super::events::EventEmitter;
-use super::super::prompts::DispatchEntry;
-use super::super::state::OrchestratorState;
-use super::super::worker::Worker;
-use crate::query::QueryPlan;
-
-/// Dispatch Workers in parallel and collect results.
-pub async fn dispatch_and_collect(
-    query: &str,
-    dispatches: &[DispatchEntry],
-    ws: &WorkspaceContext<'_>,
-    config: &AgentConfig,
-    llm: &LlmClient,
-    state: &mut OrchestratorState,
-    emitter: &EventEmitter,
-    query_plan: &QueryPlan,
-) {
-    let futures: Vec<_> = dispatches
-        .iter()
-        .filter_map(|dispatch| {
-            let doc = match ws.doc(dispatch.doc_idx) {
-                Some(d) => d,
-                None => {
-                    warn!(doc_idx = dispatch.doc_idx, "Document not found, skipping");
-                    return None;
-                }
-            };
-
-            let query = query.to_string();
-            let task = dispatch.task.clone();
-            let worker_config = config.worker.clone();
-            let doc_idx = dispatch.doc_idx;
-            let doc_name = doc.doc_name.to_string();
-            let llm = llm.clone();
-            let sub_emitter = EventEmitter::noop();
-            let worker_plan = query_plan.clone();
-
-            Some(async move {
-                emitter.emit_worker_dispatched(doc_idx, &doc_name, &task, &[]);
-                let worker = Worker::new(
-                    &query,
-                    Some(&task),
-                    doc,
-                    worker_config,
-                    llm,
-                    sub_emitter,
-                    worker_plan,
-                );
-                let result = worker.run().await;
-                (doc_idx, doc_name, result)
-            })
-        })
-        .collect();
-
-    let results: Vec<_> = futures::future::join_all(futures).await;
-
-    for (doc_idx, doc_name, result) in results {
-        match result {
-            Ok(output) => {
-                info!(
-                    doc_idx,
-                    evidence = output.evidence.len(),
-                    "Worker completed"
-                );
-                emitter.emit_worker_completed(
-                    doc_idx,
-                    &doc_name,
-                    output.evidence.len(),
-                    output.metrics.rounds_used,
-                    output.metrics.llm_calls,
-                    true,
-                );
-                state.collect_result(doc_idx, output);
-            }
-            Err(e) => {
-                warn!(doc_idx, error = %e, "Worker failed");
-                emitter.emit_worker_completed(doc_idx, &doc_name, 0, 0, 0, false);
-            }
-        }
-    }
-}
diff --git a/vectorless-core/vectorless/src/agent/orchestrator/evaluate.rs b/vectorless-core/vectorless/src/agent/orchestrator/evaluate.rs
deleted file mode 100644
index 88e7e07f..00000000
--- a/vectorless-core/vectorless/src/agent/orchestrator/evaluate.rs
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Evaluate cross-document evidence sufficiency via LLM.
-//!
-//! Replaces the old `integrate` module's heuristic sufficiency check.
-//! LLM errors propagate — no silent "assume sufficient" fallback.
-
-use tracing::info;
-
-use crate::error::Error;
-use crate::llm::LlmClient;
-
-use super::super::config::Evidence;
-use super::super::prompts::{check_sufficiency, parse_sufficiency_response};
-
-/// Result of the evidence sufficiency evaluation.
-pub struct EvalResult {
-    /// Whether the collected evidence is sufficient to answer the query.
-    pub sufficient: bool,
-    /// Description of what information is still missing (empty if sufficient).
-    pub missing_info: String,
-}
-
-/// Evaluate cross-document evidence sufficiency via LLM.
-///
-/// Propagates LLM errors as [`Error::LlmReasoning`].
-/// The caller decides how to handle insufficiency (replan, abort, etc.).
-pub async fn evaluate(
-    query: &str,
-    evidence: &[Evidence],
-    llm: &LlmClient,
-) -> crate::error::Result<EvalResult> {
-    let evidence_summary = format_evidence_summary(evidence);
-    let (system, user) = check_sufficiency(query, &evidence_summary);
-
-    info!(
-        evidence = evidence.len(),
-        "Evaluating evidence sufficiency..."
-    );
-    let response = llm
-        .complete(&system, &user)
-        .await
-        .map_err(|e| Error::LlmReasoning {
-            stage: "orchestrator/evaluate".to_string(),
-            detail: format!("Sufficiency check LLM call failed: {e}"),
-        })?;
-
-    let sufficient = parse_sufficiency_response(&response);
-    let missing_info = if sufficient {
-        String::new()
-    } else {
-        // Extract the reason from the response (everything after SUFFICIENT/INSUFFICIENT)
-        let reason = response
-            .trim()
-            .strip_prefix("INSUFFICIENT")
-            .or_else(|| response.trim().strip_prefix("Insufficient"))
-            .unwrap_or("")
-            .trim_start_matches(|c: char| c == '-' || c == ' ' || c == ':');
-        if reason.is_empty() {
-            "Evidence does not fully address the query.".to_string()
-        } else {
-            reason.to_string()
-        }
-    };
-
-    info!(
-        sufficient,
-        evidence = evidence.len(),
-        missing_info_len = missing_info.len(),
-        "Cross-doc sufficiency evaluation"
-    );
-
-    Ok(EvalResult {
-        sufficient,
-        missing_info,
-    })
-}
-
-/// Format evidence summary for sufficiency check.
-/// Includes actual content so the check LLM can evaluate relevance.
-pub fn format_evidence_summary(evidence: &[Evidence]) -> String {
-    if evidence.is_empty() {
-        return "(no evidence)".to_string();
-    }
-    evidence
-        .iter()
-        .map(|e| {
-            let doc = e.doc_name.as_deref().unwrap_or("unknown");
-            format!("[{}] (from {})\n{}", e.node_title, doc, e.content)
-        })
-        .collect::<Vec<_>>()
-        .join("\n\n")
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_format_evidence_summary() {
-        let evidence = vec![
-            Evidence {
-                source_path: "root/A".to_string(),
-                node_title: "A".to_string(),
-                content: "content".to_string(),
-                doc_name: Some("doc1".to_string()),
-            },
-            Evidence {
-                source_path: "root/B".to_string(),
-                node_title: "B".to_string(),
-                content: "more content".to_string(),
-                doc_name: Some("doc2".to_string()),
-            },
-        ];
-        let summary = format_evidence_summary(&evidence);
-        assert!(summary.contains("[A]"));
-        assert!(summary.contains("doc1"));
-        assert!(summary.contains("[B]"));
-        assert!(summary.contains("doc2"));
-    }
-
-    #[test]
-    fn test_format_evidence_summary_empty() {
-        let summary = format_evidence_summary(&[]);
-        assert!(summary.contains("no evidence"));
-    }
-}
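Callers gate the supervisor loop on this verdict; a minimal crate-internal sketch:

async fn is_enough(
    query: &str,
    evidence: &[Evidence],
    llm: &LlmClient,
) -> crate::error::Result<bool> {
    let verdict = evaluate(query, evidence, llm).await?;
    if !verdict.sufficient {
        // missing_info feeds the replan prompt on the next iteration.
        tracing::info!(missing = %verdict.missing_info, "evidence gap — will replan");
    }
    Ok(verdict.sufficient)
}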
diff --git a/vectorless-core/vectorless/src/agent/orchestrator/mod.rs b/vectorless-core/vectorless/src/agent/orchestrator/mod.rs
deleted file mode 100644
index 643ca0fe..00000000
--- a/vectorless-core/vectorless/src/agent/orchestrator/mod.rs
+++ /dev/null
@@ -1,223 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Orchestrator agent — supervisor loop for multi-document retrieval.
-//!
-//! The Orchestrator is a consuming-self struct implementing [`Agent`]:
-//! 1. Analyze: LLM selects documents + tasks (informed by QueryPlan)
-//! 2. Supervisor loop: dispatch → evaluate → replan if insufficient
-//! 3. Rerank: dedup → BM25 scoring → synthesis/fusion
-
-mod analyze;
-mod dispatch;
-mod evaluate;
-mod replan;
-mod supervisor;
-
-use tracing::info;
-
-use crate::llm::LlmClient;
-use crate::query::QueryPlan;
-
-use super::Agent;
-use super::config::{AgentConfig, Output, WorkspaceContext};
-use super::events::EventEmitter;
-use super::state::OrchestratorState;
-
-use analyze::{AnalyzeOutcome, analyze};
-use supervisor::run_supervisor_loop;
-
-/// Maximum supervisor loop iterations to prevent infinite loops.
-const MAX_SUPERVISOR_ITERATIONS: u32 = 3;
-
-/// Orchestrator agent — coordinates multi-document retrieval.
-///
-/// Holds all execution context. Calling [`run()`](Agent::run) consumes self.
-pub struct Orchestrator<'a> {
-    query: String,
-    ws: &'a WorkspaceContext<'a>,
-    config: AgentConfig,
-    llm: LlmClient,
-    emitter: EventEmitter,
-    skip_analysis: bool,
-    /// Query understanding plan — produced by `QueryPipeline::understand()`.
-    /// Contains intent, complexity, key concepts, and strategy hints.
-    query_plan: QueryPlan,
-}
-
-impl<'a> Orchestrator<'a> {
-    /// Create a new Orchestrator.
-    pub fn new(
-        query: &str,
-        ws: &'a WorkspaceContext<'a>,
-        config: AgentConfig,
-        llm: LlmClient,
-        emitter: EventEmitter,
-        skip_analysis: bool,
-        query_plan: QueryPlan,
-    ) -> Self {
-        Self {
-            query: query.to_string(),
-            ws,
-            config,
-            llm,
-            emitter,
-            skip_analysis,
-            query_plan,
-        }
-    }
-}
-
-impl<'a> Agent for Orchestrator<'a> {
-    type Output = Output;
-
-    fn name(&self) -> &str {
-        "orchestrator"
-    }
-
-    async fn run(self) -> crate::error::Result<Self::Output> {
-        let Orchestrator {
-            query,
-            ws,
-            config,
-            llm,
-            emitter,
-            skip_analysis,
-            query_plan,
-        } = self;
-
-        info!(
-            docs = ws.doc_count(),
-            skip_analysis,
-            intent = %query_plan.intent,
-            complexity = %query_plan.complexity,
-            "Orchestrator starting"
-        );
-        emitter.emit_orchestrator_started(&query, ws.doc_count(), skip_analysis);
-
-        let mut state = OrchestratorState::new();
-        let mut orch_llm_calls: u32 = 0;
-
-        // --- Phase 1: Analyze — LLM selects documents + tasks ---
-        let initial_dispatches = match analyze(
-            &query,
-            ws,
-            &mut state,
-            &emitter,
-            skip_analysis,
-            &query_plan,
-            &llm,
-        )
-        .await?
-        {
-            AnalyzeOutcome::Proceed {
-                dispatches,
-                llm_calls,
-            } => {
-                orch_llm_calls += llm_calls;
-                dispatches
-            }
-            AnalyzeOutcome::AlreadyAnswered { llm_calls } => {
-                let mut output = Output::empty();
-                output.answer = "Already answered by cross-document search.".to_string();
-                emitter.emit_orchestrator_completed(0, orch_llm_calls + llm_calls, 0);
-                return Ok(output);
-            }
-            AnalyzeOutcome::NoResults { llm_calls } => {
-                emitter.emit_orchestrator_completed(0, orch_llm_calls + llm_calls, 0);
-                return Ok(Output::empty());
-            }
-        };
-
-        // --- Phase 2: Supervisor loop ---
-        let outcome = run_supervisor_loop(
-            &query,
-            initial_dispatches,
-            ws,
-            &config,
-            &llm,
-            &mut state,
-            &emitter,
-            &query_plan,
-            skip_analysis,
-        )
-        .await?;
-        orch_llm_calls += outcome.llm_calls;
-
-        let confidence = compute_confidence(
-            outcome.eval_sufficient,
-            outcome.iteration,
-            state.all_evidence.is_empty(),
-        );
-
-        // --- Phase 3: Finalize — rerank + synthesize ---
-        if state.all_evidence.is_empty() {
-            emitter.emit_orchestrator_completed(0, orch_llm_calls, 0);
-            return Ok(state.into_output(String::new()));
-        }
-
-        let multi_doc = ws.doc_count() > 1;
-        finalize_output(
-            &query,
-            &state,
-            &emitter,
-            orch_llm_calls,
-            multi_doc,
-            query_plan.intent,
-            confidence,
-        )
-        .await
-    }
-}
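End to end, driving this agent looks roughly like the fragment below — a sketch only, with `ws`, `config`, `llm`, and `query_plan` assumed to be in scope:

let orchestrator = Orchestrator::new(
    "Compare 2024 and 2023 revenue",
    &ws,
    config,
    llm,
    EventEmitter::noop(),
    false, // let Phase 1 select documents
    query_plan,
);
let output = orchestrator.run().await?;
println!(
    "{} ({} evidence items, confidence {:.2})",
    output.answer,
    output.evidence.len(),
    output.confidence
);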
-/// Compute confidence from LLM evaluate() outcome.
-fn compute_confidence(eval_sufficient: bool, replan_rounds: u32, no_evidence: bool) -> f32 {
-    if no_evidence {
-        return 0.0;
-    }
-    if eval_sufficient {
-        // LLM said sufficient: first round = 0.95, each replan round drops 0.15
-        (0.95 - replan_rounds as f32 * 0.15).max(0.5)
-    } else {
-        // LLM never said sufficient (budget exhausted or no more docs)
-        (0.4 - replan_rounds as f32 * 0.1).max(0.1)
-    }
-}
-
-/// Rerank evidence and emit completion events.
-pub async fn finalize_output(
-    query: &str,
-    state: &OrchestratorState,
-    emitter: &EventEmitter,
-    orch_llm_calls: u32,
-    multi_doc: bool,
-    intent: crate::query::QueryIntent,
-    confidence: f32,
-) -> crate::error::Result<Output> {
-    let rerank_result =
-        crate::rerank::process(query, &state.all_evidence, multi_doc, intent, confidence).await?;
-
-    let total_llm_calls = orch_llm_calls + rerank_result.llm_calls;
-    if !rerank_result.answer.is_empty() {
-        emitter.emit_answer_completed(rerank_result.answer.len(), "medium");
-    }
-
-    let mut output = state.clone_results_into_output(rerank_result.answer);
-    output.metrics.llm_calls += total_llm_calls;
-    output.confidence = rerank_result.confidence;
-
-    emitter.emit_orchestrator_completed(
-        output.evidence.len(),
-        output.metrics.llm_calls,
-        output.metrics.rounds_used,
-    );
-
-    info!(
-        evidence = output.evidence.len(),
-        llm_calls = output.metrics.llm_calls,
-        confidence = output.confidence,
-        "Orchestrator complete"
-    );
-
-    Ok(output)
-}
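The piecewise formula above works out to the following values. This is a test-style sketch with a tolerant float comparison, since the arithmetic is f32:

#[test]
fn confidence_mapping_examples() {
    let close = |a: f32, b: f32| (a - b).abs() < 1e-6;
    assert!(close(compute_confidence(true, 0, false), 0.95)); // sufficient on first pass
    assert!(close(compute_confidence(true, 1, false), 0.80)); // one replan: 0.95 - 0.15
    assert!(close(compute_confidence(true, 3, false), 0.50)); // clamped at the 0.5 floor
    assert!(close(compute_confidence(false, 0, false), 0.40)); // never judged sufficient
    assert!(close(compute_confidence(false, 3, false), 0.10)); // clamped at the 0.1 floor
    assert!(close(compute_confidence(true, 2, true), 0.0));   // no evidence at all
}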
- ); - let response = llm - .complete(&system, &user) - .await - .map_err(|e| Error::LlmReasoning { - stage: "orchestrator/replan".to_string(), - detail: format!("Replan LLM call failed: {e}"), - })?; - - info!( - response_len = response.len(), - "Replan LLM response received" - ); - - let dispatches = parse_replan_response(&response, total_docs, dispatched_indices); - let reasoning = response.lines().take(3).collect::>().join(" "); - - info!( - new_dispatches = dispatches.len(), - "Replan produced new dispatch targets" - ); - - Ok(ReplanResult { - dispatches, - reasoning, - }) -} - -/// Format collected evidence for the replan prompt. -/// Includes content so the LLM can reason about what's actually been found. -fn format_evidence_context(evidence: &[Evidence]) -> String { - if evidence.is_empty() { - return "(no evidence collected)".to_string(); - } - evidence - .iter() - .map(|e| { - let doc = e.doc_name.as_deref().unwrap_or("unknown"); - format!("[{}] (from {})\n{}", e.node_title, doc, e.content) - }) - .collect::>() - .join("\n\n") -} - -/// Build the replan prompt. -fn replan_prompt( - query: &str, - missing_info: &str, - evidence_summary: &str, - dispatched: &[usize], - doc_cards: &str, - keywords_text: &str, -) -> (String, String) { - let dispatched_set: Vec = dispatched - .iter() - .map(|&i| format!("doc {}", i + 1)) - .collect(); - let dispatched_text = if dispatched_set.is_empty() { - "None".to_string() - } else { - dispatched_set.join(", ") - }; - - let system = "You are a multi-document retrieval coordinator. The first round of evidence \ - collection was insufficient to fully answer the query. Review what was collected, \ - what's missing, and decide which additional documents to query. - -Output format — for each additional document to query, output a block: -- doc: - reason: - task: - -Only include documents not yet dispatched. If no additional documents are likely to help, \ -respond with: NO_ADDITIONAL_DOCS" - .to_string(); - - let user = format!( - "Original question: {query} - -Missing information: {missing_info} - -Collected evidence so far: -{evidence_summary} - -Already dispatched documents: {dispatched_text} - -Available documents (all): -{doc_cards}{keywords_text} - -Additional documents to query:" - ); - - (system, user) -} - -/// Parse the replan response into dispatch entries. 
-fn parse_replan_response(
-    response: &str,
-    total_docs: usize,
-    dispatched: &[usize],
-) -> Vec<DispatchEntry> {
-    let trimmed = response.trim();
-
-    if trimmed.starts_with("NO_ADDITIONAL_DOCS") {
-        return Vec::new();
-    }
-
-    let mut entries = Vec::new();
-    let mut current_doc_idx: Option<usize> = None;
-    let mut current_reason = String::new();
-    let mut current_task = String::new();
-
-    for line in trimmed.lines() {
-        let line = line.trim();
-
-        if let Some(rest) = line.strip_prefix("- doc:") {
-            // Flush previous
-            if let Some(idx) = current_doc_idx.take() {
-                entries.push(DispatchEntry {
-                    doc_idx: idx,
-                    reason: std::mem::take(&mut current_reason),
-                    task: std::mem::take(&mut current_task),
-                });
-            }
-
-            let doc_num: usize = rest.trim().trim_end_matches(',').parse().unwrap_or(0);
-            if doc_num > 0 && doc_num <= total_docs {
-                let idx = doc_num - 1;
-                // Only include if not already dispatched
-                if !dispatched.contains(&idx) {
-                    current_doc_idx = Some(idx);
-                }
-            }
-        } else if let Some(rest) = line.strip_prefix("reason:") {
-            current_reason = rest.trim().to_string();
-        } else if let Some(rest) = line.strip_prefix("task:") {
-            current_task = rest.trim().to_string();
-        }
-    }
-
-    // Flush last
-    if let Some(idx) = current_doc_idx {
-        entries.push(DispatchEntry {
-            doc_idx: idx,
-            reason: current_reason,
-            task: current_task,
-        });
-    }
-
-    entries
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_parse_replan_response_basic() {
-        let response = "\
-- doc: 3
-  reason: May contain the missing financial data
-  task: Find Q4 revenue figures";
-        let entries = parse_replan_response(response, 5, &[0, 1]);
-        assert_eq!(entries.len(), 1);
-        assert_eq!(entries[0].doc_idx, 2);
-        assert_eq!(entries[0].task, "Find Q4 revenue figures");
-    }
-
-    #[test]
-    fn test_parse_replan_response_already_dispatched() {
-        let response = "\
-- doc: 1
-  reason: Already queried
-  task: test";
-        let entries = parse_replan_response(response, 3, &[0]);
-        assert!(entries.is_empty()); // doc 1 (idx 0) already dispatched
    }
-
-    #[test]
-    fn test_parse_replan_response_no_additional() {
-        let response = "NO_ADDITIONAL_DOCS";
-        let entries = parse_replan_response(response, 3, &[0, 1]);
-        assert!(entries.is_empty());
-    }
-}
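One replan round then looks like this fragment — a sketch only, with `query`, `ws`, `state`, and `llm` in scope and `missing` taken from the evaluate step:

let doc_cards_text = orch_tools::ls_docs(ws).feedback;
let result = replan(
    query,
    &missing,
    &state.all_evidence,
    &state.dispatched,
    ws.doc_count(),
    &doc_cards_text,
    llm,
)
.await?;
if result.dispatches.is_empty() {
    // The LLM answered NO_ADDITIONAL_DOCS — exit the supervisor loop.
} else {
    // Feed result.dispatches into the next dispatch_and_collect round.
}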
diff --git a/vectorless-core/vectorless/src/agent/orchestrator/supervisor.rs b/vectorless-core/vectorless/src/agent/orchestrator/supervisor.rs
deleted file mode 100644
index 664d06c8..00000000
--- a/vectorless-core/vectorless/src/agent/orchestrator/supervisor.rs
+++ /dev/null
@@ -1,159 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Phase 2: Supervisor loop — dispatch → evaluate → replan.
-
-use tracing::info;
-
-use crate::llm::LlmClient;
-use crate::query::QueryPlan;
-
-use super::super::config::{AgentConfig, WorkspaceContext};
-use super::super::events::EventEmitter;
-use super::super::prompts::DispatchEntry;
-use super::super::state::OrchestratorState;
-use super::super::tools::orchestrator as orch_tools;
-use super::MAX_SUPERVISOR_ITERATIONS;
-use super::dispatch;
-use super::evaluate::evaluate;
-use super::replan::replan;
-
-/// Outcome of the supervisor loop.
-pub struct SupervisorOutcome {
-    /// Number of replan iterations performed.
-    pub iteration: u32,
-    /// Whether the LLM evaluator judged evidence sufficient.
-    pub eval_sufficient: bool,
-    /// LLM calls consumed within the supervisor loop itself.
-    pub llm_calls: u32,
-}
-
-/// Run the supervisor loop: dispatch → evaluate → replan.
-///
-/// Returns a [`SupervisorOutcome`] summarizing what happened.
-pub async fn run_supervisor_loop(
-    query: &str,
-    initial_dispatches: Vec<DispatchEntry>,
-    ws: &WorkspaceContext<'_>,
-    config: &AgentConfig,
-    llm: &LlmClient,
-    state: &mut OrchestratorState,
-    emitter: &EventEmitter,
-    query_plan: &QueryPlan,
-    skip_analysis: bool,
-) -> crate::error::Result<SupervisorOutcome> {
-    let mut current_dispatches = initial_dispatches;
-    let mut iteration: u32 = 0;
-    let mut eval_sufficient = false;
-    let mut llm_calls: u32 = 0;
-
-    loop {
-        if iteration >= MAX_SUPERVISOR_ITERATIONS {
-            info!(iteration, "Supervisor loop budget exhausted");
-            break;
-        }
-
-        // Dispatch current plan
-        if !current_dispatches.is_empty() {
-            info!(
-                docs = current_dispatches.len(),
-                docs_list = ?current_dispatches.iter().map(|d| d.doc_idx).collect::<Vec<_>>(),
-                iteration,
-                "Dispatching Workers"
-            );
-            dispatch::dispatch_and_collect(
-                query,
-                &current_dispatches,
-                ws,
-                config,
-                llm,
-                state,
-                emitter,
-                query_plan,
-            )
-            .await;
-        }
-
-        // No evidence at all — nothing to evaluate
-        if state.all_evidence.is_empty() {
-            info!("No evidence collected from any Worker");
-            break;
-        }
-
-        // Skip evaluation for user-specified documents (no replan needed)
-        if skip_analysis {
-            eval_sufficient = !state.all_evidence.is_empty();
-            break;
-        }
-
-        // Evaluate sufficiency
-        let eval_result = evaluate(query, &state.all_evidence, llm).await?;
-        llm_calls += 1;
-
-        if eval_result.sufficient {
-            eval_sufficient = true;
-            info!(
-                evidence = state.all_evidence.len(),
-                iteration, "Evidence sufficient — exiting supervisor loop"
-            );
-            break;
-        }
-
-        // Insufficient — replan
-        info!(
-            evidence = state.all_evidence.len(),
-            missing = eval_result.missing_info.len(),
-            iteration,
-            "Evidence insufficient — replanning"
-        );
-
-        let doc_cards_text = orch_tools::ls_docs(ws).feedback;
-        let replan_result = replan(
-            query,
-            &eval_result.missing_info,
-            &state.all_evidence,
-            &state.dispatched,
-            ws.doc_count(),
-            &doc_cards_text,
-            llm,
-        )
-        .await?;
-        llm_calls += 1;
-
-        if replan_result.dispatches.is_empty() {
-            info!("Replan produced no new dispatches — exiting supervisor loop");
-            break;
-        }
-
-        current_dispatches = replan_result.dispatches;
-        iteration += 1;
-    }
-
-    Ok(SupervisorOutcome {
-        iteration,
-        eval_sufficient,
-        llm_calls,
-    })
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_supervisor_outcome_fields() {
-        let outcome = SupervisorOutcome {
-            iteration: 2,
-            eval_sufficient: true,
-            llm_calls: 5,
-        };
-        assert_eq!(outcome.iteration, 2);
-        assert!(outcome.eval_sufficient);
-        assert_eq!(outcome.llm_calls, 5);
    }
-
-    #[test]
-    fn test_max_iterations_constant() {
-        assert_eq!(MAX_SUPERVISOR_ITERATIONS, 3);
-    }
-}
diff --git a/vectorless-core/vectorless/src/agent/prompts.rs b/vectorless-core/vectorless/src/agent/prompts.rs
deleted file mode 100644
index 11d26fbf..00000000
--- a/vectorless-core/vectorless/src/agent/prompts.rs
+++ /dev/null
@@ -1,569 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Prompt templates for the retrieval agent.
-//!
-//! Prompts for agent-level operations:
-//! 1. `worker_navigation` — Worker nav loop, every round
-//! 2. `orchestrator_analysis` — Orchestrator Phase 1
-//! 3. `worker_dispatch` — Worker first round (when dispatched by Orchestrator)
-//! 4. `check_sufficiency` — evidence sufficiency evaluation
-//!
-//! Post-processing prompts (answer synthesis, multi-doc fusion) have been
-//! moved to `rerank/synthesis.rs` and `rerank/fusion.rs`.
-
-// ---------------------------------------------------------------------------
-// Prompt 1: Worker Navigation (used every round in the nav loop)
-// ---------------------------------------------------------------------------
-
-/// Parameters for the sub-agent navigation prompt.
-pub struct NavigationParams<'a> {
-    pub query: &'a str,
-    /// Sub-task description (None when Worker is called directly).
-    pub task: Option<&'a str>,
-    /// Current breadcrumb path.
-    pub breadcrumb: &'a str,
-    /// Summary of collected evidence.
-    pub evidence_summary: &'a str,
-    /// Description of what's still missing (empty string if nothing).
-    pub missing_info: &'a str,
-    /// Feedback from the last command execution.
-    pub last_feedback: &'a str,
-    /// Remaining rounds.
-    pub remaining: u32,
-    /// Maximum rounds.
-    pub max_rounds: u32,
-    /// ReAct history of recent rounds.
-    pub history: &'a str,
-    /// Titles of already-visited nodes.
-    pub visited_titles: &'a str,
-    /// Navigation plan from bird's-eye analysis (empty if no plan).
-    pub plan: &'a str,
-    /// Query intent context from QueryPlan (e.g. "factual — find specific answer").
-    /// Empty string if not available.
-    pub intent_context: &'a str,
-    /// Formatted keyword index matches (empty if none).
-    pub keyword_hints: &'a str,
-}
-
-pub fn worker_navigation(params: &NavigationParams) -> (String, String) {
-    let query = params.query;
-    let breadcrumb = params.breadcrumb;
-    let evidence_summary = params.evidence_summary;
-    let remaining = params.remaining;
-    let max_rounds = params.max_rounds;
-
-    let task_section = match params.task {
-        Some(task) => format!(
-            "\nYour specific task: {}\n(This is a sub-task for the original query.)",
-            task
-        ),
-        None => String::new(),
-    };
-
-    let missing_section = if params.missing_info.is_empty() {
-        String::new()
-    } else {
-        format!("\nPotentially missing info: {}", params.missing_info)
-    };
-
-    let last_feedback_section = if params.last_feedback.is_empty() {
-        String::new()
-    } else {
-        format!("\nLast command result:\n{}\n", params.last_feedback)
-    };
-
-    let history_section = if params.history == "(no history yet)" {
-        String::new()
-    } else {
-        format!("\nPrevious rounds:\n{}\n", params.history)
-    };
-
-    let visited_section = if params.visited_titles == "(none)" {
-        String::new()
-    } else {
-        format!(
-            "\nAlready visited (do not re-read these): {}",
-            params.visited_titles
-        )
-    };
-
-    let plan_section = if params.plan.is_empty() {
-        String::new()
-    } else {
-        format!(
-            "\nNavigation plan (follow this as guidance, adapt if needed):\n{}\n",
-            params.plan
-        )
-    };
-
-    let keyword_section = if params.keyword_hints.is_empty() {
-        String::new()
-    } else {
-        format!("\n{}", params.keyword_hints)
-    };
-
-    let intent_section = if params.intent_context.is_empty() {
-        String::new()
-    } else {
-        format!("\nQuery context: {}", params.intent_context)
-    };
-
-    let system = format!(
-        "You are a document navigation assistant. You navigate inside a document to find \
-         information that answers the user's question.
-
-Available commands:
-- ls               List children at current position (with summaries and leaf counts)
-- cd <name>        Enter a child node (supports relative paths like Section/Sub and absolute paths like /root/Section)
-- cd ..            Go back to parent node
-- cat <name>       Read a child node's content (automatically collected as evidence)
-- cat              Read the current node's content (useful at leaf nodes)
-- head <name>      Preview first 20 lines of a node (does NOT collect evidence)
-- find <keyword>   Search for a keyword in the document index (also supports multi-word like 'Lab C')
-- findtree <pattern>  Search for nodes by title pattern (case-insensitive)
-- grep <pattern>   Regex search across all content in current subtree
-- wc <name>        Show content size (lines, words, chars)
-- pwd              Show current navigation path
-- check            Evaluate if collected evidence is sufficient
-- done             End navigation
-
-SEARCH STRATEGY (important — follow this priority order):
-- When keyword matches are shown below, navigate directly to the highest-weight matched node. \
-Do NOT explore other branches first — the keyword index has already identified the most relevant location.
-- When find results include content snippets that answer the question, cd to that node and cat it immediately.
-- Use find with the EXACT keyword from the list (single word, \
-not multi-word phrases). Example: if hint shows keyword 'performance' pointing to Performance section, \
-use find performance, NOT find \"performance guide\".
-- Use ls only when you have no keyword hints or need to discover the structure of an unknown section.
-- Use findtree when you know a section title pattern but not the exact name.
-
-NAVIGATION EFFICIENCY (critical — every round counts):
-- Prefer cd with absolute paths (/root/Section/Subsection) or relative paths (Section/Sub) \
-to reach target nodes in ONE command instead of multiple cd steps.
-- Do NOT ls before cd if keyword hints or find results already tell you which node to enter.
-- Do NOT cd into nodes one level at a time when you can use a multi-segment path.
-
-Rules:
-- Output exactly ONE command per response, nothing else.
-- Content from cat is automatically saved as evidence — don't re-cat the same node.
-- Do not cat or cd into nodes you have already visited.
-- If the current branch has nothing relevant, use cd .. to go back.
-- If you're at the root and no children seem relevant, use done.
-
-STOPPING RULES (critical — follow these strictly):
-- After cat collects evidence, immediately check: does the collected text contain information \
-  that answers or relates to the user's question? If YES, output done. Do NOT continue searching.
-- Do NOT run grep after cat — cat already collected the full content. grep is for locating \
-  content BEFORE cat, not after.
-- If ls shows '(no navigation data)' or no children, you are at a leaf node. Use cat to read it \
-  or cd .. to go back. Do NOT ls again.
-- When remaining rounds are low (≤2), prefer done over exploring new branches."
-    );
-
-    let user = format!(
-        "{last_feedback_section}\
-User question: {query}{task_section}{intent_section}
-
-Current position: /{breadcrumb}
-Collected evidence:
-{evidence_summary}{missing_section}{keyword_section}{visited_section}{plan_section}
-{history_section}
-Remaining rounds: {remaining}/{max_rounds}
-
-Command:"
-    );
-
-    (system, user)
-}
-
-// ---------------------------------------------------------------------------
-// Prompt 2: Orchestrator Analysis (multi-doc Phase 1)
-// ---------------------------------------------------------------------------
-
-/// Parameters for the orchestrator analysis prompt.
-pub struct OrchestratorAnalysisParams<'a> {
-    pub query: &'a str,
-    /// Formatted DocCard listing from ls_docs.
-    pub doc_cards: &'a str,
-    /// Formatted cross-document search results.
-    pub find_results: &'a str,
-    /// Query understanding context (intent, concepts, strategy, complexity).
-    pub intent_context: &'a str,
-}
-
-pub fn orchestrator_analysis(params: &OrchestratorAnalysisParams) -> (String, String) {
-    let doc_cards = params.doc_cards;
-    let find_results = params.find_results;
-    let query = params.query;
-    let intent_context = params.intent_context;
-
-    let system =
-        "You are a multi-document retrieval coordinator. Analyze the user's question, \
-         review the available documents, and decide which documents to search and what to look for in each.
-
-Output format — for each relevant document, output a block:
-- doc: <document number>
-  reason: <why this document is relevant>
-  task: <what to search for in it>
-
-Only include documents that are likely to contain relevant information.
-If the cross-document search results already fully answer the question, respond with just: ALREADY_ANSWERED".to_string();
-
-    let user = format!(
-        "Available documents:
-{doc_cards}
-
-Cross-document search results:
-{find_results}
-{intent_context}
-
-User question: {query}
-
-Relevant documents:"
-    );
-
-    (system, user)
-}
-
-// ---------------------------------------------------------------------------
-// Prompt 3: Worker Dispatch (first-round prompt when Orchestrator dispatches)
-// ---------------------------------------------------------------------------
-
-/// Parameters for the dispatch prompt.
-pub struct WorkerDispatchParams<'a> {
-    pub original_query: &'a str,
-    pub task: &'a str,
-    pub doc_name: &'a str,
-    pub breadcrumb: &'a str,
-}
-
-pub fn worker_dispatch(params: &WorkerDispatchParams) -> (String, String) {
-    let doc_name = params.doc_name;
-    let original_query = params.original_query;
-    let task = params.task;
-    let breadcrumb = params.breadcrumb;
-
-    let system = format!(
-        "You are a document navigation assistant. You are searching inside the document \
-         \"{doc_name}\" for specific information.
-
-Available commands: ls, cd <name> (supports Section/Sub paths and /root/Section absolute paths), \
-cd .., cat, cat <name>, head <name>, find <keyword>, findtree <pattern>, grep <pattern>, wc <name>, \
-pwd, check, done
-
-SEARCH STRATEGY:
-- Prefer find to jump directly to relevant sections over manual ls→cd exploration.
-- When find results include content snippets that answer your task, cd to that node and cat it immediately.
-- Use multi-segment paths (e.g. cd Research Labs/Lab A) to reach targets in ONE command.
-- Do NOT ls before cd if find results already tell you which node to enter.
-- Use findtree when you know a section title pattern but not the exact name.
-
-Rules:
-- Output exactly ONE command per response.
-- Content from cat is automatically saved as evidence.
-- After cat collects evidence, if it relates to your task, use done immediately.
-- Do NOT grep after cat — cat already collected the full content.
-- If ls shows no children, use cat to read the current node or cd .. to go back.
-- When evidence is sufficient, use done."
-    );
-
-    let user = format!(
-        "Original question: {original_query}
-Your task: {task}
-Document: {doc_name}
-Current position: /{breadcrumb}
-
-Command:"
-    );
-
-    (system, user)
-}
-
-// ---------------------------------------------------------------------------
-// Prompt 4: Check (evidence sufficiency evaluation)
-// ---------------------------------------------------------------------------
-
-/// Build the check prompt for LLM-based sufficiency evaluation.
-pub fn check_sufficiency(query: &str, evidence_summary: &str) -> (String, String) {
-    let system = "You evaluate whether collected evidence contains information that can answer or \
-         relate to the user's question. The evidence is raw document text — it does not need to be \
-         a complete or perfect answer. If the evidence mentions or addresses the key concepts from \
-         the question, it is sufficient.
-
-Respond with ONLY 'SUFFICIENT' or 'INSUFFICIENT' followed by a one-line reason.
-
-Guidelines:
-- If the evidence text contains any information directly related to the question's key terms, \
-respond SUFFICIENT.
-- If the evidence is completely unrelated or empty, respond INSUFFICIENT.
-- Default to SUFFICIENT unless the evidence is clearly irrelevant."
-        .to_string();
-
-    let user = format!(
-        "Question: {query}\n\n\
-         Collected evidence:\n\
-         {evidence_summary}\n\n\
-         Is this sufficient?"
-    );
-
-    (system, user)
-}
-
-// ---------------------------------------------------------------------------
-// Dispatch plan parsing
-// ---------------------------------------------------------------------------
-
-/// A single dispatch entry parsed from orchestrator analysis.
-#[derive(Debug, Clone)]
-pub struct DispatchEntry {
-    /// Document index (0-based).
-    pub doc_idx: usize,
-    /// Why this document was selected.
-    pub reason: String,
-    /// What to search for in this document.
-    pub task: String,
-}
-
-/// Parse the LLM output from orchestrator analysis into dispatch entries.
-///
-/// Returns `None` if the response is "ALREADY_ANSWERED".
-/// Returns empty vec if no valid dispatch entries found.
-pub fn parse_dispatch_plan(llm_output: &str, total_docs: usize) -> Option<Vec<DispatchEntry>> {
-    let trimmed = llm_output.trim();
-
-    if trimmed.starts_with("ALREADY_ANSWERED") {
-        return None;
-    }
-
-    let mut entries = Vec::new();
-    let mut current_doc_idx: Option<usize> = None;
-    let mut current_reason = String::new();
-    let mut current_task = String::new();
-
-    for line in trimmed.lines() {
-        let line = line.trim();
-
-        if let Some(rest) = line.strip_prefix("- doc:") {
-            // Flush previous entry
-            if let Some(idx) = current_doc_idx.take() {
-                entries.push(DispatchEntry {
-                    doc_idx: idx,
-                    reason: std::mem::take(&mut current_reason),
-                    task: std::mem::take(&mut current_task),
-                });
-            }
-
-            let doc_num: usize = rest.trim().trim_end_matches(',').parse().unwrap_or(0);
-            if doc_num > 0 && doc_num <= total_docs {
-                current_doc_idx = Some(doc_num - 1); // Convert to 0-based
-            } else if doc_num > 0 {
-                tracing::warn!(
-                    requested_doc = doc_num,
-                    total_docs,
-                    "Dispatch plan references out-of-range document, skipping"
-                );
-            }
-        } else if let Some(rest) = line.strip_prefix("reason:") {
-            current_reason = rest.trim().to_string();
-        } else if let Some(rest) = line.strip_prefix("task:") {
-            current_task = rest.trim().to_string();
-        }
-    }
-
-    // Flush last entry
-    if let Some(idx) = current_doc_idx {
-        entries.push(DispatchEntry {
-            doc_idx: idx,
-            reason: current_reason,
-            task: current_task,
-        });
-    }
-
-    Some(entries)
-}
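Phase 1 ties these pieces together: build the analysis prompt, call the LLM, then parse the plan. A crate-internal sketch, with `llm`, `doc_cards`, `find_results`, and `total_docs` assumed in scope:

let (system, user) = orchestrator_analysis(&OrchestratorAnalysisParams {
    query: "Compare 2024 and 2023 revenue",
    doc_cards: &doc_cards,
    find_results: &find_results,
    intent_context: "",
});
let response = llm.complete(&system, &user).await?;
match parse_dispatch_plan(&response, total_docs) {
    None => { /* ALREADY_ANSWERED — skip dispatch entirely */ }
    Some(entries) if entries.is_empty() => { /* no relevant documents */ }
    Some(entries) => { /* dispatch one Worker per entry */ }
}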
-/// Parse the sufficiency check response.
-pub fn parse_sufficiency_response(response: &str) -> bool {
-    let upper = response.trim().to_uppercase();
-    upper.starts_with("SUFFICIENT") && !upper.starts_with("INSUFFICIENT")
-}
-
-// ---------------------------------------------------------------------------
-// Tests
-// ---------------------------------------------------------------------------
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_worker_navigation_without_task() {
-        let params = NavigationParams {
-            query: "What is the revenue?",
-            task: None,
-            breadcrumb: "root/Financial Statements",
-            evidence_summary: "- [Revenue] 200 chars",
-            missing_info: "2024 comparison",
-            last_feedback: "[1] Q1 Report — Q1 data (5 leaves)\n[2] Q2 Report — Q2 data (5 leaves)",
-            remaining: 5,
-            max_rounds: 15,
-            history: "(no history yet)",
-            visited_titles: "(none)",
-            plan: "",
-            intent_context: "",
-            keyword_hints: "",
-        };
-
-        let (system, user) = worker_navigation(&params);
-        assert!(system.contains("document navigation"));
-        assert!(system.contains("SEARCH STRATEGY"));
-        assert!(user.contains("What is the revenue?"));
-        assert!(user.contains("root/Financial Statements"));
-        assert!(user.contains("200 chars"));
-        assert!(user.contains("2024 comparison"));
-        assert!(user.contains("5/15"));
-        assert!(!user.contains("sub-task"));
-    }
-
-    #[test]
-    fn test_worker_navigation_with_keyword_hints() {
-        let params = NavigationParams {
-            query: "What is the revenue?",
-            task: None,
-            breadcrumb: "root",
-            evidence_summary: "(none)",
-            missing_info: "",
-            last_feedback: "",
-            remaining: 8,
-            max_rounds: 15,
-            history: "(no history yet)",
-            visited_titles: "(none)",
-            plan: "",
-            intent_context: "",
-            keyword_hints: "Keyword matches (use find to jump directly):\n  - 'revenue' → root > Revenue (weight 0.85)\n",
-        };
-
-        let (_, user) = worker_navigation(&params);
-        assert!(user.contains("revenue"));
-        assert!(user.contains("find"));
-    }
-
-    #[test]
-    fn test_worker_navigation_with_task() {
-        let params = NavigationParams {
-            query: "Compare 2024 and 2023 revenue",
-            task: Some("Find revenue data in this document"),
-            breadcrumb: "root",
-            evidence_summary: "(none)",
-            missing_info: "",
-            last_feedback: "",
-            remaining: 8,
-            max_rounds: 15,
-            history: "(no history yet)",
-            visited_titles: "(none)",
-            plan: "",
-            intent_context: "analytical — comparative analysis",
-            keyword_hints: "",
-        };
-
-        let (_, user) = worker_navigation(&params);
-        assert!(user.contains("Find revenue data"));
-        assert!(user.contains("sub-task"));
-    }
-
-    #[test]
-    fn test_orchestrator_analysis() {
-        let params = OrchestratorAnalysisParams {
-            query: "Compare 2024 and 2023 revenue",
-            doc_cards: "[1] 2024 Report\n[2] 2023 Report",
-            find_results: "doc 1: keyword 'revenue' matched",
-            intent_context: "\nQuery intent: analytical (complexity: moderate)",
-        };
-
-        let (system, user) = orchestrator_analysis(&params);
-        assert!(system.contains("multi-document"));
-        assert!(user.contains("2024 Report"));
-        assert!(user.contains("revenue"));
-        assert!(user.contains("analytical"));
    }
-
-    #[test]
-    fn test_worker_dispatch() {
-        let params = WorkerDispatchParams {
-            original_query: "Compare revenue",
-            task: "Find 2024 revenue figures",
-            doc_name: "2024 Annual Report",
-            breadcrumb: "root",
-        };
-
-        let (system, user) = worker_dispatch(&params);
-        assert!(system.contains("2024 Annual Report"));
-        assert!(user.contains("Compare revenue"));
-        assert!(user.contains("Find 2024 revenue"));
    }
-
-    #[test]
-    fn test_check_sufficiency() {
-        let (system, user) = check_sufficiency("What is X?", "- [A] some data");
[A] some data"); - assert!(system.contains("SUFFICIENT")); - assert!(user.contains("What is X?")); - } - - // --- Dispatch plan parsing --- - - #[test] - fn test_parse_dispatch_plan_basic() { - let output = "\ -- doc: 1 - reason: Contains revenue data - task: Find 2024 revenue figures -- doc: 2 - reason: Contains comparison data - task: Find 2023 revenue figures"; - - let entries = parse_dispatch_plan(output, 3).unwrap(); - assert_eq!(entries.len(), 2); - assert_eq!(entries[0].doc_idx, 0); - assert_eq!(entries[0].task, "Find 2024 revenue figures"); - assert_eq!(entries[1].doc_idx, 1); - assert_eq!(entries[1].reason, "Contains comparison data"); - } - - #[test] - fn test_parse_dispatch_plan_already_answered() { - let output = "ALREADY_ANSWERED"; - assert!(parse_dispatch_plan(output, 3).is_none()); - } - - #[test] - fn test_parse_dispatch_plan_empty() { - let entries = parse_dispatch_plan("no relevant documents", 3).unwrap(); - assert!(entries.is_empty()); - } - - #[test] - fn test_parse_dispatch_plan_out_of_range() { - let output = "\ -- doc: 99 - reason: test - task: test"; - - let entries = parse_dispatch_plan(output, 3).unwrap(); - assert!(entries.is_empty()); // doc 99 is out of range, skipped - } - - // --- Sufficiency parsing --- - - #[test] - fn test_parse_sufficiency_sufficient() { - assert!(parse_sufficiency_response("SUFFICIENT - we have all data")); - assert!(parse_sufficiency_response("Sufficient")); - } - - #[test] - fn test_parse_sufficiency_insufficient() { - assert!(!parse_sufficiency_response("INSUFFICIENT - missing data")); - assert!(!parse_sufficiency_response("Insufficient")); - } -} diff --git a/vectorless-core/vectorless/src/agent/state.rs b/vectorless-core/vectorless/src/agent/state.rs deleted file mode 100644 index 8d59d9a0..00000000 --- a/vectorless-core/vectorless/src/agent/state.rs +++ /dev/null @@ -1,312 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Agent state types — mutable state that lives within a single retrieve() call. - -use std::collections::HashSet; - -use crate::document::NodeId; -use crate::document::TraceStep; - -use super::config::{Evidence, Output}; - -// --------------------------------------------------------------------------- -// Worker state -// --------------------------------------------------------------------------- - -/// Mutable navigation state for a Worker loop. -/// -/// Created at loop start, destroyed at loop end. Never escapes the call. -pub struct WorkerState { - /// Navigation breadcrumb (path from root to current node). - pub breadcrumb: Vec, - /// Current position in the document tree. - pub current_node: NodeId, - /// Collected evidence so far. - pub evidence: Vec, - /// Nodes already visited (prevents redundant reads). - pub visited: HashSet, - /// Nodes whose content has been collected via cat. Separate from visited - /// because cd-ing through a node ≠ reading its content. - pub collected_nodes: HashSet, - /// Remaining navigation rounds. - pub remaining: u32, - /// Maximum rounds (for display in prompts). - pub max_rounds: u32, - /// Feedback from the last executed command (injected into next prompt). - pub last_feedback: String, - /// Structured description of what information is still missing. - /// Updated after `check` returns "insufficient". - pub missing_info: String, - /// ReAct history: summary of each round's command + result. - /// Keeps last N entries for prompt injection. - pub history: Vec, - /// Navigation plan generated after bird's-eye view (Phase 1.5). 
-    /// Injected into subsequent prompts as guidance (non-binding).
-    pub plan: String,
-    /// Number of times `check` has been called.
-    pub check_count: u32,
-    /// Whether a navigation plan was generated in Phase 1.5.
-    pub plan_generated: bool,
-    /// Reasoning trace steps collected during navigation.
-    pub trace_steps: Vec<TraceStep>,
-}
-
-/// Maximum number of history entries to keep for prompt injection.
-const MAX_HISTORY_ENTRIES: usize = 6;
-
-impl WorkerState {
-    /// Create a new state starting at the given root node.
-    pub fn new(root: NodeId, max_rounds: u32) -> Self {
-        Self {
-            breadcrumb: vec!["root".to_string()],
-            current_node: root,
-            evidence: Vec::new(),
-            visited: HashSet::new(),
-            collected_nodes: HashSet::new(),
-            remaining: max_rounds,
-            max_rounds,
-            last_feedback: String::new(),
-            missing_info: String::new(),
-            history: Vec::new(),
-            plan: String::new(),
-            check_count: 0,
-            plan_generated: false,
-            trace_steps: Vec::new(),
-        }
-    }
-
-    /// Consume one of the remaining rounds.
-    pub fn dec_round(&mut self) {
-        if self.remaining > 0 {
-            self.remaining -= 1;
-        }
-    }
-
-    /// Set feedback from tool execution.
-    pub fn set_feedback(&mut self, feedback: String) {
-        self.last_feedback = feedback;
-    }
-
-    /// Navigate into a child node.
-    pub fn cd(&mut self, node: NodeId, title: &str) {
-        self.breadcrumb.push(title.to_string());
-        self.current_node = node;
-    }
-
-    /// Navigate back to parent.
-    ///
-    /// Returns `false` if already at root.
-    pub fn cd_up(&mut self, parent: NodeId) -> bool {
-        if self.breadcrumb.len() <= 1 {
-            return false;
-        }
-        self.breadcrumb.pop();
-        self.current_node = parent;
-        true
-    }
-
-    /// Add a piece of evidence.
-    pub fn add_evidence(&mut self, evidence: Evidence) {
-        self.evidence.push(evidence);
-    }
-
-    /// Check if evidence has already been collected for a specific node.
-    pub fn has_evidence_for(&self, node_id: crate::document::NodeId) -> bool {
-        self.collected_nodes.contains(&node_id)
-    }
-
-    /// Push a history entry (command + result summary).
-    /// Keeps only the last `MAX_HISTORY_ENTRIES` entries.
-    pub fn push_history(&mut self, entry: String) {
-        if self.history.len() >= MAX_HISTORY_ENTRIES {
-            self.history.remove(0);
-        }
-        self.history.push(entry);
-    }
-
-    /// Format history as text for prompt injection.
-    pub fn history_text(&self) -> String {
-        if self.history.is_empty() {
-            return "(no history yet)".to_string();
-        }
-        self.history
-            .iter()
-            .enumerate()
-            .map(|(i, h)| format!("{}. {}", i + 1, h))
-            .collect::<Vec<_>>()
-            .join("\n")
-    }
-
-    /// Format the breadcrumb as a path string (e.g., "root/Chapter 1/Section 1.2").
-    pub fn path_str(&self) -> String {
-        self.breadcrumb.join("/")
-    }
-
-    /// Summary of collected evidence for prompts.
-    pub fn evidence_summary(&self) -> String {
-        if self.evidence.is_empty() {
-            return "(none)".to_string();
-        }
-        self.evidence
-            .iter()
-            .map(|e| format!("- [{}] {} chars", e.node_title, e.content.len()))
-            .collect::<Vec<_>>()
-            .join("\n")
-    }
-
-    /// Evidence with actual content for sufficiency evaluation.
-    pub fn evidence_for_check(&self) -> String {
-        if self.evidence.is_empty() {
-            return "(no evidence collected yet)".to_string();
-        }
-        self.evidence
-            .iter()
-            .map(|e| format!("[{}]\n{}", e.node_title, e.content))
-            .collect::<Vec<_>>()
-            .join("\n\n")
-    }
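The methods above amount to a tiny shell-like session over the document tree. A minimal sketch of how the breadcrumb, history, and round budget evolve (editorial aside, not part of the patch; `root` and `child` are hypothetical NodeIds from a parsed tree):

```rust
// Sketch only: `root` and `child` are assumed NodeIds.
let mut state = WorkerState::new(root, 15);
assert_eq!(state.path_str(), "root");

state.cd(child, "Chapter 1");
state.push_history("cd Chapter 1 → entered".to_string());
state.dec_round(); // one navigation round spent
assert_eq!(state.path_str(), "root/Chapter 1");

// cd_up pops the breadcrumb but refuses to go past the root entry.
assert!(state.cd_up(root));
assert!(!state.cd_up(root));
assert_eq!(state.path_str(), "root");
```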
-    /// Convert this state into a WorkerOutput (consuming the state), with budget flag.
-    /// Worker returns evidence only — no answer synthesis.
-    pub fn into_worker_output(
-        self,
-        llm_calls: u32,
-        budget_exhausted: bool,
-        doc_name: &str,
-    ) -> super::config::WorkerOutput {
-        let evidence_chars: usize = self.evidence.iter().map(|e| e.content.len()).sum();
-        super::config::WorkerOutput {
-            evidence: self.evidence,
-            metrics: super::config::WorkerMetrics {
-                rounds_used: self.max_rounds.saturating_sub(self.remaining),
-                llm_calls,
-                nodes_visited: self.visited.len(),
-                budget_exhausted,
-                plan_generated: self.plan_generated,
-                check_count: self.check_count,
-                evidence_chars,
-            },
-            doc_name: doc_name.to_string(),
-            trace_steps: self.trace_steps,
-        }
-    }
-}
-
-// ---------------------------------------------------------------------------
-// Orchestrator state
-// ---------------------------------------------------------------------------
-
-/// Mutable state for the Orchestrator loop.
-///
-/// Tracks which documents have been dispatched and collects Worker results.
-pub struct OrchestratorState {
-    /// Indices of documents that have been dispatched.
-    pub dispatched: Vec<usize>,
-    /// Results returned by dispatched Workers.
-    pub sub_results: Vec<Output>,
-    /// All evidence merged from sub-results.
-    pub all_evidence: Vec<Evidence>,
-    /// Whether the analysis phase is complete.
-    pub analyze_done: bool,
-    /// Total LLM calls across orchestrator + sub-agents.
-    pub total_llm_calls: u32,
-}
-
-impl OrchestratorState {
-    /// Create a new orchestrator state.
-    pub fn new() -> Self {
-        Self {
-            dispatched: Vec::new(),
-            sub_results: Vec::new(),
-            all_evidence: Vec::new(),
-            analyze_done: false,
-            total_llm_calls: 0,
-        }
-    }
-
-    /// Record a dispatch to the document at the given index.
-    pub fn record_dispatch(&mut self, doc_idx: usize) {
-        if !self.dispatched.contains(&doc_idx) {
-            self.dispatched.push(doc_idx);
-        }
-    }
-
-    /// Collect a Worker result, converting WorkerOutput to Output for internal tracking.
-    pub fn collect_result(&mut self, doc_idx: usize, result: super::config::WorkerOutput) {
-        self.total_llm_calls += result.metrics.llm_calls;
-        self.all_evidence.extend(result.evidence.iter().cloned());
-        self.sub_results.push(result.into());
-        self.record_dispatch(doc_idx);
-    }
-
-    /// Clone results into an Output without consuming self.
-    ///
-    /// Used by `finalize_output` which needs to borrow state for rerank.
-    pub fn clone_results_into_output(&self, answer: String) -> Output {
-        Output {
-            answer,
-            evidence: self.all_evidence.clone(),
-            metrics: super::config::Metrics {
-                llm_calls: self.total_llm_calls,
-                nodes_visited: self
-                    .sub_results
-                    .iter()
-                    .map(|r| r.metrics.nodes_visited)
-                    .sum(),
-                plan_generated: self.sub_results.iter().any(|r| r.metrics.plan_generated),
-                check_count: self.sub_results.iter().map(|r| r.metrics.check_count).sum(),
-                evidence_chars: self
-                    .sub_results
-                    .iter()
-                    .map(|r| r.metrics.evidence_chars)
-                    .sum(),
-                ..Default::default()
-            },
-            confidence: 0.0,
-            trace_steps: self.collect_trace_steps(),
-        }
-    }
-
-    /// Merge all sub-results into a single Output (consuming self).
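For orientation, a sketch of the collect-then-merge flow (editorial aside; `out_a` and `out_b` are assumed `WorkerOutput` values returned by two dispatched Workers):

```rust
// Sketch: collect_result records the dispatch, merges evidence, and
// accumulates LLM-call totals; into_output then consumes the state.
let mut state = OrchestratorState::new();
state.collect_result(0, out_a);
state.collect_result(1, out_b);
assert_eq!(state.dispatched, vec![0, 1]);

let output = state.into_output("synthesized answer".to_string());
// Evidence from both Workers, metrics summed across sub-results.
println!("{} evidence pieces", output.evidence.len());
```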
- pub fn into_output(self, answer: String) -> Output { - let trace_steps = self.collect_trace_steps(); - Output { - answer, - evidence: self.all_evidence, - metrics: super::config::Metrics { - llm_calls: self.total_llm_calls, - nodes_visited: self - .sub_results - .iter() - .map(|r| r.metrics.nodes_visited) - .sum(), - plan_generated: self.sub_results.iter().any(|r| r.metrics.plan_generated), - check_count: self.sub_results.iter().map(|r| r.metrics.check_count).sum(), - evidence_chars: self - .sub_results - .iter() - .map(|r| r.metrics.evidence_chars) - .sum(), - ..Default::default() - }, - confidence: 0.0, - trace_steps, - } - } - - /// Collect trace steps from all sub-results. - fn collect_trace_steps(&self) -> Vec { - let mut steps = Vec::new(); - for result in &self.sub_results { - steps.extend(result.trace_steps.iter().cloned()); - } - steps - } -} - -impl Default for OrchestratorState { - fn default() -> Self { - Self::new() - } -} diff --git a/vectorless-core/vectorless/src/agent/tools/common.rs b/vectorless-core/vectorless/src/agent/tools/common.rs deleted file mode 100644 index 740510de..00000000 --- a/vectorless-core/vectorless/src/agent/tools/common.rs +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Common tools shared between Orchestrator and Worker (find, check, done). - -use super::ToolResult; - -/// Execute a `find` command — search for a keyword. -/// -/// Returns formatted search results as feedback text. -pub fn format_find_result(keyword: &str, hits: &[super::super::context::FindHit]) -> String { - if hits.is_empty() { - return format!("No results found for '{}'", keyword); - } - - let mut output = format!("Results for '{}':\n", keyword); - for hit in hits { - for entry in &hit.entries { - output.push_str(&format!( - " - node (depth {}, weight {:.2})\n", - entry.depth, entry.weight - )); - } - } - output -} - -/// Execute a `check` command — evaluate evidence sufficiency. -/// -/// Returns a formatted summary of current evidence for the LLM to evaluate. -pub fn format_check_prompt(evidence_summary: &str, query: &str) -> String { - format!( - "Please evaluate whether the collected evidence is sufficient to answer the query.\n\n\ - Query: {}\n\n\ - Evidence:\n{}\n\n\ - Is this sufficient? Answer YES or NO and briefly explain.", - query, evidence_summary - ) -} - -/// Execute a `done` command — signal loop termination. -pub fn format_done() -> ToolResult { - ToolResult::done("Navigation complete.") -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_format_find_result_empty() { - let result = format_find_result("nonexistent", &[]); - assert!(result.contains("No results")); - } - - #[test] - fn test_format_check_prompt() { - let prompt = format_check_prompt("- [Intro] 500 chars", "What is X?"); - assert!(prompt.contains("What is X?")); - assert!(prompt.contains("500 chars")); - } - - #[test] - fn test_format_done() { - let result = format_done(); - assert!(result.should_stop); - assert!(result.success); - } -} diff --git a/vectorless-core/vectorless/src/agent/tools/mod.rs b/vectorless-core/vectorless/src/agent/tools/mod.rs deleted file mode 100644 index c44c0021..00000000 --- a/vectorless-core/vectorless/src/agent/tools/mod.rs +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Tool definitions for the retrieval agent. -//! -//! Tools are organized by role: -//! 
- `common` — shared between Orchestrator and Worker (find, check, done)
-//! - `worker` — Worker-specific (ls, cd, cd_up, cat, pwd)
-//! - `orchestrator` — Orchestrator-specific (ls_docs, find_cross, dispatch)
-
-pub mod common;
-pub mod orchestrator;
-pub mod worker;
-
-/// Result of executing a tool command.
-#[derive(Debug, Clone)]
-pub struct ToolResult {
-    /// Text feedback to include in the next LLM prompt.
-    pub feedback: String,
-    /// Whether the loop should stop.
-    pub should_stop: bool,
-    /// Whether the command executed successfully.
-    pub success: bool,
-}
-
-impl ToolResult {
-    /// Create a successful result with feedback.
-    pub fn ok(feedback: impl Into<String>) -> Self {
-        Self {
-            feedback: feedback.into(),
-            should_stop: false,
-            success: true,
-        }
-    }
-
-    /// Create a result that signals loop termination.
-    pub fn done(feedback: impl Into<String>) -> Self {
-        Self {
-            feedback: feedback.into(),
-            should_stop: true,
-            success: true,
-        }
-    }
-
-    /// Create a failed result (parse error, invalid target, etc.).
-    pub fn fail(feedback: impl Into<String>) -> Self {
-        Self {
-            feedback: feedback.into(),
-            should_stop: false,
-            success: false,
-        }
-    }
-}
-
-/// Extract a content snippet around the first occurrence of `keyword`.
-///
-/// Returns `None` if the content is empty. If the keyword is not found,
-/// returns the beginning of the content instead.
-pub fn content_snippet(content: &str, keyword: &str, max_len: usize) -> Option<String> {
-    if content.trim().is_empty() {
-        return None;
-    }
-
-    let keyword_lower = keyword.to_lowercase();
-    let content_lower = content.to_lowercase();
-
-    let start = match content_lower.find(&keyword_lower) {
-        Some(pos) => {
-            let back = (max_len / 4).min(pos);
-            pos - back
-        }
-        None => 0,
-    };
-
-    let start = content
-        .char_indices()
-        .find(|(i, _)| *i >= start)
-        .map(|(i, _)| i)
-        .unwrap_or(0);
-
-    let end = content
-        .char_indices()
-        .take_while(|(i, _)| *i <= start + max_len)
-        .last()
-        .map(|(i, c)| i + c.len_utf8())
-        .unwrap_or(content.len());
-
-    let snippet = content[start..end].trim();
-    if snippet.is_empty() {
-        return None;
-    }
-
-    let mut result = snippet.to_string();
-    if end < content.len() {
-        result.push_str("...");
-    }
-    if start > 0 {
-        result = format!("...{}", result);
-    }
-    Some(result)
-}
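A quick sketch of `content_snippet`'s windowing, read off the code above: the snippet starts up to `max_len / 4` bytes before the hit and is ellipsized on whichever sides were truncated (editorial aside; example strings are hypothetical):

```rust
// Sketch: keyword found mid-content, so both sides get ellipses.
let text = "quarterly overview: total 2024 revenue grew 12 percent year over year";
let snip = content_snippet(text, "revenue", 20).unwrap();
assert!(snip.contains("revenue"));
assert!(snip.starts_with("...") && snip.ends_with("..."));

// Empty input yields None; a missing keyword falls back to the start.
assert!(content_snippet("   ", "x", 20).is_none());
assert!(content_snippet(text, "zzz", 20).unwrap().starts_with("quarterly"));
```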
diff --git a/vectorless-core/vectorless/src/agent/tools/orchestrator.rs b/vectorless-core/vectorless/src/agent/tools/orchestrator.rs
deleted file mode 100644
index 96dcb116..00000000
--- a/vectorless-core/vectorless/src/agent/tools/orchestrator.rs
+++ /dev/null
@@ -1,203 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Orchestrator tools: ls_docs, find_cross, dispatch.
-
-use super::ToolResult;
-use crate::agent::config::WorkspaceContext;
-
-/// Execute `ls_docs` — list all document cards.
-///
-/// Returns a formatted view of all DocCards for the Orchestrator's Bird's-Eye View.
-pub fn ls_docs(ctx: &WorkspaceContext) -> ToolResult {
-    let cards = ctx.doc_cards();
-
-    if cards.is_empty() {
-        return ToolResult::ok("No documents with DocCards available.");
-    }
-
-    let mut output = format!("Available documents ({} total):\n\n", ctx.doc_count());
-
-    for (idx, card) in &cards {
-        output.push_str(&format!(
-            "[{}] {} — {}\n",
-            idx + 1,
-            card.title,
-            card.overview
-        ));
-
-        for sec in &card.sections {
-            output.push_str(&format!(
-                "    → {} ({} leaves)\n",
-                sec.title, sec.leaf_count
-            ));
-        }
-
-        if !card.question_hints.is_empty() {
-            output.push_str(&format!(
-                "    Can answer: {}\n",
-                card.question_hints.join(", ")
-            ));
-        }
-
-        if !card.topic_tags.is_empty() {
-            output.push_str(&format!("    Topics: {}\n", card.topic_tags.join(", ")));
-        }
-
-        output.push('\n');
-    }
-
-    // Also mention docs without cards
-    let with_cards: Vec<usize> = cards.iter().map(|(idx, _)| *idx).collect();
-    let without_cards: Vec<usize> = (0..ctx.doc_count())
-        .filter(|i| !with_cards.contains(i))
-        .collect();
-
-    if !without_cards.is_empty() {
-        output.push_str(&format!(
-            "Documents without DocCards: {:?}\n",
-            without_cards
-                .iter()
-                .map(|i| format!("doc_{}", i))
-                .collect::<Vec<_>>()
-        ));
-    }
-
-    ToolResult::ok(output)
-}
-
-/// Execute `find_cross` — search keywords across all documents.
-///
-/// Returns formatted results showing which documents matched, with content snippets.
-pub fn find_cross(keywords: &[String], ctx: &WorkspaceContext) -> ToolResult {
-    let results = ctx.find_cross_all(keywords);
-
-    if results.is_empty() {
-        return ToolResult::ok(format!(
-            "No matches found for keywords: {}",
-            keywords.join(", ")
-        ));
-    }
-
-    let mut output = String::new();
-    for (doc_idx, hits) in &results {
-        let doc = ctx.doc(*doc_idx);
-        let doc_name = doc.map(|d| d.doc_name).unwrap_or("unknown");
-        output.push_str(&format!("Document [{}] {}:\n", doc_idx + 1, doc_name));
-
-        for hit in hits {
-            for entry in &hit.entries {
-                let title = doc
-                    .and_then(|d| d.node_title(entry.node_id))
-                    .unwrap_or("unknown");
-                output.push_str(&format!(
-                    "  keyword '{}' → {} (depth {}, weight {:.2})",
-                    hit.keyword, title, entry.depth, entry.weight
-                ));
-                // Include content snippet for cross-doc relevance judgment
-                if let Some(content) = doc.and_then(|d| d.cat(entry.node_id)) {
-                    if let Some(snippet) = super::content_snippet(content, &hit.keyword, 300) {
-                        output.push_str(&format!("\n    \"{}\"", snippet));
-                    }
-                }
-                output.push('\n');
-            }
-        }
-        output.push('\n');
-    }
-
-    ToolResult::ok(output)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::document::{DocCard, NavigationIndex, ReasoningIndex, SectionCard};
-
-    fn build_workspace() -> (
-        Vec<crate::document::DocumentTree>,
-        Vec<NavigationIndex>,
-        Vec<ReasoningIndex>,
-    ) {
-        let tree1 = crate::document::DocumentTree::new("2024 Report", "content");
-        let mut nav1 = NavigationIndex::new();
-        nav1.set_doc_card(DocCard {
-            title: "2024 Financial Report".to_string(),
-            overview: "Annual financial statements".to_string(),
-            question_hints: vec!["Revenue?".to_string()],
-            topic_tags: vec!["finance".to_string(), "2024".to_string()],
-            sections: vec![SectionCard {
-                title: "Revenue".to_string(),
-                description: "Revenue breakdown".to_string(),
-                leaf_count: 5,
-            }],
-            total_leaves: 10,
-        });
-
-        let tree2 = crate::document::DocumentTree::new("2023 Report", "content");
-        let mut nav2 = NavigationIndex::new();
-        nav2.set_doc_card(DocCard {
-            title: "2023 Financial Report".to_string(),
-            overview: "Previous year financial statements".to_string(),
-            question_hints: vec!["Sales?".to_string()],
-            topic_tags: vec!["finance".to_string(),
"2023".to_string()], - sections: vec![SectionCard { - title: "Net Sales".to_string(), - description: "Net sales figures".to_string(), - leaf_count: 4, - }], - total_leaves: 8, - }); - - ( - vec![tree1, tree2], - vec![nav1, nav2], - vec![ReasoningIndex::default(), ReasoningIndex::default()], - ) - } - - #[test] - fn test_ls_docs_shows_cards() { - let (trees, navs, ridxs) = build_workspace(); - let docs = vec![ - crate::agent::config::DocContext { - tree: &trees[0], - nav_index: &navs[0], - reasoning_index: &ridxs[0], - doc_name: "2024", - }, - crate::agent::config::DocContext { - tree: &trees[1], - nav_index: &navs[1], - reasoning_index: &ridxs[1], - doc_name: "2023", - }, - ]; - let ctx = WorkspaceContext::new(docs); - - let result = ls_docs(&ctx); - assert!(result.success); - assert!(result.feedback.contains("2024 Financial Report")); - assert!(result.feedback.contains("2023 Financial Report")); - assert!(result.feedback.contains("Revenue")); - assert!(result.feedback.contains("finance")); - } - - #[test] - fn test_ls_docs_empty() { - let tree = crate::document::DocumentTree::new("Empty", ""); - let nav = NavigationIndex::new(); - let ridx = ReasoningIndex::default(); - let docs = vec![crate::agent::config::DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &ridx, - doc_name: "empty", - }]; - let ctx = WorkspaceContext::new(docs); - - let result = ls_docs(&ctx); - assert!(result.success); - assert!(result.feedback.contains("No documents with DocCards")); - } -} diff --git a/vectorless-core/vectorless/src/agent/tools/worker/cat.rs b/vectorless-core/vectorless/src/agent/tools/worker/cat.rs deleted file mode 100644 index 107aafa8..00000000 --- a/vectorless-core/vectorless/src/agent/tools/worker/cat.rs +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! `cat` — read node content and collect as evidence. - -use crate::agent::command; -use crate::agent::config::{DocContext, Evidence}; -use crate::agent::state::WorkerState; - -use super::super::ToolResult; - -/// Execute `cat ` — read node content and collect as evidence. -/// -/// Special targets: -/// - `cat .` or `cat` (no arg) reads the current node's content. -/// - Otherwise resolves the target to a child node by name. -pub fn cat(target: &str, ctx: &DocContext, state: &mut WorkerState) -> ToolResult { - let node_id = if target == "." || target.is_empty() { - state.current_node - } else { - match command::resolve_target_extended(target, ctx.nav_index, state.current_node, ctx.tree) - { - Some(id) => id, - None => { - return ToolResult::fail(format!( - "Target '{}' not found. Use 'ls' to see children, or 'cat .' to read current node.", - target - )); - } - } - }; - - if state.has_evidence_for(node_id) { - let title = ctx.node_title(node_id).unwrap_or("unknown"); - return ToolResult::ok(format!( - "[Already collected: {}]. 
Use a different target or cd to another branch.", - title - )); - } - - match ctx.cat(node_id) { - Some(content) => { - let title = ctx.node_title(node_id).unwrap_or("unknown").to_string(); - let content_string = content.to_string(); - - state.add_evidence(Evidence { - source_path: format!("{}/{}", state.path_str(), title), - node_title: title.clone(), - content: content_string.clone(), - doc_name: Some(ctx.doc_name.to_string()), - }); - - state.collected_nodes.insert(node_id); - state.visited.insert(node_id); - - ToolResult::ok(format!( - "[Evidence collected: {}]\n{}", - title, content_string - )) - } - None => ToolResult::fail(format!("No content available for '{}'.", target)), - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; - - fn build_test_tree() -> (DocumentTree, NavigationIndex, NodeId, NodeId, NodeId) { - let mut tree = DocumentTree::new("Root", "root content"); - let root = tree.root(); - let c1 = tree.add_child(root, "Getting Started", "gs content"); - let c2 = tree.add_child(root, "API Reference", "api content"); - - let mut nav = NavigationIndex::new(); - nav.add_child_routes( - root, - vec![ - ChildRoute { - node_id: c1, - title: "Getting Started".to_string(), - description: "Setup guide".to_string(), - leaf_count: 3, - }, - ChildRoute { - node_id: c2, - title: "API Reference".to_string(), - description: "API docs".to_string(), - leaf_count: 7, - }, - ], - ); - - (tree, nav, root, c1, c2) - } - - #[test] - fn test_cat_collects_evidence() { - let (tree, nav, root, _, _) = build_test_tree(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - let mut state = WorkerState::new(root, 15); - - let result = cat("Getting Started", &ctx, &mut state); - assert!(result.success); - assert!(result.feedback.contains("Evidence collected")); - assert_eq!(state.evidence.len(), 1); - assert_eq!(state.evidence[0].content, "gs content"); - } -} diff --git a/vectorless-core/vectorless/src/agent/tools/worker/cd.rs b/vectorless-core/vectorless/src/agent/tools/worker/cd.rs deleted file mode 100644 index 8d874832..00000000 --- a/vectorless-core/vectorless/src/agent/tools/worker/cd.rs +++ /dev/null @@ -1,262 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! `cd`, `cd_absolute`, `cd_up` — navigation commands. - -use crate::agent::command; -use crate::agent::config::DocContext; -use crate::agent::state::WorkerState; - -use super::super::ToolResult; - -/// Execute `cd ` — navigate into a child node. -/// -/// Supports: -/// - Relative names (child of current node): `cd "Getting Started"` -/// - Relative paths with `/`: `cd "Research Labs/Lab B"` -/// - Absolute paths starting with `/`: `cd /root/Chapter 1/Section 1.2` -pub fn cd(target: &str, ctx: &DocContext, state: &mut WorkerState) -> ToolResult { - if target.starts_with('/') { - return cd_absolute(target, ctx, state); - } - - // Relative path with segments: "Research Labs/Lab B" - if target.contains('/') { - return cd_relative_path(target, ctx, state); - } - - match command::resolve_target_extended(target, ctx.nav_index, state.current_node, ctx.tree) { - Some(node_id) => { - let title = ctx.node_title(node_id).unwrap_or(target).to_string(); - state.cd(node_id, &title); - ToolResult::ok(format!("Entered: {}", state.path_str())) - } - None => ToolResult::fail(format!( - "Target '{}' not found. 
Use ls to see available children.", - target - )), - } -} - -/// Navigate using a relative multi-segment path (e.g., `"Research Labs/Lab B"`). -fn cd_relative_path(path: &str, ctx: &DocContext, state: &mut WorkerState) -> ToolResult { - let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect(); - if segments.is_empty() { - return ToolResult::fail("Empty path.".to_string()); - } - - let mut current = state.current_node; - let mut breadcrumb = state.breadcrumb.clone(); - - for segment in &segments { - match command::resolve_target_extended(segment, ctx.nav_index, current, ctx.tree) { - Some(node_id) => { - let title = ctx.node_title(node_id).unwrap_or(*segment).to_string(); - breadcrumb.push(title); - current = node_id; - } - None => { - return ToolResult::fail(format!( - "Path segment '{}' not found at '/{}'. Use ls to see available children.", - segment, - breadcrumb.join("/") - )); - } - } - } - - state.breadcrumb = breadcrumb; - state.current_node = current; - state.visited.insert(current); - - ToolResult::ok(format!("Entered: {}", state.path_str())) -} - -/// Navigate using an absolute path (e.g., `/root/Chapter 1/Section 1.2`). -fn cd_absolute(path: &str, ctx: &DocContext, state: &mut WorkerState) -> ToolResult { - let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect(); - - if segments.is_empty() { - return ToolResult::fail("Empty absolute path.".to_string()); - } - - let root = ctx.root(); - let mut current = root; - - let start_idx = if !segments.is_empty() && segments[0].eq_ignore_ascii_case("root") { - 1 - } else { - 0 - }; - - let mut breadcrumb = vec!["root".to_string()]; - - for segment in &segments[start_idx..] { - match command::resolve_target_extended(segment, ctx.nav_index, current, ctx.tree) { - Some(node_id) => { - let title = ctx.node_title(node_id).unwrap_or(*segment).to_string(); - breadcrumb.push(title); - current = node_id; - } - None => { - return ToolResult::fail(format!( - "Path segment '{}' not found. Stopped at: /{}", - segment, - breadcrumb.join("/") - )); - } - } - } - - state.breadcrumb = breadcrumb; - state.current_node = current; - state.visited.insert(current); - - ToolResult::ok(format!("Entered: {}", state.path_str())) -} - -/// Execute `cd ..` — navigate back to parent. 
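Before `cd_up`, an editorial aside: a sketch of the three target forms `cd` accepts, against the same Root → "Research Labs" → "Lab B" fixture the tests further down use (`ctx` assumed built as in those tests):

```rust
// Sketch: relative name, multi-segment relative path, absolute path.
let mut state = WorkerState::new(root, 15);
assert!(cd("Research Labs", &ctx, &mut state).success);

state = WorkerState::new(root, 15);
assert!(cd("Research Labs/Lab B", &ctx, &mut state).success);
assert_eq!(state.path_str(), "root/Research Labs/Lab B");

// Absolute paths resolve from the document root regardless of position.
assert!(cd("/root/Research Labs/Lab B", &ctx, &mut state).success);
assert_eq!(state.path_str(), "root/Research Labs/Lab B");
```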
-pub fn cd_up(ctx: &DocContext, state: &mut WorkerState) -> ToolResult { - match ctx.parent(state.current_node) { - Some(parent) => { - if state.cd_up(parent) { - ToolResult::ok(format!("Back to: {}", state.path_str())) - } else { - ToolResult::ok("Already at root.".to_string()) - } - } - None => ToolResult::ok("Already at root (no parent).".to_string()), - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; - - fn build_test_tree() -> (DocumentTree, NavigationIndex, NodeId, NodeId, NodeId) { - let mut tree = DocumentTree::new("Root", "root content"); - let root = tree.root(); - let c1 = tree.add_child(root, "Getting Started", "gs content"); - let c2 = tree.add_child(root, "API Reference", "api content"); - - let mut nav = NavigationIndex::new(); - nav.add_child_routes( - root, - vec![ - ChildRoute { - node_id: c1, - title: "Getting Started".to_string(), - description: "Setup guide".to_string(), - leaf_count: 3, - }, - ChildRoute { - node_id: c2, - title: "API Reference".to_string(), - description: "API docs".to_string(), - leaf_count: 7, - }, - ], - ); - - (tree, nav, root, c1, c2) - } - - #[test] - fn test_cd_navigates() { - let (tree, nav, root, c1, _) = build_test_tree(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - let mut state = WorkerState::new(root, 15); - - let result = cd("Getting Started", &ctx, &mut state); - assert!(result.success); - assert_eq!(state.current_node, c1); - assert!(state.path_str().contains("Getting Started")); - } - - #[test] - fn test_cd_up_goes_back() { - let (tree, nav, root, _c1, _) = build_test_tree(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - let mut state = WorkerState::new(root, 15); - - cd("Getting Started", &ctx, &mut state); - let result = cd_up(&ctx, &mut state); - assert!(result.success); - assert_eq!(state.current_node, root); - } - - fn build_deep_tree() -> (DocumentTree, NavigationIndex, NodeId, NodeId, NodeId) { - // Root → "Research Labs" → "Lab B" - let mut tree = DocumentTree::new("Root", "root content"); - let root = tree.root(); - let section = tree.add_child(root, "Research Labs", "section content"); - let lab_b = tree.add_child(section, "Lab B", "lab b content"); - - let mut nav = NavigationIndex::new(); - nav.add_child_routes( - root, - vec![ChildRoute { - node_id: section, - title: "Research Labs".to_string(), - description: "Lab sections".to_string(), - leaf_count: 4, - }], - ); - nav.add_child_routes( - section, - vec![ChildRoute { - node_id: lab_b, - title: "Lab B".to_string(), - description: "Topological qubits".to_string(), - leaf_count: 1, - }], - ); - - (tree, nav, root, section, lab_b) - } - - #[test] - fn test_cd_relative_path() { - let (tree, nav, root, _, lab_b) = build_deep_tree(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - let mut state = WorkerState::new(root, 15); - - let result = cd("Research Labs/Lab B", &ctx, &mut state); - assert!(result.success); - assert_eq!(state.current_node, lab_b); - assert!(state.path_str().contains("Research Labs")); - assert!(state.path_str().contains("Lab B")); - } - - #[test] - fn test_cd_relative_path_partial_fail() { - let (tree, nav, root, _, _) = build_deep_tree(); - let ctx = DocContext { - tree: 
&tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - let mut state = WorkerState::new(root, 15); - - let result = cd("Research Labs/Nonexistent", &ctx, &mut state); - assert!(!result.success); - assert!(result.feedback.contains("Nonexistent")); - } -} diff --git a/vectorless-core/vectorless/src/agent/tools/worker/find.rs b/vectorless-core/vectorless/src/agent/tools/worker/find.rs deleted file mode 100644 index 47912b01..00000000 --- a/vectorless-core/vectorless/src/agent/tools/worker/find.rs +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! `find_tree` — search for nodes by title pattern across the entire tree. - -use crate::agent::config::DocContext; - -use super::super::ToolResult; - -/// Execute `findtree ` — search for nodes by title pattern across the entire tree. -/// -/// Returns all nodes whose title contains the pattern (case-insensitive). -pub fn find_tree(pattern: &str, ctx: &DocContext) -> ToolResult { - let pattern_lower = pattern.to_lowercase(); - let all_nodes = ctx.tree.traverse(); - - let mut results = Vec::new(); - for node_id in &all_nodes { - if let Some(node) = ctx.tree.get(*node_id) { - if node.title.to_lowercase().contains(&pattern_lower) { - let depth = ctx.tree.depth(*node_id); - let leaf_count = ctx.nav_entry(*node_id).map(|e| e.leaf_count).unwrap_or(0); - results.push((node.title.clone(), depth, leaf_count)); - } - } - } - - if results.is_empty() { - return ToolResult::ok(format!("No nodes matching '{}'.", pattern)); - } - - let mut output = format!("Nodes matching '{}' ({} found):\n", pattern, results.len()); - for (title, depth, leaves) in &results { - output.push_str(&format!( - " - {} (depth {}, {} leaves)\n", - title, depth, leaves - )); - } - - ToolResult::ok(output) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::agent::config::DocContext; - use crate::document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; - - fn build_rich_tree() -> (DocumentTree, NavigationIndex, NodeId) { - let mut tree = DocumentTree::new( - "Root", - "Welcome to the financial report.\nThis document covers 2024 and 2023 figures.", - ); - let root = tree.root(); - let c1 = tree.add_child( - root, - "Revenue", - "Total revenue in 2024 was $10.2M.\nQ1 revenue: $2.5M\nQ2 revenue: $2.8M\nEBITDA margin: 32%", - ); - let c2 = tree.add_child( - root, - "Expenses", - "Operating expenses totaled $6.8M.\nR&D spending: $3.1M\nMarketing: $1.2M", - ); - - let mut nav = NavigationIndex::new(); - nav.add_child_routes( - root, - vec![ - ChildRoute { - node_id: c1, - title: "Revenue".to_string(), - description: "Revenue breakdown".to_string(), - leaf_count: 2, - }, - ChildRoute { - node_id: c2, - title: "Expenses".to_string(), - description: "Cost analysis".to_string(), - leaf_count: 2, - }, - ], - ); - - (tree, nav, root) - } - - macro_rules! 
rich_ctx { - ($tree:expr, $nav:expr) => { - DocContext { - tree: &$tree, - nav_index: &$nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - } - }; - } - - #[test] - fn test_find_tree() { - let (tree, nav, _root) = build_rich_tree(); - let ctx = rich_ctx!(tree, nav); - - let result = find_tree("revenue", &ctx); - assert!(result.success); - assert!(result.feedback.contains("Revenue")); - } - - #[test] - fn test_find_tree_case_insensitive() { - let (tree, nav, _root) = build_rich_tree(); - let ctx = rich_ctx!(tree, nav); - - let result = find_tree("EXPENSE", &ctx); - assert!(result.success); - assert!(result.feedback.contains("Expenses")); - } - - #[test] - fn test_find_tree_no_match() { - let (tree, nav, _root) = build_rich_tree(); - let ctx = rich_ctx!(tree, nav); - - let result = find_tree("nonexistent_xyz", &ctx); - assert!(result.success); - assert!(result.feedback.contains("No nodes matching")); - } -} diff --git a/vectorless-core/vectorless/src/agent/tools/worker/grep.rs b/vectorless-core/vectorless/src/agent/tools/worker/grep.rs deleted file mode 100644 index 077b077e..00000000 --- a/vectorless-core/vectorless/src/agent/tools/worker/grep.rs +++ /dev/null @@ -1,175 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! `grep` — regex search across all node content in the current subtree. - -use crate::agent::config::DocContext; -use crate::agent::state::WorkerState; - -use super::super::ToolResult; -use super::collect_subtree; - -/// Execute `grep ` — regex search across all node content in the current subtree. -/// -/// Searches content of the current node and all descendants. Returns matching lines -/// with their node titles, capped at 30 matches to avoid overwhelming feedback. -pub fn grep(pattern: &str, ctx: &DocContext, state: &WorkerState) -> ToolResult { - let re = match regex::Regex::new(pattern) { - Ok(re) => re, - Err(e) => return ToolResult::fail(format!("Invalid regex '{}': {}", pattern, e)), - }; - - let subtree = collect_subtree(state.current_node, ctx.tree); - let mut matches_found = 0; - let mut output = String::new(); - let max_matches = 30; - - for node_id in &subtree { - if matches_found >= max_matches { - output.push_str("\n... 
(truncated, more matches available)"); - break; - } - - let content = match ctx.cat(*node_id) { - Some(c) if !c.is_empty() => c, - _ => continue, - }; - - let title = ctx.node_title(*node_id).unwrap_or("?"); - - for line in content.lines() { - if matches_found >= max_matches { - break; - } - if re.is_match(line) { - output.push_str(&format!("[{}] {}\n", title, line)); - matches_found += 1; - } - } - } - - if matches_found == 0 { - ToolResult::ok(format!("No matches for /{}/ in subtree.", pattern)) - } else { - ToolResult::ok(format!( - "Found {} match(es) for /{}/:\n{}", - matches_found, pattern, output - )) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::agent::config::DocContext; - use crate::agent::state::WorkerState; - use crate::document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; - - fn build_rich_tree() -> (DocumentTree, NavigationIndex, NodeId) { - let mut tree = DocumentTree::new( - "Root", - "Welcome to the financial report.\nThis document covers 2024 and 2023 figures.", - ); - let root = tree.root(); - let c1 = tree.add_child( - root, - "Revenue", - "Total revenue in 2024 was $10.2M.\nQ1 revenue: $2.5M\nQ2 revenue: $2.8M\nEBITDA margin: 32%", - ); - let c2 = tree.add_child( - root, - "Expenses", - "Operating expenses totaled $6.8M.\nR&D spending: $3.1M\nMarketing: $1.2M", - ); - - let mut nav = NavigationIndex::new(); - nav.add_child_routes( - root, - vec![ - ChildRoute { - node_id: c1, - title: "Revenue".to_string(), - description: "Revenue breakdown".to_string(), - leaf_count: 2, - }, - ChildRoute { - node_id: c2, - title: "Expenses".to_string(), - description: "Cost analysis".to_string(), - leaf_count: 2, - }, - ], - ); - - (tree, nav, root) - } - - macro_rules! rich_ctx { - ($tree:expr, $nav:expr) => { - DocContext { - tree: &$tree, - nav_index: &$nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - } - }; - } - - #[test] - fn test_grep_finds_matches() { - let (tree, nav, root) = build_rich_tree(); - let ctx = rich_ctx!(tree, nav); - let state = WorkerState::new(root, 15); - - let result = grep("revenue", &ctx, &state); - assert!(result.success); - assert!(result.feedback.contains("revenue")); - assert!(result.feedback.contains("[Revenue]")); - } - - #[test] - fn test_grep_regex() { - let (tree, nav, root) = build_rich_tree(); - let ctx = rich_ctx!(tree, nav); - let state = WorkerState::new(root, 15); - - let result = grep("EBITDA|\\$\\d+", &ctx, &state); - assert!(result.success); - assert!(result.feedback.contains("EBITDA")); - assert!(result.feedback.contains("$10")); - } - - #[test] - fn test_grep_no_matches() { - let (tree, nav, root) = build_rich_tree(); - let ctx = rich_ctx!(tree, nav); - let state = WorkerState::new(root, 15); - - let result = grep("nonexistent_term_xyz", &ctx, &state); - assert!(result.success); - assert!(result.feedback.contains("No matches")); - } - - #[test] - fn test_grep_invalid_regex() { - let (tree, nav, root) = build_rich_tree(); - let ctx = rich_ctx!(tree, nav); - let state = WorkerState::new(root, 15); - - let result = grep("[invalid", &ctx, &state); - assert!(!result.success); - assert!(result.feedback.contains("Invalid regex")); - } - - #[test] - fn test_grep_subtree_only() { - let (tree, nav, root) = build_rich_tree(); - let ctx = rich_ctx!(tree, nav); - let mut state = WorkerState::new(root, 15); - - crate::agent::tools::worker::cd::cd("Expenses", &ctx, &mut state); - let result = grep("revenue", &ctx, &state); - assert!(result.success); - 
assert!(result.feedback.contains("No matches")); - } -} diff --git a/vectorless-core/vectorless/src/agent/tools/worker/head.rs b/vectorless-core/vectorless/src/agent/tools/worker/head.rs deleted file mode 100644 index 5fefa234..00000000 --- a/vectorless-core/vectorless/src/agent/tools/worker/head.rs +++ /dev/null @@ -1,119 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! `head` — preview first N lines of a node without collecting evidence. - -use crate::agent::command; -use crate::agent::config::DocContext; -use crate::agent::state::WorkerState; - -use super::super::ToolResult; - -/// Execute `head ` — preview first N lines of a node without collecting evidence. -pub fn head(target: &str, lines: usize, ctx: &DocContext, state: &WorkerState) -> ToolResult { - let node_id = - match command::resolve_target_extended(target, ctx.nav_index, state.current_node, ctx.tree) - { - Some(id) => id, - None => { - return ToolResult::fail(format!( - "Target '{}' not found. Use ls to see available children.", - target - )); - } - }; - - let content = match ctx.cat(node_id) { - Some(c) => c, - None => return ToolResult::fail(format!("No content for '{}'.", target)), - }; - - let title = ctx.node_title(node_id).unwrap_or("unknown"); - let total_lines = content.lines().count(); - let preview: Vec<&str> = content.lines().take(lines).collect(); - - let mut output = format!( - "[Preview: {} — showing {}/{} lines]\n", - title, - preview.len().min(lines), - total_lines - ); - output.push_str(&preview.join("\n")); - - if total_lines > lines { - output.push_str(&format!( - "\n... ({} more lines, use cat to read all)", - total_lines - lines - )); - } - - ToolResult::ok(output) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::agent::config::DocContext; - use crate::agent::state::WorkerState; - use crate::document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; - - fn build_rich_tree() -> (DocumentTree, NavigationIndex, NodeId) { - let mut tree = DocumentTree::new( - "Root", - "Welcome to the financial report.\nThis document covers 2024 and 2023 figures.", - ); - let root = tree.root(); - let c1 = tree.add_child( - root, - "Revenue", - "Total revenue in 2024 was $10.2M.\nQ1 revenue: $2.5M\nQ2 revenue: $2.8M\nEBITDA margin: 32%", - ); - - let mut nav = NavigationIndex::new(); - nav.add_child_routes( - root, - vec![ChildRoute { - node_id: c1, - title: "Revenue".to_string(), - description: "Revenue breakdown".to_string(), - leaf_count: 2, - }], - ); - - (tree, nav, root) - } - - macro_rules! 
rich_ctx { - ($tree:expr, $nav:expr) => { - DocContext { - tree: &$tree, - nav_index: &$nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - } - }; - } - - #[test] - fn test_head_preview() { - let (tree, nav, root) = build_rich_tree(); - let ctx = rich_ctx!(tree, nav); - let state = WorkerState::new(root, 15); - - let result = head("Revenue", 2, &ctx, &state); - assert!(result.success); - assert!(result.feedback.contains("Preview")); - assert!(result.feedback.contains("$10.2M")); - assert!(result.feedback.contains("2/4 lines")); - } - - #[test] - fn test_head_not_found() { - let (tree, nav, root) = build_rich_tree(); - let ctx = rich_ctx!(tree, nav); - let state = WorkerState::new(root, 15); - - let result = head("NonExistent", 10, &ctx, &state); - assert!(!result.success); - } -} diff --git a/vectorless-core/vectorless/src/agent/tools/worker/ls.rs b/vectorless-core/vectorless/src/agent/tools/worker/ls.rs deleted file mode 100644 index a00d688e..00000000 --- a/vectorless-core/vectorless/src/agent/tools/worker/ls.rs +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! `ls` — list children of the current node. - -use crate::agent::config::DocContext; -use crate::agent::state::WorkerState; - -use super::super::ToolResult; - -/// Execute `ls` — list children of the current node. -pub fn ls(ctx: &DocContext, state: &WorkerState) -> ToolResult { - let mut output = String::new(); - - if let Some(entry) = ctx.nav_entry(state.current_node) { - output.push_str(&format!("Current section: {}\n", entry.overview)); - if !entry.question_hints.is_empty() { - output.push_str(&format!( - "Can answer: {}\n", - entry.question_hints.join(", ") - )); - } - output.push('\n'); - } - - match ctx.ls(state.current_node) { - Some(routes) => { - if routes.is_empty() { - output - .push_str("(leaf node — no children)\nUse cd .. to go back or done to finish."); - return ToolResult::ok(output); - } - - for (i, route) in routes.iter().enumerate() { - if route.title == route.description { - output.push_str(&format!( - "[{}] {} ({} leaves)", - i + 1, - route.title, - route.leaf_count - )); - } else { - output.push_str(&format!( - "[{}] {} — {} ({} leaves)", - i + 1, - route.title, - route.description, - route.leaf_count - )); - } - if let Some(nav) = ctx.nav_entry(route.node_id) { - if !nav.question_hints.is_empty() { - output.push_str(&format!( - "\n Can answer: {}", - nav.question_hints.join(", ") - )); - } - if !nav.topic_tags.is_empty() { - output.push_str(&format!("\n Topics: {}", nav.topic_tags.join(", "))); - } - } - output.push('\n'); - } - ToolResult::ok(output) - } - None => { - output.push_str( - "(no navigation data for this node)\nUse cat to read content or cd .. 
to go back.", - ); - ToolResult::ok(output) - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; - - fn build_test_tree() -> (DocumentTree, NavigationIndex, NodeId, NodeId, NodeId) { - let mut tree = DocumentTree::new("Root", "root content"); - let root = tree.root(); - let c1 = tree.add_child(root, "Getting Started", "gs content"); - let c2 = tree.add_child(root, "API Reference", "api content"); - - let mut nav = NavigationIndex::new(); - nav.add_child_routes( - root, - vec![ - ChildRoute { - node_id: c1, - title: "Getting Started".to_string(), - description: "Setup guide".to_string(), - leaf_count: 3, - }, - ChildRoute { - node_id: c2, - title: "API Reference".to_string(), - description: "API docs".to_string(), - leaf_count: 7, - }, - ], - ); - - (tree, nav, root, c1, c2) - } - - #[test] - fn test_ls_shows_children() { - let (tree, nav, root, _, _) = build_test_tree(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - let state = WorkerState::new(root, 15); - - let result = ls(&ctx, &state); - assert!(result.success); - assert!(result.feedback.contains("Getting Started")); - assert!(result.feedback.contains("API Reference")); - } -} diff --git a/vectorless-core/vectorless/src/agent/tools/worker/mod.rs b/vectorless-core/vectorless/src/agent/tools/worker/mod.rs deleted file mode 100644 index eb73d34f..00000000 --- a/vectorless-core/vectorless/src/agent/tools/worker/mod.rs +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Worker tools: ls, cd, cd_up, cat, pwd, grep, head, find_tree, wc. - -mod cat; -mod cd; -mod find; -mod grep; -mod head; -mod ls; -mod pwd; -mod wc; - -pub use cat::cat; -pub use cd::{cd, cd_up}; -pub use find::find_tree; -pub use grep::grep; -pub use head::head; -pub use ls::ls; -pub use pwd::pwd; -pub use wc::wc; - -use crate::document::{DocumentTree, NodeId}; - -/// Collect all NodeIds in the subtree rooted at `node` (inclusive). -pub(super) fn collect_subtree(node: NodeId, tree: &DocumentTree) -> Vec { - let mut result = vec![node]; - let mut stack = vec![node]; - - while let Some(current) = stack.pop() { - for child in tree.children_iter(current) { - result.push(child); - stack.push(child); - } - } - - result -} diff --git a/vectorless-core/vectorless/src/agent/tools/worker/pwd.rs b/vectorless-core/vectorless/src/agent/tools/worker/pwd.rs deleted file mode 100644 index 7adcf084..00000000 --- a/vectorless-core/vectorless/src/agent/tools/worker/pwd.rs +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! `pwd` — show current navigation path. - -use crate::agent::state::WorkerState; - -use super::super::ToolResult; - -/// Execute `pwd` — show current navigation path. 
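The `collect_subtree` helper above walks with an explicit stack rather than recursion; the only ordering guarantee is that the requested node comes first. A small sketch (editorial aside, hypothetical three-node tree):

```rust
// Sketch: three-node chain Root → A → B.
let mut tree = DocumentTree::new("Root", "");
let root = tree.root();
let a = tree.add_child(root, "A", "");
let b = tree.add_child(a, "B", "");

let nodes = collect_subtree(root, &tree);
assert_eq!(nodes.len(), 3);
assert_eq!(nodes[0], root); // requested node always first
assert!(nodes.contains(&a) && nodes.contains(&b));
```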
-pub fn pwd(state: &WorkerState) -> ToolResult { - ToolResult::ok(format!("Current path: {}", state.path_str())) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::agent::config::DocContext; - use crate::agent::tools::worker::cd::cd; - use crate::document::{ChildRoute, DocumentTree, NavigationIndex}; - - fn build_test_tree() -> (DocumentTree, NavigationIndex) { - let mut tree = DocumentTree::new("Root", "root content"); - let root = tree.root(); - let c1 = tree.add_child(root, "API Reference", "api content"); - - let mut nav = NavigationIndex::new(); - nav.add_child_routes( - root, - vec![ChildRoute { - node_id: c1, - title: "API Reference".to_string(), - description: "API docs".to_string(), - leaf_count: 7, - }], - ); - - (tree, nav) - } - - #[test] - fn test_pwd() { - let (tree, nav) = build_test_tree(); - let root = tree.root(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - let mut state = WorkerState::new(root, 15); - cd("API Reference", &ctx, &mut state); - - let result = pwd(&state); - assert!(result.success); - assert!(result.feedback.contains("API Reference")); - } -} diff --git a/vectorless-core/vectorless/src/agent/tools/worker/wc.rs b/vectorless-core/vectorless/src/agent/tools/worker/wc.rs deleted file mode 100644 index 4ea7ec01..00000000 --- a/vectorless-core/vectorless/src/agent/tools/worker/wc.rs +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! `wc` — show node content statistics. - -use crate::agent::command; -use crate::agent::config::DocContext; -use crate::agent::state::WorkerState; - -use super::super::ToolResult; - -/// Execute `wc ` — show node content statistics. -pub fn wc(target: &str, ctx: &DocContext, state: &WorkerState) -> ToolResult { - let node_id = - match command::resolve_target_extended(target, ctx.nav_index, state.current_node, ctx.tree) - { - Some(id) => id, - None => { - return ToolResult::fail(format!( - "Target '{}' not found. Use ls to see available children.", - target - )); - } - }; - - let content = match ctx.cat(node_id) { - Some(c) => c, - None => return ToolResult::fail(format!("No content for '{}'.", target)), - }; - - let title = ctx.node_title(node_id).unwrap_or("unknown"); - let lines = content.lines().count(); - let words = content.split_whitespace().count(); - let chars = content.len(); - - ToolResult::ok(format!( - "[{}] {} lines, {} words, {} chars", - title, lines, words, chars - )) -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::agent::config::DocContext; - use crate::agent::state::WorkerState; - use crate::document::{ChildRoute, DocumentTree, NavigationIndex, NodeId}; - - fn build_rich_tree() -> (DocumentTree, NavigationIndex, NodeId) { - let mut tree = DocumentTree::new( - "Root", - "Welcome to the financial report.\nThis document covers 2024 and 2023 figures.", - ); - let root = tree.root(); - let c1 = tree.add_child( - root, - "Revenue", - "Total revenue in 2024 was $10.2M.\nQ1 revenue: $2.5M\nQ2 revenue: $2.8M\nEBITDA margin: 32%", - ); - - let mut nav = NavigationIndex::new(); - nav.add_child_routes( - root, - vec![ChildRoute { - node_id: c1, - title: "Revenue".to_string(), - description: "Revenue breakdown".to_string(), - leaf_count: 2, - }], - ); - - (tree, nav, root) - } - - macro_rules! 
rich_ctx { - ($tree:expr, $nav:expr) => { - DocContext { - tree: &$tree, - nav_index: &$nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - } - }; - } - - #[test] - fn test_wc_stats() { - let (tree, nav, root) = build_rich_tree(); - let ctx = rich_ctx!(tree, nav); - let state = WorkerState::new(root, 15); - - let result = wc("Revenue", &ctx, &state); - assert!(result.success); - assert!(result.feedback.contains("Revenue")); - assert!(result.feedback.contains("lines")); - assert!(result.feedback.contains("words")); - assert!(result.feedback.contains("chars")); - } - - #[test] - fn test_wc_not_found() { - let (tree, nav, root) = build_rich_tree(); - let ctx = rich_ctx!(tree, nav); - let state = WorkerState::new(root, 15); - - let result = wc("NonExistent", &ctx, &state); - assert!(!result.success); - } -} diff --git a/vectorless-core/vectorless/src/agent/worker/execute.rs b/vectorless-core/vectorless/src/agent/worker/execute.rs deleted file mode 100644 index 66c40250..00000000 --- a/vectorless-core/vectorless/src/agent/worker/execute.rs +++ /dev/null @@ -1,278 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Command execution — dispatch parsed Command to tool functions. - -use tracing::{info, warn}; - -use crate::llm::LlmClient; - -use super::super::command::{Command, parse_command}; -use super::super::config::{DocContext, Step}; -use super::super::events::EventEmitter; -use super::super::prompts::{check_sufficiency, parse_sufficiency_response}; -use super::super::state::WorkerState; -use super::super::tools::worker as tools; - -/// Execute a single parsed command, mutating state. -/// -/// Returns a `Step` indicating whether to continue or stop. -pub async fn execute_command( - command: &Command, - ctx: &DocContext<'_>, - state: &mut WorkerState, - query: &str, - llm: &LlmClient, - llm_calls: &mut u32, - emitter: &EventEmitter, -) -> Step { - info!( - doc = ctx.doc_name, - command = ?command, - "Executing tool" - ); - match command { - Command::Ls => { - let result = tools::ls(ctx, state); - info!(doc = ctx.doc_name, feedback = %truncate_log(&result.feedback), "ls result"); - state.set_feedback(result.feedback); - Step::Continue - } - - Command::Cd { target } => { - let result = tools::cd(target, ctx, state); - info!(doc = ctx.doc_name, target, feedback = %truncate_log(&result.feedback), "cd result"); - state.set_feedback(result.feedback); - Step::Continue - } - - Command::CdUp => { - let result = tools::cd_up(ctx, state); - info!(doc = ctx.doc_name, feedback = %truncate_log(&result.feedback), "cd_up result"); - state.set_feedback(result.feedback); - Step::Continue - } - - Command::Cat { target } => { - let evidence_before = state.evidence.len(); - let result = tools::cat(target, ctx, state); - info!(doc = ctx.doc_name, target, feedback = %truncate_log(&result.feedback), "cat result"); - state.set_feedback(result.feedback); - if state.evidence.len() > evidence_before { - if let Some(ev) = state.evidence.last() { - info!( - doc = ctx.doc_name, - node = %ev.node_title, - path = %ev.source_path, - len = ev.content.len(), - total = state.evidence.len(), - "Evidence collected" - ); - emitter.emit_evidence( - ctx.doc_name, - &ev.node_title, - &ev.source_path, - ev.content.len(), - state.evidence.len(), - ); - } - } - Step::Continue - } - - Command::Find { keyword } => { - let feedback = match ctx.find(keyword) { - Some(hit) => { - let mut entries = hit.entries.clone(); - entries.sort_by(|a, b| { - b.weight - 
.partial_cmp(&a.weight) - .unwrap_or(std::cmp::Ordering::Equal) - }); - let mut seen_nodes = std::collections::HashSet::new(); - let mut output = format!("Results for '{}':\n", keyword); - for entry in &entries { - if !seen_nodes.insert(entry.node_id) { - continue; - } - let title = ctx.node_title(entry.node_id).unwrap_or("unknown"); - output.push_str(&format!( - " - {} (depth {}, weight {:.2})", - title, entry.depth, entry.weight - )); - if let Some(content) = ctx.cat(entry.node_id) { - if let Some(snippet) = - super::super::tools::content_snippet(content, keyword, 300) - { - output.push_str(&format!("\n \"{}\"", snippet)); - } - } - output.push('\n'); - } - output - } - None => { - // Fallback: search node titles (like findtree) with content snippets - let pattern_lower = keyword.to_lowercase(); - let all_nodes = ctx.tree.traverse(); - let mut results = Vec::new(); - for node_id in &all_nodes { - if let Some(node) = ctx.tree.get(*node_id) { - if node.title.to_lowercase().contains(&pattern_lower) { - let depth = ctx.tree.depth(*node_id); - results.push((node.title.clone(), *node_id, depth)); - } - } - } - if results.is_empty() { - format!("No results for '{}' in index or titles.", keyword) - } else { - let mut output = format!( - "Results for '{}' (title match, {} found):\n", - keyword, - results.len() - ); - for (title, node_id, depth) in &results { - output.push_str(&format!(" - {} (depth {})", title, depth)); - if let Some(content) = ctx.cat(*node_id) { - if let Some(snippet) = - super::super::tools::content_snippet(content, keyword, 300) - { - output.push_str(&format!("\n \"{}\"", snippet)); - } - } - output.push('\n'); - } - output - } - } - }; - info!(doc = ctx.doc_name, keyword, feedback = %truncate_log(&feedback), "find result"); - state.set_feedback(feedback); - Step::Continue - } - - Command::Pwd => { - let result = tools::pwd(state); - state.set_feedback(result.feedback); - Step::Continue - } - - Command::Check => { - let evidence_text = state.evidence_for_check(); - - let (system, user) = check_sufficiency(query, &evidence_text); - - info!( - doc = ctx.doc_name, - system = %system, - user = %user, - "Check prompt" - ); - - match llm.complete(&system, &user).await { - Ok(response) => { - *llm_calls += 1; - state.check_count += 1; - let sufficient = parse_sufficiency_response(&response); - info!( - doc = ctx.doc_name, - sufficient, - evidence = state.evidence.len(), - response = %response, - "Sufficiency check" - ); - emitter.emit_worker_sufficiency_check( - ctx.doc_name, - sufficient, - state.evidence.len(), - None, - ); - if sufficient { - state.last_feedback = - "Evidence is sufficient. 
Use done to finish.".to_string(); - Step::Done - } else { - let reason = response - .trim() - .strip_prefix("INSUFFICIENT") - .unwrap_or(response.trim()) - .trim() - .trim_start_matches(|c: char| c == '-' || c == ' '); - if !reason.is_empty() { - state.missing_info = reason.to_string(); - } - state.set_feedback(format!( - "Evidence not yet sufficient: {}", - response.trim() - )); - Step::Continue - } - } - Err(e) => { - warn!(error = %e, "Check LLM call failed"); - state.last_feedback = "Could not evaluate sufficiency.".to_string(); - Step::Continue - } - } - } - - Command::Done => { - state.last_feedback = "Navigation complete.".to_string(); - Step::Done - } - - Command::Grep { pattern } => { - let result = tools::grep(pattern, ctx, state); - info!(doc = ctx.doc_name, pattern, feedback = %truncate_log(&result.feedback), "grep result"); - state.set_feedback(result.feedback); - Step::Continue - } - - Command::Head { target, lines } => { - let result = tools::head(target, *lines, ctx, state); - info!(doc = ctx.doc_name, target, lines, feedback = %truncate_log(&result.feedback), "head result"); - state.set_feedback(result.feedback); - Step::Continue - } - - Command::FindTree { pattern } => { - let result = tools::find_tree(pattern, ctx); - info!(doc = ctx.doc_name, pattern, feedback = %truncate_log(&result.feedback), "find_tree result"); - state.set_feedback(result.feedback); - Step::Continue - } - - Command::Wc { target } => { - let result = tools::wc(target, ctx, state); - info!(doc = ctx.doc_name, target, feedback = %truncate_log(&result.feedback), "wc result"); - state.set_feedback(result.feedback); - Step::Continue - } - } -} - -/// Truncate feedback for log output — keep first 300 chars to avoid noisy logs. -fn truncate_log(s: &str) -> std::borrow::Cow<'_, str> { - const MAX: usize = 300; - if s.len() <= MAX { - std::borrow::Cow::Borrowed(s) - } else { - std::borrow::Cow::Owned(format!( - "{}...(truncated, {} chars total)", - &s[..MAX], - s.len() - )) - } -} - -/// Parse the LLM output and detect parse failures. -/// -/// Returns `(command, is_parse_failure)`. -pub fn parse_and_detect_failure(llm_output: &str) -> (Command, bool) { - let command = parse_command(llm_output); - let trimmed = llm_output.trim(); - let is_parse_failure = - matches!(command, Command::Ls) && !trimmed.starts_with("ls") && !trimmed.is_empty(); - (command, is_parse_failure) -} diff --git a/vectorless-core/vectorless/src/agent/worker/format.rs b/vectorless-core/vectorless/src/agent/worker/format.rs deleted file mode 100644 index be9e029f..00000000 --- a/vectorless-core/vectorless/src/agent/worker/format.rs +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Formatting helpers for Worker prompts. - -use super::super::config::DocContext; -use super::super::state::WorkerState; - -/// Resolve visited NodeIds to their titles for prompt injection. 
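One caveat worth flagging in `truncate_log` above: `&s[..MAX]` slices at a fixed byte offset and will panic if byte 300 falls inside a multi-byte UTF-8 character — the `truncation_tests` module at the end of this hunk guards the equivalent logic in the navigation loop. A boundary-safe variant might look like this (sketch, not the patch's code):

```rust
// Sketch: back up to the nearest char boundary before slicing so
// multi-byte feedback (e.g. CJK text) can never cause a panic.
fn truncate_log_safe(s: &str) -> std::borrow::Cow<'_, str> {
    const MAX: usize = 300;
    if s.len() <= MAX {
        return std::borrow::Cow::Borrowed(s);
    }
    let mut cut = MAX;
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }
    std::borrow::Cow::Owned(format!(
        "{}...(truncated, {} chars total)",
        &s[..cut],
        s.len()
    ))
}
```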
-pub fn format_visited_titles(state: &WorkerState, ctx: &DocContext<'_>) -> String {
-    if state.visited.is_empty() {
-        return "(none)".to_string();
-    }
-    state
-        .visited
-        .iter()
-        .filter_map(|&node_id| ctx.node_title(node_id).map(|t| t.to_string()))
-        .collect::<Vec<_>>()
-        .join(", ")
-}
diff --git a/vectorless-core/vectorless/src/agent/worker/mod.rs b/vectorless-core/vectorless/src/agent/worker/mod.rs
deleted file mode 100644
index b906d30e..00000000
--- a/vectorless-core/vectorless/src/agent/worker/mod.rs
+++ /dev/null
@@ -1,236 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Worker agent — document navigation and evidence collection.
-//!
-//! The Worker is a consuming-self struct implementing [`Agent`]:
-//! 1. Bird's-eye: ls(root) for initial overview
-//! 2. Navigation planning: LLM generates a plan (keyword hits as context)
-//! 3. Navigation loop: LLM → parse → execute → repeat (max N rounds)
-//!
-//! Dispatched by the Orchestrator, one per document.
-//! Returns raw evidence — no answer synthesis. Rerank owns all answer generation.
-
-mod execute;
-mod format;
-mod navigation;
-mod planning;
-
-use tracing::info;
-
-use super::Agent;
-use super::config::{DocContext, WorkerConfig, WorkerOutput};
-use super::context::FindHit;
-use super::events::EventEmitter;
-use super::state::WorkerState;
-use super::tools::worker as tools;
-use crate::error::Error;
-use crate::llm::LlmClient;
-use crate::query::QueryPlan;
-use crate::scoring::bm25::extract_keywords;
-
-use navigation::run_navigation_loop;
-use planning::build_plan_prompt;
-
-/// Worker agent — navigates a single document to collect evidence.
-///
-/// Holds all execution context. Calling [`run()`](Agent::run) consumes self.
-pub struct Worker<'a> {
-    query: String,
-    task: Option<String>,
-    ctx: &'a DocContext<'a>,
-    config: WorkerConfig,
-    llm: LlmClient,
-    emitter: EventEmitter,
-    query_plan: QueryPlan,
-}
-
-impl<'a> Worker<'a> {
-    /// Create a new Worker.
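-    ///
-    /// A minimal usage sketch (`ctx`, `config`, `llm`, `emitter`, and
-    /// `query_plan` are assumed to be built elsewhere; `run()` consumes
-    /// the Worker):
-    ///
-    /// ```rust,ignore
-    /// let worker = Worker::new(
-    ///     "What is the Q1 revenue?",
-    ///     None,            // no sub-task: navigate for the full query
-    ///     &ctx,
-    ///     config,
-    ///     llm,
-    ///     emitter,
-    ///     query_plan,
-    /// );
-    /// let output = worker.run().await?;
-    /// println!("evidence items: {}", output.evidence.len());
-    /// ```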
-    pub fn new(
-        query: &str,
-        task: Option<&str>,
-        ctx: &'a DocContext<'a>,
-        config: WorkerConfig,
-        llm: LlmClient,
-        emitter: EventEmitter,
-        query_plan: QueryPlan,
-    ) -> Self {
-        Self {
-            query: query.to_string(),
-            task: task.map(|s| s.to_string()),
-            ctx,
-            config,
-            llm,
-            emitter,
-            query_plan,
-        }
-    }
-}
-
-impl<'a> Agent for Worker<'a> {
-    type Output = WorkerOutput;
-
-    fn name(&self) -> &str {
-        "worker"
-    }
-
-    async fn run(self) -> crate::error::Result<Self::Output> {
-        let Worker {
-            query,
-            task,
-            ctx,
-            config,
-            llm,
-            emitter,
-            query_plan,
-        } = self;
-        let task_ref = task.as_deref();
-
-        let intent_context = format!("{} — {}", query_plan.intent, query_plan.strategy_hint);
-
-        emitter.emit_worker_started(ctx.doc_name, task_ref, config.max_rounds);
-
-        info!(
-            doc = ctx.doc_name,
-            task = task_ref.unwrap_or("(full query)"),
-            max_rounds = config.max_rounds,
-            max_llm_calls = config.max_llm_calls,
-            "Worker starting"
-        );
-
-        let mut llm_calls: u32 = 0;
-
-        // Gather keyword hits as context for LLM planning (not routing rules)
-        let keywords = extract_keywords(&query);
-        let index_hits: Vec<FindHit> = ctx.find_all(&keywords);
-        if !index_hits.is_empty() {
-            tracing::debug!(
-                doc = ctx.doc_name,
-                hit_count = index_hits.len(),
-                "ReasoningIndex keyword hits available for planning"
-            );
-        }
-
-        // --- Phase 1: Bird's-eye view ---
-        let mut state = WorkerState::new(ctx.root(), config.max_rounds);
-        let ls_result = tools::ls(ctx, &state);
-        state.set_feedback(ls_result.feedback);
-
-        // --- Phase 1.5: Navigation planning ---
-        if state.remaining > 0 && (config.max_llm_calls == 0 || llm_calls < config.max_llm_calls) {
-            info!(doc = ctx.doc_name, "Generating navigation plan...");
-            let plan_prompt = build_plan_prompt(
-                &query,
-                task_ref,
-                &state.last_feedback,
-                ctx.doc_name,
-                &index_hits,
-                ctx,
-                query_plan.intent,
-            );
-            let plan_output = llm
-                .complete(&plan_prompt.0, &plan_prompt.1)
-                .await
-                .map_err(|e| Error::LlmReasoning {
-                    stage: "worker/plan".to_string(),
-                    detail: format!("Navigation plan LLM call failed: {e}"),
-                })?;
-            llm_calls += 1;
-            let plan_text = plan_output.trim().to_string();
-            if !plan_text.is_empty() {
-                info!(
-                    doc = ctx.doc_name,
-                    plan = %plan_text,
-                    "Navigation plan generated"
-                );
-                emitter.emit_worker_plan_generated(ctx.doc_name, plan_text.len());
-                state.plan = plan_text;
-                state.plan_generated = true;
-            }
-        }
-
-        // --- Phase 2: Navigation loop ---
-        run_navigation_loop(
-            &query,
-            task_ref,
-            ctx,
-            &config,
-            &llm,
-            &mut state,
-            &emitter,
-            &index_hits,
-            &intent_context,
-            &mut llm_calls,
-        )
-        .await?;
-
-        let budget_exhausted =
-            state.remaining == 0 || (config.max_llm_calls > 0 && llm_calls >= config.max_llm_calls);
-
-        let output = state.into_worker_output(llm_calls, budget_exhausted, ctx.doc_name);
-
-        emitter.emit_worker_done(
-            ctx.doc_name,
-            output.evidence.len(),
-            output.metrics.rounds_used,
-            output.metrics.llm_calls,
-            output.metrics.budget_exhausted,
-            output.metrics.plan_generated,
-        );
-
-        info!(
-            doc = ctx.doc_name,
-            evidence = output.evidence.len(),
-            rounds = output.metrics.rounds_used,
-            llm_calls = output.metrics.llm_calls,
-            "Worker complete"
-        );
-
-        Ok(output)
-    }
-}
-
-#[cfg(test)]
-mod truncation_tests {
-    /// Verify that truncating feedback with multi-byte UTF-8 characters
-    /// never panics. This mirrors the truncation logic in the navigation loop.
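-    ///
-    /// Boundary rule being exercised: `ceil_char_boundary(n)` returns the
-    /// smallest byte index `>= n` that is a char boundary, so for multi-byte
-    /// text the cut can land up to 3 bytes past `n`, never inside a char.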
- #[test] - fn test_utf8_safe_truncation_ascii() { - let feedback = "a".repeat(200); - let boundary = feedback.ceil_char_boundary(120); - let truncated = &feedback[..boundary]; - assert!(truncated.len() <= 123); // 120 + "..." fits - assert!(truncated.is_char_boundary(truncated.len())); - } - - #[test] - fn test_utf8_safe_truncation_multibyte() { - // Each '中' is 3 bytes in UTF-8 - let feedback = "中文反馈内容测试截断安全".repeat(20); - assert!(feedback.len() > 120); - let boundary = feedback.ceil_char_boundary(120); - let truncated = &feedback[..boundary]; - assert!(truncated.len() <= 120); - assert!(truncated.is_char_boundary(truncated.len())); - } - - #[test] - fn test_utf8_safe_truncation_emoji() { - // Emojis are 4 bytes each - let feedback = "🦀🎉🚀".repeat(50); - assert!(feedback.len() > 120); - let boundary = feedback.ceil_char_boundary(120); - let truncated = &feedback[..boundary]; - assert!(truncated.len() <= 120); - assert!(truncated.is_char_boundary(truncated.len())); - } - - #[test] - fn test_utf8_safe_truncation_short_string() { - // String shorter than limit — no truncation needed - let feedback = "short feedback".to_string(); - let boundary = feedback.ceil_char_boundary(120); - assert_eq!(boundary, feedback.len()); - } -} diff --git a/vectorless-core/vectorless/src/agent/worker/navigation.rs b/vectorless-core/vectorless/src/agent/worker/navigation.rs deleted file mode 100644 index ce5c4ee7..00000000 --- a/vectorless-core/vectorless/src/agent/worker/navigation.rs +++ /dev/null @@ -1,448 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Phase 2: Navigation loop — LLM-driven command loop until done or budget exhausted. - -use tracing::{debug, info}; - -use super::super::command::Command; -use super::super::config::{DocContext, Step, WorkerConfig}; -use super::super::context::FindHit; -use super::super::events::EventEmitter; -use super::super::prompts::{NavigationParams, worker_dispatch, worker_navigation}; -use super::super::state::WorkerState; -use super::execute::{execute_command, parse_and_detect_failure}; -use super::format::format_visited_titles; -use super::planning::{build_replan_prompt, format_keyword_hints}; -use crate::error::Error; -use crate::llm::LlmClient; - -/// Run the Phase 2 navigation loop. -/// -/// Loops until budget exhausted, `done`/`force_done`, or error. -/// Mutates `state` and `llm_calls` in place. -pub async fn run_navigation_loop( - query: &str, - task: Option<&str>, - ctx: &DocContext<'_>, - config: &WorkerConfig, - llm: &LlmClient, - state: &mut WorkerState, - emitter: &EventEmitter, - index_hits: &[FindHit], - intent_context: &str, - llm_calls: &mut u32, -) -> crate::error::Result<()> { - let use_dispatch_prompt = task.is_some(); - let keyword_hints = format_keyword_hints(index_hits, ctx); - let max_llm = config.max_llm_calls; - - loop { - if state.remaining == 0 { - info!(doc = ctx.doc_name, "Navigation budget exhausted"); - break; - } - if max_llm > 0 && *llm_calls >= max_llm { - info!( - doc = ctx.doc_name, - llm_calls, max_llm, "LLM call budget exhausted" - ); - break; - } - - // Build prompt - let (system, user) = build_round_prompt( - query, - task, - ctx, - state, - intent_context, - &keyword_hints, - use_dispatch_prompt, - config.max_rounds, - ); - - // LLM decision - let round_num = config.max_rounds - state.remaining + 1; - let round_start = std::time::Instant::now(); - info!( - doc = ctx.doc_name, - round = round_num, - max_rounds = config.max_rounds, - "Navigation round: calling LLM..." 
- ); - let llm_output = llm - .complete(&system, &user) - .await - .map_err(|e| Error::LlmReasoning { - stage: "worker/navigation".to_string(), - detail: format!("Nav loop LLM call failed (round {round_num}): {e}"), - })?; - *llm_calls += 1; - - // Parse command - let (command, is_parse_failure) = handle_parse_failure(&llm_output, ctx.doc_name, state); - if is_parse_failure { - continue; - } - - debug!(doc = ctx.doc_name, ?command, "Parsed command"); - - let is_check = matches!(command, Command::Check); - - // Execute - let step = execute_command(&command, ctx, state, query, llm, llm_calls, emitter).await; - - // Dynamic re-planning after insufficient check - handle_replan( - is_check, query, task, ctx, llm, state, emitter, llm_calls, max_llm, - ) - .await?; - - // Emit round event - let cmd_str = format!("{:?}", command); - let success = !matches!(step, Step::ForceDone(_)); - let round_elapsed = round_start.elapsed().as_millis() as u64; - emitter.emit_worker_round(ctx.doc_name, round_num, &cmd_str, success, round_elapsed); - - push_round_history(state, &cmd_str); - - // Check termination - match step { - Step::Done => { - info!( - doc = ctx.doc_name, - evidence = state.evidence.len(), - "Navigation done" - ); - break; - } - Step::ForceDone(reason) => { - info!(doc = ctx.doc_name, reason = %reason, "Forced done"); - break; - } - Step::Continue => { - if !is_check { - state.dec_round(); - } - } - } - } - - Ok(()) -} - -/// Build the (system, user) prompt pair for a single navigation round. -fn build_round_prompt( - query: &str, - task: Option<&str>, - ctx: &DocContext<'_>, - state: &WorkerState, - intent_context: &str, - keyword_hints: &str, - use_dispatch_prompt: bool, - max_rounds: u32, -) -> (String, String) { - if use_dispatch_prompt && state.remaining == max_rounds { - worker_dispatch(&super::super::prompts::WorkerDispatchParams { - original_query: query, - task: task.unwrap_or(query), - doc_name: ctx.doc_name, - breadcrumb: &state.path_str(), - }) - } else { - let visited_titles = format_visited_titles(state, ctx); - worker_navigation(&NavigationParams { - query, - task, - breadcrumb: &state.path_str(), - evidence_summary: &state.evidence_summary(), - missing_info: &state.missing_info, - last_feedback: &state.last_feedback, - remaining: state.remaining, - max_rounds: state.max_rounds, - history: &state.history_text(), - visited_titles: &visited_titles, - plan: &state.plan, - intent_context, - keyword_hints, - }) - } -} - -/// Parse LLM output and handle parse failures. -/// -/// Returns `(command, is_parse_failure)`. On parse failure, updates state -/// with feedback and pushes a history entry. 
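-///
-/// Behavior sketch, mirroring the unit tests below (`state` construction
-/// elided):
-///
-/// ```rust,ignore
-/// let (cmd, failed) = handle_parse_failure("ls", "doc", &mut state);
-/// assert!(matches!(cmd, Command::Ls) && !failed);
-///
-/// let (_, failed) = handle_parse_failure("random garbage text", "doc", &mut state);
-/// assert!(failed && state.last_feedback.contains("not recognized"));
-/// ```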
-fn handle_parse_failure(
-    llm_output: &str,
-    doc_name: &str,
-    state: &mut WorkerState,
-) -> (Command, bool) {
-    if llm_output.trim().len() < 2 {
-        tracing::warn!(
-            doc = doc_name,
-            response = llm_output.trim(),
-            "LLM response unusually short"
-        );
-    }
-    let (command, is_parse_failure) = parse_and_detect_failure(llm_output);
-    if is_parse_failure {
-        let trimmed = llm_output.trim();
-        let raw_preview = if trimmed.len() > 200 {
-            // Cut on a char boundary; a plain `[..200]` can panic on multi-byte input.
-            format!("{}...", &trimmed[..trimmed.floor_char_boundary(200)])
-        } else {
-            trimmed.to_string()
-        };
-        state.last_feedback = format!(
-            "Your output was not recognized as a valid command:\n\"{}\"\n\n\
-            Please output exactly one command (ls, cd, cat, head, find, findtree, grep, wc, pwd, check, or done).",
-            raw_preview
-        );
-        state.push_history("(unrecognized) → parse failure".to_string());
-    }
-    (command, is_parse_failure)
-}
-
-/// Push a round's command + feedback preview into history and trace.
-fn push_round_history(state: &mut WorkerState, cmd_str: &str) {
-    let feedback_preview = if state.last_feedback.len() > 120 {
-        let boundary = state.last_feedback.ceil_char_boundary(120);
-        format!("{}...", &state.last_feedback[..boundary])
-    } else {
-        state.last_feedback.clone()
-    };
-    state.push_history(format!("{} → {}", cmd_str, feedback_preview));
-
-    let round = state.max_rounds.saturating_sub(state.remaining);
-    state.trace_steps.push(crate::document::TraceStep {
-        action: cmd_str.to_string(),
-        observation: state.last_feedback.chars().take(200).collect(),
-        round,
-    });
-}
-
-/// Dynamic re-planning after an insufficient check.
-///
-/// If check returned INSUFFICIENT with enough remaining rounds and LLM budget,
-/// generates a new navigation plan. Otherwise clears stale replan state.
-async fn handle_replan(
-    is_check: bool,
-    query: &str,
-    task: Option<&str>,
-    ctx: &DocContext<'_>,
-    llm: &LlmClient,
-    state: &mut WorkerState,
-    emitter: &EventEmitter,
-    llm_calls: &mut u32,
-    max_llm: u32,
-) -> crate::error::Result<()> {
-    if !is_check {
-        return Ok(());
-    }
-
-    if !state.missing_info.is_empty()
-        && state.remaining >= 3
-        && (max_llm == 0 || *llm_calls < max_llm)
-    {
-        let missing = state.missing_info.clone();
-        info!(doc = ctx.doc_name, missing = %missing, "Re-planning navigation...");
-        let replan = build_replan_prompt(query, task, state, ctx);
-        let new_plan =
-            llm.complete(&replan.0, &replan.1)
-                .await
-                .map_err(|e| Error::LlmReasoning {
-                    stage: "worker/replan".to_string(),
-                    detail: format!("Re-plan LLM call failed: {e}"),
-                })?;
-        *llm_calls += 1;
-        let plan_text = new_plan.trim().to_string();
-        if !plan_text.is_empty() {
-            info!(
-                doc = ctx.doc_name,
-                plan = %plan_text,
-                "Re-plan generated"
-            );
-            emitter.emit_worker_replan(ctx.doc_name, &missing, plan_text.len());
-            state.plan = plan_text;
-        }
-        state.missing_info.clear();
-    } else if !state.missing_info.is_empty() {
-        state.plan.clear();
-        state.missing_info.clear();
-    }
-
-    Ok(())
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::agent::config::DocContext;
-    use crate::agent::state::WorkerState;
-    use crate::document::{DocumentTree, NodeId};
-
-    fn test_ctx() -> (DocumentTree, NodeId) {
-        let tree = DocumentTree::new("Root", "root content");
-        let root = tree.root();
-        (tree, root)
-    }
-
-    #[test]
-    fn test_handle_parse_failure_valid_command() {
-        let (tree, root) = test_ctx();
-        let nav = crate::document::NavigationIndex::new();
-        let ctx = DocContext {
-            tree: &tree,
-            nav_index: &nav,
-            reasoning_index: &crate::document::ReasoningIndex::default(),
-            doc_name: "test",
-        };
-        let mut state =
WorkerState::new(root, 10); - - let (cmd, is_failure) = handle_parse_failure("ls", ctx.doc_name, &mut state); - assert!(!is_failure); - assert!(matches!(cmd, Command::Ls)); - } - - #[test] - fn test_handle_parse_failure_unrecognized() { - let (tree, root) = test_ctx(); - let nav = crate::document::NavigationIndex::new(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - let mut state = WorkerState::new(root, 10); - - let (_cmd, is_failure) = - handle_parse_failure("random garbage text", ctx.doc_name, &mut state); - assert!(is_failure); - assert!(state.last_feedback.contains("not recognized")); - assert!(state.history.last().unwrap().contains("unrecognized")); - } - - #[test] - fn test_handle_parse_failure_short_response() { - let (tree, root) = test_ctx(); - let nav = crate::document::NavigationIndex::new(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - let mut state = WorkerState::new(root, 10); - - // Single character response — short but not a parse failure if it's "ls" - let (cmd, is_failure) = handle_parse_failure("ls", ctx.doc_name, &mut state); - assert!(!is_failure); - assert!(matches!(cmd, Command::Ls)); - } - - #[test] - fn test_push_round_history_short_feedback() { - let (_, root) = test_ctx(); - let mut state = WorkerState::new(root, 10); - state.last_feedback = "short feedback".to_string(); - - push_round_history(&mut state, "ls"); - assert_eq!(state.history.len(), 1); - assert!(state.history[0].contains("ls → short feedback")); - } - - #[test] - fn test_push_round_history_long_feedback() { - let (_, root) = test_ctx(); - let mut state = WorkerState::new(root, 10); - state.last_feedback = "a".repeat(200); - - push_round_history(&mut state, "cat"); - assert_eq!(state.history.len(), 1); - assert!(state.history[0].contains("cat → ")); - // Should be truncated with ... 
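-        // (120-byte preview, rounded up to a char boundary, plus the "..." marker)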
- assert!(state.history[0].contains("...")); - } - - #[test] - fn test_push_round_history_respects_max_entries() { - let (_, root) = test_ctx(); - let mut state = WorkerState::new(root, 10); - state.last_feedback = "ok".to_string(); - - for i in 0..8 { - push_round_history(&mut state, &format!("cmd_{i}")); - } - // MAX_HISTORY_ENTRIES is 6, so only last 6 should remain - assert_eq!(state.history.len(), 6); - } - - #[test] - fn test_build_round_prompt_dispatch_first_round() { - let (tree, root) = test_ctx(); - let nav = crate::document::NavigationIndex::new(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test_doc", - }; - let mut state = WorkerState::new(root, 10); - // remaining == max_rounds means first round - assert_eq!(state.remaining, 10); - - let (system, user) = build_round_prompt( - "test query", - Some("sub-task"), - &ctx, - &state, - "factual — find answer", - "", - true, // use_dispatch_prompt - 10, - ); - assert!(system.contains("dispatch") || !system.is_empty()); - assert!(user.contains("test query") || user.contains("sub-task")); - } - - #[test] - fn test_build_round_prompt_navigation_subsequent_round() { - let (tree, root) = test_ctx(); - let nav = crate::document::NavigationIndex::new(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test_doc", - }; - let mut state = WorkerState::new(root, 10); - state.remaining = 8; // not first round - - let (system, _user) = build_round_prompt( - "test query", - None, - &ctx, - &state, - "factual", - "keyword hints here", - false, // use_dispatch_prompt - 10, - ); - assert!(!system.is_empty()); - } - - #[test] - fn test_utf8_safe_truncation_in_history() { - let (_, root) = test_ctx(); - let mut state = WorkerState::new(root, 10); - // Each '中' is 3 bytes in UTF-8 - state.last_feedback = "中文反馈内容测试截断安全".repeat(20); - - push_round_history(&mut state, "cat"); - let entry = &state.history[0]; - // Should be truncated without panicking - assert!(entry.contains("cat → ")); - assert!(entry.len() < state.last_feedback.len() + 20); - } -} diff --git a/vectorless-core/vectorless/src/agent/worker/planning.rs b/vectorless-core/vectorless/src/agent/worker/planning.rs deleted file mode 100644 index a42bf52c..00000000 --- a/vectorless-core/vectorless/src/agent/worker/planning.rs +++ /dev/null @@ -1,708 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Navigation planning prompts — initial plan, re-plan, semantic hints, deep expansion. - -use std::collections::HashSet; - -use crate::query::QueryIntent; -use crate::scoring::bm25::{Bm25Engine, FieldDocument, extract_keywords}; - -use super::super::config::DocContext; -use super::super::context::FindHit; -use super::super::state::WorkerState; -use super::format::format_visited_titles; - -/// Maximum keyword/semantic hit entries in plan prompt. -const MAX_PLAN_ENTRIES: usize = 15; -/// Maximum section summaries in plan prompt. -const MAX_SECTION_SUMMARIES: usize = 10; -/// Maximum deep expansion entries. -const MAX_EXPANSION_ENTRIES: usize = 8; - -/// Build the navigation planning prompt (Phase 1.5). 
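-///
-/// Call shape, adapted from the unit tests below (`ctx` and `ls_output`
-/// assumed prepared; returns a `(system, user)` prompt pair):
-///
-/// ```rust,ignore
-/// let (system, user) = build_plan_prompt(
-///     "What is the revenue?",
-///     None,
-///     ls_output,
-///     "Financial Report",
-///     &[],        // no keyword hits available
-///     &ctx,
-///     QueryIntent::Factual,
-/// );
-/// assert!(user.contains("What is the revenue?"));
-/// ```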
-pub fn build_plan_prompt( - query: &str, - task: Option<&str>, - ls_output: &str, - doc_name: &str, - keyword_hits: &[FindHit], - ctx: &DocContext<'_>, - intent: QueryIntent, -) -> (String, String) { - let task_section = match task { - Some(t) => format!("\nYour specific task: {}", t), - None => String::new(), - }; - - let query_keywords = extract_keywords(query); - let query_lower = query.to_lowercase(); - - let mut keyword_section = if keyword_hits.is_empty() { - String::new() - } else { - let mut section = - String::from("\nKeyword index matches (use these to prioritize navigation):\n"); - let mut entry_count = 0; - for hit in keyword_hits { - let mut entries = hit.entries.clone(); - entries.sort_by(|a, b| { - b.weight - .partial_cmp(&a.weight) - .unwrap_or(std::cmp::Ordering::Equal) - }); - let mut seen = HashSet::new(); - for entry in &entries { - if !seen.insert(entry.node_id) { - continue; - } - let ancestor_path = build_ancestor_path(entry.node_id, ctx); - section.push_str(&format!( - " - keyword '{}' → {} (depth {}, weight {:.2})\n", - hit.keyword, ancestor_path, entry.depth, entry.weight - )); - if let Some(content) = ctx.cat(entry.node_id) { - if let Some(snippet) = - super::super::tools::content_snippet(content, &hit.keyword, 300) - { - section.push_str(&format!(" \"{}\"\n", snippet)); - } - } - entry_count += 1; - if entry_count >= MAX_PLAN_ENTRIES { - section.push_str(" ... (more hits omitted)\n"); - break; - } - } - if entry_count >= MAX_PLAN_ENTRIES { - break; - } - } - section - }; - - let deep_expansion = build_deep_expansion(keyword_hits, ctx); - if !deep_expansion.is_empty() { - keyword_section.push_str(&deep_expansion); - } - - let semantic_section = build_semantic_hints(&query_keywords, &query_lower, ctx); - - let intent_section = build_intent_signals(intent, ctx); - - let system = "You are a document navigation planner. Given a user question, the top-level \ - document structure, keyword index matches, and semantic hints, output a brief navigation \ - plan: which sections to visit and in what order. Prioritize sections that matched keywords \ - or semantic hints. The plan should be 2-5 steps. Each step should be a specific action \ - like \"cd to X, then cat Y\" or \"grep for Z in current subtree\". \ - Pay attention to 'Can answer' and 'Topics' annotations in the structure listing — \ - they indicate what questions each section addresses. \ - Output only the plan, nothing else.\n\n\ - Example plan for \"What is the Q1 revenue?\":\n\ - 1. cd to Revenue (matched keyword 'revenue')\n\ - 2. ls to see sub-sections\n\ - 3. cat Q1 Report\n\ - 4. check\n\ - 5. done".to_string(); - - let user = format!( - "Document: {doc_name}\n\ - Top-level structure:\n{ls_output}{keyword_section}{semantic_section}{intent_section}\ - User question: {query}{task_section}\n\n\ - Navigation plan:" - ); - - (system, user) -} - -/// Build a focused re-planning prompt when check returns INSUFFICIENT. 
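-///
-/// Sketch of a re-plan call after an INSUFFICIENT check (mirrors the unit
-/// test below; `state.missing_info` drives the "What's missing" section):
-///
-/// ```rust,ignore
-/// state.missing_info = "Need Q2 revenue figures".to_string();
-/// let (system, user) = build_replan_prompt("What is total revenue?", None, &state, &ctx);
-/// assert!(user.contains("Q2 revenue"));
-/// ```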
-pub fn build_replan_prompt(
-    query: &str,
-    task: Option<&str>,
-    state: &WorkerState,
-    ctx: &DocContext<'_>,
-) -> (String, String) {
-    let task_section = match task {
-        Some(t) => format!("\nOriginal sub-task: {}", t),
-        None => String::new(),
-    };
-
-    let visited = format_visited_titles(state, ctx);
-    let evidence_summary = state.evidence_summary();
-
-    let current_children = match ctx.ls(state.current_node) {
-        Some(routes) if !routes.is_empty() => {
-            let items: Vec<String> = routes
-                .iter()
-                .map(|r| format!(" - {} ({} leaves)", r.title, r.leaf_count))
-                .collect();
-            format!("Children at current position:\n{}\n", items.join("\n"))
-        }
-        _ => "Current position is a leaf node — consider cd .. to go back.\n".to_string(),
-    };
-
-    let sibling_hints = build_sibling_hints(state, ctx);
-
-    let system = "You are re-planning a document navigation strategy. The previous plan did not \
-        find sufficient evidence. Given what's been found and what's still missing, generate a \
-        focused 2-3 step plan. Each step should be a specific action like \
-        \"cd to X, then cat Y\" or \"grep for Z in current subtree\". \
-        Prefer exploring unvisited branches. If current branch is exhausted, cd .. and try \
-        a different path. Output only the plan, nothing else."
-        .to_string();
-
-    let user = format!(
-        "Original question: {query}{task_section}\n\
-        Current position: /{}\n\
-        Evidence collected so far:\n{evidence_summary}\n\
-        What's missing: {}\n\
-        Already visited: {visited}\n\
-        {current_children}\
-        {sibling_hints}\
-        Remaining rounds: {}/{}\n\n\
-        Revised navigation plan:",
-        state.path_str(),
-        state.missing_info,
-        state.remaining,
-        state.max_rounds,
-    );
-
-    (system, user)
-}
-
-/// Format keyword index hits into a compact string for LLM context.
-///
-/// Returns a string like:
-/// ```text
-/// Keyword matches (use find to jump directly):
-///  - 'complex' → Performance (weight 0.85)
-///    "...complexity analysis shows..."
-///  - 'latency' → Performance (weight 0.72)
-///    "...latency benchmarks indicate..."
-/// ```
-pub fn format_keyword_hints(keyword_hits: &[FindHit], ctx: &DocContext<'_>) -> String {
-    if keyword_hits.is_empty() {
-        return String::new();
-    }
-
-    let mut section = String::from("Keyword matches (use find to jump directly):\n");
-    let mut entry_count = 0;
-    for hit in keyword_hits {
-        let mut entries = hit.entries.clone();
-        entries.sort_by(|a, b| {
-            b.weight
-                .partial_cmp(&a.weight)
-                .unwrap_or(std::cmp::Ordering::Equal)
-        });
-        let mut seen = HashSet::new();
-        for entry in &entries {
-            if !seen.insert(entry.node_id) {
-                continue;
-            }
-            let title = ctx.node_title(entry.node_id).unwrap_or("unknown");
-            section.push_str(&format!(
-                " - '{}' → {} (weight {:.2})\n",
-                hit.keyword, title, entry.weight
-            ));
-            if let Some(content) = ctx.cat(entry.node_id) {
-                if let Some(snippet) =
-                    super::super::tools::content_snippet(content, &hit.keyword, 300)
-                {
-                    section.push_str(&format!("   \"{}\"\n", snippet));
-                }
-            }
-            entry_count += 1;
-            if entry_count >= MAX_PLAN_ENTRIES {
-                section.push_str(" ... (more omitted)\n");
-                return section;
-            }
-        }
-    }
-    section
-}
-
-/// Build the ancestor path string for a node (e.g., "root/Chapter 1/Section 1.2").
-pub fn build_ancestor_path(node_id: crate::document::NodeId, ctx: &DocContext<'_>) -> String {
-    let mut path: Vec<crate::document::NodeId> = ctx.tree.ancestors_iter(node_id).collect();
-    path.reverse();
-    path.iter()
-        .filter_map(|&id| ctx.node_title(id))
-        .collect::<Vec<_>>()
-        .join("/")
-}
-
-/// Build intent-specific index signals for the planning prompt.
-///
-/// Injects pre-computed ReasoningIndex data as context for the LLM:
-/// - Summary intent → summary_shortcut (document overview + section summaries)
-/// - Navigational intent → section_map matches from query keywords
-/// - Factual/Analytical → no additional signals (keyword hits already injected)
-fn build_intent_signals(intent: QueryIntent, ctx: &DocContext<'_>) -> String {
-    match intent {
-        QueryIntent::Summary => {
-            let shortcut = match ctx.summary_shortcut() {
-                Some(s) => s,
-                None => return String::new(),
-            };
-            let mut section = String::from(
-                "\nPre-computed document overview (use this to plan breadth-first scan):\n",
-            );
-            if !shortcut.document_summary.is_empty() {
-                section.push_str(&format!(
-                    "Document summary: {}\n",
-                    shortcut.document_summary
-                ));
-            }
-            let mut summary_count = 0;
-            for ss in &shortcut.section_summaries {
-                section.push_str(&format!(
-                    " - Section '{}' (depth {}): {}\n",
-                    ss.title, ss.depth, ss.summary
-                ));
-                summary_count += 1;
-                if summary_count >= MAX_SECTION_SUMMARIES {
-                    section.push_str(" ... (more sections omitted)\n");
-                    break;
-                }
-            }
-            section
-        }
-        QueryIntent::Navigational => {
-            let root = ctx.root();
-            let routes = match ctx.ls(root) {
-                Some(r) => r,
-                None => return String::new(),
-            };
-            let mut section =
-                String::from("\nSection map (known top-level sections for direct navigation):\n");
-            for route in routes {
-                section.push_str(&format!(
-                    " - {} ({} leaves)\n",
-                    route.title, route.leaf_count
-                ));
-            }
-            section
-        }
-        _ => String::new(),
-    }
-}
-
-/// Build semantic hints section using BM25 scoring over child routes.
-fn build_semantic_hints(
-    query_keywords: &[String],
-    query_lower: &str,
-    ctx: &DocContext<'_>,
-) -> String {
-    let root = ctx.root();
-    let routes = match ctx.ls(root) {
-        Some(r) => r,
-        None => return String::new(),
-    };
-
-    if routes.is_empty() {
-        return String::new();
-    }
-
-    let field_docs: Vec<FieldDocument<String>> = routes
-        .iter()
-        .map(|route| {
-            let nav = ctx.nav_entry(route.node_id);
-            let overview = nav.map(|n| n.overview.as_str()).unwrap_or("");
-            let hints_text = nav.map(|n| n.question_hints.join(" ")).unwrap_or_default();
-            let tags_text = nav.map(|n| n.topic_tags.join(" ")).unwrap_or_default();
-            let content = if overview.is_empty() && hints_text.is_empty() && tags_text.is_empty() {
-                String::new()
-            } else {
-                format!("{} {} {}", overview, hints_text, tags_text)
-            };
-            FieldDocument::new(
-                route.title.clone(),
-                route.title.clone(),
-                route.description.clone(),
-                content,
-            )
-        })
-        .collect();
-
-    let engine = Bm25Engine::fit_to_corpus(&field_docs);
-    let bm25_results: std::collections::HashMap<String, f32> = engine
-        .search_weighted(query_lower, routes.len())
-        .into_iter()
-        .collect();
-
-    let mut section = String::new();
-    let mut entry_count = 0;
-
-    for route in routes {
-        let nav = match ctx.nav_entry(route.node_id) {
-            Some(n) => n,
-            None => continue,
-        };
-
-        let bm25_score = bm25_results.get(&route.title).copied().unwrap_or(0.0);
-        if bm25_score <= 0.0 {
-            continue;
-        }
-
-        let mut annotations = Vec::new();
-
-        for hint in &nav.question_hints {
-            let hint_lower = hint.to_lowercase();
-            for kw in query_keywords {
-                if hint_lower.contains(&kw.to_lowercase()) {
-                    annotations.push(format!("question \"{}\"", hint));
-                    break;
-                }
-            }
-            if !annotations.iter().any(|a| a.contains(&hint.clone())) {
-                for word in hint_lower.split_whitespace() {
-                    if word.len() > 3 && query_lower.contains(word) {
-                        annotations.push(format!("question \"{}\"", hint));
-                        break;
-                    }
-                }
-            }
-        }
-
-        for tag in &nav.topic_tags {
-            let
tag_lower = tag.to_lowercase(); - for kw in query_keywords { - if tag_lower.contains(&kw.to_lowercase()) || kw.to_lowercase().contains(&tag_lower) - { - annotations.push(format!("topic \"{}\"", tag)); - break; - } - } - if !annotations - .iter() - .any(|a| a.contains(&format!("topic \"{}\"", tag))) - { - if query_lower.contains(&tag_lower) && tag.len() > 2 { - annotations.push(format!("topic \"{}\"", tag)); - } - } - } - - let annotation_str = if annotations.is_empty() { - String::new() - } else { - format!(", {}", annotations.join(", ")) - }; - - let line = format!( - " - Section '{}' — BM25: {:.2}{}\n", - route.title, bm25_score, annotation_str - ); - section.push_str(&line); - entry_count += 1; - if entry_count >= MAX_PLAN_ENTRIES { - break; - } - } - - if section.is_empty() { - String::new() - } else { - format!( - "\nSemantic hints (BM25-scored sections, higher = more relevant):\n{}", - section - ) - } -} - -/// For keyword hits that land in deep nodes (depth >= 2), expand the parent node's children. -fn build_deep_expansion(keyword_hits: &[FindHit], ctx: &DocContext<'_>) -> String { - if keyword_hits.is_empty() { - return String::new(); - } - - let mut seen_parents = HashSet::new(); - let mut expansion = String::new(); - let mut expansion_count = 0; - - for hit in keyword_hits { - for entry in &hit.entries { - if entry.depth < 2 { - continue; - } - let parent = match ctx.parent(entry.node_id) { - Some(p) => p, - None => continue, - }; - if !seen_parents.insert(parent) { - continue; - } - let routes = match ctx.ls(parent) { - Some(r) => r, - None => continue, - }; - let parent_title = ctx.node_title(parent).unwrap_or("unknown"); - expansion.push_str(&format!( - "Siblings near keyword hit '{}' (under {}):\n", - hit.keyword, parent_title - )); - for route in routes { - let marker = if ctx.node_title(entry.node_id) == Some(&route.title) { - " ← keyword hit" - } else { - "" - }; - expansion.push_str(&format!( - " - {} ({} leaves){}\n", - route.title, route.leaf_count, marker - )); - } - expansion.push('\n'); - expansion_count += 1; - if expansion_count >= MAX_EXPANSION_ENTRIES { - expansion.push_str(" ... (more expansions omitted)\n"); - break; - } - } - if expansion_count >= MAX_EXPANSION_ENTRIES { - break; - } - } - - expansion -} - -/// Build unvisited sibling branch hints for structured backtracking. -fn build_sibling_hints(state: &WorkerState, ctx: &DocContext<'_>) -> String { - let mut hints = String::new(); - - if let Some(parent) = ctx.parent(state.current_node) { - if let Some(routes) = ctx.ls(parent) { - let unvisited: Vec<&crate::document::ChildRoute> = routes - .iter() - .filter(|r| !state.visited.contains(&r.node_id)) - .collect(); - if !unvisited.is_empty() { - hints.push_str("Unvisited sibling branches at current level:\n"); - for route in &unvisited { - hints.push_str(&format!( - " - {} ({} leaves)\n", - route.title, route.leaf_count - )); - } - } - } - - if let Some(grandparent) = ctx.parent(parent) { - if let Some(routes) = ctx.ls(grandparent) { - let unvisited_parent_siblings: Vec<&crate::document::ChildRoute> = routes - .iter() - .filter(|r| !state.visited.contains(&r.node_id) && r.node_id != parent) - .collect(); - if !unvisited_parent_siblings.is_empty() { - hints.push_str("Unvisited branches at parent level (cd .. 
then explore):\n"); - for route in &unvisited_parent_siblings { - hints.push_str(&format!( - " - {} ({} leaves)\n", - route.title, route.leaf_count - )); - } - } - } - } - } - - if hints.is_empty() { - String::new() - } else { - format!("\n{}", hints) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::agent::config::DocContext; - use crate::agent::config::Evidence; - use crate::agent::state::WorkerState; - use crate::document::{ChildRoute, NavEntry, NodeId}; - use crate::scoring::bm25::extract_keywords; - - fn build_semantic_test_tree() -> ( - crate::document::DocumentTree, - crate::document::NavigationIndex, - NodeId, - NodeId, - NodeId, - ) { - let mut tree = crate::document::DocumentTree::new("Root", "root content"); - let root = tree.root(); - let revenue = tree.add_child(root, "Revenue", "revenue content"); - let expenses = tree.add_child(root, "Expenses", "expense content"); - - let mut nav = crate::document::NavigationIndex::new(); - nav.add_entry( - root, - NavEntry { - overview: "Annual financial report".to_string(), - question_hints: vec!["What is the financial overview?".to_string()], - topic_tags: vec!["finance".to_string()], - leaf_count: 4, - level: 0, - }, - ); - nav.add_child_routes( - root, - vec![ - ChildRoute { - node_id: revenue, - title: "Revenue".to_string(), - description: "Revenue breakdown".to_string(), - leaf_count: 2, - }, - ChildRoute { - node_id: expenses, - title: "Expenses".to_string(), - description: "Cost analysis".to_string(), - leaf_count: 2, - }, - ], - ); - nav.add_entry( - revenue, - NavEntry { - overview: "Revenue figures for 2024".to_string(), - question_hints: vec![ - "What is the total revenue?".to_string(), - "What was the Q1 revenue?".to_string(), - ], - topic_tags: vec![ - "revenue".to_string(), - "sales".to_string(), - "income".to_string(), - ], - leaf_count: 2, - level: 1, - }, - ); - nav.add_entry( - expenses, - NavEntry { - overview: "Operating expenses".to_string(), - question_hints: vec!["What are the operating costs?".to_string()], - topic_tags: vec!["expenses".to_string(), "costs".to_string()], - leaf_count: 2, - level: 1, - }, - ); - - (tree, nav, root, revenue, expenses) - } - - #[test] - fn test_build_ancestor_path() { - let (tree, nav, root, revenue, _) = build_semantic_test_tree(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - assert_eq!(build_ancestor_path(revenue, &ctx), "Root/Revenue"); - assert_eq!(build_ancestor_path(root, &ctx), "Root"); - } - - #[test] - fn test_semantic_hints_keyword_match() { - let (tree, nav, _, _, _) = build_semantic_test_tree(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - let keywords = extract_keywords("What is the revenue?"); - let hints = build_semantic_hints(&keywords, &"what is the revenue".to_lowercase(), &ctx); - assert!( - hints.contains("Revenue"), - "Should match Revenue section, got: {}", - hints - ); - assert!(hints.contains("BM25")); - } - - #[test] - fn test_semantic_hints_topic_match() { - let (tree, nav, _, _, _) = build_semantic_test_tree(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - let keywords = extract_keywords("operating costs analysis"); - let hints = - build_semantic_hints(&keywords, &"operating costs analysis".to_lowercase(), &ctx); - assert!( - 
hints.contains("Expenses"), - "Should match Expenses via topic 'costs', got: {}", - hints - ); - } - - #[test] - fn test_semantic_hints_no_match() { - let (tree, nav, _, _, _) = build_semantic_test_tree(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - let keywords = extract_keywords("xyzzy foobar"); - let hints = build_semantic_hints(&keywords, &"xyzzy foobar".to_lowercase(), &ctx); - assert!(hints.is_empty(), "Should not match, got: {}", hints); - } - - #[test] - fn test_build_replan_prompt() { - let (tree, nav, root, _, _) = build_semantic_test_tree(); - let mut state = WorkerState::new(root, 15); - state.missing_info = "Need Q2 revenue figures".to_string(); - state.add_evidence(Evidence { - source_path: "root/Revenue".to_string(), - node_title: "Revenue".to_string(), - content: "Q1 revenue was $2.5M".to_string(), - doc_name: None, - }); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "test", - }; - let (system, user) = build_replan_prompt("What is total revenue?", None, &state, &ctx); - assert!(system.contains("re-planning")); - assert!(user.contains("What is total revenue?")); - assert!(user.contains("Q2 revenue")); - } - - #[test] - fn test_build_plan_prompt_with_semantic_hints() { - let (tree, nav, _, _, _) = build_semantic_test_tree(); - let ctx = DocContext { - tree: &tree, - nav_index: &nav, - reasoning_index: &crate::document::ReasoningIndex::default(), - doc_name: "Financial Report", - }; - let ls_output = - "[1] Revenue — Revenue breakdown (2 leaves)\n[2] Expenses — Cost analysis (2 leaves)\n"; - let (system, user) = build_plan_prompt( - "What is the revenue?", - None, - ls_output, - "Financial Report", - &[], - &ctx, - QueryIntent::Factual, - ); - assert!(system.contains("semantic hints")); - assert!(user.contains("What is the revenue?")); - } -} diff --git a/vectorless-core/vectorless/src/client/builder.rs b/vectorless-core/vectorless/src/client/builder.rs deleted file mode 100644 index b3ccc6ea..00000000 --- a/vectorless-core/vectorless/src/client/builder.rs +++ /dev/null @@ -1,268 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Builder pattern for creating Engine clients. -//! -//! This module provides [`EngineBuilder`] for configuring and building -//! [`Engine`] instances with sensible defaults. - -use crate::{ - client::engine::Engine, client::retriever::RetrieverClient, config::Config, - events::EventEmitter, metrics::MetricsHub, storage::Workspace, -}; - -/// Builder for creating a [`Engine`] client. -/// -/// `api_key`, `model` and `endpoint` are **required** for simple usage. -/// Advanced users can provide a pre-built [`Config`] via [`with_config`](EngineBuilder::with_config). 
-///
-/// # Example (simple)
-///
-/// ```rust,no_run
-/// use vectorless::client::EngineBuilder;
-///
-/// #[tokio::main]
-/// async fn main() -> Result<(), vectorless::BuildError> {
-///     let client = EngineBuilder::new()
-///         .with_key("sk-...")
-///         .with_model("gpt-4o")
-///         .with_endpoint("https://api.xxx.com/v1")
-///         .build()
-///         .await?;
-///     Ok(())
-/// }
-/// ```
-///
-/// # Example (advanced)
-///
-/// ```rust,ignore
-/// use vectorless::client::EngineBuilder;
-/// use vectorless::config::{Config, LlmConfig, SlotConfig};
-///
-/// let config = Config::new().with_llm(
-///     LlmConfig::new("gpt-4o")
-///         .with_api_key("sk-...")
-///         .with_endpoint("https://api.openai.com/v1")
-///         .with_index(SlotConfig::fast().with_model("gpt-4o-mini"))
-/// );
-///
-/// let engine = EngineBuilder::new()
-///     .with_config(config)
-///     .build()
-///     .await?;
-/// ```
-#[derive(Debug)]
-pub struct EngineBuilder {
-    /// Custom configuration for advanced tuning.
-    config: Option<Config>,
-
-    /// Event emitter.
-    events: Option<EventEmitter>,
-
-    /// LLM API key (override).
-    api_key: Option<String>,
-
-    /// LLM model name (override).
-    model: Option<String>,
-
-    /// LLM endpoint URL (override).
-    endpoint: Option<String>,
-}
-
-impl EngineBuilder {
-    /// Create a new builder with defaults.
-    #[must_use]
-    pub fn new() -> Self {
-        Self {
-            config: None,
-            events: None,
-            api_key: None,
-            model: None,
-            endpoint: None,
-        }
-    }
-
-    // ============================================================
-    // Configuration
-    // ============================================================
-
-    /// Set a custom configuration.
-    ///
-    /// When provided, this replaces the default [`Config`] entirely.
-    /// Builder methods (`with_key`, `with_model`, `with_endpoint`)
-    /// will still override the corresponding fields on top of this config.
-    #[must_use]
-    pub fn with_config(mut self, config: Config) -> Self {
-        self.config = Some(config);
-        self
-    }
-
-    /// Set the event emitter for callbacks.
-    #[must_use]
-    pub fn with_events(mut self, events: EventEmitter) -> Self {
-        self.events = Some(events);
-        self
-    }
-
-    // ============================================================
-    // LLM Configuration (simple overrides)
-    // ============================================================
-
-    /// Set the LLM API key. **Required** (unless provided via Config).
-    #[must_use]
-    pub fn with_key(mut self, key: impl Into<String>) -> Self {
-        self.api_key = Some(key.into());
-        self
-    }
-
-    /// Set the LLM model name.
-    #[must_use]
-    pub fn with_model(mut self, model: impl Into<String>) -> Self {
-        self.model = Some(model.into());
-        self
-    }
-
-    /// Set a custom LLM endpoint URL.
-    #[must_use]
-    pub fn with_endpoint(mut self, url: impl Into<String>) -> Self {
-        self.endpoint = Some(url.into());
-        self
-    }
-
-    // ============================================================
-    // Build
-    // ============================================================
-
-    /// Build the Engine client.
-    ///
-    /// # Errors
-    ///
-    /// Returns a [`BuildError`] if:
-    /// - Workspace creation fails
-    /// - Required `api_key` or `model` is missing
-    ///
-    /// # Example
-    ///
-    /// ```rust,no_run
-    /// use vectorless::client::EngineBuilder;
-    ///
-    /// # #[tokio::main]
-    /// # async fn main() -> Result<(), vectorless::BuildError> {
-    /// let engine = EngineBuilder::new()
-    ///     .with_key("sk-...")
-    ///     .with_model("gpt-4o")
-    ///     .with_endpoint("https://api.openai.com/v1")
-    ///     .build()
-    ///     .await?;
-    /// # Ok(())
-    /// # }
-    /// ```
-    pub async fn build(self) -> Result<Engine, BuildError> {
-        // Load user-provided or default configuration
-        let mut config = self.config.unwrap_or_default();
-
-        // Apply simple overrides — write once, no dual-writing
-        if let Some(api_key) = self.api_key {
-            config.llm.api_key = Some(api_key);
-        }
-        if let Some(model) = self.model {
-            config.llm.model = model;
-        }
-        if let Some(endpoint) = self.endpoint {
-            config.llm.endpoint = Some(endpoint);
-        }
-
-        // Validate required settings
-        if config.llm.api_key.is_none() {
-            return Err(BuildError::MissingApiKey);
-        }
-        if config.llm.model.is_empty() {
-            return Err(BuildError::MissingModel);
-        }
-        if config.llm.endpoint.is_none() {
-            return Err(BuildError::MissingEndpoint);
-        }
-
-        // Open workspace from config
-        let workspace = Workspace::new(&config.storage.workspace_dir)
-            .await
-            .map_err(|e| BuildError::Workspace(e.to_string()))?;
-
-        // Build LlmPool from unified LlmConfig (shared metrics hub)
-        let metrics_hub = std::sync::Arc::new(MetricsHub::with_defaults());
-        let pool = crate::llm::LlmPool::from_config(&config.llm, Some(metrics_hub.clone()));
-
-        // Indexer uses pool.index()
-        let indexer = crate::client::indexer::IndexerClient::with_llm(pool.index().clone());
-
-        // Retriever uses pool.retrieval() via agent system
-        let retriever = RetrieverClient::new(pool.retrieval().clone());
-
-        // Build engine
-        let events = self.events.unwrap_or_default();
-        Engine::with_components(config, workspace, retriever, indexer, events, metrics_hub)
-            .await
-            .map_err(|e| BuildError::Other(e.to_string()))
-    }
-}
-
-impl Default for EngineBuilder {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-/// Error during client build.
-#[derive(Debug, thiserror::Error)]
-pub enum BuildError {
-    /// Workspace error.
-    #[error("Workspace error: {0}")]
-    Workspace(String),
-
-    /// Missing API key.
-    #[error("Missing API key: call .with_key(\"sk-...\") or set api_key in config")]
-    MissingApiKey,
-
-    /// Missing model name.
-    #[error("Missing model: call .with_model(\"gpt-4o\") or set model in config")]
-    MissingModel,
-
-    /// Missing endpoint URL.
-    #[error(
-        "Missing endpoint: call .with_endpoint(\"https://api.xxx.com/v1\") or set endpoint in config"
-    )]
-    MissingEndpoint,
-
-    /// Other error.
-    #[error("{0}")]
-    Other(String),
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_builder_with_key() {
-        let builder = EngineBuilder::new().with_key("sk-test-key");
-
-        assert_eq!(builder.api_key, Some("sk-test-key".to_string()));
-    }
-
-    #[test]
-    fn test_builder_with_model() {
-        let builder = EngineBuilder::new().with_model("gpt-4o-mini");
-
-        assert_eq!(builder.model, Some("gpt-4o-mini".to_string()));
-    }
-
-    #[test]
-    fn test_builder_with_key_and_model() {
-        let builder = EngineBuilder::new()
-            .with_model("gpt-4o-mini")
-            .with_key("sk-test");
-
-        assert_eq!(builder.model, Some("gpt-4o-mini".to_string()));
-        assert_eq!(builder.api_key, Some("sk-test".to_string()));
-    }
-}
diff --git a/vectorless-core/vectorless/src/client/engine.rs b/vectorless-core/vectorless/src/client/engine.rs
deleted file mode 100644
index 0e885c89..00000000
--- a/vectorless-core/vectorless/src/client/engine.rs
+++ /dev/null
@@ -1,923 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Main Engine client - the entry point for vectorless.
-//!
-//! The Engine provides a unified API for the Document Understanding Engine:
-//!
-//! - [`ingest`](Engine::ingest) — Understand a document (parse, analyze, persist)
-//! - [`ask`](Engine::ask) — Ask a question (returns answer + evidence + trace)
-//! - [`forget`](Engine::forget) — Remove a document
-//! - [`list_documents`](Engine::list_documents) — List all understood documents
-//!
-//! # Example
-//!
-//! ```rust,no_run
-//! use vectorless::{EngineBuilder, IngestInput};
-//!
-//! # #[tokio::main]
-//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
-//! let engine = EngineBuilder::new()
-//!     .with_key("sk-...")
-//!     .with_model("gpt-4o")
-//!     .with_endpoint("https://api.openai.com/v1")
-//!     .build()
-//!     .await?;
-//!
-//! // Understand a document
-//! let doc = engine.ingest(IngestInput::Path("./document.md".into())).await?;
-//! println!("{}: {}", doc.name, doc.summary);
-//!
-//! // Ask a question
-//! let answer = engine.ask("What is this?", &[doc.doc_id.clone()]).await?;
-//! println!("{}", answer.content);
-//!
-//! // List all understood documents
-//! let docs = engine.list_documents().await?;
-//! for d in &docs {
-//!     println!("{}: {}", d.name, d.summary);
-//! }
-//!
-//! // Forget a document
-//! engine.forget(&doc.doc_id).await?;
-//! # Ok(())
-//! # }
-//! ```
-
-use std::{collections::HashMap, sync::Arc};
-
-use futures::StreamExt;
-use tracing::{info, warn};
-
-use crate::{
-    Answer, Document as UnderstandingDocument, DocumentTree, Error, Evidence, IngestInput,
-    ReasoningTrace,
-    config::Config,
-    error::Result,
-    events::EventEmitter,
-    index::{
-        PipelineOptions,
-        incremental::{self, IndexAction},
-    },
-    metrics::MetricsHub,
-    storage::{PersistedDocument, Workspace},
-};
-
-use super::{
-    index_context::{IndexContext, IndexSource},
-    indexer::IndexerClient,
-    retriever::RetrieverClient,
-    types::{FailedItem, IndexItem, IndexMode, IndexResult},
-    workspace::WorkspaceClient,
-};
-
-/// The main Engine client.
-///
-/// Provides high-level operations for document indexing and retrieval.
-/// Uses interior mutability to allow sharing across async tasks.
-///
-/// # Cloning
-///
-/// Cloning is cheap - it only increments reference counts (`Arc`). All clones
-/// share the same underlying resources.
-///
-/// # Thread Safety
-///
-/// The client is `Clone + Send + Sync` and can be safely shared across threads.
-pub struct Engine {
-    /// Configuration (immutable, shared).
-    config: Arc<Config>,
-
-    /// Indexer client for document indexing.
-    indexer: IndexerClient,
-
-    /// Retriever client for queries.
-    retriever: RetrieverClient,
-
-    /// Workspace client for persistence.
-    workspace: WorkspaceClient,
-
-    /// Central metrics hub for unified collection.
-    metrics_hub: Arc<MetricsHub>,
-}
-
-impl Engine {
-    // ============================================================
-    // Constructor (for Builder)
-    // ============================================================
-
-    /// Create a new client with the given components.
-    pub(crate) async fn with_components(
-        config: Config,
-        workspace: Workspace,
-        retriever: RetrieverClient,
-        indexer: IndexerClient,
-        events: EventEmitter,
-        metrics_hub: Arc<MetricsHub>,
-    ) -> Result<Self> {
-        let config = Arc::new(config);
-
-        // Attach event emitter to indexer
-        let indexer = indexer.with_events(events.clone());
-
-        // Attach event emitter to retriever
-        let retriever = retriever.with_events(events.clone());
-
-        // Create workspace client
-        let workspace_client = WorkspaceClient::new(workspace)
-            .await
-            .with_events(events.clone());
-
-        Ok(Self {
-            config,
-            indexer,
-            retriever,
-            workspace: workspace_client,
-            metrics_hub,
-        })
-    }
-
-    // ============================================================
-    // Ingest Pipeline (private — called by ingest())
-    // ============================================================
-
-    /// Run the ingest pipeline: parse, compile, persist.
-    ///
-    /// Accepts an [`IndexContext`] that specifies the source and options.
-    /// Multiple sources are processed in parallel.
-    /// Returns an [`IndexResult`] containing the indexed document metadata.
-    #[tracing::instrument(skip_all, fields(sources = ctx.sources.len()))]
-    async fn ingest_pipeline(&self, ctx: IndexContext) -> Result<IndexResult> {
-        if ctx.is_empty() {
-            return Err(Error::Config("No document sources provided".into()));
-        }
-
-        let timeout_secs = ctx.options.timeout_secs;
-
-        self.with_timeout(timeout_secs, async move {
-            let concurrency = self
-                .config
-                .llm
-                .throttle
-                .max_concurrent_requests
-                .min(ctx.sources.len());
-
-            let (items, failed) = self
-                .process_sources(&ctx.sources, &ctx.options, ctx.name.as_deref(), concurrency)
-                .await;
-
-            if items.is_empty() && !failed.is_empty() {
-                return Err(Error::Config(format!(
-                    "All {} source(s) failed: {}",
-                    failed.len(),
-                    failed
-                        .iter()
-                        .map(|f| format!("{} ({})", f.source, f.error))
-                        .collect::<Vec<_>>()
-                        .join("; ")
-                )));
-            }
-
-            // Rebuild cross-document graph in the background so index returns immediately.
-            if !items.is_empty() && self.config.graph.enabled {
-                let engine = self.clone();
-                tokio::spawn(async move {
-                    info!("Rebuilding document graph in background...");
-                    if let Err(e) = engine.rebuild_graph().await {
-                        tracing::warn!("Background graph rebuild failed: {e}");
-                    }
-                });
-            }
-
-            Ok(IndexResult::with_partial(items, failed))
-        })
-        .await
-    }
-
-    /// Process multiple sources in parallel.
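-    ///
-    /// Concurrency note: sources flow through `buffer_unordered(concurrency)`,
-    /// so with `concurrency = 4` (for example) at most four sources are being
-    /// parsed and compiled at any moment; results are folded in completion order.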
-    async fn process_sources(
-        &self,
-        sources: &[IndexSource],
-        options: &super::types::IndexOptions,
-        name: Option<&str>,
-        concurrency: usize,
-    ) -> (Vec<IndexItem>, Vec<FailedItem>) {
-        let results: Vec<(Vec<IndexItem>, Vec<FailedItem>)> =
-            futures::stream::iter(sources.iter().cloned())
-                .map(|source| {
-                    let options = options.clone();
-                    let name = name.map(str::to_string);
-                    let engine = self.clone();
-                    async move {
-                        engine
-                            .process_source(&source, &options, name.as_deref())
-                            .await
-                    }
-                })
-                .buffer_unordered(concurrency)
-                .collect()
-                .await;
-
-        results.into_iter().fold(
-            (Vec::new(), Vec::new()),
-            |(mut items, mut failed), (ok, err)| {
-                items.extend(ok);
-                failed.extend(err);
-                (items, failed)
-            },
-        )
-    }
-
-    /// Process a single source — resolve action and index.
-    ///
-    /// Returns `(items, failed)`.
-    #[tracing::instrument(skip_all, fields(source = %source))]
-    async fn process_source(
-        &self,
-        source: &IndexSource,
-        options: &super::types::IndexOptions,
-        name: Option<&str>,
-    ) -> (Vec<IndexItem>, Vec<FailedItem>) {
-        let source_label = source.to_string();
-
-        match self.resolve_index_action(source, options).await {
-            Ok(IndexAction::Skip(skip_info)) => {
-                info!("Skipped (unchanged): {}", source_label);
-                (
-                    vec![IndexItem::new(
-                        skip_info.doc_id,
-                        skip_info.name,
-                        skip_info.format,
-                        skip_info.description,
-                        skip_info.page_count,
-                    )],
-                    Vec::new(),
-                )
-            }
-            Ok(IndexAction::FullIndex { existing_id }) => {
-                let pipeline_options = self.build_pipeline_options(options, source);
-                match self
-                    .index_with_retry(source, name, pipeline_options.clone(), None)
-                    .await
-                {
-                    Ok(doc) => {
-                        self.index_and_persist(
-                            doc,
-                            &pipeline_options,
-                            &source_label,
-                            existing_id.as_deref(),
-                        )
-                        .await
-                    }
-                    Err(e) => {
-                        tracing::warn!("Failed to index {}: {}", source_label, e);
-                        (
-                            Vec::new(),
-                            vec![FailedItem::new(&source_label, e.to_string())],
-                        )
-                    }
-                }
-            }
-            Ok(IndexAction::IncrementalUpdate {
-                old_tree,
-                existing_id,
-            }) => {
-                info!("Incremental update for: {}", source_label);
-                let pipeline_options = self.build_pipeline_options(options, source);
-                match self
-                    .index_with_retry(source, name, pipeline_options.clone(), Some(&old_tree))
-                    .await
-                {
-                    Ok(mut doc) => {
-                        doc.id = existing_id.clone();
-                        self.index_and_persist(doc, &pipeline_options, &source_label, None)
-                            .await
-                    }
-                    Err(e) => {
-                        tracing::warn!("Incremental update failed for {}: {}", source_label, e);
-                        (
-                            Vec::new(),
-                            vec![FailedItem::new(&source_label, e.to_string())],
-                        )
-                    }
-                }
-            }
-            Err(e) => {
-                tracing::warn!("Failed to resolve action for {}: {}", source_label, e);
-                (
-                    Vec::new(),
-                    vec![FailedItem::new(&source_label, e.to_string())],
-                )
-            }
-        }
-    }
-
-    /// Index with retry on retryable errors.
-    ///
-    /// Reads `config.llm.retry` for backoff parameters.
-    /// Returns `Err` only after all retries are exhausted or the error
-    /// is not retryable.
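-    ///
-    /// Timing sketch under a hypothetical policy of `max_attempts = 3` with
-    /// exponential backoff: attempt 1 at t = 0, attempt 2 after
-    /// `delay_for_attempt(0)`, attempt 3 after a further `delay_for_attempt(1)`,
-    /// then the last error is returned as-is.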
-    async fn index_with_retry(
-        &self,
-        source: &IndexSource,
-        name: Option<&str>,
-        pipeline_options: PipelineOptions,
-        existing_tree: Option<&DocumentTree>,
-    ) -> Result<super::indexed_document::IndexedDocument> {
-        let retry = &self.config.llm.retry;
-        let max_attempts = retry.max_attempts;
-
-        for attempt in 0..max_attempts {
-            let result = if let Some(tree) = existing_tree {
-                self.indexer
-                    .index_with_existing(source, name, pipeline_options.clone(), Some(tree))
-                    .await
-            } else {
-                self.indexer
-                    .index(source, name, pipeline_options.clone())
-                    .await
-            };
-
-            match result {
-                Ok(doc) => return Ok(doc),
-                Err(e) if e.is_retryable() && attempt + 1 < max_attempts => {
-                    let delay = retry.delay_for_attempt(attempt);
-                    tracing::warn!(
-                        attempt,
-                        max_attempts,
-                        ?delay,
-                        "Retryable error indexing, retrying: {e}"
-                    );
-                    tokio::time::sleep(delay).await;
-                }
-                Err(e) => return Err(e),
-            }
-        }
-
-        // Unreachable: loop always returns via Ok/Err branches
-        unreachable!()
-    }
-
-    /// Convert an [`IndexedDocument`] to an [`IndexItem`] and persist it.
-    ///
-    /// If `old_id` is provided, the old document is removed after a
-    /// successful save (atomic save-first, then remove old).
-    async fn index_and_persist(
-        &self,
-        doc: super::indexed_document::IndexedDocument,
-        pipeline_options: &PipelineOptions,
-        source_label: &str,
-        old_id: Option<&str>,
-    ) -> (Vec<IndexItem>, Vec<FailedItem>) {
-        let item = Self::build_index_item(&doc);
-
-        info!("[index] Persisting document '{}'...", doc.name);
-        let persisted = IndexerClient::to_persisted(doc, pipeline_options).await;
-
-        if let Err(e) = self.workspace.save(&persisted).await {
-            warn!("[index] Failed to save document: {}", e);
-            return (
-                Vec::new(),
-                vec![FailedItem::new(source_label, e.to_string())],
-            );
-        }
-        // Clean up old document after successful save
-        if let Some(old_id) = old_id {
-            if let Err(e) = self.workspace.remove(old_id).await {
-                warn!("Failed to remove old document {}: {}", old_id, e);
-            }
-        }
-
-        info!("[index] Document persisted: {}", item.doc_id);
-        (vec![item], Vec::new())
-    }
-
-    /// Build an [`IndexItem`] from an [`IndexedDocument`](super::indexed_document::IndexedDocument).
-    fn build_index_item(doc: &super::indexed_document::IndexedDocument) -> IndexItem {
-        IndexItem::new(
-            doc.id.clone(),
-            doc.name.clone(),
-            doc.format.clone(),
-            doc.description.clone(),
-            doc.page_count,
-        )
-        .with_source_path(
-            doc.source_path
-                .as_ref()
-                .map(|p| p.to_string_lossy().to_string())
-                .unwrap_or_default(),
-        )
-        .with_metrics_opt(doc.metrics.clone())
-    }
-
-    // ============================================================
-    // Understanding Engine API
-    // ============================================================
-
-    /// Understand a document — parse, analyze, and persist.
-    ///
-    /// Returns a [`crate::document::DocumentInfo`] with summary, structure, and concepts.
-    /// The engine builds a full understanding including tree, navigation index,
-    /// reasoning index, summary, and key concepts.
-    pub async fn ingest(&self, input: IngestInput) -> Result<crate::document::DocumentInfo> {
-        let ctx = match &input {
-            IngestInput::Path(path) => IndexContext::from_path(path),
-            IngestInput::Bytes { data, format, .. } => IndexContext::from_bytes(data.clone(), *format),
-            IngestInput::Text { content, .. } => {
-                IndexContext::from_content(content, crate::index::parse::DocumentFormat::Markdown)
-            }
-        };
-
-        let result = self.ingest_pipeline(ctx).await?;
-
-        let doc_id = result
-            .doc_id()
-            .ok_or_else(|| Error::Config("ingest produced no results".into()))?
-    pub async fn ingest(&self, input: IngestInput) -> Result<DocumentInfo> {
-        let ctx = match &input {
-            IngestInput::Path(path) => IndexContext::from_path(path),
-            IngestInput::Bytes { data, format, .. } => IndexContext::from_bytes(data.clone(), *format),
-            IngestInput::Text { content, .. } => {
-                IndexContext::from_content(content, crate::index::parse::DocumentFormat::Markdown)
-            }
-        };
-
-        let result = self.ingest_pipeline(ctx).await?;
-
-        let doc_id = result
-            .doc_id()
-            .ok_or_else(|| Error::Config("ingest produced no results".into()))?
-            .to_string();
-
-        // Load the persisted document to build DocumentInfo
-        let persisted = self
-            .workspace
-            .load(&doc_id)
-            .await?
-            .ok_or_else(|| Error::Config("Document not found after ingest".into()))?;
-
-        let doc = Self::persisted_to_understanding_document(persisted);
-        Ok(doc.info())
-    }
-
-    /// Ask a question — returns a reasoned answer with evidence and trace.
-    ///
-    /// - `input`: the question (required)
-    /// - `ids`: document IDs to search. Empty = search all documents.
-    ///
-    /// Always returns an [`Answer`] with content, evidence, confidence, and
-    /// a mandatory reasoning trace.
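-    ///
-    /// # Example
-    ///
-    /// A minimal sketch (the question is illustrative; an empty ID slice
-    /// searches every document in the workspace):
-    ///
-    /// ```rust,ignore
-    /// let answer = engine.ask("What is the total revenue?", &[]).await?;
-    /// println!("{} (confidence {:.2})", answer.content, answer.confidence);
-    /// for step in &answer.trace.steps {
-    ///     println!("round {}: {}", step.round, step.action);
-    /// }
-    /// ```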
-    pub async fn ask(&self, input: &str, ids: &[String]) -> Result<Answer> {
-        // Resolve doc IDs
-        let doc_ids = if ids.is_empty() {
-            let docs = self.list_documents().await?;
-            if docs.is_empty() {
-                return Err(Error::Config("Workspace is empty".into()));
-            }
-            docs.into_iter().map(|d| d.doc_id).collect::<Vec<_>>()
-        } else {
-            ids.to_vec()
-        };
-
-        // Load documents
-        let (documents, failed) = self.load_documents(&doc_ids).await?;
-        if documents.is_empty() {
-            return Err(Error::Config(format!(
-                "No documents available: {} failures",
-                failed.len()
-            )));
-        }
-
-        // Build DocContexts from Documents and dispatch
-        let doc_contexts: Vec<_> = documents
-            .iter()
-            .map(|doc| doc.as_context())
-            .collect();
-
-        let skip_analysis = !ids.is_empty();
-        let scope = if skip_analysis {
-            crate::agent::Scope::Specified(doc_contexts)
-        } else {
-            crate::agent::Scope::Workspace(crate::agent::WorkspaceContext::new(doc_contexts))
-        };
-
-        let emitter = crate::agent::EventEmitter::noop();
-        let config = self.retriever.config().clone();
-        let llm = self.retriever.llm().clone();
-        let output =
-            crate::retrieval::dispatcher::dispatch(input, scope, &config, &llm, &emitter).await?;
-
-        // Convert Output -> Answer
-        Ok(Self::output_to_answer(&output))
-    }
-
-    /// Remove a document from the workspace.
-    pub async fn forget(&self, doc_id: &str) -> Result<()> {
-        self.workspace.remove(doc_id).await?;
-        Ok(())
-    }
-
-    /// List all understood documents.
-    ///
-    /// Returns [`Vec<DocumentInfo>`] with summary, structure, and concepts
-    /// for each document.
-    pub async fn list_documents(&self) -> Result<Vec<DocumentInfo>> {
-        let ids = self.workspace.inner().list_documents().await;
-        let mut result = Vec::new();
-        for id in ids {
-            match self.workspace.load(&id).await {
-                Ok(Some(persisted)) => {
-                    result.push(Self::persisted_to_understanding_document(persisted).info());
-                }
-                Ok(None) => {
-                    tracing::warn!(doc_id = %id, "Document in index but not in storage");
-                }
-                Err(e) => {
-                    tracing::warn!(doc_id = %id, error = %e, "Failed to load document");
-                }
-            }
-        }
-        Ok(result)
-    }
-
-    // ============================================================
-    // Utility Methods
-    // ============================================================
-
-    /// Check if a document exists in the workspace.
-    pub async fn exists(&self, doc_id: &str) -> Result<bool> {
-        self.workspace.exists(doc_id).await
-    }
-
-    /// Remove all documents from the workspace.
-    ///
-    /// Returns the number of documents removed.
-    pub async fn clear(&self) -> Result<usize> {
-        self.workspace.clear().await
-    }
-
-    /// Get the cross-document relationship graph.
-    ///
-    /// The graph is automatically rebuilt after indexing documents.
-    /// Returns `None` if no graph has been built yet.
-    pub async fn get_graph(&self) -> Result<Option<crate::graph::DocumentGraph>> {
-        self.workspace.get_graph().await
-    }
-
-    /// Generate a complete metrics report.
-    ///
-    /// Returns a [`MetricsReport`](crate::metrics::MetricsReport) containing
-    /// LLM usage and retrieval operation metrics.
-    pub fn metrics_report(&self) -> crate::metrics::MetricsReport {
-        self.metrics_hub.generate_report()
-    }
-
-    // ============================================================
-    // Internal: type conversions
-    // ============================================================
-
-    /// Convert a PersistedDocument to a Document (understanding type).
-    fn persisted_to_understanding_document(persisted: PersistedDocument) -> UnderstandingDocument {
-        let nav_index = persisted.navigation_index.unwrap_or_default();
-        let reasoning_index = persisted.reasoning_index.unwrap_or_default();
-        let tree = persisted.tree;
-
-        let section_count = tree.node_count();
-
-        UnderstandingDocument {
-            doc_id: persisted.meta.id,
-            name: persisted.meta.name,
-            format: persisted.meta.format,
-            source_path: persisted.meta.source_path.map(|p| p.to_string_lossy().to_string()),
-            tree,
-            nav_index,
-            reasoning_index,
-            summary: persisted.meta.description.unwrap_or_default(),
-            concepts: persisted.concepts,
-            page_count: persisted.meta.page_count,
-            section_count,
-        }
-    }
-
-    /// Convert agent Output to public Answer type.
-    fn output_to_answer(output: &crate::agent::Output) -> Answer {
-        // Build evidence
-        let evidence: Vec<Evidence> = output
-            .evidence
-            .iter()
-            .map(|e| Evidence {
-                content: e.content.clone(),
-                source_path: e.source_path.clone(),
-                doc_name: e.doc_name.clone().unwrap_or_default(),
-                relevance: 0.0,
-            })
-            .collect();
-
-        Answer {
-            content: output.answer.clone(),
-            evidence,
-            confidence: output.confidence,
-            trace: ReasoningTrace {
-                steps: output.trace_steps.clone(),
-            },
-        }
-    }
-
-    // ============================================================
-    // Internal
-    // ============================================================
-
-    /// Load documents by ID, returning loaded artifacts and failures.
-    async fn load_documents(
-        &self,
-        doc_ids: &[String],
-    ) -> Result<(Vec<UnderstandingDocument>, Vec<FailedItem>)> {
-        let mut documents = Vec::new();
-        let mut failed = Vec::new();
-        for doc_id in doc_ids {
-            match self.workspace.load(doc_id).await {
-                Ok(Some(doc)) => {
-                    documents.push(Self::persisted_to_understanding_document(doc));
-                }
-                Ok(None) => {
-                    failed.push(FailedItem::new(doc_id, "Document not found"));
-                }
-                Err(e) => {
-                    failed.push(FailedItem::new(doc_id, &e.to_string()));
-                }
-            }
-        }
-        Ok((documents, failed))
-    }
-
-    /// Run a future with an optional timeout.
-    /// If `timeout_secs` is `Some`, wraps the future in `tokio::time::timeout`.
-    async fn with_timeout<F, T>(&self, timeout_secs: Option<u64>, fut: F) -> Result<T>
-    where
-        F: std::future::Future<Output = Result<T>>,
-    {
-        match timeout_secs {
-            Some(secs) => {
-                match tokio::time::timeout(std::time::Duration::from_secs(secs), fut).await {
-                    Ok(result) => result,
-                    Err(_) => Err(Error::Config(format!("Operation timed out after {secs}s"))),
-                }
-            }
-            None => fut.await,
-        }
-    }
-
-    /// Build pipeline options for pipeline execution (with checkpoint dir).
-    ///
-    /// This is the single source of truth for pipeline configuration.
-    fn build_pipeline_options(
-        &self,
-        options: &super::types::IndexOptions,
-        source: &IndexSource,
-    ) -> PipelineOptions {
-        use crate::index::{IndexMode, ReasoningIndexConfig, SummaryStrategy};
-
-        let format = match source {
-            IndexSource::Path(path) => self
-                .indexer
-                .detect_format_from_path(path)
-                .unwrap_or(crate::index::parse::DocumentFormat::Markdown),
-            IndexSource::Content { format, .. } => *format,
-            IndexSource::Bytes { format, .. } => *format,
-        };
-
-        let checkpoint_dir = Some(self.config.storage.checkpoint_dir.clone());
-
-        PipelineOptions {
-            mode: match format {
-                crate::index::parse::DocumentFormat::Markdown => IndexMode::Markdown,
-                crate::index::parse::DocumentFormat::Pdf => IndexMode::Pdf,
-            },
-            generate_ids: options.generate_ids,
-            summary_strategy: if options.generate_summaries {
-                SummaryStrategy::full()
-            } else {
-                SummaryStrategy::none()
-            },
-            generate_description: options.generate_description,
-            checkpoint_dir,
-            reasoning_index: ReasoningIndexConfig {
-                enable_synonym_expansion: options.enable_synonym_expansion,
-                ..ReasoningIndexConfig::default()
-            },
-            concurrency: self.config.llm.throttle.to_runtime_config(),
-            ..Default::default()
-        }
-    }
-
-    /// Resolve what action to take for a source.
-    async fn resolve_index_action(
-        &self,
-        source: &IndexSource,
-        options: &super::types::IndexOptions,
-    ) -> Result<IndexAction> {
-        let workspace = &self.workspace;
-
-        // Force mode always re-indexes from scratch
-        if options.mode == IndexMode::Force {
-            return Ok(IndexAction::FullIndex { existing_id: None });
-        }
-
-        // Only path sources support incremental indexing
-        let path = match source {
-            IndexSource::Path(p) => p,
-            _ => return Ok(IndexAction::FullIndex { existing_id: None }),
-        };
-
-        // Find if this file has already been indexed
-        let existing_id = match workspace.find_by_source_path(path).await {
-            Some(id) => id,
-            None => return Ok(IndexAction::FullIndex { existing_id: None }), // New file
-        };
-
-        // Default mode: skip if already indexed (no content check)
-        if options.mode == IndexMode::Default {
-            let info = workspace.get_document_info(&existing_id).await?;
-            let (name, format_str, desc, pages) = match info {
-                Some(i) => (i.name, i.format, i.description, i.page_count),
-                None => (String::new(), String::new(), None, None),
-            };
-            return Ok(IndexAction::Skip(incremental::SkipInfo {
-                doc_id: existing_id,
-                name,
-                format: crate::index::parse::DocumentFormat::from_extension(&format_str)
-                    .unwrap_or(crate::index::parse::DocumentFormat::Markdown),
-                description: desc,
-                page_count: pages,
-            }));
-        }
-
-        // Incremental mode: load stored document and delegate to resolver
-        let current_bytes = match tokio::fs::read(path).await {
-            Ok(b) => b,
-            Err(_) => return Ok(IndexAction::FullIndex { existing_id: None }),
-        };
-
-        let stored_doc = match workspace.load(&existing_id).await? {
-            Some(d) => d,
-            None => return Ok(IndexAction::FullIndex { existing_id: None }),
-        };
-
-        let format = crate::index::parse::DocumentFormat::from_extension(&stored_doc.meta.format)
-            .unwrap_or(crate::index::parse::DocumentFormat::Markdown);
-        let pipeline_options = self.build_pipeline_options(options, source);
-
-        // If logic fingerprint changed, remove old doc before full reprocess
-        let action =
-            incremental::resolve_action(&current_bytes, &stored_doc, &pipeline_options, format);
-
-        // Note: if FullIndex, old doc cleanup happens in process_source()
-        // after successful save (save-first, then remove old).
-
-        Ok(action)
-    }
-
-    /// Rebuild the document graph after indexing, if graph is enabled.
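-    ///
-    /// # Example
-    ///
-    /// A sketch of the intended call pattern (internal — the engine invokes
-    /// this after indexing; users read the result via `get_graph`):
-    ///
-    /// ```rust,ignore
-    /// engine.rebuild_graph().await?;
-    /// if let Some(graph) = engine.get_graph().await? {
-    ///     println!("{} nodes, {} edges", graph.node_count(), graph.edge_count());
-    /// }
-    /// ```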
-    async fn rebuild_graph(&self) -> Result<()> {
-        if !self.config.graph.enabled {
-            return Ok(());
-        }
-
-        // Load all documents in parallel and extract keyword profiles
-        let doc_ids = self.workspace.inner().list_documents().await;
-        info!(
-            doc_count = doc_ids.len(),
-            "Loading documents for graph rebuild"
-        );
-        let concurrency = self.config.llm.throttle.max_concurrent_requests;
-
-        let doc_ids_clone: Vec<String> = doc_ids.iter().cloned().collect();
-        let loaded: Vec<(String, Result<Option<PersistedDocument>>)> =
-            futures::stream::iter(doc_ids_clone.into_iter())
-                .map(|doc_id| {
-                    let ws = self.workspace.clone();
-                    async move {
-                        let result = ws.load(&doc_id).await;
-                        (doc_id, result)
-                    }
-                })
-                .buffer_unordered(concurrency)
-                .collect()
-                .await;
-
-        let mut failed_count = 0usize;
-        let mut loaded_docs: Vec<PersistedDocument> = Vec::new();
-        for (doc_id, result) in loaded {
-            match result {
-                Ok(Some(doc)) => loaded_docs.push(doc),
-                Ok(None) => {
-                    warn!(
-                        doc_id,
-                        "Document in meta index but not in backend during graph rebuild"
-                    );
-                    failed_count += 1;
-                }
-                Err(e) => {
-                    warn!(doc_id, error = %e, "Failed to load document for graph rebuild");
-                    failed_count += 1;
-                }
-            }
-        }
-
-        info!(
-            loaded = loaded_docs.len(),
-            failed = failed_count,
-            "Documents loaded for graph rebuild"
-        );
-
-        let mut builder = crate::graph::DocumentGraphBuilder::new(self.config.graph.clone());
-        for doc in &loaded_docs {
-            let keywords = Self::extract_keywords_from_doc(doc);
-            builder.add_document(
-                &doc.meta.id,
-                &doc.meta.name,
-                &doc.meta.format,
-                doc.meta.node_count,
-                keywords,
-            );
-        }
-
-        let graph = builder.build();
-        info!(
-            nodes = graph.node_count(),
-            edges = graph.edge_count(),
-            "Graph built, persisting"
-        );
-        self.workspace.set_graph(&graph).await?;
-        Ok(())
-    }
-
-    /// Extract keyword -> weight map from a persisted document's ReasoningIndex.
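-    ///
-    /// # Example
-    ///
-    /// Each keyword's weight is the arithmetic mean over its index entries;
-    /// a small sketch with made-up numbers:
-    ///
-    /// ```rust,ignore
-    /// // Entries with weights 0.9, 0.6, and 0.3 average to 0.6.
-    /// let weights = [0.9_f32, 0.6, 0.3];
-    /// let mean = weights.iter().sum::<f32>() / weights.len().max(1) as f32;
-    /// assert!((mean - 0.6).abs() < 1e-6);
-    /// ```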
-    fn extract_keywords_from_doc(doc: &PersistedDocument) -> HashMap<String, f32> {
-        let mut keywords = HashMap::new();
-        if let Some(ref ri) = doc.reasoning_index {
-            for (kw, entries) in ri.all_topic_entries() {
-                let weight: f32 =
-                    entries.iter().map(|e| e.weight).sum::<f32>() / entries.len().max(1) as f32;
-                keywords.insert(kw.clone(), weight);
-            }
-        }
-        keywords
-    }
-}
-
-impl Clone for Engine {
-    fn clone(&self) -> Self {
-        Self {
-            config: Arc::clone(&self.config),
-            indexer: self.indexer.clone(),
-            retriever: self.retriever.clone(),
-            workspace: self.workspace.clone(),
-            metrics_hub: Arc::clone(&self.metrics_hub),
-        }
-    }
-}
-
-impl std::fmt::Debug for Engine {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("Engine").finish_non_exhaustive()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::client::types::IndexMode;
-
-    // -- resolve_index_action Default mode ----------------------------------------------
-
-    // We can't call resolve_index_action without a workspace, but we can
-    // verify the IndexMode equality logic used inside.
-    #[test]
-    fn test_index_mode_force_skips_incremental() {
-        let mode = IndexMode::Force;
-        assert_eq!(mode, IndexMode::Force);
-        assert_ne!(mode, IndexMode::Default);
-        assert_ne!(mode, IndexMode::Incremental);
-    }
-
-    // -- build_index_item ----------------------------------------------------------------
-
-    // `build_index_item` only transforms data -- no I/O.
-    use crate::client::indexed_document::IndexedDocument;
-
-    fn make_doc() -> IndexedDocument {
-        IndexedDocument::new("test-id", crate::index::parse::DocumentFormat::Markdown)
-            .with_name("test.md")
-            .with_description("test doc")
-            .with_source_path(std::path::PathBuf::from("/tmp/test.md"))
-    }
-
-    #[test]
-    fn test_build_index_item() {
-        let doc = make_doc();
-        let item = Engine::build_index_item(&doc);
-
-        assert_eq!(item.doc_id, "test-id");
-        assert_eq!(item.name, "test.md");
-        assert_eq!(item.format, crate::index::parse::DocumentFormat::Markdown);
-        assert_eq!(item.description, Some("test doc".to_string()));
-        assert_eq!(item.source_path, Some("/tmp/test.md".to_string()));
-        assert!(item.metrics.is_none());
-    }
-
-    #[test]
-    fn test_build_index_item_no_source_path() {
-        let doc = IndexedDocument::new("id", crate::index::parse::DocumentFormat::Pdf);
-        let item = Engine::build_index_item(&doc);
-
-        assert_eq!(item.source_path, Some(String::new())); // unwrap_or_default
-        assert_eq!(item.format, crate::index::parse::DocumentFormat::Pdf);
-    }
-}
diff --git a/vectorless-core/vectorless/src/client/index_context.rs b/vectorless-core/vectorless/src/client/index_context.rs
deleted file mode 100644
index aa042fe3..00000000
--- a/vectorless-core/vectorless/src/client/index_context.rs
+++ /dev/null
@@ -1,363 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Index context for document indexing operations.
-//!
-//! [`IndexContext`] supports single or multiple document sources:
-//! - **File path** — Load and parse a file from disk
-//! - **Content string** — Parse content directly (HTML, Markdown, text)
-//! - **Byte data** — Parse binary data (PDF, DOCX)
-//!
-//! # Single document
-//!
-//! ```rust,no_run
-//! use vectorless::client::IndexContext;
-//!
-//! let ctx = IndexContext::from_path("./document.md");
-//! ```
-//!
-//! # Multiple documents
-//!
-//! ```rust,no_run
-//! use vectorless::client::IndexContext;
-//!
-//! let ctx = IndexContext::from_paths(vec!["./doc1.md", "./doc2.pdf"]);
-//! ```
-//!
-//! # From directory
-//!
-//! ```rust,no_run
-//! use vectorless::client::IndexContext;
-//!
-//! // Non-recursive (top-level only)
-//! let ctx = IndexContext::from_dir("./documents", false);
-//!
-//! // Recursive (includes subdirectories)
-//! let ctx = IndexContext::from_dir("./documents", true);
-//! ```
-
-use std::path::PathBuf;
-
-use crate::document::DocumentFormat;
-
-use super::types::{IndexMode, IndexOptions};
-
-// ============================================================
-// Index Source
-// ============================================================
-
-/// The source of document content for indexing.
-#[derive(Debug, Clone)]
-pub(crate) enum IndexSource {
-    /// Load document from a file path.
-    Path(PathBuf),
-
-    /// Parse document from a string.
-    Content {
-        data: String,
-        format: DocumentFormat,
-    },
-
-    /// Parse document from binary data.
-    Bytes {
-        data: Vec<u8>,
-        format: DocumentFormat,
-    },
-}
-
-// ============================================================
-// Index Context
-// ============================================================
-
-/// Context for document indexing operations.
-///
-/// Supports single or multiple document sources. When multiple sources
-/// are provided, each is indexed independently and the results are
-/// collected into [`IndexResult`](super::IndexResult).
-///
-/// # Examples
-///
-/// ```rust,no_run
-/// use vectorless::client::IndexContext;
-/// use vectorless::client::DocumentFormat;
-///
-/// # #[tokio::main]
-/// # async fn main() -> Result<(), Box<dyn std::error::Error>> {
-/// # let engine = vectorless::EngineBuilder::new().build().await?;
-/// // Single file
-/// let result = engine.index(IndexContext::from_path("./doc.md")).await?;
-///
-/// // Multiple files
-/// let result = engine.index(
-///     IndexContext::from_paths(vec!["./doc1.md", "./doc2.pdf"])
-/// ).await?;
-///
-/// // Entire directory
-/// let result = engine.index(IndexContext::from_dir("./docs", false)).await?;
-/// # Ok(())
-/// # }
-/// ```
-#[derive(Debug, Clone)]
-pub struct IndexContext {
-    /// Document sources (supports multiple).
-    pub(crate) sources: Vec<IndexSource>,
-
-    /// Optional document name for metadata (single-source only).
-    pub(crate) name: Option<String>,
-
-    /// Indexing options.
-    pub(crate) options: IndexOptions,
-}
-
-impl IndexContext {
-    /// Create from a single file path.
-    ///
-    /// The document format is automatically detected from the file extension.
-    pub fn from_path(path: impl Into<PathBuf>) -> Self {
-        Self {
-            sources: vec![IndexSource::Path(path.into())],
-            name: None,
-            options: IndexOptions::default(),
-        }
-    }
-
-    /// Create from multiple file paths.
-    pub fn from_paths(paths: impl IntoIterator<Item = impl Into<PathBuf>>) -> Self {
-        Self {
-            sources: paths
-                .into_iter()
-                .map(|p| IndexSource::Path(p.into()))
-                .collect(),
-            name: None,
-            options: IndexOptions::default(),
-        }
-    }
-
-    /// Create from a directory path.
-    ///
-    /// Indexes all supported files in the directory.
-    /// Supported extensions: `.md`, `.pdf`.
-    ///
-    /// Set `recursive` to `true` to include subdirectories.
-    pub fn from_dir(dir: impl Into<PathBuf>, recursive: bool) -> Self {
-        Self::scan_dir(dir, recursive)
-    }
-
-    /// Internal: scan a directory for supported document files.
-    fn scan_dir(dir: impl Into<PathBuf>, recursive: bool) -> Self {
-        let dir = dir.into();
-        let supported_extensions = DocumentFormat::SUPPORTED_EXTENSIONS;
-
-        if !dir.exists() {
-            tracing::warn!("Directory not found: {}", dir.display());
-        }
-
-        let mut sources = Vec::new();
-        Self::collect_files(&dir, &supported_extensions, recursive, &mut sources);
-
-        Self {
-            sources,
-            name: None,
-            options: IndexOptions::default(),
-        }
-    }
-
-    /// Recursively or non-recursively collect supported files.
-    fn collect_files(
-        dir: &std::path::Path,
-        extensions: &[&str],
-        recursive: bool,
-        sources: &mut Vec<IndexSource>,
-    ) {
-        if let Ok(entries) = std::fs::read_dir(dir) {
-            let mut subdirs = Vec::new();
-            for entry in entries.flatten() {
-                let path = entry.path();
-                if path.is_dir() {
-                    if recursive {
-                        subdirs.push(path);
-                    }
-                } else if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
-                    if extensions.contains(&ext.to_lowercase().as_str()) {
-                        sources.push(IndexSource::Path(path));
-                    }
-                }
-            }
-            for subdir in subdirs {
-                Self::collect_files(&subdir, extensions, recursive, sources);
-            }
-        }
-    }
-
-    /// Create from a content string.
-    pub fn from_content(content: impl Into<String>, format: DocumentFormat) -> Self {
-        Self {
-            sources: vec![IndexSource::Content {
-                data: content.into(),
-                format,
-            }],
-            name: None,
-            options: IndexOptions::default(),
-        }
-    }
-
-    /// Create from binary data.
-    pub fn from_bytes(bytes: Vec<u8>, format: DocumentFormat) -> Self {
-        Self {
-            sources: vec![IndexSource::Bytes {
-                data: bytes,
-                format,
-            }],
-            name: None,
-            options: IndexOptions::default(),
-        }
-    }
-
-    /// Set the document name (single-source only).
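-    ///
-    /// # Example
-    ///
-    /// A small builder sketch (the path and name are illustrative):
-    ///
-    /// ```rust,ignore
-    /// use vectorless::client::IndexContext;
-    ///
-    /// // The explicit name overrides the file stem in document metadata.
-    /// let ctx = IndexContext::from_path("./report.md").with_name("Q3 Report");
-    /// assert_eq!(ctx.name(), Some("Q3 Report"));
-    /// ```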
-    pub fn with_name(mut self, name: impl Into<String>) -> Self {
-        self.name = Some(name.into());
-        self
-    }
-
-    /// Set the indexing options.
-    pub fn with_options(mut self, options: IndexOptions) -> Self {
-        self.options = options;
-        self
-    }
-
-    /// Set the indexing mode.
-    pub fn with_mode(mut self, mode: IndexMode) -> Self {
-        self.options.mode = mode;
-        self
-    }
-
-    /// Number of document sources.
-    pub fn len(&self) -> usize {
-        self.sources.len()
-    }
-
-    /// Check if there are no sources.
-    pub fn is_empty(&self) -> bool {
-        self.sources.is_empty()
-    }
-
-    /// Get the document name, if set.
-    pub fn name(&self) -> Option<&str> {
-        self.name.as_deref()
-    }
-
-    /// Get the indexing options.
-    pub fn options(&self) -> &IndexOptions {
-        &self.options
-    }
-}
-
-impl From<PathBuf> for IndexContext {
-    fn from(path: PathBuf) -> Self {
-        Self::from_path(path)
-    }
-}
-
-impl From<&std::path::Path> for IndexContext {
-    fn from(path: &std::path::Path) -> Self {
-        Self::from_path(path.to_path_buf())
-    }
-}
-
-impl From<&str> for IndexContext {
-    fn from(path: &str) -> Self {
-        Self::from_path(path)
-    }
-}
-
-impl From<String> for IndexContext {
-    fn from(path: String) -> Self {
-        Self::from_path(path)
-    }
-}
-
-impl std::fmt::Display for IndexSource {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            IndexSource::Path(p) => write!(f, "path:{}", p.display()),
-            IndexSource::Content { format, .. } => write!(f, "content:{}", format.extension()),
-            IndexSource::Bytes { format, .. } => write!(f, "bytes:{}", format.extension()),
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_from_path() {
-        let ctx = IndexContext::from_path("./test.md");
-        assert_eq!(ctx.len(), 1);
-        assert!(ctx.name.is_none());
-    }
-
-    #[test]
-    fn test_from_paths() {
-        let ctx = IndexContext::from_paths(vec!["./a.md", "./b.pdf"]);
-        assert_eq!(ctx.len(), 2);
-    }
-
-    #[test]
-    fn test_from_content() {
-        let ctx = IndexContext::from_content("# Title", DocumentFormat::Markdown);
-        assert_eq!(ctx.len(), 1);
-    }
-
-    #[test]
-    fn test_from_bytes() {
-        let ctx = IndexContext::from_bytes(vec![1, 2, 3], DocumentFormat::Pdf);
-        assert_eq!(ctx.len(), 1);
-    }
-
-    #[test]
-    fn test_with_name() {
-        let ctx = IndexContext::from_path("./test.md").with_name("My Document");
-        assert_eq!(ctx.name(), Some("My Document"));
-    }
-
-    #[test]
-    fn test_with_mode() {
-        let ctx = IndexContext::from_path("./test.md").with_mode(IndexMode::Force);
-        assert_eq!(ctx.options.mode, IndexMode::Force);
-    }
-
-    #[test]
-    fn test_from_path_trait() {
-        let ctx = IndexContext::from(PathBuf::from("./test.md"));
-        assert_eq!(ctx.len(), 1);
-    }
-
-    #[test]
-    fn test_from_dir_with_recursive() {
-        // Create a temp directory structure:
-        //   tmp/
-        //     a.md
-        //     sub/
-        //       b.md
-        //       deep/
-        //         c.pdf
-        let tmp = std::env::temp_dir().join("vectorless_test_dir_recursive");
-        let _ = std::fs::remove_dir_all(&tmp);
-        std::fs::create_dir_all(tmp.join("sub/deep")).unwrap();
-        std::fs::write(tmp.join("a.md"), "# A").unwrap();
-        std::fs::write(tmp.join("sub/b.md"), "# B").unwrap();
-        std::fs::write(tmp.join("sub/deep/c.pdf"), b"%PDF").unwrap();
-        std::fs::write(tmp.join("sub/deep/ignore.dat"), b"xxx").unwrap();
-
-        // Non-recursive: only top-level
-        let ctx = IndexContext::from_dir(&tmp, false);
-        assert_eq!(ctx.len(), 1); // only a.md
-
-        // Recursive: all levels
-        let ctx = IndexContext::from_dir(&tmp, true);
-        assert_eq!(ctx.len(), 3); // a.md, b.md, c.pdf
-
-        let _ = std::fs::remove_dir_all(&tmp);
-    }
-}
diff --git a/vectorless-core/vectorless/src/client/indexed_document.rs b/vectorless-core/vectorless/src/client/indexed_document.rs
deleted file mode 100644
index 24ec0d6f..00000000
--- a/vectorless-core/vectorless/src/client/indexed_document.rs
+++ /dev/null
@@ -1,130 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Internal intermediate type produced by the indexing pipeline.
-//!
-//! [`IndexedDocument`] is an internal-only type that carries data from
-//! [`IndexerClient`](super::indexer::IndexerClient) to [`Engine`](super::Engine).
-//! It is **not** part of the public API.
-
-use std::path::PathBuf;
-
-use crate::document::DocumentFormat;
-use crate::document::DocumentTree;
-use crate::metrics::IndexMetrics;
-use crate::storage::PageContent;
-
-/// An indexed document with its tree structure and metadata.
-///
-/// Internal intermediate produced by the indexing pipeline and consumed
-/// by [`Engine`](super::Engine) to create a [`PersistedDocument`](crate::storage::PersistedDocument).
-#[derive(Debug, Clone)]
-pub(crate) struct IndexedDocument {
-    /// Unique document identifier.
-    pub id: String,
-
-    /// Document format.
-    pub format: DocumentFormat,
-
-    /// Document name/title.
-    pub name: String,
-
-    /// Document description (generated by LLM).
-    pub description: Option<String>,
-
-    /// Source file path.
-    pub source_path: Option<PathBuf>,
-
-    /// Page count (for PDFs).
-    pub page_count: Option<usize>,
-
-    /// The document tree structure.
-    pub tree: Option<DocumentTree>,
-
-    /// Per-page content (for PDFs).
-    pub pages: Vec<PageContent>,
-
-    /// Indexing pipeline metrics.
-    pub metrics: Option<IndexMetrics>,
-
-    /// Pre-computed reasoning index for retrieval acceleration.
-    pub reasoning_index: Option<crate::document::ReasoningIndex>,
-
-    /// Pre-computed navigation index for agent-based retrieval.
-    pub navigation_index: Option<crate::document::NavigationIndex>,
-
-    /// Key concepts extracted from the document.
-    pub concepts: Vec<crate::document::Concept>,
-}
-
-impl IndexedDocument {
-    /// Create a new indexed document.
-    pub fn new(id: impl Into<String>, format: DocumentFormat) -> Self {
-        Self {
-            id: id.into(),
-            format,
-            name: String::new(),
-            description: None,
-            source_path: None,
-            page_count: None,
-            tree: None,
-            pages: Vec::new(),
-            metrics: None,
-            reasoning_index: None,
-            navigation_index: None,
-            concepts: Vec::new(),
-        }
-    }
-
-    /// Set the document name.
-    pub fn with_name(mut self, name: impl Into<String>) -> Self {
-        self.name = name.into();
-        self
-    }
-
-    /// Set the document description.
-    pub fn with_description(mut self, desc: impl Into<String>) -> Self {
-        self.description = Some(desc.into());
-        self
-    }
-
-    /// Set the source path.
-    pub fn with_source_path(mut self, path: impl Into<PathBuf>) -> Self {
-        self.source_path = Some(path.into());
-        self
-    }
-
-    /// Set the page count.
-    pub fn with_page_count(mut self, count: usize) -> Self {
-        self.page_count = Some(count);
-        self
-    }
-
-    /// Set the document tree.
-    pub fn with_tree(mut self, tree: DocumentTree) -> Self {
-        self.tree = Some(tree);
-        self
-    }
-
-    /// Set the indexing metrics.
-    pub fn with_metrics(mut self, metrics: IndexMetrics) -> Self {
-        self.metrics = Some(metrics);
-        self
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_indexed_document() {
-        let doc = IndexedDocument::new("doc-1", DocumentFormat::Markdown)
-            .with_name("Test Document")
-            .with_description("A test document");
-
-        assert_eq!(doc.id, "doc-1");
-        assert_eq!(doc.name, "Test Document");
-        assert!(doc.tree.is_none());
-    }
-}
diff --git a/vectorless-core/vectorless/src/client/indexer.rs b/vectorless-core/vectorless/src/client/indexer.rs
deleted file mode 100644
index ba4ce979..00000000
--- a/vectorless-core/vectorless/src/client/indexer.rs
+++ /dev/null
@@ -1,387 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Document indexing client.
-//!
-//! This module provides document indexing operations including
-//! format detection, parsing, and tree building.
-//!
-//! # Example
-//!
-//! ```rust,ignore
-//! use vectorless::client::{IndexerClient, IndexContext};
-//!
-//! let indexer = IndexerClient::new(executor);
-//!
-//! let result = indexer
-//!     .index(IndexContext::from_path("./document.md"))
-//!     .await?;
-//!
-//! println!("Indexed: {} ({} nodes)", result.id, result.tree.as_ref().map(|t| t.node_count()).unwrap_or(0));
-//! ```
-
-use std::path::Path;
-use std::sync::Arc;
-
-use tracing::info;
-use uuid::Uuid;
-
-use crate::document::DocumentFormat;
-use crate::error::{Error, Result};
-use crate::index::{IndexInput, IndexMode, PipelineExecutor, PipelineOptions};
-use crate::llm::LlmClient;
-use crate::storage::{DocumentMeta, PersistedDocument};
-
-use super::index_context::IndexSource;
-use super::indexed_document::IndexedDocument;
-use crate::events::{EventEmitter, IndexEvent};
-
-/// Document indexing client.
-///
-/// Provides operations for parsing and indexing documents.
-/// Each index operation creates a fresh pipeline executor, enabling
-/// true parallel document indexing without mutex contention.
-pub(crate) struct IndexerClient {
-    /// Factory for creating pipeline executors (one per index operation).
-    executor_factory: Arc<dyn Fn() -> PipelineExecutor + Send + Sync>,
-
-    /// Event emitter.
-    events: EventEmitter,
-}
-
-impl IndexerClient {
-    /// Create with an LLM-enabled pipeline.
-    pub fn with_llm(client: LlmClient) -> Self {
-        let client = Arc::new(client);
-        Self {
-            executor_factory: Arc::new(move || PipelineExecutor::with_llm((*client).clone())),
-            events: EventEmitter::new(),
-        }
-    }
-
-    /// Create with a custom executor factory (for testing).
-    pub(crate) fn with_factory(factory: Arc<dyn Fn() -> PipelineExecutor + Send + Sync>) -> Self {
-        Self {
-            executor_factory: factory,
-            events: EventEmitter::new(),
-        }
-    }
-
-    /// Create with event emitter.
-    pub fn with_events(mut self, events: EventEmitter) -> Self {
-        self.events = events;
-        self
-    }
-
-    /// Index a document from an index context.
-    ///
-    /// The caller provides fully constructed [`PipelineOptions`]
-    /// (including checkpoint dir, reasoning config, etc.).
-    pub async fn index(
-        &self,
-        source: &IndexSource,
-        name: Option<&str>,
-        pipeline_options: PipelineOptions,
-    ) -> Result<IndexedDocument> {
-        self.index_with_existing(source, name, pipeline_options, None)
-            .await
-    }
-
-    /// Index a document, optionally reusing an existing tree for incremental updates.
-    ///
-    /// The caller provides fully constructed [`PipelineOptions`].
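-    ///
-    /// # Example
-    ///
-    /// A sketch of an incremental re-index (callers normally go through
-    /// [`Engine`](super::Engine); `source`, `opts`, and `old_tree` are
-    /// illustrative bindings):
-    ///
-    /// ```rust,ignore
-    /// let doc = indexer
-    ///     .index_with_existing(&source, None, opts, Some(&old_tree))
-    ///     .await?;
-    /// ```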
-    pub async fn index_with_existing(
-        &self,
-        source: &IndexSource,
-        name: Option<&str>,
-        mut pipeline_options: PipelineOptions,
-        existing_tree: Option<&crate::DocumentTree>,
-    ) -> Result<IndexedDocument> {
-        pipeline_options.existing_tree = existing_tree.cloned();
-        match source {
-            IndexSource::Path(path) => self.index_from_path(path, name, pipeline_options).await,
-            IndexSource::Content { data, format } => {
-                self.index_from_content(data, *format, name, pipeline_options)
-                    .await
-            }
-            IndexSource::Bytes { data, format } => {
-                self.index_from_bytes(data, *format, name, pipeline_options)
-                    .await
-            }
-        }
-    }
-
-    /// Index from a file path.
-    ///
-    /// Uses the format from `PipelineOptions.mode` — no redundant detection.
-    async fn index_from_path(
-        &self,
-        path: &Path,
-        name: Option<&str>,
-        pipeline_options: PipelineOptions,
-    ) -> Result<IndexedDocument> {
-        let path = path.canonicalize().unwrap_or_else(|_| path.to_path_buf());
-
-        // Validate file before indexing
-        let validation = crate::utils::validate_file(&path)?;
-        if !validation.valid {
-            return Err(Error::Parse(
-                validation
-                    .errors
-                    .first()
-                    .cloned()
-                    .unwrap_or_else(|| "Invalid file".to_string()),
-            ));
-        }
-        for warning in &validation.warnings {
-            tracing::warn!("{}", warning);
-        }
-
-        // Resolve format from pipeline options (set by Engine) — no re-detection
-        let format = Self::format_from_mode(&pipeline_options.mode);
-
-        let input = IndexInput::file(&path);
-        self.run_pipeline(
-            input,
-            format,
-            &path.display().to_string(),
-            name,
-            Some(&path),
-            pipeline_options,
-        )
-        .await
-    }
-
-    /// Index from content string.
-    async fn index_from_content(
-        &self,
-        content: &str,
-        format: DocumentFormat,
-        name: Option<&str>,
-        pipeline_options: PipelineOptions,
-    ) -> Result<IndexedDocument> {
-        // Validate content before indexing
-        let validation = crate::utils::validate_content(content, format);
-        if !validation.valid {
-            return Err(Error::Parse(
-                validation
-                    .errors
-                    .first()
-                    .cloned()
-                    .unwrap_or_else(|| "Invalid content".to_string()),
-            ));
-        }
-
-        let input = IndexInput::content(content);
-        self.run_pipeline(
-            input,
-            format,
-            name.unwrap_or("content"),
-            name,
-            None,
-            pipeline_options,
-        )
-        .await
-    }
-
-    /// Index from binary data.
-    async fn index_from_bytes(
-        &self,
-        bytes: &[u8],
-        format: DocumentFormat,
-        name: Option<&str>,
-        pipeline_options: PipelineOptions,
-    ) -> Result<IndexedDocument> {
-        // Validate bytes before indexing
-        let validation = crate::utils::validate_bytes(bytes, format);
-        if !validation.valid {
-            return Err(Error::Parse(
-                validation
-                    .errors
-                    .first()
-                    .cloned()
-                    .unwrap_or_else(|| "Invalid bytes".to_string()),
-            ));
-        }
-
-        info!(
-            "Indexing {:?} document from bytes ({} bytes)",
-            format,
-            bytes.len()
-        );
-
-        let input = IndexInput::bytes(bytes);
-        self.run_pipeline(
-            input,
-            format,
-            name.unwrap_or("bytes"),
-            name,
-            None,
-            pipeline_options,
-        )
-        .await
-    }
-
-    /// Common pipeline execution: emit events → run pipeline → build result.
-    #[tracing::instrument(skip_all, fields(format = ?format, source = %source_label))]
-    async fn run_pipeline(
-        &self,
-        input: IndexInput,
-        format: DocumentFormat,
-        source_label: &str,
-        name: Option<&str>,
-        path: Option<&Path>,
-        pipeline_options: PipelineOptions,
-    ) -> Result<IndexedDocument> {
-        self.events.emit_index(IndexEvent::Started {
-            path: source_label.to_string(),
-        });
-
-        let doc_id = Uuid::new_v4().to_string();
-        self.events
-            .emit_index(IndexEvent::FormatDetected { format });
-
-        info!("Indexing {:?} document: {}", format, source_label);
-
-        let mut executor = (self.executor_factory)();
-        let result = executor.execute(input, pipeline_options).await?;
-
-        self.build_indexed_document(doc_id, result, format, name, path)
-    }
-
-    /// Build indexed document from pipeline result.
-    fn build_indexed_document(
-        &self,
-        doc_id: String,
-        result: crate::index::PipelineResult,
-        format: DocumentFormat,
-        name: Option<&str>,
-        path: Option<&Path>,
-    ) -> Result<IndexedDocument> {
-        let tree = result
-            .tree
-            .ok_or_else(|| Error::Parse("Document tree not generated".to_string()))?;
-
-        let node_count = tree.node_count();
-        self.events.emit_index(IndexEvent::TreeBuilt { node_count });
-
-        let doc_name = name
-            .map(str::to_string)
-            .or_else(|| {
-                path.and_then(|p| p.file_stem())
-                    .map(|s| s.to_string_lossy().to_string())
-            })
-            .unwrap_or_else(|| result.name.clone());
-
-        let mut doc = IndexedDocument::new(&doc_id, format)
-            .with_name(&doc_name)
-            .with_tree(tree)
-            .with_metrics(result.metrics);
-
-        doc.reasoning_index = result.reasoning_index;
-        doc.navigation_index = result.navigation_index;
-        doc.concepts = result.concepts;
-
-        if let Some(p) = path {
-            doc = doc.with_source_path(p);
-        }
-
-        if let Some(desc) = &result.description {
-            doc = doc.with_description(desc);
-        }
-
-        if let Some(page_count) = result.page_count {
-            doc = doc.with_page_count(page_count);
-        }
-
-        info!("Indexing complete: {} ({} nodes)", doc_id, node_count);
-        self.events.emit_index(IndexEvent::Complete { doc_id });
-
-        Ok(doc)
-    }
-
-    /// Resolve `DocumentFormat` from `PipelineOptions.mode`.
-    ///
-    /// Falls back to Markdown for `Auto` mode (the engine resolves
-    /// `Auto` to a concrete format before calling the indexer).
-    fn format_from_mode(mode: &IndexMode) -> DocumentFormat {
-        match mode {
-            IndexMode::Markdown => DocumentFormat::Markdown,
-            IndexMode::Pdf => DocumentFormat::Pdf,
-            IndexMode::Auto => DocumentFormat::Markdown,
-        }
-    }
-
-    /// Detect document format from file extension.
-    pub(crate) fn detect_format_from_path(&self, path: &Path) -> Result<DocumentFormat> {
-        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
-        DocumentFormat::from_extension(ext)
-            .ok_or_else(|| Error::Parse(format!("Unsupported format: {}", ext)))
-    }
-
-    /// Convert [`IndexedDocument`] to [`PersistedDocument`].
-    ///
-    /// This is an associated function — it does not depend on client state.
-    /// Stores content and logic fingerprints from the pipeline options.
-    ///
-    /// Uses async file I/O to avoid blocking the tokio runtime.
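-    ///
-    /// # Example
-    ///
-    /// A sketch of the conversion step as the engine performs it
-    /// (`workspace` is an illustrative binding):
-    ///
-    /// ```rust,ignore
-    /// let persisted = IndexerClient::to_persisted(doc, &pipeline_options).await;
-    /// workspace.save(&persisted).await?;
-    /// ```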
-    pub async fn to_persisted(
-        doc: IndexedDocument,
-        pipeline_options: &PipelineOptions,
-    ) -> PersistedDocument {
-        let mut meta = DocumentMeta::new(&doc.id, &doc.name, doc.format.extension())
-            .with_source_path(
-                doc.source_path
-                    .as_ref()
-                    .map(|p| p.to_string_lossy().to_string())
-                    .unwrap_or_default(),
-            )
-            .with_description(doc.description.clone().unwrap_or_default());
-
-        // Compute content fingerprint for incremental indexing (async I/O)
-        if let Some(ref path) = doc.source_path {
-            if let Ok(bytes) = tokio::fs::read(path).await {
-                let fp = crate::utils::fingerprint::Fingerprint::from_bytes(&bytes);
-                meta = meta.with_fingerprint(fp);
-            }
-        }
-
-        // Store logic fingerprint (pipeline configuration hash)
-        let logic_fp = pipeline_options.logic_fingerprint();
-        meta = meta.with_logic_fingerprint(logic_fp);
-
-        let tree = doc.tree.expect("IndexedDocument must have a tree");
-
-        // Extract stats from metrics
-        let node_count = tree.node_count();
-        let (summary_tokens, duration_ms) = if let Some(ref m) = doc.metrics {
-            (m.total_tokens_generated, m.total_time_ms())
-        } else {
-            (0, 0)
-        };
-
-        let mut persisted = PersistedDocument::new(meta, tree);
-
-        for page in doc.pages {
-            persisted.add_page(page.page, &page.content);
-        }
-
-        persisted.reasoning_index = doc.reasoning_index;
-        persisted.navigation_index = doc.navigation_index;
-        persisted.concepts = doc.concepts;
-        persisted
-            .meta
-            .update_processing_stats(node_count, summary_tokens, duration_ms);
-
-        persisted
-    }
-}
-
-impl Clone for IndexerClient {
-    fn clone(&self) -> Self {
-        Self {
-            executor_factory: Arc::clone(&self.executor_factory),
-            events: self.events.clone(),
-        }
-    }
-}
diff --git a/vectorless-core/vectorless/src/client/mod.rs b/vectorless-core/vectorless/src/client/mod.rs
deleted file mode 100644
index a851b3df..00000000
--- a/vectorless-core/vectorless/src/client/mod.rs
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! High-level client API for document indexing and retrieval.
-//!
-//! This module provides the main entry point for using vectorless:
-//! - [`Engine`] — The main client for indexing and querying documents
-//! - [`EngineBuilder`] — Builder pattern for client configuration
-//! - [`IndexContext`] — Unified input for document indexing
-//! - [`QueryContext`] — Unified input for document queries
-//!
-//! # Quick Start
-//!
-//! ```rust,no_run
-//! use vectorless::client::{EngineBuilder, IndexContext, QueryContext};
-//!
-//! # #[tokio::main]
-//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
-//! // Create a client with default settings
-//! let client = EngineBuilder::new()
-//!     .with_key("sk-...")
-//!     .with_model("gpt-4o")
-//!     .build()
-//!     .await?;
-//!
-//! // Index a document
-//! let result = client.index(IndexContext::from_path("./document.md")).await?;
-//! let doc_id = result.doc_id().unwrap();
-//!
-//! // Query the document
-//! let result = client.query(
-//!     QueryContext::new("What is this?").with_doc_ids(vec![doc_id.to_string()])
-//! ).await?;
-//! if let Some(item) = result.single() {
-//!     println!("{}", item.content);
-//! }
-//!
-//! // List all documents
-//! for doc in client.list().await? {
-//!     println!("{}: {}", doc.id, doc.name);
-//! }
-//! # Ok(())
-//! # }
-//! ```
-//!
-//! # Events and Progress
-//!
-//! Monitor operation progress with events:
-//!
-//! ```rust,no_run
-//! # use vectorless::client::{EngineBuilder, EventEmitter, IndexEvent};
-//! # #[tokio::main]
-//! # async fn main() -> Result<(), Box<dyn std::error::Error>> {
-//! let events = EventEmitter::new()
-//!     .on_index(|e| match e {
-//!         IndexEvent::Complete { doc_id } => println!("Indexed: {}", doc_id),
-//!         _ => {}
-//!     });
-//!
-//! let client = EngineBuilder::new()
-//!     .with_events(events)
-//!     .build()
-//!     .await?;
-//! # Ok(())
-//! # }
-//! ```
-
-mod builder;
-mod engine;
-mod index_context;
-mod indexed_document;
-mod indexer;
-mod query_context;
-mod retriever;
-pub(crate) mod test_support;
-mod types;
-mod workspace;
-
-// ============================================================
-// Main Types
-// ============================================================
-
-pub use builder::{BuildError, EngineBuilder};
-pub use engine::Engine;
-
-// ============================================================
-// Context Types
-// ============================================================
-
-pub use index_context::IndexContext;
-pub use query_context::QueryContext;
-
-// ============================================================
-// Result & Info Types
-// ============================================================
-
-pub use types::{
-    Confidence, EvidenceItem, FailedItem, IndexItem, IndexMode, IndexOptions, IndexResult,
-    QueryMetrics, QueryResult, QueryResultItem,
-};
-
-// ============================================================
-// Parser Types (needed for IndexContext::from_content)
-// ============================================================
-
-pub use crate::document::DocumentFormat;
diff --git a/vectorless-core/vectorless/src/client/query_context.rs b/vectorless-core/vectorless/src/client/query_context.rs
deleted file mode 100644
index 48d9ad2a..00000000
--- a/vectorless-core/vectorless/src/client/query_context.rs
+++ /dev/null
@@ -1,179 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Query context for the Engine API.
-//!
-//! [`QueryContext`] encapsulates all parameters for a query operation,
-//! supporting specific documents or entire workspace queries.
-//!
-//! # Example
-//!
-//! ```rust
-//! use vectorless::client::QueryContext;
-//!
-//! // Query specific documents
-//! let ctx = QueryContext::new("What is the total revenue?")
-//!     .with_doc_ids(vec!["doc-1".to_string()]);
-//!
-//! // Query entire workspace
-//! let ctx = QueryContext::new("Explain the algorithm");
-//! ```
-
-/// Query scope — determines which documents to search.
-#[derive(Debug, Clone)]
-pub(crate) enum QueryScope {
-    /// Query specific documents.
-    Documents(Vec<String>),
-    /// Query all documents in the workspace.
-    Workspace,
-}
-
-/// Context for a query operation.
-///
-/// Supports two scopes:
-/// - **Specific documents** — via `with_doc_ids()`
-/// - **Entire workspace** — default when no scope is set
-///
-/// # Convenience
-///
-/// Implements `From<String>` and `From<&str>` for quick construction:
-///
-/// ```rust
-/// use vectorless::client::QueryContext;
-///
-/// let ctx: QueryContext = "What is this?".into();
-/// ```
-#[derive(Debug, Clone)]
-pub struct QueryContext {
-    /// The query text.
-    pub(crate) query: String,
-    /// Target scope.
-    pub(crate) scope: QueryScope,
-    /// Per-operation timeout (seconds). `None` means no timeout.
-    pub(crate) timeout_secs: Option<u64>,
-    /// Force Orchestrator analysis even when documents are specified.
-    ///
-    /// When `true`, the Orchestrator analyzes DocCards to select relevant
-    /// documents instead of dispatching all specified docs directly.
-    /// Useful when the user wants the system to decide which documents
-    /// (or sections) are most relevant to the query.
-    pub(crate) force_analysis: bool,
-}
-
-impl QueryContext {
-    /// Create a new query context (defaults to workspace scope).
-    pub fn new(query: impl Into<String>) -> Self {
-        Self {
-            query: query.into(),
-            scope: QueryScope::Workspace,
-            timeout_secs: None,
-            force_analysis: false,
-        }
-    }
-
-    /// Set scope to specific documents.
-    ///
-    /// Pass a single ID or multiple IDs to restrict the query
-    /// to those documents only.
-    pub fn with_doc_ids(mut self, doc_ids: Vec<String>) -> Self {
-        self.scope = QueryScope::Documents(doc_ids);
-        self
-    }
-
-    /// Set scope to entire workspace.
-    pub fn with_workspace(mut self) -> Self {
-        self.scope = QueryScope::Workspace;
-        self
-    }
-
-    /// Set per-operation timeout in seconds.
-    pub fn with_timeout_secs(mut self, secs: u64) -> Self {
-        self.timeout_secs = Some(secs);
-        self
-    }
-
-    /// Force the Orchestrator to analyze documents before dispatching Workers.
-    ///
-    /// By default, when documents are specified via `with_doc_ids()`, the
-    /// Orchestrator skips its analysis phase and dispatches Workers to all
-    /// specified documents directly. Setting this to `true` forces the
-    /// Orchestrator to analyze DocCards and decide which documents are
-    /// relevant, even when the user specified documents explicitly.
-    ///
-    /// This is useful when querying across many documents where only a subset
-    /// is likely relevant to the specific question.
-    pub fn with_force_analysis(mut self, force: bool) -> Self {
-        self.force_analysis = force;
-        self
-    }
-}
-
-impl From<String> for QueryContext {
-    fn from(query: String) -> Self {
-        Self::new(query)
-    }
-}
-
-impl From<&str> for QueryContext {
-    fn from(query: &str) -> Self {
-        Self::new(query)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_query_context_new() {
-        let ctx = QueryContext::new("What is this?");
-        assert_eq!(ctx.query, "What is this?");
-    }
-
-    #[test]
-    fn test_query_context_from_string() {
-        let ctx: QueryContext = "Hello".to_string().into();
-        assert_eq!(ctx.query, "Hello");
-    }
-
-    #[test]
-    fn test_query_context_from_str() {
-        let ctx: QueryContext = "Hello".into();
-        assert_eq!(ctx.query, "Hello");
-    }
-
-    #[test]
-    fn test_single_doc_scope() {
-        let ctx = QueryContext::new("test").with_doc_ids(vec!["doc-1".to_string()]);
-        assert!(
-            matches!(ctx.scope, QueryScope::Documents(ref ids) if ids == &["doc-1".to_string()])
-        );
-    }
-
-    #[test]
-    fn test_multi_doc_scope() {
-        let ctx = QueryContext::new("test").with_doc_ids(vec!["a".into(), "b".into()]);
-        assert!(matches!(ctx.scope, QueryScope::Documents(ref ids) if ids.len() == 2));
-    }
-
-    #[test]
-    fn test_workspace_scope() {
-        let ctx = QueryContext::new("test");
-        assert!(matches!(ctx.scope, QueryScope::Workspace));
-    }
-
-    #[test]
-    fn test_builder_options() {
-        let ctx = QueryContext::new("test")
-            .with_doc_ids(vec!["doc-1".to_string()])
-            .with_timeout_secs(60);
-
-        assert_eq!(ctx.timeout_secs, Some(60));
-    }
-
-    #[test]
-    fn test_query_context_timeout_default() {
-        let ctx = QueryContext::new("test");
-        assert_eq!(ctx.timeout_secs, None);
-    }
-}
diff --git a/vectorless-core/vectorless/src/client/retriever.rs b/vectorless-core/vectorless/src/client/retriever.rs
deleted file mode 100644
index 67f53f6b..00000000
--- a/vectorless-core/vectorless/src/client/retriever.rs
+++ /dev/null
@@ -1,140 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Document retrieval client.
-//!
-//! This module provides query and retrieval operations for document content,
-//! dispatching through the retrieval layer to the agent-based system.
-
-use tracing::info;
-
-use crate::agent::{self, config::AgentConfig, events::EventEmitter as AgentEventEmitter};
-use crate::client::types::QueryResult;
-use crate::document::{DocumentTree, NavigationIndex, ReasoningIndex};
-use crate::error::Result;
-use crate::events::{EventEmitter, QueryEvent};
-use crate::llm::LlmClient;
-use crate::retrieval::{dispatcher, postprocessor};
-
-/// Document retrieval client.
-///
-/// Delegates to the agent-based retrieval system.
-pub(crate) struct RetrieverClient {
-    /// LLM client for agent navigation decisions.
-    llm: LlmClient,
-
-    /// Agent configuration.
-    config: AgentConfig,
-
-    /// Event emitter.
-    events: EventEmitter,
-}
-
-impl RetrieverClient {
-    /// Create a new retriever client with an LLM client.
-    pub fn new(llm: LlmClient) -> Self {
-        Self {
-            llm,
-            config: AgentConfig::default(),
-            events: EventEmitter::new(),
-        }
-    }
-
-    /// Create with event emitter.
-    pub fn with_events(mut self, events: EventEmitter) -> Self {
-        self.events = events;
-        self
-    }
-
-    /// Set custom agent configuration.
-    pub fn with_config(mut self, config: AgentConfig) -> Self {
-        self.config = config;
-        self
-    }
-
-    /// Get a reference to the agent configuration.
-    pub fn config(&self) -> &AgentConfig {
-        &self.config
-    }
-
-    /// Get a reference to the LLM client.
-    pub fn llm(&self) -> &LlmClient {
-        &self.llm
-    }
-
-    /// Query documents through the agent-based retrieval system.
-    ///
-    /// - `skip_analysis = true` → `Scope::Specified` (user-specified docs, skip Orchestrator analysis)
-    /// - `skip_analysis = false` → `Scope::Workspace` (full Orchestrator analysis flow)
-    #[tracing::instrument(skip_all, fields(question = %question, docs = documents.len()))]
-    pub async fn query(
-        &self,
-        documents: &[(DocumentTree, NavigationIndex, ReasoningIndex, String)],
-        question: &str,
-        skip_analysis: bool,
-    ) -> Result<QueryResult> {
-        self.events.emit_query(QueryEvent::Started {
-            query: question.to_string(),
-        });
-
-        info!(
-            docs = documents.len(),
-            skip_analysis, "Querying: {:?}", question
-        );
-
-        let doc_contexts: Vec<_> = documents
-            .iter()
-            .map(|(tree, nav, ridx, id)| agent::DocContext {
-                tree,
-                nav_index: nav,
-                reasoning_index: ridx,
-                doc_name: id.as_str(),
-            })
-            .collect();
-
-        let scope = if skip_analysis {
-            agent::Scope::Specified(doc_contexts)
-        } else {
-            agent::Scope::Workspace(agent::WorkspaceContext::new(doc_contexts))
-        };
-
-        let emitter = AgentEventEmitter::noop();
-        let output =
-            dispatcher::dispatch(question, scope, &self.config, &self.llm, &emitter).await?;
-
-        let fallback_id = documents
-            .first()
-            .map(|(_, _, _, id)| id.as_str())
-            .unwrap_or("");
-        let items = postprocessor::to_results(&output, fallback_id);
-        let result = QueryResult::new_with_items(items);
-
-        self.events.emit_query(QueryEvent::Complete {
-            total_results: result.len(),
-            confidence: result.single().map(|i| i.confidence).unwrap_or(0.0),
-        });
-
-        Ok(result)
-    }
-}
-
-impl Clone for RetrieverClient {
-    fn clone(&self) -> Self {
-        Self {
-            llm: self.llm.clone(),
-            config: self.config.clone(),
-            events: self.events.clone(),
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_retriever_client_creation() {
-        let _client =
-            RetrieverClient::new(LlmClient::new(crate::llm::config::LlmConfig::default()));
-    }
-}
diff --git a/vectorless-core/vectorless/src/client/test_support.rs b/vectorless-core/vectorless/src/client/test_support.rs
deleted file mode 100644
index dd443da8..00000000
--- a/vectorless-core/vectorless/src/client/test_support.rs
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Test-only helpers for constructing Engine instances without a real LLM.
-//!
-//! This module is exposed via `vectorless::__test_support` and should **only**
-//! be used in integration tests.
-
-use std::sync::Arc;
-
-use crate::client::engine::Engine;
-use crate::client::indexer::IndexerClient;
-use crate::client::retriever::RetrieverClient;
-use crate::config::Config;
-use crate::events::EventEmitter;
-use crate::index::PipelineExecutor;
-use crate::llm::LlmClient;
-use crate::llm::config::LlmConfig;
-use crate::metrics::MetricsHub;
-use crate::storage::Workspace;
-
-/// Build an `Engine` with a no-LLM pipeline for integration testing.
-///
-/// The pipeline skips enhance/summary stages but exercises:
-/// parse → build → validate → split → enrich → optimize.
-///
-/// # Example
-///
-/// ```rust,ignore
-/// let tmp = tempfile::tempdir().unwrap();
-/// let engine = vectorless::__test_support::build_test_engine(tmp.path()).await;
-/// ```
-pub async fn build_test_engine(workspace_dir: &std::path::Path) -> Engine {
-    let config = Config::default();
-
-    // No-LLM indexer: pipeline without enhance stage
-    let executor_factory: Arc<dyn Fn() -> PipelineExecutor + Send + Sync> =
-        Arc::new(PipelineExecutor::new);
-    let indexer = IndexerClient::with_factory(executor_factory);
-
-    let workspace = Workspace::new(workspace_dir).await.unwrap();
-    let retriever = RetrieverClient::new(LlmClient::new(LlmConfig::default()));
-
-    Engine::with_components(
-        config,
-        workspace,
-        retriever,
-        indexer,
-        EventEmitter::new(),
-        Arc::new(MetricsHub::with_defaults()),
-    )
-    .await
-    .unwrap()
-}
diff --git a/vectorless-core/vectorless/src/client/types.rs b/vectorless-core/vectorless/src/client/types.rs
deleted file mode 100644
index f9977d0a..00000000
--- a/vectorless-core/vectorless/src/client/types.rs
+++ /dev/null
@@ -1,536 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Public API types for the client module.
-//!
-//! This module contains all types exposed in the public API.
-
-use serde::{Deserialize, Serialize};
-
-use crate::document::DocumentFormat;
-use crate::metrics::IndexMetrics;
-
-// ============================================================
-// Partial Success
-// ============================================================
-
-/// A failed item in a batch operation.
-#[derive(Debug, Clone)]
-pub struct FailedItem {
-    /// Source description (file path, content name, or doc ID).
-    pub source: String,
-    /// Error message.
-    pub error: String,
-}
-
-impl FailedItem {
-    /// Create a new failed item.
-    pub fn new(source: impl Into<String>, error: impl Into<String>) -> Self {
-        Self {
-            source: source.into(),
-            error: error.into(),
-        }
-    }
-}
-
-// ============================================================
-// Index Types
-// ============================================================
-
-/// Document indexing behavior mode.
-///
-/// Controls how the indexer handles existing documents and re-indexing.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
-pub enum IndexMode {
-    /// Default mode - skip if already indexed.
-    ///
-    /// If a document with the same source has already been indexed,
-    /// the operation is skipped and the existing document ID is returned.
-    #[default]
-    Default,
-
-    /// Force re-indexing.
-    ///
-    /// Always re-index the document, even if it has been indexed before.
-    /// A new document ID is generated.
-    Force,
-
-    /// Incremental mode - only re-index changed files.
-    ///
-    /// Re-index only if the file has been modified since the last index.
-    /// For content/bytes sources, this behaves like [`IndexMode::Default`].
-    Incremental,
-}
-
-/// Options for indexing a document.
-#[derive(Debug, Clone)]
-pub struct IndexOptions {
-    /// Indexing mode.
-    pub mode: IndexMode,
-
-    /// Whether to generate summaries using LLM.
-    pub generate_summaries: bool,
-
-    /// Whether to generate node IDs.
-    pub generate_ids: bool,
-
-    /// Whether to generate document description.
-    pub generate_description: bool,
-
-    /// Whether to expand keywords with LLM-generated synonyms
-    /// during reasoning index construction. Improves recall for
-    /// queries that use different wording than the document.
-    pub enable_synonym_expansion: bool,
-
-    /// Per-operation timeout (seconds). `None` means no timeout.
-    pub timeout_secs: Option<u64>,
-}
-
-impl Default for IndexOptions {
-    fn default() -> Self {
-        Self {
-            mode: IndexMode::Default,
-            generate_summaries: true,
-            generate_ids: true,
-            generate_description: true,
-            enable_synonym_expansion: true,
-            timeout_secs: None,
-        }
-    }
-}
-
-impl IndexOptions {
-    /// Create new index options with defaults.
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Enable summary generation.
-    pub fn with_summaries(mut self) -> Self {
-        self.generate_summaries = true;
-        self
-    }
-
-    /// Enable document description generation.
-    pub fn with_description(mut self) -> Self {
-        self.generate_description = true;
-        self
-    }
-
-    /// Set the indexing mode.
-    ///
-    /// # Modes
-    ///
-    /// - [`IndexMode::Default`] - Skip if already indexed
-    /// - [`IndexMode::Force`] - Always re-index
-    /// - [`IndexMode::Incremental`] - Only re-index changed files
-    pub fn with_mode(mut self, mode: IndexMode) -> Self {
-        self.mode = mode;
-        self
-    }
-
-    /// Set per-operation timeout in seconds.
-    pub fn with_timeout_secs(mut self, secs: u64) -> Self {
-        self.timeout_secs = Some(secs);
-        self
-    }
-}
-
-// ============================================================
-// Index Result Types
-// ============================================================
-
-/// Result of a document indexing operation.
-#[derive(Debug, Clone)]
-pub struct IndexResult {
-    /// Successfully indexed items.
-    pub items: Vec<IndexItem>,
-
-    /// Items that failed to index (partial success).
-    pub failed: Vec<FailedItem>,
-}
-
-impl IndexResult {
-    /// Create a new index result.
-    pub fn new(items: Vec<IndexItem>) -> Self {
-        Self {
-            items,
-            failed: Vec::new(),
-        }
-    }
-
-    /// Create with both successes and failures.
-    pub fn with_partial(items: Vec<IndexItem>, failed: Vec<FailedItem>) -> Self {
-        Self { items, failed }
-    }
-
-    /// Get the single document ID (convenience for single-document indexing).
-    pub fn doc_id(&self) -> Option<&str> {
-        if self.items.len() == 1 {
-            Some(&self.items[0].doc_id)
-        } else {
-            None
-        }
-    }
-
-    /// Check if the result is empty.
-    pub fn is_empty(&self) -> bool {
-        self.items.is_empty()
-    }
-
-    /// Get the number of indexed items.
-    pub fn len(&self) -> usize {
-        self.items.len()
-    }
-
-    /// Whether any items failed.
-    pub fn has_failures(&self) -> bool {
-        !self.failed.is_empty()
-    }
-
-    /// Total number of sources (success + failed).
-    pub fn total(&self) -> usize {
-        self.items.len() + self.failed.len()
-    }
-}
-
-/// A single indexed document item.
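-///
-/// # Example
-///
-/// A sketch of reading items back out of an [`IndexResult`] (the path is
-/// illustrative):
-///
-/// ```rust,ignore
-/// let result = engine.index(IndexContext::from_path("./doc.md")).await?;
-/// for item in &result.items {
-///     println!("{} -> {}", item.name, item.doc_id);
-/// }
-/// ```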
-#[derive(Debug, Clone)] -pub struct IndexItem { - /// The unique document ID. - pub doc_id: String, - /// The document name. - pub name: String, - /// The document format. - pub format: DocumentFormat, - /// Document description (from root summary). - pub description: Option, - /// Source file path (if indexed from a file). - pub source_path: Option, - /// Page count (for PDFs). - pub page_count: Option, - /// Indexing pipeline metrics (timing, LLM usage, node stats). - pub metrics: Option, -} - -impl IndexItem { - /// Create a new index item. - pub fn new( - doc_id: impl Into, - name: impl Into, - format: DocumentFormat, - description: Option, - page_count: Option, - ) -> Self { - Self { - doc_id: doc_id.into(), - name: name.into(), - format, - description, - source_path: None, - page_count, - metrics: None, - } - } - - /// Set the source file path. - pub fn with_source_path(mut self, path: impl Into) -> Self { - self.source_path = Some(path.into()); - self - } - - /// Set the indexing metrics. - pub fn with_metrics(mut self, metrics: IndexMetrics) -> Self { - self.metrics = Some(metrics); - self - } - - /// Set the indexing metrics (optional). - pub fn with_metrics_opt(mut self, metrics: Option) -> Self { - self.metrics = metrics; - self - } -} - -// ============================================================ -// Query Types -// ============================================================ - -/// A single piece of evidence with source attribution. -#[derive(Debug, Clone)] -pub struct EvidenceItem { - /// Section title where this evidence was found. - pub title: String, - /// Navigation path (e.g., "Root/Chapter 1/Section 1.2"). - pub path: String, - /// Raw evidence content. - pub content: String, - /// Source document name (set in multi-doc scenarios). - pub doc_name: Option, -} - -/// Query execution metrics. -#[derive(Debug, Clone, Default)] -pub struct QueryMetrics { - /// Number of LLM calls made. - pub llm_calls: u32, - /// Number of navigation rounds used. - pub rounds_used: u32, - /// Number of distinct nodes visited. - pub nodes_visited: usize, - /// Number of evidence items collected. - pub evidence_count: usize, - /// Total characters of collected evidence. - pub evidence_chars: usize, -} - -/// Confidence score of the query result (0.0–1.0). -/// -/// Derived from LLM evaluate() — whether evidence was deemed sufficient -/// and how many replan rounds were needed. -pub type Confidence = f32; - -/// A single document's query result. -#[derive(Debug, Clone)] -pub struct QueryResultItem { - /// The document ID. - pub doc_id: String, - - /// Matching node IDs (navigation paths). - pub node_ids: Vec, - - /// Synthesized answer or raw evidence content. - pub content: String, - - /// Evidence items that contributed to this result, with source attribution. - pub evidence: Vec, - - /// Execution metrics for this query. - pub metrics: Option, - - /// Confidence score (0.0–1.0) — derived from LLM evaluation. - pub confidence: Confidence, -} - -/// Result of a document query. -/// -/// Contains results from one or more documents. For single-document queries, -/// `items` has one entry. For multi-document or workspace queries, it has -/// one entry per document that matched. -#[derive(Debug, Clone)] -pub struct QueryResult { - /// Query results per document. - pub items: Vec, - - /// Documents that failed during multi-doc query. - pub failed: Vec, -} - -impl QueryResult { - /// Create a new query result (empty). 
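-    ///
-    /// # Example
-    ///
-    /// ```rust,ignore
-    /// let result = QueryResult::new();
-    /// assert!(result.is_empty());
-    /// ```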
- pub fn new() -> Self { - Self { - items: Vec::new(), - failed: Vec::new(), - } - } - - /// Create a query result with items. - pub fn new_with_items(items: Vec) -> Self { - Self { - items, - failed: Vec::new(), - } - } - - /// Create a query result with a single item. - pub fn from_single(item: QueryResultItem) -> Self { - Self { - items: vec![item], - failed: Vec::new(), - } - } - - /// Create with both successes and failures. - pub fn with_partial(items: Vec, failed: Vec) -> Self { - Self { items, failed } - } - - /// Check if the result is empty. - pub fn is_empty(&self) -> bool { - self.items.is_empty() - } - - /// Get the number of result items. - pub fn len(&self) -> usize { - self.items.len() - } - - /// Get the first (single-doc) result item, if any. - pub fn single(&self) -> Option<&QueryResultItem> { - self.items.first() - } - - /// Whether any documents failed. - pub fn has_failures(&self) -> bool { - !self.failed.is_empty() - } -} - -impl Default for QueryResult { - fn default() -> Self { - Self::new() - } -} - -// ============================================================ -// Document Info Types -// ============================================================ - -/// Document info for listing. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct DocumentInfo { - /// Document ID. - pub id: String, - - /// Document name. - pub name: String, - - /// Document format. - pub format: String, - - /// Document description. - pub description: Option, - - /// Source file path. - pub source_path: Option, - - /// Page count (for PDFs). - pub page_count: Option, - - /// Line count (for text files). - pub line_count: Option, -} - -impl DocumentInfo { - /// Create a new document info. - pub fn new(id: impl Into, name: impl Into) -> Self { - Self { - id: id.into(), - name: name.into(), - format: String::new(), - description: None, - source_path: None, - page_count: None, - line_count: None, - } - } - - /// Set the format. 
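-    ///
-    /// # Example
-    ///
-    /// Mirrors the unit test below; the format string is illustrative.
-    ///
-    /// ```rust,ignore
-    /// let info = DocumentInfo::new("doc-1", "Test").with_format("markdown");
-    /// assert_eq!(info.format, "markdown");
-    /// ```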
- pub fn with_format(mut self, format: impl Into) -> Self { - self.format = format.into(); - self - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_index_options() { - let options = IndexOptions::new() - .with_summaries() - .with_mode(IndexMode::Force); - - assert!(options.generate_summaries); - assert_eq!(options.mode, IndexMode::Force); - } - - #[test] - fn test_index_options_timeout() { - let opts = IndexOptions::new().with_timeout_secs(30); - assert_eq!(opts.timeout_secs, Some(30)); - - let default = IndexOptions::default(); - assert_eq!(default.timeout_secs, None); - } - - #[test] - fn test_query_result() { - let result = QueryResult::new(); - assert!(result.is_empty()); - assert_eq!(result.len(), 0); - } - - #[test] - fn test_query_result_single() { - let item = QueryResultItem { - doc_id: "doc-1".into(), - node_ids: vec!["n1".into()], - content: "content".into(), - evidence: vec![], - metrics: None, - confidence: 0.9, - }; - let result = QueryResult::from_single(item); - assert!(!result.is_empty()); - assert_eq!(result.len(), 1); - assert!(result.single().is_some()); - assert_eq!(result.single().unwrap().doc_id, "doc-1"); - } - - #[test] - fn test_document_info() { - let info = DocumentInfo::new("doc-1", "Test").with_format("markdown"); - - assert_eq!(info.id, "doc-1"); - assert_eq!(info.format, "markdown"); - } - - #[test] - fn test_index_result() { - let item = IndexItem::new("doc-1", "Test", DocumentFormat::Markdown, None, None); - let result = IndexResult::new(vec![item]); - - assert_eq!(result.doc_id(), Some("doc-1")); - assert_eq!(result.len(), 1); - assert!(!result.is_empty()); - } - - #[test] - fn test_index_result_empty() { - let result = IndexResult::new(vec![]); - assert!(result.is_empty()); - assert_eq!(result.doc_id(), None); - } - - #[test] - fn test_index_result_multiple() { - let items = vec![ - IndexItem::new("doc-1", "A", DocumentFormat::Markdown, None, None), - IndexItem::new("doc-2", "B", DocumentFormat::Pdf, None, None), - ]; - let result = IndexResult::new(items); - assert_eq!(result.len(), 2); - assert_eq!(result.doc_id(), None); - } - - #[test] - fn test_partial_success() { - let items = vec![IndexItem::new( - "doc-1", - "A", - DocumentFormat::Markdown, - None, - None, - )]; - let failed = vec![FailedItem::new("missing.pdf", "File not found")]; - let result = IndexResult::with_partial(items, failed); - - assert_eq!(result.len(), 1); - assert!(result.has_failures()); - assert_eq!(result.total(), 2); - assert_eq!(result.failed[0].source, "missing.pdf"); - } -} diff --git a/vectorless-core/vectorless/src/client/workspace.rs b/vectorless-core/vectorless/src/client/workspace.rs deleted file mode 100644 index db296493..00000000 --- a/vectorless-core/vectorless/src/client/workspace.rs +++ /dev/null @@ -1,243 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Workspace management client. -//! -//! This module provides async CRUD operations for document persistence -//! through the workspace abstraction. -//! -//! # Example -//! -//! ```rust,ignore -//! let workspace = WorkspaceClient::new(workspace_storage).await; -//! -//! // Save a document -//! workspace.save(&doc).await?; -//! -//! // Load a document -//! let doc = workspace.load("doc-id").await?; -//! -//! // List all documents -//! for doc in workspace.list().await? { -//! println!("{}: {}", doc.id, doc.name); -//! } -//! 
``` - -use std::sync::Arc; - -use tracing::{debug, info}; - -use crate::error::Result; -use crate::storage::{PersistedDocument, Workspace}; - -use super::types::DocumentInfo; -use crate::events::{EventEmitter, WorkspaceEvent}; - -/// Workspace management client. -/// -/// Provides async thread-safe CRUD operations for document persistence. -/// All operations are async and can be safely called from multiple tasks. -/// -/// # Thread Safety -/// -/// The client is fully thread-safe and can be cloned cheaply -/// (it uses `Arc` internally). -#[derive(Clone)] -pub(crate) struct WorkspaceClient { - /// Workspace storage. - workspace: Arc, - - /// Event emitter. - events: EventEmitter, -} - -impl WorkspaceClient { - /// Create a new workspace client. - pub async fn new(workspace: Workspace) -> Self { - Self { - workspace: Arc::new(workspace), - events: EventEmitter::new(), - } - } - - /// Create with event emitter. - pub fn with_events(mut self, events: EventEmitter) -> Self { - self.events = events; - self - } - - /// Save a document to the workspace. - /// - /// If a document with the same ID already exists, logs a warning - /// (this can happen during concurrent indexing of the same source). - /// - /// # Errors - /// - /// Returns an error if the workspace write fails. - pub async fn save(&self, doc: &PersistedDocument) -> Result<()> { - let doc_id = doc.meta.id.clone(); - - if self.workspace.contains(&doc_id).await { - tracing::warn!( - doc_id, - name = %doc.meta.name, - "Overwriting existing document — possible concurrent index of the same source" - ); - } - - self.workspace.add(doc).await?; - - info!("Saved document: {}", doc_id); - self.events.emit_workspace(WorkspaceEvent::Saved { doc_id }); - - Ok(()) - } - - /// Load a document from the workspace. - /// - /// Returns `Ok(None)` if the document doesn't exist. - /// - /// # Errors - /// - /// Returns an error if the workspace read fails. - pub async fn load(&self, doc_id: &str) -> Result> { - let doc = self.workspace.load_and_cache(doc_id).await?; - - if let Some(ref _d) = doc { - debug!("Loaded document: {}", doc_id); - } - - self.events.emit_workspace(WorkspaceEvent::Loaded { - doc_id: doc_id.to_string(), - cache_hit: doc.is_some(), - }); - - Ok(doc) - } - - /// Remove a document from the workspace. - /// - /// Returns `Ok(true)` if the document was removed, `Ok(false)` if it didn't exist. - /// - /// # Errors - /// - /// Returns an error if the workspace write fails. - pub async fn remove(&self, doc_id: &str) -> Result { - let removed = self.workspace.remove(doc_id).await?; - - if removed { - info!("Removed document: {}", doc_id); - self.events.emit_workspace(WorkspaceEvent::Removed { - doc_id: doc_id.to_string(), - }); - } - - Ok(removed) - } - - /// Check if a document exists in the workspace. - /// - /// # Errors - /// - /// Returns an error if the workspace read fails. - pub async fn exists(&self, doc_id: &str) -> Result { - Ok(self.workspace.contains(doc_id).await) - } - - /// List all documents in the workspace. - /// - /// # Errors - /// - /// Returns an error if the workspace read fails. 
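-    ///
-    /// # Example
-    ///
-    /// A sketch, assuming `client` is a `WorkspaceClient` (mirrors the
-    /// module-level example above):
-    ///
-    /// ```rust,ignore
-    /// for doc in client.list().await? {
-    ///     println!("{}: {}", doc.id, doc.name);
-    /// }
-    /// ```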
- pub async fn list(&self) -> Result> { - let doc_ids = self.workspace.list_documents().await; - let mut result = Vec::with_capacity(doc_ids.len()); - - for id in &doc_ids { - if let Some(meta) = self.workspace.get_meta(id).await { - result.push(DocumentInfo { - id: meta.id, - name: meta.doc_name, - format: meta.doc_type, - description: meta.doc_description, - source_path: meta.path, - page_count: meta.page_count, - line_count: meta.line_count, - }); - } - } - - Ok(result) - } - - /// Get document info by ID. - /// - /// # Errors - /// - /// Returns an error if the workspace read fails. - pub async fn get_document_info(&self, doc_id: &str) -> Result> { - Ok(self - .workspace - .get_meta(doc_id) - .await - .map(|meta| DocumentInfo { - id: meta.id, - name: meta.doc_name, - format: meta.doc_type, - description: meta.doc_description, - source_path: meta.path, - page_count: meta.page_count, - line_count: meta.line_count, - })) - } - - /// Clear all documents from the workspace. - /// - /// Returns the number of documents removed. - /// - /// # Errors - /// - /// Returns an error if the workspace write fails. - pub async fn clear(&self) -> Result { - let doc_ids = self.workspace.list_documents().await; - let mut removed = 0usize; - - for doc_id in &doc_ids { - match self.workspace.remove(doc_id).await { - Ok(true) => removed += 1, - Ok(false) => {} - Err(e) => tracing::warn!("Failed to remove document {}: {}", doc_id, e), - } - } - - if removed > 0 { - info!("Cleared workspace: {removed} documents removed"); - self.events - .emit_workspace(WorkspaceEvent::Cleared { count: removed }); - } - - Ok(removed) - } - - /// Get the underlying workspace Arc (for advanced use). - pub(crate) fn inner(&self) -> Arc { - Arc::clone(&self.workspace) - } - - /// Find a document ID by its source file path. - /// - /// Used for incremental indexing to check if a file has already been indexed. - pub async fn find_by_source_path(&self, path: &std::path::Path) -> Option { - self.workspace.find_by_source_path(path).await - } - - /// Get the document graph, loading from backend if not cached. - pub async fn get_graph(&self) -> Result> { - self.workspace.get_graph().await - } - - /// Persist the document graph to the backend. - pub async fn set_graph(&self, graph: &crate::graph::DocumentGraph) -> Result<()> { - self.workspace.set_graph(graph).await - } -} diff --git a/vectorless-core/vectorless/src/config/mod.rs b/vectorless-core/vectorless/src/config/mod.rs deleted file mode 100644 index 3ece0fe6..00000000 --- a/vectorless-core/vectorless/src/config/mod.rs +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Internal configuration management. -//! -//! Users configure vectorless via [`EngineBuilder`](crate::client::EngineBuilder) methods, -//! not by directly interacting with this module. - -mod types; -mod validator; - -pub use types::Config; -pub(crate) use types::{ - CompressionAlgorithm, FallbackBehavior, FallbackConfig, IndexerConfig, LlmConfig, - LlmMetricsConfig, MetricsConfig, OnAllFailedBehavior, RetrievalMetricsConfig, SlotConfig, -}; diff --git a/vectorless-core/vectorless/src/config/types/indexer.rs b/vectorless-core/vectorless/src/config/types/indexer.rs deleted file mode 100644 index 6353122a..00000000 --- a/vectorless-core/vectorless/src/config/types/indexer.rs +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Indexer configuration types. 
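-//!
-//! # Example
-//!
-//! A sketch using the builder methods defined below:
-//!
-//! ```rust,ignore
-//! let config = IndexerConfig::new()
-//!     .with_subsection_threshold(500)
-//!     .with_max_summary_tokens(300);
-//! ```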
- -use serde::{Deserialize, Serialize}; - -/// Indexer configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct IndexerConfig { - /// Word count threshold for splitting sections into subsections. - #[serde(default = "default_subsection_threshold")] - pub subsection_threshold: usize, - - /// Maximum tokens to send in a single segmentation request. - #[serde(default = "default_max_segment_tokens")] - pub max_segment_tokens: usize, - - /// Maximum tokens for each summary. - #[serde(default = "default_max_summary_tokens")] - pub max_summary_tokens: usize, - - /// Minimum content tokens required to generate a summary. - #[serde(default = "default_min_summary_tokens")] - pub min_summary_tokens: usize, -} - -fn default_subsection_threshold() -> usize { - 300 -} - -fn default_max_segment_tokens() -> usize { - 3000 -} - -fn default_max_summary_tokens() -> usize { - 200 -} - -fn default_min_summary_tokens() -> usize { - 20 -} - -impl Default for IndexerConfig { - fn default() -> Self { - Self { - subsection_threshold: default_subsection_threshold(), - max_segment_tokens: default_max_segment_tokens(), - max_summary_tokens: default_max_summary_tokens(), - min_summary_tokens: default_min_summary_tokens(), - } - } -} - -impl IndexerConfig { - /// Create a new indexer config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the subsection threshold. - pub fn with_subsection_threshold(mut self, threshold: usize) -> Self { - self.subsection_threshold = threshold; - self - } - - /// Set the maximum segment tokens. - pub fn with_max_segment_tokens(mut self, tokens: usize) -> Self { - self.max_segment_tokens = tokens; - self - } - - /// Set the maximum summary tokens. - pub fn with_max_summary_tokens(mut self, tokens: usize) -> Self { - self.max_summary_tokens = tokens; - self - } - - /// Set the minimum summary tokens. - pub fn with_min_summary_tokens(mut self, tokens: usize) -> Self { - self.min_summary_tokens = tokens; - self - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_indexer_config_defaults() { - let config = IndexerConfig::default(); - assert_eq!(config.subsection_threshold, 300); - assert_eq!(config.max_segment_tokens, 3000); - assert_eq!(config.max_summary_tokens, 200); - assert_eq!(config.min_summary_tokens, 20); - } - - #[test] - fn test_indexer_config_builder() { - let config = IndexerConfig::new() - .with_subsection_threshold(500) - .with_max_summary_tokens(300); - - assert_eq!(config.subsection_threshold, 500); - assert_eq!(config.max_summary_tokens, 300); - } -} diff --git a/vectorless-core/vectorless/src/config/types/llm_pool.rs b/vectorless-core/vectorless/src/config/types/llm_pool.rs deleted file mode 100644 index b38497aa..00000000 --- a/vectorless-core/vectorless/src/config/types/llm_pool.rs +++ /dev/null @@ -1,633 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Unified LLM configuration. -//! -//! This module consolidates all LLM-related configuration into a single -//! cohesive structure. Users configure via [`EngineBuilder`](crate::client::EngineBuilder) -//! for simple cases, or construct [`LlmConfig`] programmatically for advanced use. - -use serde::{Deserialize, Serialize}; - -/// Unified LLM configuration — the single entry point for all LLM settings. 
-/// -/// Contains: -/// - Global credentials (`api_key`, `model`, `endpoint`) -/// - Per-purpose slot overrides (`index`, `retrieval`) -/// - Infrastructure settings (`retry`, `throttle`, `fallback`) -/// -/// # Simple usage (via EngineBuilder) -/// -/// ```rust,no_run -/// use vectorless::client::EngineBuilder; -/// -/// # async fn example() -> Result<(), vectorless::BuildError> { -/// let engine = EngineBuilder::new() -/// .with_key("sk-...") -/// .with_model("gpt-4o") -/// .with_endpoint("https://api.openai.com/v1") -/// .build() -/// .await?; -/// # Ok(()) -/// # } -/// ``` -/// -/// # Advanced usage (programmatic config) -/// -/// ```rust,ignore -/// use vectorless::config::{Config, LlmConfig, SlotConfig}; -/// -/// let config = Config::new().with_llm( -/// LlmConfig::new("gpt-4o") -/// .with_api_key("sk-...") -/// .with_endpoint("https://api.openai.com/v1") -/// .with_index(SlotConfig::fast().with_model("gpt-4o-mini")) -/// .with_retrieval(SlotConfig::default().with_max_tokens(200)) -/// ); -/// ``` -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LlmConfig { - /// API key — **required**. - #[serde(default)] - pub api_key: Option, - - /// Default model name — **required**. - /// - /// Individual slots can override this via [`SlotConfig::model`]. - #[serde(default)] - pub model: String, - - /// API endpoint URL — **required**. - #[serde(default)] - pub endpoint: Option, - - /// Index slot (document indexing / summarization). - /// Uses a fast, cost-effective model by default. - #[serde(default)] - pub index: SlotConfig, - - /// Retrieval slot (document navigation). - /// Uses the default model. - #[serde(default = "default_retrieval_slot")] - pub retrieval: SlotConfig, - - /// Retry configuration for LLM calls. - #[serde(default)] - pub retry: RetryConfig, - - /// Throttle / rate-limiting configuration. - #[serde(default)] - pub throttle: ThrottleConfig, - - /// Fallback configuration for error recovery. - #[serde(default)] - pub fallback: FallbackConfig, -} - -fn default_retrieval_slot() -> SlotConfig { - SlotConfig { - max_tokens: 100, - ..SlotConfig::default() - } -} - -impl Default for LlmConfig { - fn default() -> Self { - Self { - api_key: None, - model: String::new(), - endpoint: None, - index: SlotConfig::default(), - retrieval: default_retrieval_slot(), - retry: RetryConfig::default(), - throttle: ThrottleConfig::default(), - fallback: FallbackConfig::default(), - } - } -} - -impl LlmConfig { - /// Create a new config with a specific model. - pub fn new(model: impl Into) -> Self { - Self { - model: model.into(), - ..Self::default() - } - } - - /// Set the API key. - pub fn with_api_key(mut self, key: impl Into) -> Self { - self.api_key = Some(key.into()); - self - } - - /// Set the default model. - pub fn with_model(mut self, model: impl Into) -> Self { - self.model = model.into(); - self - } - - /// Set the endpoint URL. - pub fn with_endpoint(mut self, url: impl Into) -> Self { - self.endpoint = Some(url.into()); - self - } - - /// Set the index slot configuration. - pub fn with_index(mut self, slot: SlotConfig) -> Self { - self.index = slot; - self - } - - /// Set the retrieval slot configuration. - pub fn with_retrieval(mut self, slot: SlotConfig) -> Self { - self.retrieval = slot; - self - } - - /// Set the retry configuration. - pub fn with_retry(mut self, retry: RetryConfig) -> Self { - self.retry = retry; - self - } - - /// Set the throttle configuration. 
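-    ///
-    /// # Example
-    ///
-    /// A sketch; the concurrency and rate limits shown are illustrative:
-    ///
-    /// ```rust,ignore
-    /// let config = LlmConfig::new("gpt-4o")
-    ///     .with_throttle(ThrottleConfig::new().with_max_concurrent(4).with_rpm(120));
-    /// ```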
- pub fn with_throttle(mut self, throttle: ThrottleConfig) -> Self { - self.throttle = throttle; - self - } - - /// Set the fallback configuration. - pub fn with_fallback(mut self, fallback: FallbackConfig) -> Self { - self.fallback = fallback; - self - } - - /// Convenience: set max concurrent requests (delegates to throttle). - pub fn with_max_concurrent(mut self, max: usize) -> Self { - self.throttle.max_concurrent_requests = max; - self - } - - /// Resolve the effective model for a given slot. - /// - /// Returns the slot-specific model if set, otherwise the default model. - pub fn resolve_model(&self, slot: &SlotConfig) -> String { - slot.model.clone().unwrap_or_else(|| self.model.clone()) - } -} - -/// Per-purpose LLM slot override. -/// -/// Controls model selection and generation parameters for a specific -/// LLM usage (index or retrieval). -/// -/// - `model`: Override the default model (optional). -/// - `max_tokens`: Maximum response tokens. -/// - `temperature`: Generation temperature. -/// -/// `api_key` and `endpoint` are **not** here — they are always inherited -/// from the parent [`LlmConfig`]. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SlotConfig { - /// Override the default model for this purpose. - /// When `None`, uses [`LlmConfig::model`]. - #[serde(default)] - pub model: Option, - - /// Maximum tokens for responses. - #[serde(default = "default_max_tokens")] - pub max_tokens: usize, - - /// Temperature for generation. - #[serde(default = "default_temperature")] - pub temperature: f32, -} - -fn default_max_tokens() -> usize { - 200 -} - -fn default_temperature() -> f32 { - 0.0 -} - -impl Default for SlotConfig { - fn default() -> Self { - Self { - model: None, - max_tokens: default_max_tokens(), - temperature: default_temperature(), - } - } -} - -impl SlotConfig { - /// Create a new slot config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Create a "fast" preset (low tokens). - pub fn fast() -> Self { - Self { - max_tokens: 100, - ..Self::default() - } - } - - /// Set the model override. - pub fn with_model(mut self, model: impl Into) -> Self { - self.model = Some(model.into()); - self - } - - /// Set the max tokens. - pub fn with_max_tokens(mut self, max_tokens: usize) -> Self { - self.max_tokens = max_tokens; - self - } - - /// Set the temperature. - pub fn with_temperature(mut self, temperature: f32) -> Self { - self.temperature = temperature; - self - } -} - -// ============================================================ -// Supporting configuration types -// ============================================================ - -/// Retry configuration for LLM calls. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RetryConfig { - /// Maximum number of retry attempts. - #[serde(default = "default_max_attempts")] - pub max_attempts: usize, - - /// Initial delay before first retry (milliseconds). - #[serde(default = "default_initial_delay_ms")] - pub initial_delay_ms: u64, - - /// Maximum delay between retries (milliseconds). - #[serde(default = "default_max_delay_ms")] - pub max_delay_ms: u64, - - /// Multiplier for exponential backoff. - #[serde(default = "default_multiplier")] - pub multiplier: f64, - - /// Whether to retry on rate limit errors. 
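-    ///
-    /// When enabled, rate-limited calls re-enter the same backoff
-    /// schedule computed by [`RetryConfig::delay_for_attempt`] below
-    /// (a sketch with the default 500 ms delay and 2.0 multiplier):
-    ///
-    /// ```rust,ignore
-    /// let retry = RetryConfig::default();
-    /// assert_eq!(retry.delay_for_attempt(0).as_millis(), 500);
-    /// assert_eq!(retry.delay_for_attempt(1).as_millis(), 1000);
-    /// ```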
- #[serde(default = "default_true")] - pub retry_on_rate_limit: bool, -} - -fn default_max_attempts() -> usize { - 3 -} - -fn default_initial_delay_ms() -> u64 { - 500 -} - -fn default_max_delay_ms() -> u64 { - 30000 -} - -fn default_multiplier() -> f64 { - 2.0 -} - -fn default_true() -> bool { - true -} - -impl Default for RetryConfig { - fn default() -> Self { - Self { - max_attempts: default_max_attempts(), - initial_delay_ms: default_initial_delay_ms(), - max_delay_ms: default_max_delay_ms(), - multiplier: default_multiplier(), - retry_on_rate_limit: default_true(), - } - } -} - -impl RetryConfig { - /// Create a new retry config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the max attempts. - pub fn with_max_attempts(mut self, max_attempts: usize) -> Self { - self.max_attempts = max_attempts; - self - } - - /// Calculate delay for a given attempt (0-indexed). - pub fn delay_for_attempt(&self, attempt: usize) -> std::time::Duration { - let delay_ms = (self.initial_delay_ms as f64) * self.multiplier.powi(attempt as i32); - let delay_ms = delay_ms.min(self.max_delay_ms as f64); - std::time::Duration::from_millis(delay_ms as u64) - } - - /// Convert to the runtime retry config (used by llm module). - pub fn to_runtime_config(&self) -> crate::llm::config::RetryConfig { - crate::llm::config::RetryConfig { - max_attempts: self.max_attempts, - initial_delay_ms: self.initial_delay_ms, - max_delay_ms: self.max_delay_ms, - multiplier: self.multiplier, - retry_on_rate_limit: self.retry_on_rate_limit, - } - } -} - -/// Throttle / rate-limiting configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct ThrottleConfig { - /// Maximum concurrent LLM API calls. - #[serde(default = "default_max_concurrent")] - pub max_concurrent_requests: usize, - - /// Rate limit: requests per minute. - #[serde(default = "default_rpm")] - pub requests_per_minute: usize, - - /// Enable rate limiting. - #[serde(default = "default_true")] - pub enabled: bool, - - /// Enable semaphore-based concurrency limiting. - #[serde(default = "default_true")] - pub semaphore_enabled: bool, -} - -fn default_max_concurrent() -> usize { - 10 -} - -fn default_rpm() -> usize { - 500 -} - -impl Default for ThrottleConfig { - fn default() -> Self { - Self { - max_concurrent_requests: default_max_concurrent(), - requests_per_minute: default_rpm(), - enabled: default_true(), - semaphore_enabled: default_true(), - } - } -} - -impl ThrottleConfig { - /// Create a new throttle config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the max concurrent requests. - pub fn with_max_concurrent(mut self, max: usize) -> Self { - self.max_concurrent_requests = max; - self - } - - /// Set the requests per minute. - pub fn with_rpm(mut self, rpm: usize) -> Self { - self.requests_per_minute = rpm; - self - } - - /// Convert to the runtime concurrency config. - pub fn to_runtime_config(&self) -> crate::llm::throttle::ConcurrencyConfig { - crate::llm::throttle::ConcurrencyConfig { - max_concurrent_requests: self.max_concurrent_requests, - requests_per_minute: self.requests_per_minute, - enabled: self.enabled, - semaphore_enabled: self.semaphore_enabled, - } - } -} - -/// Fallback behavior on errors. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] -#[serde(rename_all = "snake_case")] -pub enum FallbackBehavior { - /// Retry the same model. - Retry, - /// Immediately fall back to next model. - Fallback, - /// Retry first, then fall back. 
- #[default] - RetryThenFallback, - /// Fail immediately. - Fail, -} - -/// Behavior when all fallback attempts fail. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] -#[serde(rename_all = "snake_case")] -pub enum OnAllFailedBehavior { - /// Return an error. - #[default] - ReturnError, - /// Return cached result if available. - ReturnCache, -} - -/// Fallback configuration for error recovery. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FallbackConfig { - /// Enable fallback mechanism. - #[serde(default = "default_true")] - pub enabled: bool, - - /// Fallback models in priority order. - #[serde(default = "default_fallback_models")] - pub models: Vec, - - /// Fallback endpoints (optional). - #[serde(default)] - pub endpoints: Vec, - - /// Behavior on rate limit error. - #[serde(default)] - pub on_rate_limit: FallbackBehavior, - - /// Behavior on timeout error. - #[serde(default)] - pub on_timeout: FallbackBehavior, - - /// Behavior when all attempts fail. - #[serde(default)] - pub on_all_failed: OnAllFailedBehavior, - - /// Maximum retry attempts. - #[serde(default = "default_max_retries")] - pub max_retries: usize, - - /// Initial retry delay in milliseconds. - #[serde(default = "default_initial_retry_delay_ms")] - pub initial_retry_delay_ms: u64, - - /// Maximum retry delay in milliseconds. - #[serde(default = "default_max_retry_delay_ms")] - pub max_retry_delay_ms: u64, - - /// Retry delay multiplier (exponential backoff). - #[serde(default = "default_retry_multiplier")] - pub retry_multiplier: f32, -} - -fn default_fallback_models() -> Vec { - vec!["gpt-4o-mini".to_string(), "glm-4-flash".to_string()] -} - -fn default_max_retries() -> usize { - 3 -} - -fn default_initial_retry_delay_ms() -> u64 { - 1000 -} - -fn default_max_retry_delay_ms() -> u64 { - 30000 -} - -fn default_retry_multiplier() -> f32 { - 2.0 -} - -impl Default for FallbackConfig { - fn default() -> Self { - Self { - enabled: default_true(), - models: default_fallback_models(), - endpoints: Vec::new(), - on_rate_limit: FallbackBehavior::default(), - on_timeout: FallbackBehavior::default(), - on_all_failed: OnAllFailedBehavior::default(), - max_retries: default_max_retries(), - initial_retry_delay_ms: default_initial_retry_delay_ms(), - max_retry_delay_ms: default_max_retry_delay_ms(), - retry_multiplier: default_retry_multiplier(), - } - } -} - -impl FallbackConfig { - /// Create a new fallback config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Disable fallback entirely. - pub fn disabled() -> Self { - Self { - enabled: false, - ..Self::default() - } - } - - /// Set fallback models. - pub fn with_models(mut self, models: Vec) -> Self { - self.models = models; - self - } - - /// Set behavior on rate limit. - pub fn with_on_rate_limit(mut self, behavior: FallbackBehavior) -> Self { - self.on_rate_limit = behavior; - self - } - - /// Calculate retry delay with exponential backoff. 
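-    ///
-    /// # Example
-    ///
-    /// With defaults (1 s initial delay, 2.0 multiplier, 30 s cap):
-    ///
-    /// ```rust,ignore
-    /// let fb = FallbackConfig::new();
-    /// assert_eq!(fb.calculate_retry_delay(0).as_millis(), 1000);
-    /// assert_eq!(fb.calculate_retry_delay(1).as_millis(), 2000);
-    /// ```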
- pub fn calculate_retry_delay(&self, attempt: usize) -> std::time::Duration { - let delay_ms = if attempt == 0 { - self.initial_retry_delay_ms - } else { - let delay = - self.initial_retry_delay_ms as f32 * self.retry_multiplier.powi(attempt as i32); - delay.min(self.max_retry_delay_ms as f32) as u64 - }; - std::time::Duration::from_millis(delay_ms) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_llm_config_defaults() { - let config = LlmConfig::default(); - assert!(config.api_key.is_none()); - assert!(config.model.is_empty()); - assert!(config.endpoint.is_none()); - assert!(config.index.model.is_none()); - assert!(config.retrieval.model.is_none()); - assert_eq!(config.index.max_tokens, 200); - assert_eq!(config.retrieval.max_tokens, 100); - } - - #[test] - fn test_llm_config_builder() { - let config = LlmConfig::new("gpt-4o") - .with_api_key("sk-test") - .with_endpoint("https://api.openai.com/v1") - .with_index(SlotConfig::fast().with_model("gpt-4o-mini")); - - assert_eq!(config.model, "gpt-4o"); - assert_eq!(config.api_key, Some("sk-test".to_string())); - assert_eq!(config.index.model, Some("gpt-4o-mini".to_string())); - assert_eq!(config.index.max_tokens, 100); - } - - #[test] - fn test_resolve_model() { - let config = - LlmConfig::new("gpt-4o").with_retrieval(SlotConfig::new().with_model("gpt-4o-mini")); - - assert_eq!(config.resolve_model(&config.index), "gpt-4o"); - assert_eq!(config.resolve_model(&config.retrieval), "gpt-4o-mini"); - } - - #[test] - fn test_slot_config_fast() { - let slot = SlotConfig::fast(); - assert_eq!(slot.max_tokens, 100); - } - - #[test] - fn test_retry_delay_calculation() { - let config = RetryConfig::default(); - assert_eq!( - config.delay_for_attempt(0), - std::time::Duration::from_millis(500) - ); - assert_eq!( - config.delay_for_attempt(1), - std::time::Duration::from_millis(1000) - ); - } - - #[test] - fn test_throttle_config_defaults() { - let config = ThrottleConfig::default(); - assert_eq!(config.max_concurrent_requests, 10); - assert_eq!(config.requests_per_minute, 500); - } - - #[test] - fn test_fallback_config_defaults() { - let config = FallbackConfig::default(); - assert!(config.enabled); - assert!(!config.models.is_empty()); - } -} diff --git a/vectorless-core/vectorless/src/config/types/metrics.rs b/vectorless-core/vectorless/src/config/types/metrics.rs deleted file mode 100644 index c1f4e766..00000000 --- a/vectorless-core/vectorless/src/config/types/metrics.rs +++ /dev/null @@ -1,181 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Metrics configuration for unified observability. - -use serde::{Deserialize, Serialize}; - -/// Unified metrics configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct MetricsConfig { - /// Enable metrics collection. - #[serde(default = "default_true")] - pub enabled: bool, - - /// Storage path for persisted metrics. - #[serde(default = "default_storage_path")] - pub storage_path: String, - - /// Retention period in days. - #[serde(default = "default_retention_days")] - pub retention_days: usize, - - /// LLM metrics configuration. - #[serde(default)] - pub llm: LlmMetricsConfig, - - /// Retrieval metrics configuration. 
- #[serde(default)] - pub retrieval: RetrievalMetricsConfig, -} - -fn default_storage_path() -> String { - "./workspace/metrics".to_string() -} - -fn default_retention_days() -> usize { - 30 -} - -fn default_true() -> bool { - true -} - -impl Default for MetricsConfig { - fn default() -> Self { - Self { - enabled: default_true(), - storage_path: default_storage_path(), - retention_days: default_retention_days(), - llm: LlmMetricsConfig::default(), - retrieval: RetrievalMetricsConfig::default(), - } - } -} - -impl MetricsConfig { - /// Create a new metrics config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Disable metrics collection. - pub fn disabled() -> Self { - Self { - enabled: false, - ..Self::default() - } - } -} - -/// LLM-specific metrics configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LlmMetricsConfig { - /// Track token usage. - #[serde(default = "default_true")] - pub track_tokens: bool, - - /// Track latency. - #[serde(default = "default_true")] - pub track_latency: bool, - - /// Track estimated cost. - #[serde(default = "default_true")] - pub track_cost: bool, - - /// Cost per 1K input tokens (in USD). - #[serde(default = "default_cost_per_1k_input")] - pub cost_per_1k_input_tokens: f64, - - /// Cost per 1K output tokens (in USD). - #[serde(default = "default_cost_per_1k_output")] - pub cost_per_1k_output_tokens: f64, -} - -fn default_cost_per_1k_input() -> f64 { - 0.00015 // gpt-4o-mini -} - -fn default_cost_per_1k_output() -> f64 { - 0.0006 // gpt-4o-mini -} - -impl Default for LlmMetricsConfig { - fn default() -> Self { - Self { - track_tokens: default_true(), - track_latency: default_true(), - track_cost: default_true(), - cost_per_1k_input_tokens: default_cost_per_1k_input(), - cost_per_1k_output_tokens: default_cost_per_1k_output(), - } - } -} - -impl LlmMetricsConfig { - /// Calculate cost for given tokens. - pub fn calculate_cost(&self, input_tokens: u64, output_tokens: u64) -> f64 { - (input_tokens as f64 / 1000.0) * self.cost_per_1k_input_tokens - + (output_tokens as f64 / 1000.0) * self.cost_per_1k_output_tokens - } -} - -/// Retrieval-specific metrics configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RetrievalMetricsConfig { - /// Track search paths. - #[serde(default = "default_true")] - pub track_paths: bool, - - /// Track relevance scores. - #[serde(default = "default_true")] - pub track_scores: bool, - - /// Track iterations. - #[serde(default = "default_true")] - pub track_iterations: bool, - - /// Track cache hits/misses. 
- #[serde(default = "default_true")] - pub track_cache: bool, -} - -impl Default for RetrievalMetricsConfig { - fn default() -> Self { - Self { - track_paths: default_true(), - track_scores: default_true(), - track_iterations: default_true(), - track_cache: default_true(), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_metrics_config_defaults() { - let config = MetricsConfig::default(); - assert!(config.enabled); - assert_eq!(config.retention_days, 30); - } - - #[test] - fn test_llm_cost_calculation() { - let config = LlmMetricsConfig::default(); - - // 1000 input + 500 output tokens - let cost = config.calculate_cost(1000, 500); - - // 1 * 0.00015 + 0.5 * 0.0006 = 0.00015 + 0.0003 = 0.00045 - assert!((cost - 0.00045).abs() < 0.000001); - } - - #[test] - fn test_disabled_metrics() { - let config = MetricsConfig::disabled(); - assert!(!config.enabled); - } -} diff --git a/vectorless-core/vectorless/src/config/types/mod.rs b/vectorless-core/vectorless/src/config/types/mod.rs deleted file mode 100644 index e6ba3f8b..00000000 --- a/vectorless-core/vectorless/src/config/types/mod.rs +++ /dev/null @@ -1,362 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Configuration type definitions. - -mod indexer; -mod llm_pool; -mod metrics; -mod retrieval; -mod storage; - -use serde::{Deserialize, Serialize}; - -pub(crate) use indexer::IndexerConfig; -pub(crate) use llm_pool::{ - FallbackBehavior, FallbackConfig, LlmConfig, OnAllFailedBehavior, SlotConfig, -}; -pub(crate) use metrics::{LlmMetricsConfig, MetricsConfig, RetrievalMetricsConfig}; -pub(crate) use retrieval::RetrievalConfig; -pub(crate) use storage::{CompressionAlgorithm, StorageConfig}; - -/// Main configuration for vectorless. -/// -/// Users typically configure via [`EngineBuilder`](crate::client::EngineBuilder): -/// -/// ```rust,no_run -/// use vectorless::client::EngineBuilder; -/// -/// # async fn example() -> Result<(), vectorless::BuildError> { -/// let engine = EngineBuilder::new() -/// .with_key("sk-...") -/// .with_model("gpt-4o") -/// .with_endpoint("https://api.openai.com/v1") -/// .build() -/// .await?; -/// # Ok(()) -/// # } -/// ``` -/// -/// Advanced users can construct this programmatically: -/// -/// ```rust,ignore -/// use vectorless::config::{Config, LlmConfig, SlotConfig}; -/// -/// let config = Config::new().with_llm( -/// LlmConfig::new("gpt-4o") -/// .with_api_key("sk-...") -/// .with_endpoint("https://api.openai.com/v1") -/// .with_index(SlotConfig::fast().with_model("gpt-4o-mini")) -/// ); -/// ``` -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Config { - /// LLM configuration (model, credentials, retry, throttle, fallback). - #[serde(default)] - pub llm: LlmConfig, - - /// Metrics configuration. - #[serde(default)] - pub metrics: MetricsConfig, - - /// Indexer configuration. - #[serde(default)] - pub indexer: IndexerConfig, - - /// Retrieval strategy configuration (search, content aggregation, etc.). - #[serde(default)] - pub retrieval: RetrievalConfig, - - /// Storage configuration. - #[serde(default)] - pub storage: StorageConfig, - - /// Document graph configuration. 
- #[serde(default)] - pub graph: crate::graph::DocumentGraphConfig, -} - -impl Default for Config { - fn default() -> Self { - Self { - llm: LlmConfig::default(), - metrics: MetricsConfig::default(), - indexer: IndexerConfig::default(), - retrieval: RetrievalConfig::default(), - storage: StorageConfig::default(), - graph: crate::graph::DocumentGraphConfig::default(), - } - } -} - -impl Config { - /// Create a new configuration with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the LLM configuration. - pub fn with_llm(mut self, llm: LlmConfig) -> Self { - self.llm = llm; - self - } - - /// Set the metrics configuration. - pub fn with_metrics(mut self, metrics: MetricsConfig) -> Self { - self.metrics = metrics; - self - } - - /// Set the indexer configuration. - pub fn with_indexer(mut self, indexer: IndexerConfig) -> Self { - self.indexer = indexer; - self - } - - /// Set the retrieval configuration. - pub fn with_retrieval(mut self, retrieval: RetrievalConfig) -> Self { - self.retrieval = retrieval; - self - } - - /// Set the storage configuration. - pub fn with_storage(mut self, storage: StorageConfig) -> Self { - self.storage = storage; - self - } - - /// Set the document graph configuration. - pub fn with_graph(mut self, graph: crate::graph::DocumentGraphConfig) -> Self { - self.graph = graph; - self - } - - /// Validate the configuration. - pub fn validate(&self) -> Result<(), ConfigValidationError> { - let mut errors = Vec::new(); - - // Validate indexer - if self.indexer.subsection_threshold == 0 { - errors.push(ValidationError::error( - "indexer.subsection_threshold", - "Subsection threshold must be greater than 0", - )); - } - - // Validate LLM slot tokens - if self.llm.index.max_tokens == 0 { - errors.push(ValidationError::error( - "llm.index.max_tokens", - "Index max tokens must be greater than 0", - )); - } - - if self.llm.retrieval.max_tokens == 0 { - errors.push(ValidationError::error( - "llm.retrieval.max_tokens", - "Retrieval max tokens must be greater than 0", - )); - } - - // Validate retrieval - if self.retrieval.top_k == 0 { - errors.push(ValidationError::error( - "retrieval.top_k", - "Top K must be greater than 0", - )); - } - - // Validate throttle - if self.llm.throttle.max_concurrent_requests == 0 { - errors.push(ValidationError::error( - "llm.throttle.max_concurrent_requests", - "Max concurrent requests must be greater than 0", - )); - } - - // Validate graph - if self.graph.min_keyword_jaccard < 0.0 || self.graph.min_keyword_jaccard > 1.0 { - errors.push(ValidationError::error( - "graph.min_keyword_jaccard", - "Must be between 0.0 and 1.0", - )); - } - if self.graph.max_edges_per_node == 0 { - errors.push(ValidationError::error( - "graph.max_edges_per_node", - "Must be greater than 0", - )); - } - - // Validate fallback - if self.llm.fallback.enabled && self.llm.fallback.models.is_empty() { - errors.push(ValidationError::warning( - "llm.fallback.models", - "Fallback enabled but no fallback models configured", - )); - } - - if errors.is_empty() { - Ok(()) - } else { - Err(ConfigValidationError { errors }) - } - } -} - -/// Configuration validation error. -#[derive(Debug, Clone, thiserror::Error)] -#[error("Configuration validation failed with {} error(s)", self.errors.len())] -pub struct ConfigValidationError { - /// Validation errors. - pub errors: Vec, -} - -/// A single validation error. -#[derive(Debug, Clone)] -pub struct ValidationError { - /// Field path (e.g., "retrieval.content.token_budget"). 
-    pub path: String,
-
-    /// Error message.
-    pub message: String,
-
-    /// Expected value/range.
-    pub expected: Option<String>,
-
-    /// Actual value.
-    pub actual: Option<String>,
-
-    /// Severity level.
-    pub severity: Severity,
-}
-
-impl ValidationError {
-    /// Create an error-level validation error.
-    pub fn error(path: impl Into<String>, message: impl Into<String>) -> Self {
-        Self {
-            path: path.into(),
-            message: message.into(),
-            expected: None,
-            actual: None,
-            severity: Severity::Error,
-        }
-    }
-
-    /// Create a warning-level validation error.
-    pub fn warning(path: impl Into<String>, message: impl Into<String>) -> Self {
-        Self {
-            path: path.into(),
-            message: message.into(),
-            expected: None,
-            actual: None,
-            severity: Severity::Warning,
-        }
-    }
-
-    /// Create an info-level validation error.
-    pub fn info(path: impl Into<String>, message: impl Into<String>) -> Self {
-        Self {
-            path: path.into(),
-            message: message.into(),
-            expected: None,
-            actual: None,
-            severity: Severity::Info,
-        }
-    }
-
-    /// Set the expected value.
-    pub fn with_expected(mut self, expected: impl Into<String>) -> Self {
-        self.expected = Some(expected.into());
-        self
-    }
-
-    /// Set the actual value.
-    pub fn with_actual(mut self, actual: impl Into<String>) -> Self {
-        self.actual = Some(actual.into());
-        self
-    }
-}
-
-impl std::fmt::Display for ValidationError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let severity = match self.severity {
-            Severity::Error => "ERROR",
-            Severity::Warning => "WARNING",
-            Severity::Info => "INFO",
-        };
-        write!(f, "[{}] {}: {}", severity, self.path, self.message)?;
-        if let Some(ref expected) = self.expected {
-            write!(f, " (expected: {})", expected)?;
-        }
-        if let Some(ref actual) = self.actual {
-            write!(f, " (actual: {})", actual)?;
-        }
-        Ok(())
-    }
-}
-
-/// Validation severity level.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum Severity {
-    /// Error - must fix.
-    Error,
-    /// Warning - should fix.
-    Warning,
-    /// Info - suggestion.
- Info, -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_config_defaults() { - let config = Config::default(); - assert!(config.llm.model.is_empty()); - assert!(config.llm.index.model.is_none()); - assert_eq!(config.retrieval.top_k, 3); - assert_eq!(config.indexer.subsection_threshold, 300); - assert!(config.metrics.enabled); - } - - #[test] - fn test_llm_config_defaults() { - let config = LlmConfig::default(); - assert!(config.index.model.is_none()); - assert!(config.retrieval.model.is_none()); - assert_eq!(config.retry.max_attempts, 3); - assert_eq!(config.throttle.max_concurrent_requests, 10); - } - - #[test] - fn test_config_validation_success() { - let config = Config::default(); - assert!(config.validate().is_ok()); - } - - #[test] - fn test_config_validation_errors() { - let mut config = Config::default(); - config.retrieval.top_k = 0; - - let result = config.validate(); - assert!(result.is_err()); - - let err = result.unwrap_err(); - assert!(!err.errors.is_empty()); - } - - #[test] - fn test_validation_error_display() { - let err = ValidationError::error("test.field", "Invalid value") - .with_expected(">= 1") - .with_actual("0"); - - let display = format!("{}", err); - assert!(display.contains("ERROR")); - assert!(display.contains("test.field")); - assert!(display.contains("expected")); - } -} diff --git a/vectorless-core/vectorless/src/config/types/retrieval.rs b/vectorless-core/vectorless/src/config/types/retrieval.rs deleted file mode 100644 index c4300987..00000000 --- a/vectorless-core/vectorless/src/config/types/retrieval.rs +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Retrieval strategy configuration types. -//! -//! LLM configuration (model, api_key, endpoint) is managed centrally -//! in [`LlmConfig`](super::LlmConfig). This module only contains -//! retrieval strategy parameters. - -use serde::{Deserialize, Serialize}; - -use super::storage::{CacheConfig, StrategyConfig, SufficiencyConfig}; - -/// Retrieval strategy configuration. -/// -/// Controls how documents are searched and retrieved, independent -/// of which LLM model is used for navigation. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RetrievalConfig { - /// Number of top-k results to return. - #[serde(default = "default_top_k")] - pub top_k: usize, - - /// Search algorithm configuration. - #[serde(default)] - pub search: SearchConfig, - - /// Sufficiency checker configuration. - #[serde(default)] - pub sufficiency: SufficiencyConfig, - - /// Cache configuration. - #[serde(default)] - pub cache: CacheConfig, - - /// Strategy-specific configuration. - #[serde(default)] - pub strategy: StrategyConfig, -} - -fn default_top_k() -> usize { - 3 -} - -impl Default for RetrievalConfig { - fn default() -> Self { - Self { - top_k: default_top_k(), - search: SearchConfig::default(), - sufficiency: SufficiencyConfig::default(), - cache: CacheConfig::default(), - strategy: StrategyConfig::default(), - } - } -} - -impl RetrievalConfig { - /// Create a new retrieval config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the top_k. - pub fn with_top_k(mut self, top_k: usize) -> Self { - self.top_k = top_k; - self - } -} - -/// Search algorithm configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SearchConfig { - /// Number of top-k results to return. - #[serde(default = "default_search_top_k")] - pub top_k: usize, - - /// Beam width for multi-path search. 
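-    ///
-    /// A sketch using the builder methods defined below; the values are
-    /// illustrative:
-    ///
-    /// ```rust,ignore
-    /// let search = SearchConfig::new()
-    ///     .with_beam_width(5)
-    ///     .with_max_iterations(20);
-    /// ```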
- #[serde(default = "default_beam_width")] - pub beam_width: usize, - - /// Maximum iterations for search algorithms. - #[serde(default = "default_max_iterations")] - pub max_iterations: usize, - - /// Minimum score to include a path. - #[serde(default = "default_min_score")] - pub min_score: f32, - - /// Fallback chain: algorithms tried in order until min_score is met. - /// Options: "beam", "mcts", "pure_pilot". - /// Default: ["beam", "mcts", "pure_pilot"] - #[serde(default = "default_fallback_chain")] - pub fallback_chain: Vec, -} - -fn default_search_top_k() -> usize { - 5 -} - -fn default_beam_width() -> usize { - 3 -} - -fn default_max_iterations() -> usize { - 10 -} - -fn default_min_score() -> f32 { - 0.1 -} -fn default_fallback_chain() -> Vec { - vec!["beam".into(), "mcts".into(), "pure_pilot".into()] -} - -impl Default for SearchConfig { - fn default() -> Self { - Self { - top_k: default_search_top_k(), - beam_width: default_beam_width(), - max_iterations: default_max_iterations(), - min_score: default_min_score(), - fallback_chain: default_fallback_chain(), - } - } -} - -impl SearchConfig { - /// Create new search config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the top_k. - pub fn with_top_k(mut self, top_k: usize) -> Self { - self.top_k = top_k; - self - } - - /// Set the beam width. - pub fn with_beam_width(mut self, width: usize) -> Self { - self.beam_width = width; - self - } - - /// Set the max iterations. - pub fn with_max_iterations(mut self, max: usize) -> Self { - self.max_iterations = max; - self - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_retrieval_config_defaults() { - let config = RetrievalConfig::default(); - assert_eq!(config.top_k, 3); - assert_eq!(config.search.top_k, 5); - } - - #[test] - fn test_search_config_defaults() { - let config = SearchConfig::default(); - assert_eq!(config.top_k, 5); - assert_eq!(config.beam_width, 3); - assert_eq!(config.max_iterations, 10); - } -} diff --git a/vectorless-core/vectorless/src/config/types/storage.rs b/vectorless-core/vectorless/src/config/types/storage.rs deleted file mode 100644 index b13304ea..00000000 --- a/vectorless-core/vectorless/src/config/types/storage.rs +++ /dev/null @@ -1,742 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Storage and sufficiency configuration types. - -use serde::{Deserialize, Serialize}; -use std::path::PathBuf; - -/// Storage configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct StorageConfig { - /// Workspace directory for persisted documents. - #[serde(default = "default_workspace_dir")] - pub workspace_dir: PathBuf, - - /// LRU cache size (number of documents). - #[serde(default = "default_cache_size")] - pub cache_size: usize, - - /// Enable atomic writes (write to temp file, then rename). - /// This prevents data corruption on crash. - #[serde(default = "default_atomic_writes")] - pub atomic_writes: bool, - - /// Enable file locking for multi-process safety. - #[serde(default = "default_file_lock")] - pub file_lock: bool, - - /// Enable checksum verification for data integrity. - #[serde(default = "default_checksum_enabled")] - pub checksum_enabled: bool, - - /// Enable compression for stored documents. - #[serde(default)] - pub compression: CompressionConfig, - - /// Directory for pipeline checkpoints (derived from `workspace_dir`). 
- #[serde(skip)] - pub checkpoint_dir: PathBuf, -} - -fn default_workspace_dir() -> PathBuf { - default_workspace_path_for_cwd() -} - -/// Compute the default workspace path for the current working directory. -/// -/// Returns a platform-appropriate path: -/// - **Linux/macOS**: `~/.vectorless/workspaces/{cwd_hash}/` -/// - **Windows**: `%APPDATA%\vectorless\workspaces\{cwd_hash}\` -/// -/// where `cwd_hash` is a 12-hex-char hash derived from the current working -/// directory. This ensures different projects automatically get isolated -/// workspaces. -/// -/// # Environment variable resolution order -/// -/// | Platform | Primary | Fallback | Last resort | -/// |----------|-----------------|---------------------|-------------| -/// | Unix | `$HOME` | — | `"."` | -/// | Windows | `%LOCALAPPDATA%`| `%APPDATA%` | `"."` | -pub fn default_workspace_path_for_cwd() -> PathBuf { - use std::collections::hash_map::DefaultHasher; - use std::hash::{Hash, Hasher}; - - let base_dir = if cfg!(windows) { - // Windows: prefer %LOCALAPPDATA% (e.g. C:\Users\xxx\AppData\Local) - // then %APPDATA% (e.g. C:\Users\xxx\AppData\Roaming) - std::env::var("LOCALAPPDATA") - .or_else(|_| std::env::var("APPDATA")) - .map(PathBuf::from) - .unwrap_or_else(|_| PathBuf::from(".")) - } else { - // Unix (Linux, macOS): use $HOME - std::env::var("HOME") - .map(PathBuf::from) - .unwrap_or_else(|_| PathBuf::from(".")) - }; - - let cwd = std::env::current_dir().unwrap_or_else(|_| PathBuf::from(".")); - - let mut hasher = DefaultHasher::new(); - cwd.to_string_lossy().hash(&mut hasher); - let hash = format!("{:012x}", hasher.finish()); - - base_dir.join(".vectorless").join("workspaces").join(hash) -} - -fn default_cache_size() -> usize { - 100 -} - -fn default_atomic_writes() -> bool { - true -} - -fn default_file_lock() -> bool { - true -} - -fn default_checksum_enabled() -> bool { - true -} - -impl Default for StorageConfig { - fn default() -> Self { - let workspace_dir = default_workspace_dir(); - let checkpoint_dir = workspace_dir.join("checkpoints"); - Self { - workspace_dir, - cache_size: default_cache_size(), - atomic_writes: default_atomic_writes(), - file_lock: default_file_lock(), - checksum_enabled: default_checksum_enabled(), - compression: CompressionConfig::default(), - checkpoint_dir, - } - } -} - -impl StorageConfig { - /// Create new storage config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the workspace directory. - pub fn with_workspace_dir(mut self, dir: impl Into) -> Self { - self.workspace_dir = dir.into(); - self - } - - /// Set the cache size. - pub fn with_cache_size(mut self, size: usize) -> Self { - self.cache_size = size; - self - } - - /// Enable or disable atomic writes. - pub fn with_atomic_writes(mut self, enabled: bool) -> Self { - self.atomic_writes = enabled; - self - } - - /// Enable or disable file locking. - pub fn with_file_lock(mut self, enabled: bool) -> Self { - self.file_lock = enabled; - self - } - - /// Enable or disable checksum verification. - pub fn with_checksum(mut self, enabled: bool) -> Self { - self.checksum_enabled = enabled; - self - } - - /// Set compression configuration. - pub fn with_compression(mut self, compression: CompressionConfig) -> Self { - self.compression = compression; - self - } -} - -/// Compression configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct CompressionConfig { - /// Enable compression. - #[serde(default = "default_compression_enabled")] - pub enabled: bool, - - /// Compression algorithm. 
- #[serde(default = "default_compression_algorithm")] - pub algorithm: CompressionAlgorithm, - - /// Compression level (1-9, higher = better compression but slower). - #[serde(default = "default_compression_level")] - pub level: u32, -} - -fn default_compression_enabled() -> bool { - false -} - -fn default_compression_algorithm() -> CompressionAlgorithm { - CompressionAlgorithm::Gzip -} - -fn default_compression_level() -> u32 { - 6 -} - -impl Default for CompressionConfig { - fn default() -> Self { - Self { - enabled: default_compression_enabled(), - algorithm: default_compression_algorithm(), - level: default_compression_level(), - } - } -} - -impl CompressionConfig { - /// Create new compression config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Enable or disable compression. - pub fn with_enabled(mut self, enabled: bool) -> Self { - self.enabled = enabled; - self - } - - /// Set the compression algorithm. - pub fn with_algorithm(mut self, algorithm: CompressionAlgorithm) -> Self { - self.algorithm = algorithm; - self - } - - /// Set the compression level. - pub fn with_level(mut self, level: u32) -> Self { - self.level = level.clamp(1, 9); - self - } -} - -/// Compression algorithm. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum CompressionAlgorithm { - /// Gzip compression. - Gzip, - /// Zstandard compression. - Zstd, -} - -/// Sufficiency checker configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct SufficiencyConfig { - /// Minimum tokens for sufficiency. - #[serde(default = "default_min_tokens")] - pub min_tokens: usize, - - /// Target tokens for full sufficiency. - #[serde(default = "default_target_tokens")] - pub target_tokens: usize, - - /// Maximum tokens before stopping. - #[serde(default = "default_max_tokens")] - pub max_tokens: usize, - - /// Minimum content length (characters). - #[serde(default = "default_min_content_length")] - pub min_content_length: usize, - - /// Confidence threshold for LLM judge. - #[serde(default = "default_confidence_threshold")] - pub confidence_threshold: f32, -} - -fn default_min_tokens() -> usize { - 500 -} - -fn default_target_tokens() -> usize { - 2000 -} - -fn default_max_tokens() -> usize { - 4000 -} - -fn default_min_content_length() -> usize { - 200 -} - -fn default_confidence_threshold() -> f32 { - 0.7 -} - -impl Default for SufficiencyConfig { - fn default() -> Self { - Self { - min_tokens: default_min_tokens(), - target_tokens: default_target_tokens(), - max_tokens: default_max_tokens(), - min_content_length: default_min_content_length(), - confidence_threshold: default_confidence_threshold(), - } - } -} - -impl SufficiencyConfig { - /// Create new sufficiency config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the minimum tokens. - pub fn with_min_tokens(mut self, tokens: usize) -> Self { - self.min_tokens = tokens; - self - } - - /// Set the target tokens. - pub fn with_target_tokens(mut self, tokens: usize) -> Self { - self.target_tokens = tokens; - self - } - - /// Set the maximum tokens. - pub fn with_max_tokens(mut self, tokens: usize) -> Self { - self.max_tokens = tokens; - self - } - - /// Set the confidence threshold. - pub fn with_confidence_threshold(mut self, threshold: f32) -> Self { - self.confidence_threshold = threshold; - self - } -} - -/// Cache configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct CacheConfig { - /// Maximum number of cache entries. 
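-    /// A value of 0 effectively disables the cache (the dependency
-    /// validator emits a warning in that case).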
- #[serde(default = "default_max_entries")] - pub max_entries: usize, - - /// Time-to-live for cache entries (seconds). - #[serde(default = "default_ttl_secs")] - pub ttl_secs: u64, -} - -fn default_max_entries() -> usize { - 1000 -} - -fn default_ttl_secs() -> u64 { - 3600 -} - -impl Default for CacheConfig { - fn default() -> Self { - Self { - max_entries: default_max_entries(), - ttl_secs: default_ttl_secs(), - } - } -} - -impl CacheConfig { - /// Create new cache config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the maximum entries. - pub fn with_max_entries(mut self, max: usize) -> Self { - self.max_entries = max; - self - } - - /// Set the TTL in seconds. - pub fn with_ttl_secs(mut self, secs: u64) -> Self { - self.ttl_secs = secs; - self - } -} - -/// Strategy-specific configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct StrategyConfig { - /// MCTS exploration weight (sqrt(2) ≈ 1.414). - #[serde(default = "default_exploration_weight")] - pub exploration_weight: f32, - - /// Semantic similarity threshold. - #[serde(default = "default_similarity_threshold")] - pub similarity_threshold: f32, - - /// High similarity threshold for "answer" decision. - #[serde(default = "default_high_similarity_threshold")] - pub high_similarity_threshold: f32, - - /// Low similarity threshold for "explore" decision. - #[serde(default = "default_low_similarity_threshold")] - pub low_similarity_threshold: f32, - - /// Hybrid strategy configuration (BM25 + LLM refinement). - #[serde(default)] - pub hybrid: HybridStrategyConfig, - - /// Cross-document strategy configuration. - #[serde(default)] - pub cross_document: CrossDocumentStrategyConfig, - - /// Page-range strategy configuration. - #[serde(default)] - pub page_range: PageRangeStrategyConfig, -} - -/// Hybrid strategy configuration (BM25 pre-filter + LLM refinement). -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct HybridStrategyConfig { - /// Enable hybrid strategy. - #[serde(default = "default_true")] - pub enabled: bool, - - /// BM25 pre-filter: keep top N% of candidates. - #[serde(default = "default_pre_filter_ratio")] - pub pre_filter_ratio: f32, - - /// Minimum candidates to pass to LLM. - #[serde(default = "default_min_candidates")] - pub min_candidates: usize, - - /// Maximum candidates for LLM refinement. - #[serde(default = "default_max_candidates")] - pub max_candidates: usize, - - /// BM25 score for auto-accept (skip LLM). - #[serde(default = "default_auto_accept_threshold")] - pub auto_accept_threshold: f32, - - /// BM25 score for auto-reject (skip LLM). - #[serde(default = "default_auto_reject_threshold")] - pub auto_reject_threshold: f32, - - /// Weight for BM25 score in final scoring. - #[serde(default = "default_bm25_weight")] - pub bm25_weight: f32, - - /// Weight for LLM score in final scoring. 
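-    /// Together with `bm25_weight`, the weights typically sum to 1.0
-    /// (defaults: 0.4 + 0.6).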
- #[serde(default = "default_llm_weight")] - pub llm_weight: f32, -} - -fn default_true() -> bool { - true -} -fn default_pre_filter_ratio() -> f32 { - 0.3 -} -fn default_min_candidates() -> usize { - 2 -} -fn default_max_candidates() -> usize { - 5 -} -fn default_auto_accept_threshold() -> f32 { - 0.85 -} -fn default_auto_reject_threshold() -> f32 { - 0.15 -} -fn default_bm25_weight() -> f32 { - 0.4 -} -fn default_llm_weight() -> f32 { - 0.6 -} - -impl Default for HybridStrategyConfig { - fn default() -> Self { - Self { - enabled: true, - pre_filter_ratio: default_pre_filter_ratio(), - min_candidates: default_min_candidates(), - max_candidates: default_max_candidates(), - auto_accept_threshold: default_auto_accept_threshold(), - auto_reject_threshold: default_auto_reject_threshold(), - bm25_weight: default_bm25_weight(), - llm_weight: default_llm_weight(), - } - } -} - -/// Cross-document strategy configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct CrossDocumentStrategyConfig { - /// Enable cross-document strategy. - #[serde(default = "default_true")] - pub enabled: bool, - - /// Maximum documents to search. - #[serde(default = "default_max_documents")] - pub max_documents: usize, - - /// Maximum results per document. - #[serde(default = "default_max_results_per_doc")] - pub max_results_per_doc: usize, - - /// Maximum total results. - #[serde(default = "default_max_total_results")] - pub max_total_results: usize, - - /// Minimum score threshold. - #[serde(default = "default_min_score")] - pub min_score: f32, - - /// Merge strategy: TopK, BestPerDocument, WeightedByRelevance. - #[serde(default = "default_merge_strategy")] - pub merge_strategy: String, - - /// Search documents in parallel. - #[serde(default = "default_true")] - pub parallel_search: bool, -} - -fn default_max_documents() -> usize { - 10 -} -fn default_max_results_per_doc() -> usize { - 3 -} -fn default_max_total_results() -> usize { - 10 -} -fn default_min_score() -> f32 { - 0.3 -} -fn default_merge_strategy() -> String { - "TopK".to_string() -} - -impl Default for CrossDocumentStrategyConfig { - fn default() -> Self { - Self { - enabled: true, - max_documents: default_max_documents(), - max_results_per_doc: default_max_results_per_doc(), - max_total_results: default_max_total_results(), - min_score: default_min_score(), - merge_strategy: default_merge_strategy(), - parallel_search: true, - } - } -} - -/// Page-range strategy configuration. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct PageRangeStrategyConfig { - /// Enable page-range strategy. - #[serde(default = "default_true")] - pub enabled: bool, - - /// Include nodes that span across the boundary. - #[serde(default = "default_true")] - pub include_boundary_nodes: bool, - - /// Expand range by N pages for context. - #[serde(default)] - pub expand_context_pages: usize, - - /// Minimum overlap ratio for node inclusion. 
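-    /// Fraction of a node's page span that must fall inside the requested
-    /// range for the node to be included (default: 0.1).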
- #[serde(default = "default_min_overlap_ratio")] - pub min_overlap_ratio: f32, -} - -fn default_min_overlap_ratio() -> f32 { - 0.1 -} - -impl Default for PageRangeStrategyConfig { - fn default() -> Self { - Self { - enabled: true, - include_boundary_nodes: true, - expand_context_pages: 0, - min_overlap_ratio: default_min_overlap_ratio(), - } - } -} - -fn default_exploration_weight() -> f32 { - 1.414 -} - -fn default_similarity_threshold() -> f32 { - 0.5 -} - -fn default_high_similarity_threshold() -> f32 { - 0.8 -} - -fn default_low_similarity_threshold() -> f32 { - 0.3 -} - -impl Default for StrategyConfig { - fn default() -> Self { - Self { - exploration_weight: default_exploration_weight(), - similarity_threshold: default_similarity_threshold(), - high_similarity_threshold: default_high_similarity_threshold(), - low_similarity_threshold: default_low_similarity_threshold(), - hybrid: HybridStrategyConfig::default(), - cross_document: CrossDocumentStrategyConfig::default(), - page_range: PageRangeStrategyConfig::default(), - } - } -} - -impl StrategyConfig { - /// Create new strategy config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the exploration weight. - pub fn with_exploration_weight(mut self, weight: f32) -> Self { - self.exploration_weight = weight; - self - } - - /// Set the similarity threshold. - pub fn with_similarity_threshold(mut self, threshold: f32) -> Self { - self.similarity_threshold = threshold; - self - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_storage_config_defaults() { - let config = StorageConfig::default(); - // Default workspace should be under .vectorless/workspaces/ (Unix) - // or vectorless/workspaces/ (Windows via AppData) - let path_str = config.workspace_dir.to_string_lossy(); - if cfg!(windows) { - assert!( - path_str.contains("vectorless"), - "expected ...\\vectorless\\workspaces\\..., got {:?}", - config.workspace_dir, - ); - } else { - assert!( - path_str.contains(".vectorless"), - "expected ~/.vectorless/workspaces/..., got {:?}", - config.workspace_dir, - ); - } - assert_eq!(config.cache_size, 100); - assert!(config.atomic_writes); - assert!(config.file_lock); - assert!(config.checksum_enabled); - assert!(!config.compression.enabled); - } - - #[test] - fn test_storage_config_builders() { - let config = StorageConfig::new() - .with_workspace_dir("/data/workspace") - .with_cache_size(200) - .with_atomic_writes(false) - .with_file_lock(false) - .with_checksum(false); - - assert_eq!(config.workspace_dir, PathBuf::from("/data/workspace")); - assert_eq!(config.cache_size, 200); - assert!(!config.atomic_writes); - assert!(!config.file_lock); - assert!(!config.checksum_enabled); - } - - #[test] - fn test_compression_config_defaults() { - let config = CompressionConfig::default(); - assert!(!config.enabled); - assert_eq!(config.algorithm, CompressionAlgorithm::Gzip); - assert_eq!(config.level, 6); - } - - #[test] - fn test_compression_config_level_clamp() { - let config = CompressionConfig::new().with_level(15); - assert_eq!(config.level, 9); // clamped to max - - let config = CompressionConfig::new().with_level(0); - assert_eq!(config.level, 1); // clamped to min - } - - #[test] - fn test_sufficiency_config_defaults() { - let config = SufficiencyConfig::default(); - assert_eq!(config.min_tokens, 500); - assert_eq!(config.target_tokens, 2000); - assert_eq!(config.max_tokens, 4000); - } - - #[test] - fn test_cache_config_defaults() { - let config = CacheConfig::default(); - 
assert_eq!(config.max_entries, 1000);
-        assert_eq!(config.ttl_secs, 3600);
-    }
-
-    #[test]
-    fn test_strategy_config_defaults() {
-        let config = StrategyConfig::default();
-        assert!((config.exploration_weight - 1.414).abs() < 0.001);
-        assert_eq!(config.similarity_threshold, 0.5);
-        assert!(config.hybrid.enabled);
-        assert!(config.cross_document.enabled);
-        assert!(config.page_range.enabled);
-    }
-
-    #[test]
-    fn test_hybrid_strategy_config_defaults() {
-        let config = HybridStrategyConfig::default();
-        assert!(config.enabled);
-        assert!((config.pre_filter_ratio - 0.3).abs() < f32::EPSILON);
-        assert_eq!(config.min_candidates, 2);
-        assert_eq!(config.max_candidates, 5);
-        assert!((config.auto_accept_threshold - 0.85).abs() < f32::EPSILON);
-    }
-
-    #[test]
-    fn test_cross_document_strategy_config_defaults() {
-        let config = CrossDocumentStrategyConfig::default();
-        assert!(config.enabled);
-        assert_eq!(config.max_documents, 10);
-        assert_eq!(config.max_results_per_doc, 3);
-        assert_eq!(config.merge_strategy, "TopK");
-        assert!(config.parallel_search);
-    }
-
-    #[test]
-    fn test_page_range_strategy_config_defaults() {
-        let config = PageRangeStrategyConfig::default();
-        assert!(config.enabled);
-        assert!(config.include_boundary_nodes);
-        assert_eq!(config.expand_context_pages, 0);
-        assert!((config.min_overlap_ratio - 0.1).abs() < f32::EPSILON);
-    }
-}
diff --git a/vectorless-core/vectorless/src/config/validator.rs b/vectorless-core/vectorless/src/config/validator.rs
deleted file mode 100644
index c3f55422..00000000
--- a/vectorless-core/vectorless/src/config/validator.rs
+++ /dev/null
@@ -1,323 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Configuration validation.
-//!
-//! This module provides comprehensive validation for configuration values,
-//! including range checks, consistency checks, and dependency validation.
-
-use super::types::{Config, ConfigValidationError, Severity, ValidationError};
-
-/// Configuration validator.
-#[derive(Debug, Default)]
-pub struct ConfigValidator {
-    /// Validation rules to apply.
-    rules: Vec<Box<dyn ValidationRule>>,
-}
-
-impl ConfigValidator {
-    /// Create a new validator with default rules.
-    pub fn new() -> Self {
-        Self {
-            rules: vec![
-                Box::new(RangeValidator),
-                Box::new(ConsistencyValidator),
-                Box::new(DependencyValidator),
-            ],
-        }
-    }
-
-    /// Add a custom validation rule.
-    pub fn with_rule(mut self, rule: Box<dyn ValidationRule>) -> Self {
-        self.rules.push(rule);
-        self
-    }
-
-    /// Validate the configuration.
-    pub fn validate(&self, config: &Config) -> Result<(), ConfigValidationError> {
-        let mut errors = Vec::new();
-
-        for rule in &self.rules {
-            rule.validate(config, &mut errors);
-        }
-
-        // Only fail on errors, not warnings or info
-        let has_errors = errors.iter().any(|e| e.severity == Severity::Error);
-
-        if has_errors {
-            Err(ConfigValidationError { errors })
-        } else {
-            Ok(())
-        }
-    }
-}
-
-/// Trait for validation rules.
-pub trait ValidationRule: std::fmt::Debug + Send + Sync {
-    /// Validate the configuration, appending errors if found.
-    fn validate(&self, config: &Config, errors: &mut Vec<ValidationError>);
-}
-
-/// Validates value ranges.
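-///
-/// e.g. `top_k`, `beam_width`, token budgets, and throttle limits must be
-/// greater than zero; suspiciously large values only draw a warning.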
-#[derive(Debug)]
-struct RangeValidator;
-
-impl ValidationRule for RangeValidator {
-    fn validate(&self, config: &Config, errors: &mut Vec<ValidationError>) {
-        // Indexer ranges
-        if config.indexer.subsection_threshold == 0 {
-            errors.push(ValidationError::error(
-                "indexer.subsection_threshold",
-                "Subsection threshold must be greater than 0",
-            ));
-        }
-
-        if config.indexer.subsection_threshold > 10000 {
-            errors.push(
-                ValidationError::warning(
-                    "indexer.subsection_threshold",
-                    "Subsection threshold is very high, may impact performance",
-                )
-                .with_actual(config.indexer.subsection_threshold.to_string()),
-            );
-        }
-
-        // LLM slot token ranges
-        if config.llm.index.max_tokens == 0 {
-            errors.push(ValidationError::error(
-                "llm.index.max_tokens",
-                "Index max tokens must be greater than 0",
-            ));
-        }
-
-        if config.llm.retrieval.max_tokens == 0 {
-            errors.push(ValidationError::error(
-                "llm.retrieval.max_tokens",
-                "Retrieval max tokens must be greater than 0",
-            ));
-        }
-
-        // Retrieval ranges
-        if config.retrieval.top_k == 0 {
-            errors.push(ValidationError::error(
-                "retrieval.top_k",
-                "Top K must be greater than 0",
-            ));
-        }
-
-        if config.retrieval.search.beam_width == 0 {
-            errors.push(ValidationError::error(
-                "retrieval.search.beam_width",
-                "Beam width must be greater than 0",
-            ));
-        }
-
-        // Throttle ranges
-        if config.llm.throttle.max_concurrent_requests == 0 {
-            errors.push(ValidationError::error(
-                "llm.throttle.max_concurrent_requests",
-                "Max concurrent requests must be greater than 0",
-            ));
-        }
-
-        if config.llm.throttle.requests_per_minute == 0 {
-            errors.push(ValidationError::error(
-                "llm.throttle.requests_per_minute",
-                "Requests per minute must be greater than 0",
-            ));
-        }
-
-        // Fallback ranges
-        if config.llm.fallback.max_retries == 0 {
-            errors.push(ValidationError::warning(
-                "llm.fallback.max_retries",
-                "Max retries is 0, fallback will not retry",
-            ));
-        }
-    }
-}
-
-/// Validates configuration consistency.
-#[derive(Debug)]
-struct ConsistencyValidator;
-
-impl ValidationRule for ConsistencyValidator {
-    fn validate(&self, config: &Config, errors: &mut Vec<ValidationError>) {
-        // Check if index tokens are reasonable
-        if config.llm.index.max_tokens > config.indexer.max_segment_tokens {
-            errors.push(
-                ValidationError::warning(
-                    "llm.index.max_tokens",
-                    "Index max tokens exceeds max segment tokens",
-                )
-                .with_expected(format!("<= {}", config.indexer.max_segment_tokens))
-                .with_actual(config.llm.index.max_tokens.to_string()),
-            );
-        }
-
-        // Check if sufficiency thresholds are consistent
-        if config.retrieval.sufficiency.min_tokens > config.retrieval.sufficiency.target_tokens {
-            errors.push(
-                ValidationError::error(
-                    "retrieval.sufficiency.min_tokens",
-                    "Min tokens cannot exceed target tokens",
-                )
-                .with_expected(format!("<= {}", config.retrieval.sufficiency.target_tokens))
-                .with_actual(config.retrieval.sufficiency.min_tokens.to_string()),
-            );
-        }
-
-        if config.retrieval.sufficiency.target_tokens > config.retrieval.sufficiency.max_tokens {
-            errors.push(
-                ValidationError::error(
-                    "retrieval.sufficiency.target_tokens",
-                    "Target tokens cannot exceed max tokens",
-                )
-                .with_expected(format!("<= {}", config.retrieval.sufficiency.max_tokens))
-                .with_actual(config.retrieval.sufficiency.target_tokens.to_string()),
-            );
-        }
-    }
-}
-
-/// Validates configuration dependencies.
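-///
-/// e.g. fallback models must be configured when fallback behavior requires
-/// them, and similarity thresholds must be ordered low < high.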
-#[derive(Debug)]
-struct DependencyValidator;
-
-impl ValidationRule for DependencyValidator {
-    fn validate(&self, config: &Config, errors: &mut Vec<ValidationError>) {
-        // Check if API key is available when summaries are needed
-        if config.llm.api_key.is_none() && config.indexer.max_summary_tokens > 0 {
-            errors.push(ValidationError::info(
-                "llm.api_key",
-                "No API key configured, summary generation will be disabled",
-            ));
-        }
-
-        // Check fallback configuration
-        if config.llm.fallback.enabled {
-            if config.llm.fallback.models.is_empty() && config.llm.fallback.endpoints.is_empty() {
-                errors.push(ValidationError::warning(
-                    "llm.fallback.models",
-                    "Fallback enabled but no fallback models or endpoints configured",
-                ));
-            }
-
-            // Check retry behavior consistency
-            if matches!(
-                config.llm.fallback.on_rate_limit,
-                super::types::FallbackBehavior::Fallback
-            ) && config.llm.fallback.models.is_empty()
-            {
-                errors.push(ValidationError::error(
-                    "llm.fallback.models",
-                    "Rate limit behavior is 'fallback' but no fallback models configured",
-                ));
-            }
-        }
-
-        // Check cache configuration
-        if config.retrieval.cache.max_entries == 0 {
-            errors.push(ValidationError::warning(
-                "retrieval.cache.max_entries",
-                "Cache disabled (max_entries = 0), performance may be impacted",
-            ));
-        }
-
-        // Check strategy configuration
-        if config.retrieval.strategy.exploration_weight <= 0.0 {
-            errors.push(
-                ValidationError::error(
-                    "retrieval.strategy.exploration_weight",
-                    "Exploration weight must be positive",
-                )
-                .with_actual(config.retrieval.strategy.exploration_weight.to_string()),
-            );
-        }
-
-        // Check similarity thresholds are ordered correctly
-        if config.retrieval.strategy.low_similarity_threshold
-            >= config.retrieval.strategy.high_similarity_threshold
-        {
-            errors.push(
-                ValidationError::error(
-                    "retrieval.strategy.low_similarity_threshold",
-                    "Low similarity threshold must be less than high similarity threshold",
-                )
-                .with_expected(format!(
-                    "< {}",
-                    config.retrieval.strategy.high_similarity_threshold
-                ))
-                .with_actual(
-                    config
-                        .retrieval
-                        .strategy
-                        .low_similarity_threshold
-                        .to_string(),
-                ),
-            );
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_validator_valid_config() {
-        let config = Config::default();
-        let validator = ConfigValidator::new();
-        // Default config should pass validation (no errors, warnings are ok)
-        let result = validator.validate(&config);
-        assert!(result.is_ok(), "Default config should pass validation");
-    }
-
-    #[test]
-    fn test_validator_catches_range_errors() {
-        let mut config = Config::default();
-        config.retrieval.top_k = 0;
-
-        let validator = ConfigValidator::new();
-        let result = validator.validate(&config);
-
-        assert!(result.is_err());
-        let err = result.unwrap_err();
-        assert!(err.errors.iter().any(|e| e.path.contains("top_k")));
-    }
-
-    #[test]
-    fn test_validator_catches_consistency_errors() {
-        let mut config = Config::default();
-        config.retrieval.sufficiency.min_tokens = 3000;
-        config.retrieval.sufficiency.target_tokens = 2000;
-
-        let validator = ConfigValidator::new();
-        let result = validator.validate(&config);
-
-        assert!(result.is_err());
-        let err = result.unwrap_err();
-        assert!(err.errors.iter().any(|e| e.path.contains("min_tokens")));
-    }
-
-    #[test]
-    fn test_validator_catches_dependency_warnings() {
-        let mut config = Config::default();
-        config.llm.fallback.enabled = true;
-        config.llm.fallback.models.clear();
-
-        let validator = ConfigValidator::new();
-        let result = validator.validate(&config);
-
-        
// Warnings alone never fail validation, so this is usually Ok; if the
-        // config does fail (e.g. the rate-limit behavior defaults to
-        // fallback), the error must point at the fallback models path.
-        if let Err(err) = result {
-            assert!(
-                err.errors
-                    .iter()
-                    .any(|e| e.path.contains("llm.fallback.models"))
-            );
-        }
-    }
-}
diff --git a/vectorless-core/vectorless/src/document/format.rs b/vectorless-core/vectorless/src/document/format.rs
deleted file mode 100644
index 78f6e52e..00000000
--- a/vectorless-core/vectorless/src/document/format.rs
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Document format and sufficiency types.
-//!
-//! These types are used across multiple modules and are defined here
-//! to avoid circular dependencies between crates.
-
-use serde::{Deserialize, Serialize};
-
-/// Supported document formats.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
-pub enum DocumentFormat {
-    /// Markdown files (.md, .markdown)
-    Markdown,
-    /// PDF files (.pdf)
-    Pdf,
-}
-
-impl DocumentFormat {
-    /// Detect format from file extension.
-    pub fn from_extension(ext: &str) -> Option<Self> {
-        match ext.to_lowercase().as_str() {
-            "md" | "markdown" => Some(Self::Markdown),
-            "pdf" => Some(Self::Pdf),
-            _ => None,
-        }
-    }
-
-    /// Get the file extension for this format.
-    pub fn extension(&self) -> &'static str {
-        match self {
-            Self::Markdown => "md",
-            Self::Pdf => "pdf",
-        }
-    }
-
-    /// All supported file extensions (lowercase).
-    ///
-    /// Single source of truth — used by directory scanning to
-    /// discover indexable files.
-    pub const SUPPORTED_EXTENSIONS: &'static [&'static str] = &["md", "pdf"];
-}
-
-/// Sufficiency level for incremental retrieval.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum SufficiencyLevel {
-    /// Information is sufficient, stop retrieving.
-    Sufficient,
-
-    /// Partial information, can continue if needed.
-    PartialSufficient,
-
-    /// Information is insufficient, continue retrieving.
-    Insufficient,
-}
-
-impl Default for SufficiencyLevel {
-    fn default() -> Self {
-        Self::Insufficient
-    }
-}
diff --git a/vectorless-core/vectorless/src/document/mod.rs b/vectorless-core/vectorless/src/document/mod.rs
deleted file mode 100644
index 85ea1ff3..00000000
--- a/vectorless-core/vectorless/src/document/mod.rs
+++ /dev/null
@@ -1,43 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Document types - pure data structures for document tree representation.
-//!
-//! This module contains the core types that represent hierarchical documents.
-//! These types have no dependencies on indexing or retrieval logic.
-//!
-//! # Types
-//!
-//! - [`TreeNode`] - A node in the document tree
-//! - [`DocumentTree`] - Arena-based tree structure
-//! - [`NodeId`] - Unique identifier for tree nodes
-//! - [`TocView`] - Table of Contents generator
-//! - [`StructureNode`] - JSON export structure
-//! - [`NodeReference`] - In-document reference (e.g., "see Appendix G")
-//! - [`RefType`] - Type of reference (Section, Appendix, Table, etc.)
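-//!
-//! # Example
-//!
-//! A minimal sketch of assembling a tree by hand (the indexing pipeline
-//! normally builds it from a parsed document; marked `ignore` because the
-//! `vectorless::document` import path is assumed here):
-//!
-//! ```ignore
-//! use vectorless::document::DocumentTree;
-//!
-//! let mut tree = DocumentTree::new("Guide", "");
-//! let root = tree.root();
-//! tree.add_child(root, "Setup", "How to install and configure.");
-//! tree.add_child(root, "API", "Endpoint reference.");
-//! assert_eq!(tree.children_iter(root).count(), 2);
-//! ```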
-
-mod format;
-mod navigation;
-mod node;
-mod reasoning;
-mod reference;
-mod serde_helpers;
-mod structure;
-mod toc;
-mod tree;
-pub mod understanding;
-
-pub use format::{DocumentFormat, SufficiencyLevel};
-pub use navigation::{ChildRoute, DocCard, NavEntry, NavigationIndex, SectionCard};
-pub use node::{NodeId, TreeNode};
-pub use reasoning::{
-    ReasoningIndex, ReasoningIndexBuilder, ReasoningIndexConfig, SectionSummary, SummaryShortcut,
-    TopicEntry,
-};
-pub use reference::ReferenceExtractor;
-pub use structure::{DocumentStructure, StructureNode};
-pub use toc::{TocConfig, TocEntry, TocNode, TocView};
-pub use tree::{DocumentTree, RetrievalIndex};
-pub use understanding::{
-    Answer, Concept, Document, DocumentInfo, Evidence, IngestInput, ReasoningTrace, TraceStep,
-};
diff --git a/vectorless-core/vectorless/src/document/navigation.rs b/vectorless-core/vectorless/src/document/navigation.rs
deleted file mode 100644
index dbfeadd4..00000000
--- a/vectorless-core/vectorless/src/document/navigation.rs
+++ /dev/null
@@ -1,626 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Navigation index for Agent-based retrieval.
-//!
-//! This is the primary data source for the Agent during the query phase.
-//! It provides a compact, pre-computed view of the document tree optimized
-//! for navigation decisions — the Agent can decide where to descend without
-//! reading the actual content.
-//!
-//! # Design
-//!
-//! Based on the Corpus2Skill paper (2604.14572v1), this is the in-memory
-//! equivalent of SKILL.md / INDEX.md. The Agent reads `child_routes` at
-//! each decision point to see all available sub-topics and their descriptions,
-//! then chooses where to navigate next.
-//!
-//! # Data Flow
-//!
-//! ```text
-//! Enhance stage (writes to TreeNode):
-//!   summary, description, routing_keywords, leaf_count
-//!        │
-//!        └──→ Navigation stage (reads TreeNode fields)
-//!             Builds: NavigationIndex (NavEntry + ChildRoute)
-//! ```
-//!
-//! No LLM calls are made during Navigation stage construction.
-
-use std::collections::HashMap;
-
-use serde::{Deserialize, Serialize};
-
-use super::node::NodeId;
-
-/// Navigation index — Agent's primary data source during the query phase.
-///
-/// Contains pre-computed navigation metadata for every non-leaf node,
-/// allowing the Agent to make routing decisions without accessing the
-/// content layer (DocumentTree).
-///
-/// `HashMap<NodeId, _>` fields use `serde_helpers` (serialized as vectors
-/// of key/value pairs) because serde_json cannot deserialize integer-keyed
-/// maps.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct NavigationIndex {
-    /// Navigation entry for each non-leaf node.
-    #[serde(with = "super::serde_helpers")]
-    nav_entries: HashMap<NodeId, NavEntry>,
-
-    /// Child routes for each non-leaf node.
-    #[serde(with = "super::serde_helpers")]
-    child_routes: HashMap<NodeId, Vec<ChildRoute>>,
-
-    /// Pre-computed document card for multi-document Orchestrator.
-    /// Built during compile phase by NavigationIndexStage.
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    doc_card: Option<DocCard>,
-}
-
-impl NavigationIndex {
-    /// Create a new empty navigation index.
-    pub fn new() -> Self {
-        Self {
-            nav_entries: HashMap::new(),
-            child_routes: HashMap::new(),
-            doc_card: None,
-        }
-    }
-
-    /// Add a navigation entry for a non-leaf node.
-    pub fn add_entry(&mut self, node_id: NodeId, entry: NavEntry) {
-        self.nav_entries.insert(node_id, entry);
-    }
-
-    /// Add child routes for a non-leaf node.
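-    ///
-    /// Calling this again for the same `parent_id` replaces the previously
-    /// stored routes (the entry is a plain `HashMap::insert`).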
-    pub fn add_child_routes(&mut self, parent_id: NodeId, routes: Vec<ChildRoute>) {
-        self.child_routes.insert(parent_id, routes);
-    }
-
-    /// Get the navigation entry for a node.
-    pub fn get_entry(&self, node_id: NodeId) -> Option<&NavEntry> {
-        self.nav_entries.get(&node_id)
-    }
-
-    /// Get the child routes for a node.
-    pub fn get_child_routes(&self, node_id: NodeId) -> Option<&[ChildRoute]> {
-        self.child_routes.get(&node_id).map(|v| v.as_slice())
-    }
-
-    /// Get the number of navigation entries.
-    pub fn entry_count(&self) -> usize {
-        self.nav_entries.len()
-    }
-
-    /// Get the total number of child route records.
-    pub fn total_child_routes(&self) -> usize {
-        self.child_routes.values().map(|v| v.len()).sum()
-    }
-
-    /// Get the root node's navigation entry.
-    pub fn root_entry(&self) -> Option<&NavEntry> {
-        // The root should always be present if the index is non-empty.
-        // Return the first entry with level 0.
-        self.nav_entries.values().find(|e| e.level == 0)
-    }
-
-    /// Iterate over all navigation entries.
-    pub fn entries(&self) -> impl Iterator<Item = (&NodeId, &NavEntry)> {
-        self.nav_entries.iter()
-    }
-
-    /// Iterate over all child route sets.
-    pub fn all_child_routes(&self) -> impl Iterator<Item = (&NodeId, &[ChildRoute])> {
-        self.child_routes.iter().map(|(k, v)| (k, v.as_slice()))
-    }
-
-    /// Check if the index is empty.
-    pub fn is_empty(&self) -> bool {
-        self.nav_entries.is_empty()
-    }
-
-    /// Get the pre-computed document card.
-    pub fn doc_card(&self) -> Option<&DocCard> {
-        self.doc_card.as_ref()
-    }
-
-    /// Set the document card.
-    pub fn set_doc_card(&mut self, card: DocCard) {
-        self.doc_card = Some(card);
-    }
-}
-
-impl Default for NavigationIndex {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-/// Navigation entry for a non-leaf node.
-///
-/// Provides the Agent with enough context to decide whether this subtree
-/// is relevant to the current query, without needing to read the node's
-/// actual content.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct NavEntry {
-    /// Routing summary describing what this subtree covers.
-    /// Comes from Enhance stage's `summary` (routing-oriented).
-    pub overview: String,
-
-    /// Typical questions this subtree can answer.
-    /// Extracted from content/summary during Enhance stage.
-    pub question_hints: Vec<String>,
-
-    /// Topic tags for keyword-based matching.
-    /// Comes from Enhance stage's `routing_keywords`.
-    pub topic_tags: Vec<String>,
-
-    /// Total number of leaf nodes in this subtree.
-    /// Equivalent to the paper's `num_documents`.
-    pub leaf_count: usize,
-
-    /// Depth of this node in the tree.
-    /// Equivalent to the paper's `level`.
-    pub level: usize,
-}
-
-/// Child route — compact routing info for one child node.
-///
-/// The Agent sees a list of `ChildRoute`s when deciding which child
-/// to descend into. This provides progressive disclosure: the Agent
-/// doesn't need to enter the child node to understand what it contains.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ChildRoute {
-    /// The child node's ID (for the Agent to navigate to).
-    pub node_id: NodeId,
-
-    /// Child node's title.
-    pub title: String,
-
-    /// One-sentence description of what this child covers.
-    /// Comes from Enhance stage's `description` field.
-    pub description: String,
-
-    /// Number of leaf nodes in this child's subtree.
-    pub leaf_count: usize,
-}
-
-/// Pre-computed document card for multi-document Orchestrator Agent.
-///
-/// Built during the compile phase by `NavigationIndexStage`, this provides
-/// a compact summary of the entire document — enough for the Orchestrator
-/// to decide whether a document is relevant to a query without entering it.
-///
-/// All fields come from data already computed in earlier phases of the
-/// NavigationIndexStage (root NavEntry + root child_routes). No LLM calls.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DocCard {
-    /// Document title (root node title).
-    pub title: String,
-
-    /// Document overview (root NavEntry.overview).
-    pub overview: String,
-
-    /// Questions this document can answer (root NavEntry.question_hints).
-    pub question_hints: Vec<String>,
-
-    /// Topic keywords (root NavEntry.topic_tags).
-    pub topic_tags: Vec<String>,
-
-    /// Top-level section summaries (from root child_routes).
-    pub sections: Vec<SectionCard>,
-
-    /// Total leaf nodes in the document.
-    pub total_leaves: usize,
-}
-
-/// One top-level section in a [`DocCard`].
-///
-/// Provides a compact view of a single top-level section,
-/// allowing the Orchestrator to scan section titles and descriptions
-/// to assess document relevance.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct SectionCard {
-    /// Section title.
-    pub title: String,
-
-    /// One-sentence description of this section.
-    pub description: String,
-
-    /// Number of leaf nodes in this section's subtree.
-    pub leaf_count: usize,
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::document::DocumentTree;
-
-    fn build_small_tree() -> DocumentTree {
-        // Root -> [Child1 (leaf), Child2 -> [Grandchild (leaf)]]
-        let mut tree = DocumentTree::new("Root", "");
-        let root = tree.root();
-        let _child1 = tree.add_child(root, "Child1", "leaf content");
-        let child2 = tree.add_child(root, "Child2", "");
-        let _grandchild = tree.add_child(child2, "Grandchild", "leaf content");
-        tree
-    }
-
-    #[test]
-    fn test_empty_navigation_index() {
-        let index = NavigationIndex::new();
-        assert!(index.is_empty());
-        assert_eq!(index.entry_count(), 0);
-        assert_eq!(index.total_child_routes(), 0);
-        assert!(index.root_entry().is_none());
-    }
-
-    #[test]
-    fn test_add_and_retrieve_entry() {
-        let tree = build_small_tree();
-        let root = tree.root();
-
-        let entry = NavEntry {
-            overview: "Payment integration guide".to_string(),
-            question_hints: vec!["How to set up Stripe?".to_string()],
-            topic_tags: vec!["payment".to_string(), "stripe".to_string()],
-            leaf_count: 5,
-            level: 0,
-        };
-
-        let mut index = NavigationIndex::new();
-        index.add_entry(root, entry);
-
-        assert!(!index.is_empty());
-        assert_eq!(index.entry_count(), 1);
-
-        let retrieved = index.get_entry(root).unwrap();
-        assert_eq!(retrieved.overview, "Payment integration guide");
-        assert_eq!(retrieved.leaf_count, 5);
-    }
-
-    #[test]
-    fn test_add_and_retrieve_child_routes() {
-        let tree = build_small_tree();
-        let root = tree.root();
-        let children: Vec<NodeId> = tree.children_iter(root).collect();
-
-        let routes = vec![
-            ChildRoute {
-                node_id: children[0],
-                title: "Getting Started".to_string(),
-                description: "Setup and installation".to_string(),
-                leaf_count: 3,
-            },
-            ChildRoute {
-                node_id: children[1],
-                title: "API Reference".to_string(),
-                description: "REST API endpoints".to_string(),
-                leaf_count: 7,
-            },
-        ];
-
-        let mut index = NavigationIndex::new();
-        index.add_child_routes(root, routes);
-
-        let retrieved = index.get_child_routes(root).unwrap();
-        assert_eq!(retrieved.len(), 2);
-        assert_eq!(retrieved[0].title, "Getting Started");
-        assert_eq!(retrieved[1].leaf_count,
 7);
-        assert_eq!(index.total_child_routes(), 2);
-    }
-
-    #[test]
-    fn test_root_entry() {
-        let tree = build_small_tree();
-        let root = tree.root();
-        let children: Vec<NodeId> = tree.children_iter(root).collect();
-
-        let mut index = NavigationIndex::new();
-        index.add_entry(
-            root,
-            NavEntry {
-                overview: "Root".to_string(),
-                question_hints: vec![],
-                topic_tags: vec![],
-                leaf_count: 10,
-                level: 0,
-            },
-        );
-        index.add_entry(
-            children[1],
-            NavEntry {
-                overview: "Child".to_string(),
-                question_hints: vec![],
-                topic_tags: vec![],
-                leaf_count: 5,
-                level: 1,
-            },
-        );
-
-        let root_entry = index.root_entry().unwrap();
-        assert_eq!(root_entry.level, 0);
-        assert_eq!(root_entry.leaf_count, 10);
-    }
-
-    #[test]
-    fn test_get_entry_nonexistent() {
-        let index = NavigationIndex::new();
-        let tree = build_small_tree();
-        // Leaf node should never have an entry
-        let children: Vec<NodeId> = tree.children_iter(tree.root()).collect();
-        assert!(index.get_entry(children[0]).is_none());
-    }
-
-    #[test]
-    fn test_get_child_routes_nonexistent() {
-        let index = NavigationIndex::new();
-        let tree = build_small_tree();
-        assert!(index.get_child_routes(tree.root()).is_none());
-    }
-
-    #[test]
-    fn test_default_trait() {
-        let index = NavigationIndex::default();
-        assert!(index.is_empty());
-    }
-
-    #[test]
-    fn test_entries_iterator() {
-        let tree = build_small_tree();
-        let root = tree.root();
-        let children: Vec<NodeId> = tree.children_iter(root).collect();
-
-        let mut index = NavigationIndex::new();
-        index.add_entry(
-            root,
-            NavEntry {
-                overview: "Root".to_string(),
-                question_hints: vec![],
-                topic_tags: vec![],
-                leaf_count: 2,
-                level: 0,
-            },
-        );
-        index.add_entry(
-            children[1], // Child2 is non-leaf
-            NavEntry {
-                overview: "Child2".to_string(),
-                question_hints: vec![],
-                topic_tags: vec![],
-                leaf_count: 1,
-                level: 1,
-            },
-        );
-
-        let all_entries: Vec<_> = index.entries().collect();
-        assert_eq!(all_entries.len(), 2);
-    }
-
-    #[test]
-    fn test_all_child_routes_iterator() {
-        let tree = build_small_tree();
-        let root = tree.root();
-        let children: Vec<NodeId> = tree.children_iter(root).collect();
-
-        let mut index = NavigationIndex::new();
-        index.add_child_routes(
-            root,
-            vec![ChildRoute {
-                node_id: children[0],
-                title: "C1".to_string(),
-                description: "d".to_string(),
-                leaf_count: 1,
-            }],
-        );
-
-        let all_routes: Vec<_> = index.all_child_routes().collect();
-        assert_eq!(all_routes.len(), 1);
-        assert_eq!(all_routes[0].1.len(), 1);
-    }
-
-    #[test]
-    fn test_serialization_roundtrip() {
-        let tree = build_small_tree();
-        let root = tree.root();
-        let children: Vec<NodeId> = tree.children_iter(root).collect();
-
-        let mut index = NavigationIndex::new();
-        index.add_entry(
-            root,
-            NavEntry {
-                overview: "Root overview".to_string(),
-                question_hints: vec!["What is this?".to_string()],
-                topic_tags: vec!["intro".to_string(), "guide".to_string()],
-                leaf_count: 2,
-                level: 0,
-            },
-        );
-        index.add_child_routes(
-            root,
-            vec![
-                ChildRoute {
-                    node_id: children[0],
-                    title: "Child1".to_string(),
-                    description: "First child desc".to_string(),
-                    leaf_count: 1,
-                },
-                ChildRoute {
-                    node_id: children[1],
-                    title: "Child2".to_string(),
-                    description: "Second child desc".to_string(),
-                    leaf_count: 1,
-                },
-            ],
-        );
-
-        // Serialize
-        let json = serde_json::to_string(&index).expect("serialization failed");
-
-        // Deserialize
-        let deserialized: NavigationIndex =
-            serde_json::from_str(&json).expect("deserialization failed");
-
-        // Verify data survived round-trip
-        assert_eq!(deserialized.entry_count(), 1);
-        
assert_eq!(deserialized.total_child_routes(), 2);
-
-        let entry = deserialized.get_entry(root).unwrap();
-        assert_eq!(entry.overview, "Root overview");
-        assert_eq!(entry.question_hints.len(), 1);
-        assert_eq!(entry.topic_tags.len(), 2);
-        assert_eq!(entry.leaf_count, 2);
-        assert_eq!(entry.level, 0);
-
-        let routes = deserialized.get_child_routes(root).unwrap();
-        assert_eq!(routes[0].title, "Child1");
-        assert_eq!(routes[1].title, "Child2");
-    }
-
-    #[test]
-    fn test_doc_card_default_none() {
-        let index = NavigationIndex::new();
-        assert!(index.doc_card().is_none());
-    }
-
-    #[test]
-    fn test_doc_card_set_and_get() {
-        let card = DocCard {
-            title: "Test Doc".to_string(),
-            overview: "A test document".to_string(),
-            question_hints: vec!["What?".to_string()],
-            topic_tags: vec!["test".to_string()],
-            sections: vec![SectionCard {
-                title: "Section 1".to_string(),
-                description: "First section".to_string(),
-                leaf_count: 5,
-            }],
-            total_leaves: 5,
-        };
-
-        let mut index = NavigationIndex::new();
-        index.set_doc_card(card);
-
-        let retrieved = index.doc_card().unwrap();
-        assert_eq!(retrieved.title, "Test Doc");
-        assert_eq!(retrieved.overview, "A test document");
-        assert_eq!(retrieved.question_hints.len(), 1);
-        assert_eq!(retrieved.topic_tags.len(), 1);
-        assert_eq!(retrieved.sections.len(), 1);
-        assert_eq!(retrieved.sections[0].title, "Section 1");
-        assert_eq!(retrieved.sections[0].leaf_count, 5);
-        assert_eq!(retrieved.total_leaves, 5);
-    }
-
-    #[test]
-    fn test_doc_card_serialization_roundtrip() {
-        let tree = build_small_tree();
-        let root = tree.root();
-        let children: Vec<NodeId> = tree.children_iter(root).collect();
-
-        let mut index = NavigationIndex::new();
-        index.add_entry(
-            root,
-            NavEntry {
-                overview: "Root overview".to_string(),
-                question_hints: vec!["What is this?".to_string()],
-                topic_tags: vec!["intro".to_string()],
-                leaf_count: 2,
-                level: 0,
-            },
-        );
-        index.add_child_routes(
-            root,
-            vec![
-                ChildRoute {
-                    node_id: children[0],
-                    title: "Child1".to_string(),
-                    description: "First".to_string(),
-                    leaf_count: 1,
-                },
-                ChildRoute {
-                    node_id: children[1],
-                    title: "Child2".to_string(),
-                    description: "Second".to_string(),
-                    leaf_count: 1,
-                },
-            ],
-        );
-
-        // Build DocCard from index data (clone the entry first so the
-        // immutable borrow of `index` ends before `set_doc_card`)
-        let root_entry = index.get_entry(root).unwrap().clone();
-        let sections: Vec<SectionCard> = index
-            .get_child_routes(root)
-            .unwrap()
-            .iter()
-            .map(|r| SectionCard {
-                title: r.title.clone(),
-                description: r.description.clone(),
-                leaf_count: r.leaf_count,
-            })
-            .collect();
-        index.set_doc_card(DocCard {
-            title: "Root".to_string(),
-            overview: root_entry.overview.clone(),
-            question_hints: root_entry.question_hints.clone(),
-            topic_tags: root_entry.topic_tags.clone(),
-            sections,
-            total_leaves: root_entry.leaf_count,
-        });
-
-        // Serialize + deserialize
-        let json = serde_json::to_string(&index).expect("serialization failed");
-        let deserialized: NavigationIndex =
-            serde_json::from_str(&json).expect("deserialization failed");
-
-        // Verify DocCard survived round-trip
-        let card = deserialized.doc_card().unwrap();
-        assert_eq!(card.title, "Root");
-        assert_eq!(card.overview, "Root overview");
-        assert_eq!(card.question_hints, vec!["What is this?"]);
-        assert_eq!(card.topic_tags, vec!["intro"]);
-        assert_eq!(card.sections.len(), 2);
-        assert_eq!(card.sections[0].title, "Child1");
-        assert_eq!(card.sections[1].leaf_count, 1);
-        assert_eq!(card.total_leaves, 2);
-    }
-
-    #[test]
-    fn test_doc_card_backward_compat_deserialize_without_card() {
-        // Simulate JSON from an older version that doesn't have 
doc_card
-        let tree = build_small_tree();
-        let root = tree.root();
-
-        let mut index = NavigationIndex::new();
-        index.add_entry(
-            root,
-            NavEntry {
-                overview: "Old index".to_string(),
-                question_hints: vec![],
-                topic_tags: vec![],
-                leaf_count: 2,
-                level: 0,
-            },
-        );
-        // No doc_card set
-
-        let json = serde_json::to_string(&index).expect("serialization failed");
-        let deserialized: NavigationIndex =
-            serde_json::from_str(&json).expect("deserialization failed");
-
-        assert!(deserialized.doc_card().is_none());
-        assert_eq!(deserialized.entry_count(), 1);
-    }
-
-    #[test]
-    fn test_section_card_fields() {
-        let card = SectionCard {
-            title: "Getting Started".to_string(),
-            description: "Quick setup guide".to_string(),
-            leaf_count: 3,
-        };
-        assert_eq!(card.title, "Getting Started");
-        assert_eq!(card.description, "Quick setup guide");
-        assert_eq!(card.leaf_count, 3);
-    }
-}
diff --git a/vectorless-core/vectorless/src/document/node.rs b/vectorless-core/vectorless/src/document/node.rs
deleted file mode 100644
index c0a6ffe6..00000000
--- a/vectorless-core/vectorless/src/document/node.rs
+++ /dev/null
@@ -1,144 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Tree node definition using indextree (Arena-based).
-//!
-//! This module provides a node type for hierarchical document representation.
-//! Each branch represents a section and each leaf contains the actual text.
-
-use indextree::NodeId as IndexTreeNodeId;
-use serde::{Deserialize, Serialize};
-use std::fmt;
-
-use super::reference::NodeReference;
-
-/// Unique identifier for a node in the document tree.
-///
-/// This is a newtype wrapper around indextree's NodeId to provide
-/// better type safety and domain-specific semantics.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub struct NodeId(pub IndexTreeNodeId);
-
-// Implement traits for interoperability
-impl fmt::Display for NodeId {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "NodeId({:?})", self.0)
-    }
-}
-
-impl Serialize for NodeId {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        self.0.serialize(serializer)
-    }
-}
-
-impl<'de> Deserialize<'de> for NodeId {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        let id = IndexTreeNodeId::deserialize(deserializer)?;
-        Ok(NodeId(id))
-    }
-}
-
-/// A node in the Vectorless document tree.
-///
-/// Each branch represents a section and each leaf contains the actual text.
-/// When a question is asked, an LLM navigates this tree level by level
-/// to find the right answer.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct TreeNode {
-    /// Title of this section.
-    pub title: String,
-
-    /// Hierarchical structure index (e.g., "1", "1.1", "1.2.3").
-    ///
-    /// This provides a human-readable path to the node and is useful for:
-    /// - LLM navigation (easier to understand "go to section 2.1.3")
-    /// - Table of contents display
-    /// - Cross-referencing
-    #[serde(default)]
-    pub structure: String,
-
-    /// Raw text content (populated at leaves).
-    #[serde(default)]
-    pub content: String,
-
-    /// LLM-generated summary.
-    #[serde(default)]
-    pub summary: String,
-
-    /// Depth in tree (0 = root, 1 = section, 2 = subsection, etc.).
-    #[serde(default)]
-    pub depth: usize,
-
-    /// Starting line number (1-based).
-    #[serde(default)]
-    pub start_index: usize,
-
-    /// Ending line number (1-based).
-    #[serde(default)]
-    pub end_index: usize,
-
-    /// Starting page number (1-based, if applicable).
-    pub start_page: Option<usize>,
-
-    /// Ending page number (1-based, if applicable).
-    pub end_page: Option<usize>,
-
-    /// Unique node identifier (e.g., "0001", "0002").
-    pub node_id: Option<String>,
-
-    /// Physical index marker for line tracking.
-    pub physical_index: Option<usize>,
-
-    /// Token count estimate.
-    pub token_count: Option<usize>,
-
-    /// References found in this node's content.
-    ///
-    /// These are in-document references like "see Appendix G" or
-    /// "refer to Table 5.3" that can be followed during retrieval.
-    #[serde(default)]
-    pub references: Vec<NodeReference>,
-
-    /// Routing keywords for navigation (non-leaf nodes).
-    ///
-    /// Populated by EnhanceStage with LLM-extracted topic tags.
-    /// Used by NavigationIndexStage to populate `NavEntry::topic_tags`.
-    #[serde(default)]
-    pub routing_keywords: Vec<String>,
-
-    /// Typical questions this subtree can answer (non-leaf nodes).
-    ///
-    /// Populated by EnhanceStage with LLM-extracted question hints.
-    /// Used by NavigationIndexStage to populate `NavEntry::question_hints`.
-    #[serde(default)]
-    pub question_hints: Vec<String>,
-}
-
-impl Default for TreeNode {
-    fn default() -> Self {
-        Self {
-            title: String::new(),
-            structure: String::new(),
-            content: String::new(),
-            summary: String::new(),
-            depth: 0,
-            start_index: 1,
-            end_index: 1,
-            start_page: None,
-            end_page: None,
-            node_id: None,
-            physical_index: None,
-            token_count: None,
-            references: Vec::new(),
-            routing_keywords: Vec::new(),
-            question_hints: Vec::new(),
-        }
-    }
-}
diff --git a/vectorless-core/vectorless/src/document/reasoning.rs b/vectorless-core/vectorless/src/document/reasoning.rs
deleted file mode 100644
index 2c4ab01b..00000000
--- a/vectorless-core/vectorless/src/document/reasoning.rs
+++ /dev/null
@@ -1,444 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Pre-computed reasoning index for fast retrieval path resolution.
-//!
-//! Built at index time from TOC and summaries, the reasoning index provides
-//! topic-to-path mappings, summary shortcuts, and hot node tracking that
-//! accelerate query-time retrieval by bypassing expensive tree traversal.
-
-use std::collections::HashMap;
-
-use serde::{Deserialize, Serialize};
-
-use super::node::NodeId;
-
-/// A pre-computed reasoning index that maps topics and query patterns
-/// to optimal tree paths, built at index time for query-time acceleration.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ReasoningIndex {
-    /// Keyword → list of (NodeId, weight) entries.
-    /// Built from titles and summaries at index time.
-    /// Key = lowercased keyword token.
-    topic_paths: HashMap<String, Vec<TopicEntry>>,
-
-    /// Pre-computed shortcut for "document summary" queries.
-    /// Maps summary-type query patterns directly to the root node
-    /// and its top-level children summaries.
-    summary_shortcut: Option<SummaryShortcut>,
-
-    /// Nodes marked as hot (frequently retrieved).
-    /// NodeId → cumulative hit count and rolling average score.
-    /// Serialized via `serde_helpers` because serde_json cannot deserialize
-    /// `HashMap<NodeId, _>` (integer keys are incompatible with JSON).
-    #[serde(with = "super::serde_helpers")]
-    hot_nodes: HashMap<NodeId, HotNodeEntry>,
-
-    /// Depth-1 section title → NodeId mapping for fast ToC lookup.
-    section_map: HashMap<String, NodeId>,
-
-    /// Configuration used to build this index (for cache invalidation).
-    #[serde(default)]
-    config_hash: u64,
-}
-
-impl ReasoningIndex {
-    /// Create a new empty reasoning index.
-    pub fn new() -> Self {
-        Self {
-            topic_paths: HashMap::new(),
-            summary_shortcut: None,
-            hot_nodes: HashMap::new(),
-            section_map: HashMap::new(),
-            config_hash: 0,
-        }
-    }
-
-    /// Create a builder for constructing the reasoning index.
-    pub fn builder() -> ReasoningIndexBuilder {
-        ReasoningIndexBuilder::new()
-    }
-
-    /// Look up topic entries for a keyword.
-    pub fn topic_entries(&self, keyword: &str) -> Option<&[TopicEntry]> {
-        self.topic_paths.get(keyword).map(Vec::as_slice)
-    }
-
-    /// Get the summary shortcut, if available.
-    pub fn summary_shortcut(&self) -> Option<&SummaryShortcut> {
-        self.summary_shortcut.as_ref()
-    }
-
-    /// Check if a node is marked as hot.
-    pub fn is_hot(&self, node_id: NodeId) -> bool {
-        self.hot_nodes
-            .get(&node_id)
-            .map(|e| e.is_hot)
-            .unwrap_or(false)
-    }
-
-    /// Get the hot node entry for a node.
-    pub fn hot_entry(&self, node_id: NodeId) -> Option<&HotNodeEntry> {
-        self.hot_nodes.get(&node_id)
-    }
-
-    /// Look up a section by its title.
-    pub fn find_section(&self, title: &str) -> Option<NodeId> {
-        self.section_map.get(&title.to_lowercase()).copied()
-    }
-
-    /// Iterate over all keyword → topic entries (for graph building).
-    pub fn all_topic_entries(&self) -> impl Iterator<Item = (&String, &[TopicEntry])> {
-        self.topic_paths.iter().map(|(k, v)| (k, v.as_slice()))
-    }
-
-    /// Get the number of topic keywords indexed.
-    pub fn topic_count(&self) -> usize {
-        self.topic_paths.len()
-    }
-
-    /// Get the number of sections in the section map.
-    pub fn section_count(&self) -> usize {
-        self.section_map.len()
-    }
-
-    /// Get the number of hot nodes.
-    pub fn hot_node_count(&self) -> usize {
-        self.hot_nodes.iter().filter(|(_, e)| e.is_hot).count()
-    }
-
-    /// Update hot node tracking from retrieval results.
-    pub fn update_hot_nodes(&mut self, hits: &[(NodeId, f32)], hot_threshold: u32) {
-        for &(node_id, score) in hits {
-            let entry = self.hot_nodes.entry(node_id).or_insert(HotNodeEntry {
-                hit_count: 0,
-                avg_score: 0.0,
-                is_hot: false,
-            });
-            entry.hit_count += 1;
-            entry.avg_score += (score - entry.avg_score) / entry.hit_count as f32;
-            if entry.hit_count >= hot_threshold {
-                entry.is_hot = true;
-            }
-        }
-    }
-}
-
-impl Default for ReasoningIndex {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-/// Builder for constructing a `ReasoningIndex`.
-pub struct ReasoningIndexBuilder {
-    topic_paths: HashMap<String, Vec<TopicEntry>>,
-    summary_shortcut: Option<SummaryShortcut>,
-    hot_nodes: HashMap<NodeId, HotNodeEntry>,
-    section_map: HashMap<String, NodeId>,
-    config_hash: u64,
-}
-
-impl ReasoningIndexBuilder {
-    /// Create a new builder.
-    pub fn new() -> Self {
-        Self {
-            topic_paths: HashMap::new(),
-            summary_shortcut: None,
-            hot_nodes: HashMap::new(),
-            section_map: HashMap::new(),
-            config_hash: 0,
-        }
-    }
-
-    /// Add a topic entry for a keyword.
-    pub fn add_topic_entry(&mut self, keyword: impl Into<String>, entry: TopicEntry) {
-        self.topic_paths
-            .entry(keyword.into())
-            .or_default()
-            .push(entry);
-    }
-
-    /// Set the summary shortcut.
-    pub fn summary_shortcut(mut self, shortcut: SummaryShortcut) -> Self {
-        self.summary_shortcut = Some(shortcut);
-        self
-    }
-
-    /// Add a section mapping.
-    pub fn add_section(&mut self, title: impl Into<String>, node_id: NodeId) {
-        self.section_map
-            .insert(title.into().to_lowercase(), node_id);
-    }
-
-    /// Set the config hash for cache invalidation.
-    pub fn config_hash(mut self, hash: u64) -> Self {
-        self.config_hash = hash;
-        self
-    }
-
-    /// Sort topic entries by weight (descending) and trim per-keyword lists.
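-    ///
-    /// Non-comparable weights (NaN) compare as equal, so their relative
-    /// order is unspecified.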
-    pub fn sort_and_trim(&mut self, max_entries: usize) {
-        for entries in self.topic_paths.values_mut() {
-            entries.sort_by(|a, b| {
-                b.weight
-                    .partial_cmp(&a.weight)
-                    .unwrap_or(std::cmp::Ordering::Equal)
-            });
-            entries.truncate(max_entries);
-        }
-    }
-
-    /// Build the reasoning index.
-    pub fn build(self) -> ReasoningIndex {
-        ReasoningIndex {
-            topic_paths: self.topic_paths,
-            summary_shortcut: self.summary_shortcut,
-            hot_nodes: self.hot_nodes,
-            section_map: self.section_map,
-            config_hash: self.config_hash,
-        }
-    }
-}
-
-impl Default for ReasoningIndexBuilder {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-/// A topic entry mapping a keyword to a node with a weight.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct TopicEntry {
-    /// The target node.
-    pub node_id: NodeId,
-    /// Weight indicating how relevant this keyword is to this node (0.0 - 1.0).
-    pub weight: f32,
-    /// Depth of the node in the tree (for tie-breaking).
-    pub depth: usize,
-}
-
-/// Pre-computed shortcut for summary-style queries.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct SummaryShortcut {
-    /// The root node ID (direct answer for "what is this about" queries).
-    pub root_node: NodeId,
-    /// Pre-collected summaries of top-level sections.
-    pub section_summaries: Vec<SectionSummary>,
-    /// Combined summary text for direct return.
-    pub document_summary: String,
-}
-
-/// A pre-collected section summary for quick access.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct SectionSummary {
-    /// Section node ID.
-    pub node_id: NodeId,
-    /// Section title.
-    pub title: String,
-    /// Section summary (pre-computed by EnhanceStage).
-    pub summary: String,
-    /// Depth of the section.
-    pub depth: usize,
-}
-
-/// Entry tracking how often a node is retrieved.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct HotNodeEntry {
-    /// Number of times this node appeared in retrieval results.
-    pub hit_count: u32,
-    /// Rolling average score when retrieved.
-    pub avg_score: f32,
-    /// Whether this node is currently marked as "hot"
-    /// (hit_count meets or exceeds the configured threshold).
-    pub is_hot: bool,
-}
-
-/// Configuration for building and using the reasoning index.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ReasoningIndexConfig {
-    /// Whether reasoning index building is enabled.
-    pub enabled: bool,
-    /// Minimum hit count for a node to be considered "hot".
-    pub hot_node_threshold: u32,
-    /// Maximum number of topic entries per keyword.
-    pub max_topic_entries: usize,
-    /// Maximum number of keyword-to-node mappings to keep.
-    pub max_keyword_entries: usize,
-    /// Minimum keyword length to index.
-    pub min_keyword_length: usize,
-    /// Whether to build the summary shortcut.
-    pub build_summary_shortcut: bool,
-    /// Whether to expand keywords with LLM-generated synonyms.
-    /// When enabled, the indexing stage calls the LLM to generate
-    /// synonym terms for each keyword, improving recall for queries
-    /// that use different wording than the document.
-    pub enable_synonym_expansion: bool,
-}
-
-impl Default for ReasoningIndexConfig {
-    fn default() -> Self {
-        Self {
-            enabled: true,
-            hot_node_threshold: 3,
-            max_topic_entries: 20,
-            max_keyword_entries: 5000,
-            min_keyword_length: 2,
-            build_summary_shortcut: true,
-            enable_synonym_expansion: true,
-        }
-    }
-}
-
-impl ReasoningIndexConfig {
-    /// Create a new config with defaults.
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Create a disabled config.
- pub fn disabled() -> Self { - Self { - enabled: false, - ..Self::default() - } - } - - /// Set the hot node threshold. - pub fn with_hot_threshold(mut self, threshold: u32) -> Self { - self.hot_node_threshold = threshold; - self - } - - /// Set whether to build the summary shortcut. - pub fn with_summary_shortcut(mut self, build: bool) -> Self { - self.build_summary_shortcut = build; - self - } - - /// Enable or disable synonym expansion. - pub fn with_synonym_expansion(mut self, enable: bool) -> Self { - self.enable_synonym_expansion = enable; - self - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_reasoning_index_default() { - let index = ReasoningIndex::default(); - assert_eq!(index.topic_count(), 0); - assert_eq!(index.section_count(), 0); - assert_eq!(index.hot_node_count(), 0); - assert!(index.summary_shortcut().is_none()); - } - - #[test] - fn test_builder_basic() { - // Create a simple tree to get valid NodeIds - let mut tree = crate::document::DocumentTree::new("Root", "root content"); - let child1 = tree.add_child(tree.root(), "Introduction", "intro content"); - let child2 = tree.add_child(tree.root(), "Methods", "methods content"); - - let mut builder = ReasoningIndexBuilder::new(); - builder.add_section("Introduction", child1); - builder.add_section("Methods", child2); - - let index = builder.build(); - assert_eq!(index.section_count(), 2); - assert!(index.find_section("introduction").is_some()); - assert!(index.find_section("INTRODUCTION").is_some()); - assert!(index.find_section("methods").is_some()); - } - - #[test] - fn test_config_default() { - let config = ReasoningIndexConfig::default(); - assert!(config.enabled); - assert_eq!(config.hot_node_threshold, 3); - assert!(config.build_summary_shortcut); - } - - #[test] - fn test_config_disabled() { - let config = ReasoningIndexConfig::disabled(); - assert!(!config.enabled); - } - - #[test] - fn test_serialization_roundtrip_empty() { - let mut tree = crate::document::DocumentTree::new("Root", "content"); - let child = tree.add_child(tree.root(), "Section 1", "s1 content"); - - let mut builder = ReasoningIndexBuilder::new(); - builder.add_section("Section 1", child); - builder.add_topic_entry( - "section", - TopicEntry { - node_id: child, - weight: 0.8, - depth: 1, - }, - ); - let index = builder.build(); - - let json = serde_json::to_string(&index).unwrap(); - let restored: ReasoningIndex = serde_json::from_str(&json).unwrap(); - - assert_eq!(restored.topic_count(), 1); - assert_eq!(restored.section_count(), 1); - assert_eq!(restored.hot_node_count(), 0); - } - - #[test] - fn test_serialization_roundtrip_with_hot_nodes() { - let mut tree = crate::document::DocumentTree::new("Root", ""); - let root = tree.root(); - let c1 = tree.add_child(root, "S1", "content 1"); - let c2 = tree.add_child(root, "S2", "content 2"); - - let mut index = ReasoningIndex::new(); - index.update_hot_nodes(&[(c1, 0.9), (c2, 0.7), (c1, 0.8)], 2); - - // c1 should be hot (2 hits >= threshold 2) - assert!(index.is_hot(c1)); - // c2 should not be hot (1 hit < threshold 2) - assert!(!index.is_hot(c2)); - - let json = serde_json::to_string(&index).unwrap(); - - // hot_nodes should serialize as array of pairs, not as object - assert!(!json.contains("\"hot_nodes\":{}")); - assert!(json.contains("\"hot_nodes\":[")); - - let restored: ReasoningIndex = serde_json::from_str(&json).unwrap(); - assert!(restored.is_hot(c1)); - assert!(!restored.is_hot(c2)); - - let entry = restored.hot_entry(c1).unwrap(); - assert_eq!(entry.hit_count, 2); - 
assert!(entry.avg_score > 0.0);
-    }
-
-    #[test]
-    fn test_backward_compat_hot_nodes_empty_object() {
-        // Simulate old JSON where hot_nodes was serialized as {} by derive.
-        let mut tree = crate::document::DocumentTree::new("Root", "");
-        let child = tree.add_child(tree.root(), "S1", "c");
-
-        let mut builder = ReasoningIndexBuilder::new();
-        builder.add_section("s1", child);
-        let index = builder.build();
-
-        // Serialize normally (produces "hot_nodes":[]), then replace with
-        // the old format to test backward compat
-        let json = serde_json::to_string(&index).unwrap();
-        let old_json = json.replace("\"hot_nodes\":[]", "\"hot_nodes\":{}");
-
-        let restored: ReasoningIndex = serde_json::from_str(&old_json).unwrap();
-        assert_eq!(restored.hot_node_count(), 0);
-    }
-}
diff --git a/vectorless-core/vectorless/src/document/reference.rs b/vectorless-core/vectorless/src/document/reference.rs
deleted file mode 100644
index 10d08e42..00000000
--- a/vectorless-core/vectorless/src/document/reference.rs
+++ /dev/null
@@ -1,559 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! In-document reference types and extraction.
-//!
-//! This module provides support for parsing and following references
-//! within documents, such as "see Appendix G" or "refer to Table 5.3".
-//!
-//! # Example
-//!
-//! ```ignore
-//! use vectorless::document::{NodeReference, RefType, ReferenceExtractor};
-//!
-//! let content = "For more details, see Section 2.1 and Appendix G.";
-//! let refs = ReferenceExtractor::extract(content);
-//!
-//! for r#ref in refs {
-//!     println!("Found {:?}: {}", r#ref.ref_type, r#ref.ref_text);
-//! }
-//! ```
-
-use regex::Regex;
-use serde::{Deserialize, Serialize};
-use std::sync::LazyLock;
-
-use super::NodeId;
-
-/// Type of in-document reference.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
-pub enum RefType {
-    /// Reference to a section (e.g., "Section 2.1", "Chapter 3").
-    Section,
-    /// Reference to an appendix (e.g., "Appendix A", "Appendix G").
-    Appendix,
-    /// Reference to a table (e.g., "Table 5.3", "Table 1").
-    Table,
-    /// Reference to a figure (e.g., "Figure 2.1", "Fig. 3").
-    Figure,
-    /// Reference to a page (e.g., "page 42", "p. 15").
-    Page,
-    /// Reference to an equation (e.g., "Equation 1", "Eq. 2.3").
-    Equation,
-    /// Reference to a footnote (e.g., "footnote 1").
-    Footnote,
-    /// Reference to a listing/code block.
-    Listing,
-    /// Unknown reference type.
-    Unknown,
-}
-
-impl std::fmt::Display for RefType {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            RefType::Section => write!(f, "Section"),
-            RefType::Appendix => write!(f, "Appendix"),
-            RefType::Table => write!(f, "Table"),
-            RefType::Figure => write!(f, "Figure"),
-            RefType::Page => write!(f, "Page"),
-            RefType::Equation => write!(f, "Equation"),
-            RefType::Footnote => write!(f, "Footnote"),
-            RefType::Listing => write!(f, "Listing"),
-            RefType::Unknown => write!(f, "Reference"),
-        }
-    }
-}
-
-/// A reference found within document content.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct NodeReference {
-    /// The original reference text (e.g., "see Appendix G").
-    pub ref_text: String,
-    /// The target identifier extracted from the reference (e.g., "G", "5.3").
-    pub target_id: String,
-    /// Type of the reference.
-    pub ref_type: RefType,
-    /// Resolved target node ID (if found in the tree).
-    pub target_node: Option<NodeId>,
-    /// Confidence score for the resolution (0.0 - 1.0).
-    pub confidence: f32,
-    /// Position in the original text (character offset).
-    pub position: usize,
-}
-
-impl NodeReference {
-    /// Create a new unresolved reference.
-    pub fn new(ref_text: String, target_id: String, ref_type: RefType, position: usize) -> Self {
-        Self {
-            ref_text,
-            target_id,
-            ref_type,
-            target_node: None,
-            confidence: 0.0,
-            position,
-        }
-    }
-
-    /// Create a resolved reference with a target node.
-    pub fn resolved(
-        ref_text: String,
-        target_id: String,
-        ref_type: RefType,
-        position: usize,
-        target_node: NodeId,
-        confidence: f32,
-    ) -> Self {
-        Self {
-            ref_text,
-            target_id,
-            ref_type,
-            target_node: Some(target_node),
-            confidence,
-            position,
-        }
-    }
-
-    /// Check if this reference has been resolved.
-    pub fn is_resolved(&self) -> bool {
-        self.target_node.is_some()
-    }
-}
-
-/// Reference extraction patterns.
-static SECTION_PATTERNS: LazyLock<Vec<(Regex, RefType)>> = LazyLock::new(|| {
-    vec![
-        // Section references: "Section 2.1", "section 2.1.3", "Sec. 2.1"
-        (
-            Regex::new(r"(?i)(?:see\s+)?(?:section|sec\.?)\s+([\d.]+)").unwrap(),
-            RefType::Section,
-        ),
-        // Chapter references: "Chapter 3", "Ch. 2"
-        (
-            Regex::new(r"(?i)(?:see\s+)?(?:chapter|ch\.?)\s+(\d+)").unwrap(),
-            RefType::Section,
-        ),
-    ]
-});
-
-static APPENDIX_PATTERNS: LazyLock<Vec<(Regex, RefType)>> = LazyLock::new(|| {
-    vec![
-        // Appendix references: "Appendix A", "appendix G", "App. B"
-        (
-            Regex::new(r"(?i)(?:see\s+)?(?:appendix|app\.?)\s+([A-Z]|[a-z])").unwrap(),
-            RefType::Appendix,
-        ),
-    ]
-});
-
-static TABLE_PATTERNS: LazyLock<Vec<(Regex, RefType)>> = LazyLock::new(|| {
-    vec![
-        // Table references: "Table 5.3", "table 1", "Tbl. 2.1"
-        (
-            Regex::new(r"(?i)(?:see\s+)?(?:table|tbl\.?)\s+([\d.]+)").unwrap(),
-            RefType::Table,
-        ),
-    ]
-});
-
-static FIGURE_PATTERNS: LazyLock<Vec<(Regex, RefType)>> = LazyLock::new(|| {
-    vec![
-        // Figure references: "Figure 2.1", "fig. 3", "Fig 1.2"
-        (
-            Regex::new(r"(?i)(?:see\s+)?(?:figure|fig\.?)\s+([\d.]+)").unwrap(),
-            RefType::Figure,
-        ),
-    ]
-});
-
-static PAGE_PATTERNS: LazyLock<Vec<(Regex, RefType)>> = LazyLock::new(|| {
-    vec![
-        // Page references: "page 42", "p. 15", "pp. 20-25"
-        (
-            Regex::new(r"(?i)(?:see\s+)?(?:page|p\.?)\s+(\d+)").unwrap(),
-            RefType::Page,
-        ),
-    ]
-});
-
-static EQUATION_PATTERNS: LazyLock<Vec<(Regex, RefType)>> = LazyLock::new(|| {
-    vec![
-        // Equation references: "Equation 1", "Eq. 2.3"
-        (
-            Regex::new(r"(?i)(?:see\s+)?(?:equation|eq\.?)\s+([\d.]+)").unwrap(),
-            RefType::Equation,
-        ),
-    ]
-});
-
-/// Reference extractor for parsing in-document references.
-///
-/// # Example
-///
-/// ```ignore
-/// let content = "For details, see Section 2.1 and Appendix G.";
-/// let refs = ReferenceExtractor::extract(content);
-/// assert_eq!(refs.len(), 2);
-/// ```
-pub struct ReferenceExtractor;
-
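Each table pairs a compiled `Regex` with the `RefType` it produces, and scanning is just iterating captures. A self-contained sketch of how one table entry behaves, using only the `regex` crate (the greedy `[\d.]+` capture is why the tests further down tolerate a trailing period):

```rust
use regex::Regex;

fn main() {
    // Same shape as the first SECTION_PATTERNS entry above: optional "see",
    // a keyword alternation, then a captured dotted number.
    let section = Regex::new(r"(?i)(?:see\s+)?(?:section|sec\.?)\s+([\d.]+)").unwrap();

    let text = "See Section 2.1; details in sec. 3.4.2.";
    for cap in section.captures_iter(text) {
        // cap[0] is the full reference text, cap[1] the target id.
        println!("matched {:?} -> target {:?}", &cap[0], &cap[1]);
    }
    // Prints:
    //   matched "See Section 2.1" -> target "2.1"
    //   matched "sec. 3.4.2." -> target "3.4.2."   (trailing "." included)
}
```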
-impl ReferenceExtractor {
-    /// Extract all references from text content.
-    pub fn extract(text: &str) -> Vec<NodeReference> {
-        let mut references = Vec::new();
-
-        // Extract section references
-        for (regex, ref_type) in SECTION_PATTERNS.iter() {
-            for cap in regex.captures_iter(text) {
-                if let (Some(full_match), Some(target)) = (cap.get(0), cap.get(1)) {
-                    references.push(NodeReference::new(
-                        full_match.as_str().to_string(),
-                        target.as_str().to_string(),
-                        *ref_type,
-                        full_match.start(),
-                    ));
-                }
-            }
-        }
-
-        // Extract appendix references
-        for (regex, ref_type) in APPENDIX_PATTERNS.iter() {
-            for cap in regex.captures_iter(text) {
-                if let (Some(full_match), Some(target)) = (cap.get(0), cap.get(1)) {
-                    references.push(NodeReference::new(
-                        full_match.as_str().to_string(),
-                        target.as_str().to_uppercase(), // Normalize to uppercase
-                        *ref_type,
-                        full_match.start(),
-                    ));
-                }
-            }
-        }
-
-        // Extract table references
-        for (regex, ref_type) in TABLE_PATTERNS.iter() {
-            for cap in regex.captures_iter(text) {
-                if let (Some(full_match), Some(target)) = (cap.get(0), cap.get(1)) {
-                    references.push(NodeReference::new(
-                        full_match.as_str().to_string(),
-                        target.as_str().to_string(),
-                        *ref_type,
-                        full_match.start(),
-                    ));
-                }
-            }
-        }
-
-        // Extract figure references
-        for (regex, ref_type) in FIGURE_PATTERNS.iter() {
-            for cap in regex.captures_iter(text) {
-                if let (Some(full_match), Some(target)) = (cap.get(0), cap.get(1)) {
-                    references.push(NodeReference::new(
-                        full_match.as_str().to_string(),
-                        target.as_str().to_string(),
-                        *ref_type,
-                        full_match.start(),
-                    ));
-                }
-            }
-        }
-
-        // Extract page references
-        for (regex, ref_type) in PAGE_PATTERNS.iter() {
-            for cap in regex.captures_iter(text) {
-                if let (Some(full_match), Some(target)) = (cap.get(0), cap.get(1)) {
-                    references.push(NodeReference::new(
-                        full_match.as_str().to_string(),
-                        target.as_str().to_string(),
-                        *ref_type,
-                        full_match.start(),
-                    ));
-                }
-            }
-        }
-
-        // Extract equation references
-        for (regex, ref_type) in EQUATION_PATTERNS.iter() {
-            for cap in regex.captures_iter(text) {
-                if let (Some(full_match), Some(target)) = (cap.get(0), cap.get(1)) {
-                    references.push(NodeReference::new(
-                        full_match.as_str().to_string(),
-                        target.as_str().to_string(),
-                        *ref_type,
-                        full_match.start(),
-                    ));
-                }
-            }
-        }
-
-        // Sort by position and remove duplicates
-        references.sort_by_key(|r| r.position);
-        references.dedup_by(|a, b| a.position == b.position);
-
-        references
-    }
-
-    /// Extract references and attempt to resolve them against a tree.
-    ///
-    /// Uses the tree's structure index and title matching to find targets.
-    pub fn extract_and_resolve(
-        text: &str,
-        tree: &super::DocumentTree,
-        index: &super::RetrievalIndex,
-    ) -> Vec<NodeReference> {
-        let mut references = Self::extract(text);
-
-        for ref_mut in &mut references {
-            ref_mut.target_node = Self::resolve_reference(ref_mut, tree, index);
-            if ref_mut.target_node.is_some() {
-                ref_mut.confidence = 0.8;
-            }
-        }
-
-        references
-    }
-
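Extraction and resolution compose as a two-step pipeline: the regex pass is pure, then resolution annotates each hit against a concrete tree. A sketch under the assumption that the `DocumentTree`/`build_retrieval_index` API from `tree.rs` below is in scope (module path assumed):

```rust
use vectorless::document::{DocumentTree, ReferenceExtractor};

fn main() {
    let mut tree = DocumentTree::new("Doc", "");
    let methods = tree.add_child(tree.root(), "Methods", "");
    tree.set_structure(methods, "2.1"); // give the node a structure index
    let index = tree.build_retrieval_index();

    let refs = ReferenceExtractor::extract_and_resolve("see Section 2.1", &tree, &index);
    assert_eq!(refs.len(), 1);
    // Resolution found structure "2.1", so the fixed confidence of 0.8 applies.
    assert_eq!(refs[0].target_node, Some(methods));
    assert!((refs[0].confidence - 0.8).abs() < f32::EPSILON);
}
```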
-    /// Resolve a reference to a node in the tree.
-    fn resolve_reference(
-        r#ref: &NodeReference,
-        tree: &super::DocumentTree,
-        index: &super::RetrievalIndex,
-    ) -> Option<NodeId> {
-        match r#ref.ref_type {
-            RefType::Section => {
-                // Try to find by structure index (e.g., "2.1" -> structure "2.1")
-                if let Some(node_id) = index.find_by_structure(&r#ref.target_id) {
-                    return Some(node_id);
-                }
-                // Try partial match (e.g., "2" might match "2.1" or "2.2")
-                for (structure, &node_id) in index.structures() {
-                    if structure.starts_with(&format!("{}.", r#ref.target_id))
-                        || structure.as_str() == r#ref.target_id
-                    {
-                        return Some(node_id);
-                    }
-                }
-                None
-            }
-            RefType::Appendix => {
-                // Search for nodes with "Appendix X" in title
-                for node_id in tree.traverse() {
-                    if let Some(node) = tree.get(node_id) {
-                        let title_lower = node.title.to_lowercase();
-                        if title_lower
-                            .starts_with(&format!("appendix {}", r#ref.target_id.to_lowercase()))
-                            || title_lower == format!("appendix {}", r#ref.target_id.to_lowercase())
-                        {
-                            return Some(node_id);
-                        }
-                    }
-                }
-                None
-            }
-            RefType::Table => {
-                // Search for nodes with "Table X" in title
-                for node_id in tree.traverse() {
-                    if let Some(node) = tree.get(node_id) {
-                        let title_lower = node.title.to_lowercase();
-                        if title_lower.contains(&format!("table {}", r#ref.target_id)) {
-                            return Some(node_id);
-                        }
-                    }
-                }
-                None
-            }
-            RefType::Figure => {
-                // Search for nodes with "Figure X" in title
-                for node_id in tree.traverse() {
-                    if let Some(node) = tree.get(node_id) {
-                        let title_lower = node.title.to_lowercase();
-                        if title_lower.contains(&format!("figure {}", r#ref.target_id))
-                            || title_lower.contains(&format!("fig {}", r#ref.target_id))
-                        {
-                            return Some(node_id);
-                        }
-                    }
-                }
-                None
-            }
-            RefType::Page => {
-                // Parse page number and find node
-                if let Ok(page) = r#ref.target_id.parse::<usize>() {
-                    return index.find_by_page(page);
-                }
-                None
-            }
-            _ => None,
-        }
-    }
-}
-
-/// Reference resolver for batch resolution.
-///
-/// Caches resolved references for efficient reuse.
-#[derive(Debug, Clone, Default)]
-pub struct ReferenceResolver {
-    /// Cache of resolved references by ref_text.
-    cache: std::collections::HashMap<String, Option<NodeId>>,
-}
-
-impl ReferenceResolver {
-    /// Create a new reference resolver.
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Resolve references in batch and cache results.
-    pub fn resolve_batch(
-        &mut self,
-        references: &[NodeReference],
-        tree: &super::DocumentTree,
-        index: &super::RetrievalIndex,
-    ) {
-        for r#ref in references {
-            if !self.cache.contains_key(&r#ref.ref_text) {
-                let resolved = ReferenceExtractor::resolve_reference(r#ref, tree, index);
-                self.cache.insert(r#ref.ref_text.clone(), resolved);
-            }
-        }
-    }
-
-    /// Get a cached resolution.
-    pub fn get(&self, ref_text: &str) -> Option<Option<NodeId>> {
-        self.cache.get(ref_text).copied()
-    }
-
-    /// Clear the cache.
- pub fn clear(&mut self) { - self.cache.clear(); - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_extract_section_references() { - let text = "For details, see Section 2.1 and Section 3.2.1."; - let refs = ReferenceExtractor::extract(text); - - // Debug: print what was extracted - for r in &refs { - eprintln!( - "Extracted: {:?} '{}' -> '{}'", - r.ref_type, r.ref_text, r.target_id - ); - } - - assert!( - refs.iter() - .any(|r| r.ref_type == RefType::Section && r.target_id == "2.1") - ); - // Note: The regex may not capture all multi-level section numbers correctly - // in a single pass, so we check for the presence of section references - assert!(refs.iter().any(|r| r.ref_type == RefType::Section)); - } - - #[test] - fn test_extract_appendix_references() { - let text = "See Appendix G for more information."; - let refs = ReferenceExtractor::extract(text); - - assert!( - refs.iter() - .any(|r| r.ref_type == RefType::Appendix && r.target_id == "G") - ); - } - - #[test] - fn test_extract_table_references() { - let text = "The data is shown in Table 5.3 and Table 1."; - let refs = ReferenceExtractor::extract(text); - - // Debug output - for r in &refs { - eprintln!( - "Extracted: {:?} '{}' -> '{}'", - r.ref_type, r.ref_text, r.target_id - ); - } - - assert!( - refs.iter() - .any(|r| r.ref_type == RefType::Table && r.target_id == "5.3") - ); - // The trailing period may be included, so check for either "1" or "1." - assert!( - refs.iter().any( - |r| r.ref_type == RefType::Table && (r.target_id == "1" || r.target_id == "1.") - ) - ); - } - - #[test] - fn test_extract_figure_references() { - let text = "As shown in Figure 2.1 and fig. 3."; - let refs = ReferenceExtractor::extract(text); - - // Debug output - for r in &refs { - eprintln!( - "Extracted: {:?} '{}' -> '{}'", - r.ref_type, r.ref_text, r.target_id - ); - } - - assert!( - refs.iter() - .any(|r| r.ref_type == RefType::Figure && r.target_id == "2.1") - ); - // The trailing period may be included, so check for either "3" or "3." 
-        assert!(
-            refs.iter()
-                .any(|r| r.ref_type == RefType::Figure
-                    && (r.target_id == "3" || r.target_id == "3."))
-        );
-    }
-
-    #[test]
-    fn test_extract_page_references() {
-        let text = "See page 42 for details.";
-        let refs = ReferenceExtractor::extract(text);
-
-        assert!(
-            refs.iter()
-                .any(|r| r.ref_type == RefType::Page && r.target_id == "42")
-        );
-    }
-
-    #[test]
-    fn test_extract_mixed_references() {
-        let text = "For details, see Section 2.1, Appendix G, and Table 5.3.";
-        let refs = ReferenceExtractor::extract(text);
-
-        assert_eq!(refs.len(), 3);
-        assert!(refs.iter().any(|r| r.ref_type == RefType::Section));
-        assert!(refs.iter().any(|r| r.ref_type == RefType::Appendix));
-        assert!(refs.iter().any(|r| r.ref_type == RefType::Table));
-    }
-
-    #[test]
-    fn test_ref_type_display() {
-        assert_eq!(format!("{}", RefType::Section), "Section");
-        assert_eq!(format!("{}", RefType::Appendix), "Appendix");
-        assert_eq!(format!("{}", RefType::Table), "Table");
-    }
-
-    #[test]
-    fn test_node_reference_is_resolved() {
-        let unresolved = NodeReference::new(
-            "Section 2.1".to_string(),
-            "2.1".to_string(),
-            RefType::Section,
-            0,
-        );
-        assert!(!unresolved.is_resolved());
-
-        // Can't easily test resolved() without a real NodeId
-    }
-}
diff --git a/vectorless-core/vectorless/src/document/serde_helpers.rs b/vectorless-core/vectorless/src/document/serde_helpers.rs
deleted file mode 100644
index cb658c35..00000000
--- a/vectorless-core/vectorless/src/document/serde_helpers.rs
+++ /dev/null
@@ -1,241 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Serde helpers for types that contain `HashMap<NodeId, V>`.
-//!
-//! JSON requires object keys to be strings, but `NodeId` (wrapping `indextree::NodeId`)
-//! serializes as an integer. When `serde_json` serializes a `HashMap<NodeId, V>`,
-//! it converts the integer key to a string (e.g., `42` → `"42"`), but on deserialization
-//! it cannot parse the string back to `NodeId` because the deserializer expects a number.
-//!
-//! This module provides a `#[serde(with = "node_id_map")]` adapter that serializes
-//! `HashMap<NodeId, V>` as a `Vec<(NodeId, V)>` instead, which is JSON-safe.
-//!
-//! # Usage
-//!
-//! ```rust,ignore
-//! use serde::{Serialize, Deserialize};
-//! use std::collections::HashMap;
-//! use crate::document::serde_helpers::node_id_map;
-//!
-//! #[derive(Serialize, Deserialize)]
-//! struct MyStruct {
-//!     #[serde(with = "node_id_map")]
-//!     entries: HashMap<NodeId, String>,
-//! }
-//! ```
-
-use std::collections::HashMap;
-
-use serde::de::DeserializeOwned;
-use serde::{Deserialize, Deserializer, Serialize, Serializer};
-
-use super::node::NodeId;
-
-/// Serialize `HashMap<NodeId, V>` as `Vec<(NodeId, V)>` (sorted by key for determinism).
-pub fn serialize<V, S>(map: &HashMap<NodeId, V>, serializer: S) -> Result<S::Ok, S::Error>
-where
-    V: Serialize,
-    S: Serializer,
-{
-    let mut pairs: Vec<_> = map.iter().map(|(k, v)| (*k, v)).collect();
-    pairs.sort_by_key(|(id, _)| usize::from(id.0));
-    pairs.serialize(serializer)
-}
-
-/// Deserialize `Vec<(NodeId, V)>` back into `HashMap<NodeId, V>`.
-///
-/// Also accepts `{}` (empty JSON object) for backward compatibility with
-/// data serialized before this helper was introduced, when `hot_nodes` etc.
-/// were empty and serialized as `{}`.
-pub fn deserialize<'de, V, D>(deserializer: D) -> Result<HashMap<NodeId, V>, D::Error>
-where
-    V: DeserializeOwned,
-    D: Deserializer<'de>,
-{
-    use serde::de;
-
-    // Try to deserialize as either a Vec of pairs or an empty object.
-    struct VecOrEmptyMap<V>(std::marker::PhantomData<V>);
-
-    impl<'de, V> de::Visitor<'de> for VecOrEmptyMap<V>
-    where
-        V: DeserializeOwned,
-    {
-        type Value = HashMap<NodeId, V>;
-
-        fn expecting(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            f.write_str("a list of (NodeId, value) pairs or an empty object")
-        }
-
-        fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
-        where
-            A: de::SeqAccess<'de>,
-        {
-            let pairs: Vec<(NodeId, V)> =
-                Deserialize::deserialize(de::value::SeqAccessDeserializer::new(seq))?;
-            Ok(pairs.into_iter().collect())
-        }
-
-        fn visit_map<A>(self, map: A) -> Result<Self::Value, A::Error>
-        where
-            A: de::MapAccess<'de>,
-        {
-            // Consume the map (should be empty for backward compat)
-            let _: de::value::MapAccessDeserializer<A> = de::value::MapAccessDeserializer::new(map);
-            Ok(HashMap::new())
-        }
-    }
-
-    deserializer.deserialize_any(VecOrEmptyMap(std::marker::PhantomData))
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::document::DocumentTree;
-
-    /// Wrapper struct to test `#[serde(with)]` through serde_json round-trip.
-    #[derive(Serialize, Deserialize, Debug)]
-    struct Wrap<V: Serialize + DeserializeOwned> {
-        #[serde(with = "super")]
-        map: HashMap<NodeId, V>,
-    }
-
-    #[test]
-    fn test_empty_map_roundtrip() {
-        let original = Wrap {
-            map: HashMap::<NodeId, String>::new(),
-        };
-        let json = serde_json::to_string(&original).unwrap();
-        assert!(json.contains("\"map\":[]"));
-
-        let restored: Wrap<String> = serde_json::from_str(&json).unwrap();
-        assert!(restored.map.is_empty());
-    }
-
-    #[test]
-    fn test_single_entry_roundtrip() {
-        let tree = DocumentTree::new("Root", "content");
-        let root = tree.root();
-
-        let original = Wrap {
-            map: {
-                let mut m = HashMap::new();
-                m.insert(root, "root data".to_string());
-                m
-            },
-        };
-
-        let json = serde_json::to_string(&original).unwrap();
-        let restored: Wrap<String> = serde_json::from_str(&json).unwrap();
-        assert_eq!(restored.map.get(&root), Some(&"root data".to_string()));
-    }
-
-    #[test]
-    fn test_multiple_entries_roundtrip() {
-        let mut tree = DocumentTree::new("Root", "");
-        let root = tree.root();
-        let c1 = tree.add_child(root, "C1", "c1");
-        let c2 = tree.add_child(root, "C2", "c2");
-
-        let original = Wrap {
-            map: {
-                let mut m = HashMap::new();
-                m.insert(root, 0u32);
-                m.insert(c1, 1u32);
-                m.insert(c2, 2u32);
-                m
-            },
-        };
-
-        let json = serde_json::to_string(&original).unwrap();
-        let restored: Wrap<u32> = serde_json::from_str(&json).unwrap();
-
-        assert_eq!(restored.map.len(), 3);
-        assert_eq!(restored.map[&root], 0);
-        assert_eq!(restored.map[&c1], 1);
-        assert_eq!(restored.map[&c2], 2);
-    }
-
-    #[test]
-    fn test_backward_compat_empty_object() {
-        // Old data serialized hot_nodes as {} before node_id_map was used.
-        let json = r#"{"map": {}}"#;
-        let restored: Wrap<String> = serde_json::from_str(json).unwrap();
-        assert!(restored.map.is_empty());
-    }
-
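The adapter's wire format is easiest to see on concrete JSON: pairs on the way out, with the legacy `{}` tolerated on the way in. A small illustrative sketch (shapes taken from the tests; `NodeId` serializes as its integer index):

```rust
use serde_json::json;

fn main() {
    // What the adapter emits: pairs sorted by node index, not a JSON object.
    let wire = json!({ "map": [[0, "a"], [1, "b"]] });

    // Both of these deserialize to an empty map:
    let modern = r#"{"map": []}"#;
    let legacy = r#"{"map": {}}"#; // pre-adapter shape, accepted for compat

    // And this is rejected, because "1" cannot be parsed back into a NodeId:
    let bad = r#"{"map": {"1": "data"}}"#;

    println!("{wire}\n{modern}\n{legacy}\n{bad}");
}
```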
-    #[test]
-    fn test_backward_compat_nonempty_object_rejected() {
-        // A non-empty JSON object with string keys like {"1": "data"} should
-        // fail because the string key "1" cannot be deserialized as NodeId.
-        let json = r#"{"map": {"1": "data"}}"#;
-        let result: Result<Wrap<String>, _> = serde_json::from_str(json);
-        assert!(result.is_err());
-    }
-
-    #[test]
-    fn test_serialized_json_shape() {
-        let mut tree = DocumentTree::new("Root", "");
-        let root = tree.root();
-        let child = tree.add_child(root, "Child", "c");
-
-        let original = Wrap {
-            map: {
-                let mut m = HashMap::new();
-                m.insert(root, "a".to_string());
-                m.insert(child, "b".to_string());
-                m
-            },
-        };
-
-        let json = serde_json::to_string(&original).unwrap();
-        // Verify deterministic ordering: root (id 0) before child (id 1)
-        let root_pos = json.find("\"a\"").unwrap_or(usize::MAX);
-        let child_pos = json.find("\"b\"").unwrap_or(usize::MAX);
-        assert!(
-            root_pos < child_pos,
-            "root entry should come first: {}",
-            json
-        );
-    }
-
-    #[test]
-    fn test_roundtrip_with_complex_value() {
-        // Test with a non-trivial value type (not just String/u32)
-        let tree = DocumentTree::new("Root", "");
-        let root = tree.root();
-
-        #[derive(Serialize, Deserialize, Debug, PartialEq)]
-        struct Entry {
-            count: u32,
-            label: String,
-        }
-
-        #[derive(Serialize, Deserialize, Debug)]
-        struct ComplexWrap {
-            #[serde(with = "super")]
-            data: HashMap<NodeId, Entry>,
-        }
-
-        let original = ComplexWrap {
-            data: {
-                let mut m = HashMap::new();
-                m.insert(
-                    root,
-                    Entry {
-                        count: 42,
-                        label: "test".to_string(),
-                    },
-                );
-                m
-            },
-        };
-
-        let json = serde_json::to_string(&original).unwrap();
-        let restored: ComplexWrap = serde_json::from_str(&json).unwrap();
-        assert_eq!(restored.data[&root].count, 42);
-        assert_eq!(restored.data[&root].label, "test");
-    }
-}
diff --git a/vectorless-core/vectorless/src/document/structure.rs b/vectorless-core/vectorless/src/document/structure.rs
deleted file mode 100644
index 455b25cb..00000000
--- a/vectorless-core/vectorless/src/document/structure.rs
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Document structure types for JSON export.
-//!
-
-use serde::{Deserialize, Serialize};
-
-/// A node in the document structure for JSON export.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct StructureNode {
-    /// Node title.
-    pub title: String,
-    /// Unique node identifier.
-    pub node_id: String,
-    /// Starting line number (1-based).
-    pub start_index: usize,
-    /// Ending line number (1-based).
-    pub end_index: usize,
-    /// Generated summary (optional).
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub summary: Option<String>,
-    /// Child nodes.
-    #[serde(skip_serializing_if = "Vec::is_empty")]
-    pub nodes: Vec<StructureNode>,
-}
-
-/// Document structure for JSON export.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DocumentStructure {
-    /// Document name.
-    pub doc_name: String,
-    /// Tree structure.
-    pub structure: Vec<StructureNode>,
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_structure_node_serialization() {
-        let node = StructureNode {
-            title: "Introduction".to_string(),
-            node_id: "0001".to_string(),
-            start_index: 1,
-            end_index: 10,
-            summary: Some("A brief intro".to_string()),
-            nodes: vec![],
-        };
-
-        let json = serde_json::to_string(&node).unwrap();
-        assert!(json.contains("Introduction"));
-    }
-
-    #[test]
-    fn test_document_structure() {
-        let doc = DocumentStructure {
-            doc_name: "test.md".to_string(),
-            structure: vec![],
-        };
-
-        assert_eq!(doc.doc_name, "test.md");
-    }
-}
diff --git a/vectorless-core/vectorless/src/document/toc.rs b/vectorless-core/vectorless/src/document/toc.rs
deleted file mode 100644
index 6f1806ef..00000000
--- a/vectorless-core/vectorless/src/document/toc.rs
+++ /dev/null
@@ -1,343 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Table of Contents (ToC) view generation.
-//!
-//! Provides utilities for generating different views of the document tree,
-//! including hierarchical ToC, flat ToC, and filtered views.
-
-use serde::{Deserialize, Serialize};
-
-use super::node::NodeId;
-use super::node::TreeNode;
-use super::tree::DocumentTree;
-
-/// A node in the Table of Contents.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct TocNode {
-    /// Node title.
-    pub title: String,
-    /// Node ID (if available).
-    pub node_id: Option<String>,
-    /// Depth in the tree.
-    pub depth: usize,
-    /// Page range (for PDFs).
-    pub page_range: Option<(usize, usize)>,
-    /// Brief summary (optional).
-    pub summary: Option<String>,
-    /// Children nodes.
-    pub children: Vec<TocNode>,
-}
-
-impl TocNode {
-    /// Create a new ToC node.
-    pub fn new(title: impl Into<String>, depth: usize) -> Self {
-        Self {
-            title: title.into(),
-            node_id: None,
-            depth,
-            page_range: None,
-            summary: None,
-            children: Vec::new(),
-        }
-    }
-
-    /// Set the node ID.
-    pub fn with_node_id(mut self, id: impl Into<String>) -> Self {
-        self.node_id = Some(id.into());
-        self
-    }
-
-    /// Set the page range.
-    pub fn with_page_range(mut self, start: usize, end: usize) -> Self {
-        self.page_range = Some((start, end));
-        self
-    }
-
-    /// Set the summary.
-    pub fn with_summary(mut self, summary: impl Into<String>) -> Self {
-        self.summary = Some(summary.into());
-        self
-    }
-
-    /// Add a child node.
-    pub fn add_child(&mut self, child: TocNode) {
-        self.children.push(child);
-    }
-
-    /// Count total nodes in this subtree.
-    pub fn count_nodes(&self) -> usize {
-        1 + self.children.iter().map(|c| c.count_nodes()).sum::<usize>()
-    }
-
-    /// Count leaf nodes in this subtree.
-    pub fn count_leaves(&self) -> usize {
-        if self.children.is_empty() {
-            1
-        } else {
-            self.children.iter().map(|c| c.count_leaves()).sum()
-        }
-    }
-
-    /// Get maximum depth in this subtree.
-    pub fn max_depth(&self) -> usize {
-        if self.children.is_empty() {
-            self.depth
-        } else {
-            self.children
-                .iter()
-                .map(|c| c.max_depth())
-                .max()
-                .unwrap_or(self.depth)
-        }
-    }
-}
-
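The counting helpers recurse structurally, so a tiny hand-built ToC is enough to see them. A minimal sketch, mirroring `test_toc_node_creation` further down (module path assumed):

```rust
use vectorless::document::TocNode;

fn main() {
    let mut root = TocNode::new("Root", 0);
    let mut part = TocNode::new("Part I", 1).with_summary("First part");
    part.add_child(TocNode::new("Chapter 1", 2));
    part.add_child(TocNode::new("Chapter 2", 2));
    root.add_child(part);

    assert_eq!(root.count_nodes(), 4);  // root + part + two chapters
    assert_eq!(root.count_leaves(), 2); // the chapters
    assert_eq!(root.max_depth(), 2);    // deepest depth passed to TocNode::new
}
```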
-/// Configuration for ToC generation.
-#[derive(Debug, Clone)]
-pub struct TocConfig {
-    /// Maximum depth to include (None = unlimited).
-    pub max_depth: Option<usize>,
-    /// Whether to include summaries.
-    pub include_summaries: bool,
-    /// Whether to include page ranges.
-    pub include_pages: bool,
-    /// Minimum content length to include (filter out empty nodes).
-    pub min_content_length: usize,
-}
-
-impl Default for TocConfig {
-    fn default() -> Self {
-        Self {
-            max_depth: None,
-            include_summaries: true,
-            include_pages: true,
-            min_content_length: 0,
-        }
-    }
-}
-
-impl TocConfig {
-    /// Create new ToC config.
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Set maximum depth.
-    pub fn with_max_depth(mut self, depth: usize) -> Self {
-        self.max_depth = Some(depth);
-        self
-    }
-
-    /// Set whether to include summaries.
-    pub fn with_summaries(mut self, include: bool) -> Self {
-        self.include_summaries = include;
-        self
-    }
-}
-
-/// ToC view generator.
-#[derive(Clone)]
-pub struct TocView {
-    config: TocConfig,
-}
-
-impl TocView {
-    /// Create a new ToC view generator.
-    pub fn new() -> Self {
-        Self {
-            config: TocConfig::default(),
-        }
-    }
-
-    /// Create with custom configuration.
-    pub fn with_config(config: TocConfig) -> Self {
-        Self { config }
-    }
-
-    /// Generate ToC from a tree.
-    pub fn generate(&self, tree: &DocumentTree) -> TocNode {
-        self.build_toc_node(tree, tree.root(), 0)
-    }
-
-    /// Generate ToC starting from a specific node.
-    pub fn generate_from(&self, tree: &DocumentTree, start: NodeId) -> TocNode {
-        let depth = tree.get(start).map_or(0, |n| n.depth);
-        self.build_toc_node(tree, start, depth)
-    }
-
-    /// Build a ToC node from a tree node.
-    fn build_toc_node(&self, tree: &DocumentTree, node_id: NodeId, depth: usize) -> TocNode {
-        let node = match tree.get(node_id) {
-            Some(n) => n,
-            None => return TocNode::new("Unknown", depth),
-        };
-
-        // Check depth limit
-        if let Some(max) = self.config.max_depth {
-            if depth > max {
-                return TocNode::new("...", depth - 1);
-            }
-        }
-
-        // Check minimum content length
-        if node.content.len() < self.config.min_content_length && tree.children(node_id).is_empty()
-        {
-            return TocNode::new(node.title.clone(), depth);
-        }
-
-        let mut toc_node =
-            TocNode::new(&node.title, depth).with_node_id(node.node_id.clone().unwrap_or_default());
-
-        // Add page range
-        if self.config.include_pages {
-            if let (Some(start), Some(end)) = (node.start_page, node.end_page) {
-                toc_node = toc_node.with_page_range(start, end);
-            }
-        }
-
-        // Add summary
-        if self.config.include_summaries && !node.summary.is_empty() {
-            toc_node = toc_node.with_summary(&node.summary);
-        }
-
-        // Recursively add children
-        for child_id in tree.children(node_id) {
-            let child_toc = self.build_toc_node(tree, child_id, depth + 1);
-            toc_node.add_child(child_toc);
-        }
-
-        toc_node
-    }
-
-    /// Generate a flat list of ToC entries.
-    pub fn generate_flat(&self, tree: &DocumentTree) -> Vec<TocEntry> {
-        let mut entries = Vec::new();
-        self.collect_flat_entries(tree, tree.root(), &mut entries);
-        entries
-    }
-
-    fn collect_flat_entries(
-        &self,
-        tree: &DocumentTree,
-        node_id: NodeId,
-        entries: &mut Vec<TocEntry>,
-    ) {
-        if let Some(node) = tree.get(node_id) {
-            entries.push(TocEntry {
-                title: node.title.clone(),
-                node_id: node.node_id.clone(),
-                depth: node.depth,
-                page_range: node.start_page.zip(node.end_page),
-            });
-
-            for child_id in tree.children(node_id) {
-                self.collect_flat_entries(tree, child_id, entries);
-            }
-        }
-    }
-
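Putting the config and the generator together: depth-capped subtrees collapse into a "..." placeholder node rather than disappearing. A sketch of the generate-and-format flow (module path assumed):

```rust
use vectorless::document::{DocumentTree, TocConfig, TocView};

fn main() {
    let mut tree = DocumentTree::new("Guide", "");
    let setup = tree.add_child(tree.root(), "Setup", "Install and configure.");
    tree.add_child(setup, "Linux", "apt install ...");

    // Depth-capped view without summaries.
    let view = TocView::with_config(TocConfig::new().with_max_depth(1).with_summaries(false));
    let toc = view.generate(&tree);
    print!("{}", view.format_markdown(&toc));
    // - Guide
    //   - Setup
    //     - ...        <- "Linux" sits past max_depth, so it collapses
}
```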
-    /// Generate a filtered ToC based on a predicate.
-    pub fn generate_filtered<F>(&self, tree: &DocumentTree, filter: F) -> Vec<TocNode>
-    where
-        F: Fn(&TreeNode) -> bool,
-    {
-        let mut result = Vec::new();
-        self.collect_filtered(tree, tree.root(), &filter, &mut result);
-        result
-    }
-
-    fn collect_filtered<F>(
-        &self,
-        tree: &DocumentTree,
-        node_id: NodeId,
-        filter: &F,
-        result: &mut Vec<TocNode>,
-    ) where
-        F: Fn(&TreeNode) -> bool,
-    {
-        if let Some(node) = tree.get(node_id) {
-            if filter(node) {
-                let toc_node = self.build_toc_node(tree, node_id, node.depth);
-                result.push(toc_node);
-            }
-
-            for child_id in tree.children(node_id) {
-                self.collect_filtered(tree, child_id, filter, result);
-            }
-        }
-    }
-
-    /// Format ToC as markdown.
-    pub fn format_markdown(&self, toc: &TocNode) -> String {
-        let mut output = String::new();
-        self.write_markdown(toc, &mut output, 0);
-        output
-    }
-
-    fn write_markdown(&self, toc: &TocNode, output: &mut String, level: usize) {
-        let indent = "  ".repeat(level);
-        let bullet = "-";
-
-        output.push_str(&format!("{}{} {}\n", indent, bullet, toc.title));
-
-        if let Some(ref summary) = toc.summary {
-            output.push_str(&format!("{}  > {}\n", indent, summary));
-        }
-
-        for child in &toc.children {
-            self.write_markdown(child, output, level + 1);
-        }
-    }
-
-    /// Format ToC as JSON.
-    pub fn format_json(&self, toc: &TocNode) -> Result<String, serde_json::Error> {
-        serde_json::to_string_pretty(toc)
-    }
-}
-
-impl Default for TocView {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-/// A flat ToC entry.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct TocEntry {
-    /// Node title.
-    pub title: String,
-    /// Node ID.
-    pub node_id: Option<String>,
-    /// Depth in tree.
-    pub depth: usize,
-    /// Page range.
-    pub page_range: Option<(usize, usize)>,
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_toc_node_creation() {
-        let mut root = TocNode::new("Root", 0);
-        let child = TocNode::new("Child", 1)
-            .with_node_id("node-1")
-            .with_summary("A child node");
-
-        root.add_child(child);
-
-        assert_eq!(root.count_nodes(), 2);
-        assert_eq!(root.count_leaves(), 1);
-        assert_eq!(root.max_depth(), 1);
-    }
-
-    #[test]
-    fn test_toc_config() {
-        let config = TocConfig::new().with_max_depth(3).with_summaries(false);
-
-        assert_eq!(config.max_depth, Some(3));
-        assert!(!config.include_summaries);
-    }
-}
diff --git a/vectorless-core/vectorless/src/document/tree.rs b/vectorless-core/vectorless/src/document/tree.rs
deleted file mode 100644
index 1659471b..00000000
--- a/vectorless-core/vectorless/src/document/tree.rs
+++ /dev/null
@@ -1,883 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Document tree using arena-based allocation.
-//!
-//! This structure provides better memory locality and simpler
-//! lifetime management compared to `Rc<RefCell<TreeNode>>`.
-
-use std::collections::HashMap;
-
-use indextree::Arena;
-use serde::{Deserialize, Serialize};
-
-use super::node::{NodeId, TreeNode};
-use super::structure::{DocumentStructure, StructureNode};
-
-/// Pre-computed index for efficient retrieval operations.
-///
-/// Built once after the document tree is fully constructed.
-/// Provides O(1) access to commonly needed traversal data.
-#[derive(Debug, Clone)]
-pub struct RetrievalIndex {
-    /// All leaf nodes in the tree.
-    leaves: Vec<NodeId>,
-
-    /// Nodes grouped by depth level.
-    /// level_index[0] = root, level_index[1] = level 1 nodes, etc.
-    level_index: Vec<Vec<NodeId>>,
-
-    /// Path from root to each node (inclusive).
-    path_cache: HashMap<NodeId, Vec<NodeId>>,
-
-    /// Siblings for each node (excluding self).
-    siblings_cache: HashMap<NodeId, Vec<NodeId>>,
-
-    /// Structure string to NodeId mapping.
-    /// e.g., "1.2.3" -> NodeId
-    structure_index: HashMap<String, NodeId>,
-
-    /// Page number to NodeId mapping.
-    /// Maps each page to the most specific (deepest) node containing it.
-    page_index: HashMap<usize, NodeId>,
-
-    /// NodeId to page range mapping.
-    node_page_range: HashMap<NodeId, (usize, usize)>,
-
-    /// Total node count.
-    node_count: usize,
-
-    /// Maximum depth in the tree.
-    max_depth: usize,
-}
-
-impl RetrievalIndex {
-    /// Get all leaf nodes.
-    pub fn leaves(&self) -> &[NodeId] {
-        &self.leaves
-    }
-
-    /// Get nodes at a specific depth level.
-    ///
-    /// Returns None if the level doesn't exist.
-    pub fn level(&self, depth: usize) -> Option<&[NodeId]> {
-        self.level_index.get(depth).map(|v| v.as_slice())
-    }
-
-    /// Get all levels.
-    pub fn levels(&self) -> &[Vec<NodeId>] {
-        &self.level_index
-    }
-
-    /// Get the path from root to a node (inclusive).
-    ///
-    /// Returns None if the node is not in the index.
-    pub fn path_to(&self, node: NodeId) -> Option<&[NodeId]> {
-        self.path_cache.get(&node).map(|v| v.as_slice())
-    }
-
-    /// Get siblings of a node (excluding the node itself).
-    ///
-    /// Returns None if the node is not in the index or has no siblings.
-    pub fn siblings(&self, node: NodeId) -> Option<&[NodeId]> {
-        self.siblings_cache.get(&node).map(|v| v.as_slice())
-    }
-
-    /// Find a node by its structure index.
-    ///
-    /// # Example
-    /// ```ignore
-    /// // Find section 2.1.3
-    /// let node = index.find_by_structure("2.1.3");
-    /// ```
-    pub fn find_by_structure(&self, structure: &str) -> Option<NodeId> {
-        self.structure_index.get(structure).copied()
-    }
-
-    /// Find the most specific node containing a page number.
-    ///
-    /// Returns the deepest node whose page range contains the given page.
-    pub fn find_by_page(&self, page: usize) -> Option<NodeId> {
-        self.page_index.get(&page).copied()
-    }
-
-    /// Find all nodes whose page range overlaps with the given range.
-    ///
-    /// This is useful for retrieving all content that spans a range of pages.
-    ///
-    /// # Example
-    /// ```ignore
-    /// // Find all nodes covering pages 10-15
-    /// let nodes = index.find_nodes_by_page_range(10, 15);
-    /// ```
-    pub fn find_nodes_by_page_range(&self, start: usize, end: usize) -> Vec<NodeId> {
-        let mut result = Vec::new();
-        for (&node_id, &(node_start, node_end)) in &self.node_page_range {
-            // Check if ranges overlap: node_start <= end && start <= node_end
-            if node_start <= end && start <= node_end {
-                result.push(node_id);
-            }
-        }
-        // Sort by start page for consistent ordering
-        result.sort_by_key(|&id| self.node_page_range.get(&id).map(|(s, _)| *s).unwrap_or(0));
-        result
-    }
-
-    /// Get all page numbers covered by a node.
-    ///
-    /// Returns None if the node has no page information.
-    pub fn get_pages_for_node(&self, node: NodeId) -> Option<Vec<usize>> {
-        let (start, end) = self.node_page_range.get(&node)?;
-        Some((*start..=*end).collect())
-    }
-
-    /// Get the page range for a node.
-    pub fn page_range(&self, node: NodeId) -> Option<(usize, usize)> {
-        self.node_page_range.get(&node).copied()
-    }
-
-    /// Get all nodes that are leaves within a page range.
-    ///
-    /// This returns only leaf nodes (nodes with no children) that
-    /// overlap with the given page range.
-    pub fn find_leaves_by_page_range(&self, start: usize, end: usize) -> Vec<NodeId> {
-        let leaves_set: std::collections::HashSet<NodeId> = self.leaves.iter().copied().collect();
-        self.find_nodes_by_page_range(start, end)
-            .into_iter()
-            .filter(|id| leaves_set.contains(id))
-            .collect()
-    }
-
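The overlap test above is the usual inclusive-interval check: `[a0, a1]` and `[b0, b1]` overlap iff `a0 <= b1 && b0 <= a1`. A sketch of the index lookups in action, assuming the `DocumentTree` builder defined further down (module path assumed):

```rust
use vectorless::document::DocumentTree;

fn main() {
    let mut tree = DocumentTree::new("Report", "");
    let intro = tree.add_child_with_pages(tree.root(), "Intro", "...", 1, 3);
    let body = tree.add_child_with_pages(tree.root(), "Body", "...", 4, 9);
    let index = tree.build_retrieval_index();

    // Deepest node containing page 2.
    assert_eq!(index.find_by_page(2), Some(intro));
    // Inclusive-range overlap: pages 3..=5 touch both sections.
    assert_eq!(index.find_nodes_by_page_range(3, 5), vec![intro, body]);
    // Page range recorded for a node.
    assert_eq!(index.page_range(body), Some((4, 9)));
}
```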
-    /// Get the total number of pages in the document.
-    pub fn total_pages(&self) -> usize {
-        self.node_page_range
-            .values()
-            .map(|(_, end)| *end)
-            .max()
-            .unwrap_or(0)
-    }
-
-    /// Get all structure indices.
-    pub fn structures(&self) -> &HashMap<String, NodeId> {
-        &self.structure_index
-    }
-
-    /// Get the total number of nodes.
-    pub fn node_count(&self) -> usize {
-        self.node_count
-    }
-
-    /// Get the maximum depth in the tree.
-    pub fn max_depth(&self) -> usize {
-        self.max_depth
-    }
-
-    /// Get the number of levels.
-    pub fn level_count(&self) -> usize {
-        self.level_index.len()
-    }
-}
-
-/// A hierarchical document tree structure.
-///
-/// Uses an arena-based tree representation for efficient traversal
-/// and node manipulation.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DocumentTree {
-    /// The underlying arena storing all nodes.
-    arena: Arena<TreeNode>,
-
-    /// The root node ID.
-    root_id: NodeId,
-
-    /// Cached leaf nodes (rebuilt on demand).
-    #[serde(skip)]
-    leaves_cache: Option<Vec<NodeId>>,
-}
-
-impl DocumentTree {
-    /// Create a new document tree with a root node.
-    pub fn new(title: &str, content: &str) -> Self {
-        let mut arena = Arena::new();
-        let root_data = TreeNode {
-            title: title.to_string(),
-            structure: String::new(), // Root has no structure index
-            content: content.to_string(),
-            summary: String::new(),
-            depth: 0,
-            start_index: 1,
-            end_index: 1,
-            start_page: None,
-            end_page: None,
-            node_id: None,
-            physical_index: None,
-            token_count: None,
-            references: Vec::new(),
-            routing_keywords: Vec::new(),
-            question_hints: Vec::new(),
-        };
-        let root_id = arena.new_node(root_data);
-
-        // Root is initially a leaf
-        let leaves_cache = Some(vec![NodeId(root_id)]);
-
-        Self {
-            arena,
-            root_id: NodeId(root_id),
-            leaves_cache,
-        }
-    }
-
-    /// Create a document tree from an existing arena and root ID.
-    ///
-    /// This is useful for deserialization and testing.
-    pub fn from_raw(arena: Arena<TreeNode>, root_id: NodeId) -> Self {
-        Self {
-            arena,
-            root_id,
-            leaves_cache: None, // Will be rebuilt on demand
-        }
-    }
-
-    /// Get the root node ID.
-    pub fn root(&self) -> NodeId {
-        self.root_id
-    }
-
-    /// Get a reference to the underlying arena.
-    pub fn arena(&self) -> &Arena<TreeNode> {
-        &self.arena
-    }
-
-    /// Get a node by its ID.
-    ///
-    /// Returns None if the node doesn't exist.
-    pub fn get(&self, id: NodeId) -> Option<&TreeNode> {
-        self.arena.get(id.0).map(|n| n.get())
-    }
-
-    /// Get a mutable reference to a node by its ID.
-    ///
-    /// Returns None if the node doesn't exist.
-    pub fn get_mut(&mut self, id: NodeId) -> Option<&mut TreeNode> {
-        self.arena.get_mut(id.0).map(|n| n.get_mut())
-    }
-
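The arena keeps every node in one allocation, so a `NodeId` is just a handle into it and reads and writes both go through the tree. A minimal sketch of direct node access (module path assumed):

```rust
use vectorless::document::DocumentTree;

fn main() {
    let mut tree = DocumentTree::new("Spec", "Top-level overview.");
    let root = tree.root();

    // Read through the handle...
    assert_eq!(tree.get(root).unwrap().title, "Spec");

    // ...and write through it; the arena owns the node, not the caller.
    if let Some(node) = tree.get_mut(root) {
        node.summary = "One-line summary.".to_string();
    }
    assert_eq!(tree.get(root).unwrap().summary, "One-line summary.");
}
```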
-    /// Add a child node to the specified parent.
-    ///
-    /// Returns the ID of the newly created child node.
-    /// The structure is automatically calculated based on siblings.
-    pub fn add_child(&mut self, parent: NodeId, title: &str, content: &str) -> NodeId {
-        let parent_depth = self.arena.get(parent.0).map(|n| n.get().depth).unwrap_or(0);
-        let parent_structure = self
-            .arena
-            .get(parent.0)
-            .map(|n| n.get().structure.clone())
-            .unwrap_or_default();
-
-        // Calculate child index (1-based)
-        let child_index = parent.0.children(&self.arena).count() + 1;
-
-        // Calculate structure: parent_structure.child_index
-        let child_structure = if parent_structure.is_empty() {
-            child_index.to_string()
-        } else {
-            format!("{}.{}", parent_structure, child_index)
-        };
-
-        let child_data = TreeNode {
-            title: title.to_string(),
-            structure: child_structure,
-            content: content.to_string(),
-            summary: String::new(),
-            depth: parent_depth + 1,
-            start_index: 1,
-            end_index: 1,
-            start_page: None,
-            end_page: None,
-            node_id: None,
-            physical_index: None,
-            token_count: None,
-            references: Vec::new(),
-            routing_keywords: Vec::new(),
-            question_hints: Vec::new(),
-        };
-        let child_id = self.arena.new_node(child_data);
-        parent.0.append(child_id, &mut self.arena);
-
-        // Update leaves cache
-        if let Some(ref mut cache) = self.leaves_cache {
-            // Remove parent from leaves (it's no longer a leaf)
-            cache.retain(|&id| id != parent);
-            // Add child to leaves
-            cache.push(NodeId(child_id));
-        }
-
-        NodeId(child_id)
-    }
-
-    /// Add a child node with page boundaries.
-    ///
-    /// Returns the ID of the newly created child node.
-    pub fn add_child_with_pages(
-        &mut self,
-        parent: NodeId,
-        title: &str,
-        content: &str,
-        start_page: usize,
-        end_page: usize,
-    ) -> NodeId {
-        let child_id = self.add_child(parent, title, content);
-        if let Some(node) = self.get_mut(child_id) {
-            node.start_page = Some(start_page);
-            node.end_page = Some(end_page);
-        }
-        child_id
-    }
-
-    /// Check if a node is a leaf (has no children).
-    pub fn is_leaf(&self, id: NodeId) -> bool {
-        id.0.children(&self.arena).next().is_none()
-    }
-
-    /// Get the number of children of a node.
-    ///
-    /// This is more efficient than `children().len()` as it doesn't allocate.
-    pub fn child_count(&self, id: NodeId) -> usize {
-        id.0.children(&self.arena).count()
-    }
-
-    /// Get the children of a node as an iterator.
-    ///
-    /// Use this instead of `children()` when you only need to iterate,
-    /// as it avoids allocating a Vec.
-    pub fn children_iter(&self, id: NodeId) -> impl Iterator<Item = NodeId> + '_ {
-        id.0.children(&self.arena).map(NodeId)
-    }
-
-    /// Get the children of a node.
-    ///
-    /// Returns a Vec for cases where you need owned access to the children.
-    /// Consider using `children_iter()` if you only need to iterate.
-    pub fn children(&self, id: NodeId) -> Vec<NodeId> {
-        self.children_iter(id).collect()
-    }
-
-    /// Get the children of a node plus any resolved cross-reference targets.
-    ///
-    /// In addition to direct children, this collects `NodeId`s pointed to by
-    /// resolved references (`node.references[i].target_node`) on the given node.
-    /// Duplicate node IDs (e.g. a reference that happens to be a child) are
-    /// de-duplicated so the caller never sees the same node twice.
-    pub fn children_with_refs(&self, id: NodeId) -> Vec<NodeId> {
-        let mut result: Vec<NodeId> = self.children_iter(id).collect();
-        if let Some(node) = self.get(id) {
-            for r#ref in &node.references {
-                if let Some(target) = r#ref.target_node {
-                    if !result.contains(&target) {
-                        result.push(target);
-                    }
-                }
-            }
-        }
-        result
-    }
-
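`add_child` derives the dotted structure index from the parent's index plus the 1-based sibling position, so numbering falls out of insertion order. A quick sketch (module path assumed):

```rust
use vectorless::document::DocumentTree;

fn main() {
    let mut tree = DocumentTree::new("Doc", "");
    let a = tree.add_child(tree.root(), "A", "");  // structure "1"
    let b = tree.add_child(tree.root(), "B", "");  // structure "2"
    let a1 = tree.add_child(a, "A.1", "");         // structure "1.1"

    assert_eq!(tree.get(a).unwrap().structure, "1");
    assert_eq!(tree.get(b).unwrap().structure, "2");
    assert_eq!(tree.get(a1).unwrap().structure, "1.1");
    assert_eq!(tree.get(a1).unwrap().depth, 2);
}
```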
-    /// Get the parent of a node.
-    ///
-    /// Returns None if the node is the root or doesn't have a parent.
-    pub fn parent(&self, id: NodeId) -> Option<NodeId> {
-        id.0.parent(&self.arena).map(NodeId)
-    }
-
-    /// Get the siblings of a node (excluding the node itself).
-    ///
-    /// Returns an empty iterator for the root node.
-    pub fn siblings_iter(&self, id: NodeId) -> impl Iterator<Item = NodeId> + '_ {
-        // indextree's sibling iterators yield the node itself first; skip it
-        // on each side so the result really excludes `id`.
-        id.0.preceding_siblings(&self.arena)
-            .skip(1)
-            .chain(id.0.following_siblings(&self.arena).skip(1))
-            .map(NodeId)
-    }
-
-    /// Get the ancestors of a node from parent to root.
-    ///
-    /// Returns an empty iterator for the root node.
-    pub fn ancestors_iter(&self, id: NodeId) -> impl Iterator<Item = NodeId> + '_ {
-        // indextree's `ancestors` yields the node itself first; skip it so the
-        // iterator runs from parent to root, as documented.
-        id.0.ancestors(&self.arena).skip(1).map(NodeId)
-    }
-
-    /// Get the path from root to a node (inclusive).
-    ///
-    /// Returns the path as a Vec starting from the root.
-    pub fn path_from_root(&self, id: NodeId) -> Vec<NodeId> {
-        let mut path: Vec<NodeId> = self.ancestors_iter(id).collect();
-        path.reverse();
-        path.push(id);
-        path
-    }
-
-    /// Get the depth of a node (root = 0).
-    pub fn depth(&self, id: NodeId) -> usize {
-        self.get(id).map(|n| n.depth).unwrap_or(0)
-    }
-
-    /// Get the maximum depth of any node in the tree (root = 0, leaf ≥ 0).
-    ///
-    /// Uses a single depth-first pass. Returns 0 for a single-node tree.
-    pub fn max_depth(&self) -> usize {
-        let mut max_d = 0;
-        let mut stack = vec![(self.root_id, 0usize)];
-        while let Some((id, d)) = stack.pop() {
-            max_d = max_d.max(d);
-            for child in self.children_iter(id) {
-                stack.push((child, d + 1));
-            }
-        }
-        max_d
-    }
-
-    /// Get the first child of a node.
-    ///
-    /// Returns None if the node has no children.
-    pub fn first_child(&self, id: NodeId) -> Option<NodeId> {
-        self.children_iter(id).next()
-    }
-
-    /// Get the last child of a node.
-    ///
-    /// Returns None if the node has no children.
-    pub fn last_child(&self, id: NodeId) -> Option<NodeId> {
-        self.children_iter(id).last()
-    }
-
-    /// Get all leaf nodes in the tree.
-    ///
-    /// Uses cached leaves if available, otherwise rebuilds the cache.
-    pub fn leaves(&self) -> Vec<NodeId> {
-        if let Some(ref cache) = self.leaves_cache {
-            return cache.clone();
-        }
-
-        // Rebuild cache on demand
-        let leaves: Vec<NodeId> = self
-            .traverse()
-            .into_iter()
-            .filter(|id| self.is_leaf(*id))
-            .collect();
-
-        // Note: Can't mutate self here, caller should use rebuild_leaves_cache()
-        leaves
-    }
-
-    /// Rebuild the leaves cache.
-    ///
-    /// Call this after deserialization or batch modifications.
-    pub fn rebuild_leaves_cache(&mut self) {
-        self.leaves_cache = Some(
-            self.traverse()
-                .into_iter()
-                .filter(|id| self.is_leaf(*id))
-                .collect(),
-        );
-    }
-
-    /// Invalidate the leaves cache.
-    ///
-    /// Called automatically by mutation methods.
-    pub fn invalidate_leaves_cache(&mut self) {
-        self.leaves_cache = None;
-    }
-
-    /// Get all nodes in the tree (depth-first order).
-    pub fn traverse(&self) -> Vec<NodeId> {
-        let mut result = Vec::new();
-        let mut stack = vec![self.root_id];
-
-        while let Some(id) = stack.pop() {
-            result.push(id);
-            // Add children in reverse order for correct DFS order
-            let mut children: Vec<_> = self.children(id).into_iter().collect();
-            children.reverse();
-            stack.extend(children);
-        }
-
-        result
-    }
-
-    /// Get the number of nodes in the tree.
-    pub fn node_count(&self) -> usize {
-        self.arena.count()
-    }
-
-    /// Update a node's summary.
-    pub fn set_summary(&mut self, id: NodeId, summary: &str) {
-        if let Some(node) = self.get_mut(id) {
-            node.summary = summary.to_string();
-        }
-    }
-
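With the self-excluding `ancestors_iter` above, `path_from_root` is just the reversed ancestor chain plus the node itself, each node appearing exactly once. A sketch (module path assumed):

```rust
use vectorless::document::DocumentTree;

fn main() {
    let mut tree = DocumentTree::new("Doc", "");
    let part = tree.add_child(tree.root(), "Part", "");
    let chapter = tree.add_child(part, "Chapter", "");

    // Root -> Part -> Chapter.
    assert_eq!(tree.path_from_root(chapter), vec![tree.root(), part, chapter]);
    assert_eq!(tree.max_depth(), 2);
    // The leaves cache tracked both insertions; only Chapter remains a leaf.
    assert_eq!(tree.leaves(), vec![chapter]);
}
```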
-    /// Update a node's content.
-    pub fn set_content(&mut self, id: NodeId, content: &str) {
-        if let Some(node) = self.get_mut(id) {
-            node.content = content.to_string();
-        }
-    }
-
-    /// Update a node's structure index.
-    pub fn set_structure(&mut self, id: NodeId, structure: &str) {
-        if let Some(node) = self.get_mut(id) {
-            node.structure = structure.to_string();
-        }
-    }
-
-    /// Set page boundaries for a node.
-    pub fn set_page_boundaries(&mut self, id: NodeId, start: usize, end: usize) {
-        if let Some(node) = self.get_mut(id) {
-            node.start_page = Some(start);
-            node.end_page = Some(end);
-        }
-    }
-
-    /// Set line indices for a node.
-    pub fn set_line_indices(&mut self, id: NodeId, start: usize, end: usize) {
-        if let Some(node) = self.get_mut(id) {
-            node.start_index = start;
-            node.end_index = end;
-        }
-    }
-
-    /// Get page range for a node.
-    pub fn page_range(&self, id: NodeId) -> Option<(usize, usize)> {
-        let node = self.get(id)?;
-        match (node.start_page, node.end_page) {
-            (Some(start), Some(end)) => Some((start, end)),
-            _ => None,
-        }
-    }
-
-    /// Check if a node contains a specific page.
-    pub fn contains_page(&self, id: NodeId, page: usize) -> bool {
-        if let Some((start, end)) = self.page_range(id) {
-            page >= start && page <= end
-        } else {
-            false
-        }
-    }
-
-    /// Find a node by its structure index.
-    ///
-    /// This is a convenience method that does a linear scan.
-    /// For repeated queries, build a RetrievalIndex once.
-    pub fn find_by_structure(&self, structure: &str) -> Option<NodeId> {
-        // Linear search - for repeated use, build RetrievalIndex
-        for node_id in self.traverse() {
-            if let Some(node) = self.get(node_id) {
-                if node.structure == structure {
-                    return Some(node_id);
-                }
-            }
-        }
-        None
-    }
-
-    /// Find the most specific node containing a page.
-    ///
-    /// This is a convenience method that does a linear scan.
-    /// For repeated queries, build a RetrievalIndex once.
-    pub fn find_by_page(&self, page: usize) -> Option<NodeId> {
-        let mut best_match: Option<(NodeId, usize)> = None;
-
-        // Find the deepest node containing this page
-        for node_id in self.traverse() {
-            if let Some((start, end)) = self.page_range(node_id) {
-                if page >= start && page <= end {
-                    let depth = self.get(node_id).map(|n| n.depth).unwrap_or(0);
-                    match &best_match {
-                        None => best_match = Some((node_id, depth)),
-                        Some((_, best_depth)) if depth > *best_depth => {
-                            best_match = Some((node_id, depth));
-                        }
-                        _ => {}
-                    }
-                }
-            }
-        }
-
-        best_match.map(|(id, _)| id)
-    }
-
-    /// Get all nodes whose page range overlaps with the given range.
-    pub fn find_nodes_by_page_range(&self, start: usize, end: usize) -> Vec<NodeId> {
-        self.traverse()
-            .into_iter()
-            .filter(|&id| {
-                if let Some((node_start, node_end)) = self.page_range(id) {
-                    node_start <= end && start <= node_end
-                } else {
-                    false
-                }
-            })
-            .collect()
-    }
-
-    /// Set the node ID (identifier string).
-    pub fn set_node_id(&mut self, id: NodeId, node_id: &str) {
-        if let Some(node) = self.get_mut(id) {
-            node.node_id = Some(node_id.to_string());
-        }
-    }
-
-    /// Set the physical index marker.
-    pub fn set_physical_index(&mut self, id: NodeId, index: &str) {
-        if let Some(node) = self.get_mut(id) {
-            node.physical_index = Some(index.to_string());
-        }
-    }
-
-    /// Update token count for a node.
-    pub fn set_token_count(&mut self, id: NodeId, count: usize) {
-        if let Some(node) = self.get_mut(id) {
-            node.token_count = Some(count);
-        }
-    }
-
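The linear `find_by_page` prefers the deepest node whose range contains the page, so a subsection wins over the chapter that encloses it. A sketch (module path assumed):

```rust
use vectorless::document::DocumentTree;

fn main() {
    let mut tree = DocumentTree::new("Doc", "");
    let chapter = tree.add_child_with_pages(tree.root(), "Chapter", "", 1, 10);
    let section = tree.add_child_with_pages(chapter, "Section", "", 4, 6);

    // Both ranges contain page 5; the deeper node wins.
    assert_eq!(tree.find_by_page(5), Some(section));
    // Only the chapter covers page 2.
    assert_eq!(tree.find_by_page(2), Some(chapter));
}
```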
-    /// Set the references for a node.
-    pub fn set_references(&mut self, id: NodeId, references: Vec<NodeReference>) {
-        if let Some(node) = self.get_mut(id) {
-            node.references = references;
-        }
-    }
-
-    /// Export the tree structure to JSON format.
-    pub fn to_structure_json(&self, doc_name: &str) -> DocumentStructure {
-        let structure = self.build_structure_nodes(self.root_id);
-        DocumentStructure {
-            doc_name: doc_name.to_string(),
-            structure,
-        }
-    }
-
-    /// Build a retrieval index for efficient operations.
-    ///
-    /// This should be called once after the tree is fully constructed.
-    /// The index provides O(1) access to commonly needed traversal data.
-    ///
-    /// # Example
-    ///
-    /// ```ignore
-    /// let tree = /* build tree */;
-    /// let index = tree.build_retrieval_index();
-    ///
-    /// // Fast access to leaves
-    /// for leaf in index.leaves() {
-    ///     // process leaf
-    /// }
-    ///
-    /// // Fast path lookup
-    /// if let Some(path) = index.path_to(node_id) {
-    ///     // path[0] = root, path[-1] = node_id
-    /// }
-    ///
-    /// // Fast structure lookup
-    /// if let Some(node) = index.find_by_structure("2.1.3") {
-    ///     // Found section 2.1.3
-    /// }
-    ///
-    /// // Fast page lookup
-    /// if let Some(node) = index.find_by_page(42) {
-    ///     // Found node containing page 42
-    /// }
-    /// ```
-    pub fn build_retrieval_index(&self) -> RetrievalIndex {
-        let mut leaves = Vec::new();
-        let mut level_index: Vec<Vec<NodeId>> = Vec::new();
-        let mut path_cache: HashMap<NodeId, Vec<NodeId>> = HashMap::new();
-        let mut siblings_cache: HashMap<NodeId, Vec<NodeId>> = HashMap::new();
-        let mut structure_index: HashMap<String, NodeId> = HashMap::new();
-        let mut page_index: HashMap<usize, NodeId> = HashMap::new();
-        let mut node_page_range: HashMap<NodeId, (usize, usize)> = HashMap::new();
-        let mut max_depth = 0;
-        let node_count = self.node_count();
-
-        // BFS to build level index
-        let mut current_level = vec![self.root_id];
-
-        // Initialize root path
-        path_cache.insert(self.root_id, vec![self.root_id]);
-
-        while !current_level.is_empty() {
-            level_index.push(current_level.clone());
-
-            let mut next_level = Vec::new();
-
-            for &node_id in &current_level {
-                let children: Vec<NodeId> = self.children(node_id);
-
-                // Get node data
-                if let Some(node) = self.get(node_id) {
-                    max_depth = max_depth.max(node.depth);
-
-                    // Build structure index
-                    if !node.structure.is_empty() {
-                        structure_index.insert(node.structure.clone(), node_id);
-                    }
-
-                    // Build page index and page range
-                    if let (Some(start), Some(end)) = (node.start_page, node.end_page) {
-                        node_page_range.insert(node_id, (start, end));
-
-                        // Map each page to this node (will be overwritten by deeper nodes)
-                        for page in start..=end {
-                            page_index.insert(page, node_id);
-                        }
-                    }
-                }
-
-                // Check if leaf
-                if children.is_empty() {
-                    leaves.push(node_id);
-                }
-
-                // Build siblings cache for children
-                if children.len() > 1 {
-                    for (i, &child) in children.iter().enumerate() {
-                        let siblings: Vec<NodeId> = children
-                            .iter()
-                            .enumerate()
-                            .filter(|(j, _)| *j != i)
-                            .map(|(_, &c)| c)
-                            .collect();
-                        siblings_cache.insert(child, siblings);
-                    }
-                }
-
-                // Build path cache for children
-                if let Some(parent_path) = path_cache.get(&node_id).cloned() {
-                    for &child in &children {
-                        let mut child_path = parent_path.clone();
-                        child_path.push(child);
-                        path_cache.insert(child, child_path);
-                    }
-                }
-
-                next_level.extend(children);
-            }
-
-            current_level = next_level;
-        }
-
-        RetrievalIndex {
-            leaves,
-            level_index,
-            path_cache,
-            siblings_cache,
-            structure_index,
-            page_index,
-            node_page_range,
-            node_count,
-            max_depth,
-        }
-    }
-
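`to_structure_json` flattens the arena into the serializable `StructureNode` shape from `structure.rs`, with empty summaries and empty child lists elided. A sketch of the emitted shape (JSON in the comment is illustrative, not verbatim `serde_json` output):

```rust
use vectorless::document::DocumentTree;

fn main() {
    let mut tree = DocumentTree::new("Doc", "");
    let intro = tree.add_child(tree.root(), "Intro", "Welcome.");
    tree.set_node_id(intro, "0001");
    tree.set_summary(intro, "Opening section.");

    let structure = tree.to_structure_json("doc.md");
    println!("{}", serde_json::to_string_pretty(&structure).unwrap());
    // Roughly:
    // {
    //   "doc_name": "doc.md",
    //   "structure": [
    //     { "title": "Intro", "node_id": "0001",
    //       "start_index": 1, "end_index": 1,
    //       "summary": "Opening section." }
    //   ]
    // }
}
```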
- fn build_structure_nodes(&self, node_id: NodeId) -> Vec<StructureNode> {
- let children = self.children(node_id);
- children
- .into_iter()
- .enumerate()
- .map(|(idx, child_id)| self.node_to_structure(child_id, idx))
- .collect()
- }
-
- /// Convert a single node to StructureNode format.
- fn node_to_structure(&self, node_id: NodeId, _idx: usize) -> StructureNode {
- let node = self.get(node_id).cloned().unwrap_or_default();
- let children = self.children(node_id);
-
- StructureNode {
- title: node.title,
- node_id: node
- .node_id
- .clone()
- .unwrap_or_else(|| format!("{:04}", _idx)),
- start_index: node.start_index,
- end_index: node.end_index,
- summary: if node.summary.is_empty() {
- None
- } else {
- Some(node.summary)
- },
- nodes: children
- .into_iter()
- .enumerate()
- .map(|(i, c)| self.node_to_structure(c, i))
- .collect(),
- }
- }
-}
-
-impl Default for DocumentTree {
- fn default() -> Self {
- Self::new("Root", "")
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use crate::document::reference::{NodeReference, RefType};
-
- #[test]
- fn test_children_with_refs_no_references() {
- let mut tree = DocumentTree::new("Root", "root content");
- let child1 = tree.add_child(tree.root(), "Section 1", "content 1");
- let child2 = tree.add_child(tree.root(), "Section 2", "content 2");
-
- let children = tree.children_with_refs(tree.root());
- assert_eq!(children.len(), 2);
- assert!(children.contains(&child1));
- assert!(children.contains(&child2));
- }
-
- #[test]
- fn test_children_with_refs_deduplicates() {
- let mut tree = DocumentTree::new("Root", "root content");
- let child = tree.add_child(tree.root(), "Section 1", "content 1");
-
- // Add a reference that points to the same node as an existing child
- let refs = vec![NodeReference::resolved(
- "see Section 1".to_string(),
- "1".to_string(),
- RefType::Section,
- 5,
- child,
- 0.8,
- )];
- tree.set_references(tree.root(), refs);
-
- let children = tree.children_with_refs(tree.root());
- // Should not duplicate
- assert_eq!(children.len(), 1);
- assert!(children.contains(&child));
- }
-
- #[test]
- fn test_children_with_refs_unresolved_ignored() {
- let mut tree = DocumentTree::new("Root", "root content");
- let child = tree.add_child(tree.root(), "Section 1", "content 1");
-
- // Add an unresolved reference (target_node = None)
- let refs = vec![NodeReference::new(
- "see Section 5".to_string(),
- "5".to_string(),
- RefType::Section,
- 5,
- )];
- tree.set_references(tree.root(), refs);
-
- let children = tree.children_with_refs(tree.root());
- // Unresolved reference should not be included
- assert_eq!(children.len(), 1);
- assert!(children.contains(&child));
- }
-}
diff --git a/vectorless-core/vectorless/src/document/understanding.rs b/vectorless-core/vectorless/src/document/understanding.rs
deleted file mode 100644
index 1505d796..00000000
--- a/vectorless-core/vectorless/src/document/understanding.rs
+++ /dev/null
@@ -1,318 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Understanding types — the core objects that define the Document Understanding Engine.
-//!
-//! These types form the stable public contract:
-//! - [`Document`] — the unified post-ingest artifact (internal first-class citizen)
-//! - [`DocumentInfo`] — what `ingest()` returns to users
-//! - [`Concept`] — key concept extracted from a document
-//! - [`Answer`] — what `ask()` returns
-//! - [`Evidence`] — proof trail for an answer
-//! - [`ReasoningTrace`] / [`TraceStep`] — always-mandatory reasoning trace
-
-use serde::{Deserialize, Serialize};
-
-use super::toc::TocNode;
-
-// ---------------------------------------------------------------------------
-// Document — unified post-ingest artifact
-// ---------------------------------------------------------------------------
-
-/// An understood document — the core artifact of the understand phase.
-///
-/// This is what `ingest()` produces internally and what `ask()` consumes.
-/// It unifies tree + navigation index + reasoning index + summary + concepts
-/// into a single first-class type, replacing the previous loose coupling of
-/// `DocContext { &tree, &nav, &reasoning }`.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Document {
- /// Unique document identifier.
- pub doc_id: String,
- /// Document name/title.
- pub name: String,
- /// Document format ("pdf", "markdown", "docx").
- pub format: String,
- /// Source file path (if indexed from a file).
- #[serde(default, skip_serializing_if = "Option::is_none")]
- pub source_path: Option<String>,
-
- // ── Three indexes (engine internal) ──
- /// Hierarchical semantic tree.
- pub tree: super::tree::DocumentTree,
- /// Pre-computed navigation structure.
- pub nav_index: super::navigation::NavigationIndex,
- /// Keyword / topic / section summaries.
- pub reasoning_index: super::reasoning::ReasoningIndex,
-
- // ── Understanding results (ingest stage output) ──
- /// Document-level summary.
- pub summary: String,
- /// Key concepts the engine identified.
- #[serde(default)]
- pub concepts: Vec<Concept>,
-
- // ── Metadata ──
- /// Page count (for PDFs).
- #[serde(default, skip_serializing_if = "Option::is_none")]
- pub page_count: Option<usize>,
- /// Number of sections in the tree.
- #[serde(default)]
- pub section_count: usize,
-}
-
-// ---------------------------------------------------------------------------
-// DocumentInfo — what ingest() returns to users
-// ---------------------------------------------------------------------------
-
-/// The engine's understanding of a document — returned by `ingest()`.
-///
-/// Rich enough for users to confirm the engine "got it right":
-/// summary, structure (TOC), and key concepts.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DocumentInfo {
- /// Unique document identifier.
- pub doc_id: String,
- /// Document name.
- pub name: String,
- /// Document format ("pdf", "markdown", "docx").
- pub format: String,
- /// Document-level summary — what this document is about.
- pub summary: String,
- /// Table of contents — the document's structure as the engine sees it.
- pub structure: TocNode,
- /// Key concepts the engine identified.
- pub concepts: Vec<Concept>,
- /// Number of sections in the document.
- pub section_count: usize,
- /// Page count (for PDFs).
- pub page_count: Option<usize>,
-}
-
-impl Document {
- /// Create a read-only agent context from this document.
- ///
- /// Used internally by the retrieval agent for navigation and reasoning.
- pub fn as_context(&self) -> crate::agent::DocContext<'_> {
- crate::agent::DocContext {
- tree: &self.tree,
- nav_index: &self.nav_index,
- reasoning_index: &self.reasoning_index,
- doc_name: &self.name,
- }
- }
-
- /// Get node content by ID (Agent `cat` command).
- pub fn cat(&self, node_id: super::node::NodeId) -> Option<&str> {
- self.tree.get(node_id).map(|n| n.content.as_str())
- }
-
- /// Find nodes containing a keyword in title or content.
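- ///
- /// A minimal sketch (hypothetical keyword; `doc` is an already-understood `Document`):
- ///
- /// ```ignore
- /// for (_, title) in doc.find("derating") {
- /// println!("{}", title);
- /// }
- /// ```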
- pub fn find(&self, keyword: &str) -> Vec<(super::node::NodeId, &str)> {
- let kw = keyword.to_lowercase();
- self.tree
- .traverse()
- .iter()
- .filter_map(|&id| {
- let node = self.tree.get(id)?;
- if node.title.to_lowercase().contains(&kw)
- || node.content.to_lowercase().contains(&kw)
- {
- Some((id, node.title.as_str()))
- } else {
- None
- }
- })
- .collect()
- }
-
- /// Get node title by ID.
- pub fn node_title(&self, node_id: super::node::NodeId) -> Option<&str> {
- self.tree.get(node_id).map(|n| n.title.as_str())
- }
-
- /// Number of sections in the tree.
- pub fn section_count(&self) -> usize {
- self.section_count
- }
-
- /// Produce the public DocumentInfo view of this document.
- pub fn info(&self) -> DocumentInfo {
- let toc = super::toc::TocView::new().generate(&self.tree);
- DocumentInfo {
- doc_id: self.doc_id.clone(),
- name: self.name.clone(),
- format: self.format.clone(),
- summary: self.summary.clone(),
- structure: toc,
- concepts: self.concepts.clone(),
- section_count: self.section_count,
- page_count: self.page_count,
- }
- }
-}
-
-// ---------------------------------------------------------------------------
-// Concept
-// ---------------------------------------------------------------------------
-
-/// A key concept extracted from a document.
-///
-/// Produced during the ingest pipeline's final concept extraction step.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Concept {
- /// Concept name (e.g., "capacitor derating").
- pub name: String,
- /// One-sentence explanation.
- pub summary: String,
- /// Which sections this concept appears in.
- pub sections: Vec<String>,
-}
-
-// ---------------------------------------------------------------------------
-// Answer — what ask() returns
-// ---------------------------------------------------------------------------
-
-/// The result of `ask()` — a reasoned answer with evidence and trace.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Answer {
- /// The answer content.
- pub content: String,
- /// Evidence supporting the answer.
- pub evidence: Vec<Evidence>,
- /// Confidence score (0.0–1.0).
- pub confidence: f32,
- /// Reasoning trace — how the agent arrived at this answer. Always present.
- pub trace: ReasoningTrace,
-}
-
-// ---------------------------------------------------------------------------
-// Evidence
-// ---------------------------------------------------------------------------
-
-/// A piece of evidence supporting an answer — with source attribution.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct Evidence {
- /// Original document text.
- pub content: String,
- /// Navigation path (e.g., "Root/Chapter 3/Section 3.2").
- pub source_path: String,
- /// Which document this evidence came from.
- pub doc_name: String,
- /// Relevance to the question (0.0–1.0).
- pub relevance: f32,
-}
-
-// ---------------------------------------------------------------------------
-// ReasoningTrace — always mandatory
-// ---------------------------------------------------------------------------
-
-/// Reasoning trace — how the agent arrived at the answer. Always present.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ReasoningTrace {
- /// The steps the agent took.
- pub steps: Vec<TraceStep>,
-}
-
-impl ReasoningTrace {
- /// Create an empty trace.
- pub fn empty() -> Self {
- Self { steps: Vec::new() }
- }
-
- /// Create a trace with a single step.
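- ///
- /// A minimal sketch (values taken from the tests below):
- ///
- /// ```ignore
- /// let trace = ReasoningTrace::single("cd Chapter 3", "Found 5 sections", 1);
- /// assert_eq!(trace.steps.len(), 1);
- /// ```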
- pub fn single(action: impl Into<String>, observation: impl Into<String>, round: u32) -> Self {
- Self {
- steps: vec![TraceStep {
- action: action.into(),
- observation: observation.into(),
- round,
- }],
- }
- }
-
- /// Add a step to the trace.
- pub fn push(&mut self, action: impl Into<String>, observation: impl Into<String>, round: u32) {
- self.steps.push(TraceStep {
- action: action.into(),
- observation: observation.into(),
- round,
- });
- }
-}
-
-/// A single step in the reasoning trace.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct TraceStep {
- /// What the agent did (e.g., "cd Chapter 3").
- pub action: String,
- /// What the agent observed (e.g., "Found 5 sections about...").
- pub observation: String,
- /// Which round this step was in.
- pub round: u32,
-}
-
-// ---------------------------------------------------------------------------
-// IngestInput — what ingest() takes
-// ---------------------------------------------------------------------------
-
-/// Input to `ingest()` — the document to be understood.
-#[derive(Debug, Clone)]
-pub enum IngestInput {
- /// Index from a file path.
- Path(std::path::PathBuf),
- /// Index from raw bytes.
- Bytes {
- /// Document name.
- name: String,
- /// Raw document bytes.
- data: Vec<u8>,
- /// Document format.
- format: super::format::DocumentFormat,
- },
- /// Index from a text string.
- Text {
- /// Document name.
- name: String,
- /// Document content.
- content: String,
- },
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn test_reasoning_trace_empty() {
- let trace = ReasoningTrace::empty();
- assert!(trace.steps.is_empty());
- }
-
- #[test]
- fn test_reasoning_trace_single() {
- let trace = ReasoningTrace::single("cd Chapter 3", "Found 5 sections", 1);
- assert_eq!(trace.steps.len(), 1);
- assert_eq!(trace.steps[0].action, "cd Chapter 3");
- assert_eq!(trace.steps[0].round, 1);
- }
-
- #[test]
- fn test_reasoning_trace_push() {
- let mut trace = ReasoningTrace::empty();
- trace.push("ls", "Root with 3 children", 0);
- trace.push("cd Chapter 2", "Found target section", 1);
- assert_eq!(trace.steps.len(), 2);
- }
-
- #[test]
- fn test_concept_serialization() {
- let concept = Concept {
- name: "capacitor derating".into(),
- summary: "Reducing capacitor specs for reliability".into(),
- sections: vec!["Section 3.2".into()],
- };
- let json = serde_json::to_string(&concept).unwrap();
- assert!(json.contains("capacitor derating"));
- }
-}
diff --git a/vectorless-core/vectorless/src/error.rs b/vectorless-core/vectorless/src/error.rs
deleted file mode 100644
index 36acf0e5..00000000
--- a/vectorless-core/vectorless/src/error.rs
+++ /dev/null
@@ -1,329 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Error types for the vectorless library.
-//!
-//! This module provides a comprehensive error type hierarchy for all operations.
-//! All errors are consolidated into [`enum@Error`] with specific variants for each category.
-
-use thiserror::Error;
-
-/// The main error type for vectorless operations.
-#[derive(Debug, Error)]
-pub enum Error {
- // =========================================================================
- // Document & Parsing Errors
- // =========================================================================
- /// An error occurred while parsing a document.
- #[error("Document parsing error: {0}")]
- Parse(String),
-
- /// Unsupported document format.
- #[error("Unsupported document format: {0}")]
- UnsupportedFormat(String),
-
- /// Invalid document structure.
- #[error("Invalid document structure: {0}")] - InvalidStructure(String), - - // ========================================================================= - // Index Errors - // ========================================================================= - /// An error occurred while building the index. - #[error("Index building error: {0}")] - IndexBuild(String), - - /// Index not found. - #[error("Index not found: {0}")] - IndexNotFound(String), - - /// Index corrupted. - #[error("Index corrupted: {0}")] - IndexCorrupted(String), - - /// Document graph build error. - #[error("Document graph build error: {0}")] - GraphBuild(String), - - // ========================================================================= - // Retrieval Errors - // ========================================================================= - /// An error occurred during retrieval. - #[error("Retrieval error: {0}")] - Retrieval(String), - - /// No relevant content found. - #[error("No relevant content found for query")] - NoRelevantContent, - - /// Search timeout. - #[error("Search timeout after {0}ms")] - SearchTimeout(u64), - - // ========================================================================= - // LLM Errors - // ========================================================================= - /// An error occurred during LLM call (transient: network, timeout). - #[error("LLM error: {0}")] - Llm(String), - - /// LLM rate limit exceeded. - #[error("LLM rate limit exceeded, retry after {0}ms")] - RateLimitExceeded(u64), - - /// LLM quota exceeded. - #[error("LLM quota exceeded")] - QuotaExceeded, - - /// LLM reasoning failure — model responded but output is unusable. - /// Not transient. Do not retry the same prompt. - #[error("LLM reasoning failure at '{stage}': {detail}")] - LlmReasoning { - /// The pipeline stage where reasoning failed. - stage: String, - /// Why the output was unusable. - detail: String, - }, - - // ========================================================================= - // Summary Errors - // ========================================================================= - /// An error occurred during summarization. - #[error("Summarization error: {0}")] - Summarization(String), - - /// Summary too long. - #[error("Summary exceeds maximum length: {0} tokens")] - SummaryTooLong(usize), - - // ========================================================================= - // Storage Errors - // ========================================================================= - /// An error occurred during I/O operations. - #[error("IO error: {0}")] - Io(#[from] std::io::Error), - - /// Workspace error. - #[error("Workspace error: {0}")] - Workspace(String), - - /// Cache error. - #[error("Cache error: {0}")] - Cache(String), - - /// Serialization error. - #[error("Serialization error: {0}")] - Serialization(String), - - /// Document not found. - #[error("Document not found: {0}")] - DocumentNotFound(String), - - /// Checksum mismatch. - #[error("Checksum mismatch: {0}")] - ChecksumMismatch(String), - - /// Workspace locked by another process. - #[error("Workspace locked by another process")] - WorkspaceLocked, - - /// Format version mismatch. - #[error("Format version mismatch: {0}")] - VersionMismatch(String), - - // ========================================================================= - // Configuration Errors - // ========================================================================= - /// TOML parsing error. - #[error("TOML parsing error: {0}")] - Toml(String), - - /// Invalid configuration. 
- #[error("Invalid configuration: {0}")] - Config(String), - - /// Missing required configuration. - #[error("Missing required configuration: {0}")] - MissingConfig(String), - - // ========================================================================= - // Node Errors - // ========================================================================= - /// The requested node was not found. - #[error("Node not found: {0}")] - NodeNotFound(String), - - // ========================================================================= - // Input Validation Errors - // ========================================================================= - /// Invalid input. - #[error("Invalid input: {0}")] - InvalidInput(String), - - /// Empty input. - #[error("Empty input: {field}")] - EmptyInput { - /// The field that was empty. - field: String, - }, - - /// Out of range. - #[error("{field} out of range: expected {min}-{max}, got {actual}")] - OutOfRange { - /// The field that was out of range. - field: String, - /// Minimum allowed value. - min: String, - /// Maximum allowed value. - max: String, - /// Actual value received. - actual: String, - }, - - // ========================================================================= - // Throttle Errors - // ========================================================================= - /// Throttle error. - #[error("Throttle error: {0}")] - Throttle(String), - - /// Concurrency limit exceeded. - #[error("Concurrency limit exceeded: {0} pending")] - ConcurrencyLimitExceeded(usize), - - // ========================================================================= - // Timeout Errors - // ========================================================================= - /// Operation timeout. - #[error("Operation timeout: {0}")] - Timeout(String), - - // ========================================================================= - // Generic Errors - // ========================================================================= - /// A generic error with a message. - #[error("{0}")] - Other(String), - - /// Error with context. - #[error("{context}: {source}")] - WithContext { - /// Additional context describing where/why the error occurred. - context: String, - /// The underlying error. - #[source] - source: Box, - }, -} - -impl Error { - /// Create an error with additional context. - #[must_use] - pub fn with_context(self, context: impl Into) -> Self { - Self::WithContext { - context: context.into(), - source: Box::new(self), - } - } - - /// Check if this is a retryable error. - #[must_use] - pub fn is_retryable(&self) -> bool { - matches!( - self, - Self::RateLimitExceeded(_) | Self::SearchTimeout(_) | Self::Timeout(_) | Self::Llm(_) - ) - } - - /// Check if this is a not found error. - #[must_use] - pub fn is_not_found(&self) -> bool { - matches!( - self, - Self::NodeNotFound(_) | Self::DocumentNotFound(_) | Self::IndexNotFound(_) - ) - } - - /// Check if this is a timeout error. - #[must_use] - pub fn is_timeout(&self) -> bool { - matches!(self, Self::Timeout(_) | Self::SearchTimeout(_)) - } - - /// Check if this is a configuration error. - #[must_use] - pub fn is_config_error(&self) -> bool { - matches!(self, Self::Config(_) | Self::MissingConfig(_)) - } - - /// Create an empty input error. - pub fn empty_input(field: impl Into) -> Self { - Self::EmptyInput { - field: field.into(), - } - } - - /// Create an out of range error. 
- pub fn out_of_range(
- field: impl Into<String>,
- min: impl Into<String>,
- max: impl Into<String>,
- actual: impl Into<String>,
- ) -> Self {
- Self::OutOfRange {
- field: field.into(),
- min: min.into(),
- max: max.into(),
- actual: actual.into(),
- }
- }
-}
-
-/// A specialized result type for vectorless operations.
-pub type Result<T> = std::result::Result<T, Error>;
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn test_error_context() {
- let inner = Error::Parse("test".to_string());
- let with_context = inner.with_context("While processing document");
-
- let msg = format!("{}", with_context);
- assert!(msg.contains("While processing document"));
- assert!(msg.contains("test"));
- }
-
- #[test]
- fn test_is_retryable() {
- assert!(Error::RateLimitExceeded(1000).is_retryable());
- assert!(Error::Timeout("test".to_string()).is_retryable());
- assert!(!Error::Config("test".to_string()).is_retryable());
- }
-
- #[test]
- fn test_is_not_found() {
- assert!(Error::NodeNotFound("1".to_string()).is_not_found());
- assert!(Error::DocumentNotFound("doc".to_string()).is_not_found());
- assert!(!Error::Parse("test".to_string()).is_not_found());
- }
-
- #[test]
- fn test_empty_input() {
- let err = Error::empty_input("query");
- let msg = format!("{}", err);
- assert!(msg.contains("query"));
- }
-
- #[test]
- fn test_out_of_range() {
- let err = Error::out_of_range("depth", "0", "10", "15");
- let msg = format!("{}", err);
- assert!(msg.contains("depth"));
- assert!(msg.contains("0"));
- assert!(msg.contains("10"));
- assert!(msg.contains("15"));
- }
-}
diff --git a/vectorless-core/vectorless/src/events/emitter.rs b/vectorless-core/vectorless/src/events/emitter.rs
deleted file mode 100644
index 7804a25c..00000000
--- a/vectorless-core/vectorless/src/events/emitter.rs
+++ /dev/null
@@ -1,256 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Event emitter for client operations.
-//!
-//! Collects event handlers and dispatches events to them.
-//! Uses `Arc<RwLock<...>>` so cloning shares handlers instead of losing them.
-
-use std::sync::Arc;
-
-use parking_lot::RwLock;
-
-use super::types::{IndexEvent, QueryEvent, WorkspaceEvent};
-
-/// Type alias for sync index handler.
-pub(crate) type IndexHandler = Box<dyn Fn(&IndexEvent) + Send + Sync>;
-
-/// Type alias for sync query handler.
-pub(crate) type QueryHandler = Box<dyn Fn(&QueryEvent) + Send + Sync>;
-
-/// Type alias for sync workspace handler.
-pub(crate) type WorkspaceHandler = Box<dyn Fn(&WorkspaceEvent) + Send + Sync>;
-
-/// Inner state shared via `Arc<RwLock<...>>`.
-struct EventEmitterInner {
- /// Index event handlers.
- index_handlers: Vec<IndexHandler>,
-
- /// Query event handlers.
- query_handlers: Vec<QueryHandler>,
-
- /// Workspace event handlers.
- workspace_handlers: Vec<WorkspaceHandler>,
-}
-
-impl Default for EventEmitterInner {
- fn default() -> Self {
- Self {
- index_handlers: Vec::new(),
- query_handlers: Vec::new(),
- workspace_handlers: Vec::new(),
- }
- }
-}
-
-/// Event emitter for client operations.
-///
-/// Collects event handlers and dispatches events to them.
-/// Cloning shares the same handlers (via `Arc`), so all clones
-/// dispatch to the same registered handlers.
-///
-/// # Example
-///
-/// ```rust,ignore
-/// let emitter = EventEmitter::new()
-/// .on_index(|e| match e {
-/// IndexEvent::Complete { doc_id } => println!("Indexed: {}", doc_id),
-/// _ => {}
-/// });
-///
-/// let clone = emitter.clone();
-/// // clone shares the same handlers — emitting on either fires on both
-/// ```
-pub struct EventEmitter {
- inner: Arc<RwLock<EventEmitterInner>>,
-}
-
-impl EventEmitter {
- /// Create a new event emitter with no handlers.
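- ///
- /// A minimal sketch (mirrors the tests below):
- ///
- /// ```rust,ignore
- /// let emitter = EventEmitter::new();
- /// assert!(!emitter.has_handlers());
- /// ```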
- pub fn new() -> Self {
- Self::default()
- }
-
- /// Add an index event handler.
- pub fn on_index<F>(self, handler: F) -> Self
- where
- F: Fn(&IndexEvent) + Send + Sync + 'static,
- {
- self.inner.write().index_handlers.push(Box::new(handler));
- self
- }
-
- /// Add a query event handler.
- pub fn on_query<F>(self, handler: F) -> Self
- where
- F: Fn(&QueryEvent) + Send + Sync + 'static,
- {
- self.inner.write().query_handlers.push(Box::new(handler));
- self
- }
-
- /// Add a workspace event handler.
- pub fn on_workspace<F>(self, handler: F) -> Self
- where
- F: Fn(&WorkspaceEvent) + Send + Sync + 'static,
- {
- self.inner
- .write()
- .workspace_handlers
- .push(Box::new(handler));
- self
- }
-
- /// Emit an index event.
- pub fn emit_index(&self, event: IndexEvent) {
- let inner = self.inner.read();
- for handler in &inner.index_handlers {
- handler(&event);
- }
- }
-
- /// Emit a query event.
- pub fn emit_query(&self, event: QueryEvent) {
- let inner = self.inner.read();
- for handler in &inner.query_handlers {
- handler(&event);
- }
- }
-
- /// Emit a workspace event.
- pub fn emit_workspace(&self, event: WorkspaceEvent) {
- let inner = self.inner.read();
- for handler in &inner.workspace_handlers {
- handler(&event);
- }
- }
-
- /// Check if there are any handlers registered.
- pub fn has_handlers(&self) -> bool {
- let inner = self.inner.read();
- !inner.index_handlers.is_empty()
- || !inner.query_handlers.is_empty()
- || !inner.workspace_handlers.is_empty()
- }
-
- /// Merge another emitter into this one.
- pub fn merge(self, other: EventEmitter) -> Self {
- let mut other_inner = other.inner.write();
- let mut inner = self.inner.write();
- inner
- .index_handlers
- .extend(other_inner.index_handlers.drain(..));
- inner
- .query_handlers
- .extend(other_inner.query_handlers.drain(..));
- inner
- .workspace_handlers
- .extend(other_inner.workspace_handlers.drain(..));
- drop(inner);
- drop(other_inner);
- self
- }
-}
-
-impl Default for EventEmitter {
- fn default() -> Self {
- Self {
- inner: Arc::new(RwLock::new(EventEmitterInner::default())),
- }
- }
-}
-
-impl Clone for EventEmitter {
- fn clone(&self) -> Self {
- Self {
- inner: Arc::clone(&self.inner),
- }
- }
-}
-
-impl std::fmt::Debug for EventEmitter {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- let inner = self.inner.read();
- f.debug_struct("EventEmitter")
- .field("index_handlers", &inner.index_handlers.len())
- .field("query_handlers", &inner.query_handlers.len())
- .field("workspace_handlers", &inner.workspace_handlers.len())
- .finish()
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
- use std::sync::atomic::{AtomicUsize, Ordering};
-
- #[test]
- fn test_event_emitter_index() {
- let counter = Arc::new(AtomicUsize::new(0));
- let counter_clone = counter.clone();
-
- let emitter = EventEmitter::new().on_index(move |_e| {
- counter_clone.fetch_add(1, Ordering::SeqCst);
- });
-
- emitter.emit_index(IndexEvent::Started {
- path: "test.md".to_string(),
- });
- emitter.emit_index(IndexEvent::Complete {
- doc_id: "123".to_string(),
- });
-
- assert_eq!(counter.load(Ordering::SeqCst), 2);
- }
-
- #[test]
- fn test_event_emitter_query() {
- let counter = Arc::new(AtomicUsize::new(0));
- let counter_clone = counter.clone();
-
- let emitter = EventEmitter::new().on_query(move |_e| {
- counter_clone.fetch_add(1, Ordering::SeqCst);
- });
-
- emitter.emit_query(QueryEvent::Started {
- query: "test".to_string(),
- });
-
- assert_eq!(counter.load(Ordering::SeqCst), 1);
- }
-
- #[test]
- fn test_event_emitter_has_handlers() {
- let empty = EventEmitter::new();
- assert!(!empty.has_handlers());
-
- let with_handler = EventEmitter::new().on_index(|_| {});
- assert!(with_handler.has_handlers());
- }
-
- #[test]
- fn test_event_emitter_clone_shares_handlers() {
- let counter = Arc::new(AtomicUsize::new(0));
- let counter_clone = counter.clone();
-
- let emitter = EventEmitter::new().on_index(move |_e| {
- counter_clone.fetch_add(1, Ordering::SeqCst);
- });
-
- let cloned = emitter.clone();
-
- // Emit on the clone — original's handler should fire
- cloned.emit_index(IndexEvent::Started {
- path: "test.md".to_string(),
- });
-
- assert_eq!(counter.load(Ordering::SeqCst), 1);
-
- // Emit on the original too
- emitter.emit_index(IndexEvent::Complete {
- doc_id: "123".to_string(),
- });
-
- assert_eq!(counter.load(Ordering::SeqCst), 2);
- }
-}
diff --git a/vectorless-core/vectorless/src/events/mod.rs b/vectorless-core/vectorless/src/events/mod.rs
deleted file mode 100644
index e8e55df5..00000000
--- a/vectorless-core/vectorless/src/events/mod.rs
+++ /dev/null
@@ -1,31 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Event system for observing and reacting to client operations.
-//!
-//! This module provides event types and the [`EventEmitter`] for
-//! registering handlers and dispatching events during indexing,
-//! querying, and workspace operations.
-//!
-//! # Example
-//!
-//! ```rust,ignore
-//! use vectorless::events::{EventEmitter, IndexEvent};
-//!
-//! let emitter = EventEmitter::new()
-//! .on_index(|e| match e {
-//! IndexEvent::Complete { doc_id } => println!("Indexed: {}", doc_id),
-//! _ => {}
-//! });
-//!
-//! let client = EngineBuilder::new()
-//! .with_events(emitter)
-//! .build()
-//! .await?;
-//! ```
-
-mod emitter;
-mod types;
-
-pub use emitter::EventEmitter;
-pub use types::{IndexEvent, QueryEvent, WorkspaceEvent};
diff --git a/vectorless-core/vectorless/src/events/types.rs b/vectorless-core/vectorless/src/events/types.rs
deleted file mode 100644
index 30903a41..00000000
--- a/vectorless-core/vectorless/src/events/types.rs
+++ /dev/null
@@ -1,138 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Event types for client operations.
-//!
-//! Provides enums for indexing, query, and workspace events
-//! that can be observed via [`EventEmitter`](super::EventEmitter).
-
-use crate::document::DocumentFormat;
-use crate::document::SufficiencyLevel;
-
-/// Indexing operation events.
-#[derive(Debug, Clone)]
-pub enum IndexEvent {
- /// Started indexing a document.
- Started {
- /// File path being indexed.
- path: String,
- },
-
- /// Document format detected.
- FormatDetected {
- /// Detected format.
- format: DocumentFormat,
- },
-
- /// Parsing progress update.
- ParsingProgress {
- /// Percentage complete (0-100).
- percent: u8,
- },
-
- /// Document tree built.
- TreeBuilt {
- /// Number of nodes in the tree.
- node_count: usize,
- },
-
- /// Summary generation progress.
- SummaryProgress {
- /// Number of summaries completed.
- completed: usize,
- /// Total summaries to generate.
- total: usize,
- },
-
- /// Indexing completed successfully.
- Complete {
- /// Generated document ID.
- doc_id: String,
- },
-
- /// Error occurred during indexing.
- Error {
- /// Error message.
- message: String,
- },
-}
-
-/// Query operation events.
-#[derive(Debug, Clone)]
-pub enum QueryEvent {
- /// Search started.
- Started {
- /// The query string.
- query: String,
- },
-
- /// Node visited during search.
- NodeVisited {
- /// Node ID.
- node_id: String,
- /// Node title.
- title: String,
- /// Relevance score.
- score: f32,
- },
-
- /// Candidate result found.
- CandidateFound {
- /// Node ID.
- node_id: String,
- /// Relevance score.
- score: f32,
- },
-
- /// Sufficiency check result.
- SufficiencyCheck {
- /// Sufficiency level.
- level: SufficiencyLevel,
- /// Total tokens collected.
- tokens: usize,
- },
-
- /// Query completed.
- Complete {
- /// Total results found.
- total_results: usize,
- /// Overall confidence score.
- confidence: f32,
- },
-
- /// Error occurred during query.
- Error {
- /// Error message.
- message: String,
- },
-}
-
-/// Workspace operation events.
-#[derive(Debug, Clone)]
-pub enum WorkspaceEvent {
- /// Document saved to workspace.
- Saved {
- /// Document ID.
- doc_id: String,
- },
-
- /// Document loaded from workspace.
- Loaded {
- /// Document ID.
- doc_id: String,
- /// Whether it was a cache hit.
- cache_hit: bool,
- },
-
- /// Document removed from workspace.
- Removed {
- /// Document ID.
- doc_id: String,
- },
-
- /// Workspace cleared.
- Cleared {
- /// Number of documents removed.
- count: usize,
- },
-}
diff --git a/vectorless-core/vectorless/src/graph/builder.rs b/vectorless-core/vectorless/src/graph/builder.rs
deleted file mode 100644
index 6cdf388b..00000000
--- a/vectorless-core/vectorless/src/graph/builder.rs
+++ /dev/null
@@ -1,400 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Document Graph Builder — constructs cross-document relationship graphs.
-//!
-//! This is a standalone builder (not an `IndexStage`) because it operates
-//! on the workspace level across all documents, not on a single document.
-
-use std::collections::HashMap;
-
-use tracing::info;
-
-use super::config::DocumentGraphConfig;
-use super::types::{
- DocumentGraph, DocumentGraphNode, EdgeEvidence, GraphEdge, SharedKeyword, WeightedKeyword,
-};
-
-/// Intermediate data collected per document during graph building.
-#[derive(Debug, Clone)]
-struct DocProfile {
- doc_id: String,
- title: String,
- format: String,
- node_count: usize,
- /// keyword → aggregate weight
- keywords: HashMap<String, f32>,
-}
-
-/// Builder for constructing a `DocumentGraph` from multiple documents.
-pub struct DocumentGraphBuilder {
- config: DocumentGraphConfig,
- profiles: Vec<DocProfile>,
-}
-
-impl DocumentGraphBuilder {
- /// Create a new builder with the given configuration.
- pub fn new(config: DocumentGraphConfig) -> Self {
- Self {
- config,
- profiles: Vec::new(),
- }
- }
-
- /// Create a builder with default configuration.
- pub fn with_defaults() -> Self {
- Self::new(DocumentGraphConfig::default())
- }
-
- /// Add a document's keyword profile to the builder.
- ///
- /// `keywords` should map keyword → aggregate weight (from
- /// `ReasoningIndex::topic_paths` or extracted from content).
- pub fn add_document(
- &mut self,
- doc_id: impl Into<String>,
- title: impl Into<String>,
- format: impl Into<String>,
- node_count: usize,
- keywords: HashMap<String, f32>,
- ) {
- self.profiles.push(DocProfile {
- doc_id: doc_id.into(),
- title: title.into(),
- format: format.into(),
- node_count,
- keywords,
- });
- }
-
- /// Build the document graph from accumulated document profiles.
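- ///
- /// A minimal sketch (the `keywords` map is assumed to come from the
- /// document's reasoning index):
- ///
- /// ```ignore
- /// let mut builder = DocumentGraphBuilder::with_defaults();
- /// builder.add_document("doc1", "Rust Guide", "md", 10, keywords);
- /// let graph = builder.build();
- /// assert_eq!(graph.node_count(), 1);
- /// ```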
- pub fn build(self) -> DocumentGraph {
- let mut graph = DocumentGraph::new();
-
- if self.profiles.is_empty() {
- info!("Building document graph: 0 documents, empty graph");
- return graph;
- }
-
- // Step 1: Add document nodes with top-N keywords
- for profile in &self.profiles {
- let mut weighted: Vec<WeightedKeyword> = profile
- .keywords
- .iter()
- .map(|(kw, &w)| WeightedKeyword {
- keyword: kw.clone(),
- weight: w,
- })
- .collect();
- // Sort by weight descending
- weighted.sort_by(|a, b| {
- b.weight
- .partial_cmp(&a.weight)
- .unwrap_or(std::cmp::Ordering::Equal)
- });
- weighted.truncate(self.config.max_keywords_per_doc);
-
- graph.add_node(DocumentGraphNode {
- doc_id: profile.doc_id.clone(),
- title: profile.title.clone(),
- format: profile.format.clone(),
- top_keywords: weighted,
- node_count: profile.node_count,
- });
- }
-
- info!(
- "Building document graph: {} document nodes added",
- graph.node_count()
- );
-
- // Step 2: Compute edges using the keyword inverted index
- // (already built inside graph.add_node via keyword_index)
- self.compute_edges(&mut graph);
-
- info!(
- "Document graph built: {} nodes, {} edges",
- graph.node_count(),
- graph.edge_count()
- );
-
- graph
- }
-
- /// Compute edges between documents based on shared keywords.
- fn compute_edges(&self, graph: &mut DocumentGraph) {
- // Collect candidate pairs: (doc_a, doc_b) → shared keywords
- let mut pair_shared: HashMap<(String, String), Vec<SharedKeyword>> = HashMap::new();
-
- // Iterate the keyword index: for each keyword, all docs sharing it are candidates
- let kw_index = graph.keyword_index_clone();
-
- for (keyword, entries) in &kw_index {
- if entries.len() < 2 {
- continue; // No pair possible
- }
- // For every pair of documents sharing this keyword
- for i in 0..entries.len() {
- for j in (i + 1)..entries.len() {
- let a = &entries[i];
- let b = &entries[j];
- let pair = if a.doc_id < b.doc_id {
- (a.doc_id.clone(), b.doc_id.clone())
- } else {
- (b.doc_id.clone(), a.doc_id.clone())
- };
- let shared = SharedKeyword {
- keyword: keyword.clone(),
- source_weight: a.weight,
- target_weight: b.weight,
- };
- pair_shared.entry(pair).or_default().push(shared);
- }
- }
- }
-
- // Step 3: Create edges for pairs that meet thresholds
- for ((doc_a, doc_b), shared_kws) in pair_shared {
- let shared_count = shared_kws.len();
- if shared_count < self.config.min_shared_keywords {
- continue;
- }
-
- // Compute Jaccard: |intersection| / |union|
- let kw_a = graph
- .get_node(&doc_a)
- .map(|n| n.top_keywords.len())
- .unwrap_or(0);
- let kw_b = graph
- .get_node(&doc_b)
- .map(|n| n.top_keywords.len())
- .unwrap_or(0);
- let union_size = kw_a + kw_b - shared_count;
- let jaccard = if union_size > 0 {
- shared_count as f32 / union_size as f32
- } else {
- 0.0
- };
-
- if jaccard < self.config.min_keyword_jaccard {
- continue;
- }
-
- // Edge weight: combine Jaccard with keyword count
- let max_kws = self.config.max_keywords_per_doc.max(1) as f32;
- let weight = (jaccard * 0.6 + (shared_count as f32 / max_kws).min(1.0) * 0.4).min(1.0);
-
- // Create bidirectional edges
- let evidence_a = EdgeEvidence {
- shared_keywords: shared_kws.clone(),
- shared_keyword_count: shared_count,
- keyword_jaccard: jaccard,
- };
- let evidence_b = EdgeEvidence {
- shared_keywords: shared_kws
- .iter()
- .map(|s| SharedKeyword {
- keyword: s.keyword.clone(),
- source_weight: s.target_weight,
- target_weight: s.source_weight,
- })
- .collect(),
- shared_keyword_count: shared_count,
- keyword_jaccard: jaccard,
- };
-
- graph.add_edge(
- &doc_a,
- GraphEdge {
- target_doc_id: doc_b.clone(),
- weight,
- evidence: evidence_a,
- },
- );
- graph.add_edge(
- &doc_b,
- GraphEdge {
- target_doc_id: doc_a.clone(),
- weight,
- evidence: evidence_b,
- },
- );
- }
-
- // Step 4: Trim edges per node to max_edges_per_node
- self.trim_edges(graph);
- }
-
- /// Trim edges per node to the configured maximum.
- fn trim_edges(&self, graph: &mut DocumentGraph) {
- let max = self.config.max_edges_per_node;
- let all_edges = graph.take_edges();
- let mut trimmed: HashMap<String, Vec<GraphEdge>> = HashMap::new();
-
- for (source, mut edges) in all_edges {
- edges.sort_by(|a, b| {
- b.weight
- .partial_cmp(&a.weight)
- .unwrap_or(std::cmp::Ordering::Equal)
- });
- edges.truncate(max);
- trimmed.insert(source, edges);
- }
-
- graph.set_edges(trimmed);
- }
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- fn make_keywords(pairs: &[(&str, f32)]) -> HashMap<String, f32> {
- pairs.iter().map(|&(k, w)| (k.to_string(), w)).collect()
- }
-
- #[test]
- fn test_empty_workspace() {
- let builder = DocumentGraphBuilder::with_defaults();
- let graph = builder.build();
- assert!(graph.is_empty());
- }
-
- #[test]
- fn test_single_document() {
- let mut builder = DocumentGraphBuilder::with_defaults();
- builder.add_document(
- "doc1",
- "Test",
- "md",
- 5,
- make_keywords(&[("rust", 0.9), ("async", 0.7)]),
- );
- let graph = builder.build();
- assert_eq!(graph.node_count(), 1);
- assert_eq!(graph.edge_count(), 0);
- }
-
- #[test]
- fn test_two_docs_shared_keywords() {
- let mut builder = DocumentGraphBuilder::new(DocumentGraphConfig {
- min_keyword_jaccard: 0.05,
- min_shared_keywords: 2,
- ..DocumentGraphConfig::default()
- });
- builder.add_document(
- "doc1",
- "Rust Programming",
- "md",
- 10,
- make_keywords(&[("rust", 0.9), ("async", 0.8), ("tokio", 0.6)]),
- );
- builder.add_document(
- "doc2",
- "Async Rust",
- "md",
- 8,
- make_keywords(&[("rust", 0.7), ("async", 0.9), ("futures", 0.5)]),
- );
-
- let graph = builder.build();
- assert_eq!(graph.node_count(), 2);
- // Should have bidirectional edges
- assert!(graph.edge_count() >= 2);
-
- // Check doc1 → doc2 edge
- let neighbors = graph.get_neighbors("doc1");
- assert_eq!(neighbors.len(), 1);
- assert_eq!(neighbors[0].target_doc_id, "doc2");
- assert!(neighbors[0].weight > 0.0);
- assert!(neighbors[0].evidence.keyword_jaccard > 0.0);
- assert!(neighbors[0].evidence.shared_keyword_count >= 2);
-
- // Check doc2 → doc1 edge (bidirectional)
- let neighbors2 = graph.get_neighbors("doc2");
- assert_eq!(neighbors2.len(), 1);
- assert_eq!(neighbors2[0].target_doc_id, "doc1");
- }
-
- #[test]
- fn test_unrelated_docs_no_edge() {
- let mut builder = DocumentGraphBuilder::new(DocumentGraphConfig {
- min_keyword_jaccard: 0.1,
- min_shared_keywords: 2,
- ..DocumentGraphConfig::default()
- });
- builder.add_document(
- "doc1",
- "Rust Guide",
- "md",
- 10,
- make_keywords(&[("rust", 0.9), ("ownership", 0.8)]),
- );
- builder.add_document(
- "doc2",
- "Cooking Recipes",
- "md",
- 8,
- make_keywords(&[("pasta", 0.9), ("sauce", 0.8)]),
- );
-
- let graph = builder.build();
- assert_eq!(graph.node_count(), 2);
- assert_eq!(graph.edge_count(), 0);
- }
-
- #[test]
- fn test_jaccard_threshold() {
- let mut builder = DocumentGraphBuilder::new(DocumentGraphConfig {
- min_keyword_jaccard: 0.9, // Very high threshold
- min_shared_keywords: 1,
- ..DocumentGraphConfig::default()
- });
- // Two docs with minimal overlap
- builder.add_document(
- "doc1",
- "A",
- "md",
- 5,
- make_keywords(&[("a", 0.9), ("b", 0.8), ("c", 0.7), ("d", 0.6), ("e", 0.5)]),
- );
- builder.add_document(
- "doc2",
- "B",
- "md",
- 5,
- make_keywords(&[("a", 0.9), ("x", 0.8), ("y", 0.7), ("z", 0.6)]),
- );
-
- let graph = builder.build();
- // Only 1 shared keyword out of 5+4=9 unique, Jaccard = 1/8 ≈ 0.125
- // Way below 0.9 threshold → no edge
- assert_eq!(graph.edge_count(), 0);
- }
-
- #[test]
- fn test_max_edges_per_node() {
- let mut builder = DocumentGraphBuilder::new(DocumentGraphConfig {
- min_keyword_jaccard: 0.01,
- min_shared_keywords: 1,
- max_edges_per_node: 2,
- ..DocumentGraphConfig::default()
- });
-
- // 4 docs all sharing keywords with doc0
- for i in 0..4 {
- builder.add_document(
- format!("doc{}", i),
- format!("Doc {}", i),
- "md",
- 5,
- make_keywords(&[("shared", 0.9), ("common", 0.8)]),
- );
- }
-
- let graph = builder.build();
- // doc0 should have at most 2 outgoing edges
- let neighbors = graph.get_neighbors("doc0");
- assert!(neighbors.len() <= 2);
- }
-}
diff --git a/vectorless-core/vectorless/src/graph/config.rs b/vectorless-core/vectorless/src/graph/config.rs
deleted file mode 100644
index 40b1d888..00000000
--- a/vectorless-core/vectorless/src/graph/config.rs
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Configuration for document graph building and retrieval.
-
-use serde::{Deserialize, Serialize};
-
-/// Configuration for building the document graph.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DocumentGraphConfig {
- /// Whether graph building is enabled.
- pub enabled: bool,
- /// Minimum Jaccard similarity for creating an edge.
- pub min_keyword_jaccard: f32,
- /// Minimum shared keywords to create an edge.
- pub min_shared_keywords: usize,
- /// Maximum top keywords per document node.
- pub max_keywords_per_doc: usize,
- /// Maximum edges per document node.
- pub max_edges_per_node: usize,
- /// Boost factor applied to graph-connected documents during retrieval.
- pub retrieval_boost_factor: f32,
-}
-
-impl Default for DocumentGraphConfig {
- fn default() -> Self {
- Self {
- enabled: true,
- min_keyword_jaccard: 0.1,
- min_shared_keywords: 2,
- max_keywords_per_doc: 50,
- max_edges_per_node: 20,
- retrieval_boost_factor: 0.15,
- }
- }
-}
-
-impl DocumentGraphConfig {
- /// Create a new config with defaults.
- pub fn new() -> Self {
- Self::default()
- }
-
- /// Create a disabled config.
- pub fn disabled() -> Self {
- Self {
- enabled: false,
- ..Self::default()
- }
- }
-}
diff --git a/vectorless-core/vectorless/src/graph/mod.rs b/vectorless-core/vectorless/src/graph/mod.rs
deleted file mode 100644
index f1b48862..00000000
--- a/vectorless-core/vectorless/src/graph/mod.rs
+++ /dev/null
@@ -1,38 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Document graph module — workspace-level cross-document relationship graph.
-//!
-//! This module provides:
-//! - [`DocumentGraph`] — the graph data structure connecting documents by shared concepts
-//! - [`DocumentGraphBuilder`] — constructs the graph from document keyword profiles
-//! - [`DocumentGraphConfig`] — configuration for graph building and retrieval boosting
-//!
-//! The document graph is a workspace-scoped, weighted graph built from each document's
-//! [`ReasoningIndex`](crate::document::ReasoningIndex) keyword data. It enables
-//! graph-aware retrieval ranking where connected documents receive a relevance boost.
-//!
-//! # Data Flow
-//!
-//! ```text
-//! Document Indexing → ReasoningIndex (topic_paths)
-//! ↓
-//! DocumentGraphBuilder::add_document()
-//! ↓
-//! DocumentGraph
-//! ↓
-//! Workspace::set_graph()
-//! ↓
-//! Engine::query() loads graph
-//! ↓
-//! CrossDocumentStrategy (graph boosting)
-//! ```
-
-mod builder;
-mod config;
-mod types;
-
-// Re-export public API
-pub use builder::DocumentGraphBuilder;
-pub use config::DocumentGraphConfig;
-pub use types::{DocumentGraph, DocumentGraphNode, EdgeEvidence, GraphEdge, WeightedKeyword};
diff --git a/vectorless-core/vectorless/src/graph/types.rs b/vectorless-core/vectorless/src/graph/types.rs
deleted file mode 100644
index 08f8d00a..00000000
--- a/vectorless-core/vectorless/src/graph/types.rs
+++ /dev/null
@@ -1,310 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Document Graph data types.
-//!
-//! Core data structures for the workspace-scoped, weighted document relationship graph.
-
-use std::collections::HashMap;
-
-use serde::{Deserialize, Serialize};
-
-/// A workspace-scoped document relationship graph.
-///
-/// Nodes represent documents, edges represent relationships (shared keywords,
-/// references). The graph is immutable after construction and can be shared
-/// across threads via `Arc`.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DocumentGraph {
- /// All document nodes, indexed by doc_id.
- nodes: HashMap<String, DocumentGraphNode>,
-
- /// Adjacency list: doc_id → outgoing edges.
- edges: HashMap<String, Vec<GraphEdge>>,
-
- /// Inverted index: keyword → documents containing this keyword.
- keyword_index: HashMap<String, Vec<KeywordDocEntry>>,
-
- /// Graph-level metadata.
- metadata: GraphMetadata,
-}
-
-/// Expose fields for graph builder (same module).
-impl DocumentGraph {
- /// Take all edges out, leaving an empty map in their place.
- pub(crate) fn take_edges(&mut self) -> HashMap<String, Vec<GraphEdge>> {
- std::mem::take(&mut self.edges)
- }
-
- /// Set edges directly (used by builder after trimming).
- pub(crate) fn set_edges(&mut self, edges: HashMap<String, Vec<GraphEdge>>) {
- self.metadata.edge_count = edges.values().map(|v| v.len()).sum();
- self.edges = edges;
- }
-
- /// Get a clone of the keyword index (used by builder for edge computation).
- pub(crate) fn keyword_index_clone(&self) -> HashMap<String, Vec<KeywordDocEntry>> {
- self.keyword_index.clone()
- }
-}
-
-impl DocumentGraph {
- /// Create a new empty document graph.
- pub fn new() -> Self {
- Self {
- nodes: HashMap::new(),
- edges: HashMap::new(),
- keyword_index: HashMap::new(),
- metadata: GraphMetadata {
- document_count: 0,
- edge_count: 0,
- },
- }
- }
-
- /// Add a document node to the graph.
- pub fn add_node(&mut self, node: DocumentGraphNode) {
- // Populate keyword index from the node's top keywords
- for kw in &node.top_keywords {
- self.keyword_index
- .entry(kw.keyword.clone())
- .or_default()
- .push(KeywordDocEntry {
- doc_id: node.doc_id.clone(),
- weight: kw.weight,
- });
- }
- let doc_id = node.doc_id.clone();
- self.nodes.insert(doc_id, node);
- self.metadata.document_count = self.nodes.len();
- }
-
- /// Add a directed edge from `source` to `target`.
- pub fn add_edge(&mut self, source: &str, edge: GraphEdge) {
- self.edges.entry(source.to_string()).or_default().push(edge);
- self.metadata.edge_count = self.edges.values().map(|v| v.len()).sum();
- }
-
- /// Get a document node by ID.
- pub fn get_node(&self, doc_id: &str) -> Option<&DocumentGraphNode> {
- self.nodes.get(doc_id)
- }
-
- /// Get all edges outgoing from a document.
- pub fn get_neighbors(&self, doc_id: &str) -> &[GraphEdge] {
- self.edges.get(doc_id).map_or(&[], Vec::as_slice)
- }
-
- /// Find documents containing a keyword.
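- ///
- /// A minimal sketch (hypothetical keyword):
- ///
- /// ```ignore
- /// for entry in graph.find_by_keyword("rust") {
- /// println!("{} ({})", entry.doc_id, entry.weight);
- /// }
- /// ```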
- pub fn find_by_keyword(&self, keyword: &str) -> &[KeywordDocEntry] {
- self.keyword_index.get(keyword).map_or(&[], Vec::as_slice)
- }
-
- /// Get the number of documents in the graph.
- pub fn node_count(&self) -> usize {
- self.nodes.len()
- }
-
- /// Get the number of edges in the graph.
- pub fn edge_count(&self) -> usize {
- self.edges.values().map(|v| v.len()).sum()
- }
-
- /// Get all document IDs in the graph.
- pub fn doc_ids(&self) -> impl Iterator<Item = &str> {
- self.nodes.keys().map(|s| s.as_str())
- }
-
- /// Get graph metadata.
- pub fn metadata(&self) -> &GraphMetadata {
- &self.metadata
- }
-
- /// Check if the graph is empty.
- pub fn is_empty(&self) -> bool {
- self.nodes.is_empty()
- }
-}
-
-impl Default for DocumentGraph {
- fn default() -> Self {
- Self::new()
- }
-}
-
-/// A document node in the graph.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DocumentGraphNode {
- /// Document ID (matches `PersistedDocument.meta.id`).
- pub doc_id: String,
- /// Document title/name.
- pub title: String,
- /// Document format (md, pdf).
- pub format: String,
- /// Top-N representative keywords extracted from the document's
- /// ReasoningIndex topic_paths, sorted by aggregate weight.
- pub top_keywords: Vec<WeightedKeyword>,
- /// Number of nodes in the document tree.
- pub node_count: usize,
-}
-
-/// A keyword with its aggregate weight across the document.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct WeightedKeyword {
- /// The keyword string (lowercased).
- pub keyword: String,
- /// Aggregate weight across all TopicEntry instances (0.0 - 1.0).
- pub weight: f32,
-}
-
-/// An edge connecting two documents.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct GraphEdge {
- /// Target document ID.
- pub target_doc_id: String,
- /// Edge weight (0.0 - 1.0). Higher = stronger relationship.
- pub weight: f32,
- /// Evidence for why these documents are connected.
- pub evidence: EdgeEvidence,
-}
-
-/// Evidence for why two documents are connected.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct EdgeEvidence {
- /// Keywords shared between the two documents.
- pub shared_keywords: Vec<SharedKeyword>,
- /// Number of shared keywords.
- pub shared_keyword_count: usize,
- /// Jaccard similarity of keyword sets.
- pub keyword_jaccard: f32,
-}
-
-/// A keyword shared between two documents.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct SharedKeyword {
- /// The shared keyword.
- pub keyword: String,
- /// Weight in source document.
- pub source_weight: f32,
- /// Weight in target document.
- pub target_weight: f32,
-}
-
-/// Entry in the keyword inverted index.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct KeywordDocEntry {
- /// Document ID containing this keyword.
- pub doc_id: String,
- /// Weight of this keyword in the document.
- pub weight: f32,
-}
-
-/// Graph-level metadata.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct GraphMetadata {
- /// Number of documents in the graph.
- pub document_count: usize,
- /// Number of edges in the graph.
- pub edge_count: usize,
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn test_empty_graph() {
- let graph = DocumentGraph::new();
- assert!(graph.is_empty());
- assert_eq!(graph.node_count(), 0);
- assert_eq!(graph.edge_count(), 0);
- }
-
- #[test]
- fn test_add_node() {
- let mut graph = DocumentGraph::new();
- graph.add_node(DocumentGraphNode {
- doc_id: "doc1".to_string(),
- title: "Test Doc".to_string(),
- format: "md".to_string(),
- top_keywords: vec![
- WeightedKeyword {
- keyword: "rust".to_string(),
- weight: 0.9,
- },
- WeightedKeyword {
- keyword: "async".to_string(),
- weight: 0.7,
- },
- ],
- node_count: 10,
- });
-
- assert_eq!(graph.node_count(), 1);
- assert!(graph.get_node("doc1").is_some());
- assert_eq!(graph.find_by_keyword("rust").len(), 1);
- assert_eq!(graph.find_by_keyword("async").len(), 1);
- assert_eq!(graph.find_by_keyword("missing").len(), 0);
- }
-
- #[test]
- fn test_add_edge() {
- let mut graph = DocumentGraph::new();
- graph.add_node(DocumentGraphNode {
- doc_id: "doc1".to_string(),
- title: "A".to_string(),
- format: "md".to_string(),
- top_keywords: vec![],
- node_count: 5,
- });
- graph.add_node(DocumentGraphNode {
- doc_id: "doc2".to_string(),
- title: "B".to_string(),
- format: "md".to_string(),
- top_keywords: vec![],
- node_count: 8,
- });
-
- graph.add_edge(
- "doc1",
- GraphEdge {
- target_doc_id: "doc2".to_string(),
- weight: 0.5,
- evidence: EdgeEvidence {
- shared_keywords: vec![SharedKeyword {
- keyword: "rust".to_string(),
- source_weight: 0.9,
- target_weight: 0.8,
- }],
- shared_keyword_count: 1,
- keyword_jaccard: 0.3,
- },
- },
- );
-
- assert_eq!(graph.edge_count(), 1);
- assert_eq!(graph.get_neighbors("doc1").len(), 1);
- assert_eq!(graph.get_neighbors("doc1")[0].target_doc_id, "doc2");
- assert_eq!(graph.get_neighbors("doc2").len(), 0);
- }
-
- #[test]
- fn test_serialization_roundtrip() {
- let mut graph = DocumentGraph::new();
- graph.add_node(DocumentGraphNode {
- doc_id: "doc1".to_string(),
- title: "Test".to_string(),
- format: "md".to_string(),
- top_keywords: vec![WeightedKeyword {
- keyword: "test".to_string(),
- weight: 1.0,
- }],
- node_count: 3,
- });
-
- let json = serde_json::to_string(&graph).unwrap();
- let deserialized: DocumentGraph = serde_json::from_str(&json).unwrap();
- assert_eq!(deserialized.node_count(), 1);
- assert_eq!(deserialized.get_node("doc1").unwrap().title, "Test");
- }
-}
diff --git a/vectorless-core/vectorless/src/index/config.rs b/vectorless-core/vectorless/src/index/config.rs
deleted file mode 100644
index 798951b1..00000000
--- a/vectorless-core/vectorless/src/index/config.rs
+++ /dev/null
@@ -1,389 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Configuration types for the index pipeline.
-//!
-//! This module contains all configuration types used by the indexing pipeline:
-//! - [`IndexMode`] - Document format selection
-//! - [`PipelineOptions`] - Full pipeline configuration
-//! - [`OptimizationConfig`] - Tree optimization settings
-//! - [`ThinningConfig`] - Node merging settings
-
-use super::summary::SummaryStrategy;
-use crate::config::IndexerConfig;
-use crate::document::{DocumentTree, ReasoningIndexConfig};
-use crate::llm::throttle::ConcurrencyConfig;
-use crate::utils::fingerprint::{Fingerprint, Fingerprinter};
-
-use std::path::PathBuf;
-
-/// Index mode for document processing.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum IndexMode {
- /// Auto-detect format from file extension.
- Auto,
- /// Force Markdown format.
- Markdown,
- /// Force PDF format.
- Pdf,
-}
-
-impl Default for IndexMode {
- fn default() -> Self {
- Self::Auto
- }
-}
-
-/// Configuration for tree optimization.
-#[derive(Debug, Clone)]
-pub struct OptimizationConfig {
- /// Whether optimization is enabled.
- pub enabled: bool,
-
- /// Maximum tree depth (flatten if exceeded).
- pub max_depth: Option<usize>,
-
- /// Maximum children per node (group if exceeded).
- pub max_children: Option<usize>,
-
- /// Minimum tokens for a leaf node (merge smaller ones).
- pub merge_leaf_threshold: usize,
-}
-
-impl Default for OptimizationConfig {
- fn default() -> Self {
- Self {
- enabled: true,
- max_depth: None,
- max_children: None,
- merge_leaf_threshold: 0,
- }
- }
-}
-
-impl OptimizationConfig {
- /// Create a new optimization config with defaults.
- pub fn new() -> Self {
- Self::default()
- }
-
- /// Disable optimization entirely.
- pub fn disabled() -> Self {
- Self {
- enabled: false,
- ..Self::default()
- }
- }
-
- /// Set maximum depth.
- pub fn with_max_depth(mut self, depth: usize) -> Self {
- self.max_depth = Some(depth);
- self
- }
-
- /// Set maximum children per node.
- pub fn with_max_children(mut self, max: usize) -> Self {
- self.max_children = Some(max);
- self
- }
-}
-
-/// Configuration for thinning (merging small nodes).
-#[derive(Debug, Clone)]
-pub struct ThinningConfig {
- /// Whether thinning is enabled.
- pub enabled: bool,
-
- /// Token threshold for merging.
- pub threshold: usize,
-
- /// Whether to merge child content into the parent when removing children.
- /// When true, nodes below threshold absorb their children's text before removal.
- /// When false, small nodes are simply discarded.
- pub merge_content: bool,
-}
-
-impl Default for ThinningConfig {
- fn default() -> Self {
- Self {
- enabled: false,
- threshold: 500,
- merge_content: true,
- }
- }
-}
-
-impl ThinningConfig {
- /// Create disabled config.
- pub fn disabled() -> Self {
- Self::default()
- }
-
- /// Create enabled config with threshold.
- pub fn enabled(threshold: usize) -> Self {
- Self {
- enabled: true,
- threshold,
- merge_content: true,
- }
- }
-
- /// Set the token threshold.
- pub fn with_threshold(mut self, threshold: usize) -> Self {
- self.threshold = threshold;
- self
- }
-
- /// Set whether to merge content.
- pub fn with_merge_content(mut self, merge: bool) -> Self {
- self.merge_content = merge;
- self
- }
-}
-
-/// Configuration for large node splitting.
-#[derive(Debug, Clone)]
-pub struct SplitConfig {
- /// Whether splitting is enabled.
- pub enabled: bool,
-
- /// Maximum tokens per leaf node. Nodes exceeding this are split.
- pub max_tokens_per_node: usize,
-
- /// Whether to use pattern-based splitting (headings, paragraphs).
- /// When false, splits at approximate byte boundaries.
- pub pattern_split: bool,
-}
-
-impl Default for SplitConfig {
- fn default() -> Self {
- Self {
- enabled: true,
- max_tokens_per_node: 4000,
- pattern_split: true,
- }
- }
-}
-
-impl SplitConfig {
- /// Create disabled config.
- pub fn disabled() -> Self {
- Self {
- enabled: false,
- ..Self::default()
- }
- }
-
- /// Create enabled config with custom token limit.
- pub fn with_max_tokens(mut self, max: usize) -> Self {
- self.max_tokens_per_node = max;
- self
- }
-
- /// Set whether to use pattern-based splitting.
- pub fn with_pattern_split(mut self, pattern: bool) -> Self {
- self.pattern_split = pattern;
- self
- }
-}
-
-/// Pipeline options for index execution.
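-///
-/// # Example
-///
-/// A minimal sketch (mirrors the builder test below):
-///
-/// ```ignore
-/// let options = PipelineOptions::new()
-/// .with_mode(IndexMode::Markdown)
-/// .with_generate_ids(false);
-/// ```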
-#[derive(Debug, Clone)] -pub struct PipelineOptions { - /// Index mode. - pub mode: IndexMode, - - /// Whether to generate node IDs. - pub generate_ids: bool, - - /// Summary generation strategy. - pub summary_strategy: SummaryStrategy, - - /// Thinning configuration. - pub thinning: ThinningConfig, - - /// Optimization configuration. - pub optimization: OptimizationConfig, - - /// Split configuration. - pub split: SplitConfig, - - /// Whether to generate document description. - pub generate_description: bool, - - /// Concurrency configuration. - pub concurrency: ConcurrencyConfig, - - /// Indexer configuration. - pub indexer: IndexerConfig, - - /// Reasoning index configuration. - pub reasoning_index: ReasoningIndexConfig, - - /// Existing tree from a previous index (for incremental updates). - /// Stages (enhance, reasoning) can reuse data from unchanged nodes. - pub existing_tree: Option, - - /// Current processing version. Bumped when indexing algorithm changes - /// to force reprocessing of existing documents. - pub processing_version: u32, - - /// Directory for pipeline checkpoints. - /// When set, the pipeline saves state after each stage group - /// and can resume from the last completed stage on restart. - /// When `None`, checkpointing is disabled. - pub checkpoint_dir: Option, -} - -impl Default for PipelineOptions { - fn default() -> Self { - Self { - mode: IndexMode::Auto, - generate_ids: true, - summary_strategy: SummaryStrategy::full(), - thinning: ThinningConfig::default(), - optimization: OptimizationConfig::default(), - split: SplitConfig::default(), - generate_description: true, - concurrency: ConcurrencyConfig::default(), - indexer: IndexerConfig::default(), - reasoning_index: ReasoningIndexConfig::default(), - existing_tree: None, - processing_version: 1, - checkpoint_dir: None, - } - } -} - -impl PipelineOptions { - /// Create new pipeline options with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the index mode. - pub fn with_mode(mut self, mode: IndexMode) -> Self { - self.mode = mode; - self - } - - /// Set whether to generate node IDs. - pub fn with_generate_ids(mut self, generate: bool) -> Self { - self.generate_ids = generate; - self - } - - /// Set the summary strategy. - pub fn with_summary_strategy(mut self, strategy: SummaryStrategy) -> Self { - self.summary_strategy = strategy; - self - } - - /// Set the thinning configuration. - pub fn with_thinning(mut self, thinning: ThinningConfig) -> Self { - self.thinning = thinning; - self - } - - /// Set the optimization configuration. - pub fn with_optimization(mut self, optimization: OptimizationConfig) -> Self { - self.optimization = optimization; - self - } - - /// Set the split configuration. - pub fn with_split(mut self, split: SplitConfig) -> Self { - self.split = split; - self - } - - /// Set whether to generate document description. - pub fn with_generate_description(mut self, generate: bool) -> Self { - self.generate_description = generate; - self - } - - /// Set the concurrency configuration. - pub fn with_concurrency(mut self, concurrency: ConcurrencyConfig) -> Self { - self.concurrency = concurrency; - self - } - - /// Set the indexer configuration. - pub fn with_indexer(mut self, indexer: IndexerConfig) -> Self { - self.indexer = indexer; - self - } - - /// Set the reasoning index configuration. - pub fn with_reasoning_index(mut self, config: ReasoningIndexConfig) -> Self { - self.reasoning_index = config; - self - } - - /// Set the checkpoint directory. 
- /// - /// When set, the pipeline saves state after each stage group - /// and can resume from the last completed stage on restart. - pub fn with_checkpoint_dir(mut self, dir: impl Into) -> Self { - self.checkpoint_dir = Some(dir.into()); - self - } - - /// Compute a fingerprint of the pipeline configuration. - /// - /// If this fingerprint changes between runs, all documents need full reprocessing - /// even if their content hasn't changed (because the processing logic is different). - pub fn logic_fingerprint(&self) -> Fingerprint { - Fingerprinter::new() - .with_str(&format!("{:?}", self.mode)) - .with_bool(self.generate_ids) - .with_str(&format!("{:?}", self.summary_strategy)) - .with_bool(self.generate_description) - .with_bool(self.optimization.enabled) - .with_str(&format!("{:?}", self.reasoning_index)) - .into_fingerprint() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_index_mode_default() { - let mode = IndexMode::default(); - assert_eq!(mode, IndexMode::Auto); - } - - #[test] - fn test_optimization_config() { - let config = OptimizationConfig::new() - .with_max_depth(5) - .with_max_children(10); - - assert!(config.enabled); - assert_eq!(config.max_depth, Some(5)); - assert_eq!(config.max_children, Some(10)); - } - - #[test] - fn test_thinning_config() { - let config = ThinningConfig::enabled(300); - assert!(config.enabled); - assert_eq!(config.threshold, 300); - - let disabled = ThinningConfig::disabled(); - assert!(!disabled.enabled); - } - - #[test] - fn test_pipeline_options_builder() { - let options = PipelineOptions::new() - .with_mode(IndexMode::Markdown) - .with_generate_ids(false); - - assert_eq!(options.mode, IndexMode::Markdown); - assert!(!options.generate_ids); - } -} diff --git a/vectorless-core/vectorless/src/index/incremental/detector.rs b/vectorless-core/vectorless/src/index/incremental/detector.rs deleted file mode 100644 index 23107bb1..00000000 --- a/vectorless-core/vectorless/src/index/incremental/detector.rs +++ /dev/null @@ -1,654 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Change detection for incremental updates. -//! -//! This module provides fine-grained change detection using subtree fingerprints, -//! enabling precise identification of changed nodes without full reprocessing. - -use std::collections::HashMap; -use std::hash::{Hash, Hasher}; -use std::path::Path; -use std::time::SystemTime; - -use serde::{Deserialize, Serialize}; - -use crate::document::{DocumentTree, NodeId}; -use crate::utils::fingerprint::{Fingerprint, Fingerprinter, NodeFingerprint}; - -/// Type of change detected. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] -pub enum ChangeType { - /// Node was added. - Added, - /// Node was removed. - Removed, - /// Node content changed. - Modified, - /// Node structure changed (children added/removed). - Restructured, -} - -impl std::fmt::Display for ChangeType { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - ChangeType::Added => write!(f, "added"), - ChangeType::Removed => write!(f, "removed"), - ChangeType::Modified => write!(f, "modified"), - ChangeType::Restructured => write!(f, "restructured"), - } - } -} - -/// A single change in the document. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct NodeChange { - /// Node ID (from old tree). - pub node_id: Option, - /// Node title (for human-readable output). - pub title: String, - /// Type of change. 
-    pub change_type: ChangeType,
-    /// Node fingerprint (for modified nodes).
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub fingerprint: Option<NodeFingerprint>,
-}
-
-impl NodeChange {
-    /// Create a new node change.
-    pub fn new(node_id: Option<String>, title: String, change_type: ChangeType) -> Self {
-        Self {
-            node_id,
-            title,
-            change_type,
-            fingerprint: None,
-        }
-    }
-
-    /// Add fingerprint information.
-    pub fn with_fingerprint(mut self, fp: NodeFingerprint) -> Self {
-        self.fingerprint = Some(fp);
-        self
-    }
-}
-
-/// Set of changes between two document versions.
-#[derive(Debug, Clone, Default, Serialize, Deserialize)]
-pub struct ChangeSet {
-    /// Added nodes.
-    pub added: Vec<NodeChange>,
-    /// Removed nodes.
-    pub removed: Vec<NodeChange>,
-    /// Modified nodes (content changed).
-    pub modified: Vec<NodeChange>,
-    /// Restructured nodes (children changed).
-    pub restructured: Vec<NodeChange>,
-}
-
-impl ChangeSet {
-    /// Create an empty change set.
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Check if there are any changes.
-    pub fn is_empty(&self) -> bool {
-        self.added.is_empty()
-            && self.removed.is_empty()
-            && self.modified.is_empty()
-            && self.restructured.is_empty()
-    }
-
-    /// Get total number of changes.
-    pub fn total_changes(&self) -> usize {
-        self.added.len() + self.removed.len() + self.modified.len() + self.restructured.len()
-    }
-
-    /// Merge another change set into this one.
-    pub fn merge(&mut self, other: ChangeSet) {
-        self.added.extend(other.added);
-        self.removed.extend(other.removed);
-        self.modified.extend(other.modified);
-        self.restructured.extend(other.restructured);
-    }
-
-    /// Get all changed node IDs.
-    pub fn changed_node_ids(&self) -> Vec<&str> {
-        let mut ids: Vec<&str> = Vec::new();
-        for change in &self.added {
-            if let Some(ref id) = change.node_id {
-                ids.push(id.as_str());
-            }
-        }
-        for change in &self.modified {
-            if let Some(ref id) = change.node_id {
-                ids.push(id.as_str());
-            }
-        }
-        for change in &self.restructured {
-            if let Some(ref id) = change.node_id {
-                ids.push(id.as_str());
-            }
-        }
-        ids
-    }
-}
-
-/// Document-level change detection result.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DocumentChangeInfo {
-    /// Document ID.
-    pub doc_id: String,
-    /// Overall content fingerprint.
-    pub content_fp: Fingerprint,
-    /// Node-level fingerprints.
-    pub node_fingerprints: HashMap<String, NodeFingerprint>,
-    /// Last modification time.
-    pub modified_at: chrono::DateTime<chrono::Utc>,
-    /// Processing version (incremented when processing algorithm changes).
-    pub processing_version: u32,
-}
-
-impl DocumentChangeInfo {
-    /// Create a new document change info.
-    pub fn new(doc_id: &str) -> Self {
-        Self {
-            doc_id: doc_id.to_string(),
-            content_fp: Fingerprint::zero(),
-            node_fingerprints: HashMap::new(),
-            modified_at: chrono::Utc::now(),
-            processing_version: 1,
-        }
-    }
-
-    /// Update from a tree.
-    pub fn update_from_tree(&mut self, tree: &DocumentTree) {
-        self.content_fp = compute_tree_fingerprint(tree);
-        self.node_fingerprints = compute_all_node_fingerprints(tree);
-        self.modified_at = chrono::Utc::now();
-    }
-}
-
-/// Change detector for incremental updates.
-///
-/// Supports both simple hash-based detection and fine-grained
-/// subtree fingerprint-based detection.
-pub struct ChangeDetector {
-    /// Content fingerprints by document ID.
-    content_fps: HashMap<String, Fingerprint>,
-
-    /// Node-level fingerprints by document ID.
-    node_fps: HashMap<String, HashMap<String, NodeFingerprint>>,
-
-    /// File modification times by document ID.
-    mtimes: HashMap<String, SystemTime>,
-
-    /// Processing versions by document ID.
-    processing_versions: HashMap<String, u32>,
-
-    /// Current processing version (for algorithm upgrades).
-    current_processing_version: u32,
-}
-
-impl ChangeDetector {
-    /// Create a new change detector.
-    pub fn new() -> Self {
-        Self {
-            content_fps: HashMap::new(),
-            node_fps: HashMap::new(),
-            mtimes: HashMap::new(),
-            processing_versions: HashMap::new(),
-            current_processing_version: 1,
-        }
-    }
-
-    /// Set the current processing version.
-    pub fn with_processing_version(mut self, version: u32) -> Self {
-        self.current_processing_version = version;
-        self
-    }
-
-    /// Compute hash of content (simple u64 hash).
-    fn hash_content(content: &str) -> u64 {
-        let mut hasher = std::collections::hash_map::DefaultHasher::new();
-        content.hash(&mut hasher);
-        hasher.finish()
-    }
-
-    /// Check if a file needs reindexing based on mtime.
-    pub fn needs_reindex_by_mtime(&self, doc_id: &str, path: &Path) -> bool {
-        let Some(recorded_mtime) = self.mtimes.get(doc_id) else {
-            return true; // Never indexed
-        };
-
-        let Ok(metadata) = std::fs::metadata(path) else {
-            return true; // Can't read file
-        };
-
-        let Ok(current_mtime) = metadata.modified() else {
-            return true;
-        };
-
-        current_mtime > *recorded_mtime
-    }
-
-    /// Check if content needs reindexing based on fingerprint.
-    pub fn needs_reindex_by_hash(&self, doc_id: &str, content: &str) -> bool {
-        let current_fp = Fingerprint::from_str(content);
-
-        match self.content_fps.get(doc_id) {
-            Some(recorded_fp) => recorded_fp != &current_fp,
-            None => true,
-        }
-    }
-
-    /// Check if document needs reindexing based on fingerprint.
-    pub fn needs_reindex_by_fingerprint(&self, doc_id: &str, new_fp: &Fingerprint) -> bool {
-        match self.content_fps.get(doc_id) {
-            Some(recorded_fp) => recorded_fp != new_fp,
-            None => true,
-        }
-    }
-
-    /// Check if processing version has changed.
-    pub fn needs_reindex_by_version(&self, doc_id: &str) -> bool {
-        match self.processing_versions.get(doc_id) {
-            Some(recorded_version) => *recorded_version < self.current_processing_version,
-            None => true,
-        }
-    }
-
-    /// Record document state after indexing.
-    pub fn record(&mut self, doc_id: &str, content: &str, path: Option<&Path>) {
-        self.record_with_tree(doc_id, content, None, path);
-    }
-
-    /// Record document state with tree (for fine-grained detection).
-    pub fn record_with_tree(
-        &mut self,
-        doc_id: &str,
-        content: &str,
-        tree: Option<&DocumentTree>,
-        path: Option<&Path>,
-    ) {
-        // Record content fingerprint
-        let content_fp = Fingerprint::from_str(content);
-        self.content_fps.insert(doc_id.to_string(), content_fp);
-
-        // Record node fingerprints if tree provided
-        if let Some(tree) = tree {
-            let node_fps = compute_all_node_fingerprints(tree);
-            self.node_fps.insert(doc_id.to_string(), node_fps);
-        }
-
-        // Record mtime if path provided
-        if let Some(path) = path {
-            if let Ok(metadata) = std::fs::metadata(path) {
-                if let Ok(mtime) = metadata.modified() {
-                    self.mtimes.insert(doc_id.to_string(), mtime);
-                }
-            }
-        }
-
-        // Record processing version
-        self.processing_versions
-            .insert(doc_id.to_string(), self.current_processing_version);
-    }
-
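Taken together, the `needs_reindex_*` probes and `record` form a check-then-record loop around the indexing pipeline. A sketch of that loop; `run_pipeline` is a placeholder for the real indexing entry point, and `sources` is assumed to be a list of id/content pairs:

```rust
let mut detector = ChangeDetector::new().with_processing_version(2);

for (doc_id, content) in &sources {
    // Cheap fingerprint and version probes first; reindex only on a change.
    if detector.needs_reindex_by_hash(doc_id, content)
        || detector.needs_reindex_by_version(doc_id)
    {
        run_pipeline(doc_id, content)?; // placeholder
        detector.record(doc_id, content, None);
    }
}
```
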
-    /// Record document from ChangeInfo.
-    pub fn record_change_info(&mut self, info: &DocumentChangeInfo, path: Option<&Path>) {
-        self.content_fps
-            .insert(info.doc_id.clone(), info.content_fp);
-        self.node_fps
-            .insert(info.doc_id.clone(), info.node_fingerprints.clone());
-        self.processing_versions
-            .insert(info.doc_id.clone(), info.processing_version);
-
-        if let Some(path) = path {
-            if let Ok(metadata) = std::fs::metadata(path) {
-                if let Ok(mtime) = metadata.modified() {
-                    self.mtimes.insert(info.doc_id.clone(), mtime);
-                }
-            }
-        }
-    }
-
-    /// Detect changes between two trees using fingerprints.
-    pub fn detect_changes(&self, old_tree: &DocumentTree, new_tree: &DocumentTree) -> ChangeSet {
-        let mut changes = ChangeSet::new();
-
-        // Collect fingerprints from both trees
-        let old_fps = compute_all_node_fingerprints(old_tree);
-        let new_fps = compute_all_node_fingerprints(new_tree);
-
-        // Build title -> (string_key, NodeFingerprint) maps by traversing trees
-        // We store owned Strings to avoid lifetime issues
-        let old_by_title: HashMap<String, (String, NodeFingerprint)> = {
-            let mut map = HashMap::new();
-            for node_id in old_tree.traverse() {
-                if let Some(node) = old_tree.get(node_id) {
-                    let key = node
-                        .node_id
-                        .clone()
-                        .unwrap_or_else(|| format!("node_{:?}", node_id.0));
-                    if let Some(fp) = old_fps.get(&key) {
-                        map.insert(node.title.clone(), (key, fp.clone()));
-                    }
-                }
-            }
-            map
-        };
-
-        let new_by_title: HashMap<String, (String, NodeFingerprint)> = {
-            let mut map = HashMap::new();
-            for node_id in new_tree.traverse() {
-                if let Some(node) = new_tree.get(node_id) {
-                    let key = node
-                        .node_id
-                        .clone()
-                        .unwrap_or_else(|| format!("node_{:?}", node_id.0));
-                    if let Some(fp) = new_fps.get(&key) {
-                        map.insert(node.title.clone(), (key, fp.clone()));
-                    }
-                }
-            }
-            map
-        };
-
-        // Find added nodes
-        for (title, (node_key, fp)) in &new_by_title {
-            if !old_by_title.contains_key(title) {
-                changes.added.push(
-                    NodeChange::new(Some(node_key.clone()), title.clone(), ChangeType::Added)
-                        .with_fingerprint(fp.clone()),
-                );
-            }
-        }
-
-        // Find removed nodes
-        for (title, (node_key, fp)) in &old_by_title {
-            if !new_by_title.contains_key(title) {
-                changes.removed.push(
-                    NodeChange::new(Some(node_key.clone()), title.clone(), ChangeType::Removed)
-                        .with_fingerprint(fp.clone()),
-                );
-            }
-        }
-
-        // Find modified nodes
-        for (title, (new_key, new_fp)) in &new_by_title {
-            if let Some((_old_key, old_fp)) = old_by_title.get(title) {
-                if new_fp.content_changed(old_fp) {
-                    changes.modified.push(
-                        NodeChange::new(Some(new_key.clone()), title.clone(), ChangeType::Modified)
-                            .with_fingerprint(new_fp.clone()),
-                    );
-                } else if new_fp.subtree_changed(old_fp) {
-                    changes.restructured.push(
-                        NodeChange::new(
-                            Some(new_key.clone()),
-                            title.clone(),
-                            ChangeType::Restructured,
-                        )
-                        .with_fingerprint(new_fp.clone()),
-                    );
-                }
-            }
-        }
-
-        changes
-    }
-
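A sketch of diffing two versions of a document with `detect_changes`; the trees are built the same way as in the tests at the end of this file:

```rust
let mut old_tree = DocumentTree::new("Doc", "intro");
old_tree.add_child(old_tree.root(), "Setup", "cargo install vectorless");

let mut new_tree = DocumentTree::new("Doc", "intro");
new_tree.add_child(new_tree.root(), "Setup", "cargo install --locked vectorless");

let changes = ChangeDetector::new().detect_changes(&old_tree, &new_tree);
assert_eq!(changes.modified.len(), 1); // "Setup" content changed
assert!(changes.added.is_empty() && changes.removed.is_empty());
```

Note that matching is by title, so a renamed section shows up as one removal plus one addition rather than a modification.
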
-    /// Get nodes that need reprocessing (summary regeneration).
-    ///
-    /// This returns nodes where either:
-    /// - Content changed (summary may need update)
-    /// - Processing version changed (all summaries need update)
-    pub fn get_nodes_needing_reprocess(
-        &self,
-        doc_id: &str,
-        new_tree: &DocumentTree,
-    ) -> Option<Vec<String>> {
-        let old_fps = self.node_fps.get(doc_id)?;
-        let new_fps = compute_all_node_fingerprints(new_tree);
-
-        let mut needs_reprocess = Vec::new();
-
-        // If processing version changed, all nodes need reprocessing
-        if self.needs_reindex_by_version(doc_id) {
-            return Some(new_fps.keys().cloned().collect());
-        }
-
-        // Otherwise, only changed nodes need reprocessing
-        for (node_key, new_fp) in &new_fps {
-            if let Some(old_fp) = old_fps.get(node_key) {
-                // Content changed or subtree structure changed
-                if new_fp.content_changed(old_fp) || new_fp.subtree_changed(old_fp) {
-                    needs_reprocess.push(node_key.clone());
-                }
-            } else {
-                // New node
-                needs_reprocess.push(node_key.clone());
-            }
-        }
-
-        Some(needs_reprocess)
-    }
-
-    /// Clear stored data for a document.
-    pub fn clear(&mut self, doc_id: &str) {
-        self.content_fps.remove(doc_id);
-        self.node_fps.remove(doc_id);
-        self.mtimes.remove(doc_id);
-        self.processing_versions.remove(doc_id);
-    }
-
-    /// Get the current content fingerprint for a document.
-    pub fn get_content_fingerprint(&self, doc_id: &str) -> Option<&Fingerprint> {
-        self.content_fps.get(doc_id)
-    }
-
-    /// Get all node fingerprints for a document.
-    pub fn get_node_fingerprints(&self, doc_id: &str) -> Option<&HashMap<String, NodeFingerprint>> {
-        self.node_fps.get(doc_id)
-    }
-
-    /// Serialize state for persistence.
-    pub fn to_state(&self) -> ChangeDetectorState {
-        ChangeDetectorState {
-            content_fps: self.content_fps.clone(),
-            node_fps: self.node_fps.clone(),
-            processing_versions: self.processing_versions.clone(),
-        }
-    }
-
-    /// Restore state from persistence.
-    pub fn from_state(state: ChangeDetectorState) -> Self {
-        Self {
-            content_fps: state.content_fps,
-            node_fps: state.node_fps,
-            mtimes: HashMap::new(),
-            processing_versions: state.processing_versions,
-            current_processing_version: 1,
-        }
-    }
-}
-
-impl Default for ChangeDetector {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-/// Serializable state for change detector.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ChangeDetectorState {
-    /// Content fingerprints by document ID.
-    pub content_fps: HashMap<String, Fingerprint>,
-    /// Node fingerprints by document ID.
-    pub node_fps: HashMap<String, HashMap<String, NodeFingerprint>>,
-    /// Processing versions by document ID.
-    pub processing_versions: HashMap<String, u32>,
-}
-
-// =============================================================================
-// Helper Functions
-// =============================================================================
-
-/// Compute the overall fingerprint for a tree.
-pub fn compute_tree_fingerprint(tree: &DocumentTree) -> Fingerprint {
-    let root_fp = compute_node_fingerprint(tree, tree.root());
-    root_fp.subtree
-}
-
-/// Compute content fingerprint for a single node.
-fn compute_node_content_fp(tree: &DocumentTree, node_id: NodeId) -> Fingerprint {
-    let node = match tree.get(node_id) {
-        Some(n) => n,
-        None => return Fingerprint::zero(),
-    };
-
-    Fingerprinter::new()
-        .with_str(&node.title)
-        .with_str(&node.content)
-        .with_option_str(node.node_id.as_deref())
-        .into_fingerprint()
-}
-
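The state split above exists so fingerprints survive restarts, while mtimes are deliberately dropped and re-learned from disk. A sketch of the save/restore round trip; the file name is arbitrary and chosen here for illustration:

```rust
// Persist alongside the index.
let state = detector.to_state();
std::fs::write("detector_state.json", serde_json::to_string(&state)?)?;

// Restore on the next run; mtimes start empty by design.
let json = std::fs::read_to_string("detector_state.json")?;
let detector = ChangeDetector::from_state(serde_json::from_str(&json)?);
```

-/// Compute fingerprint for a node and its subtree.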
-fn compute_node_fingerprint(tree: &DocumentTree, node_id: NodeId) -> NodeFingerprint { - let node = match tree.get(node_id) { - Some(n) => n, - None => return NodeFingerprint::zero(), - }; - - // Content fingerprint - let content_fp = Fingerprinter::new() - .with_str(&node.title) - .with_str(&node.content) - .with_option_str(node.node_id.as_deref()) - .into_fingerprint(); - - // Check if leaf node - let children = tree.children(node_id); - if children.is_empty() { - return NodeFingerprint::leaf(content_fp); - } - - // Compute subtree fingerprint from children - let mut subtree_fp = Fingerprinter::new(); - subtree_fp.write_fingerprint(&content_fp); - - for child_id in children { - let child_fp = compute_node_fingerprint(tree, child_id); - subtree_fp.write_fingerprint(&child_fp.subtree); - } - - NodeFingerprint::new(content_fp, subtree_fp.into_fingerprint()) -} - -/// Compute fingerprints for all nodes in a tree. -/// Returns a map from string key (for persistence) to NodeFingerprint. -pub fn compute_all_node_fingerprints(tree: &DocumentTree) -> HashMap { - let mut fingerprints = HashMap::new(); - - for node_id in tree.traverse() { - if let Some(node) = tree.get(node_id) { - let key = node - .node_id - .clone() - .unwrap_or_else(|| format!("node_{:?}", node_id.0)); - let fp = compute_node_fingerprint(tree, node_id); - fingerprints.insert(key, fp); - } - } - - fingerprints -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::document::DocumentTree; - - #[test] - fn test_change_detector_new() { - let detector = ChangeDetector::new(); - assert!(detector.content_fps.is_empty()); - } - - #[test] - fn test_needs_reindex_by_hash() { - let mut detector = ChangeDetector::new(); - - // First time: always needs reindex - assert!(detector.needs_reindex_by_hash("doc1", "content")); - - // Record the content - detector.record("doc1", "content", None); - - // Same content: no reindex needed - assert!(!detector.needs_reindex_by_hash("doc1", "content")); - - // Different content: needs reindex - assert!(detector.needs_reindex_by_hash("doc1", "new content")); - } - - #[test] - fn test_change_set() { - let mut changes = ChangeSet::new(); - assert!(changes.is_empty()); - - changes.added.push(NodeChange::new( - Some("node1".to_string()), - "Title".to_string(), - ChangeType::Added, - )); - - assert!(!changes.is_empty()); - assert_eq!(changes.total_changes(), 1); - } - - #[test] - fn test_processing_version() { - let mut detector = ChangeDetector::new().with_processing_version(2); - detector.record("doc1", "content", None); - - // Version matches, no reindex needed - assert!(!detector.needs_reindex_by_version("doc1")); - - // Create new detector with higher version - let detector2 = ChangeDetector::new().with_processing_version(3); - assert!(detector2.needs_reindex_by_version("doc1")); - } - - #[test] - fn test_node_fingerprint() { - let mut tree = DocumentTree::new("Root", "root content"); - let child = tree.add_child(tree.root(), "Child", "child content"); - - let root_fp = compute_node_fingerprint(&tree, tree.root()); - let child_fp = compute_node_fingerprint(&tree, child); - - // Child is a leaf, content == subtree - assert_eq!(child_fp.content, child_fp.subtree); - - // Root is not a leaf - assert_ne!(root_fp.content, root_fp.subtree); - } - - #[test] - fn test_fingerprint_serialization() { - let mut detector = ChangeDetector::new(); - let mut tree = DocumentTree::new("Root", "content"); - tree.add_child(tree.root(), "Section", "section content"); - - detector.record_with_tree("doc1", "content", 
Some(&tree), None); - - let state = detector.to_state(); - let json = serde_json::to_string(&state).unwrap(); - let restored: ChangeDetectorState = serde_json::from_str(&json).unwrap(); - - assert_eq!(state.content_fps, restored.content_fps); - } -} diff --git a/vectorless-core/vectorless/src/index/incremental/mod.rs b/vectorless-core/vectorless/src/index/incremental/mod.rs deleted file mode 100644 index 901661dd..00000000 --- a/vectorless-core/vectorless/src/index/incremental/mod.rs +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Incremental indexing support. -//! -//! This module provides functionality to incrementally update -//! an existing document index when the source document changes. -//! -//! # Features -//! -//! - **Fine-grained change detection**: Uses subtree fingerprints to identify -//! exactly which nodes changed -//! - **Processing version tracking**: Automatically reprocesses when algorithm -//! versions change -//! - **Partial updates**: Only reprocess changed nodes - -mod detector; -mod resolver; -mod updater; - -use crate::document::DocumentTree; -pub use detector::ChangeDetector; -pub use resolver::{IndexAction, SkipInfo, resolve_action}; -use std::collections::HashMap; - -/// Reuse summaries from old tree for unchanged nodes in the new tree. -/// -/// Uses `ChangeDetector` to find which nodes changed, then copies -/// summaries from old tree nodes with matching titles that are unchanged. -/// -/// Returns a map of `title -> summary` for reusable summaries. -pub fn compute_reusable_summaries( - old_tree: &DocumentTree, - new_tree: &DocumentTree, -) -> HashMap { - let detector = ChangeDetector::new(); - let changes = detector.detect_changes(old_tree, new_tree); - - let changed_titles: std::collections::HashSet = changes - .modified - .iter() - .chain(changes.restructured.iter()) - .chain(changes.added.iter()) - .chain(changes.removed.iter()) - .map(|c| c.title.clone()) - .collect(); - - let mut reusable = HashMap::new(); - for node_id in old_tree.traverse() { - if let Some(node) = old_tree.get(node_id) { - if !changed_titles.contains(&node.title) && !node.summary.is_empty() { - reusable.insert(node.title.clone(), node.summary.clone()); - } - } - } - reusable -} - -/// Apply reusable summaries to a new tree. -/// -/// For each node in `new_tree` whose title matches a key in `summaries`, -/// sets the node's summary from the map. -/// -/// Returns the number of summaries applied. -pub fn apply_reusable_summaries( - new_tree: &mut DocumentTree, - summaries: &HashMap, -) -> usize { - let mut applied = 0; - for node_id in new_tree.traverse() { - if let Some(node) = new_tree.get(node_id) { - if node.summary.is_empty() { - if let Some(summary) = summaries.get(&node.title) { - new_tree.set_summary(node_id, summary); - applied += 1; - } - } - } - } - applied -} diff --git a/vectorless-core/vectorless/src/index/incremental/resolver.rs b/vectorless-core/vectorless/src/index/incremental/resolver.rs deleted file mode 100644 index ffddf2a7..00000000 --- a/vectorless-core/vectorless/src/index/incremental/resolver.rs +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Incremental indexing resolver — decides what action to take for a source. -//! -//! Three-layer change detection: -//! 1. **File-level**: content fingerprint → skip if unchanged -//! 2. **Logic-level**: pipeline config fingerprint → full reprocess if changed -//! 3. 
**Node-level**: Merkle subtree diff → incremental update
-
-use tracing::info;
-
-use crate::document::DocumentFormat;
-use crate::document::DocumentTree;
-use crate::index::config::PipelineOptions;
-use crate::storage::PersistedDocument;
-use crate::utils::fingerprint::Fingerprint;
-
-/// Action to take for a source during indexing.
-pub enum IndexAction {
-    /// Skip entirely — content unchanged.
-    Skip(SkipInfo),
-    /// Full index from scratch — new file, logic changed, or force mode.
-    /// If replacing an existing document, `existing_id` contains the old doc ID
-    /// to clean up after the new document is successfully saved.
-    FullIndex {
-        /// Old document ID to remove after successful re-index (if replacing).
-        existing_id: Option<String>,
-    },
-    /// Incremental update — content changed, pipeline unchanged.
-    IncrementalUpdate {
-        /// The old tree to reuse data from.
-        old_tree: DocumentTree,
-        /// The existing document ID (preserved across updates).
-        existing_id: String,
-    },
-}
-
-/// Info returned when a source is skipped.
-pub struct SkipInfo {
-    /// Existing document ID.
-    pub doc_id: String,
-    /// Document name.
-    pub name: String,
-    /// Document format.
-    pub format: DocumentFormat,
-    /// Document description.
-    pub description: Option<String>,
-    /// Page count.
-    pub page_count: Option<usize>,
-}
-
-/// Resolve what action to take for a source file.
-///
-/// This is the core three-layer incremental decision:
-///
-/// 1. **File fingerprint**: Compare file bytes hash with stored `content_fingerprint`.
-///    If equal → `Skip` (nothing changed).
-///
-/// 2. **Logic fingerprint**: Compare pipeline config hash with stored `logic_fingerprint`.
-///    If different → `FullIndex` (processing logic changed, must reprocess everything).
-///
-/// 3. **Incremental**: Content changed but pipeline unchanged → `IncrementalUpdate`
-///    with the old tree for partial reprocessing.
-pub fn resolve_action(
-    file_bytes: &[u8],
-    stored_doc: &PersistedDocument,
-    pipeline_options: &PipelineOptions,
-    format: DocumentFormat,
-) -> IndexAction {
-    let current_fp = Fingerprint::from_bytes(file_bytes);
-
-    // Layer 1: File-level content fingerprint
-    if !stored_doc
-        .meta
-        .needs_reprocessing(&current_fp, pipeline_options.processing_version)
-    {
-        info!("File fingerprint unchanged, skipping");
-        return IndexAction::Skip(SkipInfo {
-            doc_id: stored_doc.meta.id.clone(),
-            name: stored_doc.meta.name.clone(),
-            format,
-            description: stored_doc.meta.description.clone(),
-            page_count: stored_doc.meta.page_count,
-        });
-    }
-
-    // Layer 2: Logic fingerprint (pipeline config changed?)
-    let current_logic_fp = pipeline_options.logic_fingerprint();
-    if stored_doc.meta.logic_fingerprint != current_logic_fp
-        && !stored_doc.meta.logic_fingerprint.is_zero()
-    {
-        info!("Logic fingerprint changed, full reprocess required");
-        return IndexAction::FullIndex {
-            existing_id: Some(stored_doc.meta.id.clone()),
-        };
-    }
-
-    // Layer 3: Content changed, pipeline unchanged → incremental update
-    info!("Content changed, pipeline unchanged → incremental update");
-    IndexAction::IncrementalUpdate {
-        old_tree: stored_doc.tree.clone(),
-        existing_id: stored_doc.meta.id.clone(),
-    }
-}
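Downstream, the caller dispatches on the returned action. A sketch of that dispatch; `index_full` and `index_incremental` stand in for the pipeline entry points and are not functions in this crate:

```rust
match resolve_action(&file_bytes, &stored_doc, &options, format) {
    IndexAction::Skip(info) => info!("unchanged: {}", info.name),
    IndexAction::FullIndex { existing_id } => {
        index_full(&file_bytes, existing_id)?; // placeholder
    }
    IndexAction::IncrementalUpdate { old_tree, existing_id } => {
        index_incremental(&file_bytes, &old_tree, &existing_id)?; // placeholder
    }
}
```
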
diff --git a/vectorless-core/vectorless/src/index/incremental/updater.rs b/vectorless-core/vectorless/src/index/incremental/updater.rs
deleted file mode 100644
index a9220acf..00000000
--- a/vectorless-core/vectorless/src/index/incremental/updater.rs
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Partial tree updater for incremental indexing.
-
-use tracing::info;
-
-use crate::document::{DocumentTree, NodeId};
-use crate::error::Result;
-use crate::index::parse::RawNode;
-
-use super::detector::ChangeDetector;
-
-/// Result of a partial update.
-#[derive(Debug)]
-pub struct UpdateResult {
-    /// Number of nodes added.
-    pub nodes_added: usize,
-    /// Number of nodes removed.
-    pub nodes_removed: usize,
-    /// Number of nodes modified.
-    pub nodes_modified: usize,
-    /// Number of summaries regenerated.
-    pub summaries_regenerated: usize,
-}
-
-impl Default for UpdateResult {
-    fn default() -> Self {
-        Self {
-            nodes_added: 0,
-            nodes_removed: 0,
-            nodes_modified: 0,
-            summaries_regenerated: 0,
-        }
-    }
-}
-
-/// Partial updater for incremental document updates.
-pub struct PartialUpdater {
-    /// Change detector.
-    detector: ChangeDetector,
-}
-
-impl PartialUpdater {
-    /// Create a new partial updater.
-    pub fn new() -> Self {
-        Self {
-            detector: ChangeDetector::new(),
-        }
-    }
-
-    /// Get the change detector.
-    pub fn detector(&self) -> &ChangeDetector {
-        &self.detector
-    }
-
-    /// Get mutable change detector.
-    pub fn detector_mut(&mut self) -> &mut ChangeDetector {
-        &mut self.detector
-    }
-
-    /// Update a tree with new raw nodes.
-    ///
-    /// This performs a partial update by:
-    /// 1. Detecting changes between old and new content
-    /// 2. Updating only the affected subtrees
-    /// 3. Regenerating summaries for changed nodes
-    pub fn update(
-        &self,
-        old_tree: &DocumentTree,
-        new_raw_nodes: Vec<RawNode>,
-    ) -> Result<(DocumentTree, UpdateResult)> {
-        let mut result = UpdateResult::default();
-
-        // Build new tree from raw nodes
-        let new_tree = self.build_tree_from_raw(new_raw_nodes)?;
-
-        // Detect changes
-        let changes = self.detector.detect_changes(old_tree, &new_tree);
-
-        info!(
-            "Detected changes: {} added, {} removed, {} modified",
-            changes.added.len(),
-            changes.removed.len(),
-            changes.modified.len()
-        );
-
-        result.nodes_added = changes.added.len();
-        result.nodes_removed = changes.removed.len();
-        result.nodes_modified = changes.modified.len();
-
-        // For now, return the new tree
-        // In a full implementation, we would:
-        // 1. Preserve unchanged summaries
-        // 2. Only regenerate summaries for changed nodes
-        // 3. Merge preserved and new content
-
-        Ok((new_tree, result))
-    }
-
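A sketch of one update round with the updater; `reparse` is a placeholder for whatever re-parses the changed source into raw nodes:

```rust
let updater = PartialUpdater::new();
let new_raw_nodes = reparse(&changed_source)?; // placeholder: yields Vec<RawNode>
let (new_tree, stats) = updater.update(&old_tree, new_raw_nodes)?;
info!(
    "partial update: +{} -{} ~{}",
    stats.nodes_added, stats.nodes_removed, stats.nodes_modified
);
```
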
-    /// Build a tree from raw nodes (simple implementation).
-    fn build_tree_from_raw(&self, raw_nodes: Vec<RawNode>) -> Result<DocumentTree> {
-        // This is a simplified implementation
-        // In production, use the BuildStage
-
-        let mut tree = DocumentTree::new("Document", "");
-
-        // Stack to track parent nodes at each level
-        let mut level_stack: Vec<Option<NodeId>> = vec![Some(tree.root())];
-
-        for raw in raw_nodes {
-            let level = raw.level;
-
-            // Ensure stack has enough slots
-            while level_stack.len() <= level {
-                level_stack.push(None);
-            }
-
-            // Find parent
-            let parent_id = (0..level)
-                .rev()
-                .find_map(|l| level_stack.get(l).copied().flatten())
-                .unwrap_or(tree.root());
-
-            // Create node
-            let content = if raw.content.is_empty() {
-                ""
-            } else {
-                &raw.content
-            };
-            let node_id = tree.add_child(parent_id, &raw.title, content);
-
-            // Set line indices
-            tree.set_line_indices(node_id, raw.line_start, raw.line_end);
-
-            // Set page if available
-            if let Some(page) = raw.page {
-                tree.set_page_boundaries(node_id, page, page);
-            }
-
-            // Set token count if available
-            if let Some(count) = raw.token_count {
-                if count > 0 {
-                    tree.set_token_count(node_id, count);
-                }
-            }
-
-            // Update stack
-            if level < level_stack.len() {
-                level_stack[level] = Some(node_id);
-            }
-
-            // Clear deeper levels
-            for i in (level + 1)..level_stack.len() {
-                level_stack[i] = None;
-            }
-        }
-
-        Ok(tree)
-    }
-
-    /// Check if reindexing is needed.
-    pub fn needs_reindex(&self, doc_id: &str, content: &str) -> bool {
-        self.detector.needs_reindex_by_hash(doc_id, content)
-    }
-
-    /// Record document state after indexing.
-    pub fn record(&mut self, doc_id: &str, content: &str) {
-        self.detector.record(doc_id, content, None);
-    }
-}
-
-impl Default for PartialUpdater {
-    fn default() -> Self {
-        Self::new()
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/mod.rs b/vectorless-core/vectorless/src/index/mod.rs
deleted file mode 100644
index 051f5326..00000000
--- a/vectorless-core/vectorless/src/index/mod.rs
+++ /dev/null
@@ -1,73 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Index Pipeline module.
-//!
-//! This module provides a modular, extensible document indexing pipeline.
-//!
-//! # Architecture
-//!
-//! ```text
-//! Priority 10: ┌──────────┐
-//!              │  Parse   │  Parse document into raw nodes
-//!              └────┬─────┘
-//! Priority 20: ┌────▼─────┐
-//!              │  Build   │  Construct tree + thinning (with content merge)
-//!              └────┬─────┘
-//! Priority 22: ┌────▼─────┐
-//!              │ Validate │  Tree integrity checks (optional)
-//!              └────┬─────┘
-//! Priority 25: ┌────▼─────┐
-//!              │  Split   │  Split oversized leaf nodes (optional)
-//!              └────┬─────┘
-//! Priority 30: ┌────▼─────┐
-//!              │ Enhance  │  LLM summaries (when client available)
-//!              └────┬─────┘
-//! Priority 40: ┌────▼─────┐
-//!              │ Enrich   │  Metadata + cross-references
-//!              └────┬─────┘
-//! Priority 45: ┌────▼──────────┐
-//!              │ Reasoning Idx │  Pre-computed reasoning index
-//!              └────┬──────────┘
-//! Priority 50: ┌────▼──────────┐
-//!              │ Navigation Idx│  Agent navigation index
-//!              └────┬──────────┘
-//! Priority 60: ┌────▼──────┐
-//!              │ Optimize  │  Final tree optimization
-//!              └───────────┘
-//! ```
-//!
-//! Checkpointing is available when `PipelineOptions::checkpoint_dir` is set.
-//! State is saved after each stage group and resumed on restart.
-//!
-//! # Usage
-//!
-//! ```rust,ignore
-//! use vectorless::index::{PipelineExecutor, IndexInput, PipelineOptions};
-//! use vectorless::index::summary::SummaryStrategy;
-//!
-//! let options = PipelineOptions::new()
.with_summary_strategy(SummaryStrategy::selective(100, true)); -//! -//! let result = PipelineExecutor::new() -//! .with_options(options) -//! .execute(input) -//! .await?; -//! ``` - -pub mod config; -pub mod incremental; -pub mod parse; -pub mod pipeline; -pub mod stages; -pub mod summary; - -// Re-export main types from pipeline -pub use pipeline::{IndexInput, IndexMetrics, PipelineExecutor, PipelineResult}; - -// Re-export config types -pub use crate::document::ReasoningIndexConfig; -pub use config::{IndexMode, PipelineOptions, ThinningConfig}; - -// Re-export summary -pub use summary::SummaryStrategy; diff --git a/vectorless-core/vectorless/src/index/parse/markdown/config.rs b/vectorless-core/vectorless/src/index/parse/markdown/config.rs deleted file mode 100644 index 7a013f5f..00000000 --- a/vectorless-core/vectorless/src/index/parse/markdown/config.rs +++ /dev/null @@ -1,219 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Configuration options for the Markdown parser. - -/// Markdown parser configuration. -/// -/// Controls parsing behavior, content extraction, and extension support. -/// -/// # Example -/// -/// ```rust -/// use vectorless::parser::markdown::MarkdownConfig; -/// -/// // Default GFM configuration -/// let config = MarkdownConfig::default(); -/// -/// // Strict CommonMark -/// let config = MarkdownConfig::commonmark(); -/// -/// // Documentation-focused -/// let config = MarkdownConfig::documentation(); -/// -/// // Custom configuration -/// let config = MarkdownConfig { -/// max_heading_level: 3, -/// include_code_blocks: false, -/// ..Default::default() -/// }; -/// ``` -#[derive(Debug, Clone)] -pub struct MarkdownConfig { - // ============================================================ - // Parsing Options - // ============================================================ - /// Enable GitHub Flavored Markdown extensions. - /// - /// Includes: tables, strikethrough, task lists, autolinks. - /// Default: `true` - pub enable_gfm: bool, - - /// Enable footnotes extension (`[^1]` syntax). - /// Default: `false` - pub enable_footnotes: bool, - - /// Enable definition lists. - /// Default: `false` - pub enable_definition_lists: bool, - - /// Enable superscript/subscript (`^sup^`, `~sub~`). - /// Default: `false` - pub enable_super_sub: bool, - - /// Maximum heading level to parse (1-6). - /// Headings above this level are treated as content. - /// Default: `6` - pub max_heading_level: usize, - - /// Minimum heading level to create a node. - /// Headings below this level are treated as content. - /// Default: `1` - pub min_heading_level: usize, - - // ============================================================ - // Content Extraction - // ============================================================ - /// Include code blocks in node content. - /// Default: `true` - pub include_code_blocks: bool, - - /// Include images (alt text) in content. - /// Default: `true` - pub include_images: bool, - - /// Include links in content. - /// Default: `true` - pub include_links: bool, - - /// Include tables in content. - /// Default: `true` - pub include_tables: bool, - - // ============================================================ - // Frontmatter - // ============================================================ - /// Parse YAML frontmatter (`---` delimiters). - /// Default: `true` - pub parse_frontmatter: bool, - - /// Parse TOML frontmatter (`+++` delimiters). 
- /// Default: `false` - pub parse_toml_frontmatter: bool, - - /// Fields to extract from frontmatter as metadata. - /// Default: `["title", "description"]` - pub frontmatter_fields: Vec, - - // ============================================================ - // Advanced Options - // ============================================================ - /// Minimum characters required for a heading title to be valid. - /// Headings with shorter titles are skipped. - /// Default: `1` - pub min_heading_chars: usize, - - /// Create an implicit root node for content before the first heading. - /// Default: `true` - pub create_preamble_node: bool, - - /// Title for the preamble node (if created). - /// Default: `"Introduction"` - pub preamble_title: String, -} - -impl Default for MarkdownConfig { - fn default() -> Self { - Self { - // Parsing options - GFM by default (most common) - enable_gfm: true, - enable_footnotes: false, - enable_definition_lists: false, - enable_super_sub: false, - max_heading_level: 6, - min_heading_level: 1, - - // Content extraction - include all by default - include_code_blocks: true, - include_images: true, - include_links: true, - include_tables: true, - - // Frontmatter - parse_frontmatter: true, - parse_toml_frontmatter: false, - frontmatter_fields: vec!["title".into(), "description".into()], - - // Advanced - min_heading_chars: 1, - create_preamble_node: true, - preamble_title: "Introduction".into(), - } - } -} - -impl MarkdownConfig { - /// Create a new configuration with defaults. - #[must_use] - pub fn new() -> Self { - Self::default() - } - - /// Configuration optimized for GitHub Flavored Markdown. - /// - /// Enables GFM extensions (tables, strikethrough, task lists). - #[must_use] - pub fn gfm() -> Self { - Self::default() - } - - /// Configuration for strict CommonMark (no extensions). - #[must_use] - pub fn commonmark() -> Self { - Self { - enable_gfm: false, - ..Self::default() - } - } - - /// Configuration optimized for documentation sites. - /// - /// Enables footnotes and definition lists. - #[must_use] - pub fn documentation() -> Self { - Self { - enable_footnotes: true, - enable_definition_lists: true, - ..Self::default() - } - } - - /// Configuration that excludes code blocks from content. - /// - /// Useful when code blocks are not relevant for retrieval. - #[must_use] - pub fn no_code_blocks() -> Self { - Self { - include_code_blocks: false, - ..Self::default() - } - } - - /// Set the maximum heading level. - #[must_use] - pub fn with_max_heading_level(mut self, level: usize) -> Self { - self.max_heading_level = level.clamp(1, 6); - self - } - - /// Enable or disable code blocks in content. - #[must_use] - pub fn with_code_blocks(mut self, include: bool) -> Self { - self.include_code_blocks = include; - self - } - - /// Enable or disable frontmatter parsing. - #[must_use] - pub fn with_frontmatter(mut self, parse: bool) -> Self { - self.parse_frontmatter = parse; - self - } - - /// Set the preamble node title. - #[must_use] - pub fn with_preamble_title(mut self, title: impl Into) -> Self { - self.preamble_title = title.into(); - self - } -} diff --git a/vectorless-core/vectorless/src/index/parse/markdown/frontmatter.rs b/vectorless-core/vectorless/src/index/parse/markdown/frontmatter.rs deleted file mode 100644 index 65f7cda0..00000000 --- a/vectorless-core/vectorless/src/index/parse/markdown/frontmatter.rs +++ /dev/null @@ -1,219 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! 
Frontmatter extraction for Markdown documents. -//! -//! Supports YAML (`---`) and TOML (`+++`) delimited frontmatter. - -use std::collections::HashMap; - -/// Parsed frontmatter data. -#[derive(Debug, Clone, Default)] -pub struct Frontmatter { - /// Extracted key-value pairs. - pub fields: HashMap, -} - -impl Frontmatter { - /// Create an empty frontmatter. - #[must_use] - pub fn new() -> Self { - Self { - fields: HashMap::new(), - } - } - - /// Parse frontmatter from raw content. - /// - /// Returns `Some(Frontmatter)` if valid frontmatter is found. - /// Returns `None` if no frontmatter delimiters are present. - fn parse<'a>(content: &'a str, delimiter: &str) -> Option<(Self, &'a str)> { - // Check if content starts with delimiter - let delim_line = format!("{}\n", delimiter); - if !content.starts_with(&delim_line) { - return None; - } - - // Find closing delimiter - let content_after_open = &content[delimiter.len() + 1..]; - let close_pattern = format!("\n{}\n", delimiter); - - if let Some(end_pos) = content_after_open.find(&close_pattern) { - let frontmatter_text = &content_after_open[..end_pos]; - let remaining = &content_after_open[end_pos + close_pattern.len()..]; - - let fm = Self::parse_yaml(frontmatter_text); - Some((fm, remaining)) - } else { - None - } - } - - /// Parse YAML-style frontmatter (simple key: value extraction). - fn parse_yaml(text: &str) -> Self { - let mut fields = HashMap::new(); - - for line in text.lines() { - let line = line.trim(); - - // Skip empty lines and comments - if line.is_empty() || line.starts_with('#') { - continue; - } - - // Parse "key: value" or "key: "quoted value"" - if let Some((key, value)) = line.split_once(':') { - let key = key.trim().to_string(); - let value = value.trim(); - - // Remove quotes if present - let value = if (value.starts_with('"') && value.ends_with('"')) - || (value.starts_with('\'') && value.ends_with('\'')) - { - value[1..value.len() - 1].to_string() - } else { - value.to_string() - }; - - fields.insert(key, value); - } - } - - Self { fields } - } - - /// Get a field value by key. - #[must_use] - pub fn get(&self, key: &str) -> Option<&String> { - self.fields.get(key) - } - - /// Check if a field exists. - #[must_use] - pub fn contains(&self, key: &str) -> bool { - self.fields.contains_key(key) - } - - /// Get the title field. - #[must_use] - pub fn title(&self) -> Option<&String> { - self.get("title") - } - - /// Get the description field. - #[must_use] - pub fn description(&self) -> Option<&String> { - self.get("description") - } -} - -/// Extract frontmatter from Markdown content. -/// -/// Returns a tuple of (frontmatter, remaining_content). -/// If no frontmatter is found, returns `(None, content)`. 
-/// -/// # Supported Formats -/// -/// - YAML: `---\nkey: value\n---` -/// - TOML: `+++\nkey = "value"\n+++` -#[must_use] -pub fn extract_frontmatter( - content: &str, - parse_yaml: bool, - parse_toml: bool, -) -> (Option, &str) { - // Try YAML frontmatter first - if parse_yaml { - if let Some((fm, remaining)) = Frontmatter::parse(content, "---") { - return (Some(fm), remaining); - } - } - - // Try TOML frontmatter - if parse_toml { - if let Some((fm, remaining)) = Frontmatter::parse(content, "+++") { - return (Some(fm), remaining); - } - } - - // No frontmatter found - (None, content) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_extract_yaml_frontmatter() { - let content = r#"--- -title: My Document -description: A test document ---- - -# Content - -Body text."#; - - let (fm, remaining) = extract_frontmatter(content, true, false); - - assert!(fm.is_some()); - let fm = fm.unwrap(); - assert_eq!(fm.title(), Some(&"My Document".to_string())); - assert_eq!(fm.description(), Some(&"A test document".to_string())); - assert!(remaining.trim_start().starts_with("# Content")); - } - - #[test] - fn test_extract_quoted_values() { - let content = r#"--- -title: "Quoted Title" -description: 'Single quoted' ---- - -Content"#; - - let (fm, _) = extract_frontmatter(content, true, false); - - assert!(fm.is_some()); - let fm = fm.unwrap(); - assert_eq!(fm.title(), Some(&"Quoted Title".to_string())); - assert_eq!(fm.description(), Some(&"Single quoted".to_string())); - } - - #[test] - fn test_no_frontmatter() { - let content = "# No Frontmatter\n\nJust content."; - - let (fm, remaining) = extract_frontmatter(content, true, false); - - assert!(fm.is_none()); - assert_eq!(remaining, content); - } - - #[test] - fn test_incomplete_frontmatter() { - let content = "---\ntitle: Test\n\nNo closing delimiter"; - - let (fm, remaining) = extract_frontmatter(content, true, false); - - // Should not match incomplete frontmatter - assert!(fm.is_none()); - assert_eq!(remaining, content); - } - - #[test] - fn test_toml_frontmatter() { - let content = r#"+++ -title = "TOML Doc" -+++ - -# Content"#; - - let (fm, remaining) = extract_frontmatter(content, false, true); - - // Note: Our simple parser treats TOML as YAML-like - assert!(fm.is_some()); - assert!(remaining.trim_start().starts_with("# Content")); - } -} diff --git a/vectorless-core/vectorless/src/index/parse/markdown/mod.rs b/vectorless-core/vectorless/src/index/parse/markdown/mod.rs deleted file mode 100644 index 168f3645..00000000 --- a/vectorless-core/vectorless/src/index/parse/markdown/mod.rs +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Production-ready Markdown parser module. -//! -//! This module provides a robust Markdown parser built on `pulldown-cmark`, -//! supporting CommonMark, GFM extensions, and frontmatter extraction. -//! -//! # Features -//! -//! - **CommonMark compliant** - Full CommonMark specification support -//! - **GFM extensions** - Tables, strikethrough, task lists, autolinks -//! - **Frontmatter** - YAML and TOML frontmatter parsing -//! - **Configurable** - Fine-grained control over parsing behavior -//! -//! # Example -//! -//! ```rust -//! use vectorless::parser::markdown::{MarkdownParser, MarkdownConfig}; -//! -//! let parser = MarkdownParser::new(); -//! // or with custom config: -//! // let parser = MarkdownParser::with_config(MarkdownConfig::gfm()); -//! 
``` - -mod config; -mod frontmatter; -mod parser; - -pub use parser::MarkdownParser; diff --git a/vectorless-core/vectorless/src/index/parse/markdown/parser.rs b/vectorless-core/vectorless/src/index/parse/markdown/parser.rs deleted file mode 100644 index 5bdf6a71..00000000 --- a/vectorless-core/vectorless/src/index/parse/markdown/parser.rs +++ /dev/null @@ -1,601 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Main Markdown parser implementation. - -use pulldown_cmark::Options; -use std::path::Path; - -use crate::error::Result; -use crate::index::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode}; -use crate::utils::estimate_tokens; - -use super::config::MarkdownConfig; -use super::frontmatter; - -/// Production-ready Markdown parser. -/// -/// Built on `pulldown-cmark` for robust CommonMark/GFM parsing. -/// -/// # Features -/// -/// - CommonMark compliant -/// - GitHub Flavored Markdown (GFM) extensions -/// - YAML/TOML frontmatter extraction -/// - Configurable parsing behavior -/// -/// # Example -/// -/// ```rust -/// use vectorless::parser::markdown::MarkdownParser; -/// use vectorless::parser::DocumentParser; -/// -/// # #[tokio::main] -/// # async fn main() -> vectorless::Result<()> { -/// let parser = MarkdownParser::new(); -/// let result = parser.parse("# Title\n\nContent").await?; -/// -/// println!("Found {} nodes", result.node_count()); -/// # Ok(()) -/// # } -/// ``` -#[derive(Debug, Clone)] -pub struct MarkdownParser { - config: MarkdownConfig, -} - -impl Default for MarkdownParser { - fn default() -> Self { - Self::new() - } -} - -impl MarkdownParser { - /// Create a new parser with default (GFM) configuration. - #[must_use] - pub fn new() -> Self { - Self::with_config(MarkdownConfig::default()) - } - - /// Create a parser with custom configuration. - #[must_use] - pub fn with_config(config: MarkdownConfig) -> Self { - Self { config } - } - - /// Build pulldown-cmark options from configuration. - fn build_options(&self) -> Options { - let mut options = Options::empty(); - - // GFM extensions - if self.config.enable_gfm { - options.insert(Options::ENABLE_TABLES); - options.insert(Options::ENABLE_STRIKETHROUGH); - options.insert(Options::ENABLE_TASKLISTS); - options.insert(Options::ENABLE_SMART_PUNCTUATION); - } - - // Footnotes - if self.config.enable_footnotes { - options.insert(Options::ENABLE_FOOTNOTES); - } - - // Definition lists - if self.config.enable_definition_lists { - options.insert(Options::ENABLE_DEFINITION_LIST); - } - - // Note: pulldown-cmark 0.12 doesn't have ENABLE_SUPERSCRIPT/ENABLE_SUBSCRIPT - // Super/subscript handling would require custom processing if needed - - options - } - - /// Parse Markdown content and extract nodes. - fn extract_nodes( - &self, - content: &str, - ) -> ( - Vec, - Option>, - ) { - // 1. Extract frontmatter (if present) - let (fm, remaining_content) = frontmatter::extract_frontmatter( - content, - self.config.parse_frontmatter, - self.config.parse_toml_frontmatter, - ); - - // 2. Build parser options - let options = self.build_options(); - - // 3. Parse with pulldown-cmark - let parser = pulldown_cmark::Parser::new_ext(remaining_content, options); - - // 4. Extract raw nodes from events - let nodes = self.extract_nodes_from_events(parser); - - // 5. 
Extract frontmatter fields - let fm_fields = fm.map(|f| { - self.config - .frontmatter_fields - .iter() - .filter_map(|field| f.get(field).map(|v| (field.clone(), v.clone()))) - .collect() - }); - - (nodes, fm_fields) - } - - /// Extract RawNodes from pulldown-cmark event iterator. - fn extract_nodes_from_events<'a, E>(&self, events: E) -> Vec - where - E: Iterator>, - { - use pulldown_cmark::{CodeBlockKind, Event, Tag, TagEnd}; - - let mut nodes: Vec = Vec::new(); - let mut current: Option = None; - let mut content_buffer = String::new(); - let mut title_buffer = String::new(); - let mut preamble_content = String::new(); - let mut current_line: usize = 1; - let mut in_heading = false; - let mut skip_content = false; - - for event in events { - match event { - Event::Start(tag) => match tag { - Tag::Heading { level, .. } => { - let level_num = level as usize; - - // Check if this heading level should be processed as a node - if level_num > self.config.max_heading_level - || level_num < self.config.min_heading_level - { - // Treat as content - add the heading marker to content - in_heading = false; - skip_content = false; - content_buffer.push_str(&format!("{} ", "#".repeat(level_num))); - continue; - } - - // Finish any current node first - if let Some(node) = finish_current_node( - &mut current, - &mut content_buffer, - &mut preamble_content, - &mut nodes, - &self.config, - current_line, - ) { - nodes.push(node); - } - - // Start new heading - in_heading = true; - title_buffer.clear(); - - current = Some(InProgressNode { - title: String::new(), - level: level_num, - line_start: current_line, - }); - } - Tag::CodeBlock(kind) => { - if self.config.include_code_blocks { - match kind { - CodeBlockKind::Fenced(lang) => { - content_buffer.push_str("\n```"); - content_buffer.push_str(&lang); - content_buffer.push('\n'); - } - CodeBlockKind::Indented => { - content_buffer.push_str("\n```\n"); - } - } - } else { - skip_content = true; - } - } - _ => {} - }, - Event::End(tag) => match tag { - TagEnd::Heading(_) => { - if in_heading { - in_heading = false; - if let Some(ref mut node) = current { - node.title = title_buffer.trim().to_string(); - title_buffer.clear(); - - if node.title.chars().count() < self.config.min_heading_chars { - current = None; - } - } - } - } - TagEnd::CodeBlock => { - skip_content = false; - if self.config.include_code_blocks { - content_buffer.push_str("\n```\n"); - } - } - _ => {} - }, - Event::Text(text) => { - current_line += text.chars().filter(|&c| c == '\n').count(); - - if in_heading { - title_buffer.push_str(&text); - } else if !skip_content { - content_buffer.push_str(&text); - } - } - Event::Code(code) => { - if !in_heading && !skip_content { - content_buffer.push('`'); - content_buffer.push_str(&code); - content_buffer.push('`'); - } - } - Event::Html(html) | Event::InlineHtml(html) => { - if !skip_content { - content_buffer.push_str(&html); - current_line += html.chars().filter(|&c| c == '\n').count(); - } - } - Event::SoftBreak => { - if !skip_content { - content_buffer.push(' '); - } - } - Event::HardBreak => { - if !skip_content { - content_buffer.push('\n'); - current_line += 1; - } - } - Event::Rule => { - if !skip_content { - content_buffer.push_str("\n\n---\n\n"); - } - } - _ => {} - } - } - - // Finish any remaining node - if let Some(node) = finish_current_node( - &mut current, - &mut content_buffer, - &mut preamble_content, - &mut nodes, - &self.config, - current_line, - ) { - nodes.push(node); - } - - // Handle document with no headings (only 
preamble) - if nodes.is_empty() - && self.config.create_preamble_node - && (!content_buffer.trim().is_empty() || !preamble_content.is_empty()) - { - // Use preamble_content if available, otherwise use content_buffer - let content = if preamble_content.is_empty() { - content_buffer.trim() - } else { - preamble_content.trim() - }; - nodes.push(RawNode { - title: self.config.preamble_title.clone(), - level: 0, - content: content.to_string(), - line_start: 1, - line_end: current_line, - page: None, - token_count: Some(estimate_tokens(content)), - total_token_count: None, - }); - } - - nodes - } -} - -/// In-progress node being constructed. -struct InProgressNode { - title: String, - level: usize, - line_start: usize, -} - -/// Finish the current node and return it if valid. -#[allow(clippy::too_many_arguments)] -fn finish_current_node( - current: &mut Option, - content_buffer: &mut String, - preamble_content: &mut String, - nodes: &mut Vec, - config: &MarkdownConfig, - current_line: usize, -) -> Option { - // Handle preamble content (content before first heading) - if nodes.is_empty() && !content_buffer.trim().is_empty() { - if config.create_preamble_node { - let content = content_buffer.trim(); - *preamble_content = content.to_string(); - } - // Clear the buffer after storing as preamble to avoid duplication - content_buffer.clear(); - } - - // Finish current heading node - if let Some(node) = current.take() { - let content = content_buffer.trim().to_string(); - - // If this is the first heading and we have preamble content, - // prepend it to this node's content - let final_content = if nodes.is_empty() && !preamble_content.is_empty() { - let combined = format!("{}\n\n{}", preamble_content, content); - preamble_content.clear(); - combined - } else { - content - }; - - content_buffer.clear(); - - return Some(RawNode { - title: node.title, - level: node.level, - content: final_content.trim().to_string(), - line_start: node.line_start, - line_end: current_line, - page: None, - token_count: Some(estimate_tokens(&final_content)), - total_token_count: None, - }); - } - - content_buffer.clear(); - None -} - -impl MarkdownParser { - /// Parse Markdown content and return result. - pub async fn parse(&self, content: &str) -> Result { - let line_count = content.lines().count(); - let (nodes, fm_fields) = self.extract_nodes(content); - - // Build metadata - let mut meta = DocumentMeta { - name: String::new(), - format: DocumentFormat::Markdown, - page_count: None, - line_count, - source_path: None, - description: None, - }; - - // Apply frontmatter fields - if let Some(fields) = fm_fields { - if let Some(title) = fields.get("title") { - meta.name = title.clone(); - } - if let Some(desc) = fields.get("description") { - meta.description = Some(desc.clone()); - } - } - - Ok(ParseResult::new(meta, nodes)) - } - - /// Parse a Markdown file. 
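
Before the file-based variant below, a sketch of calling `parse` directly on an in-memory string from an async context; it mirrors the frontmatter test further down:

```rust
let parser = MarkdownParser::new();
let result = parser.parse("---\ntitle: Demo\n---\n\n# Intro\n\nBody.").await?;

assert_eq!(result.meta.name, "Demo"); // frontmatter title becomes the document name
assert!(result.nodes.iter().any(|n| n.title == "Intro" && n.level == 1));
```
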
- pub async fn parse_file(&self, path: &Path) -> Result { - let content = tokio::fs::read_to_string(path) - .await - .map_err(|e| crate::Error::Parse(format!("Failed to read file: {}", e)))?; - - let mut result = self.parse(&content).await?; - - // Extract document name from filename (if not set by frontmatter) - if result.meta.name.is_empty() { - if let Some(stem) = path.file_stem() { - result.meta.name = stem.to_string_lossy().to_string(); - } - } - result.meta.source_path = Some(path.to_string_lossy().to_string()); - - Ok(result) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn test_parse_simple() { - let parser = MarkdownParser::new(); - let content = "# Title\n\nContent here."; - let result = parser.parse(content).await.unwrap(); - - assert!(!result.nodes.is_empty()); - assert!( - result - .nodes - .iter() - .any(|n| n.title == "Title" && n.level == 1) - ); - } - - #[tokio::test] - async fn test_parse_nested() { - let parser = MarkdownParser::new(); - let content = r#"# Main - -## Section 1 - -Content 1. - -## Section 2 - -Content 2."#; - let result = parser.parse(content).await.unwrap(); - - let heading_nodes: Vec<_> = result.nodes.iter().filter(|n| n.level > 0).collect(); - assert!(heading_nodes.len() >= 3); - } - - #[tokio::test] - async fn test_parse_code_blocks() { - let parser = MarkdownParser::new(); - let content = r#"# Code Example - -```rust -fn main() { - println!("Hello"); -} -```"#; - let result = parser.parse(content).await.unwrap(); - - // Should have the heading node - let heading_node = result.nodes.iter().find(|n| n.title == "Code Example"); - assert!(heading_node.is_some()); - - // Code block should be in content - assert!(heading_node.unwrap().content.contains("```rust")); - } - - #[tokio::test] - async fn test_skip_headers_in_code_blocks() { - let parser = MarkdownParser::new(); - let content = r#"# Title 1 - -Content before code. 
- -``` -# This is not a header -# Also not a header -``` - -## Title 1.1 - -Content after code."#; - - let result = parser.parse(content).await.unwrap(); - - // Should only have Title 1 and Title 1.1 as heading nodes - let heading_titles: Vec<_> = result - .nodes - .iter() - .filter(|n| n.level > 0) - .map(|n| n.title.as_str()) - .collect(); - - assert!(heading_titles.contains(&"Title 1")); - assert!(heading_titles.contains(&"Title 1.1")); - assert!(!heading_titles.contains(&"This is not a header")); - } - - #[tokio::test] - async fn test_frontmatter_extraction() { - let parser = MarkdownParser::new(); - let content = r#"--- -title: My Document -description: A test document ---- - -# Content - -Body text."#; - - let result = parser.parse(content).await.unwrap(); - - assert_eq!(result.meta.name, "My Document"); - assert_eq!(result.meta.description, Some("A test document".to_string())); - } - - #[tokio::test] - async fn test_gfm_table() { - let parser = MarkdownParser::new(); - let content = r#"# Table Example - -| Name | Age | -|------|-----| -| Alice | 30 | -| Bob | 25 |"#; - - let result = parser.parse(content).await.unwrap(); - - let table_node = result.nodes.iter().find(|n| n.title == "Table Example"); - assert!(table_node.is_some()); - assert!(table_node.unwrap().content.contains("Alice")); - } - - #[tokio::test] - async fn test_max_heading_level_config() { - let config = MarkdownConfig { - max_heading_level: 2, - ..Default::default() - }; - let parser = MarkdownParser::with_config(config); - - let content = r#"# H1 - -## H2 - -### H3 - -#### H4"#; - - let result = parser.parse(content).await.unwrap(); - - // H3 and H4 should not be separate nodes - let heading_nodes: Vec<_> = result.nodes.iter().filter(|n| n.level > 0).collect(); - assert_eq!(heading_nodes.len(), 2); - } - - #[tokio::test] - async fn test_no_code_blocks_config() { - let config = MarkdownConfig::no_code_blocks(); - let parser = MarkdownParser::with_config(config); - - let content = r#"# Example - -```rust -let x = 1; -``` - -Some text."#; - - let result = parser.parse(content).await.unwrap(); - - let node = result.nodes.iter().find(|n| n.title == "Example").unwrap(); - // Code block should not be in content - assert!(!node.content.contains("let x = 1")); - // But regular text should be - assert!(node.content.contains("Some text")); - } - - #[tokio::test] - async fn test_empty_document() { - let parser = MarkdownParser::new(); - let result = parser.parse("").await.unwrap(); - - assert!(result.nodes.is_empty()); - } - - #[tokio::test] - async fn test_document_with_no_headings() { - let parser = MarkdownParser::new(); - let content = "Just some text\nwith no headings."; - - let result = parser.parse(content).await.unwrap(); - - assert_eq!(result.nodes.len(), 1); - assert_eq!(result.nodes[0].title, "Introduction"); - assert_eq!(result.nodes[0].level, 0); - } -} diff --git a/vectorless-core/vectorless/src/index/parse/mod.rs b/vectorless-core/vectorless/src/index/parse/mod.rs deleted file mode 100644 index 0bcba9f4..00000000 --- a/vectorless-core/vectorless/src/index/parse/mod.rs +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Document parsing for the index pipeline. -//! -//! Supports Markdown and PDF formats. Parsing is dispatched directly -//! via `match` — no trait objects or registry needed. -//! -//! # Quick parse -//! -//! ```rust,ignore -//! use vectorless::index::parse::{parse_content, parse_bytes, DocumentFormat}; -//! -//! 
let result = parse_content("# Title\nContent", DocumentFormat::Markdown).await?; -//! let result = parse_bytes(&pdf_bytes, DocumentFormat::Pdf).await?; -//! ``` - -pub mod markdown; -pub mod pdf; -pub mod toc; -pub mod types; - -// Re-export core types at module level -pub use types::{DocumentFormat, DocumentMeta, ParseResult, RawNode}; - -use std::path::Path; - -use crate::error::Result; -use crate::index::parse::markdown::MarkdownParser; -use crate::llm::LlmClient; - -/// Parse a string content document. -pub async fn parse_content( - content: &str, - format: DocumentFormat, - _llm_client: Option, -) -> Result { - match format { - DocumentFormat::Markdown => { - let parser = MarkdownParser::new(); - parser.parse(content).await - } - DocumentFormat::Pdf => Err(crate::Error::Parse( - "PDF requires bytes, not string content".to_string(), - )), - } -} - -/// Parse a file. -pub async fn parse_file( - path: &Path, - format: DocumentFormat, - llm_client: Option, -) -> Result { - match format { - DocumentFormat::Markdown => { - let parser = MarkdownParser::new(); - parser.parse_file(path).await - } - DocumentFormat::Pdf => { - let parser = match llm_client { - Some(client) => pdf::PdfParser::with_llm_client(client), - None => pdf::PdfParser::new(), - }; - parser.parse_file(path).await - } - } -} - -/// Parse binary data. -pub async fn parse_bytes( - bytes: &[u8], - format: DocumentFormat, - llm_client: Option, -) -> Result { - match format { - DocumentFormat::Markdown => { - let content = std::str::from_utf8(bytes) - .map_err(|e| crate::Error::Parse(format!("Invalid UTF-8 content: {}", e)))?; - let parser = MarkdownParser::new(); - parser.parse(content).await - } - DocumentFormat::Pdf => { - let parser = match llm_client { - Some(client) => pdf::PdfParser::with_llm_client(client), - None => pdf::PdfParser::new(), - }; - parser.parse_bytes_async(bytes, None).await - } - } -} - -/// Detect document format from a file extension. -pub fn format_from_extension(ext: &str) -> Option { - DocumentFormat::from_extension(ext) -} diff --git a/vectorless-core/vectorless/src/index/parse/pdf/mod.rs b/vectorless-core/vectorless/src/index/parse/pdf/mod.rs deleted file mode 100644 index dc92da86..00000000 --- a/vectorless-core/vectorless/src/index/parse/pdf/mod.rs +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! PDF document parsing module. -//! -//! This module provides functionality to parse PDF documents: -//! - **PdfPage** — Single page with text and metadata -//! - **PdfParser** — Extract pages from PDF files -//! -//! # Example -//! -//! ```rust,no_run -//! use vectorless::parser::pdf::{PdfParser, PdfPage}; -//! use std::path::Path; -//! -//! # fn main() -> vectorless::Result<()> { -//! let parser = PdfParser::new(); -//! let result = parser.parse_file(Path::new("document.pdf"))?; -//! -//! println!("Pages: {}", result.pages.len()); -//! for page in &result.pages { -//! println!("Page {}: {} tokens", page.number, page.token_count); -//! } -//! # Ok(()) -//! # } -//! ``` - -mod parser; -mod types; - -pub use parser::PdfParser; -pub use types::PdfPage; diff --git a/vectorless-core/vectorless/src/index/parse/pdf/parser.rs b/vectorless-core/vectorless/src/index/parse/pdf/parser.rs deleted file mode 100644 index a3327cc0..00000000 --- a/vectorless-core/vectorless/src/index/parse/pdf/parser.rs +++ /dev/null @@ -1,366 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! PDF document parser. -//! 
-//! Uses [`pdf_extract`] for reliable text extraction (handles CJK, ToUnicode -//! CMap, font encoding, etc.) and [`lopdf`] only for metadata extraction from -//! the PDF Info dictionary. - -use std::path::Path; - -use lopdf::Document as LopdfDocument; -use tracing::{info, warn}; - -use crate::Error; -use crate::error::Result; -use crate::index::parse::toc::TocProcessor; -use crate::llm::LlmClient; - -use super::types::{PdfMetadata, PdfPage, PdfParseResult}; -use crate::index::parse::{DocumentFormat, DocumentMeta, ParseResult, RawNode}; - -/// PDF document parser. -pub struct PdfParser { - config: PdfParserConfig, - /// Optional LLM client for TOC extraction and structure analysis. - llm_client: Option, -} - -/// PDF parser configuration. -#[derive(Debug, Clone)] -pub struct PdfParserConfig { - /// Maximum pages to extract (0 = unlimited). - pub max_pages: usize, - - /// Enable TOC extraction. - pub extract_toc: bool, -} - -impl Default for PdfParserConfig { - fn default() -> Self { - Self { - max_pages: 0, - extract_toc: true, - } - } -} - -impl PdfParser { - /// Create a new PDF parser with default configuration. - pub fn new() -> Self { - Self::default() - } - - /// Create a PDF parser with an externally provided LLM client. - pub fn with_llm_client(client: LlmClient) -> Self { - Self { - config: PdfParserConfig::default(), - llm_client: Some(client), - } - } - - /// Create a parser with custom configuration. - pub fn with_config(config: PdfParserConfig) -> Self { - Self { - config, - llm_client: None, - } - } - - /// Create a parser without TOC extraction. - pub fn without_toc() -> Self { - Self { - config: PdfParserConfig { - extract_toc: false, - ..Default::default() - }, - llm_client: None, - } - } - - /// Parse PDF from bytes and return raw pages. - pub async fn parse_bytes_raw( - &self, - bytes: &[u8], - filename: Option<&str>, - ) -> Result { - // Use pdf-extract for text (handles CJK, ToUnicode CMap, etc.) - let pages = self.extract_pages(bytes)?; - - // Use lopdf only for metadata; fall back gracefully if it fails - let metadata = match LopdfDocument::load_mem(bytes) { - Ok(doc) => self.extract_metadata(&doc, filename), - Err(_) => PdfMetadata { - title: filename.unwrap_or("Document").to_string(), - page_count: pages.len(), - ..Default::default() - }, - }; - - Ok(PdfParseResult::new(metadata, pages)) - } - - /// Extract text from all pages using pdf-extract. - fn extract_pages(&self, bytes: &[u8]) -> Result> { - let page_texts = pdf_extract::extract_text_from_mem_by_pages(bytes) - .map_err(|e| Error::Parse(format!("pdf-extract failed: {}", e)))?; - - let mut pages = Vec::new(); - for (i, text) in page_texts.iter().enumerate() { - if self.config.max_pages > 0 && i >= self.config.max_pages { - break; - } - let page_num = i + 1; // 1-based - if !text.trim().is_empty() { - pages.push(PdfPage::new(page_num, text.clone())); - } - } - - Ok(pages) - } - - /// Extract metadata from PDF Info dictionary via lopdf. 
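
The extraction half of `parse_bytes_raw` is worth seeing in isolation. A standalone sketch built on the same `pdf_extract` call the deleted code uses; `sample.pdf` is illustrative, and error handling is reduced to a string.

```rust
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let bytes = std::fs::read("sample.pdf")?;
    // Same call the parser uses: one String per page, with CJK/ToUnicode
    // handling done inside pdf_extract.
    let pages = pdf_extract::extract_text_from_mem_by_pages(&bytes)
        .map_err(|e| format!("pdf-extract failed: {e}"))?;
    for (i, text) in pages.iter().enumerate() {
        // Page numbers are 1-based in PdfPage; blank pages are dropped upstream.
        if !text.trim().is_empty() {
            println!("page {}: {} chars", i + 1, text.chars().count());
        }
    }
    Ok(())
}
```
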
- fn extract_metadata(&self, doc: &LopdfDocument, filename: Option<&str>) -> PdfMetadata { - let mut metadata = PdfMetadata { - title: filename.unwrap_or("Document").to_string(), - page_count: doc.get_pages().len(), - ..Default::default() - }; - - if let Ok(info) = doc.trailer.get(b"Info") { - if let Ok(info_ref) = info.as_reference() { - if let Ok(info_obj) = doc.get_object(info_ref) { - if let Ok(dict) = info_obj.as_dict() { - if let Ok(title_obj) = dict.get(b"Title") { - if let Ok(title) = title_obj.as_str() { - metadata.title = self.decode_pdf_string(title); - } - } - - if let Ok(author_obj) = dict.get(b"Author") { - if let Ok(author) = author_obj.as_str() { - metadata.author = Some(self.decode_pdf_string(author)); - } - } - - if let Ok(subject_obj) = dict.get(b"Subject") { - if let Ok(subject) = subject_obj.as_str() { - metadata.subject = Some(self.decode_pdf_string(subject)); - } - } - } - } - } - } - - metadata - } - - /// Decode PDF string literal (handles escape sequences). - /// - /// Used only for metadata field values extracted via lopdf. - fn decode_pdf_string(&self, bytes: &[u8]) -> String { - let mut result = String::new(); - let mut i = 0; - - while i < bytes.len() { - match bytes[i] { - b'\\' if i + 1 < bytes.len() => { - i += 1; - match bytes[i] { - b'n' => result.push('\n'), - b'r' => result.push('\r'), - b't' => result.push('\t'), - b'(' => result.push('('), - b')' => result.push(')'), - b'\\' => result.push('\\'), - _ => {} - } - } - b if b >= 32 && b < 127 => { - result.push(b as char); - } - _ => {} - } - i += 1; - } - - result - } - - /// Convert TOC entries to RawNodes. - fn toc_entries_to_raw_nodes( - &self, - entries: &[crate::index::parse::toc::TocEntry], - pages: &[PdfPage], - ) -> Vec { - let mut nodes = Vec::new(); - - for entry in entries { - let content = self.get_content_for_entry(entry, pages); - - let mut node = RawNode::new(&entry.title) - .with_content(content) - .with_level(entry.level); - - if let Some(page) = entry.physical_page { - node = node.with_page(page); - } - - nodes.push(node); - } - - nodes - } - - /// Get content for a TOC entry from pages. - fn get_content_for_entry( - &self, - entry: &crate::index::parse::toc::TocEntry, - pages: &[PdfPage], - ) -> String { - let start_page = entry.physical_page.unwrap_or(1); - - pages - .iter() - .find(|p| p.number == start_page) - .map(|p| { - let text = &p.text; - if let Some(pos) = text.find(&entry.title) { - text[pos + entry.title.len()..].trim().to_string() - } else { - text.clone() - } - }) - .unwrap_or_default() - } - - /// Create RawNodes from pages (fallback when no TOC). - fn pages_to_raw_nodes(&self, pages: &[PdfPage]) -> Vec { - pages - .iter() - .map(|page| { - RawNode::new(format!("Page {}", page.number)) - .with_content(page.text.clone()) - .with_level(1) - .with_page(page.number) - }) - .collect() - } -} - -impl Default for PdfParser { - fn default() -> Self { - Self::with_config(PdfParserConfig::default()) - } -} - -impl PdfParser { - /// Parse a PDF file into raw nodes for the index pipeline. - pub async fn parse_file(&self, path: &Path) -> Result { - let bytes = tokio::fs::read(path) - .await - .map_err(|e| Error::Parse(format!("Failed to read PDF file: {}", e)))?; - let filename = path.file_stem().and_then(|s| s.to_str()); - self.parse_bytes_to_result(&bytes, filename, Some(path)) - .await - } - - /// Parse PDF bytes into raw nodes for the index pipeline. 
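
The escape decoding above is the kind of loop that quietly regresses, so here it is restated as a self-contained function with worked inputs. It mirrors the deleted `decode_pdf_string` (minus the receiver); note that bytes outside the printable ASCII range are dropped, so non-ASCII metadata strings decode lossily.

```rust
/// Decode a PDF literal string: handle \n, \r, \t, \(, \), \\ and keep
/// only printable ASCII, the same rules as the deleted method.
fn decode_pdf_literal(bytes: &[u8]) -> String {
    let mut out = String::new();
    let mut i = 0;
    while i < bytes.len() {
        match bytes[i] {
            b'\\' if i + 1 < bytes.len() => {
                i += 1;
                match bytes[i] {
                    b'n' => out.push('\n'),
                    b'r' => out.push('\r'),
                    b't' => out.push('\t'),
                    b'(' => out.push('('),
                    b')' => out.push(')'),
                    b'\\' => out.push('\\'),
                    _ => {} // unknown escapes are dropped, as in the original
                }
            }
            b @ 32..=126 => out.push(b as char),
            _ => {} // non-printable and non-ASCII bytes are skipped
        }
        i += 1;
    }
    out
}

fn main() {
    assert_eq!(decode_pdf_literal(b"Hello\\nWorld"), "Hello\nWorld");
    assert_eq!(decode_pdf_literal(br"Title \(draft\)"), "Title (draft)");
}
```
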
- pub async fn parse_bytes_async( - &self, - bytes: &[u8], - filename: Option<&str>, - ) -> Result { - self.parse_bytes_to_result(bytes, filename, None).await - } - - /// Core async parsing logic shared by parse_file and parse_bytes_async. - async fn parse_bytes_to_result( - &self, - bytes: &[u8], - filename: Option<&str>, - source_path: Option<&Path>, - ) -> Result { - let result = self.parse_bytes_raw(bytes, filename).await?; - let page_count = result.pages.len(); - - // Try TOC extraction if enabled - let nodes = if self.config.extract_toc { - info!("Extracting TOC from PDF with {} pages", page_count); - - let processor = match &self.llm_client { - Some(client) => { - info!("PdfParser: creating TocProcessor with LLM client"); - TocProcessor::with_llm_client(client.clone()) - } - None => { - info!( - "PdfParser: creating TocProcessor without LLM client (no key configured)" - ); - TocProcessor::new() - } - }; - match processor.process(&result.pages).await { - Ok(entries) if !entries.is_empty() => { - info!("Extracted {} TOC entries", entries.len()); - self.toc_entries_to_raw_nodes(&entries, &result.pages) - } - Ok(_) => { - warn!("No TOC entries found, falling back to page-based extraction"); - self.pages_to_raw_nodes(&result.pages) - } - Err(e) => { - warn!( - "TOC extraction failed: {}, falling back to page-based extraction", - e - ); - self.pages_to_raw_nodes(&result.pages) - } - } - } else { - self.pages_to_raw_nodes(&result.pages) - }; - - let meta = DocumentMeta { - name: result.metadata.title, - format: DocumentFormat::Pdf, - page_count: Some(page_count), - line_count: 0, - source_path: source_path.map(|p| p.to_string_lossy().to_string()), - description: result.metadata.subject, - }; - - Ok(ParseResult::new(meta, nodes)) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parser_creation() { - let parser = PdfParser::new(); - assert_eq!(parser.config.max_pages, 0); - assert!(parser.config.extract_toc); - } - - #[test] - fn test_parser_without_toc() { - let parser = PdfParser::without_toc(); - assert!(!parser.config.extract_toc); - } - - #[test] - fn test_decode_pdf_string() { - let parser = PdfParser::new(); - - let decoded = parser.decode_pdf_string(b"Hello World"); - assert_eq!(decoded, "Hello World"); - - let decoded = parser.decode_pdf_string(b"Hello\\nWorld"); - assert_eq!(decoded, "Hello\nWorld"); - } -} diff --git a/vectorless-core/vectorless/src/index/parse/pdf/types.rs b/vectorless-core/vectorless/src/index/parse/pdf/types.rs deleted file mode 100644 index 3b978836..00000000 --- a/vectorless-core/vectorless/src/index/parse/pdf/types.rs +++ /dev/null @@ -1,171 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! PDF document types. - -use crate::utils::estimate_tokens; -use serde::{Deserialize, Serialize}; - -/// A single page from a PDF document. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct PdfPage { - /// Page number (1-based). - pub number: usize, - - /// Text content of the page. - pub text: String, - - /// Estimated token count. - pub token_count: usize, -} - -impl PdfPage { - /// Create a new PDF page. - pub fn new(number: usize, text: impl Into) -> Self { - let text = text.into(); - let token_count = estimate_tokens(&text); - Self { - number, - text, - token_count, - } - } - - /// Check if the page is empty. - pub fn is_empty(&self) -> bool { - self.text.trim().is_empty() - } - - /// Get character count. 
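
A quick look at what a `PdfPage` carries, since its fields and helpers are split across this hunk and the next. The import path follows the deleted pdf/mod.rs docs; `token_count` is computed once at construction via `estimate_tokens`.

```rust
use vectorless::parser::pdf::PdfPage; // path per the deleted pdf/mod.rs docs

fn main() {
    let page = PdfPage::new(1, "The quick brown fox.");
    println!(
        "page {}: {} tokens, {} words, empty = {}",
        page.number,
        page.token_count,
        page.word_count(),
        page.is_empty()
    );
}
```
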
-    pub fn char_count(&self) -> usize {
-        self.text.chars().count()
-    }
-
-    /// Get word count (approximate).
-    pub fn word_count(&self) -> usize {
-        self.text.split_whitespace().count()
-    }
-}
-
-/// PDF document metadata.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PdfMetadata {
-    /// Document title (from metadata or filename).
-    pub title: String,
-
-    /// Total page count.
-    pub page_count: usize,
-
-    /// Author (if available).
-    pub author: Option<String>,
-
-    /// Subject/description (if available).
-    pub subject: Option<String>,
-
-    /// Creator application (if available).
-    pub creator: Option<String>,
-
-    /// Producer application (if available).
-    pub producer: Option<String>,
-}
-
-impl Default for PdfMetadata {
-    fn default() -> Self {
-        Self {
-            title: String::new(),
-            page_count: 0,
-            author: None,
-            subject: None,
-            creator: None,
-            producer: None,
-        }
-    }
-}
-
-/// Result of parsing a PDF document.
-#[derive(Debug, Clone)]
-pub struct PdfParseResult {
-    /// Document metadata.
-    pub metadata: PdfMetadata,
-
-    /// Extracted pages.
-    pub pages: Vec<PdfPage>,
-
-    /// Total token count across all pages.
-    pub total_tokens: usize,
-}
-
-impl PdfParseResult {
-    /// Create a new parse result.
-    pub fn new(metadata: PdfMetadata, pages: Vec<PdfPage>) -> Self {
-        let total_tokens = pages.iter().map(|p| p.token_count).sum();
-        Self {
-            metadata,
-            pages,
-            total_tokens,
-        }
-    }
-
-    /// Check if the document is empty.
-    pub fn is_empty(&self) -> bool {
-        self.pages.is_empty()
-    }
-
-    /// Get a page by number (1-based).
-    pub fn get_page(&self, number: usize) -> Option<&PdfPage> {
-        if number == 0 || number > self.pages.len() {
-            return None;
-        }
-        self.pages.get(number - 1)
-    }
-
-    /// Get text for a page range (inclusive, 1-based).
-    pub fn get_page_range_text(&self, start: usize, end: usize) -> String {
-        let start = start.max(1);
-        let end = end.min(self.pages.len());
-
-        self.pages[start - 1..end]
-            .iter()
-            .map(|p| format!("<page_{}>\n{}\n</page_{}>\n\n", p.number, p.text, p.number))
-            .collect()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_pdf_page_creation() {
-        let page = PdfPage::new(1, "Hello world");
-        assert_eq!(page.number, 1);
-        assert_eq!(page.text, "Hello world");
-        assert!(page.token_count > 0);
-    }
-
-    #[test]
-    fn test_estimate_tokens() {
-        // Uses tiktoken for accurate counting
-        assert_eq!(estimate_tokens(""), 0);
-        // "hi" is 1 token in tiktoken
-        assert_eq!(estimate_tokens("hi"), 1);
-        // tiktoken is efficient at encoding text - just verify it returns a positive count
-        let hundred_as = "a".repeat(100);
-        assert!(estimate_tokens(&hundred_as) >= 1);
-    }
-
-    #[test]
-    fn test_page_range_text() {
-        let pages = vec![
-            PdfPage::new(1, "Page 1 content"),
-            PdfPage::new(2, "Page 2 content"),
-            PdfPage::new(3, "Page 3 content"),
-        ];
-        let result = PdfParseResult::new(PdfMetadata::default(), pages);
-
-        let text = result.get_page_range_text(1, 2);
-        assert!(text.contains("Page 1 content"));
-        assert!(text.contains("Page 2 content"));
-        assert!(!text.contains("Page 3 content"));
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/parse/toc/assigner.rs b/vectorless-core/vectorless/src/index/parse/toc/assigner.rs
deleted file mode 100644
index 267cda18..00000000
--- a/vectorless-core/vectorless/src/index/parse/toc/assigner.rs
+++ /dev/null
@@ -1,395 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Page assigner - assigns physical page numbers to TOC entries.
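
The offset idea deserves a concrete number: printed TOC pages and physical pages disagree whenever front matter (cover, copyright, the TOC itself) precedes printed page 1. A minimal restatement, assuming `PageOffset::apply` is plain integer addition; only its call sites appear in this patch, and the real caller additionally clamps to the page count.

```rust
// Local stand-in for the PageOffset type deleted elsewhere in this patch.
struct PageOffset {
    offset: i32,
}

impl PageOffset {
    fn apply(&self, toc_page: usize) -> usize {
        // Assumed semantics: shift by the offset, never below page 1.
        (toc_page as i32 + self.offset).max(1) as usize
    }
}

fn main() {
    // Three pages of front matter push everything back.
    let offset = PageOffset { offset: 3 };
    assert_eq!(offset.apply(1), 4); // "page 1" in the TOC is physical page 4
    assert_eq!(offset.apply(10), 13);
}
```
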
- -use futures::stream::{self, StreamExt}; -use std::collections::HashMap; -use tracing::{debug, info}; - -use crate::error::Result; -use crate::index::parse::pdf::PdfPage; -use crate::llm::config::LlmConfig; - -use super::types::{PageOffset, TocEntry}; -use crate::llm::LlmClient; - -/// Page assigner configuration. -#[derive(Debug, Clone)] -pub struct PageAssignerConfig { - /// Number of anchor points for offset calculation. - pub anchor_count: usize, - - /// LLM configuration. - pub llm_config: LlmConfig, - - /// Maximum offset variance allowed. - pub max_offset_variance: usize, -} - -impl Default for PageAssignerConfig { - fn default() -> Self { - Self { - anchor_count: 5, - llm_config: LlmConfig::default(), - max_offset_variance: 3, - } - } -} - -/// Page assigner - assigns physical page numbers to TOC entries. -pub struct PageAssigner { - config: PageAssignerConfig, - client: LlmClient, -} - -impl PageAssigner { - /// Create a new page assigner. - pub fn new(config: PageAssignerConfig) -> Self { - let client = LlmClient::new(config.llm_config.clone().into()); - Self { config, client } - } - - /// Create an assigner with an externally provided LLM client. - pub fn with_client(client: LlmClient) -> Self { - Self { - config: PageAssignerConfig::default(), - client, - } - } - - /// Create an assigner with default configuration. - pub fn with_defaults() -> Self { - Self::new(PageAssignerConfig::default()) - } - - /// Assign physical pages to TOC entries. - /// - /// Strategy: - /// 1. If entries have TOC pages → calculate offset → apply offset - /// 2. If no TOC pages → use LLM to locate each entry - pub async fn assign(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> { - if entries.is_empty() { - return Ok(()); - } - - // Check if we have TOC page numbers - let has_toc_pages = entries.iter().any(|e| e.toc_page.is_some()); - - if has_toc_pages { - self.assign_with_offset(entries, pages).await - } else { - self.assign_with_llm(entries, pages).await - } - } - - /// Assign pages using offset calculation. - async fn assign_with_offset(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> { - info!("Assigning pages using offset calculation"); - - // Step 1: Select anchor entries - let anchors = self.select_anchors(entries, self.config.anchor_count); - - // Step 2: Verify anchors and calculate offset - let offset = self.calculate_offset(anchors, pages).await?; - - if offset.confidence < 0.5 { - debug!("Offset confidence too low, falling back to LLM positioning"); - return self.assign_with_llm(entries, pages).await; - } - - info!( - "Calculated offset: {} (confidence: {})", - offset.offset, offset.confidence - ); - - // Step 3: Apply offset to all entries - for entry in entries.iter_mut() { - if let Some(toc_page) = entry.toc_page { - let physical = offset.apply(toc_page); - entry.physical_page = Some(physical.min(pages.len())); - } - } - - Ok(()) - } - - /// Select anchor entries for offset calculation. - fn select_anchors<'a>(&self, entries: &'a [TocEntry], count: usize) -> Vec<&'a TocEntry> { - // Select entries with TOC pages, evenly distributed - let with_pages: Vec<_> = entries.iter().filter(|e| e.toc_page.is_some()).collect(); - - if with_pages.len() <= count { - return with_pages; - } - - // Select evenly distributed entries - let step = with_pages.len() as f32 / count as f32; - (0..count) - .map(|i| with_pages[(i as f32 * step) as usize]) - .collect() - } - - /// Calculate page offset by verifying anchors concurrently. 
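
The anchor-verification step coming up reduces to: collect one offset per verified anchor, then take the most common value. Here is that mode computation with a worked input; it mirrors `calculate_mode_static` below.

```rust
use std::collections::HashMap;

// Count each observed anchor offset and keep the most frequent one.
fn mode(values: &[i32]) -> i32 {
    let mut counts: HashMap<i32, usize> = HashMap::new();
    for &v in values {
        *counts.entry(v).or_insert(0) += 1;
    }
    counts
        .into_iter()
        .max_by_key(|&(_, count)| count)
        .map(|(v, _)| v)
        .unwrap_or(0)
}

fn main() {
    // Five anchors verified: four agree the text sits 3 pages after its
    // printed TOC number, one is noise.
    let observed = [3, 3, 2, 3, 3];
    assert_eq!(mode(&observed), 3);

    // Confidence is verified anchors / total anchors (see calculate_offset).
    let confidence = observed.len() as f32 / 5.0;
    assert!(confidence >= 0.5); // below 0.5, the assigner falls back to LLM positioning
}
```
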
-    async fn calculate_offset(
-        &self,
-        anchors: Vec<&TocEntry>,
-        pages: &[PdfPage],
-    ) -> Result<PageOffset> {
-        if anchors.is_empty() {
-            return Ok(PageOffset::new(0, 0, 0.0));
-        }
-
-        let anchor_count = anchors.len();
-
-        // Verify all anchors concurrently
-        let client = self.client.clone();
-        let pages_owned = pages.to_vec();
-        let futures: Vec<_> = anchors
-            .into_iter()
-            .map(|anchor| {
-                let title = anchor.title.clone();
-                let toc_page = anchor.toc_page.unwrap();
-                let client = client.clone();
-                let pages = pages_owned.clone();
-
-                async move {
-                    let range_pages = Self::pages_around(&pages, toc_page, 3);
-                    if range_pages.is_empty() {
-                        return (0, false);
-                    }
-
-                    let content = Self::format_range_pages(&range_pages);
-                    match Self::locate_with_client(&client, &title, &content).await {
-                        Ok(Some(physical)) => {
-                            let offset = physical as i32 - toc_page as i32;
-                            debug!(
-                                "Anchor '{}' found: toc={}, physical={}, offset={}",
-                                title, toc_page, physical, offset
-                            );
-                            (offset, true)
-                        }
-                        _ => (0, false),
-                    }
-                }
-            })
-            .collect();
-
-        let verified_offsets: Vec<_> = stream::iter(futures).buffer_unordered(5).collect().await;
-
-        // Calculate the mode (most common offset)
-        let successful: Vec<_> = verified_offsets
-            .iter()
-            .filter(|(_, success)| *success)
-            .map(|(offset, _)| *offset)
-            .collect();
-
-        if successful.is_empty() {
-            return Ok(PageOffset::new(0, 0, 0.0));
-        }
-
-        let mode = Self::calculate_mode_static(&successful);
-        let sample_count = successful.len();
-        let confidence = sample_count as f32 / anchor_count as f32;
-
-        Ok(PageOffset::new(mode, sample_count, confidence))
-    }
-
-    /// Calculate mode of offset values.
-    fn calculate_mode(&self, values: &[i32]) -> i32 {
-        Self::calculate_mode_static(values)
-    }
-
-    /// Static version for use in concurrent contexts.
-    fn calculate_mode_static(values: &[i32]) -> i32 {
-        let mut counts: HashMap<i32, usize> = HashMap::new();
-        for &v in values {
-            *counts.entry(v).or_insert(0) += 1;
-        }
-        counts
-            .into_iter()
-            .max_by_key(|&(_, count)| count)
-            .map(|(v, _)| v)
-            .unwrap_or(0)
-    }
-
-    /// Collect pages around a center page number.
-    fn pages_around(pages: &[PdfPage], center: usize, range: usize) -> Vec<PdfPage> {
-        let start = center.saturating_sub(range).max(1);
-        let end = (center + range).min(pages.len());
-        (start..=end)
-            .filter_map(|i| pages.get(i - 1).cloned())
-            .collect()
-    }
-
-    /// Format pages into tagged text for LLM.
-    fn format_range_pages(pages: &[PdfPage]) -> String {
-        pages
-            .iter()
-            .map(|p| {
-                format!(
-                    "<page_{}>\n{}\n</page_{}>",
-                    p.number,
-                    &p.text[..p.text.len().min(500)],
-                    p.number
-                )
-            })
-            .collect::<Vec<_>>()
-            .join("\n\n")
-    }
-
-    /// Locate a title in pre-formatted content using LLM (static, for concurrent use).
-    async fn locate_with_client(
-        client: &LlmClient,
-        title: &str,
-        content: &str,
-    ) -> Result<Option<usize>> {
-        let system = "You are a document analysis assistant. Find which page contains a specific section title.";
-        let user = format!(
-            r#"Find which page contains the section titled: "{}"
-
-Pages:
-{}
-
-Reply in JSON format:
-{{"page": <page number or null>}}"#,
-            title, content
-        );
-
-        #[derive(serde::Deserialize)]
-        struct LocateResult {
-            page: Option<usize>,
-        }
-
-        let result: LocateResult = client.complete_json(system, &user).await?;
-        Ok(result.page)
-    }
-
-    /// Assign pages using LLM for each entry (with bounded concurrency).
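
Every fan-out in this file has the same shape: build the futures eagerly, then drive at most five at a time with `buffer_unordered`. One caveat worth flagging for reviewers: `buffer_unordered` yields results in completion order, so pairing outputs back to inputs needs an explicit index (as `refine_large_entries` does later with `(i, result)` tuples); the plain `zip` in `assign_with_llm` below assumes input order is preserved. A minimal sketch of the indexed variant:

```rust
use futures::stream::{self, StreamExt};

#[tokio::main]
async fn main() {
    let titles = ["Intro", "Methods", "Results"];

    // Build all futures up front; tag each with its index so results can be
    // matched back to inputs regardless of completion order.
    let futures: Vec<_> = titles
        .iter()
        .enumerate()
        .map(|(i, title)| async move {
            // stand-in for one LLM locate call
            (i, title.len())
        })
        .collect();

    // At most five futures in flight; results arrive as they finish.
    let mut located: Vec<(usize, usize)> =
        stream::iter(futures).buffer_unordered(5).collect().await;
    located.sort_by_key(|&(i, _)| i); // restore input order
    println!("{located:?}");
}
```
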
-    async fn assign_with_llm(&self, entries: &mut [TocEntry], pages: &[PdfPage]) -> Result<()> {
-        info!("Assigning pages using LLM positioning");
-
-        let client = self.client.clone();
-        let pages_owned = pages.to_vec();
-        let total = entries.len();
-
-        // Launch entry searches with bounded concurrency to avoid rate limiting
-        let futures: Vec<_> = entries
-            .iter()
-            .map(|entry| {
-                let title = entry.title.clone();
-                let client = client.clone();
-                let pages = pages_owned.clone();
-
-                async move {
-                    let groups = Self::group_pages_owned(&pages, 5);
-                    Self::locate_title_in_groups_static(&client, &title, &groups).await
-                }
-            })
-            .collect();
-
-        let results: Vec<_> = stream::iter(futures).buffer_unordered(5).collect().await;
-
-        info!("Assigned pages for {}/{} entries", results.len(), total);
-
-        // Write results back
-        for (entry, result) in entries.iter_mut().zip(results.into_iter()) {
-            let physical = result?;
-            entry.physical_page = physical;
-            entry.confidence = if physical.is_some() { 0.8 } else { 0.3 };
-        }
-
-        Ok(())
-    }
-
-    /// Group owned pages for batch processing.
-    fn group_pages_owned(pages: &[PdfPage], group_size: usize) -> Vec<Vec<PdfPage>> {
-        pages
-            .chunks(group_size)
-            .map(|chunk| chunk.to_vec())
-            .collect()
-    }
-
-    /// Locate a title across page groups (static, for concurrent use).
-    ///
-    /// Searches groups sequentially (early return on first match),
-    /// but multiple title searches can run concurrently.
-    async fn locate_title_in_groups_static(
-        client: &LlmClient,
-        title: &str,
-        groups: &[Vec<PdfPage>],
-    ) -> Result<Option<usize>> {
-        let system = "You are a document analysis assistant. Find which page contains a specific section title.";
-
-        for group in groups {
-            let content = group
-                .iter()
-                .map(|p| {
-                    format!(
-                        "<page_{}>\n{}\n</page_{}>",
-                        p.number,
-                        &p.text[..p.text.len().min(300)],
-                        p.number
-                    )
-                })
-                .collect::<Vec<_>>()
-                .join("\n\n");
-
-            let user = format!(
-                r#"Find which page contains the section titled: "{}"
-
-Pages:
-{}
-
-Reply in JSON format:
-{{"found": true/false, "page": <page number>}}"#,
-                title, content
-            );
-
-            #[derive(serde::Deserialize)]
-            struct SearchResult {
-                found: bool,
-                page: Option<usize>,
-            }
-
-            let result: SearchResult = client.complete_json(system, &user).await?;
-
-            if result.found {
-                return Ok(result.page);
-            }
-        }
-
-        Ok(None)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_select_anchors() {
-        let assigner = PageAssigner::with_defaults();
-
-        let entries = vec![
-            TocEntry::new("Chapter 1", 1).with_toc_page(1),
-            TocEntry::new("Chapter 2", 1).with_toc_page(10),
-            TocEntry::new("Chapter 3", 1).with_toc_page(20),
-            TocEntry::new("Chapter 4", 1).with_toc_page(30),
-        ];
-
-        let anchors = assigner.select_anchors(&entries, 2);
-        assert_eq!(anchors.len(), 2);
-    }
-
-    #[test]
-    fn test_calculate_mode() {
-        let assigner = PageAssigner::with_defaults();
-
-        let values = vec![2, 2, 2, 3, 3, 4];
-        assert_eq!(assigner.calculate_mode(&values), 2);
-
-        let values = vec![1, 1, 2, 2, 2];
-        assert_eq!(assigner.calculate_mode(&values), 2);
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/parse/toc/detector.rs b/vectorless-core/vectorless/src/index/parse/toc/detector.rs
deleted file mode 100644
index 8484e101..00000000
--- a/vectorless-core/vectorless/src/index/parse/toc/detector.rs
+++ /dev/null
@@ -1,349 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! TOC (Table of Contents) detection.
- -use regex::Regex; -use tracing::debug; - -use crate::error::Result; -use crate::llm::config::LlmConfig; - -use super::types::TocDetection; -use crate::index::parse::pdf::PdfPage; -use crate::llm::LlmClient; - -/// TOC detector configuration. -#[derive(Debug, Clone)] -pub struct TocDetectorConfig { - /// Maximum pages to check for TOC. - pub max_check_pages: usize, - - /// Minimum confidence threshold for regex detection. - pub regex_confidence_threshold: f32, - - /// Use LLM for uncertain cases. - pub use_llm_fallback: bool, - - /// LLM configuration. - pub llm_config: LlmConfig, -} - -impl Default for TocDetectorConfig { - fn default() -> Self { - Self { - max_check_pages: 15, - regex_confidence_threshold: 0.7, - use_llm_fallback: true, - llm_config: LlmConfig::default(), - } - } -} - -/// TOC detector - finds table of contents in PDF documents. -pub struct TocDetector { - config: TocDetectorConfig, - llm_client: Option, - patterns: Vec, -} - -/// A TOC detection pattern. -#[allow(dead_code)] -struct TocPattern { - /// Pattern name for debugging. - name: &'static str, - /// Regex pattern to match. - regex: Regex, - /// Weight for scoring. - weight: f32, -} - -impl TocDetector { - /// Create a new TOC detector. - pub fn new(config: TocDetectorConfig) -> Self { - let llm_client = if config.use_llm_fallback { - Some(LlmClient::new(config.llm_config.clone().into())) - } else { - None - }; - - Self { - config, - llm_client, - patterns: Self::build_patterns(), - } - } - - /// Create a detector with an externally provided LLM client. - pub fn with_client(config: TocDetectorConfig, client: LlmClient) -> Self { - let use_llm = config.use_llm_fallback; - Self { - config, - llm_client: if use_llm { Some(client) } else { None }, - patterns: Self::build_patterns(), - } - } - - /// Create a detector with default configuration. - pub fn with_defaults() -> Self { - Self::new(TocDetectorConfig::default()) - } - - /// Build detection patterns. - fn build_patterns() -> Vec { - vec![ - // Chinese TOC patterns - TocPattern { - name: "chinese_toc_header", - regex: Regex::new(r"(?i)^[\s]*(目\s*录|内\s*容\s*摘\s*要)[\s]*$").unwrap(), - weight: 0.9, - }, - TocPattern { - name: "chinese_chapter_with_page", - regex: Regex::new(r"第[一二三四五六七八九十\d]+[章节部篇].*?[\.\s…·]{2,}\s*\d+") - .unwrap(), - weight: 0.85, - }, - TocPattern { - name: "chinese_section_dots", - regex: Regex::new(r"\d+[\.\d]+\s+.+?\s*[\.\s…·]{3,}\s*\d+").unwrap(), - weight: 0.8, - }, - // English TOC patterns - TocPattern { - name: "english_toc_header", - regex: Regex::new(r"(?i)^[\s]*(table\s+of\s+contents|contents|outline)[\s]*$") - .unwrap(), - weight: 0.9, - }, - TocPattern { - name: "english_chapter_with_page", - regex: Regex::new(r"(?i)^[\s]*(chapter|section|part)\s+\d+.*?\d+\s*$").unwrap(), - weight: 0.85, - }, - TocPattern { - name: "numbered_section_dots", - regex: Regex::new(r"^\d+\.\d+(\.\d+)?\s+.+?[\.\s…]{3,}\s*\d+\s*$").unwrap(), - weight: 0.75, - }, - // Generic patterns - TocPattern { - name: "dots_leader", - regex: Regex::new(r".+?[\.\s…·]{4,}\s*\d{1,4}\s*$").unwrap(), - weight: 0.7, - }, - TocPattern { - name: "title_with_page", - regex: Regex::new(r"^.{3,50}?\s{2,}\d{1,4}\s*$").unwrap(), - weight: 0.5, - }, - ] - } - - /// Detect TOC in PDF pages. 
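
The pattern weights above are easier to sanity-check against concrete lines. Two of the regexes from `build_patterns`, copied as-is and exercised on typical inputs (requires the `regex` crate):

```rust
use regex::Regex;

fn main() {
    // Generic "dots leader" pattern: title, a run of dots/spaces, page number.
    let dots_leader = Regex::new(r".+?[\.\s…·]{4,}\s*\d{1,4}\s*$").unwrap();
    assert!(dots_leader.is_match("Chapter 1. Introduction ........ 1"));
    assert!(!dots_leader.is_match("Plain body text with no leader"));

    // English TOC header pattern, case-insensitive, whole line.
    let english_header =
        Regex::new(r"(?i)^[\s]*(table\s+of\s+contents|contents|outline)[\s]*$").unwrap();
    assert!(english_header.is_match("  Table of Contents  "));
    assert!(!english_header.is_match("The contents of this box"));
}
```
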
-    pub async fn detect(&self, pages: &[PdfPage]) -> Result<TocDetection> {
-        let check_pages = pages
-            .iter()
-            .take(self.config.max_check_pages)
-            .collect::<Vec<_>>();
-
-        if check_pages.is_empty() {
-            return Ok(TocDetection::not_found());
-        }
-
-        // Step 1: Regex detection
-        let regex_result = self.detect_with_regex(&check_pages);
-        debug!(
-            "Regex detection result: found={}, confidence={}",
-            regex_result.found, regex_result.confidence
-        );
-
-        // Step 2: If confidence is high enough, return
-        if regex_result.confidence >= self.config.regex_confidence_threshold {
-            return Ok(regex_result);
-        }
-
-        // Step 3: Use LLM fallback if available and needed
-        if let Some(ref client) = self.llm_client {
-            if regex_result.confidence > 0.3 || regex_result.confidence == 0.0 {
-                debug!("Using LLM fallback for TOC detection");
-                return self.detect_with_llm(client, &check_pages).await;
-            }
-        }
-
-        Ok(regex_result)
-    }
-
-    /// Detect TOC using regex patterns.
-    fn detect_with_regex(&self, pages: &[&PdfPage]) -> TocDetection {
-        let mut toc_pages = Vec::new();
-        let mut has_page_numbers = false;
-        let mut total_score = 0.0;
-        let mut match_count = 0;
-
-        for page in pages {
-            let (score, has_numbers) = self.score_page_for_toc(page);
-
-            if score > 0.5 {
-                toc_pages.push(page.number);
-
-                if has_numbers {
-                    has_page_numbers = true;
-                }
-
-                total_score += score;
-                match_count += 1;
-            }
-        }
-
-        if toc_pages.is_empty() {
-            return TocDetection::not_found();
-        }
-
-        let confidence = if match_count > 0 {
-            total_score / match_count as f32
-        } else {
-            0.0
-        };
-
-        TocDetection::new(true)
-            .with_pages(toc_pages)
-            .with_page_numbers(has_page_numbers)
-            .with_confidence(confidence)
-    }
-
-    /// Score a page for TOC likelihood.
-    fn score_page_for_toc(&self, page: &PdfPage) -> (f32, bool) {
-        let lines: Vec<&str> = page.text.lines().collect();
-
-        if lines.len() < 2 {
-            return (0.0, false);
-        }
-
-        let mut max_score: f32 = 0.0;
-        let mut has_page_numbers = false;
-        let mut match_count = 0;
-
-        for line in &lines {
-            for pattern in &self.patterns {
-                if pattern.regex.is_match(line) {
-                    max_score = max_score.max(pattern.weight);
-                    match_count += 1;
-
-                    // Check if pattern includes page numbers
-                    if line.matches(char::is_numeric).count() > 0 {
-                        has_page_numbers = true;
-                    }
-                }
-            }
-        }
-
-        // Adjust score based on number of matches
-        let score = if match_count >= 3 {
-            max_score
-        } else if match_count >= 1 {
-            max_score * 0.7
-        } else {
-            0.0
-        };
-
-        (score, has_page_numbers)
-    }
-
-    /// Detect TOC using LLM.
-    async fn detect_with_llm(
-        &self,
-        client: &LlmClient,
-        pages: &[&PdfPage],
-    ) -> Result<TocDetection> {
-        // Combine first few pages for analysis
-        let content = pages
-            .iter()
-            .take(5)
-            .map(|p| {
-                format!(
-                    "<page_{}>\n{}\n</page_{}>",
-                    p.number,
-                    &p.text[..p.text.len().min(1000)],
-                    p.number
-                )
-            })
-            .collect::<Vec<_>>()
-            .join("\n\n");
-
-        let system = "You are a document analysis assistant. Your task is to detect if the given document contains a Table of Contents (TOC).";
-        let user = format!(
-            r#"Analyze this document and determine if it contains a Table of Contents.
- -Document content: -{} - -Reply in JSON format: -{{ - "has_toc": true/false, - "toc_pages": [list of page numbers where TOC appears], - "has_page_numbers": true/false (whether TOC entries include page numbers), - "confidence": 0.0-1.0 -}}"#, - content - ); - - #[derive(serde::Deserialize)] - struct DetectionResponse { - has_toc: bool, - toc_pages: Vec, - has_page_numbers: bool, - confidence: f32, - } - - let response: DetectionResponse = client.complete_json(system, &user).await?; - - Ok(TocDetection::new(response.has_toc) - .with_pages(response.toc_pages) - .with_page_numbers(response.has_page_numbers) - .with_confidence(response.confidence)) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - fn make_page(number: usize, text: &str) -> PdfPage { - PdfPage::new(number, text) - } - - #[test] - fn test_detect_chinese_toc() { - let detector = TocDetector::with_defaults(); - - let pages = vec![ - make_page(1, "前言"), - make_page(2, "目 录\n\n第一章 引言 ... 1\n第二章 方法 ... 5"), - ]; - - let rt = tokio::runtime::Runtime::new().unwrap(); - let result = rt.block_on(detector.detect(&pages)).unwrap(); - - assert!(result.found); - assert!(result.has_page_numbers); - } - - #[test] - fn test_detect_english_toc() { - let detector = TocDetector::with_defaults(); - - let pages = vec![ - make_page(1, "Abstract"), - make_page( - 2, - "Table of Contents\n\nChapter 1. Introduction 1\nChapter 2. Methods 5", - ), - ]; - - let rt = tokio::runtime::Runtime::new().unwrap(); - let result = rt.block_on(detector.detect(&pages)).unwrap(); - - assert!(result.found); - } -} diff --git a/vectorless-core/vectorless/src/index/parse/toc/mod.rs b/vectorless-core/vectorless/src/index/parse/toc/mod.rs deleted file mode 100644 index beac24d7..00000000 --- a/vectorless-core/vectorless/src/index/parse/toc/mod.rs +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Table of Contents (TOC) processing module. -//! -//! This module provides functionality to extract and verify document structure -//! from PDF Table of Contents: -//! -//! - **Detection** — Find TOC in document (regex + LLM fallback) -//! - **Parsing** — Convert TOC text to structured entries (LLM) -//! - **Assignment** — Map TOC pages to physical pages -//! - **Verification** — Sample verification of page assignments -//! - **Repair** — Fix incorrect assignments - -mod assigner; -mod detector; -mod parser; -mod processor; -mod repairer; -mod structure_extractor; -mod types; -mod verifier; - -// Re-export main types -pub use types::TocEntry; - -// Re-export components -pub use processor::TocProcessor; diff --git a/vectorless-core/vectorless/src/index/parse/toc/parser.rs b/vectorless-core/vectorless/src/index/parse/toc/parser.rs deleted file mode 100644 index df0f306d..00000000 --- a/vectorless-core/vectorless/src/index/parse/toc/parser.rs +++ /dev/null @@ -1,279 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! TOC parser - converts TOC text to structured entries. - -use tracing::debug; - -use crate::error::Result; -use crate::llm::config::LlmConfig; - -use super::types::TocEntry; -use crate::llm::LlmClient; - -/// TOC parser configuration. -#[derive(Debug, Clone)] -pub struct TocParserConfig { - /// LLM configuration. - pub llm_config: LlmConfig, - - /// Maximum retries for incomplete parsing. - pub max_retries: usize, - - /// Verify completeness after parsing. 
- pub verify_completeness: bool, -} - -impl Default for TocParserConfig { - fn default() -> Self { - Self { - llm_config: LlmConfig::default(), - max_retries: 3, - verify_completeness: true, - } - } -} - -/// TOC parser - converts raw TOC text to structured entries. -pub struct TocParser { - config: TocParserConfig, - client: LlmClient, -} - -impl TocParser { - /// Create a new TOC parser. - pub fn new(config: TocParserConfig) -> Self { - let client = LlmClient::new(config.llm_config.clone().into()); - Self { config, client } - } - - /// Create a parser with an externally provided LLM client. - pub fn with_client(client: LlmClient) -> Self { - Self { - config: TocParserConfig::default(), - client, - } - } - - /// Create a parser with default configuration. - pub fn with_defaults() -> Self { - Self::new(TocParserConfig::default()) - } - - /// Parse TOC text into structured entries. - pub async fn parse(&self, toc_text: &str) -> Result> { - if toc_text.trim().is_empty() { - return Ok(Vec::new()); - } - - // Step 1: Initial parse - let entries = self.parse_with_llm(toc_text).await?; - debug!("Initial parse: {} entries", entries.len()); - - if entries.is_empty() { - return Ok(entries); - } - - // Step 2: Verify completeness (if enabled) - if self.config.verify_completeness { - self.verify_and_complete(toc_text, entries).await - } else { - Ok(entries) - } - } - - /// Parse TOC text using LLM. - async fn parse_with_llm(&self, toc_text: &str) -> Result> { - let system = r#"You are a document structure extraction expert. -Your task is to parse a Table of Contents (TOC) into a structured format. - -Rules: -1. Extract all sections and subsections -2. Determine the hierarchy level (1 = top level, 2 = subsection, etc.) -3. Extract page numbers if present -4. Preserve original titles exactly (only fix spacing issues) -5. If the TOC seems incomplete, extract what you can see"#; - - let user = format!( - r#"Parse this Table of Contents: - -{} - -Return a JSON array: -[ - {{ - "title": "Section Title", - "level": 1, - "page": 10 - }}, - ... -] - -Notes: -- "level" should reflect the hierarchy (1, 2, 3...) -- "page" is optional if not present in TOC -- Only output the JSON array, no other text"#, - toc_text - ); - - #[derive(serde::Deserialize)] - struct ParsedEntry { - title: String, - level: usize, - #[serde(default)] - page: Option, - } - - let entries: Vec = self.client.complete_json(system, &user).await?; - - Ok(entries - .into_iter() - .map(|e| { - let mut entry = TocEntry::new(e.title, e.level); - if let Some(page) = e.page { - entry = entry.with_toc_page(page); - } - entry - }) - .collect()) - } - - /// Verify completeness and continue if needed. - async fn verify_and_complete( - &self, - toc_text: &str, - mut entries: Vec, - ) -> Result> { - let mut attempts = 0; - - while attempts < self.config.max_retries { - // Check if parsing is complete - let is_complete = self.check_completeness(toc_text, &entries).await?; - - if is_complete { - debug!("TOC parsing complete after {} attempts", attempts + 1); - return Ok(entries); - } - - debug!( - "TOC incomplete, attempting continuation (attempt {})", - attempts + 1 - ); - - // Continue parsing - let additional = self.continue_parsing(toc_text, &entries).await?; - if additional.is_empty() { - // No more entries found, stop - break; - } - - entries.extend(additional); - attempts += 1; - } - - Ok(entries) - } - - /// Check if parsing is complete. 
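
Every LLM call in this pipeline follows one convention: a `serde` target type plus `complete_json(system, user)`, with the prompt pinning the exact JSON shape. A sketch of that convention in isolation; the `vectorless::llm` path mirrors the internal `crate::llm` imports, and `complete_json` being generic over its deserialization target is inferred from the call sites in this patch.

```rust
use serde::Deserialize;
use vectorless::llm::LlmClient; // mirrors the internal `crate::llm::LlmClient` import

#[derive(Deserialize)]
struct ParsedEntry {
    title: String,
    level: usize,
    #[serde(default)]
    page: Option<usize>, // absent in the JSON -> None, per #[serde(default)]
}

async fn parse_toc(client: &LlmClient, toc_text: &str) -> vectorless::Result<Vec<ParsedEntry>> {
    let system = "You are a document structure extraction expert.";
    // The prompt must spell out the array shape the struct above expects.
    let user = format!(
        "Parse this Table of Contents into a JSON array of \
         {{\"title\", \"level\", \"page\"}} objects:\n\n{toc_text}"
    );
    client.complete_json(system, &user).await
}
```
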
- async fn check_completeness(&self, toc_text: &str, entries: &[TocEntry]) -> Result { - let system = "You are a document analysis assistant. Determine if the parsed entries completely represent the original TOC."; - - let entries_json = - serde_json::to_string_pretty(&entries.iter().map(|e| &e.title).collect::>()) - .unwrap_or_default(); - - let user = format!( - r#"Original TOC: -{} - -Parsed entries: -{} - -Is the parsing complete? Reply with JSON: -{{"complete": true/false}}"#, - toc_text, entries_json - ); - - #[derive(serde::Deserialize)] - struct CompletenessCheck { - complete: bool, - } - - let result: CompletenessCheck = self.client.complete_json(system, &user).await?; - Ok(result.complete) - } - - /// Continue parsing from where we left off. - async fn continue_parsing( - &self, - toc_text: &str, - existing: &[TocEntry], - ) -> Result> { - let system = "You are a document structure extraction expert. Continue parsing the TOC from where it was left off."; - - let last_titles: Vec<_> = existing.iter().rev().take(5).map(|e| &e.title).collect(); - - let user = format!( - r#"Original TOC: -{} - -Already parsed (last 5): -{:?} - -Extract the REMAINING entries that were missed. Return a JSON array: -[ - {{"title": "...", "level": N, "page": M}}, - ... -] - -If nothing was missed, return an empty array: []"#, - toc_text, last_titles - ); - - #[derive(serde::Deserialize)] - struct ParsedEntry { - title: String, - level: usize, - #[serde(default)] - page: Option, - } - - let entries: Vec = self.client.complete_json(system, &user).await?; - - Ok(entries - .into_iter() - .map(|e| { - let mut entry = TocEntry::new(e.title, e.level); - if let Some(page) = e.page { - entry = entry.with_toc_page(page); - } - entry - }) - .collect()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[tokio::test] - async fn test_parse_simple_toc() { - let parser = TocParser::with_defaults(); - - // This test requires an API key - if std::env::var("OPENAI_API_KEY").is_err() { - return; - } - - let toc_text = r#" -Chapter 1. Introduction 1 - 1.1 Background 2 - 1.2 Objectives 5 -Chapter 2. Methods 10 -"#; - - let entries = parser.parse(toc_text).await.unwrap(); - assert!(!entries.is_empty()); - } -} diff --git a/vectorless-core/vectorless/src/index/parse/toc/processor.rs b/vectorless-core/vectorless/src/index/parse/toc/processor.rs deleted file mode 100644 index e53b6346..00000000 --- a/vectorless-core/vectorless/src/index/parse/toc/processor.rs +++ /dev/null @@ -1,573 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! TOC processor - integrates all TOC processing components. -//! -//! The processor orchestrates a multi-mode extraction pipeline with automatic -//! degradation: if one mode fails verification, it falls back to a lower-quality -//! but more reliable mode. - -use futures::stream::{self, StreamExt}; -use tracing::{debug, info, warn}; - -use crate::error::Result; -use crate::index::parse::pdf::PdfPage; -use crate::llm::LlmClient; - -use super::assigner::{PageAssigner, PageAssignerConfig}; -use super::detector::{TocDetector, TocDetectorConfig}; -use super::parser::{TocParser, TocParserConfig}; -use super::repairer::{IndexRepairer, RepairerConfig}; -use super::structure_extractor::{StructureExtractor, StructureExtractorConfig}; -use super::types::{ProcessingMode, TocEntry, VerificationReport}; -use super::verifier::{IndexVerifier, VerifierConfig}; - -/// TOC processor configuration. 
-#[derive(Debug, Clone)] -pub struct TocProcessorConfig { - /// TOC detector configuration. - pub detector: TocDetectorConfig, - - /// TOC parser configuration. - pub parser: TocParserConfig, - - /// Page assigner configuration. - pub assigner: PageAssignerConfig, - - /// Verifier configuration. - pub verifier: VerifierConfig, - - /// Repairer configuration. - pub repairer: RepairerConfig, - - /// Accuracy threshold for acceptance (0.0 - 1.0). - pub accuracy_threshold: f32, - - /// Maximum repair attempts per verification cycle. - pub max_repair_attempts: usize, - - /// Maximum page span for a single entry before recursive refinement. - pub max_pages_per_entry: usize, - - /// Maximum estimated tokens for a single entry before recursive refinement. - pub max_tokens_per_entry: usize, -} - -impl Default for TocProcessorConfig { - fn default() -> Self { - Self { - detector: TocDetectorConfig::default(), - parser: TocParserConfig::default(), - assigner: PageAssignerConfig::default(), - verifier: VerifierConfig::default(), - repairer: RepairerConfig::default(), - accuracy_threshold: 0.6, - max_repair_attempts: 3, - max_pages_per_entry: 30, - max_tokens_per_entry: 20000, - } - } -} - -/// TOC processor - orchestrates the complete TOC extraction pipeline. -/// -/// # Processing Pipeline -/// -/// 1. **Detect** - Find TOC in document (regex + LLM fallback) -/// 2. **Extract** - Get TOC text from detected pages -/// 3. **Parse** - Convert TOC text to structured entries (LLM) -/// 4. **Assign** - Map TOC pages to physical pages -/// 5. **Verify** - Sample verification of page assignments -/// 6. **Repair** - Fix incorrect assignments (if needed) -/// 7. **Refine** - Sub-divide oversized entries (if needed) -/// -/// # Degradation Strategy -/// -/// The pipeline tries three modes in order of quality: -/// -/// 1. `TocWithPageNumbers` - TOC found with page numbers (offset calculation) -/// 2. `TocWithoutPageNumbers` - TOC found without page numbers (LLM positioning) -/// 3. `NoToc` - No TOC available (LLM structure extraction from content) -/// -/// If a mode fails verification (accuracy < threshold), it automatically -/// degrades to the next mode. -/// -/// # Example -/// -/// ```rust,no_run -/// use vectorless::parser::toc::TocProcessor; -/// use vectorless::parser::pdf::PdfParser; -/// -/// # #[tokio::main] -/// # async fn main() -> vectorless::Result<()> { -/// let pdf_parser = PdfParser::new(); -/// let result = pdf_parser.parse_file("document.pdf".as_ref()).await?; -/// -/// let processor = TocProcessor::new(); -/// let entries = processor.process(&result.pages).await?; -/// -/// for entry in &entries { -/// println!("{} - Page {:?}", entry.title, entry.physical_page); -/// } -/// # Ok(()) -/// # } -/// ``` -pub struct TocProcessor { - config: TocProcessorConfig, - detector: TocDetector, - parser: TocParser, - assigner: PageAssigner, - verifier: IndexVerifier, - repairer: IndexRepairer, - /// Optional LLM client for StructureExtractor (no-TOC mode and refinement). - llm_client: Option, -} - -impl TocProcessor { - /// Create a new TOC processor with default configuration. - pub fn new() -> Self { - Self::with_config(TocProcessorConfig::default()) - } - - /// Create a TOC processor with an externally provided LLM client. - /// - /// All sub-components (detector, parser, assigner, verifier, repairer) - /// will use this client instead of creating their own from default config. 
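
A usage sketch for the constructor below; the `TocProcessor` and `PdfParser` paths come from the deleted doc examples, `LlmConfig::default().into()` mirrors how the sub-components build clients internally, and `document.pdf` is illustrative.

```rust
use vectorless::llm::config::LlmConfig;
use vectorless::llm::LlmClient;
use vectorless::parser::pdf::PdfParser;
use vectorless::parser::toc::TocProcessor;

#[tokio::main]
async fn main() -> vectorless::Result<()> {
    // One client shared by detector, parser, assigner, verifier, and
    // repairer, instead of five clients built from default config.
    let client = LlmClient::new(LlmConfig::default().into());
    let processor = TocProcessor::with_llm_client(client);

    // parse_bytes_raw yields the raw pages the processor consumes.
    let bytes = tokio::fs::read("document.pdf")
        .await
        .map_err(|e| vectorless::Error::Parse(format!("read failed: {e}")))?;
    let raw = PdfParser::new().parse_bytes_raw(&bytes, Some("document")).await?;

    let entries = processor.process(&raw.pages).await?;
    for entry in &entries {
        println!("{} - page {:?}", entry.title, entry.physical_page);
    }
    Ok(())
}
```
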
- pub fn with_llm_client(client: LlmClient) -> Self { - info!("TocProcessor: created with external LLM client"); - let config = TocProcessorConfig::default(); - Self { - detector: TocDetector::with_client(config.detector.clone(), client.clone()), - parser: TocParser::with_client(client.clone()), - assigner: PageAssigner::with_client(client.clone()), - verifier: IndexVerifier::with_client(client.clone()), - repairer: IndexRepairer::with_client(client.clone()), - llm_client: Some(client), - config, - } - } - - /// Create a TOC processor with custom configuration. - pub fn with_config(config: TocProcessorConfig) -> Self { - info!("TocProcessor: created with config (no external LLM client)"); - Self { - detector: TocDetector::new(config.detector.clone()), - parser: TocParser::new(config.parser.clone()), - assigner: PageAssigner::new(config.assigner.clone()), - verifier: IndexVerifier::new(config.verifier.clone()), - repairer: IndexRepairer::new(config.repairer.clone()), - llm_client: None, - config, - } - } - - /// Process PDF pages and extract hierarchical structure. - /// - /// This is the main entry point. It detects TOC, selects the best - /// processing mode, and automatically degrades if needed. - pub async fn process(&self, pages: &[PdfPage]) -> Result> { - if pages.is_empty() { - return Ok(Vec::new()); - } - - info!("Processing {} pages for TOC extraction", pages.len()); - - // Step 1: Detect TOC - let detection = self.detector.detect(pages).await?; - - // Step 2: Determine initial mode based on detection result - let initial_mode = if !detection.found { - info!("No TOC found in document"); - ProcessingMode::NoToc - } else if detection.has_page_numbers { - info!("TOC found on pages {:?}, has page numbers", detection.pages); - ProcessingMode::TocWithPageNumbers - } else { - info!("TOC found on pages {:?}, no page numbers", detection.pages); - ProcessingMode::TocWithoutPageNumbers - }; - - // Step 3: Process with degradation - let entries = self - .process_with_degradation(initial_mode, &detection, pages) - .await?; - - // Step 4: Refine oversized entries - self.refine_large_entries(entries, pages).await - } - - /// Process with automatic mode degradation. - /// - /// Tries the given mode, verifies the result, and degrades to a - /// lower-quality mode if accuracy is below threshold. 
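
The degradation loop is easier to follow once the ladder itself is spelled out. `ProcessingMode::degrade` is not shown in this patch, so its step order is inferred from the mode docs above; a local stand-in:

```rust
// Local stand-in for ProcessingMode; the assumed step order matches the
// quality ranking documented on TocProcessor.
#[derive(Debug, Clone, Copy, PartialEq)]
enum Mode {
    TocWithPageNumbers,
    TocWithoutPageNumbers,
    NoToc,
}

impl Mode {
    fn degrade(self) -> Option<Mode> {
        match self {
            Mode::TocWithPageNumbers => Some(Mode::TocWithoutPageNumbers),
            Mode::TocWithoutPageNumbers => Some(Mode::NoToc),
            Mode::NoToc => None, // nothing below NoToc: return best effort
        }
    }
}

fn main() {
    let mut mode = Mode::TocWithPageNumbers;
    // Pretend every mode fails verification: the ladder bottoms out at NoToc.
    while let Some(next) = mode.degrade() {
        println!("{mode:?} failed, degrading to {next:?}");
        mode = next;
    }
    assert_eq!(mode, Mode::NoToc);
}
```
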
- async fn process_with_degradation( - &self, - initial_mode: ProcessingMode, - detection: &super::types::TocDetection, - pages: &[PdfPage], - ) -> Result> { - let mut mode = initial_mode; - - loop { - info!("Attempting extraction with mode {:?}", mode); - - let result = match mode { - ProcessingMode::TocWithPageNumbers => { - self.process_toc_with_page_numbers(detection, pages).await - } - ProcessingMode::TocWithoutPageNumbers => { - self.process_toc_without_page_numbers(detection, pages) - .await - } - ProcessingMode::NoToc => { - // NoToc always succeeds (produces some structure) - return self.process_without_toc(pages).await; - } - }; - - match result { - Ok(entries) if !entries.is_empty() => { - // Verify the entries - let mut mutable_entries = entries; - let report = self.verify_and_repair(&mut mutable_entries, pages).await?; - - if report.accuracy >= self.config.accuracy_threshold { - info!( - "Mode {:?} succeeded: {} entries, accuracy {:.1}%", - mode, - mutable_entries.len(), - report.accuracy * 100.0 - ); - return Ok(mutable_entries); - } - - // Accuracy too low, try degrading - warn!( - "Mode {:?} accuracy {:.1}% below threshold {:.1}%", - mode, - report.accuracy * 100.0, - self.config.accuracy_threshold * 100.0 - ); - - match mode.degrade() { - Some(next) => { - info!("Degrading from {:?} to {:?}", mode, next); - mode = next; - // Continue loop with degraded mode - } - None => { - warn!("No further degradation possible, returning best effort"); - return Ok(mutable_entries); - } - } - } - Ok(_) => { - // Empty entries, degrade - warn!("Mode {:?} produced no entries", mode); - match mode.degrade() { - Some(next) => { - mode = next; - } - None => return Ok(Vec::new()), - } - } - Err(e) => { - warn!("Mode {:?} failed: {}", mode, e); - match mode.degrade() { - Some(next) => { - mode = next; - } - None => return Err(e), - } - } - } - } - } - - /// Mode 1: TOC with page numbers. - /// - /// Parse the TOC, calculate physical-page offset from anchor entries, - /// and apply the offset to all entries. - async fn process_toc_with_page_numbers( - &self, - detection: &super::types::TocDetection, - pages: &[PdfPage], - ) -> Result> { - let toc_text = self.extract_toc_text(pages, &detection.pages); - if toc_text.trim().is_empty() { - return Ok(Vec::new()); - } - - let mut entries = self.parser.parse(&toc_text).await?; - if entries.is_empty() { - return Ok(Vec::new()); - } - - // Assign physical pages using offset calculation - self.assigner.assign(&mut entries, pages).await?; - - Ok(entries) - } - - /// Mode 2: TOC without page numbers. - /// - /// Parse the TOC, then use LLM to locate each entry in the document. - async fn process_toc_without_page_numbers( - &self, - detection: &super::types::TocDetection, - pages: &[PdfPage], - ) -> Result> { - let toc_text = self.extract_toc_text(pages, &detection.pages); - if toc_text.trim().is_empty() { - return Ok(Vec::new()); - } - - let mut entries = self.parser.parse(&toc_text).await?; - if entries.is_empty() { - return Ok(Vec::new()); - } - - // Clear any TOC page numbers (they're unreliable in this mode) - for entry in &mut entries { - entry.toc_page = None; - } - - // Assign physical pages using LLM positioning - self.assigner.assign(&mut entries, pages).await?; - - Ok(entries) - } - - /// Mode 3: No TOC available. - /// - /// Extract document structure directly from page content using LLM. 
-    async fn process_without_toc(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
-        info!("Extracting structure from page content (no TOC available)");
-
-        let extractor = match &self.llm_client {
-            Some(client) => {
-                StructureExtractor::with_client(StructureExtractorConfig::default(), client.clone())
-            }
-            None => StructureExtractor::new(StructureExtractorConfig::default()),
-        };
-        extractor.extract(pages).await
-    }
-
-    /// Extract TOC text from pages.
-    fn extract_toc_text(&self, pages: &[PdfPage], toc_pages: &[usize]) -> String {
-        toc_pages
-            .iter()
-            .filter_map(|&page_num| pages.get(page_num - 1))
-            .map(|page| page.text.as_str())
-            .collect::<Vec<_>>()
-            .join("\n\n")
-    }
-
-    /// Verify entries and repair if needed.
-    async fn verify_and_repair(
-        &self,
-        entries: &mut [TocEntry],
-        pages: &[PdfPage],
-    ) -> Result<VerificationReport> {
-        let mut attempts = 0;
-
-        while attempts < self.config.max_repair_attempts {
-            let report = self.verifier.verify(entries, pages).await?;
-
-            if report.accuracy >= self.config.accuracy_threshold {
-                debug!(
-                    "Verification passed: accuracy {:.1}%",
-                    report.accuracy * 100.0
-                );
-                return Ok(report);
-            }
-
-            if report.errors.is_empty() {
-                return Ok(report);
-            }
-
-            let repaired = self.repairer.repair(entries, &report.errors, pages).await?;
-
-            if repaired == 0 {
-                debug!("No repairs possible");
-                return Ok(report);
-            }
-
-            attempts += 1;
-            debug!("Repair attempt {} complete", attempts);
-        }
-
-        self.verifier.verify(entries, pages).await
-    }
-
-    /// Refine oversized entries by extracting sub-structure.
-    ///
-    /// Entries that span too many pages or tokens are broken down using
-    /// the same structure extraction approach used for no-TOC documents.
-    async fn refine_large_entries(
-        &self,
-        entries: Vec<TocEntry>,
-        pages: &[PdfPage],
-    ) -> Result<Vec<TocEntry>> {
-        if entries.is_empty() {
-            return Ok(entries);
-        }
-
-        let page_count = pages.len();
-
-        // Pre-compute each entry's successor page number
-        let next_pages: Vec<Option<usize>> = entries
-            .iter()
-            .enumerate()
-            .map(|(i, _)| entries.get(i + 1).and_then(|e| e.physical_page))
-            .collect();
-
-        // Identify oversized entries and launch extractions concurrently
-        let llm_client = self.llm_client.clone();
-        let oversized_futures: Vec<_> = entries
-            .iter()
-            .enumerate()
-            .filter(|(i, entry)| {
-                let span = entry_page_span(entry, next_pages[*i], page_count);
-                let tokens = entry_token_count(entry, pages);
-                span > self.config.max_pages_per_entry && tokens > self.config.max_tokens_per_entry
-            })
-            .map(|(i, entry)| {
-                let start = entry.physical_page.unwrap_or(1);
-                let end = next_pages[i].unwrap_or(page_count);
-                let sub_pages: Vec<PdfPage> = pages
-                    .iter()
-                    .filter(|p| p.number >= start && p.number <= end)
-                    .cloned()
-                    .collect();
-
-                let entry_title = entry.title.clone();
-                let entry_level = entry.level;
-                let llm_client = llm_client.clone();
-
-                async move {
-                    if sub_pages.is_empty() {
-                        return (i, Vec::new());
-                    }
-                    debug!(
-                        "Refining oversized entry '{}' (pages {}-{})",
-                        entry_title, start, end
-                    );
-                    let extractor = match &llm_client {
-                        Some(client) => StructureExtractor::with_client(
-                            StructureExtractorConfig::default(),
-                            client.clone(),
-                        ),
-                        None => StructureExtractor::new(StructureExtractorConfig::default()),
-                    };
-                    match extractor.extract(&sub_pages).await {
-                        Ok(sub_entries) => {
-                            let skip = if sub_entries
-                                .first()
-                                .map(|e| e.title.trim() == entry_title.trim())
-                                .unwrap_or(false)
-                            {
-                                1
-                            } else {
-                                0
-                            };
-
-                            let refined: Vec<TocEntry> = sub_entries[skip..]
-                                .iter()
-                                .map(|sub| {
-                                    TocEntry::new(&sub.title, sub.level + entry_level)
-                                        .with_physical_page(sub.physical_page.unwrap_or(start))
-                                        .with_confidence(sub.confidence * 0.9)
-                                })
-                                .collect();
-
-                            info!(
-                                "Refined '{}' into {} sub-entries",
-                                entry_title,
-                                refined.len()
-                            );
-                            (i, refined)
-                        }
-                        Err(e) => {
-                            warn!("Sub-extraction failed for '{}': {}", entry_title, e);
-                            (i, Vec::new())
-                        }
-                    }
-                }
-            })
-            .collect();
-
-        let extraction_results: Vec<_> = stream::iter(oversized_futures)
-            .buffer_unordered(3)
-            .collect()
-            .await;
-
-        // Build a lookup from index → refined sub-entries
-        let mut refined_map = std::collections::HashMap::new();
-        for (idx, sub_entries) in extraction_results {
-            if !sub_entries.is_empty() {
-                refined_map.insert(idx, sub_entries);
-            }
-        }
-
-        // Assemble final output
-        let mut result = Vec::with_capacity(entries.len() * 2);
-        for (i, entry) in entries.into_iter().enumerate() {
-            if let Some(sub_entries) = refined_map.remove(&i) {
-                result.extend(sub_entries);
-            } else {
-                result.push(entry);
-            }
-        }
-
-        Ok(result)
-    }
-}
-
-impl Default for TocProcessor {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-/// Calculate how many pages an entry spans.
-///
-/// From its physical_page to the next entry's physical_page (or document end).
-fn entry_page_span(
-    entry: &TocEntry,
-    next_physical_page: Option<usize>,
-    total_pages: usize,
-) -> usize {
-    let start = entry.physical_page.unwrap_or(1);
-    let end = next_physical_page.unwrap_or(total_pages);
-    end.saturating_sub(start)
-}
-
-/// Estimate total tokens for the content covered by an entry.
-fn entry_token_count(entry: &TocEntry, pages: &[PdfPage]) -> usize {
-    let start = entry.physical_page.unwrap_or(1);
-    pages
-        .iter()
-        .filter(|p| p.number >= start)
-        .take(30) // cap at max_pages_per_entry default
-        .map(|p| p.token_count)
-        .sum()
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_processor_creation() {
-        let processor = TocProcessor::new();
-        assert_eq!(processor.config.accuracy_threshold, 0.6);
-    }
-
-    #[tokio::test]
-    async fn test_empty_pages() {
-        let processor = TocProcessor::new();
-        let entries = processor.process(&[]).await.unwrap();
-        assert!(entries.is_empty());
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/parse/toc/repairer.rs b/vectorless-core/vectorless/src/index/parse/toc/repairer.rs
deleted file mode 100644
index 61ba414e..00000000
--- a/vectorless-core/vectorless/src/index/parse/toc/repairer.rs
+++ /dev/null
@@ -1,247 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Index repairer - fixes incorrect TOC entry page assignments.
-
-use futures::stream::{self, StreamExt};
-use tracing::{debug, info};
-
-use crate::error::Result;
-use crate::index::parse::pdf::PdfPage;
-use crate::llm::config::LlmConfig;
-
-use super::types::{TocEntry, VerificationError, VerificationReport};
-use super::verifier::IndexVerifier;
-use crate::llm::LlmClient;
-
-/// Repairer configuration.
-#[derive(Debug, Clone)]
-pub struct RepairerConfig {
-    /// Maximum repair attempts.
-    pub max_attempts: usize,
-
-    /// LLM configuration.
-    pub llm_config: LlmConfig,
-
-    /// Page search range around expected page.
-    pub search_range: usize,
-}
-
-impl Default for RepairerConfig {
-    fn default() -> Self {
-        Self {
-            max_attempts: 3,
-            llm_config: LlmConfig::default(),
-            search_range: 5,
-        }
-    }
-}
-
-/// Index repairer - fixes incorrect page assignments.
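-///
-/// Minimal usage sketch (editor's illustration, assuming a configured LLM
-/// endpoint; not part of the original file):
-///
-/// ```ignore
-/// let repairer = IndexRepairer::with_defaults();
-/// let fixed = repairer.repair(&mut entries, &report.errors, &pages).await?;
-/// debug!("repaired {} entries", fixed);
-/// ```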
-pub struct IndexRepairer {
-    config: RepairerConfig,
-    client: LlmClient,
-}
-
-impl IndexRepairer {
-    /// Create a new repairer.
-    pub fn new(config: RepairerConfig) -> Self {
-        let client = LlmClient::new(config.llm_config.clone().into());
-        Self { config, client }
-    }
-
-    /// Create a repairer with an externally provided LLM client.
-    pub fn with_client(client: LlmClient) -> Self {
-        Self {
-            config: RepairerConfig::default(),
-            client,
-        }
-    }
-
-    /// Create a repairer with default configuration.
-    pub fn with_defaults() -> Self {
-        Self::new(RepairerConfig::default())
-    }
-
-    /// Repair incorrect entries with bounded concurrency.
-    pub async fn repair(
-        &self,
-        entries: &mut [TocEntry],
-        errors: &[VerificationError],
-        pages: &[PdfPage],
-    ) -> Result<usize> {
-        if errors.is_empty() {
-            return Ok(0);
-        }
-
-        info!("Repairing {} incorrect entries", errors.len());
-
-        // Collect repair tasks (don't borrow entries mutably yet)
-        let client = self.client.clone();
-        let pages_owned = pages.to_vec();
-        let search_range = self.config.search_range;
-
-        let tasks: Vec<_> = errors
-            .iter()
-            .filter(|error| error.index < entries.len())
-            .map(|error| {
-                let title = entries[error.index].title.clone();
-                let expected_page = error.expected_page;
-                let client = client.clone();
-                let pages = pages_owned.clone();
-
-                async move {
-                    let start = expected_page.saturating_sub(search_range).max(1);
-                    let end = (expected_page + search_range).min(pages.len());
-
-                    let result =
-                        Self::find_correct_page_static(&client, &title, &pages, start..=end).await;
-
-                    (title, expected_page, result)
-                }
-            })
-            .collect();
-
-        let results: Vec<_> = stream::iter(tasks).buffer_unordered(5).collect().await;
-
-        // Apply repairs
-        let mut repaired_count = 0;
-        for (title, expected_page, result) in results {
-            match result {
-                Ok(Some(correct_page)) => {
-                    // Find the corresponding error entry and fix it
-                    if let Some(error) = errors.iter().find(|e| e.title == title) {
-                        if error.index < entries.len() {
-                            debug!(
-                                "Repaired '{}': page {} → {}",
-                                title, expected_page, correct_page
-                            );
-                            entries[error.index].physical_page = Some(correct_page);
-                            entries[error.index].confidence = 0.9;
-                            repaired_count += 1;
-                        }
-                    }
-                }
-                Ok(None) => {
-                    debug!(
-                        "Could not repair '{}' (searched around page {})",
-                        title, expected_page
-                    );
-                }
-                Err(e) => {
-                    debug!("Repair failed for '{}': {}", title, e);
-                }
-            }
-        }
-
-        info!("Repaired {}/{} entries", repaired_count, errors.len());
-        Ok(repaired_count)
-    }
-
-    /// Find the correct page for a title within a range (static, for concurrent use).
-    async fn find_correct_page_static(
-        client: &LlmClient,
-        title: &str,
-        pages: &[PdfPage],
-        range: std::ops::RangeInclusive<usize>,
-    ) -> Result<Option<usize>> {
-        let system = "You are a document analysis assistant. 
-Find which page contains a specific section title.";
-
-        // Build content for pages in range
-        let mut content_parts = Vec::new();
-        for page_num in range {
-            if let Some(page) = pages.get(page_num - 1) {
-                let text = if page.text.len() > 500 {
-                    &page.text[..500]
-                } else {
-                    &page.text
-                };
-                content_parts.push(format!(
-                    "<page_{}>\n{}\n</page_{}>",
-                    page_num, text, page_num
-                ));
-            }
-        }
-
-        if content_parts.is_empty() {
-            return Ok(None);
-        }
-
-        let content = content_parts.join("\n\n");
-        let user = format!(
-            r#"Find which page contains the section titled: "{}"
-
-Pages:
-{}
-
-Reply in JSON format:
-{{"found": true/false, "page": <page_number>}}"#,
-            title, content
-        );
-
-        #[derive(serde::Deserialize)]
-        struct FindResult {
-            found: bool,
-            page: Option<usize>,
-        }
-
-        let result: FindResult = client.complete_json(system, &user).await?;
-
-        if result.found {
-            Ok(result.page)
-        } else {
-            Ok(None)
-        }
-    }
-
-    /// Repair with verification loop.
-    pub async fn repair_with_verification(
-        &self,
-        entries: &mut [TocEntry],
-        pages: &[PdfPage],
-        verifier: &IndexVerifier,
-    ) -> Result<VerificationReport> {
-        let mut attempts = 0;
-        let threshold = 0.6; // Hardcoded for now, should be from verifier config
-
-        while attempts < self.config.max_attempts {
-            // Verify current state
-            let report = verifier.verify(entries, pages).await?;
-
-            if report.accuracy >= threshold {
-                info!("Repair complete: accuracy {:.1}%", report.accuracy * 100.0);
-                return Ok(report);
-            }
-
-            if report.errors.is_empty() {
-                return Ok(report);
-            }
-
-            // Repair errors
-            let repaired = self.repair(entries, &report.errors, pages).await?;
-
-            if repaired == 0 {
-                // No repairs made, stop trying
-                debug!("No repairs possible, stopping");
-                return Ok(report);
-            }
-
-            attempts += 1;
-            info!("Repair attempt {} complete, re-verifying", attempts);
-        }
-
-        // Final verification
-        verifier.verify(entries, pages).await
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_repairer_creation() {
-        let repairer = IndexRepairer::with_defaults();
-        assert_eq!(repairer.config.max_attempts, 3);
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/parse/toc/structure_extractor.rs b/vectorless-core/vectorless/src/index/parse/toc/structure_extractor.rs
deleted file mode 100644
index 63ce9d7e..00000000
--- a/vectorless-core/vectorless/src/index/parse/toc/structure_extractor.rs
+++ /dev/null
@@ -1,481 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Structure extraction from documents without a Table of Contents.
-//!
-//! When a PDF has no TOC (or all TOC-based extraction modes failed), this
-//! module uses LLM to analyse page content and extract the document's
-//! hierarchical structure directly.
-
-use futures::stream::{self, StreamExt};
-use tracing::{debug, info, warn};
-
-use crate::error::Result;
-use crate::index::parse::pdf::PdfPage;
-use crate::llm::config::LlmConfig;
-
-use super::types::TocEntry;
-use crate::llm::LlmClient;
-
-/// Configuration for structure extraction.
-#[derive(Debug, Clone)]
-pub struct StructureExtractorConfig {
-    /// Maximum estimated tokens per page group sent to LLM.
-    pub max_tokens_per_group: usize,
-
-    /// Number of overlap pages between consecutive groups.
-    pub overlap_pages: usize,
-
-    /// LLM configuration.
-    pub llm_config: LlmConfig,
-}
-
-impl Default for StructureExtractorConfig {
-    fn default() -> Self {
-        Self {
-            max_tokens_per_group: 20_000,
-            overlap_pages: 1,
-            llm_config: LlmConfig::default(),
-        }
-    }
-}
-
-/// A group of consecutive pages with their combined text.
-#[derive(Clone)]
-struct PageGroup {
-    /// Combined text with page markers: `<page_N>\n...\n</page_N>`.
-    text: String,
-    /// Start page number (1-based).
-    start_page: usize,
-    /// End page number (1-based, inclusive).
-    end_page: usize,
-}
-
-/// Extracts document structure from page content using LLM.
-///
-/// Used when a document has no Table of Contents, or when TOC-based extraction
-/// failed. Pages are grouped by token count: the first group generates an
-/// initial structure, and the remaining groups extend it concurrently.
-pub struct StructureExtractor {
-    config: StructureExtractorConfig,
-    client: LlmClient,
-}
-
-impl StructureExtractor {
-    /// Create a new structure extractor.
-    pub fn new(config: StructureExtractorConfig) -> Self {
-        let client = LlmClient::new(config.llm_config.clone().into());
-        Self { config, client }
-    }
-
-    /// Create a structure extractor with an externally provided LLM client.
-    pub fn with_client(config: StructureExtractorConfig, client: LlmClient) -> Self {
-        Self { config, client }
-    }
-
-    /// Create an extractor with default configuration.
-    pub fn with_defaults() -> Self {
-        Self::new(StructureExtractorConfig::default())
-    }
-
-    /// Extract hierarchical structure from all pages.
-    ///
-    /// The first page group is processed alone (initial structure), then all
-    /// remaining groups are processed in parallel, each using the initial
-    /// entries as context. Results are merged and deduplicated.
-    pub async fn extract(&self, pages: &[PdfPage]) -> Result<Vec<TocEntry>> {
-        if pages.is_empty() {
-            return Ok(Vec::new());
-        }
-
-        let groups = self.group_pages(pages);
-        let page_count = pages.len();
-        info!(
-            "Extracting structure from {} pages in {} groups",
-            page_count,
-            groups.len()
-        );
-
-        // Phase 1: Generate initial structure from first group
-        let initial_entries = self.generate_initial(&groups[0]).await?;
-        debug!(
-            "Initial group (pages {}-{}): extracted {} entries",
-            groups[0].start_page,
-            groups[0].end_page,
-            initial_entries.len()
-        );
-
-        if groups.len() == 1 {
-            return Ok(Self::finalize_entries(initial_entries, page_count));
-        }
-
-        // Phase 2: Process remaining groups in parallel (bounded concurrency)
-        // Each continuation group uses the initial entries as shared context.
-        let client = self.client.clone();
-        let initial_entries_ref = &initial_entries;
-
-        let continuation_futures: Vec<_> = groups[1..]
-            .iter()
-            .map(|group| {
-                let group = group.clone();
-                let client = client.clone();
-                let initial = initial_entries_ref.to_vec();
-
-                async move {
-                    let result =
-                        Self::generate_continuation_with_client(&client, &group, &initial).await;
-                    (group.start_page, group.end_page, result)
-                }
-            })
-            .collect();
-
-        let continuation_results: Vec<_> = stream::iter(continuation_futures)
-            .buffer_unordered(5)
-            .collect()
-            .await;
-
-        // Phase 3: Merge initial + continuation entries
-        let mut all_entries = initial_entries;
-        for (start, end, result) in continuation_results {
-            match result {
-                Ok(entries) => {
-                    debug!(
-                        "Continuation group (pages {}-{}): extracted {} entries",
-                        start,
-                        end,
-                        entries.len()
-                    );
-                    all_entries.extend(entries);
-                }
-                Err(e) => {
-                    warn!("Continuation group (pages {}-{}) failed: {}", start, end, e);
-                }
-            }
-        }
-
-        // Phase 4: Sort by page number, deduplicate, truncate
-        all_entries.sort_by(|a, b| {
-            a.physical_page
-                .unwrap_or(0)
-                .cmp(&b.physical_page.unwrap_or(0))
-        });
-        all_entries.dedup_by(|a, b| {
-            a.title.trim() == b.title.trim() && a.physical_page == b.physical_page
-        });
-
-        Ok(Self::finalize_entries(all_entries, page_count))
-    }
-
-    /// Truncate out-of-range page numbers and log stats.
-    fn finalize_entries(mut entries: Vec<TocEntry>, page_count: usize) -> Vec<TocEntry> {
-        for entry in &mut entries {
-            if let Some(p) = entry.physical_page {
-                if p > page_count {
-                    warn!("Truncating out-of-range page {} for '{}'", p, entry.title);
-                    entry.physical_page = Some(page_count);
-                }
-            }
-        }
-        info!("Structure extraction complete: {} entries", entries.len());
-        entries
-    }
-
-    /// Group pages by estimated token count.
-    ///
-    /// Each group stays under `max_tokens_per_group`. Consecutive groups
-    /// overlap by `overlap_pages` pages to avoid splitting content at
-    /// section boundaries.
-    fn group_pages(&self, pages: &[PdfPage]) -> Vec<PageGroup> {
-        let mut groups = Vec::new();
-        let mut group_tokens = 0usize;
-        let mut group_pages_buf = Vec::new();
-
-        for (i, page) in pages.iter().enumerate() {
-            let new_tokens = group_tokens + page.token_count;
-
-            if new_tokens > self.config.max_tokens_per_group && !group_pages_buf.is_empty() {
-                // Finalise current group
-                let text = format_group_text(&group_pages_buf);
-                groups.push(PageGroup {
-                    text,
-                    start_page: group_pages_buf.first().unwrap().number,
-                    end_page: group_pages_buf.last().unwrap().number,
-                });
-
-                // Start new group with overlap
-                let overlap_start = i.saturating_sub(self.config.overlap_pages);
-                group_pages_buf = pages[overlap_start..=i].to_vec();
-                group_tokens = group_pages_buf.iter().map(|p| p.token_count).sum();
-            } else {
-                group_tokens = new_tokens;
-                group_pages_buf.push(page.clone());
-            }
-        }
-
-        // Final group
-        if !group_pages_buf.is_empty() {
-            let text = format_group_text(&group_pages_buf);
-            groups.push(PageGroup {
-                text,
-                start_page: group_pages_buf.first().unwrap().number,
-                end_page: group_pages_buf.last().unwrap().number,
-            });
-        }
-
-        groups
-    }
-
-    /// Generate initial structure from the first page group.
-    async fn generate_initial(&self, group: &PageGroup) -> Result<Vec<TocEntry>> {
-        let system = STRUCTURE_EXTRACTION_SYSTEM_PROMPT;
-        let user = format!(
-            r#"Analyze this document content and extract its hierarchical structure.
-
-Document content:
-{}
-
-Return a JSON array:
-[
-  {{"title": "Section Title", "level": 1, "physical_page": 1}},
-  {{"title": "Subsection", "level": 2, "physical_page": 3}},
-  ...
-]
-
-Rules:
-- "level" reflects the hierarchy (1 = chapter/top, 2 = section, 3 = subsection)
-- "physical_page" is the page number where the section begins
-- Preserve original titles as closely as possible
-- Only output the JSON array, no other text"#,
-            group.text
-        );
-
-        let sections: Vec<ExtractedSection> = self.client.complete_json(system, &user).await?;
-
-        Ok(sections
-            .into_iter()
-            .map(|s| {
-                TocEntry::new(s.title, s.level)
-                    .with_physical_page(s.physical_page)
-                    .with_confidence(0.7)
-            })
-            .collect())
-    }
-
-    /// Continue structure extraction for a subsequent group.
-    ///
-    /// Passes previously extracted entries as context so the LLM can
-    /// continue the structure rather than restart.
-    async fn generate_continuation(
-        &self,
-        group: &PageGroup,
-        previous: &[TocEntry],
-    ) -> Result<Vec<TocEntry>> {
-        let system = STRUCTURE_EXTRACTION_SYSTEM_PROMPT;
-
-        // Summarise previous entries as context
-        let prev_summary = previous
-            .iter()
-            .rev()
-            .take(10)
-            .rev()
-            .map(|e| {
-                format!(
-                    "  {{\"title\": \"{}\", \"level\": {}, \"physical_page\": {}}}",
-                    e.title,
-                    e.level,
-                    e.physical_page.unwrap_or(0)
-                )
-            })
-            .collect::<Vec<_>>()
-            .join(",\n");
-
-        let user = format!(
-            r#"Previously extracted structure:
-[
-{}
-]
-
-Continue extracting structure from these pages:
-{}
-
-Return ONLY the NEW entries (do not repeat previous ones):
-[
-  {{"title": "...", "level": N, "physical_page": M}},
-  ...
-]
-
-If no new structural elements are found, return: []"#,
-            prev_summary, group.text
-        );
-
-        let sections: Vec<ExtractedSection> = self.client.complete_json(system, &user).await?;
-
-        Ok(sections
-            .into_iter()
-            .map(|s| {
-                TocEntry::new(s.title, s.level)
-                    .with_physical_page(s.physical_page)
-                    .with_confidence(0.7)
-            })
-            .collect())
-    }
-
-    /// Static version of continuation generation for parallel use.
-    ///
-    /// Uses an owned `LlmClient` reference instead of `&self`.
-    async fn generate_continuation_with_client(
-        client: &LlmClient,
-        group: &PageGroup,
-        previous: &[TocEntry],
-    ) -> Result<Vec<TocEntry>> {
-        let system = STRUCTURE_EXTRACTION_SYSTEM_PROMPT;
-
-        let prev_summary = previous
-            .iter()
-            .rev()
-            .take(10)
-            .rev()
-            .map(|e| {
-                format!(
-                    "  {{\"title\": \"{}\", \"level\": {}, \"physical_page\": {}}}",
-                    e.title,
-                    e.level,
-                    e.physical_page.unwrap_or(0)
-                )
-            })
-            .collect::<Vec<_>>()
-            .join(",\n");
-
-        let user = format!(
-            r#"Previously extracted structure:
-[
-{}
-]
-
-Continue extracting structure from these pages:
-{}
-
-Return ONLY the NEW entries (do not repeat previous ones):
-[
-  {{"title": "...", "level": N, "physical_page": M}},
-  ...
-]
-
-If no new structural elements are found, return: []"#,
-            prev_summary, group.text
-        );
-
-        let sections: Vec<ExtractedSection> = client.complete_json(system, &user).await?;
-
-        Ok(sections
-            .into_iter()
-            .map(|s| {
-                TocEntry::new(s.title, s.level)
-                    .with_physical_page(s.physical_page)
-                    .with_confidence(0.7)
-            })
-            .collect())
-    }
-}
-
-/// Format pages into tagged text for LLM consumption.
-fn format_group_text(pages: &[PdfPage]) -> String {
-    pages
-        .iter()
-        .map(|p| {
-            // Truncate individual page text if very long
-            let text = if p.text.len() > 3000 {
-                &p.text[..3000]
-            } else {
-                &p.text
-            };
-            format!("<page_{}>\n{}\n</page_{}>", p.number, text, p.number)
-        })
-        .collect::<Vec<_>>()
-        .join("\n\n")
-}
-
-const STRUCTURE_EXTRACTION_SYSTEM_PROMPT: &str = r#"You are a document structure extraction expert. Your task is to analyze document content and extract its hierarchical structure (chapters, sections, subsections).
-
-For each structural element you find, provide:
-- title: The section title exactly as it appears
-- level: The hierarchy level (1 = chapter/top level, 2 = section, 3 = subsection)
-- physical_page: The page number where this section begins
-
-Important:
-- Focus on genuine structural elements (chapters, sections), not paragraph topics
-- Do NOT include the abstract, summary, or bibliography as structural elements unless they are major sections
-- Be conservative: fewer high-quality entries are better than many low-quality ones"#;
-
-/// LLM response type for structure extraction.
-#[derive(serde::Deserialize)]
-struct ExtractedSection {
-    title: String,
-    level: usize,
-    physical_page: usize,
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_default_config() {
-        let config = StructureExtractorConfig::default();
-        assert_eq!(config.max_tokens_per_group, 20_000);
-        assert_eq!(config.overlap_pages, 1);
-    }
-
-    #[test]
-    fn test_group_pages_single_group() {
-        let extractor = StructureExtractor::with_defaults();
-
-        let pages: Vec<PdfPage> = (1..=5)
-            .map(|i| PdfPage::new(i, format!("Page {} content", i)))
-            .collect();
-
-        let groups = extractor.group_pages(&pages);
-        assert_eq!(groups.len(), 1);
-        assert_eq!(groups[0].start_page, 1);
-        assert_eq!(groups[0].end_page, 5);
-    }
-
-    #[test]
-    fn test_group_pages_multiple_groups() {
-        let config = StructureExtractorConfig {
-            max_tokens_per_group: 50,
-            overlap_pages: 1,
-            ..Default::default()
-        };
-        let extractor = StructureExtractor::new(config);
-
-        // Create pages with enough text to span multiple groups
-        let pages: Vec<PdfPage> = (1..=10)
-            .map(|i| {
-                let text = format!(
-                    "Page {} content. This is a longer text to use more tokens. ",
-                    i
-                )
-                .repeat(10);
-                PdfPage::new(i, text)
-            })
-            .collect();
-
-        let groups = extractor.group_pages(&pages);
-        assert!(
-            groups.len() > 1,
-            "Expected multiple groups, got {}",
-            groups.len()
-        );
-    }
-
-    #[test]
-    fn test_format_group_text() {
-        let pages = vec![PdfPage::new(1, "Hello"), PdfPage::new(2, "World")];
-        let text = format_group_text(&pages);
-        assert!(text.contains("<page_1>"));
-        assert!(text.contains("<page_2>"));
-        assert!(text.contains("Hello"));
-        assert!(text.contains("World"));
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/parse/toc/types.rs b/vectorless-core/vectorless/src/index/parse/toc/types.rs
deleted file mode 100644
index 0438c0d3..00000000
--- a/vectorless-core/vectorless/src/index/parse/toc/types.rs
+++ /dev/null
@@ -1,350 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! TOC (Table of Contents) types.
-
-use serde::{Deserialize, Serialize};
-
-/// A single TOC entry.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct TocEntry {
-    /// Section title.
-    pub title: String,
-
-    /// Hierarchy level (1 = top level, 2 = subsection, etc.).
-    pub level: usize,
-
-    /// Page number from TOC (may have offset).
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub toc_page: Option<usize>,
-
-    /// Actual physical page number (after verification/assignment).
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub physical_page: Option<usize>,
-
-    /// Confidence score (0.0 - 1.0).
-    #[serde(default)]
-    pub confidence: f32,
-
-    /// Start line index (for tree building).
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub start_index: Option<usize>,
-
-    /// End line index (for tree building).
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub end_index: Option<usize>,
-
-    /// Content of this section.
-    #[serde(skip_serializing_if = "String::is_empty")]
-    pub content: String,
-}
-
-impl TocEntry {
-    /// Create a new TOC entry.
-    pub fn new(title: impl Into<String>, level: usize) -> Self {
-        Self {
-            title: title.into(),
-            level,
-            toc_page: None,
-            physical_page: None,
-            confidence: 1.0,
-            start_index: None,
-            end_index: None,
-            content: String::new(),
-        }
-    }
-
-    /// Set the TOC page number.
-    pub fn with_toc_page(mut self, page: usize) -> Self {
-        self.toc_page = Some(page);
-        self
-    }
-
-    /// Set the physical page number.
-    pub fn with_physical_page(mut self, page: usize) -> Self {
-        self.physical_page = Some(page);
-        self
-    }
-
-    /// Set the confidence score.
-    pub fn with_confidence(mut self, confidence: f32) -> Self {
-        self.confidence = confidence.clamp(0.0, 1.0);
-        self
-    }
-
-    /// Check if this entry has a valid physical page.
-    pub fn has_physical_page(&self) -> bool {
-        self.physical_page.is_some()
-    }
-}
-
-impl Default for TocEntry {
-    fn default() -> Self {
-        Self::new("", 1)
-    }
-}
-
-/// Result of TOC detection.
-#[derive(Debug, Clone)]
-pub struct TocDetection {
-    /// Whether a TOC was found.
-    pub found: bool,
-
-    /// Page numbers where TOC appears.
-    pub pages: Vec<usize>,
-
-    /// Whether the TOC contains page numbers.
-    pub has_page_numbers: bool,
-
-    /// Detection confidence (0.0 - 1.0).
-    pub confidence: f32,
-}
-
-impl TocDetection {
-    /// Create a new TOC detection result.
-    pub fn new(found: bool) -> Self {
-        Self {
-            found,
-            pages: Vec::new(),
-            has_page_numbers: false,
-            confidence: 0.0,
-        }
-    }
-
-    /// Create a result indicating no TOC was found.
-    pub fn not_found() -> Self {
-        Self::new(false)
-    }
-
-    /// Set the TOC pages.
-    pub fn with_pages(mut self, pages: Vec<usize>) -> Self {
-        self.pages = pages;
-        self
-    }
-
-    /// Set whether page numbers are present.
-    pub fn with_page_numbers(mut self, has: bool) -> Self {
-        self.has_page_numbers = has;
-        self
-    }
-
-    /// Set the confidence score.
-    pub fn with_confidence(mut self, confidence: f32) -> Self {
-        self.confidence = confidence.clamp(0.0, 1.0);
-        self
-    }
-}
-
-/// Page offset calculation result.
-#[derive(Debug, Clone)]
-pub struct PageOffset {
-    /// Calculated offset: physical_page = toc_page + offset.
-    pub offset: i32,
-
-    /// Number of samples used for calculation.
-    pub sample_count: usize,
-
-    /// Confidence in the offset calculation.
-    pub confidence: f32,
-}
-
-impl PageOffset {
-    /// Create a new page offset.
-    pub fn new(offset: i32, sample_count: usize, confidence: f32) -> Self {
-        Self {
-            offset,
-            sample_count,
-            confidence: confidence.clamp(0.0, 1.0),
-        }
-    }
-
-    /// Apply offset to a TOC page number.
-    pub fn apply(&self, toc_page: usize) -> usize {
-        (toc_page as i32 + self.offset).max(1) as usize
-    }
-}
-
-/// Verification error for a single entry.
-#[derive(Debug, Clone)]
-pub struct VerificationError {
-    /// Index of the entry in the TOC list.
-    pub index: usize,
-
-    /// Entry title.
-    pub title: String,
-
-    /// Expected physical page.
-    pub expected_page: usize,
-
-    /// Type of error.
-    pub error_type: ErrorType,
-}
-
-impl VerificationError {
-    /// Create a new verification error.
-    pub fn new(
-        index: usize,
-        title: impl Into<String>,
-        expected_page: usize,
-        error_type: ErrorType,
-    ) -> Self {
-        Self {
-            index,
-            title: title.into(),
-            expected_page,
-            error_type,
-        }
-    }
-}
-
-/// Type of verification error.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum ErrorType {
-    /// Title not found on the expected page.
-    TitleNotFound,
-    /// Title found but not at page start.
-    NotAtPageStart,
-    /// Page number out of document range.
-    PageOutOfRange,
-}
-
-impl std::fmt::Display for ErrorType {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            ErrorType::TitleNotFound => write!(f, "Title not found on page"),
-            ErrorType::NotAtPageStart => write!(f, "Title not at page start"),
-            ErrorType::PageOutOfRange => write!(f, "Page out of range"),
-        }
-    }
-}
-
-/// Result of TOC verification.
-#[derive(Debug, Clone)]
-pub struct VerificationReport {
-    /// Total entries verified.
-    pub total: usize,
-
-    /// Number of correct entries.
-    pub correct: usize,
-
-    /// Accuracy (0.0 - 1.0).
-    pub accuracy: f32,
-
-    /// List of errors found.
-    pub errors: Vec<VerificationError>,
-}
-
-impl VerificationReport {
-    /// Create a new verification report.
-    pub fn new(total: usize, correct: usize, errors: Vec<VerificationError>) -> Self {
-        let accuracy = if total > 0 {
-            correct as f32 / total as f32
-        } else {
-            1.0
-        };
-        Self {
-            total,
-            correct,
-            accuracy,
-            errors,
-        }
-    }
-
-    /// Create a report indicating all entries are correct.
-    pub fn all_correct(total: usize) -> Self {
-        Self::new(total, total, Vec::new())
-    }
-
-    /// Check if the accuracy meets a threshold.
-    pub fn meets_threshold(&self, threshold: f32) -> bool {
-        self.accuracy >= threshold
-    }
-
-    /// Check if there are any errors.
-    pub fn has_errors(&self) -> bool {
-        !self.errors.is_empty()
-    }
-}
-
-/// Processing mode for the TOC extraction pipeline.
-///
-/// Modes are ordered by quality: higher modes produce more accurate results
-/// when they succeed, but can degrade to lower modes on failure.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum ProcessingMode {
-    /// TOC found with page numbers. Highest quality path.
-    TocWithPageNumbers,
-    /// TOC found without page numbers, or page-number accuracy was too low.
-    TocWithoutPageNumbers,
-    /// No TOC, or all TOC-based modes failed. LLM-driven structure extraction.
-    NoToc,
-}
-
-impl ProcessingMode {
-    /// Degrade to the next lower quality mode.
-    ///
-    /// Returns `None` if already at the lowest mode (`NoToc`).
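-    ///
-    /// Example (mirrors the unit test below; module path assumed):
-    ///
-    /// ```ignore
-    /// assert_eq!(
-    ///     ProcessingMode::TocWithPageNumbers.degrade(),
-    ///     Some(ProcessingMode::TocWithoutPageNumbers)
-    /// );
-    /// assert_eq!(ProcessingMode::NoToc.degrade(), None);
-    /// ```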
-    pub fn degrade(self) -> Option<Self> {
-        match self {
-            Self::TocWithPageNumbers => Some(Self::TocWithoutPageNumbers),
-            Self::TocWithoutPageNumbers => Some(Self::NoToc),
-            Self::NoToc => None,
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_toc_entry() {
-        let entry = TocEntry::new("Chapter 1", 1)
-            .with_toc_page(10)
-            .with_physical_page(12)
-            .with_confidence(0.9);
-
-        assert_eq!(entry.title, "Chapter 1");
-        assert_eq!(entry.level, 1);
-        assert_eq!(entry.toc_page, Some(10));
-        assert_eq!(entry.physical_page, Some(12));
-        assert!((entry.confidence - 0.9).abs() < 0.01);
-    }
-
-    #[test]
-    fn test_page_offset() {
-        let offset = PageOffset::new(2, 5, 0.9);
-        assert_eq!(offset.apply(10), 12);
-        assert_eq!(offset.apply(1), 3);
-    }
-
-    #[test]
-    fn test_verification_report() {
-        let report = VerificationReport::all_correct(10);
-        assert_eq!(report.total, 10);
-        assert_eq!(report.correct, 10);
-        assert_eq!(report.accuracy, 1.0);
-        assert!(!report.has_errors());
-    }
-
-    #[test]
-    fn test_error_type_display() {
-        assert_eq!(
-            format!("{}", ErrorType::TitleNotFound),
-            "Title not found on page"
-        );
-    }
-
-    #[test]
-    fn test_processing_mode_degrade() {
-        assert_eq!(
-            ProcessingMode::TocWithPageNumbers.degrade(),
-            Some(ProcessingMode::TocWithoutPageNumbers)
-        );
-        assert_eq!(
-            ProcessingMode::TocWithoutPageNumbers.degrade(),
-            Some(ProcessingMode::NoToc)
-        );
-        assert_eq!(ProcessingMode::NoToc.degrade(), None);
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/parse/toc/verifier.rs b/vectorless-core/vectorless/src/index/parse/toc/verifier.rs
deleted file mode 100644
index 1e3d1d45..00000000
--- a/vectorless-core/vectorless/src/index/parse/toc/verifier.rs
+++ /dev/null
@@ -1,281 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Index verifier - verifies TOC entry page assignments.
-
-use futures::stream::{self, StreamExt};
-use rand::seq::SliceRandom;
-use tracing::{debug, info};
-
-use crate::error::Result;
-use crate::index::parse::pdf::PdfPage;
-use crate::llm::config::LlmConfig;
-
-use super::types::{ErrorType, TocEntry, VerificationError, VerificationReport};
-use crate::llm::LlmClient;
-
-/// Verifier configuration.
-#[derive(Debug, Clone)]
-pub struct VerifierConfig {
-    /// Sample size for verification (None = all entries).
-    pub sample_size: Option<usize>,
-
-    /// LLM configuration.
-    pub llm_config: LlmConfig,
-
-    /// Accuracy threshold for acceptance.
-    pub accuracy_threshold: f32,
-}
-
-impl Default for VerifierConfig {
-    fn default() -> Self {
-        Self {
-            sample_size: Some(10),
-            llm_config: LlmConfig::default(),
-            accuracy_threshold: 0.6,
-        }
-    }
-}
-
-/// Index verifier - verifies that TOC entries point to correct pages.
-pub struct IndexVerifier {
-    config: VerifierConfig,
-    client: LlmClient,
-}
-
-impl IndexVerifier {
-    /// Create a new verifier.
-    pub fn new(config: VerifierConfig) -> Self {
-        let client = LlmClient::new(config.llm_config.clone().into());
-        Self { config, client }
-    }
-
-    /// Create a verifier with an externally provided LLM client.
-    pub fn with_client(client: LlmClient) -> Self {
-        Self {
-            config: VerifierConfig::default(),
-            client,
-        }
-    }
-
-    /// Create a verifier with default configuration.
-    pub fn with_defaults() -> Self {
-        Self::new(VerifierConfig::default())
-    }
-
-    /// Verify TOC entries against PDF pages.
-    ///
-    /// Sample entries are verified via LLM calls with bounded concurrency.
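-    ///
-    /// Sketch (editor's illustration; requires a reachable LLM endpoint):
-    ///
-    /// ```ignore
-    /// let verifier = IndexVerifier::with_defaults();
-    /// let report = verifier.verify(&entries, &pages).await?;
-    /// if !report.meets_threshold(0.6) {
-    ///     // repair the reported errors or degrade the processing mode
-    /// }
-    /// ```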
-    pub async fn verify(
-        &self,
-        entries: &[TocEntry],
-        pages: &[PdfPage],
-    ) -> Result<VerificationReport> {
-        if entries.is_empty() {
-            return Ok(VerificationReport::all_correct(0));
-        }
-
-        let sample = self.select_sample(entries);
-
-        // Launch verification checks with bounded concurrency
-        let client = self.client.clone();
-        let futures: Vec<_> = sample
-            .iter()
-            .map(|(index, entry)| {
-                let index = *index;
-                let title = entry.title.clone();
-                let physical_page = entry.physical_page;
-                let client = client.clone();
-                let pages = pages.to_vec();
-
-                async move {
-                    match physical_page {
-                        Some(page) => {
-                            let result =
-                                Self::verify_entry_with_client(&client, &title, page, &pages).await;
-                            (index, title, page, result)
-                        }
-                        None => (index, title, 0, Ok(Err(ErrorType::PageOutOfRange))),
-                    }
-                }
-            })
-            .collect();
-
-        let results: Vec<_> = stream::iter(futures).buffer_unordered(5).collect().await;
-
-        // Aggregate results
-        let total = results.len();
-        let mut errors = Vec::new();
-        let mut correct = 0;
-
-        for (index, title, page, result) in results {
-            match result {
-                Ok(Ok(())) => correct += 1,
-                Ok(Err(error_type)) => {
-                    errors.push(VerificationError::new(index, title, page, error_type));
-                }
-                Err(e) => {
-                    debug!("Verification LLM call failed: {}", e);
-                    errors.push(VerificationError::new(
-                        index,
-                        title,
-                        page,
-                        ErrorType::TitleNotFound,
-                    ));
-                }
-            }
-        }
-
-        let report = VerificationReport::new(total, correct, errors);
-        info!(
-            "Verification complete: {}/{} correct ({:.1}% accuracy)",
-            report.correct,
-            report.total,
-            report.accuracy * 100.0
-        );
-
-        Ok(report)
-    }
-
-    /// Select a sample of entries to verify.
-    fn select_sample<'a>(&self, entries: &'a [TocEntry]) -> Vec<(usize, &'a TocEntry)> {
-        let with_pages: Vec<_> = entries
-            .iter()
-            .enumerate()
-            .filter(|(_, e)| e.physical_page.is_some())
-            .collect();
-
-        match self.config.sample_size {
-            Some(size) if size < with_pages.len() => {
-                // Random sample
-                let mut rng = rand::thread_rng();
-                let mut sample: Vec<_> = with_pages;
-                sample.shuffle(&mut rng);
-                sample.into_iter().take(size).collect()
-            }
-            _ => with_pages,
-        }
-    }
-
-    /// Verify a single entry using a cloned client (for concurrent use).
-    async fn verify_entry_with_client(
-        client: &LlmClient,
-        title: &str,
-        physical_page: usize,
-        pages: &[PdfPage],
-    ) -> Result<std::result::Result<(), ErrorType>> {
-        if physical_page == 0 || physical_page > pages.len() {
-            return Ok(Err(ErrorType::PageOutOfRange));
-        }
-
-        let page = &pages[physical_page - 1];
-
-        let found = Self::check_title_on_page_with_client(client, title, &page.text).await?;
-
-        if !found {
-            debug!("Title '{}' not found on page {}", title, physical_page);
-            return Ok(Err(ErrorType::TitleNotFound));
-        }
-
-        Ok(Ok(()))
-    }
-
-    /// Check if a title appears on a page using LLM.
-    async fn check_title_on_page_with_client(
-        client: &LlmClient,
-        title: &str,
-        page_text: &str,
-    ) -> Result<bool> {
-        let system = "You are a document analysis assistant. Determine if a section title appears in the given text.";
-
-        let text = if page_text.len() > 1000 {
-            &page_text[..1000]
-        } else {
-            page_text
-        };
-
-        let user = format!(
-            r#"Does the section title "{}" appear in this page text?
-
-Page text:
-{}
-
-Reply in JSON format:
-{{"found": true/false}}"#,
-            title, text
-        );
-
-        #[derive(serde::Deserialize)]
-        struct CheckResult {
-            found: bool,
-        }
-
-        let result: CheckResult = client.complete_json(system, &user).await?;
-        Ok(result.found)
-    }
-
-    /// Check if a title appears at the start of a page.
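-    ///
-    /// Sketch (editor's illustration; requires an LLM endpoint):
-    ///
-    /// ```ignore
-    /// let at_start = verifier.check_title_at_start("1. Introduction", &page.text).await?;
-    /// ```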
-    pub async fn check_title_at_start(&self, title: &str, page_text: &str) -> Result<bool> {
-        let system = "You are a document analysis assistant. Determine if a section title appears at the START of the given page text.";
-
-        // Only check first 500 characters
-        let text = if page_text.len() > 500 {
-            &page_text[..500]
-        } else {
-            page_text
-        };
-
-        let user = format!(
-            r#"Does the section title "{}" appear at the BEGINNING of this page text?
-Note: It should be near the start, not in the middle or end.
-
-Page text:
-{}
-
-Reply in JSON format:
-{{"at_start": true/false}}"#,
-            title, text
-        );
-
-        #[derive(serde::Deserialize)]
-        struct StartCheck {
-            at_start: bool,
-        }
-
-        let result: StartCheck = self.client.complete_json(system, &user).await?;
-        Ok(result.at_start)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_select_sample() {
-        let verifier = IndexVerifier::with_defaults();
-
-        let entries: Vec<TocEntry> = (1..=20)
-            .map(|i| TocEntry::new(format!("Entry {}", i), 1).with_physical_page(i))
-            .collect();
-
-        let sample = verifier.select_sample(&entries);
-        assert_eq!(sample.len(), 10); // default sample_size
-    }
-
-    #[test]
-    fn test_select_sample_all() {
-        let config = VerifierConfig {
-            sample_size: None,
-            ..Default::default()
-        };
-        let verifier = IndexVerifier::new(config);
-
-        let entries: Vec<TocEntry> = (1..=5)
-            .map(|i| TocEntry::new(format!("Entry {}", i), 1).with_physical_page(i))
-            .collect();
-
-        let sample = verifier.select_sample(&entries);
-        assert_eq!(sample.len(), 5);
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/parse/types.rs b/vectorless-core/vectorless/src/index/parse/types.rs
deleted file mode 100644
index 56f9f987..00000000
--- a/vectorless-core/vectorless/src/index/parse/types.rs
+++ /dev/null
@@ -1,173 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Document type definitions.
-//!
-//! This module defines the types used for document parsing:
-//! - [`RawNode`] - A raw node extracted from a document before tree construction
-//! - [`DocumentMeta`] - Metadata about a document
-//! - [`DocumentFormat`] - Supported document formats (re-exported from document module)
-
-use serde::{Deserialize, Serialize};
-
-/// Re-export [`DocumentFormat`] from the document module.
-pub use crate::document::DocumentFormat;
-
-/// A raw node extracted from a document.
-///
-/// This represents a section or element before it's organized into a tree.
-/// Raw nodes are produced by parsers and consumed by the indexer.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct RawNode {
-    /// Title or heading of this node.
-    pub title: String,
-
-    /// Text content of this node (including all children's content).
-    pub content: String,
-
-    /// Level in the hierarchy (0 = root, 1 = top-level section, etc.).
-    pub level: usize,
-
-    /// Line number where this node starts (1-based).
-    pub line_start: usize,
-
-    /// Line number where this node ends (1-based).
-    pub line_end: usize,
-
-    /// Page number for PDF documents (1-based).
-    pub page: Option<usize>,
-
-    /// Estimated token count for this node's own content.
-    pub token_count: Option<usize>,
-
-    /// Total token count including all children (recursive, computed by thinner).
-    #[serde(default)]
-    pub total_token_count: Option<usize>,
-}
-
-impl Default for RawNode {
-    fn default() -> Self {
-        Self {
-            title: String::new(),
-            content: String::new(),
-            level: 0,
-            line_start: 1,
-            line_end: 1,
-            page: None,
-            token_count: None,
-            total_token_count: None,
-        }
-    }
-}
-
-impl RawNode {
-    /// Create a new raw node with the given title.
-    pub fn new(title: impl Into<String>) -> Self {
-        Self {
-            title: title.into(),
-            ..Default::default()
-        }
-    }
-
-    /// Set the content of this node.
-    pub fn with_content(mut self, content: impl Into<String>) -> Self {
-        self.content = content.into();
-        self
-    }
-
-    /// Set the level of this node.
-    pub fn with_level(mut self, level: usize) -> Self {
-        self.level = level;
-        self
-    }
-
-    /// Set the line range of this node.
-    pub fn with_lines(mut self, start: usize, end: usize) -> Self {
-        self.line_start = start;
-        self.line_end = end;
-        self
-    }
-
-    /// Set the page number of this node.
-    pub fn with_page(mut self, page: usize) -> Self {
-        self.page = Some(page);
-        self
-    }
-
-    /// Check if this node has any content.
-    pub fn has_content(&self) -> bool {
-        !self.content.trim().is_empty()
-    }
-
-    /// Get the character count of the content.
-    pub fn char_count(&self) -> usize {
-        self.content.chars().count()
-    }
-
-    /// Get the word count (approximate) of the content.
-    pub fn word_count(&self) -> usize {
-        self.content.split_whitespace().count()
-    }
-}
-
-/// Document metadata.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DocumentMeta {
-    /// Document name or title.
-    pub name: String,
-
-    /// Document format.
-    pub format: DocumentFormat,
-
-    /// Total number of pages (for PDF).
-    pub page_count: Option<usize>,
-
-    /// Total number of lines.
-    pub line_count: usize,
-
-    /// Source file path (if applicable).
-    pub source_path: Option<String>,
-
-    /// Document description (generated by LLM).
-    pub description: Option<String>,
-}
-
-impl Default for DocumentMeta {
-    fn default() -> Self {
-        Self {
-            name: String::new(),
-            format: DocumentFormat::Markdown,
-            page_count: None,
-            line_count: 0,
-            source_path: None,
-            description: None,
-        }
-    }
-}
-
-/// Result of parsing a document.
-#[derive(Debug, Clone)]
-pub struct ParseResult {
-    /// Document metadata.
-    pub meta: DocumentMeta,
-
-    /// Raw nodes extracted from the document.
-    pub nodes: Vec<RawNode>,
-}
-
-impl ParseResult {
-    /// Create a new parse result.
-    pub fn new(meta: DocumentMeta, nodes: Vec<RawNode>) -> Self {
-        Self { meta, nodes }
-    }
-
-    /// Get the number of nodes.
-    pub fn node_count(&self) -> usize {
-        self.nodes.len()
-    }
-
-    /// Check if there are no nodes.
-    pub fn is_empty(&self) -> bool {
-        self.nodes.is_empty()
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/pipeline/checkpoint.rs b/vectorless-core/vectorless/src/index/pipeline/checkpoint.rs
deleted file mode 100644
index 4ba1f01a..00000000
--- a/vectorless-core/vectorless/src/index/pipeline/checkpoint.rs
+++ /dev/null
@@ -1,329 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Pipeline checkpoint support for resume-after-interruption.
-//!
-//! Saves pipeline state after each stage group completes.
-//! On restart, completed stages are skipped and the pipeline resumes
-//! from the first incomplete stage.
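-//!
-//! Resume-check sketch (editor's illustration; the directory path is a
-//! placeholder):
-//!
-//! ```ignore
-//! let manager = CheckpointManager::new(".vectorless/checkpoints");
-//! if let Some(cp) = manager.load(&doc_id) {
-//!     if CheckpointManager::is_valid_for_resume(&cp, &source_hash, version, &fingerprint) {
-//!         // skip cp.completed_stages and resume at the first incomplete stage
-//!     } else {
-//!         manager.clear(&doc_id)?; // stale checkpoint: content or config changed
-//!     }
-//! }
-//! ```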
-
-use std::path::PathBuf;
-
-use chrono::{DateTime, Utc};
-use serde::{Deserialize, Serialize};
-use tracing::{info, warn};
-
-use crate::document::DocumentTree;
-use crate::index::parse::RawNode;
-
-use super::metrics::IndexMetrics;
-
-/// Serializable checkpoint capturing pipeline state at a point in time.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PipelineCheckpoint {
-    /// Document ID being indexed.
-    pub doc_id: String,
-
-    /// SHA-256 hash of the source content.
-    pub source_hash: String,
-
-    /// Processing version at the time of checkpoint.
-    pub processing_version: u32,
-
-    /// Fingerprint of pipeline configuration.
-    pub config_fingerprint: String,
-
-    /// Names of stages that completed successfully.
-    pub completed_stages: Vec<String>,
-
-    /// Serialized context data that stages need for resume.
-    pub context_data: CheckpointContextData,
-
-    /// When this checkpoint was created.
-    pub timestamp: DateTime<Utc>,
-}
-
-/// Context data that can be serialized for checkpoint persistence.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct CheckpointContextData {
-    /// Raw nodes from parsing (if parse stage completed).
-    pub raw_nodes: Vec<RawNode>,
-
-    /// Built document tree (if build stage completed).
-    pub tree: Option<DocumentTree>,
-
-    /// Metrics collected so far.
-    pub metrics: IndexMetrics,
-
-    /// Page count (for PDFs).
-    pub page_count: Option<usize>,
-
-    /// Line count.
-    pub line_count: Option<usize>,
-
-    /// Document description.
-    pub description: Option<String>,
-}
-
-/// Manages checkpoint persistence on disk.
-pub struct CheckpointManager {
-    /// Directory where checkpoints are stored.
-    checkpoint_dir: PathBuf,
-}
-
-impl CheckpointManager {
-    /// Create a new checkpoint manager.
-    ///
-    /// The directory will be created on first save if it doesn't exist.
-    pub fn new(checkpoint_dir: impl Into<PathBuf>) -> Self {
-        Self {
-            checkpoint_dir: checkpoint_dir.into(),
-        }
-    }
-
-    /// Save a checkpoint for the given document.
-    pub fn save(&self, doc_id: &str, checkpoint: &PipelineCheckpoint) -> std::io::Result<()> {
-        // Ensure directory exists
-        std::fs::create_dir_all(&self.checkpoint_dir)?;
-
-        let path = self.checkpoint_path(doc_id);
-        let json = serde_json::to_string(checkpoint)
-            .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?;
-
-        // Write atomically: write to temp file, then rename
-        let temp_path = path.with_extension("tmp");
-        std::fs::write(&temp_path, json)?;
-        std::fs::rename(&temp_path, &path)?;
-
-        Ok(())
-    }
-
-    /// Load a checkpoint for the given document.
-    ///
-    /// Returns `None` if no checkpoint exists.
-    pub fn load(&self, doc_id: &str) -> Option<PipelineCheckpoint> {
-        let path = self.checkpoint_path(doc_id);
-        if !path.exists() {
-            return None;
-        }
-
-        let data = std::fs::read(&path).ok()?;
-        match serde_json::from_slice(&data) {
-            Ok(checkpoint) => Some(checkpoint),
-            Err(e) => {
-                warn!("Failed to deserialize checkpoint for {}: {}", doc_id, e);
-                None
-            }
-        }
-    }
-
-    /// Remove a checkpoint after successful completion.
-    pub fn clear(&self, doc_id: &str) -> std::io::Result<()> {
-        let path = self.checkpoint_path(doc_id);
-        if path.exists() {
-            std::fs::remove_file(path)?;
-            info!("Cleared checkpoint for document {}", doc_id);
-        }
-        Ok(())
-    }
-
-    /// Check if a checkpoint exists for the given document.
-    pub fn exists(&self, doc_id: &str) -> bool {
-        self.checkpoint_path(doc_id).exists()
-    }
-
-    /// Get the checkpoint file path for a document.
-    fn checkpoint_path(&self, doc_id: &str) -> PathBuf {
-        // Use a sanitized version of doc_id for the filename
-        let safe_name = doc_id.replace(['/', '\\', ':', '*', '?', '"', '<', '>', '|'], "_");
-        self.checkpoint_dir
-            .join(format!("{}.checkpoint.json", safe_name))
-    }
-
-    /// Check if a checkpoint is valid for resuming.
-    ///
-    /// A checkpoint is valid if:
-    /// - Source hash matches (content hasn't changed)
-    /// - Processing version matches (algorithm hasn't changed)
-    /// - Config fingerprint matches (options haven't changed)
-    pub fn is_valid_for_resume(
-        checkpoint: &PipelineCheckpoint,
-        source_hash: &str,
-        processing_version: u32,
-        config_fingerprint: &str,
-    ) -> bool {
-        checkpoint.source_hash == source_hash
-            && checkpoint.processing_version == processing_version
-            && checkpoint.config_fingerprint == config_fingerprint
-    }
-
-    /// List all checkpoint files in the directory.
-    pub fn list_checkpoints(&self) -> Vec<String> {
-        let mut result = Vec::new();
-        if let Ok(entries) = std::fs::read_dir(&self.checkpoint_dir) {
-            for entry in entries.flatten() {
-                let path = entry.path();
-                if path.extension().map_or(false, |e| e == "json") {
-                    if let Some(name) = path.file_stem().and_then(|n| n.to_str()) {
-                        // Strip .checkpoint suffix
-                        if let Some(doc_id) = name.strip_suffix(".checkpoint") {
-                            result.push(doc_id.to_string());
-                        }
-                    }
-                }
-            }
-        }
-        result
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use tempfile::TempDir;
-
-    fn make_checkpoint() -> PipelineCheckpoint {
-        PipelineCheckpoint {
-            doc_id: "test-doc-123".to_string(),
-            source_hash: "abc123".to_string(),
-            processing_version: 1,
-            config_fingerprint: "cfg-fp".to_string(),
-            completed_stages: vec!["parse".to_string(), "build".to_string()],
-            context_data: CheckpointContextData {
-                raw_nodes: Vec::new(),
-                tree: Some(DocumentTree::new("Test", "content")),
-                metrics: IndexMetrics::default(),
-                page_count: None,
-                line_count: Some(10),
-                description: None,
-            },
-            timestamp: Utc::now(),
-        }
-    }
-
-    #[test]
-    fn test_save_and_load() {
-        let dir = TempDir::new().unwrap();
-        let manager = CheckpointManager::new(dir.path());
-
-        let checkpoint = make_checkpoint();
-        manager.save("test-doc-123", &checkpoint).unwrap();
-
-        let loaded = manager.load("test-doc-123").unwrap();
-        assert_eq!(loaded.doc_id, "test-doc-123");
-        assert_eq!(loaded.completed_stages, vec!["parse", "build"]);
-        assert_eq!(loaded.context_data.line_count, Some(10));
-    }
-
-    #[test]
-    fn test_load_nonexistent() {
-        let dir = TempDir::new().unwrap();
-        let manager = CheckpointManager::new(dir.path());
-
-        assert!(manager.load("nonexistent").is_none());
-    }
-
-    #[test]
-    fn test_clear() {
-        let dir = TempDir::new().unwrap();
-        let manager = CheckpointManager::new(dir.path());
-
-        let checkpoint = make_checkpoint();
-        manager.save("test-doc-123", &checkpoint).unwrap();
-        assert!(manager.exists("test-doc-123"));
-
-        manager.clear("test-doc-123").unwrap();
-        assert!(!manager.exists("test-doc-123"));
-    }
-
-    #[test]
-    fn test_is_valid_for_resume() {
-        let checkpoint = make_checkpoint();
-
-        // Matching — valid
-        assert!(CheckpointManager::is_valid_for_resume(
-            &checkpoint,
-            "abc123",
-            1,
-            "cfg-fp"
-        ));
-
-        // Different source hash — invalid
-        assert!(!CheckpointManager::is_valid_for_resume(
-            &checkpoint,
-            "different",
-            1,
-            "cfg-fp"
-        ));
-
-        // Different processing version — invalid
-        assert!(!CheckpointManager::is_valid_for_resume(
-            &checkpoint,
-            "abc123",
-            2,
-            "cfg-fp"
-        ));
-
-        // Different config fingerprint — invalid
-        assert!(!CheckpointManager::is_valid_for_resume(
-            &checkpoint,
-            "abc123",
-            1,
-            "different"
-        ));
-    }
-
-    #[test]
-    fn test_list_checkpoints() {
-        let dir = TempDir::new().unwrap();
-        let manager = CheckpointManager::new(dir.path());
-
-        let mut cp = make_checkpoint();
-        cp.doc_id = "doc-a".to_string();
-        manager.save("doc-a", &cp).unwrap();
-
-        cp.doc_id = "doc-b".to_string();
-        manager.save("doc-b", &cp).unwrap();
-
-        let list = manager.list_checkpoints();
-        assert_eq!(list.len(), 2);
-        assert!(list.contains(&"doc-a".to_string()));
-        assert!(list.contains(&"doc-b".to_string()));
-    }
-
-    #[test]
-    fn test_roundtrip_preserves_tree() {
-        let dir = TempDir::new().unwrap();
-        let manager = CheckpointManager::new(dir.path());
-
-        let mut tree = DocumentTree::new("Root", "");
-        let child = tree.add_child(tree.root(), "Section 1", "Content");
-        tree.set_token_count(child, 42);
-
-        let checkpoint = PipelineCheckpoint {
-            doc_id: "tree-test".to_string(),
-            source_hash: "hash".to_string(),
-            processing_version: 1,
-            config_fingerprint: "fp".to_string(),
-            completed_stages: vec!["build".to_string()],
-            context_data: CheckpointContextData {
-                raw_nodes: Vec::new(),
-                tree: Some(tree),
-                metrics: IndexMetrics::default(),
-                page_count: None,
-                line_count: None,
-                description: None,
-            },
-            timestamp: Utc::now(),
-        };
-
-        manager.save("tree-test", &checkpoint).unwrap();
-        let loaded = manager.load("tree-test").unwrap();
-
-        let tree = loaded.context_data.tree.unwrap();
-        assert_eq!(tree.node_count(), 2); // root + 1 child
-        let child_id = tree.children(tree.root())[0];
-        assert_eq!(tree.get(child_id).unwrap().title, "Section 1");
-        assert_eq!(tree.get(child_id).unwrap().token_count, Some(42));
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/pipeline/context.rs b/vectorless-core/vectorless/src/index/pipeline/context.rs
deleted file mode 100644
index 27f638a4..00000000
--- a/vectorless-core/vectorless/src/index/pipeline/context.rs
+++ /dev/null
@@ -1,465 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Index context for passing data between stages.
-
-use std::collections::HashMap;
-use std::path::PathBuf;
-
-use crate::document::{Concept, DocumentTree, NavigationIndex, NodeId, ReasoningIndex};
-use crate::index::parse::{DocumentFormat, RawNode};
-use crate::llm::LlmClient;
-
-use super::super::{PipelineOptions, SummaryStrategy};
-use super::metrics::IndexMetrics;
-
-/// Input for the index pipeline.
-#[derive(Debug, Clone)]
-pub enum IndexInput {
-    /// Index from file path.
-    File(PathBuf),
-
-    /// Index from raw content string.
-    Content {
-        /// Content string.
-        content: String,
-        /// Document name.
-        name: String,
-        /// Document format.
-        format: DocumentFormat,
-    },
-
-    /// Index from binary data.
-    Bytes {
-        /// Binary data.
-        data: Vec<u8>,
-        /// Document name.
-        name: String,
-        /// Document format.
-        format: DocumentFormat,
-    },
-}
-
-impl IndexInput {
-    /// Create input from file path.
-    pub fn file(path: impl Into<PathBuf>) -> Self {
-        Self::File(path.into())
-    }
-
-    /// Create input from content string.
-    pub fn content(content: impl Into<String>) -> Self {
-        Self::Content {
-            content: content.into(),
-            name: String::new(),
-            format: DocumentFormat::Markdown,
-        }
-    }
-
-    /// Create input from content with name and format.
-    pub fn content_with(
-        content: impl Into<String>,
-        name: impl Into<String>,
-        format: DocumentFormat,
-    ) -> Self {
-        Self::Content {
-            content: content.into(),
-            name: name.into(),
-            format,
-        }
-    }
-
-    /// Create input from binary data.
-    pub fn bytes(data: impl Into<Vec<u8>>) -> Self {
-        Self::Bytes {
-            data: data.into(),
-            name: String::new(),
-            format: DocumentFormat::Pdf,
-        }
-    }
-
-    /// Create input from binary data with name and format.
-    pub fn bytes_with(
-        data: impl Into<Vec<u8>>,
-        name: impl Into<String>,
-        format: DocumentFormat,
-    ) -> Self {
-        Self::Bytes {
-            data: data.into(),
-            name: name.into(),
-            format,
-        }
-    }
-
-    /// Check if this is a file input.
-    pub fn is_file(&self) -> bool {
-        matches!(self, Self::File(_))
-    }
-
-    /// Check if this is a content input.
-    pub fn is_content(&self) -> bool {
-        matches!(self, Self::Content { .. })
-    }
-
-    /// Check if this is a bytes input.
-    pub fn is_bytes(&self) -> bool {
-        matches!(self, Self::Bytes { .. })
-    }
-
-    /// Get the format if available.
-    pub fn format(&self) -> Option<DocumentFormat> {
-        match self {
-            Self::File(_) => None,
-            Self::Content { format, .. } => Some(*format),
-            Self::Bytes { format, .. } => Some(*format),
-        }
-    }
-}
-
-/// Result from a single stage execution.
-#[derive(Debug, Clone)]
-pub struct StageResult {
-    /// Whether the stage succeeded.
-    pub success: bool,
-
-    /// Duration in milliseconds.
-    pub duration_ms: u64,
-
-    /// Additional metadata.
-    pub metadata: HashMap<String, serde_json::Value>,
-}
-
-impl StageResult {
-    /// Create a successful result.
-    pub fn success(name: &str) -> Self {
-        println!("Stage '{}' completed successfully", name);
-
-        Self {
-            success: true,
-            duration_ms: 0,
-            metadata: HashMap::new(),
-        }
-    }
-
-    /// Create a failed result.
-    pub fn failure(name: &str, error: &str) -> Self {
-        println!("Stage '{}' failed: {}", name, error);
-
-        let mut metadata = HashMap::new();
-        metadata.insert(
-            "error".to_string(),
-            serde_json::Value::String(error.to_string()),
-        );
-        Self {
-            success: false,
-            duration_ms: 0,
-            metadata,
-        }
-    }
-
-    /// Set duration.
-    pub fn with_duration(mut self, ms: u64) -> Self {
-        self.duration_ms = ms;
-        self
-    }
-
-    /// Add metadata.
-    pub fn with_metadata(mut self, key: &str, value: serde_json::Value) -> Self {
-        self.metadata.insert(key.to_string(), value);
-        self
-    }
-}
-
-/// Summary cache for lazy generation.
-#[derive(Debug, Clone, Default)]
-pub struct SummaryCache {
-    /// Cached summaries: node_id -> summary.
-    summaries: HashMap<NodeId, String>,
-
-    /// Whether to persist to disk.
-    persist: bool,
-}
-
-impl SummaryCache {
-    /// Create a new cache.
-    pub fn new(persist: bool) -> Self {
-        Self {
-            summaries: HashMap::new(),
-            persist,
-        }
-    }
-
-    /// Get a cached summary.
-    pub fn get(&self, node_id: NodeId) -> Option<&str> {
-        self.summaries.get(&node_id).map(|s| s.as_str())
-    }
-
-    /// Store a summary.
-    pub fn put(&mut self, node_id: NodeId, summary: String) {
-        self.summaries.insert(node_id, summary);
-    }
-
-    /// Whether persistence is enabled.
-    pub fn should_persist(&self) -> bool {
-        self.persist
-    }
-
-    /// Get all cached summaries.
-    pub fn all(&self) -> &HashMap<NodeId, String> {
-        &self.summaries
-    }
-}
-
-/// Index context passed between stages.
-#[derive(Debug)]
-pub struct IndexContext {
-    /// Document ID.
-    pub doc_id: String,
-
-    /// Source input.
-    pub input: IndexInput,
-
-    /// Document format.
-    pub format: DocumentFormat,
-
-    /// Document name.
-    pub name: String,
-
-    /// Source file path (if from file).
-    pub source_path: Option<PathBuf>,
-
-    /// SHA-256 hash of source content for checkpoint validation.
-    pub source_hash: String,
-
-    /// Parsed raw nodes.
-    pub raw_nodes: Vec<RawNode>,
-
-    /// Built document tree.
-    pub tree: Option<DocumentTree>,
-
-    /// Index options.
-    pub options: PipelineOptions,
-
-    /// LLM client for enhancement.
- pub llm_client: Option, - - /// Summary cache for lazy generation. - pub summary_cache: SummaryCache, - - /// Pre-computed reasoning index (built by ReasoningIndexStage). - pub reasoning_index: Option, - - /// Navigation index for Agent-based retrieval (built by NavigationIndexStage). - pub navigation_index: Option, - - /// Key concepts extracted from the document (built by ConceptExtractionStage). - pub concepts: Vec, - - /// Existing tree from previous indexing (for incremental updates). - /// When set, the enhance and reasoning stages can reuse data from unchanged nodes. - pub existing_tree: Option, - - /// Stage execution results. - pub stage_results: HashMap, - - /// Performance metrics. - pub metrics: IndexMetrics, - - /// Document description. - pub description: Option, - - /// Page count (for PDFs). - pub page_count: Option, - - /// Line count. - pub line_count: Option, -} - -impl IndexContext { - /// Create a new context from input. - pub fn new(input: IndexInput, options: PipelineOptions) -> Self { - let source_hash = Self::compute_source_hash(&input); - Self { - doc_id: uuid::Uuid::new_v4().to_string(), - input, - format: DocumentFormat::Markdown, - name: String::new(), - source_path: None, - source_hash, - raw_nodes: Vec::new(), - tree: None, - options, - llm_client: None, - summary_cache: SummaryCache::default(), - reasoning_index: None, - navigation_index: None, - concepts: Vec::new(), - existing_tree: None, - stage_results: HashMap::new(), - metrics: IndexMetrics::default(), - description: None, - page_count: None, - line_count: None, - } - } - - /// Compute SHA-256 hash of the source content. - fn compute_source_hash(input: &IndexInput) -> String { - use sha2::{Digest, Sha256}; - let hash = match input { - IndexInput::File(path) => { - // Hash the file path as proxy — actual content may not be readable yet - // (the parse stage reads it). This is sufficient for checkpoint invalidation - // since a different file path implies different content. - Sha256::digest(path.to_string_lossy().as_bytes()) - } - IndexInput::Content { content, .. } => Sha256::digest(content.as_bytes()), - IndexInput::Bytes { data, .. } => Sha256::digest(data), - }; - format!("{:x}", hash) - } - - /// Set the document ID. - pub fn with_doc_id(mut self, doc_id: impl Into) -> Self { - self.doc_id = doc_id.into(); - self - } - - /// Set the LLM client. - pub fn with_llm_client(mut self, client: LlmClient) -> Self { - self.llm_client = Some(client); - self - } - - /// Set the document format. - pub fn with_format(mut self, format: DocumentFormat) -> Self { - self.format = format; - self - } - - /// Set the document name. - pub fn with_name(mut self, name: impl Into) -> Self { - self.name = name.into(); - self - } - - /// Set the source path. - pub fn with_source_path(mut self, path: impl Into) -> Self { - self.source_path = Some(path.into()); - self - } - - /// Set the existing tree for incremental updates. - pub fn with_existing_tree(mut self, tree: DocumentTree) -> Self { - self.existing_tree = Some(tree); - self - } - - /// Initialize summary cache based on strategy. - pub fn init_summary_cache(&mut self) { - if let SummaryStrategy::Lazy { persist, .. } = self.options.summary_strategy { - self.summary_cache = SummaryCache::new(persist); - } - } - - /// Record a stage result. - pub fn record_stage(&mut self, name: &str, result: StageResult) { - self.stage_results.insert(name.to_string(), result); - } - - /// Get the tree, returning an error if not built. 
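The builder methods above were how the executor assembled an `IndexContext` before running stages. A minimal sketch — `PipelineOptions::default()` is an assumption, since this diff does not show a `Default` impl for it:

```rust,ignore
let input = IndexInput::content_with("# Doc\n\nBody", "doc.md", DocumentFormat::Markdown);

// `PipelineOptions::default()` is assumed here for illustration.
let mut ctx = IndexContext::new(input, PipelineOptions::default())
    .with_doc_id("doc-001")
    .with_name("doc.md")
    .with_format(DocumentFormat::Markdown);

// Only takes effect when the summary strategy is SummaryStrategy::Lazy.
ctx.init_summary_cache();
```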
- pub fn tree(&self) -> Result<&DocumentTree, &'static str> { - self.tree.as_ref().ok_or("Tree not built") - } - - /// Get mutable tree, returning an error if not built. - pub fn tree_mut(&mut self) -> Result<&mut DocumentTree, &'static str> { - self.tree.as_mut().ok_or("Tree not built") - } - - /// Finalize and build the result. - pub fn finalize(self) -> PipelineResult { - PipelineResult { - doc_id: self.doc_id, - name: self.name, - format: self.format, - source_path: self.source_path, - tree: self.tree, - description: self.description, - page_count: self.page_count, - line_count: self.line_count, - metrics: self.metrics, - summary_cache: self.summary_cache, - reasoning_index: self.reasoning_index, - navigation_index: self.navigation_index, - concepts: self.concepts, - } - } -} - -/// Final result from the index pipeline. -#[derive(Debug)] -pub struct PipelineResult { - /// Document ID. - pub doc_id: String, - - /// Document name. - pub name: String, - - /// Document format. - pub format: DocumentFormat, - - /// Source file path. - pub source_path: Option, - - /// Built document tree. - pub tree: Option, - - /// Document description. - pub description: Option, - - /// Page count (for PDFs). - pub page_count: Option, - - /// Line count. - pub line_count: Option, - - /// Performance metrics. - pub metrics: IndexMetrics, - - /// Summary cache. - pub summary_cache: SummaryCache, - - /// Pre-computed reasoning index for retrieval acceleration. - pub reasoning_index: Option, - - /// Navigation index for Agent-based retrieval. - pub navigation_index: Option, - - /// Key concepts extracted from the document. - pub concepts: Vec, -} - -impl PipelineResult { - /// Check if the result has a tree. - pub fn has_tree(&self) -> bool { - self.tree.is_some() - } - - /// Get the tree. - pub fn tree(&self) -> Option<&DocumentTree> { - self.tree.as_ref() - } - - /// Get total indexing time in milliseconds. - pub fn total_time_ms(&self) -> u64 { - self.metrics.parse_time_ms - + self.metrics.build_time_ms - + self.metrics.validate_time_ms - + self.metrics.split_time_ms - + self.metrics.enhance_time_ms - + self.metrics.enrich_time_ms - + self.metrics.reasoning_index_time_ms - + self.metrics.navigation_index_time_ms - + self.metrics.optimize_time_ms - } -} diff --git a/vectorless-core/vectorless/src/index/pipeline/executor.rs b/vectorless-core/vectorless/src/index/pipeline/executor.rs deleted file mode 100644 index 4421eeca..00000000 --- a/vectorless-core/vectorless/src/index/pipeline/executor.rs +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Pipeline executor for running index stages. -//! -//! The executor uses [`PipelineOrchestrator`] internally for flexible -//! stage management with priority-based ordering and dependency resolution. - -use tracing::info; - -use crate::error::Result; -use crate::llm::LlmClient; - -use super::super::PipelineOptions; -use super::super::stages::{ - BuildStage, ConceptExtractionStage, EnhanceStage, EnrichStage, IndexStage, - NavigationIndexStage, OptimizeStage, ParseStage, ReasoningIndexStage, SplitStage, - ValidateStage, VerifyStage, -}; -use super::context::{IndexInput, PipelineResult}; -use super::orchestrator::PipelineOrchestrator; - -/// Pipeline executor for document indexing. -/// -/// Uses [`PipelineOrchestrator`] internally for stage management. -/// Supports both preset configurations and custom stage pipelines. 
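Downstream code typically reads the `PipelineResult` fields shown above once the executor finishes. A hedged consumption sketch — `executor`, `input`, and `options` are assumed to be set up as in the `PipelineExecutor` docs below:

```rust,ignore
let result = executor.execute(input, options).await?;
if result.has_tree() {
    println!(
        "indexed '{}': {} concepts, {}ms total",
        result.name,
        result.concepts.len(),
        // Sum of the per-stage timings recorded in IndexMetrics.
        result.total_time_ms()
    );
}
```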
-/// -/// # Example -/// -/// ```rust,ignore -/// // Default pipeline -/// let executor = PipelineExecutor::new(); -/// let result = executor.execute(input, options).await?; -/// -/// // With LLM enhancement -/// let executor = PipelineExecutor::with_llm(client); -/// -/// // Custom pipeline using orchestrator -/// let orchestrator = PipelineOrchestrator::new() -/// .stage(ParseStage::new()) -/// .stage_with_priority(MyCustomStage::new(), 50) -/// .stage(BuildStage::new()); -/// let executor = PipelineExecutor::from_orchestrator(orchestrator); -/// ``` -pub struct PipelineExecutor { - orchestrator: PipelineOrchestrator, -} - -impl PipelineExecutor { - /// Create a new pipeline executor with default stages. - /// - /// Default stages (in order): - /// 1. `parse` - Parse document into raw nodes - /// 2. `build` - Build tree structure - /// 3. `validate` - Verify tree integrity (optional) - /// 4. `split` - Split oversized leaf nodes (optional) - /// 5. `enrich` - Add metadata and cross-references - /// 6. `reasoning_index` - Build pre-computed reasoning index - /// 7. `concept_extraction` - Extract key concepts (optional) - /// 8. `navigation_index` - Build Agent navigation index - /// 9. `verify` - Validate ingest output reliability - /// 10. `optimize` - Optimize tree structure - pub fn new() -> Self { - let orchestrator = PipelineOrchestrator::new() - .stage_with_priority(ParseStage::new(), 10) - .stage_with_priority(BuildStage::new(), 20) - .stage_with_priority(ValidateStage::new(), 22) - .stage_with_priority(SplitStage::new(), 25) - .stage_with_priority(EnrichStage::new(), 40) - .stage_with_priority(ReasoningIndexStage::new(), 45) - .stage_with_priority(ConceptExtractionStage::new(), 47) - .stage_with_priority(NavigationIndexStage::new(), 50) - .stage_with_priority(VerifyStage, 55) - .stage_with_priority(OptimizeStage::new(), 60); - - Self { orchestrator } - } - - /// Create a pipeline with LLM enhancement. - /// - /// Stages (in order): - /// 1. `parse` - Parse document - /// 2. `build` - Build tree - /// 3. `validate` - Verify tree integrity (optional) - /// 4. `split` - Split oversized leaf nodes (optional) - /// 5. `enhance` - LLM-based enhancement (summaries) - /// 6. `enrich` - Add metadata - /// 7. `reasoning_index` - Build pre-computed reasoning index - /// 8. `concept_extraction` - Extract key concepts via LLM (optional) - /// 9. `navigation_index` - Build Agent navigation index - /// 10. `verify` - Validate ingest output reliability - /// 11. `optimize` - Optimize tree - pub fn with_llm(client: LlmClient) -> Self { - tracing::info!( - "PipelineExecutor::with_llm — cloning client to ParseStage + EnhanceStage + context" - ); - let orchestrator = PipelineOrchestrator::new() - .with_llm_client(client.clone()) - .stage_with_priority(ParseStage::with_llm_client(client.clone()), 10) - .stage_with_priority(BuildStage::new(), 20) - .stage_with_priority(ValidateStage::new(), 22) - .stage_with_priority(SplitStage::new(), 25) - .stage_with_priority(EnhanceStage::with_llm_client(client.clone()), 30) - .stage_with_priority(EnrichStage::new(), 40) - .stage_with_priority(ReasoningIndexStage::new(), 45) - .stage_with_priority(ConceptExtractionStage::with_llm_client(client), 47) - .stage_with_priority(NavigationIndexStage::new(), 50) - .stage_with_priority(VerifyStage, 55) - .stage_with_priority(OptimizeStage::new(), 60); - - Self { orchestrator } - } - - /// Create from a custom orchestrator. - /// - /// Use this for full control over stage ordering and dependencies. 
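The default priorities above (parse 10 through optimize 60) deliberately leave gaps so custom stages can slot between built-ins. A sketch, where `MyAuditStage` and `MyReportStage` are hypothetical `IndexStage` implementations used only for illustration:

```rust,ignore
let executor = PipelineExecutor::new()
    // Priority 23 lands after validate (22) and before split (25).
    .add_stage_with_priority(MyAuditStage::new(), 23)
    // Explicit dependency: always runs after `enrich`, regardless of priority.
    .add_stage_with_deps(MyReportStage::new(), 58, &["enrich"]);

assert_eq!(executor.stage_count(), 12); // 10 default stages + 2 custom
```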
- /// - /// # Example - /// - /// ```rust,ignore - /// let orchestrator = PipelineOrchestrator::new() - /// .stage_with_priority(ParseStage::new(), 10) - /// .stage_with_priority(MyAnalysisStage::new(), 25) - /// .stage_with_priority(BuildStage::new(), 20) - /// .stage_with_deps(MyValidationStage::new(), 50, &["build"]); - /// - /// let executor = PipelineExecutor::from_orchestrator(orchestrator); - /// ``` - pub fn from_orchestrator(orchestrator: PipelineOrchestrator) -> Self { - Self { orchestrator } - } - - /// Add a stage with default priority. - /// - /// The stage will be added after existing stages with the same priority. - pub fn add_stage(mut self, stage: impl IndexStage + 'static) -> Self { - self.orchestrator = self.orchestrator.stage(stage); - self - } - - /// Add a stage with custom priority. - /// - /// Lower priority = earlier execution. - pub fn add_stage_with_priority( - mut self, - stage: impl IndexStage + 'static, - priority: i32, - ) -> Self { - self.orchestrator = self.orchestrator.stage_with_priority(stage, priority); - self - } - - /// Add a stage with priority and dependencies. - /// - /// The stage will run after all specified dependencies. - pub fn add_stage_with_deps( - mut self, - stage: impl IndexStage + 'static, - priority: i32, - depends_on: &[&str], - ) -> Self { - self.orchestrator = self - .orchestrator - .stage_with_deps(stage, priority, depends_on); - self - } - - /// Get the list of stage names in execution order. - pub fn stage_names(&self) -> Result> { - self.orchestrator.stage_names() - } - - /// Get the number of stages. - pub fn stage_count(&self) -> usize { - self.orchestrator.stage_count() - } - - /// Execute the pipeline. - /// - /// Stages are executed in dependency-resolved order. - pub async fn execute( - &mut self, - input: IndexInput, - options: PipelineOptions, - ) -> Result { - info!( - "Starting index pipeline with {} stages", - self.orchestrator.stage_count() - ); - self.orchestrator.execute(input, options).await - } -} - -impl Default for PipelineExecutor { - fn default() -> Self { - Self::new() - } -} diff --git a/vectorless-core/vectorless/src/index/pipeline/metrics.rs b/vectorless-core/vectorless/src/index/pipeline/metrics.rs deleted file mode 100644 index f25fe29f..00000000 --- a/vectorless-core/vectorless/src/index/pipeline/metrics.rs +++ /dev/null @@ -1,6 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Re-export IndexMetrics from the metrics module. - -pub use crate::metrics::IndexMetrics; diff --git a/vectorless-core/vectorless/src/index/pipeline/mod.rs b/vectorless-core/vectorless/src/index/pipeline/mod.rs deleted file mode 100644 index e6e3752d..00000000 --- a/vectorless-core/vectorless/src/index/pipeline/mod.rs +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Pipeline execution module. -//! -//! This module provides the core pipeline infrastructure: -//! - [`IndexContext`] - Context passed between stages -//! - [`PipelineExecutor`] - Executes the indexing pipeline -//! - [`PipelineOrchestrator`] - Flexible stage orchestration with dependencies -//! - [`IndexMetrics`] - Performance metrics collection -//! - [`FailurePolicy`] - Configurable failure handling for stages -//! 
- [`StageRetryConfig`] - Retry configuration for stages - -mod checkpoint; -mod context; -mod executor; -mod metrics; -mod orchestrator; -mod policy; - -pub use context::{IndexContext, IndexInput, PipelineResult, StageResult}; -pub use executor::PipelineExecutor; -pub use metrics::IndexMetrics; -pub use policy::{FailurePolicy, StageRetryConfig}; diff --git a/vectorless-core/vectorless/src/index/pipeline/orchestrator.rs b/vectorless-core/vectorless/src/index/pipeline/orchestrator.rs deleted file mode 100644 index 10f1f3ad..00000000 --- a/vectorless-core/vectorless/src/index/pipeline/orchestrator.rs +++ /dev/null @@ -1,1028 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Pipeline orchestrator for managing and executing index stages. -//! -//! The orchestrator provides: -//! - Stage registration with priority -//! - Dependency-based ordering via topological sort -//! - Failure policies (Fail, Skip, Retry) -//! - Execution groups for parallel execution -//! -//! # Example -//! -//! ```rust,ignore -//! use vectorless::index::pipeline::PipelineOrchestrator; -//! use vectorless::index::stages::{ParseStage, BuildStage}; -//! -//! let orchestrator = PipelineOrchestrator::new() -//! .stage(ParseStage::new()) -//! .stage(BuildStage::new()) -//! .stage(MyCustomStage::new()); -//! -//! let result = orchestrator.execute(input, options).await?; -//! ``` - -use std::collections::HashMap; -use std::time::Instant; -use tracing::{debug, error, info, warn}; - -use crate::error::Result; - -use super::super::PipelineOptions; -use super::super::stages::IndexStage; -use super::checkpoint::{CheckpointContextData, CheckpointManager, PipelineCheckpoint}; -use super::context::{IndexContext, IndexInput, PipelineResult, StageResult}; -use super::policy::FailurePolicy; - -/// Stage entry with metadata for orchestration. -struct StageEntry { - /// The stage implementation. - stage: Box, - /// Priority (lower = earlier execution). - priority: i32, - /// Names of stages this depends on. - depends_on: Vec, -} - -impl std::fmt::Debug for StageEntry { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("StageEntry") - .field("name", &self.stage.name()) - .field("priority", &self.priority) - .field("depends_on", &self.depends_on) - .finish() - } -} - -/// Group of stages at the same dependency level (can run in parallel). -#[derive(Debug, Clone)] -pub struct ExecutionGroup { - /// Indices of stages in this group. - pub stage_indices: Vec, - /// Whether this group has multiple stages (parallelizable). - pub parallel: bool, -} - -/// Pipeline orchestrator for stage management and execution. -/// -/// Provides flexible stage registration with: -/// - Priority-based ordering -/// - Dependency resolution -/// - Failure policies (Fail, Skip, Retry) -/// - Execution groups for parallel execution -/// -/// # Stage Ordering -/// -/// Stages are ordered by: -/// 1. Dependencies (must run after dependencies) -/// 2. Priority (lower = earlier) -/// 3. Registration order (tie-breaker) -/// -/// # Example -/// -/// ```rust,ignore -/// // Default pipeline -/// let orchestrator = PipelineOrchestrator::default(); -/// -/// // Custom pipeline -/// let orchestrator = PipelineOrchestrator::new() -/// .stage(ParseStage::new()) -/// .stage_with_priority(MyAnalysisStage::new(), 50) // Run after build (priority 20) -/// .stage_with_priority(BuildStage::new(), 20); -/// ``` -pub struct PipelineOrchestrator { - /// Registered stages with metadata. 
- stages: Vec, - /// Shared LLM client injected into pipeline context. - llm_client: Option, -} - -impl Default for PipelineOrchestrator { - fn default() -> Self { - Self::new() - } -} - -impl PipelineOrchestrator { - /// Create a new empty orchestrator. - pub fn new() -> Self { - Self { - stages: Vec::new(), - llm_client: None, - } - } - - /// Set the shared LLM client (injected into pipeline context). - pub fn with_llm_client(mut self, client: crate::llm::LlmClient) -> Self { - self.llm_client = Some(client); - self - } - - /// Add a stage with default priority (100). - /// - /// Dependencies are automatically read from the stage's `depends_on()` method. - pub fn stage(mut self, stage: S) -> Self - where - S: IndexStage + 'static, - { - let deps = stage.depends_on(); - self.stages.push(StageEntry { - stage: Box::new(stage), - priority: 100, - depends_on: deps.into_iter().map(|s| s.to_string()).collect(), - }); - self - } - - /// Add a stage with custom priority. - /// - /// Dependencies are automatically read from the stage's `depends_on()` method. - /// Lower priority = earlier execution. - /// Default priority is 100. - pub fn stage_with_priority(mut self, stage: S, priority: i32) -> Self - where - S: IndexStage + 'static, - { - let deps = stage.depends_on(); - self.stages.push(StageEntry { - stage: Box::new(stage), - priority, - depends_on: deps.into_iter().map(|s| s.to_string()).collect(), - }); - self - } - - /// Add a stage with priority and explicit dependencies. - /// - /// Merges trait-level dependencies with explicitly provided ones. - /// The stage will run after all specified dependencies. - pub fn stage_with_deps( - mut self, - stage: S, - priority: i32, - explicit_depends_on: &[&str], - ) -> Self - where - S: IndexStage + 'static, - { - let trait_deps = stage.depends_on(); - let mut all_deps: Vec = trait_deps.into_iter().map(|s| s.to_string()).collect(); - - // Add explicit deps that aren't already included - for dep in explicit_depends_on { - if !all_deps.iter().any(|d| d == dep) { - all_deps.push(dep.to_string()); - } - } - - self.stages.push(StageEntry { - stage: Box::new(stage), - priority, - depends_on: all_deps, - }); - self - } - - /// Remove all stages with the given name. - pub fn remove_stage(mut self, name: &str) -> Self { - self.stages.retain(|entry| entry.stage.name() != name); - self - } - - /// Check if a stage with the given name exists. - pub fn has_stage(&self, name: &str) -> bool { - self.stages.iter().any(|entry| entry.stage.name() == name) - } - - /// Get the number of registered stages. - pub fn stage_count(&self) -> usize { - self.stages.len() - } - - /// Resolve dependencies and return stage indices in execution order. 
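The resolver below orders stages by dependencies first, then priority, then registration order. A sketch of those tie-breaking rules, mirroring the unit tests at the end of this file; `StageA`/`StageB`/`StageC` are hypothetical `IndexStage` impls whose `name()` returns `"a"`, `"b"`, `"c"`:

```rust,ignore
let orchestrator = PipelineOrchestrator::new()
    .stage_with_priority(StageA::new(), 10)     // "a": no deps
    .stage_with_deps(StageB::new(), 5, &["a"])  // "b": lower priority, but depends on "a"
    .stage_with_priority(StageC::new(), 1);     // "c": no deps, lowest priority

// Dependency beats priority: "b" must follow "a" even though 5 < 10,
// while "c" runs first because it is unblocked and has the lowest priority.
assert_eq!(orchestrator.stage_names().unwrap(), vec!["c", "a", "b"]);
```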
- /// - /// # Errors - /// - /// Returns an error if: - /// - A dependency refers to a non-existent stage - /// - There's a circular dependency - fn resolve_order(&self) -> Result> { - // Build name -> index map - let name_to_idx: HashMap<&str, usize> = self - .stages - .iter() - .enumerate() - .map(|(i, entry)| (entry.stage.name(), i)) - .collect(); - - // Validate dependencies - for entry in &self.stages { - for dep in &entry.depends_on { - if !name_to_idx.contains_key(dep.as_str()) { - return Err(crate::error::Error::Config(format!( - "Stage '{}' depends on non-existent stage '{}'", - entry.stage.name(), - dep - ))); - } - } - } - - // Topological sort with priority consideration (Kahn's algorithm) - let n = self.stages.len(); - let mut in_degree: Vec = vec![0; n]; - let mut adjacency: HashMap> = HashMap::new(); - - for (i, entry) in self.stages.iter().enumerate() { - for dep in &entry.depends_on { - if let Some(&dep_idx) = name_to_idx.get(dep.as_str()) { - adjacency.entry(dep_idx).or_default().push(i); - in_degree[i] += 1; - } - } - } - - // Collect stages with no dependencies, sorted by priority - let mut ready: Vec = (0..n).filter(|&i| in_degree[i] == 0).collect(); - ready.sort_by_key(|&i| (self.stages[i].priority, i)); - - let mut result: Vec = Vec::new(); - - while let Some(idx) = ready.first().cloned() { - ready.remove(0); - result.push(idx); - - if let Some(neighbors) = adjacency.get(&idx) { - for &neighbor in neighbors { - in_degree[neighbor] -= 1; - if in_degree[neighbor] == 0 { - // Insert in priority order - let entry = &self.stages[neighbor]; - let pos = ready - .binary_search_by_key(&(entry.priority, neighbor), |&i| { - (self.stages[i].priority, i) - }) - .unwrap_or_else(|e| e); - ready.insert(pos, neighbor); - } - } - } - } - - // Check for cycles - if result.len() != n { - let remaining: Vec<&str> = (0..n) - .filter(|i| !result.contains(i)) - .map(|i| self.stages[i].stage.name()) - .collect(); - return Err(crate::error::Error::Config(format!( - "Circular dependency detected involving stages: {:?}", - remaining - ))); - } - - Ok(result) - } - - /// Compute execution groups from resolved order. - /// - /// Stages with the same "level" in the dependency graph and no - /// inter-dependencies can run in parallel. - fn compute_execution_groups(&self, order: &[usize]) -> Vec { - if order.is_empty() { - return Vec::new(); - } - - // Build name -> index map - let name_to_idx: HashMap<&str, usize> = self - .stages - .iter() - .enumerate() - .map(|(i, entry)| (entry.stage.name(), i)) - .collect(); - - // Calculate level for each stage based on dependencies - let mut levels: HashMap = HashMap::new(); - - for &idx in order { - let entry = &self.stages[idx]; - let level = if entry.depends_on.is_empty() { - 0 - } else { - entry - .depends_on - .iter() - .filter_map(|dep| { - name_to_idx - .get(dep.as_str()) - .and_then(|&dep_idx| levels.get(&dep_idx)) - }) - .max() - .map(|&l| l + 1) - .unwrap_or(0) - }; - levels.insert(idx, level); - } - - // Group stages by level - let mut level_groups: HashMap> = HashMap::new(); - for &idx in order { - let level = levels[&idx]; - level_groups.entry(level).or_default().push(idx); - } - - // Convert to execution groups - let max_level = *levels.values().max().unwrap_or(&0); - (0..=max_level) - .filter_map(|level| { - level_groups.get(&level).map(|indices| ExecutionGroup { - stage_indices: indices.clone(), - parallel: indices.len() > 1, - }) - }) - .collect() - } - - /// Execute a stage with its failure policy applied. 
- async fn execute_stage_with_policy( - stage: &mut Box, - ctx: &mut IndexContext, - ) -> Result { - let policy = stage.failure_policy(); - let stage_name = stage.name().to_string(); - - match policy { - FailurePolicy::Fail => { - // Direct execution, errors propagate - stage.execute(ctx).await - } - - FailurePolicy::Skip => { - // Try once, skip on failure - match stage.execute(ctx).await { - Ok(result) => Ok(result), - Err(e) => { - warn!("Stage {} failed, skipping: {}", stage_name, e); - Ok(StageResult::failure(&stage_name, &e.to_string())) - } - } - } - - FailurePolicy::Retry(config) => { - let mut attempts = 0; - loop { - attempts += 1; - match stage.execute(ctx).await { - Ok(result) => { - if attempts > 1 { - info!("Stage {} succeeded on attempt {}", stage_name, attempts); - } - return Ok(result); - } - Err(e) => { - if attempts >= config.max_attempts { - warn!( - "Stage {} failed after {} attempts: {}", - stage_name, attempts, e - ); - return Err(e); - } - let delay = config.delay_for_attempt(attempts - 1); - warn!( - "Stage {} failed on attempt {}, retrying in {:?}: {}", - stage_name, attempts, delay, e - ); - tokio::time::sleep(delay).await; - } - } - } - } - } - } - - /// Handle the result of a stage execution (shared between sequential and parallel paths). - fn handle_stage_result( - result: Result, - stage_name: &str, - policy: &FailurePolicy, - ctx: &mut IndexContext, - ) -> Result<()> { - match result { - Ok(result) => { - ctx.stage_results.insert(stage_name.to_string(), result); - Ok(()) - } - Err(e) => { - if policy.allows_continuation() { - warn!( - "Stage {} failed but policy allows continuation: {}", - stage_name, e - ); - ctx.stage_results.insert( - stage_name.to_string(), - StageResult::failure(stage_name, &e.to_string()), - ); - Ok(()) - } else { - error!("Stage {} failed, stopping pipeline: {}", stage_name, e); - Err(e) - } - } - } - } - - /// Execute the pipeline. - /// - /// Stages are executed in dependency-resolved order. - /// Failure policies are applied per-stage. - pub async fn execute( - &mut self, - input: IndexInput, - options: PipelineOptions, - ) -> Result { - let total_start = Instant::now(); - info!( - "Starting orchestrated pipeline with {} stages", - self.stages.len() - ); - - // Resolve execution order - let order = self.resolve_order()?; - let stage_names: Vec<&str> = order.iter().map(|&i| self.stages[i].stage.name()).collect(); - info!("[pipeline] Execution order: {:?}", stage_names); - - // Compute execution groups for potential parallelization - let groups = self.compute_execution_groups(&order); - let parallel_count = groups.iter().filter(|g| g.parallel).count(); - if parallel_count > 0 { - info!( - "[pipeline] {} execution groups ({} parallelizable)", - groups.len(), - parallel_count - ); - } else { - debug!( - "[pipeline] {} execution groups (all sequential)", - groups.len() - ); - } - - // Create context - let mut opts = options; - let existing_tree = opts.existing_tree.take(); - let mut ctx = IndexContext::new(input, opts); - // Inject shared LLM client into context for stages that need it (e.g. 
ReasoningIndexStage) - if let Some(client) = self.llm_client.take() { - ctx = ctx.with_llm_client(client); - } - if let Some(tree) = existing_tree { - ctx = ctx.with_existing_tree(tree); - } - - // Try to resume from checkpoint - if let Some(ref checkpoint_dir) = ctx.options.checkpoint_dir { - let manager = CheckpointManager::new(checkpoint_dir); - if let Some(checkpoint) = manager.load(&ctx.doc_id) { - if CheckpointManager::is_valid_for_resume( - &checkpoint, - &ctx.source_hash, - ctx.options.processing_version, - &ctx.options.logic_fingerprint().to_string(), - ) { - info!( - "Resuming from checkpoint: {} stages already completed", - checkpoint.completed_stages.len() - ); - // Restore context data from checkpoint - ctx.raw_nodes = checkpoint.context_data.raw_nodes; - if let Some(tree) = checkpoint.context_data.tree { - ctx.tree = Some(tree); - } - ctx.metrics = checkpoint.context_data.metrics; - ctx.page_count = checkpoint.context_data.page_count; - ctx.line_count = checkpoint.context_data.line_count; - ctx.description = checkpoint.context_data.description; - // Mark completed stages as done - for stage_name in &checkpoint.completed_stages { - ctx.stage_results - .insert(stage_name.clone(), StageResult::success(stage_name)); - } - } else { - info!("Checkpoint exists but invalid, starting fresh"); - } - } - } - - // Execute each group - for (group_idx, group) in groups.iter().enumerate() { - if group.parallel { - let names: Vec<&str> = group - .stage_indices - .iter() - .map(|&i| self.stages[i].stage.name()) - .collect(); - info!("[pipeline] Parallel group {}: {:?}", group_idx, names); - } - - if group.parallel && !group.stage_indices.is_empty() { - // Check if all stages in this group are already completed (from checkpoint) - let all_completed = group.stage_indices.iter().all(|&idx| { - let name = self.stages[idx].stage.name(); - ctx.stage_results.contains_key(name) - }); - if all_completed { - let names: Vec<&str> = group - .stage_indices - .iter() - .map(|&i| self.stages[i].stage.name()) - .collect(); - info!("[pipeline] Skipping completed parallel group: {:?}", names); - continue; - } - - // === N-stage parallel execution === - // - // At most one stage may write_tree — it gets the main ctx. - // All other stages get cloned contexts with tree snapshots. - // All stages run concurrently via futures::future::join_all. - // After all complete, outputs are merged back by AccessPattern. - - // Identify the tree writer (if any) - let tree_writer_idx: Option = group - .stage_indices - .iter() - .find(|&&idx| self.stages[idx].stage.access_pattern().writes_tree) - .copied(); - - // For each stage, prepare (stage, context) pair. - // Swap out stages from self.stages to get owned Box. 
- let mut entries: Vec = Vec::with_capacity(group.stage_indices.len()); - - for &idx in &group.stage_indices { - let stage = std::mem::replace(&mut self.stages[idx].stage, Box::new(NopStage)); - let name = stage.name().to_string(); - let policy = stage.failure_policy(); - let access = stage.access_pattern(); - - let stage_ctx = if Some(idx) == tree_writer_idx { - // Tree writer gets a placeholder; we'll use &mut ctx directly - None - } else { - // Reader gets a cloned context - let mut clone = - IndexContext::new(IndexInput::content(""), ctx.options.clone()); - clone.tree = ctx.tree.clone(); - clone.existing_tree = ctx.existing_tree.clone(); - clone.doc_id = ctx.doc_id.clone(); - clone.name = ctx.name.clone(); - clone.format = ctx.format; - clone.source_path = ctx.source_path.clone(); - if let Some(ref llm) = ctx.llm_client { - clone.llm_client = Some(llm.clone()); - } - Some(clone) - }; - - entries.push(ParallelEntry { - idx, - stage, - ctx: stage_ctx, - name, - policy, - access, - }); - } - - let parallel_names: Vec<&str> = entries.iter().map(|e| e.name.as_str()).collect(); - info!("[pipeline] Executing in parallel: {:?}", parallel_names); - - // Split into writer and readers - let mut writer_entry: Option = None; - let mut reader_entries: Vec = Vec::new(); - for entry in entries { - if entry.ctx.is_none() { - writer_entry = Some(entry); - } else { - reader_entries.push(entry); - } - } - - // Execute writer on main ctx concurrently with readers. - // Move each reader's stage+ctx into an owned async block. - // All futures are !Send (Box), but join_all - // works fine on the same thread. - - let reader_futs: Vec< - std::pin::Pin< - Box< - dyn std::future::Future< - Output = ( - ParallelEntry, - std::result::Result, - ), - > + Send, - >, - >, - > = reader_entries - .into_iter() - .map(|mut entry| { - Box::pin(async move { - let res = Self::execute_stage_with_policy( - &mut entry.stage, - entry.ctx.as_mut().unwrap(), - ) - .await; - (entry, res) - }) - as std::pin::Pin + Send>> - }) - .collect(); - - // If there's a tree writer, run it concurrently with readers. - // If no tree writer (all readers), just run readers. - if let Some(mut we) = writer_entry { - // Run writer + readers concurrently. - // The writer borrows &mut ctx; readers use their own cloned ctxs. 
- let (writer_res, completed_readers) = tokio::join!( - Self::execute_stage_with_policy(&mut we.stage, &mut ctx), - futures::future::join_all(reader_futs), - ); - - // Put writer stage back and handle result - self.stages[we.idx].stage = we.stage; - Self::handle_stage_result(writer_res, &we.name, &we.policy, &mut ctx)?; - - // Process reader results - for (re, reader_res) in completed_readers { - Self::merge_reader_outputs(&mut ctx, &re); - self.stages[re.idx].stage = re.stage; - Self::handle_stage_result(reader_res, &re.name, &re.policy, &mut ctx)?; - } - } else { - // All readers, no writer - let completed_readers = futures::future::join_all(reader_futs).await; - for (re, reader_res) in completed_readers { - Self::merge_reader_outputs(&mut ctx, &re); - self.stages[re.idx].stage = re.stage; - Self::handle_stage_result(reader_res, &re.name, &re.policy, &mut ctx)?; - } - } - } else { - // === Sequential execution (single stage or non-parallel group) === - for &idx in &group.stage_indices { - let entry = &mut self.stages[idx]; - let stage_name = entry.stage.name().to_string(); - - // Skip stages already completed (from checkpoint resume) - if ctx.stage_results.contains_key(&stage_name) { - info!("Skipping already completed stage: {}", stage_name); - continue; - } - - let policy = entry.stage.failure_policy(); - - info!( - "Executing stage: {} (priority {})", - stage_name, entry.priority - ); - - match Self::execute_stage_with_policy(&mut entry.stage, &mut ctx).await { - Ok(result) => { - ctx.stage_results.insert(stage_name.clone(), result); - } - Err(e) => { - if policy.allows_continuation() { - warn!( - "Stage {} failed but policy allows continuation: {}", - stage_name, e - ); - ctx.stage_results.insert( - stage_name.clone(), - StageResult::failure(&stage_name, &e.to_string()), - ); - } else { - error!("Stage {} failed, stopping pipeline: {}", stage_name, e); - // Save checkpoint before returning error - Self::save_checkpoint(&ctx); - return Err(e); - } - } - } - } - } - - // Save checkpoint after each group completes - Self::save_checkpoint(&ctx); - } - - let total_duration = total_start.elapsed().as_millis() as u64; - info!( - "[pipeline] Complete: {} stages in {}ms for '{}'", - ctx.stage_results.len(), - total_duration, - ctx.name, - ); - - // Clear checkpoint on successful completion - if let Some(ref checkpoint_dir) = ctx.options.checkpoint_dir { - let manager = CheckpointManager::new(checkpoint_dir); - if let Err(e) = manager.clear(&ctx.doc_id) { - warn!("Failed to clear checkpoint for {}: {}", ctx.doc_id, e); - } - } - - // Finalize result - Ok(ctx.finalize()) - } - - /// Merge a reader stage's outputs back into the main context. - /// - /// Reads the reader's AccessPattern to know which fields to copy, - /// and merges additive metrics (LLM calls, tokens, etc.). 
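A reader stage opts into the merge step below by declaring what it writes. A sketch of the declaration inside an `IndexStage` impl — this mirrors how `ConceptExtractionStage` later in this diff declares its pattern, with the written field swapped for illustration:

```rust,ignore
// Declaring reads/writes lets the orchestrator run this stage as a
// parallel "reader" on a cloned context, then copy only the declared
// outputs back into the main context.
fn access_pattern(&self) -> AccessPattern {
    AccessPattern {
        reads_tree: true,
        writes_navigation_index: true,
        ..AccessPattern::default()
    }
}
```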
- fn merge_reader_outputs(ctx: &mut IndexContext, reader: &ParallelEntry) { - if reader.access.writes_reasoning_index { - if let Some(ref rctx) = reader.ctx { - ctx.reasoning_index = rctx.reasoning_index.clone(); - } - } - if reader.access.writes_navigation_index { - if let Some(ref rctx) = reader.ctx { - ctx.navigation_index = rctx.navigation_index.clone(); - } - } - if reader.access.writes_description { - if let Some(ref rctx) = reader.ctx { - ctx.description = rctx.description.clone(); - } - } - // Merge additive metrics from reader - if let Some(ref rctx) = reader.ctx { - ctx.metrics.llm_calls += rctx.metrics.llm_calls; - ctx.metrics.summaries_generated += rctx.metrics.summaries_generated; - ctx.metrics.total_tokens_generated += rctx.metrics.total_tokens_generated; - ctx.metrics.nodes_processed += rctx.metrics.nodes_processed; - ctx.metrics.nodes_merged += rctx.metrics.nodes_merged; - ctx.metrics.nodes_skipped += rctx.metrics.nodes_skipped; - if rctx.metrics.reasoning_index_time_ms > 0 { - ctx.metrics.record_reasoning_index( - rctx.metrics.reasoning_index_time_ms, - rctx.metrics.topics_indexed, - rctx.metrics.keywords_indexed, - ); - } - if rctx.metrics.optimize_time_ms > 0 { - ctx.metrics.record_optimize(rctx.metrics.optimize_time_ms); - } - if rctx.metrics.navigation_index_time_ms > 0 { - ctx.metrics.record_navigation_index( - rctx.metrics.navigation_index_time_ms, - rctx.metrics.nav_entries_indexed, - rctx.metrics.child_routes_indexed, - ); - } - if rctx.metrics.enhance_time_ms > 0 { - ctx.metrics.record_enhance(rctx.metrics.enhance_time_ms); - } - if rctx.metrics.enrich_time_ms > 0 { - ctx.metrics.record_enrich(rctx.metrics.enrich_time_ms); - } - } - } - - /// Save a checkpoint of the current pipeline state. - fn save_checkpoint(ctx: &IndexContext) { - let checkpoint_dir = match ctx.options.checkpoint_dir { - Some(ref dir) => dir.clone(), - None => return, - }; - - let completed_stages: Vec = ctx.stage_results.keys().cloned().collect(); - let checkpoint = PipelineCheckpoint { - doc_id: ctx.doc_id.clone(), - source_hash: ctx.source_hash.clone(), - processing_version: ctx.options.processing_version, - config_fingerprint: ctx.options.logic_fingerprint().to_string(), - completed_stages, - context_data: CheckpointContextData { - raw_nodes: ctx.raw_nodes.clone(), - tree: ctx.tree.clone(), - metrics: ctx.metrics.clone(), - page_count: ctx.page_count, - line_count: ctx.line_count, - description: ctx.description.clone(), - }, - timestamp: chrono::Utc::now(), - }; - - let manager = CheckpointManager::new(checkpoint_dir); - if let Err(e) = manager.save(&ctx.doc_id, &checkpoint) { - warn!("Failed to save checkpoint for {}: {}", ctx.doc_id, e); - } - } - - /// Get list of stage names in execution order. - pub fn stage_names(&self) -> Result> { - let order = self.resolve_order()?; - Ok(order.iter().map(|&i| self.stages[i].stage.name()).collect()) - } - - /// Get execution groups for the current pipeline. - /// - /// This is useful for visualizing parallelization opportunities. - pub fn get_execution_groups(&self) -> Result> { - let order = self.resolve_order()?; - Ok(self.compute_execution_groups(&order)) - } -} - -/// Placeholder stage used during parallel execution when the real stage -/// is temporarily swapped out via `std::mem::replace`. 
-struct NopStage; - -#[async_trait::async_trait] -impl IndexStage for NopStage { - fn name(&self) -> &'static str { - "_nop" - } - - async fn execute(&mut self, _ctx: &mut IndexContext) -> Result { - Ok(StageResult::success("_nop")) - } -} - -/// Owned entry for parallel stage execution. -/// -/// Each stage in a parallel group is swapped out from the orchestrator's -/// stages vec into this struct, along with its own cloned context. -/// After execution, the stage is swapped back and outputs are merged. -struct ParallelEntry { - /// Index into orchestrator's stages vec (for swapping back). - idx: usize, - /// The owned stage implementation. - stage: Box, - /// Cloned context for reader stages; None for the tree writer - /// (which uses the main ctx directly). - ctx: Option, - /// Stage name (captured before swap). - name: String, - /// Failure policy (captured before swap). - policy: FailurePolicy, - /// Access pattern (captured before swap). - access: crate::index::stages::AccessPattern, -} - -/// Builder for creating custom stage configurations. -/// -/// This is a convenience type for configuring custom stages -/// without manually calling the orchestrator methods. -pub struct CustomStageBuilder { - name: String, - priority: i32, - depends_on: Vec, - optional: bool, -} - -impl CustomStageBuilder { - /// Create a new custom stage builder. - pub fn new(name: impl Into) -> Self { - Self { - name: name.into(), - priority: 100, - depends_on: Vec::new(), - optional: false, - } - } - - /// Set priority (lower = earlier). - pub fn priority(mut self, priority: i32) -> Self { - self.priority = priority; - self - } - - /// Add a dependency. - pub fn depends_on(mut self, stage: impl Into) -> Self { - self.depends_on.push(stage.into()); - self - } - - /// Mark as optional (failures won't stop pipeline). - pub fn optional(mut self) -> Self { - self.optional = true; - self - } - - /// Get the stage name. - pub fn name(&self) -> &str { - &self.name - } - - /// Get the priority. - pub fn get_priority(&self) -> i32 { - self.priority - } - - /// Get dependencies. - pub fn get_deps(&self) -> &[String] { - &self.depends_on - } - - /// Check if optional. 
- pub fn is_optional(&self) -> bool { - self.optional - } -} - -#[cfg(test)] -mod tests { - use super::super::context::StageResult; - use super::*; - - #[test] - fn test_orchestrator_creation() { - let orchestrator = PipelineOrchestrator::new(); - assert_eq!(orchestrator.stage_count(), 0); - } - - #[test] - fn test_add_stages() { - let orchestrator = PipelineOrchestrator::new() - .stage_with_priority(MockStage::new("a"), 10) - .stage_with_priority(MockStage::new("b"), 20) - .stage_with_priority(MockStage::new("c"), 5); - - assert_eq!(orchestrator.stage_count(), 3); - - let names = orchestrator.stage_names().unwrap(); - assert_eq!(names, vec!["c", "a", "b"]); // priority order - } - - #[test] - fn test_dependency_resolution() { - let orchestrator = PipelineOrchestrator::new() - .stage_with_priority(MockStage::new("a"), 10) - .stage_with_deps(MockStage::new("b"), 5, &["a"]) // b depends on a - .stage_with_deps(MockStage::new("c"), 1, &["b"]); // c depends on b - - let names = orchestrator.stage_names().unwrap(); - assert_eq!(names, vec!["a", "b", "c"]); - } - - #[test] - fn test_missing_dependency() { - let orchestrator = - PipelineOrchestrator::new().stage_with_deps(MockStage::new("a"), 10, &["nonexistent"]); - - let result = orchestrator.stage_names(); - assert!(result.is_err()); - } - - #[test] - fn test_remove_stage() { - let orchestrator = PipelineOrchestrator::new() - .stage(MockStage::new("a")) - .stage(MockStage::new("b")) - .remove_stage("a"); - - assert_eq!(orchestrator.stage_count(), 1); - assert!(!orchestrator.has_stage("a")); - assert!(orchestrator.has_stage("b")); - } - - #[test] - fn test_custom_stage_builder() { - let builder = CustomStageBuilder::new("my_stage") - .priority(50) - .depends_on("parse") - .optional(); - - assert_eq!(builder.name(), "my_stage"); - assert_eq!(builder.get_priority(), 50); - assert_eq!(builder.get_deps(), &["parse".to_string()]); - assert!(builder.is_optional()); - } - - /// Mock stage for testing. - struct MockStage { - name: String, - } - - impl MockStage { - fn new(name: &str) -> Self { - Self { - name: name.to_string(), - } - } - } - - #[async_trait::async_trait] - impl IndexStage for MockStage { - fn name(&self) -> &str { - &self.name - } - - async fn execute(&mut self, _ctx: &mut IndexContext) -> Result { - Ok(StageResult::success(&self.name)) - } - } -} diff --git a/vectorless-core/vectorless/src/index/pipeline/policy.rs b/vectorless-core/vectorless/src/index/pipeline/policy.rs deleted file mode 100644 index da3c5b2b..00000000 --- a/vectorless-core/vectorless/src/index/pipeline/policy.rs +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Failure policies for pipeline stages. -//! -//! This module provides configurable failure handling for index pipeline stages. -//! -//! # Policies -//! -//! - **Fail** - Stop the entire pipeline on stage failure (default for required stages) -//! - **Skip** - Skip the failed stage and continue the pipeline -//! - **Retry** - Retry the stage with exponential backoff before failing -//! -//! # Example -//! -//! ```rust,ignore -//! use vectorless::index::pipeline::{FailurePolicy, StageRetryConfig}; -//! -//! // Simple skip policy -//! let policy = FailurePolicy::skip(); -//! -//! // Retry with custom config -//! let policy = FailurePolicy::retry_with( -//! StageRetryConfig::new() -//! .with_max_attempts(3) -//! .with_initial_delay(Duration::from_millis(500)) -//! ); -//! 
``` - -use std::time::Duration; - -/// Retry configuration for stage execution. -#[derive(Debug, Clone)] -pub struct StageRetryConfig { - /// Maximum number of attempts (including initial). - pub max_attempts: usize, - /// Initial delay before first retry. - pub initial_delay: Duration, - /// Maximum delay between retries. - pub max_delay: Duration, - /// Exponential backoff multiplier. - pub multiplier: f64, -} - -impl Default for StageRetryConfig { - fn default() -> Self { - Self { - max_attempts: 3, - initial_delay: Duration::from_millis(100), - max_delay: Duration::from_secs(10), - multiplier: 2.0, - } - } -} - -impl StageRetryConfig { - /// Create a new retry config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set maximum number of attempts. - pub fn with_max_attempts(mut self, n: usize) -> Self { - self.max_attempts = n.max(1); - self - } - - /// Set initial delay before first retry. - pub fn with_initial_delay(mut self, delay: Duration) -> Self { - self.initial_delay = delay; - self - } - - /// Set maximum delay between retries. - pub fn with_max_delay(mut self, delay: Duration) -> Self { - self.max_delay = delay; - self - } - - /// Set exponential backoff multiplier. - pub fn with_multiplier(mut self, multiplier: f64) -> Self { - self.multiplier = multiplier; - self - } - - /// Calculate delay for a given attempt (0-indexed). - /// - /// Uses exponential backoff: `initial_delay * multiplier^attempt` - pub fn delay_for_attempt(&self, attempt: usize) -> Duration { - let delay_ms = - (self.initial_delay.as_millis() as f64) * self.multiplier.powi(attempt as i32); - let capped_ms = delay_ms.min(self.max_delay.as_millis() as f64); - Duration::from_millis(capped_ms as u64) - } -} - -/// Policy for handling stage failures. -#[derive(Debug, Clone)] -pub enum FailurePolicy { - /// Fail the entire pipeline on error (default for required stages). - Fail, - - /// Skip this stage on failure, continue pipeline. - /// The stage result will record the failure but execution continues. - Skip, - - /// Retry with specified configuration before failing. - /// If all retries fail, the pipeline behavior depends on `allows_continuation`. - Retry(StageRetryConfig), -} - -impl Default for FailurePolicy { - fn default() -> Self { - Self::Fail - } -} - -impl FailurePolicy { - /// Create a Fail policy. - pub fn fail() -> Self { - Self::Fail - } - - /// Create a Skip policy. - pub fn skip() -> Self { - Self::Skip - } - - /// Create a Retry policy with default configuration. - pub fn retry() -> Self { - Self::Retry(StageRetryConfig::default()) - } - - /// Create a Retry policy with custom configuration. - pub fn retry_with(config: StageRetryConfig) -> Self { - Self::Retry(config) - } - - /// Check if pipeline can continue after failure with this policy. - /// - /// - `Fail`: No, stops pipeline - /// - `Skip`: Yes, continues - /// - `Retry`: No (if all retries exhausted, it's treated as failure) - pub fn allows_continuation(&self) -> bool { - matches!(self, Self::Skip) - } - - /// Check if this policy involves retry attempts. - pub fn has_retry(&self) -> bool { - matches!(self, Self::Retry(_)) - } - - /// Get retry config if this is a Retry policy. 
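The three policies interact with pipeline continuation as documented above. A short sketch of the two non-default policies, using only constructors shown in the removed module:

```rust,ignore
use std::time::Duration;

// Skip: the failure is recorded in the stage results, but the pipeline continues.
let lenient = FailurePolicy::skip();
assert!(lenient.allows_continuation());

// Retry: up to 3 attempts total, starting with a 500ms backoff.
let persistent = FailurePolicy::retry_with(
    StageRetryConfig::new()
        .with_max_attempts(3)
        .with_initial_delay(Duration::from_millis(500)),
);
assert!(persistent.has_retry());
// Exhausted retries still fail the pipeline — only Skip continues.
assert!(!persistent.allows_continuation());
```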
- pub fn retry_config(&self) -> Option<&StageRetryConfig> { - match self { - Self::Retry(config) => Some(config), - _ => None, - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_default_retry_config() { - let config = StageRetryConfig::default(); - assert_eq!(config.max_attempts, 3); - assert_eq!(config.initial_delay, Duration::from_millis(100)); - assert_eq!(config.max_delay, Duration::from_secs(10)); - } - - #[test] - fn test_retry_config_builder() { - let config = StageRetryConfig::new() - .with_max_attempts(5) - .with_initial_delay(Duration::from_millis(200)) - .with_max_delay(Duration::from_secs(30)); - - assert_eq!(config.max_attempts, 5); - assert_eq!(config.initial_delay, Duration::from_millis(200)); - assert_eq!(config.max_delay, Duration::from_secs(30)); - } - - #[test] - fn test_delay_for_attempt() { - let config = StageRetryConfig::new() - .with_initial_delay(Duration::from_millis(100)) - .with_multiplier(2.0); - - assert_eq!(config.delay_for_attempt(0), Duration::from_millis(100)); - assert_eq!(config.delay_for_attempt(1), Duration::from_millis(200)); - assert_eq!(config.delay_for_attempt(2), Duration::from_millis(400)); - } - - #[test] - fn test_delay_respects_max() { - let config = StageRetryConfig::new() - .with_initial_delay(Duration::from_secs(1)) - .with_max_delay(Duration::from_secs(5)) - .with_multiplier(10.0); - - assert_eq!(config.delay_for_attempt(0), Duration::from_secs(1)); - assert_eq!(config.delay_for_attempt(1), Duration::from_secs(5)); // capped - assert_eq!(config.delay_for_attempt(2), Duration::from_secs(5)); // capped - } - - #[test] - fn test_failure_policy_constructors() { - assert!(matches!(FailurePolicy::fail(), FailurePolicy::Fail)); - assert!(matches!(FailurePolicy::skip(), FailurePolicy::Skip)); - assert!(matches!(FailurePolicy::retry(), FailurePolicy::Retry(_))); - } - - #[test] - fn test_allows_continuation() { - assert!(!FailurePolicy::fail().allows_continuation()); - assert!(FailurePolicy::skip().allows_continuation()); - assert!(!FailurePolicy::retry().allows_continuation()); - } -} diff --git a/vectorless-core/vectorless/src/index/stages/build.rs b/vectorless-core/vectorless/src/index/stages/build.rs deleted file mode 100644 index 02b5eda8..00000000 --- a/vectorless-core/vectorless/src/index/stages/build.rs +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Build stage - Build tree from raw nodes. - -use super::async_trait; -use std::time::Instant; -use tracing::{debug, info}; - -use crate::document::{DocumentTree, NodeId}; -use crate::error::Result; -use crate::index::parse::RawNode; -use crate::utils::estimate_tokens; - -use super::{IndexStage, StageResult}; -use crate::index::ThinningConfig; -use crate::index::pipeline::IndexContext; - -/// Build stage - constructs a tree from raw nodes. -pub struct BuildStage; - -impl BuildStage { - /// Create a new build stage. - pub fn new() -> Self { - Self - } - - /// Calculate total token counts for all nodes (recursive, includes children). 
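The back-to-front pass defined below maintains the invariant `total = own + Σ children.total`. A worked sketch of its effect — the helper is private, so calling it directly here is schematic, and `nodes` is a hypothetical `Vec<RawNode>` flattened in document order:

```rust,ignore
// idx  level  own tokens
// [0]    1        10      "Root"
// [1]    2        30      "Child A"
// [2]    2         5      "Child B"
//
// Back-to-front pass:
//   [2] total = 5
//   [1] total = 30
//   [0] total = 10 + 30 + 5 = 45   (every later node is deeper than level 1)
BuildStage::calculate_total_tokens(&mut nodes);
assert_eq!(nodes[0].total_token_count, Some(45));
```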
- fn calculate_total_tokens(nodes: &mut [RawNode]) { - if nodes.is_empty() { - return; - } - - // Process from back to front - for i in (0..nodes.len()).rev() { - let own_tokens = nodes[i] - .token_count - .unwrap_or_else(|| estimate_tokens(&nodes[i].content)); - nodes[i].token_count = Some(own_tokens); - - // Find all children (direct and indirect) - let children_tokens: usize = Self::find_all_children_indices(i, nodes) - .iter() - .map(|&child_idx| nodes[child_idx].total_token_count.unwrap_or(0)) - .sum(); - - nodes[i].total_token_count = Some(own_tokens + children_tokens); - } - } - - /// Find all children (direct and indirect) of a node. - fn find_all_children_indices(parent_idx: usize, nodes: &[RawNode]) -> Vec { - let parent_level = nodes[parent_idx].level; - let mut children = Vec::new(); - - for i in (parent_idx + 1)..nodes.len() { - if nodes[i].level <= parent_level { - break; - } - children.push(i); - } - - children - } - - /// Find direct children of a node. - fn find_direct_children_indices(parent_idx: usize, nodes: &[RawNode]) -> Vec { - let parent_level = nodes[parent_idx].level; - let target_level = parent_level + 1; - let mut children = Vec::new(); - let mut i = parent_idx + 1; - - while i < nodes.len() { - if nodes[i].level <= parent_level { - break; - } - if nodes[i].level == target_level { - children.push(i); - } - i += 1; - } - - children - } - - /// Apply thinning to raw nodes before tree construction. - /// - /// When `merge_content` is true: small nodes are merged into their parent - /// by concatenating child content into the parent, then marking children for removal. - /// When `merge_content` is false: small nodes are simply marked for removal. - fn apply_thinning(nodes: &mut [RawNode], config: &ThinningConfig) -> Vec { - if !config.enabled || nodes.is_empty() { - return vec![true; nodes.len()]; - } - - let mut keep = vec![true; nodes.len()]; - - // Process from leaves to root (bottom-up) - for i in (0..nodes.len()).rev() { - if !keep[i] { - continue; - } - let total_tokens = nodes[i].total_token_count.unwrap_or(0); - - if total_tokens < config.threshold { - // Find all children of this node - let children_indices = Self::find_all_children_indices(i, nodes); - - if !children_indices.is_empty() && config.merge_content { - // Merge children content into this node - let mut merged_content = nodes[i].content.clone(); - for &child_idx in &children_indices { - if !nodes[child_idx].content.trim().is_empty() { - if !merged_content.is_empty() { - merged_content.push_str("\n\n"); - } - merged_content.push_str(&nodes[child_idx].content); - } - } - nodes[i].content = merged_content; - nodes[i].token_count = Some(nodes[i].token_count.unwrap_or(0)); - } - - // Mark children for removal - for &child_idx in &children_indices { - keep[child_idx] = false; - } - } - } - - // Ensure each parent keeps at least one child - Self::ensure_min_children(nodes, &mut keep); - - keep - } - - /// Ensure each parent keeps at least one direct child. - fn ensure_min_children(nodes: &[RawNode], keep: &mut [bool]) { - for i in 0..nodes.len() { - let children = Self::find_direct_children_indices(i, nodes); - - if !children.is_empty() { - let has_kept_child = children.iter().any(|&c| keep[c]); - - if !has_kept_child { - // Keep the child with the most content - let best_child = children - .iter() - .max_by_key(|&&c| nodes[c].total_token_count.unwrap_or(0)) - .copied(); - - if let Some(idx) = best_child { - keep[idx] = true; - } - } - } - } - } - - /// Build tree from raw nodes. 
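The level-stack algorithm defined below attaches each raw node to the closest ancestor with a strictly lower level. A schematic of the resulting shape (illustration only, not runnable):

```rust,ignore
// Input order:  "A" (level 1), "A.1" (2), "A.1.a" (3), "A.2" (2)
//
// Resulting tree:
//   root
//   └── A
//       ├── A.1
//       │   └── A.1.a
//       └── A.2
//
// A skipped level (e.g. 1 → 3) still resolves: the level-3 node attaches
// to the closest lower-level ancestor on the stack, falling back to root.
```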
-    fn build_tree(&self, raw_nodes: Vec<RawNode>, ctx: &mut IndexContext) -> DocumentTree {
-        let root_title = ctx.name.clone();
-        let root_content = String::new();
-
-        let mut tree = DocumentTree::new(&root_title, &root_content);
-
-        // Stack to track parent nodes at each level
-        let mut level_stack: Vec<Option<NodeId>> = vec![Some(tree.root())];
-
-        for raw in raw_nodes {
-            let level = raw.level;
-
-            // Ensure stack has enough slots
-            while level_stack.len() <= level {
-                level_stack.push(None);
-            }
-
-            // Find parent: closest ancestor with a lower level
-            let parent_id = (0..level)
-                .rev()
-                .find_map(|l| level_stack.get(l).copied().flatten())
-                .unwrap_or(tree.root());
-
-            // Create the node
-            let content = if raw.content.is_empty() {
-                ""
-            } else {
-                &raw.content
-            };
-            let node_id = tree.add_child(parent_id, &raw.title, content);
-
-            // Set line indices
-            tree.set_line_indices(node_id, raw.line_start, raw.line_end);
-
-            // Set page boundaries if available
-            if let Some(page) = raw.page {
-                tree.set_page_boundaries(node_id, page, page);
-            }
-
-            // Set token count if available
-            if let Some(count) = raw.token_count {
-                if count > 0 {
-                    tree.set_token_count(node_id, count);
-                }
-            }
-
-            // Update the stack for this level
-            if level < level_stack.len() {
-                level_stack[level] = Some(node_id);
-            }
-
-            // Clear deeper levels
-            for i in (level + 1)..level_stack.len() {
-                level_stack[i] = None;
-            }
-        }
-
-        tree
-    }
-
-    /// Assign unique node IDs (DFS traversal).
-    fn assign_node_ids(&self, tree: &mut DocumentTree) {
-        let mut counter: usize = 0;
-        self.assign_recursive(tree, tree.root(), &mut counter);
-    }
-
-    fn assign_recursive(&self, tree: &mut DocumentTree, node_id: NodeId, counter: &mut usize) {
-        *counter += 1;
-        let id_str = format!("{:04}", counter);
-        tree.set_node_id(node_id, &id_str);
-
-        let children = tree.children(node_id);
-        for child_id in children {
-            self.assign_recursive(tree, child_id, counter);
-        }
-    }
-}
-
-impl Default for BuildStage {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[async_trait]
-impl IndexStage for BuildStage {
-    fn name(&self) -> &'static str {
-        "build"
-    }
-
-    fn depends_on(&self) -> Vec<&'static str> {
-        vec!["parse"]
-    }
-
-    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
-        let start = Instant::now();
-
-        // Take raw nodes from context
-        let mut raw_nodes = std::mem::take(&mut ctx.raw_nodes);
-
-        if raw_nodes.is_empty() {
-            info!("[build] No raw nodes, skipping");
-            return Ok(StageResult::success("build"));
-        }
-
-        info!(
-            "[build] Starting: {} raw nodes, thinning={}",
-            raw_nodes.len(),
-            ctx.options.thinning.enabled
-        );
-
-        // Step 1: Calculate total tokens
-        Self::calculate_total_tokens(&mut raw_nodes);
-        debug!(
-            "[build] Calculated total tokens for {} nodes",
-            raw_nodes.len()
-        );
-
-        // Step 2: Apply thinning if enabled
-        let _original_count = raw_nodes.len();
-        let keep = Self::apply_thinning(&mut raw_nodes, &ctx.options.thinning);
-
-        let nodes_before_merge = raw_nodes.len();
-        raw_nodes = raw_nodes
-            .into_iter()
-            .zip(keep)
-            .filter_map(|(node, k)| if k { Some(node) } else { None })
-            .collect();
-
-        let skipped = nodes_before_merge - raw_nodes.len();
-        ctx.metrics.nodes_skipped += skipped;
-        if skipped > 0 {
-            debug!(
-                "[build] Thinning removed {} nodes ({} → {})",
-                skipped,
-                nodes_before_merge,
-                raw_nodes.len()
-            );
-        }
-
-        // Step 3: Build tree
-        let mut tree = self.build_tree(raw_nodes, ctx);
-
-        // Step 4: Assign node IDs if configured
-        if ctx.options.generate_ids {
-            self.assign_node_ids(&mut tree);
-        }
-
-        let node_count = tree.node_count();
-
-        // Store tree in context
-        ctx.tree = Some(tree);
-
-        let duration = start.elapsed().as_millis() as u64;
-        ctx.metrics.record_build(duration);
-
-        info!(
-            "[build] Complete: {} nodes (skipped {} via thinning) in {}ms",
-            node_count, skipped, duration
-        );
-
-        let mut stage_result = StageResult::success("build");
-        stage_result.duration_ms = duration;
-        stage_result.metadata.insert(
-            "node_count".to_string(),
-            serde_json::json!(ctx.tree.as_ref().map(|t| t.node_count()).unwrap_or(0)),
-        );
-        stage_result
-            .metadata
-            .insert("nodes_skipped".to_string(), serde_json::json!(skipped));
-
-        Ok(stage_result)
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/stages/concept.rs b/vectorless-core/vectorless/src/index/stages/concept.rs
deleted file mode 100644
index 7bffb660..00000000
--- a/vectorless-core/vectorless/src/index/stages/concept.rs
+++ /dev/null
@@ -1,238 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Concept extraction stage — extracts key concepts from topics and summaries.
-
-use std::collections::HashMap;
-
-use serde::Deserialize;
-use tracing::{info, warn};
-
-use crate::document::Concept;
-use crate::error::Result;
-use crate::llm::LlmClient;
-
-use super::async_trait;
-use super::{AccessPattern, IndexStage, StageResult};
-use crate::index::pipeline::IndexContext;
-
-/// Maximum number of top keywords to send to the LLM for concept extraction.
-const MAX_TOPICS: usize = 20;
-
-/// Maximum number of concepts to extract.
-const MAX_CONCEPTS: usize = 15;
-
-/// Concept extraction stage.
-///
-/// Takes the reasoning index's topic entries and tree summaries, then uses
-/// a single LLM call to extract structured [`Concept`] values.
-/// Falls back to basic keyword-based concepts when no LLM is available.
-pub struct ConceptExtractionStage {
-    llm_client: Option<LlmClient>,
-}
-
-impl ConceptExtractionStage {
-    /// Create a new stage without LLM support (keyword-based fallback).
-    pub fn new() -> Self {
-        Self { llm_client: None }
-    }
-
-    /// Create a stage with LLM support for rich concept extraction.
-    pub fn with_llm_client(client: LlmClient) -> Self {
-        Self {
-            llm_client: Some(client),
-        }
-    }
-}
-
-#[async_trait]
-impl IndexStage for ConceptExtractionStage {
-    fn name(&self) -> &str {
-        "concept_extraction"
-    }
-
-    fn depends_on(&self) -> Vec<&'static str> {
-        vec!["reasoning_index"]
-    }
-
-    fn is_optional(&self) -> bool {
-        true
-    }
-
-    fn access_pattern(&self) -> AccessPattern {
-        AccessPattern {
-            reads_tree: true,
-            writes_concepts: true,
-            ..AccessPattern::default()
-        }
-    }
-
-    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
-        let concepts = if let Some(ref client) = self.llm_client {
-            extract_with_llm(ctx, client).await
-        } else {
-            extract_from_topics(ctx)
-        };
-
-        let count = concepts.len();
-        ctx.concepts = concepts;
-        info!("[concept_extraction] Extracted {} concepts", count);
-
-        Ok(StageResult::success("concept_extraction"))
-    }
-}
-
-/// Extract concepts using LLM from topics and summaries.
-async fn extract_with_llm(ctx: &mut IndexContext, client: &LlmClient) -> Vec<Concept> {
-    let (topics, section_titles) = gather_source_data(ctx);
-
-    if topics.is_empty() {
-        warn!("[concept_extraction] No topics available for extraction");
-        return Vec::new();
-    }
-
-    let system = "You are a document analysis assistant. Extract the most important concepts \
-        from the given topics and section titles. For each concept, provide:\n\
-        - name: a short name (2-4 words)\n\
-        - summary: a one-sentence explanation\n\
-        - sections: list of section titles where this concept appears\n\n\
-        Return ONLY a valid JSON array of objects. No explanation, no markdown. \
-        Maximum 15 concepts, ordered by importance.";
-
-    let user_prompt = format!(
-        "Document topics (keyword: relevance weight):\n{}\n\n\
-         Section titles:\n{}",
-        topics
-            .iter()
-            .map(|(k, w)| format!("- {} (weight: {:.2})", k, w))
-            .collect::<Vec<_>>()
-            .join("\n"),
-        section_titles.join(", "),
-    );
-
-    #[derive(Debug, Deserialize)]
-    #[serde(rename_all = "snake_case")]
-    struct RawConcept {
-        name: String,
-        summary: String,
-        #[serde(default)]
-        sections: Vec<String>,
-    }
-
-    match client
-        .complete_json::<Vec<RawConcept>>(&system, &user_prompt)
-        .await
-    {
-        Ok(raw) => raw
-            .into_iter()
-            .take(MAX_CONCEPTS)
-            .map(|c| Concept {
-                name: c.name,
-                summary: c.summary,
-                sections: c.sections,
-            })
-            .collect(),
-        Err(e) => {
-            warn!("[concept_extraction] LLM extraction failed: {}, using fallback", e);
-            extract_from_topics(ctx)
-        }
-    }
-}
-
-/// Fallback: derive basic concepts from topic keywords.
-fn extract_from_topics(ctx: &mut IndexContext) -> Vec<Concept> {
-    let (topics, section_titles) = gather_source_data(ctx);
-
-    topics
-        .into_iter()
-        .take(MAX_CONCEPTS)
-        .map(|(name, _)| Concept {
-            name: name.clone(),
-            summary: String::new(),
-            sections: section_titles.clone(),
-        })
-        .collect()
-}
-
-/// Gather top topics and section titles from the pipeline context.
-fn gather_source_data(ctx: &IndexContext) -> (Vec<(String, f32)>, Vec<String>) {
-    // Collect top keywords by weight
-    let mut topics: Vec<(String, f32)> = Vec::new();
-
-    if let Some(ref ri) = ctx.reasoning_index {
-        let mut all: Vec<(String, f32)> = ri
-            .all_topic_entries()
-            .map(|(keyword, entries)| {
-                let max_weight = entries.iter().map(|e| e.weight).fold(0.0_f32, f32::max);
-                (keyword.clone(), max_weight)
-            })
-            .collect();
-        all.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
-        all.truncate(MAX_TOPICS);
-        topics = all;
-    }
-
-    // Collect section titles from the tree
-    let section_titles: Vec<String> = ctx
-        .tree
-        .as_ref()
-        .map(|tree| {
-            tree.traverse()
-                .iter()
-                .filter_map(|&id| {
-                    let node = tree.get(id)?;
-                    if !node.title.is_empty() {
-                        Some(node.title.clone())
-                    } else {
-                        None
-                    }
-                })
-                .collect()
-        })
-        .unwrap_or_default();
-
-    (topics, section_titles)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_extract_from_empty_topics() {
-        let topics = Vec::<(String, f32)>::new();
-        let titles = vec!["Section 1".to_string()];
-        // Basic sanity: empty topics produce empty concepts
-        let concepts: Vec<Concept> = topics
-            .into_iter()
-            .take(MAX_CONCEPTS)
-            .map(|(name, _)| Concept {
-                name,
-                summary: String::new(),
-                sections: titles.clone(),
-            })
-            .collect();
-        assert!(concepts.is_empty());
-    }
-
-    #[test]
-    fn test_extract_from_topics_basic() {
-        let topics: Vec<(String, f32)> = vec![
-            ("quantum".to_string(), 0.95),
-            ("error correction".to_string(), 0.88),
-            ("qubit".to_string(), 0.82),
-        ];
-        let titles = vec!["Research Labs".to_string()];
-        let concepts: Vec<Concept> = topics
-            .into_iter()
-            .take(MAX_CONCEPTS)
-            .map(|(name, _)| Concept {
-                name,
-                summary: String::new(),
-                sections: titles.clone(),
-            })
-            .collect();
-        assert_eq!(concepts.len(), 3);
-        assert_eq!(concepts[0].name, "quantum");
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/stages/enhance.rs b/vectorless-core/vectorless/src/index/stages/enhance.rs
deleted file mode 100644
index 0223d572..00000000
--- a/vectorless-core/vectorless/src/index/stages/enhance.rs
+++ /dev/null
@@ -1,449 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Enhance stage - Generate summaries using LLM.
-
-use super::async_trait;
-use futures::StreamExt;
-use std::sync::Arc;
-use std::time::{Duration, Instant};
-use tracing::{debug, info, warn};
-
-use crate::document::NodeId;
-use crate::error::Result;
-use crate::index::incremental;
-use crate::llm::LlmClient;
-use crate::llm::memo::{MemoKey, MemoStore};
-use crate::utils::fingerprint::Fingerprint;
-
-use super::{IndexStage, StageResult};
-use crate::index::pipeline::{FailurePolicy, IndexContext, StageRetryConfig};
-use crate::index::summary::{LlmSummaryGenerator, SummaryGenerator, SummaryStrategy};
-
-/// A node that needs LLM summary generation.
-struct PendingNode {
-    node_id: NodeId,
-    title: String,
-    content: String,
-    is_leaf: bool,
-}
-
-/// Enhance stage - generates summaries using LLM.
-pub struct EnhanceStage {
-    /// LLM client for summary generation.
-    llm_client: Option<Arc<LlmClient>>,
-    /// Memo store for caching LLM results.
-    memo_store: Option<Arc<MemoStore>>,
-}
-
-impl EnhanceStage {
-    /// Create a new enhance stage.
-    pub fn new() -> Self {
-        Self {
-            llm_client: None,
-            memo_store: None,
-        }
-    }
-
-    /// Create with LLM client.
-    pub fn with_llm_client(client: LlmClient) -> Self {
-        Self {
-            llm_client: Some(Arc::new(client)),
-            memo_store: None,
-        }
-    }
-
-    /// Create with LLM client and memo store.
-    pub fn with_llm_and_memo(client: LlmClient, memo_store: MemoStore) -> Self {
-        Self {
-            llm_client: Some(Arc::new(client)),
-            memo_store: Some(Arc::new(memo_store)),
-        }
-    }
-
-    /// Set memo store for caching.
-    pub fn with_memo_store(mut self, store: MemoStore) -> Self {
-        self.memo_store = Some(Arc::new(store));
-        self
-    }
-
-    /// Parse structured navigation response from LLM.
-    ///
-    /// Expected format:
-    /// ```text
-    /// OVERVIEW: <overview text>
-    /// QUESTIONS: q1, q2, q3
-    /// TAGS: tag1, tag2, tag3
-    /// ```
-    ///
-    /// Falls back gracefully: if markers are missing, the entire response
-    /// becomes the overview and questions/tags remain empty.
-    fn parse_structured_nav_response(response: &str) -> (String, Vec<String>, Vec<String>) {
-        let mut overview = String::new();
-        let mut questions: Vec<String> = Vec::new();
-        let mut tags: Vec<String> = Vec::new();
-
-        for line in response.lines() {
-            let line = line.trim();
-            if let Some(rest) = line.strip_prefix("OVERVIEW:") {
-                overview = rest.trim().to_string();
-            } else if let Some(rest) = line.strip_prefix("QUESTIONS:") {
-                questions = rest
-                    .split(',')
-                    .map(|s| s.trim().to_string())
-                    .filter(|s| !s.is_empty())
-                    .collect();
-            } else if let Some(rest) = line.strip_prefix("TAGS:") {
-                tags = rest
-                    .split(',')
-                    .map(|s| s.trim().to_string())
-                    .filter(|s| !s.is_empty())
-                    .collect();
-            }
-        }
-
-        // Fallback: if no OVERVIEW marker found, use entire response as overview
-        if overview.is_empty() {
-            overview = response.trim().to_string();
-        }
-
-        (overview, questions, tags)
-    }
-
-    /// Check if summary generation is needed based on strategy.
-    fn needs_summaries(&self, ctx: &IndexContext) -> bool {
-        match &ctx.options.summary_strategy {
-            SummaryStrategy::None => false,
-            SummaryStrategy::Lazy { .. } => false, // Generated on-demand at query time
-            SummaryStrategy::Full { .. } | SummaryStrategy::Selective { .. } => true,
-        }
-    }
-}
-
-impl Default for EnhanceStage {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[async_trait]
-impl IndexStage for EnhanceStage {
-    fn name(&self) -> &'static str {
-        "enhance"
-    }
-
-    fn is_optional(&self) -> bool {
-        true
-    }
-
-    fn depends_on(&self) -> Vec<&'static str> {
-        vec!["build"]
-    }
-
-    fn failure_policy(&self) -> FailurePolicy {
-        // LLM operations benefit from retry with backoff
-        FailurePolicy::retry_with(
-            StageRetryConfig::new()
-                .with_max_attempts(2)
-                .with_initial_delay(Duration::from_millis(500)),
-        )
-    }
-
-    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
-        let start = Instant::now();
-
-        info!(
-            "[enhance] Starting: llm_client={}, strategy={:?}",
-            self.llm_client.is_some(),
-            ctx.options.summary_strategy
-        );
-
-        // Check if we need summaries
-        if !self.needs_summaries(ctx) {
-            info!(
-                "[enhance] Skipped: strategy={:?}",
-                ctx.options.summary_strategy
-            );
-            return Ok(StageResult::success("enhance"));
-        }
-
-        // Get LLM client
-        let llm_client = match &self.llm_client {
-            Some(client) => client,
-            None => {
-                warn!("[enhance] No LLM client, skipping summary generation");
-                return Ok(StageResult::success("enhance"));
-            }
-        };
-
-        // Get tree
-        let tree = match ctx.tree.as_mut() {
-            Some(t) => t,
-            None => {
-                warn!("[enhance] No tree built, skipping");
-                return Ok(StageResult::success("enhance"));
-            }
-        };
-
-        // Create summary generator (shared via Arc for concurrent use)
-        let generator = Arc::new(
-            LlmSummaryGenerator::new((*llm_client).as_ref().clone())
-                .with_max_tokens(ctx.options.indexer.max_summary_tokens)
-                .with_memo_store(
-                    self.memo_store
-                        .as_ref()
-                        .map(|s| (**s).clone())
-                        .unwrap_or_default(),
-                ),
-        );
-
-        // Get all nodes to process
-        let node_ids: Vec<NodeId> = tree.traverse();
-        let total_nodes = node_ids.len();
-
-        // === Incremental: reuse summaries from existing tree for unchanged nodes ===
-        if let Some(ref old_tree) = ctx.existing_tree {
-            let reusable = incremental::compute_reusable_summaries(old_tree, tree);
-            let applied = incremental::apply_reusable_summaries(tree, &reusable);
-            for _ in 0..applied {
-                ctx.metrics.increment_summaries();
-            }
-            info!(
-                "[enhance] Incremental: {} of {} nodes unchanged, reusing summaries",
-                applied, total_nodes,
-            );
-        }
-
-        info!(
-            "[enhance] Processing {} nodes for summary generation",
-            total_nodes
-        );
-
-        // === Phase 1: Collect pending nodes (cache hits applied immediately) ===
-        let strategy = ctx.options.summary_strategy.clone();
-        let mut pending_llm: Vec<PendingNode> = Vec::new();
-        let mut generated = 0;
-        let mut skipped_no_content = 0;
-        let mut skipped_tokens = 0;
-        let mut shortcut_used = 0;
-        let shortcut_threshold = strategy.shortcut_threshold();
-
-        for node_id in node_ids {
-            let node = match tree.get(node_id) {
-                Some(n) => n.clone(),
-                None => continue,
-            };
-
-            // Skip if no content
-            if node.content.is_empty() {
-                skipped_no_content += 1;
-                continue;
-            }
-
-            // Skip if summary already set (incremental: reused from old tree)
-            if !node.summary.is_empty() {
-                continue;
-            }
-
-            // Check if strategy says we should generate
-            let token_count = node.token_count.unwrap_or(0);
-            if !strategy.should_generate(tree, node_id, token_count) {
-                skipped_tokens += 1;
-                continue;
-            }
-
-            // Check memo store (fast path — apply immediately)
-            if let Some(store) = self.memo_store.as_deref() {
-                let content_fp = Fingerprint::from_str(&format!("{}|{}", node.title, node.content));
-                let memo_key = MemoKey::summary(&content_fp);
-                if let Some(cached) = store
-                    .get(&memo_key)
- .and_then(|c| c.as_summary().map(|s| s.to_string())) - { - if !cached.is_empty() { - tree.set_summary(node_id, &cached); - debug!( - "[enhance] Cache hit: '{}' ({} chars)", - node.title, - cached.len() - ); - ctx.metrics.increment_summaries(); - generated += 1; - continue; - } - } - } - - // Shortcut: use original content as summary for short nodes (Borrow A) - let token_count = node - .token_count - .unwrap_or_else(|| crate::utils::estimate_tokens(&node.content)); - if shortcut_threshold > 0 && token_count > 0 && token_count <= shortcut_threshold { - tree.set_summary(node_id, &node.content); - debug!( - "[enhance] Shortcut: '{}' ({} tokens, using original content)", - node.title, token_count - ); - ctx.metrics.increment_summaries(); - generated += 1; - shortcut_used += 1; - continue; - } - - // Needs LLM call - let is_leaf = tree.is_leaf(node_id); - pending_llm.push(PendingNode { - node_id, - title: node.title, - content: node.content, - is_leaf, - }); - } - - // === Phase 2: Concurrent LLM calls with buffer_unordered === - let mut failed = 0; - let concurrency = ctx.options.concurrency.max_concurrent_requests; - - if !pending_llm.is_empty() { - info!( - "[enhance] Generating summaries for {} nodes (concurrency: {})", - pending_llm.len(), - concurrency - ); - - // Collect results: (NodeId, is_leaf, Result) - let results: Vec<(NodeId, bool, std::result::Result)> = - futures::stream::iter(pending_llm) - .map(|pending| { - let generator = Arc::clone(&generator); - async move { - let result = generator - .generate_for_node( - &pending.title, - &pending.content, - pending.is_leaf, - ) - .await; - ( - pending.node_id, - pending.is_leaf, - result.map_err(|e| e.to_string()), - ) - } - }) - .buffer_unordered(concurrency) - .collect() - .await; - - // Write results back to tree - for (node_id, is_leaf, result) in results { - ctx.metrics.increment_llm_calls(); - match result { - Ok(response) => { - if response.is_empty() { - failed += 1; - } else { - ctx.metrics - .add_tokens_generated(crate::utils::estimate_tokens(&response)); - - if is_leaf { - // Leaf node: response is a plain content summary - tree.set_summary(node_id, &response); - } else { - // Non-leaf node: response is structured (OVERVIEW/QUESTIONS/TAGS) - let (overview, questions, tags) = - Self::parse_structured_nav_response(&response); - tree.set_summary(node_id, &overview); - - if let Some(node) = tree.get_mut(node_id) { - node.question_hints = questions; - node.routing_keywords = tags; - } - } - generated += 1; - ctx.metrics.increment_summaries(); - } - } - Err(e) => { - warn!("[enhance] LLM summary failed: {}", e); - failed += 1; - } - } - } - } - - let duration = start.elapsed().as_millis() as u64; - ctx.metrics.record_enhance(duration); - if failed > 0 { - ctx.metrics.add_summaries_failed(failed); - } - - info!( - "[enhance] Complete: {} summaries ({} shortcut, {} failed, {} no-content, {} skipped-tokens) in {}ms", - generated, shortcut_used, failed, skipped_no_content, skipped_tokens, duration - ); - - let mut stage_result = StageResult::success("enhance"); - stage_result.duration_ms = duration; - stage_result.metadata.insert( - "summaries_generated".to_string(), - serde_json::json!(generated), - ); - stage_result - .metadata - .insert("summaries_failed".to_string(), serde_json::json!(failed)); - - Ok(stage_result) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parse_structured_nav_response_full() { - let response = "\ -OVERVIEW: This section covers payment integration and billing configuration. 
-QUESTIONS: How to set up payments?, What currencies are supported?, How to configure invoices? -TAGS: payments, billing, invoices, currency"; - - let (overview, questions, tags) = EnhanceStage::parse_structured_nav_response(response); - - assert!(overview.contains("payment integration")); - assert_eq!(questions.len(), 3); - assert!(questions[0].contains("set up payments")); - assert_eq!(tags.len(), 4); - assert_eq!(tags[0], "payments"); - } - - #[test] - fn test_parse_structured_nav_response_partial() { - // Only overview, no questions or tags - let response = "OVERVIEW: A general introduction to the system."; - let (overview, questions, tags) = EnhanceStage::parse_structured_nav_response(response); - - assert!(overview.contains("general introduction")); - assert!(questions.is_empty()); - assert!(tags.is_empty()); - } - - #[test] - fn test_parse_structured_nav_response_fallback() { - // No markers at all — fallback to entire response as overview - let response = "This is just a plain summary without any markers."; - let (overview, questions, tags) = EnhanceStage::parse_structured_nav_response(response); - - assert_eq!(overview, response.trim()); - assert!(questions.is_empty()); - assert!(tags.is_empty()); - } - - #[test] - fn test_parse_structured_nav_response_empty() { - let (overview, questions, tags) = EnhanceStage::parse_structured_nav_response(""); - assert!(overview.is_empty()); - assert!(questions.is_empty()); - assert!(tags.is_empty()); - } -} diff --git a/vectorless-core/vectorless/src/index/stages/enrich.rs b/vectorless-core/vectorless/src/index/stages/enrich.rs deleted file mode 100644 index 88ea8cc1..00000000 --- a/vectorless-core/vectorless/src/index/stages/enrich.rs +++ /dev/null @@ -1,240 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Enrich stage - Add metadata to the tree. - -use super::async_trait; -use std::time::Instant; -use tracing::{debug, info}; - -use crate::document::{DocumentTree, NodeId, ReferenceExtractor, TocView}; -use crate::error::Result; - -use super::{AccessPattern, IndexStage, StageResult}; -use crate::index::pipeline::IndexContext; - -/// Enrich stage - adds metadata to the tree. -pub struct EnrichStage; - -impl EnrichStage { - /// Create a new enrich stage. - pub fn new() -> Self { - Self - } - - /// Calculate page ranges for all nodes. - fn calculate_page_ranges(tree: &mut DocumentTree) { - // Propagate page ranges up the tree - Self::propagate_page_ranges(tree, tree.root()); - } - - /// Recursively propagate page ranges from children to parent. - fn propagate_page_ranges(tree: &mut DocumentTree, node_id: NodeId) { - let children = tree.children(node_id); - - if children.is_empty() { - return; - } - - // First, propagate to all children - for child_id in &children { - Self::propagate_page_ranges(tree, *child_id); - } - - // Then calculate this node's range from children - let mut min_page: Option = None; - let mut max_page: Option = None; - - for child_id in &children { - if let Some(child) = tree.get(*child_id) { - if let Some(start) = child.start_page { - min_page = Some(min_page.map_or(start, |m| m.min(start))); - } - if let Some(end) = child.end_page { - max_page = Some(max_page.map_or(end, |m| m.max(end))); - } - } - } - - // Update this node's page range - if let (Some(min), Some(max)) = (min_page, max_page) { - tree.set_page_boundaries(node_id, min, max); - } - } - - /// Calculate token statistics. 
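-    ///
-    /// Returns `(total_tokens, node_count)` summed over a full traversal;
-    /// nodes with no recorded token count contribute zero tokens.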
- fn calculate_token_stats(tree: &DocumentTree) -> (usize, usize) { - let mut total_tokens = 0; - let mut node_count = 0; - - for node_id in tree.traverse() { - if let Some(node) = tree.get(node_id) { - total_tokens += node.token_count.unwrap_or(0); - node_count += 1; - } - } - - (total_tokens, node_count) - } - - /// Generate document description from root summary. - fn generate_description(&self, ctx: &mut IndexContext) { - if !ctx.options.generate_description { - return; - } - - // Use root summary if available - if let Some(tree) = &ctx.tree { - if let Some(root) = tree.get(tree.root()) { - if !root.summary.is_empty() { - ctx.description = Some(root.summary.clone()); - debug!("[enrich] Using root summary as document description"); - } - } - } - } - - /// Extract and resolve in-document cross-references for all nodes. - /// - /// Parses content for patterns like "see Section 2.1", "Appendix G", etc. - /// and resolves them to actual `NodeId`s in the tree using the retrieval - /// index for fast lookup. - fn resolve_references(tree: &mut DocumentTree) -> usize { - let retrieval_index = tree.build_retrieval_index(); - let node_ids: Vec = tree.traverse().into_iter().collect(); - let mut total_resolved = 0; - - for node_id in node_ids { - let content = tree - .get(node_id) - .map(|n| n.content.clone()) - .unwrap_or_default(); - if content.is_empty() { - continue; - } - - // Quick check: skip nodes without any reference-like patterns - let content_lower = content.to_lowercase(); - let has_ref_pattern = content_lower.contains("section") - || content_lower.contains("appendix") - || content_lower.contains("table") - || content_lower.contains("figure") - || content_lower.contains("page") - || content_lower.contains("equation"); - - if !has_ref_pattern { - continue; - } - - let refs = ReferenceExtractor::extract_and_resolve(&content, tree, &retrieval_index); - let resolved = refs.iter().filter(|r| r.is_resolved()).count(); - if resolved > 0 { - total_resolved += resolved; - } - tree.set_references(node_id, refs); - } - - total_resolved - } -} - -impl Default for EnrichStage { - fn default() -> Self { - Self::new() - } -} - -#[async_trait] -impl IndexStage for EnrichStage { - fn name(&self) -> &'static str { - "enrich" - } - - fn depends_on(&self) -> Vec<&'static str> { - vec!["build"] - } - - fn access_pattern(&self) -> AccessPattern { - AccessPattern { - reads_tree: true, - writes_tree: true, // sets page_boundaries - writes_description: true, - ..Default::default() - } - } - - async fn execute(&mut self, ctx: &mut IndexContext) -> Result { - let start = Instant::now(); - - let tree = ctx - .tree - .as_mut() - .ok_or_else(|| crate::Error::IndexBuild("Tree not built".to_string()))?; - - let node_count = tree.node_count(); - info!("[enrich] Starting: {} nodes", node_count); - - // 1. Calculate page ranges - Self::calculate_page_ranges(tree); - debug!("[enrich] Calculated page ranges"); - - // 2. Generate ToC view (cached in context) - let toc_view = TocView::new(); - let toc = toc_view.generate(tree); - let _toc_markdown = toc_view.format_markdown(&toc); - debug!("[enrich] Generated ToC ({} children)", toc.children.len()); - - // 3. Calculate token statistics - let (total_tokens, stat_node_count) = Self::calculate_token_stats(tree); - debug!( - "[enrich] Token stats: {} total tokens across {} nodes", - total_tokens, stat_node_count - ); - - // 4. 
Extract and resolve cross-references - let resolved_refs = Self::resolve_references(tree); - if resolved_refs > 0 { - info!("[enrich] Resolved {} cross-references", resolved_refs); - } - - // 5. Generate document description - self.generate_description(ctx); - - let duration = start.elapsed().as_millis() as u64; - ctx.metrics.record_enrich(duration); - - info!( - "[enrich] Complete: {} tokens, {} refs resolved in {}ms", - total_tokens, resolved_refs, duration - ); - - let mut stage_result = StageResult::success("enrich"); - stage_result.duration_ms = duration; - stage_result - .metadata - .insert("total_tokens".to_string(), serde_json::json!(total_tokens)); - stage_result - .metadata - .insert("node_count".to_string(), serde_json::json!(node_count)); - stage_result.metadata.insert( - "resolved_references".to_string(), - serde_json::json!(resolved_refs), - ); - - Ok(stage_result) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_resolve_references_no_refs() { - let mut tree = DocumentTree::new("Root", "root content"); - tree.add_child(tree.root(), "Section 1", "No references here."); - - let resolved = EnrichStage::resolve_references(&mut tree); - assert_eq!(resolved, 0); - } -} diff --git a/vectorless-core/vectorless/src/index/stages/mod.rs b/vectorless-core/vectorless/src/index/stages/mod.rs deleted file mode 100644 index 2efed1e6..00000000 --- a/vectorless-core/vectorless/src/index/stages/mod.rs +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Index pipeline stages. - -mod build; -mod concept; -mod enhance; -mod enrich; -mod navigation; -mod optimize; -mod parse; -mod reasoning; -mod split; -mod validate; -mod verify_ingest; - -pub use build::BuildStage; -pub use concept::ConceptExtractionStage; -pub use enhance::EnhanceStage; -pub use enrich::EnrichStage; -pub use navigation::NavigationIndexStage; -pub use optimize::OptimizeStage; -pub use parse::ParseStage; -pub use reasoning::ReasoningIndexStage; -pub use split::SplitStage; -pub use validate::ValidateStage; -pub use verify_ingest::VerifyStage; - -use super::pipeline::{FailurePolicy, IndexContext, StageResult}; -use crate::error::Result; -pub use async_trait::async_trait; - -/// Declares which context fields a stage reads/writes. -/// Used by the orchestrator to determine safe parallel execution. -#[derive(Debug, Clone, Default)] -pub struct AccessPattern { - /// Whether this stage reads the tree. - pub reads_tree: bool, - /// Whether this stage mutates the tree (summaries, structure, etc.). - pub writes_tree: bool, - /// Whether this stage writes to `reasoning_index`. - pub writes_reasoning_index: bool, - /// Whether this stage writes to `navigation_index`. - pub writes_navigation_index: bool, - /// Whether this stage writes to `description`. - pub writes_description: bool, - /// Whether this stage writes to `concepts`. - pub writes_concepts: bool, -} - -/// Index pipeline stage. -/// -/// Each stage represents a discrete step in the document indexing process. -/// Stages are executed in dependency order by the [`PipelineOrchestrator`]. -/// -/// # Stage Lifecycle -/// -/// 1. Stage is registered with the orchestrator -/// 2. Dependencies are resolved and execution order is determined -/// 3. `execute()` is called with the shared context -/// 4. 
Results are stored in `ctx.stage_results` -/// -/// # Example -/// -/// ```rust,ignore -/// struct MyStage; -/// -/// #[async_trait] -/// impl IndexStage for MyStage { -/// fn name(&self) -> &str { "my_stage" } -/// -/// fn depends_on(&self) -> Vec<&'static str> { -/// vec!["parse", "build"] -/// } -/// -/// async fn execute(&mut self, ctx: &mut IndexContext) -> Result { -/// // Process the context... -/// Ok(StageResult::success("my_stage")) -/// } -/// } -/// ``` -#[async_trait] -pub trait IndexStage: Send + Sync { - /// Stage name (must be unique within pipeline). - fn name(&self) -> &str; - - /// Execute the stage. - /// - /// This method receives a mutable reference to the shared context, - /// allowing stages to read from and write to it. - async fn execute(&mut self, ctx: &mut IndexContext) -> Result; - - /// Whether this stage is optional (can be skipped on failure). - /// - /// Optional stages that fail will not stop the pipeline. - /// Default: `false` - fn is_optional(&self) -> bool { - false - } - - /// Names of stages this stage depends on. - /// - /// Dependencies are validated during pipeline construction. - /// A stage will only execute after all its dependencies have completed. - /// - /// # Example - /// - /// ```rust,ignore - /// fn depends_on(&self) -> Vec<&'static str> { - /// vec!["parse", "build"] - /// } - /// ``` - fn depends_on(&self) -> Vec<&'static str> { - Vec::new() - } - - /// Failure policy for this stage. - /// - /// Determines how the pipeline handles failures in this stage: - /// - `Fail`: Stop the entire pipeline (default for required stages) - /// - `Skip`: Skip this stage, continue pipeline - /// - `Retry`: Retry with exponential backoff - /// - /// Default behavior: - /// - If `is_optional()` returns true, defaults to `FailurePolicy::Skip` - /// - Otherwise, defaults to `FailurePolicy::Fail` - fn failure_policy(&self) -> FailurePolicy { - if self.is_optional() { - FailurePolicy::skip() - } else { - FailurePolicy::fail() - } - } - - /// Declare which context fields this stage accesses. - /// Used by the orchestrator for safe parallel execution. - fn access_pattern(&self) -> AccessPattern { - AccessPattern::default() - } -} diff --git a/vectorless-core/vectorless/src/index/stages/navigation.rs b/vectorless-core/vectorless/src/index/stages/navigation.rs deleted file mode 100644 index 0a41517f..00000000 --- a/vectorless-core/vectorless/src/index/stages/navigation.rs +++ /dev/null @@ -1,563 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Navigation Index Stage — Build the Agent navigation index from the document tree. -//! -//! This stage runs after EnrichStage and ReasoningIndexStage. It reads the -//! enhanced TreeNode fields (summary, description, routing_keywords, leaf_count) -//! and builds a [`NavigationIndex`] containing compact [`NavEntry`] and -//! [`ChildRoute`] records for every non-leaf node. -//! -//! # No LLM Calls -//! -//! This stage performs pure data organization. All LLM-generated content -//! (summaries, descriptions, keywords) is already on the tree from the -//! Enhance stage. This stage only reads and restructures that data. 
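-//!
-//! # Usage sketch
-//!
-//! Illustrative only: construction mirrors the tests at the bottom of this
-//! file, not a stable public API.
-//!
-//! ```rust,ignore
-//! let mut ctx = IndexContext::new(
-//!     IndexInput::content("..."),
-//!     PipelineOptions::default(),
-//! );
-//! ctx.tree = Some(tree); // a DocumentTree produced by the build stage
-//!
-//! let mut stage = NavigationIndexStage::new();
-//! stage.execute(&mut ctx).await?;
-//!
-//! let nav = ctx.navigation_index.expect("index was built");
-//! let root_id = ctx.tree.as_ref().unwrap().root();
-//! let routes = nav.get_child_routes(root_id);
-//! ```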
- -use std::time::Instant; -use tracing::{debug, info, warn}; - -use crate::document::{ChildRoute, DocumentTree, NavEntry, NavigationIndex, NodeId}; -use crate::error::Result; - -use super::async_trait; -use super::{AccessPattern, IndexStage, StageResult}; -use crate::index::pipeline::IndexContext; - -/// Navigation Index Stage — builds the Agent navigation index. -/// -/// For every non-leaf node in the tree, this stage creates: -/// - A [`NavEntry`] with overview, question hints, topic tags, leaf count, and level. -/// - A list of [`ChildRoute`] entries, one per child, with title, description, and leaf count. -/// -/// The resulting [`NavigationIndex`] is stored in `ctx.navigation_index` and -/// serialized as part of [`PersistedDocument`](crate::storage::persistence::PersistedDocument). -pub struct NavigationIndexStage; - -impl NavigationIndexStage { - /// Create a new navigation index stage. - pub fn new() -> Self { - Self - } - - /// Count the number of leaf nodes in a subtree rooted at `node_id`. - fn count_leaves(tree: &DocumentTree, node_id: NodeId) -> usize { - if tree.is_leaf(node_id) { - return 1; - } - let mut count = 0; - let mut stack = vec![node_id]; - while let Some(id) = stack.pop() { - if tree.is_leaf(id) { - count += 1; - } else { - for child in tree.children_iter(id) { - stack.push(child); - } - } - } - count - } - - /// Build a NavEntry for a non-leaf node. - fn build_nav_entry(tree: &DocumentTree, node_id: NodeId, leaf_count: usize) -> NavEntry { - let node = match tree.get(node_id) { - Some(n) => n, - None => { - return NavEntry { - overview: String::new(), - question_hints: Vec::new(), - topic_tags: Vec::new(), - leaf_count: 0, - level: 0, - }; - } - }; - - // Overview: use summary if available, otherwise title - let overview = if !node.summary.is_empty() { - node.summary.clone() - } else { - node.title.clone() - }; - - NavEntry { - overview, - question_hints: node.question_hints.clone(), - topic_tags: node.routing_keywords.clone(), - leaf_count, - level: node.depth, - } - } - - /// Build a ChildRoute for a single child node. 
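-    ///
-    /// Description fallback order (as implemented below): node summary,
-    /// then the first 100 chars of content, then the node title.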
-    fn build_child_route(tree: &DocumentTree, child_id: NodeId, leaf_count: usize) -> ChildRoute {
-        let node = tree.get(child_id);
-        let title = node.map(|n| n.title.clone()).unwrap_or_default();
-        let description = node
-            .and_then(|n| {
-                // Use summary as description if available; otherwise fall back
-                // to truncated content, then to the title
-                if !n.summary.is_empty() {
-                    Some(n.summary.clone())
-                } else if !n.content.is_empty() {
-                    // Truncate content as fallback description
-                    let s: String = n.content.chars().take(100).collect();
-                    Some(s)
-                } else {
-                    None
-                }
-            })
-            .unwrap_or_else(|| title.clone());
-
-        ChildRoute {
-            node_id: child_id,
-            title,
-            description,
-            leaf_count,
-        }
-    }
-}
-
-impl Default for NavigationIndexStage {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[async_trait]
-impl IndexStage for NavigationIndexStage {
-    fn name(&self) -> &'static str {
-        "navigation_index"
-    }
-
-    fn depends_on(&self) -> Vec<&'static str> {
-        vec!["enrich"]
-    }
-
-    fn is_optional(&self) -> bool {
-        true
-    }
-
-    fn access_pattern(&self) -> AccessPattern {
-        AccessPattern {
-            reads_tree: true,
-            writes_navigation_index: true,
-            ..Default::default()
-        }
-    }
-
-    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
-        let start = Instant::now();
-
-        let tree = match ctx.tree.as_ref() {
-            Some(t) => t,
-            None => {
-                warn!("[navigation_index] No tree, cannot build index");
-                return Ok(StageResult::failure("navigation_index", "Tree not built"));
-            }
-        };
-
-        let all_nodes = tree.traverse();
-        let leaf_count = all_nodes.iter().filter(|&&id| tree.is_leaf(id)).count();
-        let non_leaf_count = all_nodes.len() - leaf_count;
-
-        info!(
-            "[navigation_index] Starting: {} total nodes ({} leaves, {} non-leaf)",
-            all_nodes.len(),
-            leaf_count,
-            non_leaf_count,
-        );
-
-        let mut nav_entries_count = 0usize;
-        let mut child_routes_count = 0usize;
-
-        // Phase 1: Pre-compute leaf counts for all nodes.
-        // We compute once per node to avoid repeated traversals.
-        debug!(
-            "[navigation_index] Phase 1: Pre-computing leaf counts for {} nodes",
-            all_nodes.len()
-        );
-        let mut leaf_counts: std::collections::HashMap<NodeId, usize> =
-            std::collections::HashMap::with_capacity(all_nodes.len());
-        for &node_id in &all_nodes {
-            leaf_counts.insert(node_id, Self::count_leaves(tree, node_id));
-        }
-
-        // Phase 2: Build NavEntry + ChildRoutes for each non-leaf node.
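-        // (Illustrative, derived from the assertions in `test_execute_end_to_end`
-        // below: for Root → [S1 → [1.1, 1.2], S2 → [2.1]], this phase yields
-        // 3 NavEntries and 5 ChildRoutes.)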
- debug!( - "[navigation_index] Phase 2: Building NavEntry + ChildRoutes for {} non-leaf nodes", - non_leaf_count - ); - let mut nav_index = NavigationIndex::new(); - - for &node_id in &all_nodes { - // Skip leaf nodes — they have no children to navigate to - if tree.is_leaf(node_id) { - continue; - } - - let lc = *leaf_counts.get(&node_id).unwrap_or(&0); - - // Build navigation entry for this non-leaf node - let nav_entry = Self::build_nav_entry(tree, node_id, lc); - nav_index.add_entry(node_id, nav_entry); - nav_entries_count += 1; - - // Build child routes for this node's children - let child_ids: Vec = tree.children_iter(node_id).collect(); - let mut routes = Vec::with_capacity(child_ids.len()); - - for child_id in child_ids { - let child_lc = *leaf_counts.get(&child_id).unwrap_or(&0); - let route = Self::build_child_route(tree, child_id, child_lc); - routes.push(route); - child_routes_count += 1; - } - - debug!( - "[navigation_index] node '{}' → {} child routes ({} leaves in subtree)", - tree.get(node_id).map(|n| n.title.as_str()).unwrap_or("?"), - routes.len(), - lc, - ); - - nav_index.add_child_routes(node_id, routes); - } - - // Phase 3: Build DocCard from root-level data (already computed, zero LLM). - // Provides a compact document summary for multi-document Orchestrator Agent. - if let Some(root_entry) = nav_index.get_entry(tree.root()) { - let sections: Vec = nav_index - .get_child_routes(tree.root()) - .map(|routes| { - routes - .iter() - .map(|r| crate::document::SectionCard { - title: r.title.clone(), - description: r.description.clone(), - leaf_count: r.leaf_count, - }) - .collect() - }) - .unwrap_or_default(); - - let doc_card = crate::document::DocCard { - title: tree - .get(tree.root()) - .map(|n| n.title.clone()) - .unwrap_or_default(), - overview: root_entry.overview.clone(), - question_hints: root_entry.question_hints.clone(), - topic_tags: root_entry.topic_tags.clone(), - sections, - total_leaves: root_entry.leaf_count, - }; - nav_index.set_doc_card(doc_card); - - debug!( - "[navigation_index] Phase 3: Built DocCard — {} sections, {} total leaves", - nav_index.doc_card().map(|c| c.sections.len()).unwrap_or(0), - nav_index.doc_card().map(|c| c.total_leaves).unwrap_or(0), - ); - } else { - debug!("[navigation_index] Phase 3: Skipped DocCard (no root entry)"); - } - - let duration = start.elapsed().as_millis() as u64; - - ctx.metrics - .record_navigation_index(duration, nav_entries_count, child_routes_count); - - info!( - "[navigation_index] Complete: {} nav entries, {} child routes in {}ms", - nav_entries_count, child_routes_count, duration, - ); - - ctx.navigation_index = Some(nav_index); - - let mut stage_result = StageResult::success("navigation_index"); - stage_result.duration_ms = duration; - stage_result.metadata.insert( - "nav_entries".to_string(), - serde_json::json!(nav_entries_count), - ); - stage_result.metadata.insert( - "child_routes".to_string(), - serde_json::json!(child_routes_count), - ); - - Ok(stage_result) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::document::DocumentTree; - - fn build_test_tree() -> DocumentTree { - let mut tree = DocumentTree::new("Root", "root content"); - let root = tree.root(); - - let sec1 = tree.add_child(root, "Section 1", "section 1 content"); - let _sec1_1 = tree.add_child(sec1, "Section 1.1", "s1.1 content"); - let _sec1_2 = tree.add_child(sec1, "Section 1.2", "s1.2 content"); - - let sec2 = tree.add_child(root, "Section 2", "section 2 content"); - let _sec2_1 = tree.add_child(sec2, "Section 2.1", 
"s2.1 content"); - - // Set some summaries - tree.set_summary(root, "A comprehensive guide"); - tree.set_summary(sec1, "Getting started with setup"); - tree.set_summary(sec2, "Advanced configuration"); - - tree - } - - #[test] - fn test_count_leaves() { - let tree = build_test_tree(); - let root = tree.root(); - - // Root has 3 leaves: 1.1, 1.2, 2.1 - assert_eq!(NavigationIndexStage::count_leaves(&tree, root), 3); - } - - #[test] - fn test_count_leaves_single_node() { - let tree = DocumentTree::new("Root", "content"); - let root = tree.root(); - - assert_eq!(NavigationIndexStage::count_leaves(&tree, root), 1); - } - - #[test] - fn test_build_nav_entry_with_summary() { - let tree = build_test_tree(); - let root = tree.root(); - - let entry = NavigationIndexStage::build_nav_entry(&tree, root, 3); - assert_eq!(entry.overview, "A comprehensive guide"); - assert_eq!(entry.leaf_count, 3); - assert_eq!(entry.level, 0); - } - - #[test] - fn test_build_nav_entry_without_summary() { - let tree = DocumentTree::new("Root", "content"); - let root = tree.root(); - - let entry = NavigationIndexStage::build_nav_entry(&tree, root, 1); - assert_eq!(entry.overview, "Root"); - } - - #[test] - fn test_build_child_route() { - let tree = build_test_tree(); - let root = tree.root(); - let children: Vec<_> = tree.children_iter(root).collect(); - - let route = NavigationIndexStage::build_child_route(&tree, children[0], 2); - assert_eq!(route.title, "Section 1"); - assert_eq!(route.leaf_count, 2); - } - - #[test] - fn test_stage_config() { - let stage = NavigationIndexStage::new(); - assert_eq!(stage.name(), "navigation_index"); - assert!(stage.is_optional()); - assert_eq!(stage.depends_on(), vec!["enrich"]); - - let ap = stage.access_pattern(); - assert!(ap.reads_tree); - assert!(ap.writes_navigation_index); - assert!(!ap.writes_tree); - assert!(!ap.writes_reasoning_index); - } - - #[tokio::test] - async fn test_execute_end_to_end() { - // Build a 3-level tree: Root -> [Sec1 -> [1.1, 1.2], Sec2 -> [2.1]] - let mut tree = DocumentTree::new("Root", "root content"); - let root = tree.root(); - let sec1 = tree.add_child(root, "Section 1", "s1 content"); - let _sec1_1 = tree.add_child(sec1, "Section 1.1", "s1.1 content"); - let _sec1_2 = tree.add_child(sec1, "Section 1.2", "s1.2 content"); - let sec2 = tree.add_child(root, "Section 2", "s2 content"); - let _sec2_1 = tree.add_child(sec2, "Section 2.1", "s2.1 content"); - - tree.set_summary(root, "A comprehensive guide"); - tree.set_summary(sec1, "Getting started"); - - // Build context with the tree - let mut ctx = IndexContext::new( - crate::index::pipeline::IndexInput::content("test"), - crate::index::config::PipelineOptions::default(), - ); - ctx.tree = Some(tree); - - // Execute the stage - let mut stage = NavigationIndexStage::new(); - let result = stage.execute(&mut ctx).await; - - assert!(result.is_ok()); - let stage_result = result.unwrap(); - assert!(stage_result.success); - assert_eq!( - stage_result.metadata["nav_entries"], - serde_json::json!(3) // root, sec1, sec2 - ); - assert_eq!( - stage_result.metadata["child_routes"], - serde_json::json!(5) // root→2 + sec1→2 + sec2→1 - ); - - // Verify the index structure - let nav_index = ctx.navigation_index.unwrap(); - assert_eq!(nav_index.entry_count(), 3); // 3 non-leaf nodes - assert_eq!(nav_index.total_child_routes(), 5); - - // Root entry - let root_id = ctx.tree.as_ref().unwrap().root(); - let root_entry = nav_index.get_entry(root_id).unwrap(); - assert_eq!(root_entry.overview, "A comprehensive guide"); - 
assert_eq!(root_entry.leaf_count, 3); - assert_eq!(root_entry.level, 0); - - // Root child routes - let root_routes = nav_index.get_child_routes(root_id).unwrap(); - assert_eq!(root_routes.len(), 2); - assert_eq!(root_routes[0].title, "Section 1"); - assert_eq!(root_routes[0].leaf_count, 2); - assert_eq!(root_routes[1].title, "Section 2"); - assert_eq!(root_routes[1].leaf_count, 1); - } - - #[tokio::test] - async fn test_execute_single_leaf_tree() { - // Single node = root is leaf → no non-leaf nodes → empty index - let tree = DocumentTree::new("Root", "content"); - - let mut ctx = IndexContext::new( - crate::index::pipeline::IndexInput::content("test"), - crate::index::config::PipelineOptions::default(), - ); - ctx.tree = Some(tree); - - let mut stage = NavigationIndexStage::new(); - let result = stage.execute(&mut ctx).await; - - assert!(result.is_ok()); - assert!(stage_result_is_success(&result)); - - let nav_index = ctx.navigation_index.unwrap(); - assert_eq!(nav_index.entry_count(), 0); - assert_eq!(nav_index.total_child_routes(), 0); - } - - #[tokio::test] - async fn test_execute_no_tree() { - let ctx = IndexContext::new( - crate::index::pipeline::IndexInput::content("test"), - crate::index::config::PipelineOptions::default(), - ); - // ctx.tree is None - - let mut stage = NavigationIndexStage::new(); - // Can't move ctx since tree is None, construct manually - let mut ctx = ctx; - ctx.tree = None; - - let result = stage.execute(&mut ctx).await.unwrap(); - assert!(!result.success); - assert!(ctx.navigation_index.is_none()); - } - - #[test] - fn test_build_child_route_no_summary_has_content() { - // Node with content but no summary → description = truncated content - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - let child = tree.add_child(root, "Child", "this is a long content string that exceeds 100 characters and should be truncated when used as a fallback description for the child route"); - - let route = NavigationIndexStage::build_child_route(&tree, child, 1); - assert_eq!(route.title, "Child"); - // description should be truncated content, not the full string - assert!(route.description.len() <= 100); - assert!(route.description.starts_with("this is a long")); - } - - #[test] - fn test_build_child_route_no_summary_no_content() { - // Node with neither summary nor content → description = title - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - let child = tree.add_child(root, "Orphan Section", ""); - // Clear any auto-generated content - tree.set_summary(child, ""); - - let route = NavigationIndexStage::build_child_route(&tree, child, 1); - assert_eq!(route.title, "Orphan Section"); - // Fallback: description = title when no summary and no content - assert_eq!(route.description, "Orphan Section"); - } - - #[test] - fn test_build_child_route_with_summary() { - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - let child = tree.add_child(root, "Child", "some content"); - tree.set_summary(child, "A concise summary"); - - let route = NavigationIndexStage::build_child_route(&tree, child, 1); - assert_eq!(route.description, "A concise summary"); - } - - #[test] - fn test_build_nav_entry_depth_tracking() { - // Verify that depth/level is correctly captured from the tree - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - let sec1 = tree.add_child(root, "S1", ""); - let sec1_1 = tree.add_child(sec1, "S1.1", "leaf"); - tree.set_summary(root, "Root overview"); - tree.set_summary(sec1, 
"Section overview"); - - let root_entry = NavigationIndexStage::build_nav_entry(&tree, root, 3); - assert_eq!(root_entry.level, 0); - - let sec1_entry = NavigationIndexStage::build_nav_entry(&tree, sec1, 1); - assert_eq!(sec1_entry.level, 1); - - // Leaf node should still return valid NavEntry if called - let leaf_entry = NavigationIndexStage::build_nav_entry(&tree, sec1_1, 1); - assert_eq!(leaf_entry.level, 2); - assert_eq!(leaf_entry.overview, "S1.1"); // no summary → fallback to title - } - - #[test] - fn test_count_leaves_subtree() { - // Verify leaf count is correct for a subtree, not the entire tree - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - let sec1 = tree.add_child(root, "S1", ""); - let _s1a = tree.add_child(sec1, "S1.A", "leaf"); - let _s1b = tree.add_child(sec1, "S1.B", "leaf"); - let _s1c = tree.add_child(sec1, "S1.C", "leaf"); - let sec2 = tree.add_child(root, "S2", ""); - let _s2a = tree.add_child(sec2, "S2.A", "leaf"); - - // sec1 subtree has 3 leaves - assert_eq!(NavigationIndexStage::count_leaves(&tree, sec1), 3); - // sec2 subtree has 1 leaf - assert_eq!(NavigationIndexStage::count_leaves(&tree, sec2), 1); - // root has 4 leaves total - assert_eq!(NavigationIndexStage::count_leaves(&tree, root), 4); - } - - /// Helper to check success without destructuring. - fn stage_result_is_success(result: &Result) -> bool { - result.as_ref().map(|r| r.success).unwrap_or(false) - } -} diff --git a/vectorless-core/vectorless/src/index/stages/optimize.rs b/vectorless-core/vectorless/src/index/stages/optimize.rs deleted file mode 100644 index 8186d494..00000000 --- a/vectorless-core/vectorless/src/index/stages/optimize.rs +++ /dev/null @@ -1,455 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Optimize stage - Optimize tree structure. - -use super::{AccessPattern, async_trait}; -use std::time::Instant; -use tracing::{debug, info}; - -use crate::document::NodeId; -use crate::error::Result; -use crate::index::pipeline::IndexContext; - -use super::{IndexStage, StageResult}; - -/// Optimize stage - optimizes tree structure. -pub struct OptimizeStage; - -impl OptimizeStage { - /// Create a new optimize stage. - pub fn new() -> Self { - Self - } - - /// Merge adjacent small leaf nodes that are siblings under the same parent. - /// - /// Only merges nodes that are both **leaves** (no children of their own). - /// Non-leaf nodes (section headings with subsections) are never merged, - /// even if their own content is empty. 
-    fn merge_small_leaves(
-        tree: &mut crate::document::DocumentTree,
-        min_tokens: usize,
-        metrics: &mut crate::index::IndexMetrics,
-    ) -> usize {
-        let mut merged_count = 0;
-
-        // Get all non-leaf nodes (parents whose children may be candidates)
-        let non_leaves: Vec<NodeId> = tree
-            .traverse()
-            .into_iter()
-            .filter(|id| !tree.is_leaf(*id))
-            .collect();
-
-        for parent_id in non_leaves {
-            let children = tree.children(parent_id);
-            if children.len() < 2 {
-                continue;
-            }
-
-            // Collect children info: only leaf nodes are merge candidates
-            let candidates: Vec<(NodeId, usize, bool)> = children
-                .iter()
-                .map(|&id| {
-                    let tokens = tree.get(id).and_then(|n| n.token_count).unwrap_or(0);
-                    let is_leaf = tree.is_leaf(id);
-                    (id, tokens, is_leaf)
-                })
-                .collect();
-
-            // Find pairs of adjacent small leaf siblings
-            let mut i = 0;
-            while i < candidates.len() - 1 {
-                let (curr_id, curr_tokens, curr_is_leaf) = candidates[i];
-                let (next_id, next_tokens, next_is_leaf) = candidates[i + 1];
-
-                // Both must be leaves with actual content, and both must be small
-                if curr_is_leaf
-                    && next_is_leaf
-                    && curr_tokens > 0
-                    && curr_tokens < min_tokens
-                    && next_tokens > 0
-                    && next_tokens < min_tokens
-                {
-                    // Merge next into current
-                    if let Some(next_node) = tree.get(next_id).cloned() {
-                        if let Some(curr) = tree.get_mut(curr_id) {
-                            if !next_node.content.is_empty() {
-                                if !curr.content.is_empty() {
-                                    curr.content.push_str("\n\n");
-                                }
-                                // Prefix with heading to preserve boundary
-                                curr.content.push_str(&format!(
-                                    "## {}\n{}",
-                                    next_node.title, next_node.content
-                                ));
-                            }
-                            curr.token_count = Some(curr.token_count.unwrap_or(0) + next_tokens);
-                        }
-                    }
-
-                    // Mark next as merged
-                    if let Some(node) = tree.get_mut(next_id) {
-                        node.title = format!("[MERGED: {}]", node.title);
-                        node.content.clear();
-                        node.token_count = Some(0);
-                    }
-
-                    merged_count += 1;
-                    metrics.increment_nodes_merged();
-                    i += 2; // Skip merged node
-                } else {
-                    i += 1;
-                }
-            }
-        }
-
-        merged_count
-    }
-
-    /// Remove empty intermediate nodes (skip root).
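-    ///
-    /// Note: truly removing nodes from the arena-backed tree is complex, so
-    /// (as the body below notes) candidates are only marked by prefixing
-    /// their title with `[EMPTY: ...]`.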
- fn remove_empty_nodes(tree: &mut crate::document::DocumentTree) -> usize { - let mut removed_count = 0; - let root = tree.root(); - - // Find non-root nodes with no content and only one child - let candidates: Vec = tree - .traverse() - .into_iter() - .filter(|id| { - // Skip root node - if *id == root { - return false; - } - if tree.is_leaf(*id) { - return false; - } - let children = tree.children(*id); - if children.len() != 1 { - return false; - } - if let Some(node) = tree.get(*id) { - node.content.trim().is_empty() - } else { - false - } - }) - .collect(); - - // Note: Actually removing nodes from arena tree is complex - // For now, we just mark them - for node_id in candidates { - if let Some(node) = tree.get_mut(node_id) { - node.title = format!("[EMPTY: {}]", node.title); - removed_count += 1; - } - } - - removed_count - } -} - -impl Default for OptimizeStage { - fn default() -> Self { - Self::new() - } -} - -#[async_trait] -impl IndexStage for OptimizeStage { - fn name(&self) -> &'static str { - "optimize" - } - - fn is_optional(&self) -> bool { - true - } - - fn depends_on(&self) -> Vec<&'static str> { - vec!["enrich", "navigation_index"] - } - - fn access_pattern(&self) -> AccessPattern { - AccessPattern { - reads_tree: true, - writes_tree: true, // merges small leaf nodes - ..Default::default() - } - } - - async fn execute(&mut self, ctx: &mut IndexContext) -> Result { - let start = Instant::now(); - - let config = &ctx.options.optimization; - if !config.enabled { - debug!("[optimize] Disabled, skipping"); - return Ok(StageResult::success("optimize")); - } - - let tree = ctx - .tree - .as_mut() - .ok_or_else(|| crate::Error::IndexBuild("Tree not built".to_string()))?; - - let node_count = tree.node_count(); - info!( - "[optimize] Starting: {} nodes, merge_threshold={}", - node_count, config.merge_leaf_threshold, - ); - - let mut merged_count = 0; - - // 1. Merge small leaves - if config.merge_leaf_threshold > 0 { - merged_count = - Self::merge_small_leaves(tree, config.merge_leaf_threshold, &mut ctx.metrics); - if merged_count > 0 { - debug!("[optimize] Merged {} small leaf nodes", merged_count); - } - } - - // 2. Remove empty intermediate nodes - let removed_count = Self::remove_empty_nodes(tree); - if removed_count > 0 { - debug!( - "[optimize] Marked {} empty intermediate nodes", - removed_count - ); - } - - let duration = start.elapsed().as_millis() as u64; - ctx.metrics.record_optimize(duration); - - info!( - "[optimize] Complete: {} merged, {} emptied in {}ms", - merged_count, removed_count, duration - ); - - let mut stage_result = StageResult::success("optimize"); - stage_result.duration_ms = duration; - stage_result - .metadata - .insert("nodes_merged".to_string(), serde_json::json!(merged_count)); - stage_result.metadata.insert( - "nodes_removed".to_string(), - serde_json::json!(removed_count), - ); - - Ok(stage_result) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::document::DocumentTree; - use crate::index::PipelineOptions; - use crate::index::pipeline::IndexContext; - use crate::index::pipeline::IndexInput; - - /// Create a tree with small leaf children under root for merge tests. 
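-    ///
-    /// Token counts are assigned manually; `merge_small_leaves` reads
-    /// `token_count` directly and never re-estimates it.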
- /// - /// ```text - /// Root - /// ├── Leaf A (50 tokens) - /// ├── Leaf B (30 tokens) ← should merge with Leaf A - /// ├── Leaf C (200 tokens) ← too large, not merged - /// └── Leaf D (40 tokens) ← no adjacent small sibling - /// ``` - fn make_merge_test_tree() -> DocumentTree { - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - - let a = tree.add_child(root, "Leaf A", "content A"); - let b = tree.add_child(root, "Leaf B", "content B"); - let c = tree.add_child(root, "Leaf C", "content C long"); - let d = tree.add_child(root, "Leaf D", "content D"); - - // Set token counts - if let Some(n) = tree.get_mut(a) { - n.token_count = Some(50); - } - if let Some(n) = tree.get_mut(b) { - n.token_count = Some(30); - } - if let Some(n) = tree.get_mut(c) { - n.token_count = Some(200); - } - if let Some(n) = tree.get_mut(d) { - n.token_count = Some(40); - } - - tree - } - - #[test] - fn test_merge_small_leaves_merges_adjacent_pair() { - let mut tree = make_merge_test_tree(); - let root = tree.root(); - let mut metrics = crate::index::pipeline::IndexMetrics::new(); - - // Threshold 100: Leaf A (50) and Leaf B (30) should merge - let merged = OptimizeStage::merge_small_leaves(&mut tree, 100, &mut metrics); - - assert_eq!(merged, 1); - assert_eq!(metrics.nodes_merged, 1); - - // Leaf B should be marked as merged - let children = tree.children(root); - let leaf_b = children.iter().find(|&&id| { - tree.get(id) - .map(|n| n.title.starts_with("[MERGED")) - .unwrap_or(false) - }); - assert!(leaf_b.is_some(), "Leaf B should be marked as merged"); - } - - #[test] - fn test_merge_small_leaves_nothing_above_threshold() { - let mut tree = make_merge_test_tree(); - let mut metrics = crate::index::pipeline::IndexMetrics::new(); - - // Threshold 10: all leaves are above this, nothing merges - let merged = OptimizeStage::merge_small_leaves(&mut tree, 10, &mut metrics); - assert_eq!(merged, 0); - } - - #[test] - fn test_merge_small_leaves_preserves_content() { - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - let a = tree.add_child(root, "A", "hello"); - let b = tree.add_child(root, "B", "world"); - if let Some(n) = tree.get_mut(a) { - n.token_count = Some(5); - } - if let Some(n) = tree.get_mut(b) { - n.token_count = Some(5); - } - - let mut metrics = crate::index::pipeline::IndexMetrics::new(); - let _ = OptimizeStage::merge_small_leaves(&mut tree, 100, &mut metrics); - - // Leaf A should now contain both contents with heading prefix - let a_node = tree.get(a).unwrap(); - assert!(a_node.content.contains("hello")); - assert!(a_node.content.contains("## B")); - assert!(a_node.content.contains("world")); - assert_eq!(a_node.token_count, Some(10)); - } - - #[test] - fn test_merge_small_leaves_skips_non_leaf() { - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - - // Section is a non-leaf (has a child), should not be merged even if small - let section = tree.add_child(root, "Section", "section content"); - let _sub = tree.add_child(section, "Sub", "sub content"); - let leaf = tree.add_child(root, "Leaf", "leaf content"); - - if let Some(n) = tree.get_mut(section) { - n.token_count = Some(5); - } - if let Some(n) = tree.get_mut(leaf) { - n.token_count = Some(5); - } - - let mut metrics = crate::index::pipeline::IndexMetrics::new(); - let merged = OptimizeStage::merge_small_leaves(&mut tree, 100, &mut metrics); - - // Section is non-leaf, only Leaf is a leaf — no adjacent pair of leaves - assert_eq!(merged, 0); - } - - #[test] - fn 
test_remove_empty_nodes_marks_single_child_empty() { - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - - // Empty intermediate with single child - let section = tree.add_child(root, "Section", ""); - let _leaf = tree.add_child(section, "Leaf", "content"); - - let removed = OptimizeStage::remove_empty_nodes(&mut tree); - assert_eq!(removed, 1); - - let section_node = tree.get(section).unwrap(); - assert!(section_node.title.starts_with("[EMPTY")); - } - - #[test] - fn test_remove_empty_nodes_skips_root() { - let mut tree = DocumentTree::new("Root", ""); - let _child = tree.add_child(tree.root(), "Child", "content"); - - let removed = OptimizeStage::remove_empty_nodes(&mut tree); - assert_eq!(removed, 0); - } - - #[test] - fn test_remove_empty_nodes_skips_leaves() { - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - let leaf = tree.add_child(root, "Leaf", ""); - - let removed = OptimizeStage::remove_empty_nodes(&mut tree); - assert_eq!(removed, 0, "Leaves should not be removed"); - - // Verify the leaf is indeed a leaf - assert!(tree.is_leaf(leaf)); - } - - #[test] - fn test_remove_empty_nodes_skips_multi_child() { - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - let section = tree.add_child(root, "Section", ""); - let _c1 = tree.add_child(section, "C1", "a"); - let _c2 = tree.add_child(section, "C2", "b"); - - let removed = OptimizeStage::remove_empty_nodes(&mut tree); - assert_eq!( - removed, 0, - "Nodes with multiple children should not be removed" - ); - } - - #[test] - fn test_remove_empty_nodes_skips_non_empty() { - let mut tree = DocumentTree::new("Root", ""); - let root = tree.root(); - let section = tree.add_child(root, "Section", "has content"); - let _leaf = tree.add_child(section, "Leaf", "content"); - - let removed = OptimizeStage::remove_empty_nodes(&mut tree); - assert_eq!(removed, 0); - } - - #[tokio::test] - async fn test_optimize_disabled_skips() { - let mut stage = OptimizeStage::new(); - assert_eq!(stage.name(), "optimize"); - assert!(stage.is_optional()); - assert_eq!(stage.depends_on(), vec!["enrich", "navigation_index"]); - - let mut options = PipelineOptions::default(); - options.optimization.enabled = false; - - let input = IndexInput::content("# Test\nHello"); - let mut ctx = IndexContext::new(input, options); - ctx.tree = Some(DocumentTree::new("Root", "content")); - - let result = stage.execute(&mut ctx).await.unwrap(); - assert!(result.success); - } - - #[test] - fn test_merge_small_leaves_empty_tree() { - let mut tree = DocumentTree::new("Root", ""); - let mut metrics = crate::index::pipeline::IndexMetrics::new(); - - let merged = OptimizeStage::merge_small_leaves(&mut tree, 100, &mut metrics); - assert_eq!(merged, 0, "Root with no children should merge nothing"); - } -} diff --git a/vectorless-core/vectorless/src/index/stages/parse.rs b/vectorless-core/vectorless/src/index/stages/parse.rs deleted file mode 100644 index 2eb8b6ae..00000000 --- a/vectorless-core/vectorless/src/index/stages/parse.rs +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Parse stage - Parse documents into raw nodes. 
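-//!
-//! # Usage sketch
-//!
-//! Illustrative only; input construction mirrors the `IndexInput` variants
-//! handled below, not a stable public API.
-//!
-//! ```rust,ignore
-//! let input = IndexInput::content("# Title\n\nBody text");
-//! let mut ctx = IndexContext::new(input, PipelineOptions::default());
-//!
-//! let mut stage = ParseStage::new(); // or ParseStage::with_llm_client(client)
-//! let result = stage.execute(&mut ctx).await?;
-//!
-//! assert!(result.success);
-//! // ctx.raw_nodes now holds the parsed nodes; ctx.format the detected format.
-//! ```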
-
-use super::async_trait;
-use std::time::Instant;
-use tracing::{debug, info};
-
-use crate::document::DocumentFormat;
-use crate::error::Result;
-
-use super::{IndexStage, StageResult};
-use crate::index::IndexMode;
-use crate::index::pipeline::{IndexContext, IndexInput};
-
-/// Parse stage - extracts raw nodes from documents.
-pub struct ParseStage {
-    /// Optional LLM client for PDF structure extraction.
-    llm_client: Option<crate::llm::LlmClient>,
-}
-
-impl ParseStage {
-    /// Create a new parse stage.
-    pub fn new() -> Self {
-        Self { llm_client: None }
-    }
-
-    /// Create a parse stage with an LLM client.
-    pub fn with_llm_client(client: crate::llm::LlmClient) -> Self {
-        Self {
-            llm_client: Some(client),
-        }
-    }
-
-    /// Detect document format from path and options.
-    fn detect_format(&self, ctx: &IndexContext) -> Result<DocumentFormat> {
-        match ctx.options.mode {
-            IndexMode::Auto => match &ctx.input {
-                IndexInput::File(path) => {
-                    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
-                    DocumentFormat::from_extension(ext)
-                        .ok_or_else(|| crate::Error::Parse(format!("Unknown format: {}", ext)))
-                }
-                IndexInput::Content { format, .. } => Ok(*format),
-                IndexInput::Bytes { format, .. } => Ok(*format),
-            },
-            IndexMode::Markdown => Ok(DocumentFormat::Markdown),
-            IndexMode::Pdf => Ok(DocumentFormat::Pdf),
-        }
-    }
-}
-
-impl Default for ParseStage {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[async_trait]
-impl IndexStage for ParseStage {
-    fn name(&self) -> &'static str {
-        "parse"
-    }
-
-    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
-        let start = Instant::now();
-
-        // Detect format
-        let format = self.detect_format(ctx)?;
-        ctx.format = format;
-
-        let input_type = match &ctx.input {
-            IndexInput::File(_) => "file",
-            IndexInput::Content { .. } => "content",
-            IndexInput::Bytes { .. } => "bytes",
-        };
-        info!(
-            "[parse] Starting: format={:?}, input={}, llm={}",
-            format,
-            input_type,
-            self.llm_client.is_some()
-        );
-
-        // Parse based on input type
-        let result = match &ctx.input {
-            IndexInput::File(path) => {
-                // Resolve path
-                let path = path.canonicalize().unwrap_or_else(|_| path.clone());
-                ctx.source_path = Some(path.clone());
-
-                // Extract name from file
-                ctx.name = path
-                    .file_stem()
-                    .and_then(|n| n.to_str())
-                    .unwrap_or("document")
-                    .to_string();
-
-                debug!("[parse] Reading file: {:?}", ctx.source_path);
-
-                // Parse directly
-                crate::index::parse::parse_file(&path, format, self.llm_client.clone()).await?
-            }
-            IndexInput::Content {
-                content,
-                name,
-                format,
-            } => {
-                // Set name
-                ctx.name = name.clone();
-
-                debug!("[parse] Parsing inline content ({} chars)", content.len());
-
-                // Parse content directly
-                crate::index::parse::parse_content(content, *format, self.llm_client.clone())
-                    .await?
-            }
-            IndexInput::Bytes { data, name, format } => {
-                // Set name
-                ctx.name = name.clone();
-
-                debug!("[parse] Parsing bytes ({} bytes)", data.len());
-
-                // Parse bytes
-                crate::index::parse::parse_bytes(data, *format, self.llm_client.clone()).await?
-            }
-        };
-
-        // Store results
-        ctx.raw_nodes = result.nodes;
-        ctx.metrics.set_nodes_processed(ctx.raw_nodes.len());
-
-        // Store metadata
-        if let Some(page_count) = result.meta.page_count {
-            ctx.page_count = Some(page_count);
-            debug!("[parse] Document has {} pages", page_count);
-        }
-        ctx.line_count = Some(result.meta.line_count);
-
-        if let Some(desc) = result.meta.description {
-            ctx.description = Some(desc);
-        }
-
-        let duration = start.elapsed().as_millis() as u64;
-        ctx.metrics.record_parse(duration);
-
-        info!(
-            "[parse] Complete: {} nodes from '{}' ({}ms)",
-            ctx.raw_nodes.len(),
-            ctx.name,
-            duration
-        );
-
-        let mut stage_result = StageResult::success("parse");
-        stage_result.duration_ms = duration;
-        stage_result.metadata.insert(
-            "node_count".to_string(),
-            serde_json::json!(ctx.raw_nodes.len()),
-        );
-        stage_result
-            .metadata
-            .insert("format".to_string(), serde_json::json!(format.extension()));
-
-        Ok(stage_result)
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/stages/reasoning.rs b/vectorless-core/vectorless/src/index/stages/reasoning.rs
deleted file mode 100644
index d30e1303..00000000
--- a/vectorless-core/vectorless/src/index/stages/reasoning.rs
+++ /dev/null
@@ -1,639 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Reasoning Index Stage - Build pre-computed reasoning index.
-//!
-//! This stage runs after EnrichStage (which generates descriptions and
-//! calculates metadata) and before OptimizeStage. It builds a
-//! [`ReasoningIndex`] from the document tree's TOC, summaries, and keywords.
-
-use std::collections::HashMap;
-use std::time::Instant;
-use tracing::{debug, info, warn};
-
-use crate::document::{
-    NodeId, ReasoningIndexBuilder, ReasoningIndexConfig, SectionSummary, SummaryShortcut,
-    TopicEntry,
-};
-use crate::error::Result;
-use crate::llm::LlmClient;
-use crate::scoring::extract_keywords;
-
-use super::async_trait;
-use super::{AccessPattern, IndexStage, StageResult};
-use crate::index::pipeline::IndexContext;
-
-/// Reasoning Index Stage - builds a pre-computed reasoning index from the document tree.
-///
-/// This stage creates a [`ReasoningIndex`] containing:
-/// - Topic-to-path mappings from titles and summaries
-/// - Summary shortcuts for high-frequency "overview" queries
-/// - Section map for fast ToC lookup
-pub struct ReasoningIndexStage {
-    config: ReasoningIndexConfig,
-}
-
-impl ReasoningIndexStage {
-    /// Create a new reasoning index stage with default config.
-    pub fn new() -> Self {
-        Self {
-            config: ReasoningIndexConfig::default(),
-        }
-    }
-
-    /// Create with custom config.
-    pub fn with_config(config: ReasoningIndexConfig) -> Self {
-        Self { config }
-    }
-
-    /// Extract keywords from a text, filtering by minimum length.
-    fn extract_node_keywords(text: &str, min_length: usize) -> Vec<String> {
-        extract_keywords(text)
-            .into_iter()
-            .filter(|k: &String| k.len() >= min_length)
-            .collect()
-    }
-
-    /// Build the topic-to-path mapping by extracting keywords from all nodes.
-    fn build_topic_paths(
-        tree: &crate::document::DocumentTree,
-        config: &ReasoningIndexConfig,
-    ) -> (HashMap<String, Vec<TopicEntry>>, usize) {
-        let mut keyword_nodes: HashMap<String, Vec<(NodeId, f32, usize)>> = HashMap::new();
-
-        // Walk all nodes and extract keywords from title + summary
-        for node_id in tree.traverse() {
-            if let Some(node) = tree.get(node_id) {
-                let title_keywords =
-                    Self::extract_node_keywords(&node.title, config.min_keyword_length);
-                let summary_keywords =
-                    Self::extract_node_keywords(&node.summary, config.min_keyword_length);
-                // Always extract from content — keywords can appear anywhere
-                let content_keywords =
-                    Self::extract_node_keywords(&node.content, config.min_keyword_length);
-
-                // Title keywords get higher weight (2.0), summary (1.5), content (1.0)
-                for kw in &title_keywords {
-                    keyword_nodes
-                        .entry(kw.clone())
-                        .or_default()
-                        .push((node_id, 2.0, node.depth));
-                }
-                for kw in &summary_keywords {
-                    keyword_nodes
-                        .entry(kw.clone())
-                        .or_default()
-                        .push((node_id, 1.5, node.depth));
-                }
-                for kw in &content_keywords {
-                    keyword_nodes
-                        .entry(kw.clone())
-                        .or_default()
-                        .push((node_id, 1.0, node.depth));
-                }
-            }
-        }
-
-        // Sort by keyword frequency (most common first) and trim to max_keyword_entries
-        let mut sorted_keywords: Vec<_> = keyword_nodes.into_iter().collect();
-        sorted_keywords.sort_by(|a, b| b.1.len().cmp(&a.1.len()));
-        sorted_keywords.truncate(config.max_keyword_entries);
-
-        let keyword_count = sorted_keywords.len();
-
-        // Build topic_paths: merge duplicate (keyword, node) pairs
-        let mut topic_paths: HashMap<String, Vec<TopicEntry>> = HashMap::new();
-
-        for (keyword, entries) in sorted_keywords {
-            // Merge duplicate node entries by summing weights
-            let mut merged: HashMap<NodeId, (f32, usize)> = HashMap::new();
-            for (node_id, weight, depth) in entries {
-                let entry = merged.entry(node_id).or_insert((0.0, depth));
-                entry.0 += weight;
-            }
-
-            // Normalize weights to 0.0-1.0 range
-            let max_weight = merged.values().map(|(w, _)| *w).fold(0.0_f32, f32::max);
-            let scale = if max_weight > 0.0 {
-                1.0 / max_weight
-            } else {
-                1.0
-            };
-
-            let mut topic_entries: Vec<TopicEntry> = merged
-                .into_iter()
-                .map(|(node_id, (weight, depth))| TopicEntry {
-                    node_id,
-                    weight: weight * scale,
-                    depth,
-                })
-                .collect();
-
-            topic_entries.sort_by(|a, b| {
-                b.weight
-                    .partial_cmp(&a.weight)
-                    .unwrap_or(std::cmp::Ordering::Equal)
-            });
-            topic_entries.truncate(config.max_topic_entries);
-
-            topic_paths.insert(keyword, topic_entries);
-        }
-
-        (topic_paths, keyword_count)
-    }
-
-    /// Build section map from depth-1 nodes.
-    fn build_section_map(tree: &crate::document::DocumentTree) -> HashMap<String, NodeId> {
-        let mut section_map = HashMap::new();
-        let root = tree.root();
-        for child_id in tree.children(root) {
-            if let Some(node) = tree.get(child_id) {
-                section_map.insert(node.title.to_lowercase(), child_id);
-                // Also index by structure index (e.g. "1", "2", "3")
-                if !node.structure.is_empty() {
-                    section_map.insert(node.structure.clone(), child_id);
-                }
-            }
-        }
-        section_map
-    }
-
-    /// Expand keywords with LLM-generated synonyms (single batch request).
-    ///
-    /// Sends all keywords to the LLM in one request and maps each to its
-    /// synonyms. Synonym entries inherit the same node mappings but with
-    /// a reduced weight (0.6x) to reflect the indirect match.
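To make the batch contract of the helper below concrete, here is a sketch of the round-trip with illustrative data; the keywords and synonyms are invented, and only the response shape (the `HashMap<String, Vec<String>>` that `complete_json` deserializes) comes from the code itself:

```rust
use std::collections::HashMap;

fn main() {
    // Hypothetical raw LLM response for two keywords (illustrative only).
    let raw = r#"{"ownership": ["possession", "control"], "borrowing": ["lending"]}"#;

    // complete_json::<HashMap<String, Vec<String>>> performs this step
    // internally; deserializing by hand here just shows the expected shape.
    let map: HashMap<String, Vec<String>> = serde_json::from_str(raw).unwrap();

    // Each synonym inherits its keyword's TopicEntry list at 0.6x weight,
    // so "possession" routes to the same nodes as "ownership", ranked lower.
    for (keyword, synonyms) in &map {
        for syn in synonyms {
            println!("{keyword} -> {syn} (weight x0.6)");
        }
    }
}
```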
-    async fn expand_synonyms(
-        topic_paths: &mut HashMap<String, Vec<TopicEntry>>,
-        llm_client: &LlmClient,
-        max_keywords: usize,
-    ) -> usize {
-        use std::collections::HashSet;
-
-        let existing_keys: HashSet<String> = topic_paths.keys().cloned().collect();
-        // Pick top keywords by entry count for synonym expansion
-        let mut ranked: Vec<(String, usize)> = topic_paths
-            .iter()
-            .map(|(k, v): (&String, &Vec<TopicEntry>)| (k.clone(), v.len()))
-            .collect();
-        ranked.sort_by(|a, b| b.1.cmp(&a.1));
-        ranked.truncate(max_keywords);
-
-        let keyword_count = ranked.len();
-        if keyword_count == 0 {
-            return 0;
-        }
-
-        tracing::info!(
-            "[reasoning_index] Expanding synonyms for {} keywords (single request)",
-            keyword_count,
-        );
-
-        // Snapshot the source entries for each keyword.
-        let source_entries: HashMap<String, Vec<TopicEntry>> = ranked
-            .iter()
-            .map(|(kw, _): &(String, usize)| {
-                (kw.clone(), topic_paths.get(kw).cloned().unwrap_or_default())
-            })
-            .collect();
-
-        let keywords: Vec<String> = ranked.into_iter().map(|(kw, _)| kw).collect();
-
-        let system = "You are a thesaurus assistant. For each keyword, provide up to 5 synonyms \
-            or related search terms. Return ONLY a valid JSON object mapping each keyword to an \
-            array of synonym strings. No explanation, no markdown.";
-        let user_prompt = format!(
-            "Keywords: {}\n\nReturn a JSON object: {{\"keyword\": [\"syn1\", \"syn2\"], ...}}",
-            keywords.join(", ")
-        );
-
-        let synonym_map: HashMap<String, Vec<String>> = match llm_client
-            .complete_json::<HashMap<String, Vec<String>>>(system, &user_prompt)
-            .await
-        {
-            Ok(map) => map
-                .into_iter()
-                .map(|(k, v): (String, Vec<String>)| (k.to_lowercase(), v))
-                .collect(),
-            Err(e) => {
-                tracing::warn!("[reasoning_index] Batch synonym expansion failed: {}", e);
-                return 0;
-            }
-        };
-
-        // Write results back
-        let mut synonym_count = 0;
-        for keyword in &keywords {
-            if let Some(synonyms) = synonym_map.get(keyword) {
-                if let Some(entries) = source_entries.get(keyword) {
-                    for syn in synonyms {
-                        let syn_clean = syn.trim().to_lowercase();
-                        if syn_clean.is_empty()
-                            || syn_clean.len() < 2
-                            || existing_keys.contains(&syn_clean)
-                        {
-                            continue;
-                        }
-                        let synonym_entries: Vec<TopicEntry> = entries
-                            .iter()
-                            .map(|e| TopicEntry {
-                                node_id: e.node_id,
-                                weight: e.weight * 0.6,
-                                depth: e.depth,
-                            })
-                            .collect();
-                        topic_paths.insert(syn_clean, synonym_entries);
-                        synonym_count += 1;
-                    }
-                }
-            }
-        }
-
-        synonym_count
-    }
-
-    /// Build summary shortcut from root and depth-1 nodes.
-    fn build_summary_shortcut(tree: &crate::document::DocumentTree) -> Option<SummaryShortcut> {
-        let root = tree.root();
-        let root_node = tree.get(root)?;
-
-        // Collect document summary from root
-        let document_summary = if !root_node.summary.is_empty() {
-            root_node.summary.clone()
-        } else {
-            // Fallback: concatenate depth-1 summaries
-            let mut parts = Vec::new();
-            for child_id in tree.children(root) {
-                if let Some(child) = tree.get(child_id) {
-                    if !child.summary.is_empty() {
-                        parts.push(format!("{}: {}", child.title, child.summary));
-                    }
-                }
-            }
-            parts.join("\n")
-        };
-
-        // Collect section summaries
-        let mut section_summaries = Vec::new();
-        for child_id in tree.children(root) {
-            if let Some(child) = tree.get(child_id) {
-                section_summaries.push(SectionSummary {
-                    node_id: child_id,
-                    title: child.title.clone(),
-                    summary: child.summary.clone(),
-                    depth: child.depth,
-                });
-            }
-        }
-
-        Some(SummaryShortcut {
-            root_node: root,
-            section_summaries,
-            document_summary,
-        })
-    }
-}
-
-impl Default for ReasoningIndexStage {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[async_trait]
-impl IndexStage for ReasoningIndexStage {
-    fn name(&self) -> &'static str {
-        "reasoning_index"
-    }
-
-    fn depends_on(&self) -> Vec<&'static str> {
-        vec!["enrich"]
-    }
-
-    fn is_optional(&self) -> bool {
-        true
-    }
-
-    fn access_pattern(&self) -> AccessPattern {
-        AccessPattern {
-            reads_tree: true,
-            writes_reasoning_index: true,
-            ..Default::default()
-        }
-    }
-
-    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
-        let start = Instant::now();
-
-        // Check if enabled via pipeline options
-        if !ctx.options.reasoning_index.enabled {
-            info!("[reasoning_index] Disabled, skipping");
-            return Ok(StageResult::success("reasoning_index"));
-        }
-
-        // Use stage config, overridden by pipeline options
-        let config = &ctx.options.reasoning_index;
-
-        let tree = match ctx.tree.as_ref() {
-            Some(t) => t,
-            None => {
-                warn!("[reasoning_index] No tree, cannot build index");
-                return Ok(StageResult::failure("reasoning_index", "Tree not built"));
-            }
-        };
-
-        info!(
-            "[reasoning_index] Starting: synonyms={}, summary_shortcut={}, max_keywords={}",
-            config.enable_synonym_expansion,
-            config.build_summary_shortcut,
-            config.max_keyword_entries,
-        );
-
-        // 1. Build topic-to-path mapping
-        let (mut topic_paths, keyword_count) = Self::build_topic_paths(tree, config);
-        let topic_count: usize = topic_paths
-            .values()
-            .map(|v: &Vec<TopicEntry>| v.len())
-            .sum();
-        debug!(
-            "[reasoning_index] Topic paths: {} keywords, {} entries",
-            keyword_count, topic_count
-        );
-
-        // 1b. Optional: expand keywords with LLM-generated synonyms
-        let synonym_count = if config.enable_synonym_expansion {
-            if let Some(ref llm_client) = ctx.llm_client {
-                let max_kw = (keyword_count / 4).max(20).min(100);
-                let count = Self::expand_synonyms(&mut topic_paths, llm_client, max_kw).await;
-                if count > 0 {
-                    info!("[reasoning_index] Expanded {} synonym keywords", count);
-                }
-                count
-            } else {
-                debug!("[reasoning_index] Synonym expansion enabled but no LLM client");
-                0
-            }
-        } else {
-            0
-        };
-
-        // 2. Build section map
-        let section_map = Self::build_section_map(tree);
-        debug!(
-            "[reasoning_index] Section map: {} entries",
-            section_map.len()
-        );
-
-        // 3. Build summary shortcut
-        let summary_shortcut = if config.build_summary_shortcut {
-            let shortcut = Self::build_summary_shortcut(tree);
-            if shortcut.is_some() {
-                debug!("[reasoning_index] Built summary shortcut");
-            }
-            shortcut
-        } else {
-            None
-        };
-
-        // 4. Assemble the reasoning index
-        let mut builder = ReasoningIndexBuilder::new();
-        for (keyword, entries) in topic_paths {
-            for entry in entries {
-                builder.add_topic_entry(&keyword, entry);
-            }
-        }
-        for (title, node_id) in section_map {
-            builder.add_section(&title, node_id);
-        }
-        if let Some(shortcut) = summary_shortcut {
-            builder = builder.summary_shortcut(shortcut);
-        }
-        builder.sort_and_trim(config.max_topic_entries);
-
-        let reasoning_index = builder.build();
-
-        let duration = start.elapsed().as_millis() as u64;
-        ctx.metrics
-            .record_reasoning_index(duration, topic_count, keyword_count);
-
-        info!(
-            "[reasoning_index] Complete: {} keywords, {} topics, {} sections, {} synonyms in {}ms",
-            keyword_count,
-            topic_count,
-            reasoning_index.section_count(),
-            synonym_count,
-            duration,
-        );
-
-        ctx.reasoning_index = Some(reasoning_index);
-
-        let mut stage_result = StageResult::success("reasoning_index");
-        stage_result.duration_ms = duration;
-        stage_result.metadata.insert(
-            "keywords_indexed".to_string(),
-            serde_json::json!(keyword_count),
-        );
-        stage_result
-            .metadata
-            .insert("topics_indexed".to_string(), serde_json::json!(topic_count));
-        stage_result.metadata.insert(
-            "synonyms_expanded".to_string(),
-            serde_json::json!(synonym_count),
-        );
-
-        Ok(stage_result)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_extract_node_keywords() {
-        let keywords =
-            ReasoningIndexStage::extract_node_keywords("Introduction to Machine Learning", 2);
-        assert!(keywords.contains(&"introduction".to_string()));
-        assert!(keywords.contains(&"machine".to_string()));
-        assert!(keywords.contains(&"learning".to_string()));
-    }
-
-    #[test]
-    fn test_extract_node_keywords_min_length() {
-        let keywords = ReasoningIndexStage::extract_node_keywords("A B CD", 2);
-        assert!(!keywords.contains(&"a".to_string()));
-        assert!(!keywords.contains(&"b".to_string()));
-        assert!(keywords.contains(&"cd".to_string()));
-    }
-
-    #[test]
-    fn test_stage_config_default() {
-        let stage = ReasoningIndexStage::new();
-        assert!(stage.config.enabled);
-        assert_eq!(stage.name(), "reasoning_index");
-        assert!(stage.is_optional());
-        assert_eq!(stage.depends_on(), vec!["enrich"]);
-    }
-
-    #[test]
-    fn test_build_topic_paths_basic() {
-        use crate::document::ReasoningIndexConfig;
-
-        let mut tree = crate::document::DocumentTree::new("Root", "");
-        let root = tree.root();
-        let c1 = tree.add_child(root, "Machine Learning Introduction", "");
-        let c2 = tree.add_child(root, "Deep Learning Methods", "");
-
-        // Set summaries for keyword extraction
-        if let Some(n) = tree.get_mut(c1) {
-            n.summary = "An overview of machine learning algorithms".to_string();
-        }
-        if let Some(n) = tree.get_mut(c2) {
-            n.summary = "Advanced deep learning techniques".to_string();
-        }
-
-        let config = ReasoningIndexConfig::default();
-        let (topic_paths, keyword_count) = ReasoningIndexStage::build_topic_paths(&tree, &config);
-
-        assert!(
-            keyword_count > 0,
-            "Should extract keywords from title + summary"
-        );
-        assert!(!topic_paths.is_empty(), "Should build topic paths");
-
-        // "learning" appears in both titles → should be a keyword
-        assert!(
-            topic_paths.contains_key("learning"),
-            "Expected 'learning' in topic paths, got: {:?}",
-            topic_paths.keys().collect::<Vec<_>>()
-        );
-    }
-
-    #[test]
-    fn test_build_topic_paths_weight_normalization() {
-        use crate::document::ReasoningIndexConfig;
-
-        let mut tree = crate::document::DocumentTree::new("Root", "");
-        let root = tree.root();
-        let _c1 = tree.add_child(root, "rust ownership",
"rust borrowing rules"); - - let config = ReasoningIndexConfig::default(); - let (topic_paths, _) = ReasoningIndexStage::build_topic_paths(&tree, &config); - - // All weights should be in 0.0-1.0 range - for entries in topic_paths.values() { - for entry in entries { - assert!( - entry.weight >= 0.0 && entry.weight <= 1.0, - "Weight {} out of [0, 1] range", - entry.weight - ); - } - } - } - - #[test] - fn test_build_topic_paths_respects_max_keyword_entries() { - use crate::document::ReasoningIndexConfig; - - let mut tree = crate::document::DocumentTree::new("Root", ""); - let root = tree.root(); - - // Create many children with unique keywords - for i in 0..50 { - let c = tree.add_child(root, &format!("Section {} Alpha Beta Gamma Delta", i), ""); - if let Some(n) = tree.get_mut(c) { - n.summary = format!("keywords unique{} special{} terms{}", i, i, i); - } - } - - let mut config = ReasoningIndexConfig::default(); - config.max_keyword_entries = 5; - let (topic_paths, keyword_count) = ReasoningIndexStage::build_topic_paths(&tree, &config); - - assert!( - keyword_count <= 5, - "Should respect max_keyword_entries, got {}", - keyword_count - ); - assert_eq!(topic_paths.len(), keyword_count); - } - - #[test] - fn test_build_section_map() { - let mut tree = crate::document::DocumentTree::new("Root", ""); - let root = tree.root(); - let c1 = tree.add_child(root, "Introduction", "content"); - let c2 = tree.add_child(root, "Methods", "content"); - - // Set structure indices - if let Some(n) = tree.get_mut(c1) { - n.structure = "1".to_string(); - } - if let Some(n) = tree.get_mut(c2) { - n.structure = "2".to_string(); - } - - let section_map = ReasoningIndexStage::build_section_map(&tree); - - // Should index by title (lowercase) and structure index - assert!(section_map.contains_key("introduction")); - assert!(section_map.contains_key("methods")); - assert!(section_map.contains_key("1")); - assert!(section_map.contains_key("2")); - assert_eq!(section_map.len(), 4); - } - - #[test] - fn test_build_summary_shortcut() { - let mut tree = crate::document::DocumentTree::new("Root", ""); - let root = tree.root(); - let c1 = tree.add_child(root, "S1", "summary 1"); - let c2 = tree.add_child(root, "S2", "summary 2"); - - // Set root summary (not content — build_summary_shortcut reads summary field) - if let Some(n) = tree.get_mut(root) { - n.summary = "root summary text".to_string(); - } - if let Some(n) = tree.get_mut(c1) { - n.summary = "first section summary".to_string(); - } - if let Some(n) = tree.get_mut(c2) { - n.summary = "second section summary".to_string(); - } - - let shortcut = ReasoningIndexStage::build_summary_shortcut(&tree); - assert!(shortcut.is_some()); - - let sc = shortcut.unwrap(); - assert_eq!(sc.root_node, root); - assert_eq!(sc.document_summary, "root summary text"); - assert_eq!(sc.section_summaries.len(), 2); - } - - #[test] - fn test_build_summary_shortcut_fallback_to_children() { - // Root has no summary → fallback to concatenating children - let mut tree = crate::document::DocumentTree::new("Root", ""); - let root = tree.root(); - let c1 = tree.add_child(root, "S1", ""); - let c2 = tree.add_child(root, "S2", ""); - - if let Some(n) = tree.get_mut(c1) { - n.summary = "child summary 1".to_string(); - } - if let Some(n) = tree.get_mut(c2) { - n.summary = "child summary 2".to_string(); - } - - let shortcut = ReasoningIndexStage::build_summary_shortcut(&tree); - assert!(shortcut.is_some()); - - let sc = shortcut.unwrap(); - assert!( - sc.document_summary.contains("child summary 1"), - 
"Fallback should include child summaries" - ); - assert!(sc.document_summary.contains("S1")); - } -} diff --git a/vectorless-core/vectorless/src/index/stages/split.rs b/vectorless-core/vectorless/src/index/stages/split.rs deleted file mode 100644 index 54fe3edd..00000000 --- a/vectorless-core/vectorless/src/index/stages/split.rs +++ /dev/null @@ -1,347 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Split stage - Break large leaf nodes into smaller ones. - -use std::time::Instant; -use tracing::{debug, info}; - -use crate::document::{DocumentTree, NodeId}; -use crate::error::Result; -use crate::utils::estimate_tokens; - -use super::{AccessPattern, IndexStage, StageResult, async_trait}; -use crate::index::config::SplitConfig; -use crate::index::pipeline::IndexContext; - -/// Split stage — breaks oversized leaf nodes into smaller children. -/// -/// When a leaf node exceeds the token limit, the stage searches for natural -/// split points (headings `\n#`, paragraph boundaries `\n\n`) and creates -/// child nodes from the resulting chunks. -/// -/// This stage runs after validate (priority 22) at priority 25. -pub struct SplitStage; - -impl SplitStage { - /// Create a new split stage. - pub fn new() -> Self { - Self - } - - /// Find natural split points in content. - /// - /// Returns byte offsets where the content can be split. - /// Prioritizes heading boundaries (`\n#`), then paragraph breaks (`\n\n`). - fn find_split_points(content: &str, max_tokens: usize) -> Vec { - let total_tokens = estimate_tokens(content); - if total_tokens <= max_tokens { - return Vec::new(); - } - - // Estimate how many parts we need - let estimated_parts = (total_tokens + max_tokens - 1) / max_tokens; - let target_size = content.len() / estimated_parts.max(1); - - let mut points = Vec::new(); - - // First pass: find heading boundaries - let mut last_split = 0; - for (i, line) in content.lines().enumerate() { - let byte_offset = line.as_ptr() as usize - content.as_ptr() as usize; - if i > 0 && line.starts_with('#') && byte_offset > last_split { - let chunk_tokens = estimate_tokens(&content[last_split..byte_offset]); - if chunk_tokens >= max_tokens / 2 { - points.push(byte_offset); - last_split = byte_offset; - } - } - } - - // If heading splits are sufficient, return them - if !points.is_empty() { - let approx_size = content.len() / (points.len() + 1); - if approx_size <= target_size * 2 { - return points; - } - } - - // Second pass: use paragraph boundaries - points.clear(); - let mut pos = 0; - for paragraph in content.split("\n\n") { - let para_end = pos + paragraph.len(); - if para_end > 0 && pos > 0 { - let chunk_tokens = - estimate_tokens(&content[points.last().copied().unwrap_or(0)..pos]); - if chunk_tokens >= max_tokens / 2 { - points.push(pos); - } - } - pos = para_end + 2; // skip "\n\n" - } - - // If still not enough split points, use approximate byte boundaries - if points.is_empty() { - let bytes_per_token = content.len().max(1) / total_tokens.max(1); - let target_bytes = max_tokens * bytes_per_token; - - let mut offset = target_bytes; - while offset < content.len() { - // Find the nearest newline - if let Some(nl_pos) = content[offset..].find('\n') { - points.push(offset + nl_pos); - } else { - break; - } - offset += target_bytes; - } - } - - points - } - - /// Split a single leaf node into children. - /// - /// Returns the number of new children created. 
-    fn split_leaf(tree: &mut DocumentTree, leaf_id: NodeId, max_tokens: usize) -> usize {
-        let content = match tree.get(leaf_id) {
-            Some(node) => node.content.clone(),
-            None => return 0,
-        };
-
-        let split_points = Self::find_split_points(&content, max_tokens);
-        if split_points.is_empty() {
-            return 0;
-        }
-
-        // Extract title for child naming
-        let parent_title = tree
-            .get(leaf_id)
-            .map(|n| n.title.clone())
-            .unwrap_or_default();
-
-        // Create chunks from split points
-        let mut chunks: Vec<&str> = Vec::new();
-        let mut prev = 0;
-        for &point in &split_points {
-            if point > prev {
-                chunks.push(&content[prev..point]);
-            }
-            prev = point;
-        }
-        if prev < content.len() {
-            chunks.push(&content[prev..]);
-        }
-
-        // Count only the children actually created (empty chunks are skipped)
-        let mut child_count = 0;
-        for (i, chunk) in chunks.into_iter().enumerate() {
-            let chunk_trimmed = chunk.trim();
-            if chunk_trimmed.is_empty() {
-                continue;
-            }
-
-            // Try to extract a title from the first line
-            let title = if chunk_trimmed.starts_with('#') {
-                chunk_trimmed
-                    .lines()
-                    .next()
-                    .unwrap_or("")
-                    .trim_start_matches('#')
-                    .trim()
-                    .to_string()
-            } else {
-                format!("{} (part {})", parent_title, i + 1)
-            };
-
-            let child_id = tree.add_child(leaf_id, &title, chunk_trimmed);
-            let token_count = estimate_tokens(chunk_trimmed);
-            tree.set_token_count(child_id, token_count);
-            child_count += 1;
-        }
-
-        // Clear parent's content (moved to children)
-        tree.set_content(leaf_id, "");
-        tree.set_token_count(leaf_id, 0);
-
-        child_count
-    }
-
-    /// Process all oversized leaf nodes in the tree.
-    fn split_tree(tree: &mut DocumentTree, config: &SplitConfig) -> usize {
-        if !config.enabled {
-            return 0;
-        }
-
-        // Collect leaves first to avoid borrow issues
-        let leaves: Vec<NodeId> = tree.leaves();
-        let mut total_split = 0;
-
-        for leaf_id in leaves {
-            // Check if this leaf exceeds the token limit
-            let token_count = tree.get(leaf_id).and_then(|n| n.token_count).unwrap_or(0);
-
-            // Use estimated tokens if no count set
-            let tokens = if token_count > 0 {
-                token_count
-            } else {
-                tree.get(leaf_id)
-                    .map(|n| estimate_tokens(&n.content))
-                    .unwrap_or(0)
-            };
-
-            if tokens > config.max_tokens_per_node {
-                let split_count = Self::split_leaf(tree, leaf_id, config.max_tokens_per_node);
-                if split_count > 0 {
-                    total_split += 1;
-                }
-            }
-        }
-
-        total_split
-    }
-}
-
-impl Default for SplitStage {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[async_trait]
-impl IndexStage for SplitStage {
-    fn name(&self) -> &'static str {
-        "split"
-    }
-
-    fn depends_on(&self) -> Vec<&'static str> {
-        vec!["build"]
-    }
-
-    fn is_optional(&self) -> bool {
-        true
-    }
-
-    fn access_pattern(&self) -> AccessPattern {
-        AccessPattern {
-            reads_tree: true,
-            writes_tree: true,
-            writes_reasoning_index: false,
-            writes_navigation_index: false,
-            writes_description: false,
-            writes_concepts: false,
-        }
-    }
-
-    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
-        let start = Instant::now();
-
-        let tree = match ctx.tree.as_mut() {
-            Some(t) => t,
-            None => {
-                info!("[split] No tree, skipping");
-                return Ok(StageResult::success("split"));
-            }
-        };
-
-        let config = &ctx.options.split;
-        if !config.enabled {
-            debug!("[split] Disabled, skipping");
-            return Ok(StageResult::success("split"));
-        }
-
-        info!(
-            "[split] Starting: max_tokens_per_node={}",
-            config.max_tokens_per_node
-        );
-
-        let node_count_before = tree.node_count();
-        let split_count = Self::split_tree(tree, config);
-        let node_count_after = tree.node_count();
-
-        let duration = start.elapsed().as_millis() as u64;
-        ctx.metrics.record_split(duration);
-        ctx.metrics.nodes_merged += split_count;
-
-        info!(
-            "[split] Complete: {} nodes split ({} → {} total) in {}ms",
-            split_count, node_count_before, node_count_after, duration
-        );
-
-        let mut stage_result = StageResult::success("split");
-        stage_result.duration_ms = duration;
-        stage_result
-            .metadata
-            .insert("nodes_split".to_string(), serde_json::json!(split_count));
-        stage_result.metadata.insert(
-            "node_count_before".to_string(),
-            serde_json::json!(node_count_before),
-        );
-        stage_result.metadata.insert(
-            "node_count_after".to_string(),
-            serde_json::json!(node_count_after),
-        );
-
-        Ok(stage_result)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_find_split_points_small_content() {
-        let content = "Hello world";
-        let points = SplitStage::find_split_points(content, 8000);
-        assert!(points.is_empty());
-    }
-
-    #[test]
-    fn test_find_split_points_heading_boundaries() {
-        let mut content = String::from("Introduction text that is long enough. ");
-        // Pad to exceed token limit
-        for _ in 0..500 {
-            content.push_str("This is some content. ");
-        }
-        content.push_str("\n## Section One\n");
-        for _ in 0..500 {
-            content.push_str("More content here. ");
-        }
-        content.push_str("\n## Section Two\n");
-        for _ in 0..500 {
-            content.push_str("Final content. ");
-        }
-
-        let points = SplitStage::find_split_points(&content, 200);
-        assert!(!points.is_empty());
-    }
-
-    #[test]
-    fn test_find_split_points_paragraph_boundaries() {
-        let mut content = String::new();
-        for i in 0..10 {
-            for _ in 0..100 {
-                content.push_str(&format!("Paragraph {} content. ", i));
-            }
-            content.push_str("\n\n");
-        }
-
-        let points = SplitStage::find_split_points(&content, 200);
-        assert!(!points.is_empty());
-    }
-
-    #[test]
-    fn test_split_tree_disabled() {
-        let mut tree = DocumentTree::new("Root", "");
-        let child = tree.add_child(
-            tree.root(),
-            "Big",
-            "Very long content here with lots of text that would normally exceed limits",
-        );
-        tree.set_token_count(child, 15000);
-
-        let config = SplitConfig::disabled();
-        let count = SplitStage::split_tree(&mut tree, &config);
-        assert_eq!(count, 0);
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/stages/validate.rs b/vectorless-core/vectorless/src/index/stages/validate.rs
deleted file mode 100644
index e0909521..00000000
--- a/vectorless-core/vectorless/src/index/stages/validate.rs
+++ /dev/null
@@ -1,365 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Validate stage - Verify tree integrity after build.
-
-use std::collections::HashSet;
-use std::time::Instant;
-use tracing::{debug, info, warn};
-
-use crate::error::Result;
-
-use super::{AccessPattern, IndexStage, StageResult, async_trait};
-use crate::index::pipeline::IndexContext;
-
-/// Maximum allowed tree depth.
-const MAX_DEPTH: usize = 20;
-
-/// Minimum token count ratio for parent vs children consistency check.
-/// A parent's token count should be at least `ratio` of the sum of its children.
-const MIN_PARENT_TOKEN_RATIO: f32 = 0.8;
-
-/// Minimum content length (in bytes) for a leaf to participate in duplicate
-/// detection; shorter leaves are skipped. Content counts as a duplicate on
-/// exact match of its trimmed text.
-const DUPLICATE_MIN_LENGTH: usize = 50;
-
-/// Validation issue severity.
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-enum Severity {
-    /// Warning — tree is usable but may have quality issues.
-    Warning,
-    /// Error — tree has structural problems.
-    Error,
-}
-
-/// A single validation issue found during tree inspection.
-#[derive(Debug, Clone)]
-struct ValidationIssue {
-    /// Severity level.
-    severity: Severity,
-    /// Human-readable description.
-    message: String,
-}
-
-/// Validate stage — checks tree integrity after build.
-///
-/// Validates:
-/// 1. Tree structural integrity (all nodes reachable from root)
-/// 2. Depth sanity (max depth < 20)
-/// 3. Empty title detection on leaf nodes
-/// 4. Token count consistency (parent >= sum of children)
-/// 5. Content duplication detection
-pub struct ValidateStage;
-
-impl ValidateStage {
-    /// Create a new validate stage.
-    pub fn new() -> Self {
-        Self
-    }
-
-    /// Run all validation checks and collect issues.
-    fn validate_tree(&self, ctx: &IndexContext) -> Vec<ValidationIssue> {
-        let tree = match ctx.tree.as_ref() {
-            Some(t) => t,
-            None => {
-                return vec![ValidationIssue {
-                    severity: Severity::Error,
-                    message: "No tree available for validation".to_string(),
-                }];
-            }
-        };
-
-        let mut issues = Vec::new();
-
-        Self::check_depth(tree, &mut issues);
-        Self::check_empty_titles(tree, &mut issues);
-        Self::check_token_consistency(tree, &mut issues);
-        Self::check_content_duplication(tree, &mut issues);
-
-        issues
-    }
-
-    /// Check that tree depth is reasonable.
-    fn check_depth(tree: &crate::document::DocumentTree, issues: &mut Vec<ValidationIssue>) {
-        let all_nodes = tree.traverse();
-        let max_depth = all_nodes
-            .iter()
-            .map(|&id| tree.depth(id))
-            .max()
-            .unwrap_or(0);
-
-        if max_depth > MAX_DEPTH {
-            issues.push(ValidationIssue {
-                severity: Severity::Warning,
-                message: format!(
-                    "Tree depth ({}) exceeds recommended maximum ({})",
-                    max_depth, MAX_DEPTH
-                ),
-            });
-        }
-    }
-
-    /// Check for leaf nodes with empty titles.
-    fn check_empty_titles(tree: &crate::document::DocumentTree, issues: &mut Vec<ValidationIssue>) {
-        let leaves = tree.leaves();
-        let mut empty_count = 0;
-
-        for &leaf_id in &leaves {
-            if let Some(node) = tree.get(leaf_id) {
-                if node.title.trim().is_empty() {
-                    empty_count += 1;
-                }
-            }
-        }
-
-        if empty_count > 0 {
-            issues.push(ValidationIssue {
-                severity: Severity::Warning,
-                message: format!("Found {} leaf nodes with empty titles", empty_count),
-            });
-        }
-    }
-
-    /// Check token count consistency: parent's tokens should be >= sum of children's.
-    fn check_token_consistency(
-        tree: &crate::document::DocumentTree,
-        issues: &mut Vec<ValidationIssue>,
-    ) {
-        let all_nodes = tree.traverse();
-        let mut inconsistent = 0;
-
-        for &node_id in &all_nodes {
-            let children: Vec<_> = tree.children(node_id);
-            if children.is_empty() {
-                continue;
-            }
-
-            let parent_tokens = tree.get(node_id).and_then(|n| n.token_count).unwrap_or(0);
-
-            let children_sum: usize = children
-                .iter()
-                .map(|&c| tree.get(c).and_then(|n| n.token_count).unwrap_or(0))
-                .sum();
-
-            // Parent should have at least some proportion of children's tokens
-            // (parent has its own content plus children, but after thinning this may vary)
-            if parent_tokens > 0
-                && children_sum > 0
-                && (parent_tokens as f32 / children_sum as f32) < MIN_PARENT_TOKEN_RATIO
-            {
-                // Only flag if both are non-trivial
-                if children_sum >= 100 {
-                    inconsistent += 1;
-                }
-            }
-        }
-
-        if inconsistent > 0 {
-            issues.push(ValidationIssue {
-                severity: Severity::Warning,
-                message: format!(
-                    "Found {} nodes with token counts significantly less than their children's sum",
-                    inconsistent
-                ),
-            });
-        }
-    }
-
-    /// Check for content duplication across leaf nodes.
-    fn check_content_duplication(
-        tree: &crate::document::DocumentTree,
-        issues: &mut Vec<ValidationIssue>,
-    ) {
-        let leaves = tree.leaves();
-        let mut seen: HashSet<u64> = HashSet::new();
-        let mut duplicate_count = 0;
-
-        for &leaf_id in &leaves {
-            if let Some(node) = tree.get(leaf_id) {
-                let content = node.content.trim();
-                if content.len() < DUPLICATE_MIN_LENGTH {
-                    continue;
-                }
-
-                // Simple hash of normalized content for duplicate detection
-                let hash = Self::simple_hash(content);
-                if !seen.insert(hash) {
-                    duplicate_count += 1;
-                }
-            }
-        }
-
-        if duplicate_count > 0 {
-            issues.push(ValidationIssue {
-                severity: Severity::Warning,
-                message: format!(
-                    "Found {} leaf nodes with duplicate content",
-                    duplicate_count
-                ),
-            });
-        }
-    }
-
-    /// Simple FNV-1a-like hash for duplicate detection.
-    /// Not cryptographic — just for grouping identical content.
-    fn simple_hash(s: &str) -> u64 {
-        let mut hash: u64 = 0xcbf29ce484222325;
-        for byte in s.bytes() {
-            hash ^= byte as u64;
-            hash = hash.wrapping_mul(0x100000001b3);
-        }
-        hash
-    }
-}
-
-impl Default for ValidateStage {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-#[async_trait]
-impl IndexStage for ValidateStage {
-    fn name(&self) -> &'static str {
-        "validate"
-    }
-
-    fn depends_on(&self) -> Vec<&'static str> {
-        vec!["build"]
-    }
-
-    fn is_optional(&self) -> bool {
-        true
-    }
-
-    fn access_pattern(&self) -> AccessPattern {
-        AccessPattern {
-            reads_tree: true,
-            writes_tree: false,
-            writes_reasoning_index: false,
-            writes_navigation_index: false,
-            writes_description: false,
-            writes_concepts: false,
-        }
-    }
-
-    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
-        let start = Instant::now();
-
-        let node_count = ctx.tree.as_ref().map(|t| t.node_count()).unwrap_or(0);
-        info!("[validate] Starting: {} nodes", node_count);
-
-        let issues = self.validate_tree(ctx);
-
-        let warnings = issues
-            .iter()
-            .filter(|i| i.severity == Severity::Warning)
-            .count();
-        let errors = issues
-            .iter()
-            .filter(|i| i.severity == Severity::Error)
-            .count();
-
-        // Log all issues
-        for issue in &issues {
-            match issue.severity {
-                Severity::Warning => warn!("[validate] {}", issue.message),
-                Severity::Error => warn!("[validate] ERROR: {}", issue.message),
-            }
-        }
-
-        if warnings == 0 && errors == 0 {
-            debug!("[validate] No issues found");
-        }
-
-        let duration = start.elapsed().as_millis() as u64;
-        ctx.metrics.record_validate(duration);
-
-        info!(
-            "[validate] Complete: {} warnings, {} errors in {}ms",
-            warnings, errors, duration
-        );
-
-        let mut stage_result = StageResult::success("validate");
-        stage_result.duration_ms = duration;
-        stage_result
-            .metadata
-            .insert("warnings".to_string(), serde_json::json!(warnings));
-        stage_result
-            .metadata
-            .insert("errors".to_string(), serde_json::json!(errors));
-
-        Ok(stage_result)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use crate::document::DocumentTree;
-
-    fn make_context_with_tree(tree: DocumentTree) -> IndexContext {
-        let input = crate::index::IndexInput::content("test");
-        let options = crate::index::config::PipelineOptions::default();
-        let mut ctx = IndexContext::new(input, options);
-        ctx.tree = Some(tree);
-        ctx
-    }
-
-    #[test]
-    fn test_validate_empty_tree() {
-        let tree = DocumentTree::new("Root", "");
-        let ctx = make_context_with_tree(tree);
-
-        let stage = ValidateStage::new();
-        let issues = stage.validate_tree(&ctx);
-
-        // Single root node is valid — no issues expected
-        assert!(issues.is_empty());
-    }
-
-    #[test]
-    fn test_validate_simple_tree() {
-        let mut tree
= DocumentTree::new("Root", "");
-        let child = tree.add_child(tree.root(), "Section 1", "Content of section 1");
-        tree.set_token_count(child, 100);
-
-        let ctx = make_context_with_tree(tree);
-
-        let stage = ValidateStage::new();
-        let issues = stage.validate_tree(&ctx);
-
-        assert!(issues.is_empty());
-    }
-
-    #[test]
-    fn test_validate_empty_title_warning() {
-        let mut tree = DocumentTree::new("Root", "");
-        let child = tree.add_child(tree.root(), "", "Some content here");
-        tree.set_token_count(child, 50);
-
-        let ctx = make_context_with_tree(tree);
-
-        let stage = ValidateStage::new();
-        let issues = stage.validate_tree(&ctx);
-
-        let warning_count = issues
-            .iter()
-            .filter(|i| i.message.contains("empty titles"))
-            .count();
-        assert_eq!(warning_count, 1);
-    }
-
-    #[test]
-    fn test_validate_no_tree_error() {
-        let input = crate::index::IndexInput::content("test");
-        let options = crate::index::config::PipelineOptions::default();
-        let ctx = IndexContext::new(input, options);
-
-        let stage = ValidateStage::new();
-        let issues = stage.validate_tree(&ctx);
-
-        assert_eq!(issues.len(), 1);
-        assert_eq!(issues[0].severity, Severity::Error);
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/stages/verify_ingest.rs b/vectorless-core/vectorless/src/index/stages/verify_ingest.rs
deleted file mode 100644
index 2d7125a6..00000000
--- a/vectorless-core/vectorless/src/index/stages/verify_ingest.rs
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Verify stage — validates ingest output reliability before persist.
-
-use tracing::{info, warn};
-
-use super::{AccessPattern, IndexStage};
-use crate::error::{Error, Result};
-use crate::index::pipeline::{IndexContext, StageResult};
-use super::async_trait;
-
-/// Verification stage — ensures ingest produced reliable output.
-///
-/// Checks:
-/// - Tree is present and non-empty (hard error on failure)
-/// - Document summary is non-empty (warning if missing)
-/// - At least one concept was extracted (warning if missing)
-///
-/// Only the structural check is fatal; quality issues are surfaced as
-/// warnings rather than silently dropped.
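A minimal sketch of the hard-failure path of the stage below, following the test conventions used elsewhere in this diff (the test name is mine):

```rust
#[tokio::test]
async fn verify_fails_without_tree() {
    // A context that never ran the build stage has no tree,
    // so verify must return a hard error rather than a warning.
    let input = crate::index::IndexInput::content("test");
    let options = crate::index::config::PipelineOptions::default();
    let mut ctx = IndexContext::new(input, options);

    let mut stage = VerifyStage;
    assert!(stage.execute(&mut ctx).await.is_err());
}
```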
-pub struct VerifyStage;
-
-#[async_trait]
-impl IndexStage for VerifyStage {
-    fn name(&self) -> &'static str {
-        "verify"
-    }
-
-    fn depends_on(&self) -> Vec<&'static str> {
-        vec!["concept_extraction"]
-    }
-
-    fn is_optional(&self) -> bool {
-        false
-    }
-
-    fn access_pattern(&self) -> AccessPattern {
-        AccessPattern {
-            reads_tree: true,
-            ..AccessPattern::default()
-        }
-    }
-
-    async fn execute(&mut self, ctx: &mut IndexContext) -> Result<StageResult> {
-        // Tree must exist and have nodes
-        let tree = ctx.tree.as_ref().ok_or_else(|| {
-            Error::InvalidStructure("document tree is missing".into())
-        })?;
-        let node_count = tree.node_count();
-        if node_count == 0 {
-            return Err(Error::InvalidStructure(
-                "tree has no nodes".into(),
-            ));
-        }
-
-        // Summary must be non-empty
-        let has_summary = ctx
-            .description
-            .as_ref()
-            .is_some_and(|s| !s.trim().is_empty());
-        if !has_summary {
-            warn!("[verify] Document summary is empty");
-        }
-
-        // Concepts must be present (warning only — non-fatal)
-        if ctx.concepts.is_empty() {
-            warn!("[verify] No concepts extracted from document");
-        }
-
-        info!(
-            "[verify] Passed: {} nodes, summary={}, concepts={}",
-            node_count,
-            has_summary,
-            ctx.concepts.len()
-        );
-
-        Ok(StageResult::success("verify"))
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/summary/full.rs b/vectorless-core/vectorless/src/index/summary/full.rs
deleted file mode 100644
index c9e76e33..00000000
--- a/vectorless-core/vectorless/src/index/summary/full.rs
+++ /dev/null
@@ -1,65 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Full summary strategy - generate summaries for all nodes.
-
-use crate::document::NodeId;
-use crate::llm::LlmClient;
-
-use super::{SummaryGenerator, SummaryStrategyConfig};
-
-/// Full summary strategy - generates summaries for all nodes.
-pub struct FullStrategy {
-    /// Summary generator.
-    generator: Box<dyn SummaryGenerator>,
-    /// Configuration.
-    config: SummaryStrategyConfig,
-}
-
-impl FullStrategy {
-    /// Create a new full strategy with LLM client.
-    pub fn new(client: LlmClient) -> Self {
-        Self {
-            generator: Box::new(super::LlmSummaryGenerator::new(client)),
-            config: SummaryStrategyConfig::default(),
-        }
-    }
-
-    /// Create with custom generator.
-    pub fn with_generator(generator: Box<dyn SummaryGenerator>) -> Self {
-        Self {
-            generator,
-            config: SummaryStrategyConfig::default(),
-        }
-    }
-
-    /// Set configuration.
-    pub fn with_config(mut self, config: SummaryStrategyConfig) -> Self {
-        self.config = config;
-        self
-    }
-
-    /// Check if a node should have a summary generated.
-    pub fn should_generate(&self, _node_id: NodeId, content_tokens: usize) -> bool {
-        // In full mode, generate for all nodes with content
-        content_tokens >= self.config.min_content_tokens
-    }
-
-    /// Generate a summary for content.
-    pub async fn generate(&self, title: &str, content: &str) -> crate::llm::LlmResult<String> {
-        self.generator.generate(title, content).await
-    }
-
-    /// Get the configuration.
-    pub fn config(&self) -> &SummaryStrategyConfig {
-        &self.config
-    }
-}
-
-impl std::fmt::Debug for FullStrategy {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("FullStrategy")
-            .field("config", &self.config)
-            .finish()
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/summary/lazy.rs b/vectorless-core/vectorless/src/index/summary/lazy.rs
deleted file mode 100644
index 6d9cadef..00000000
--- a/vectorless-core/vectorless/src/index/summary/lazy.rs
+++ /dev/null
@@ -1,153 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Lazy summary strategy - generate summaries on-demand at query time.
-
-use std::collections::HashMap;
-use std::sync::Arc;
-use tokio::sync::RwLock;
-
-use crate::llm::LlmClient;
-
-use super::{SummaryGenerator, SummaryStrategyConfig};
-
-/// Lazy summary strategy - generates summaries on-demand.
-///
-/// Summaries are generated when first requested and optionally cached
-/// for future use.
-pub struct LazyStrategy {
-    /// Summary generator.
-    generator: Arc<RwLock<Box<dyn SummaryGenerator>>>,
-    /// Cache of generated summaries (node_id -> summary).
-    cache: Arc<RwLock<HashMap<String, String>>>,
-    /// Whether to persist generated summaries.
-    persist: bool,
-    /// Configuration.
-    config: SummaryStrategyConfig,
-}
-
-impl LazyStrategy {
-    /// Create a new lazy strategy with LLM client.
-    pub fn new(client: LlmClient) -> Self {
-        Self {
-            generator: Arc::new(RwLock::new(Box::new(super::LlmSummaryGenerator::new(
-                client,
-            )))),
-            cache: Arc::new(RwLock::new(HashMap::new())),
-            persist: false,
-            config: SummaryStrategyConfig::default(),
-        }
-    }
-
-    /// Create with persistence enabled.
-    pub fn with_persist(client: LlmClient, persist: bool) -> Self {
-        Self {
-            generator: Arc::new(RwLock::new(Box::new(super::LlmSummaryGenerator::new(
-                client,
-            )))),
-            cache: Arc::new(RwLock::new(HashMap::new())),
-            persist,
-            config: SummaryStrategyConfig::default(),
-        }
-    }
-
-    /// Create with custom generator.
-    pub fn with_generator(generator: Box<dyn SummaryGenerator>) -> Self {
-        Self {
-            generator: Arc::new(RwLock::new(generator)),
-            cache: Arc::new(RwLock::new(HashMap::new())),
-            persist: false,
-            config: SummaryStrategyConfig::default(),
-        }
-    }
-
-    /// Set persistence mode.
-    pub fn with_persist_mode(mut self, persist: bool) -> Self {
-        self.persist = persist;
-        self
-    }
-
-    /// Set configuration.
-    pub fn with_config(mut self, config: SummaryStrategyConfig) -> Self {
-        self.config = config;
-        self
-    }
-
-    /// Check if a cached summary exists.
-    pub async fn has_cached(&self, node_id: &str) -> bool {
-        let cache = self.cache.read().await;
-        cache.contains_key(node_id)
-    }
-
-    /// Get a cached summary if available.
-    pub async fn get_cached(&self, node_id: &str) -> Option<String> {
-        let cache = self.cache.read().await;
-        cache.get(node_id).cloned()
-    }
-
-    /// Get or generate a summary.
-    ///
-    /// When persistence is enabled, a cached summary is returned if present
-    /// and newly generated summaries are written back to the cache;
-    /// otherwise every call generates a fresh summary.
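A sketch of how the method below behaves with persistence on, using a hypothetical stub generator (the `StubGenerator` type and test name are mine; `SummaryGenerator` and `LlmResult` are the crate types shown in this diff):

```rust
use async_trait::async_trait;
use crate::llm::LlmResult;

// Hypothetical stub generator for tests; not part of the crate.
struct StubGenerator;

#[async_trait]
impl SummaryGenerator for StubGenerator {
    async fn generate(&self, title: &str, _content: &str) -> LlmResult<String> {
        Ok(format!("stub summary of {title}"))
    }
}

#[tokio::test]
async fn lazy_strategy_caches_when_persist_enabled() {
    let strategy = LazyStrategy::with_generator(Box::new(StubGenerator))
        .with_persist_mode(true);

    let s1 = strategy.get_or_generate("n1", "Intro", "text").await.unwrap();
    assert_eq!(s1, "stub summary of Intro");

    // The second call for "n1" is served from the cache.
    assert!(strategy.has_cached("n1").await);
    assert_eq!(strategy.cache_size().await, 1);
}
```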
-    pub async fn get_or_generate(
-        &self,
-        node_id: &str,
-        title: &str,
-        content: &str,
-    ) -> crate::llm::LlmResult<String> {
-        // Check cache first
-        if self.persist {
-            if let Some(cached) = self.get_cached(node_id).await {
-                return Ok(cached);
-            }
-        }
-
-        // Generate new summary
-        let generator = self.generator.read().await;
-        let summary = generator.generate(title, content).await?;
-
-        // Cache if persistence is enabled
-        if self.persist {
-            let mut cache = self.cache.write().await;
-            cache.insert(node_id.to_string(), summary.clone());
-        }
-
-        Ok(summary)
-    }
-
-    /// Pre-populate the cache with existing summaries.
-    pub async fn populate_cache(&self, summaries: HashMap<String, String>) {
-        let mut cache = self.cache.write().await;
-        cache.extend(summaries);
-    }
-
-    /// Clear the cache.
-    pub async fn clear_cache(&self) {
-        let mut cache = self.cache.write().await;
-        cache.clear();
-    }
-
-    /// Get cache size.
-    pub async fn cache_size(&self) -> usize {
-        let cache = self.cache.read().await;
-        cache.len()
-    }
-
-    /// Check if persistence is enabled.
-    pub fn is_persist_enabled(&self) -> bool {
-        self.persist
-    }
-
-    /// Get the configuration.
-    pub fn config(&self) -> &SummaryStrategyConfig {
-        &self.config
-    }
-}
-
-impl std::fmt::Debug for LazyStrategy {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("LazyStrategy")
-            .field("persist", &self.persist)
-            .field("config", &self.config)
-            .finish()
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/summary/mod.rs b/vectorless-core/vectorless/src/index/summary/mod.rs
deleted file mode 100644
index f87593d0..00000000
--- a/vectorless-core/vectorless/src/index/summary/mod.rs
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Summary generation strategies.
-//!
-//! This module provides different strategies for generating summaries:
-//! - [`SummaryStrategy`] - Configuration for summary generation
-//! - [`SummaryStrategyConfig`] - Configuration options
-//! - [`SummaryGenerator`] - Trait for summary generation
-//! - [`LlmSummaryGenerator`] - LLM-based implementation
-//!
-//! # Strategies
-//!
-//! - **None**: No summary generation
-//! - **Full**: Generate summaries for all nodes (default)
-//! - **Selective**: Generate summaries only for qualifying nodes
-//! - **Lazy**: Generate summaries on-demand at query time
-
-mod full;
-mod lazy;
-mod selective;
-mod strategy;
-
-pub use strategy::{LlmSummaryGenerator, SummaryGenerator, SummaryStrategy, SummaryStrategyConfig};
diff --git a/vectorless-core/vectorless/src/index/summary/selective.rs b/vectorless-core/vectorless/src/index/summary/selective.rs
deleted file mode 100644
index 18c8946e..00000000
--- a/vectorless-core/vectorless/src/index/summary/selective.rs
+++ /dev/null
@@ -1,120 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Selective summary strategy - generate summaries only for qualifying nodes.
-
-use crate::document::{DocumentTree, NodeId};
-use crate::llm::LlmClient;
-
-use super::{SummaryGenerator, SummaryStrategyConfig};
-
-/// Selective summary strategy - generates summaries only for nodes that meet criteria.
-pub struct SelectiveStrategy {
-    /// Summary generator.
-    generator: Box<dyn SummaryGenerator>,
-    /// Minimum token threshold.
-    min_tokens: usize,
-    /// Only generate for branch nodes (non-leaves).
-    branch_only: bool,
-    /// Configuration.
-    config: SummaryStrategyConfig,
-}
-
-impl SelectiveStrategy {
-    /// Create a new selective strategy with default settings.
-    pub fn new(client: LlmClient) -> Self {
-        Self {
-            generator: Box::new(super::LlmSummaryGenerator::new(client)),
-            min_tokens: 100,
-            branch_only: true,
-            config: SummaryStrategyConfig::default(),
-        }
-    }
-
-    /// Create with custom thresholds.
-    pub fn with_thresholds(client: LlmClient, min_tokens: usize, branch_only: bool) -> Self {
-        Self {
-            generator: Box::new(super::LlmSummaryGenerator::new(client)),
-            min_tokens,
-            branch_only,
-            config: SummaryStrategyConfig::default(),
-        }
-    }
-
-    /// Create with custom generator.
-    pub fn with_generator(generator: Box<dyn SummaryGenerator>) -> Self {
-        Self {
-            generator,
-            min_tokens: 100,
-            branch_only: true,
-            config: SummaryStrategyConfig::default(),
-        }
-    }
-
-    /// Set minimum token threshold.
-    pub fn with_min_tokens(mut self, min_tokens: usize) -> Self {
-        self.min_tokens = min_tokens;
-        self
-    }
-
-    /// Set branch-only mode.
-    pub fn with_branch_only(mut self, branch_only: bool) -> Self {
-        self.branch_only = branch_only;
-        self
-    }
-
-    /// Set configuration.
-    pub fn with_config(mut self, config: SummaryStrategyConfig) -> Self {
-        self.config = config;
-        self
-    }
-
-    /// Check if a node should have a summary generated.
-    pub fn should_generate(
-        &self,
-        tree: &DocumentTree,
-        node_id: NodeId,
-        token_count: usize,
-    ) -> bool {
-        // Check token threshold
-        let enough_tokens = token_count >= self.min_tokens;
-
-        // Check if branch-only
-        if self.branch_only {
-            let is_branch = !tree.is_leaf(node_id);
-            is_branch && enough_tokens
-        } else {
-            enough_tokens
-        }
-    }
-
-    /// Generate a summary for content.
-    pub async fn generate(&self, title: &str, content: &str) -> crate::llm::LlmResult<String> {
-        self.generator.generate(title, content).await
-    }
-
-    /// Get the minimum token threshold.
-    pub fn min_tokens(&self) -> usize {
-        self.min_tokens
-    }
-
-    /// Check if branch-only mode is enabled.
-    pub fn is_branch_only(&self) -> bool {
-        self.branch_only
-    }
-
-    /// Get the configuration.
-    pub fn config(&self) -> &SummaryStrategyConfig {
-        &self.config
-    }
-}
-
-impl std::fmt::Debug for SelectiveStrategy {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("SelectiveStrategy")
-            .field("min_tokens", &self.min_tokens)
-            .field("branch_only", &self.branch_only)
-            .field("config", &self.config)
-            .finish()
-    }
-}
diff --git a/vectorless-core/vectorless/src/index/summary/strategy.rs b/vectorless-core/vectorless/src/index/summary/strategy.rs
deleted file mode 100644
index 7937aa74..00000000
--- a/vectorless-core/vectorless/src/index/summary/strategy.rs
+++ /dev/null
@@ -1,322 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Summary generation strategies.
-
-use async_trait::async_trait;
-
-use crate::document::{DocumentTree, NodeId};
-use crate::llm::memo::{MemoKey, MemoStore, MemoValue};
-use crate::llm::{LlmClient, LlmResult};
-use crate::utils::fingerprint::Fingerprint;
-
-/// Configuration for summary strategies.
-#[derive(Debug, Clone)]
-pub struct SummaryStrategyConfig {
-    /// Maximum tokens for a summary.
-    pub max_tokens: usize,
-
-    /// Minimum content tokens to generate summary.
-    pub min_content_tokens: usize,
-
-    /// Whether to persist lazy-generated summaries.
-    pub persist_lazy: bool,
-
-    /// Token threshold below which the original content is used as summary
-    /// instead of calling LLM. Saves API cost for short, self-contained nodes.
-    /// Set to 0 to always call LLM.
-    pub shortcut_threshold: usize,
-}
-
-impl Default for SummaryStrategyConfig {
-    fn default() -> Self {
-        Self {
-            max_tokens: 200,
-            min_content_tokens: 50,
-            persist_lazy: false,
-            shortcut_threshold: 50,
-        }
-    }
-}
-
-/// Strategy for generating summaries.
-#[derive(Debug, Clone)]
-pub enum SummaryStrategy {
-    /// No summary generation.
-    None,
-
-    /// Generate for all nodes.
-    Full {
-        /// Strategy configuration.
-        config: SummaryStrategyConfig,
-    },
-
-    /// Generate selectively.
-    Selective {
-        /// Minimum tokens threshold.
-        min_tokens: usize,
-
-        /// Only generate for branch nodes (non-leaves).
-        branch_only: bool,
-
-        /// Strategy configuration.
-        config: SummaryStrategyConfig,
-    },
-
-    /// Generate on-demand at query time.
-    Lazy {
-        /// Whether to persist generated summaries.
-        persist: bool,
-
-        /// Strategy configuration.
-        config: SummaryStrategyConfig,
-    },
-}
-
-impl Default for SummaryStrategy {
-    fn default() -> Self {
-        Self::Full {
-            config: SummaryStrategyConfig::default(),
-        }
-    }
-}
-
-impl SummaryStrategy {
-    /// Create a "none" strategy.
-    pub fn none() -> Self {
-        Self::None
-    }
-
-    /// Create a "full" strategy.
-    pub fn full() -> Self {
-        Self::Full {
-            config: SummaryStrategyConfig::default(),
-        }
-    }
-
-    /// Create a "selective" strategy.
-    pub fn selective(min_tokens: usize, branch_only: bool) -> Self {
-        Self::Selective {
-            min_tokens,
-            branch_only,
-            config: SummaryStrategyConfig::default(),
-        }
-    }
-
-    /// Create a "lazy" strategy.
-    pub fn lazy(persist: bool) -> Self {
-        Self::Lazy {
-            persist,
-            config: SummaryStrategyConfig::default(),
-        }
-    }
-
-    /// Check if we should generate a summary for a node.
-    pub fn should_generate(
-        &self,
-        tree: &DocumentTree,
-        node_id: NodeId,
-        token_count: usize,
-    ) -> bool {
-        match self {
-            Self::None => false,
-            Self::Full { .. } => token_count > 0,
-            Self::Selective {
-                min_tokens,
-                branch_only,
-                ..
-            } => {
-                let is_branch = !tree.is_leaf(node_id);
-                let enough_tokens = token_count >= *min_tokens;
-
-                if *branch_only {
-                    is_branch && enough_tokens
-                } else {
-                    enough_tokens
-                }
-            }
-            Self::Lazy { .. } => false, // Generated on-demand
-        }
-    }
-
-    /// Check if lazy strategy is enabled.
-    pub fn is_lazy(&self) -> bool {
-        matches!(self, Self::Lazy { .. })
-    }
-
-    /// Get the config.
-    pub fn config(&self) -> SummaryStrategyConfig {
-        match self {
-            Self::None => SummaryStrategyConfig::default(),
-            Self::Full { config } => config.clone(),
-            Self::Selective { config, .. } => config.clone(),
-            Self::Lazy { config, .. } => config.clone(),
-        }
-    }
-
-    /// Get the shortcut threshold (tokens below which content is used as-is).
-    pub fn shortcut_threshold(&self) -> usize {
-        self.config().shortcut_threshold
-    }
-}
-
-/// Summary generator trait.
-#[async_trait]
-pub trait SummaryGenerator: Send + Sync {
-    /// Generate a summary for the given content.
-    async fn generate(&self, title: &str, content: &str) -> LlmResult<String>;
-
-    /// Generate a summary with leaf/non-leaf context.
-    /// Non-leaf nodes get a navigation-oriented prompt ("what does this section cover"),
-    /// leaf nodes get a content-oriented prompt ("what does this section say").
-    async fn generate_for_node(
-        &self,
-        title: &str,
-        content: &str,
-        is_leaf: bool,
-    ) -> LlmResult<String> {
-        let _ = is_leaf;
-        self.generate(title, content).await
-    }
-}
-
-/// LLM-based summary generator.
-pub struct LlmSummaryGenerator {
-    client: LlmClient,
-    max_tokens: usize,
-    /// Optional memo store for caching results.
-    memo_store: Option<MemoStore>,
-}
-
-impl LlmSummaryGenerator {
-    /// Create a new summary generator.
-    pub fn new(client: LlmClient) -> Self {
-        Self {
-            client,
-            max_tokens: 200,
-            memo_store: None,
-        }
-    }
-
-    /// Set max tokens.
-    pub fn with_max_tokens(mut self, max_tokens: usize) -> Self {
-        self.max_tokens = max_tokens;
-        self
-    }
-
-    /// Set memo store for caching.
-    pub fn with_memo_store(mut self, store: MemoStore) -> Self {
-        self.memo_store = Some(store);
-        self
-    }
-}
-
-#[async_trait]
-impl SummaryGenerator for LlmSummaryGenerator {
-    async fn generate(&self, title: &str, content: &str) -> LlmResult<String> {
-        // Compute content fingerprint for cache key
-        let content_fp = Fingerprint::from_str(&format!("{}|{}", title, content));
-        let memo_key = MemoKey::summary(&content_fp);
-
-        // Check memo store first
-        if let Some(ref store) = self.memo_store {
-            if let Some(cached) = store.get(&memo_key) {
-                if let Some(summary) = cached.as_summary() {
-                    tracing::debug!("Memo cache hit for summary: {}", title);
-                    return Ok(summary.to_string());
-                }
-            }
-        }
-
-        // Generate with LLM
-        let system_prompt = "You are a document summarization assistant. \
-            Generate a concise summary (2-3 sentences) of the given section. \
-            Focus on the main topics and key information. \
-            Respond with only the summary, no additional text.";
-
-        let user_prompt = format!("Title: {}\n\nContent:\n{}", title, content);
-
-        let summary = self
-            .client
-            .complete_with_max_tokens(&system_prompt, &user_prompt, self.max_tokens as u16)
-            .await?;
-
-        // Cache the result
-        if let Some(ref store) = self.memo_store {
-            // Estimate tokens saved (roughly: input + output tokens)
-            let tokens_saved = (title.len() + content.len() + summary.len()) / 4;
-            store.put_with_tokens(
-                memo_key,
-                MemoValue::Summary(summary.clone()),
-                tokens_saved as u64,
-            );
-            tracing::debug!("Memo cache stored for summary: {}", title);
-        }
-
-        Ok(summary)
-    }
-
-    async fn generate_for_node(
-        &self,
-        title: &str,
-        content: &str,
-        is_leaf: bool,
-    ) -> LlmResult<String> {
-        // Compute content fingerprint for cache key (include leaf flag)
-        let content_fp = Fingerprint::from_str(&format!("{}|{}|leaf={}", title, content, is_leaf));
-        let memo_key = MemoKey::summary(&content_fp);
-
-        // Check memo store first
-        if let Some(ref store) = self.memo_store {
-            if let Some(cached) = store.get(&memo_key) {
-                if let Some(summary) = cached.as_summary() {
-                    tracing::debug!("Memo cache hit for summary: {}", title);
-                    return Ok(summary.to_string());
-                }
-            }
-        }
-
-        // Choose prompt based on node type
-        let system_prompt = if is_leaf {
-            // Leaf nodes: content-oriented — "what does this section say"
-            "You are a document summarization assistant. \
-             Generate a concise summary (2-3 sentences) of the given section's content. \
-             Focus on the key information and facts presented. \
-             Respond with only the summary, no additional text."
-        } else {
-            // Non-leaf (branch) nodes: navigation-oriented with structured output.
-            // Produces OVERVIEW, QUESTIONS, and TAGS sections that EnhanceStage parses.
-            "You are a document navigation assistant. \
-             Generate a structured overview of this section for navigation purposes.
\ - Respond in EXACTLY this format (one section per line):\n\ - OVERVIEW: <2-3 sentence description of what topics this section covers>\n\ - QUESTIONS: \n\ - TAGS: " - }; - - let user_prompt = if is_leaf { - format!("Title: {}\n\nContent:\n{}", title, content) - } else { - // For non-leaf nodes, include children info for better routing summaries - format!("Title: {}\n\nContent:\n{}", title, content) - }; - - let summary = self - .client - .complete_with_max_tokens(&system_prompt, &user_prompt, self.max_tokens as u16) - .await?; - - // Cache the result - if let Some(ref store) = self.memo_store { - let tokens_saved = (title.len() + content.len() + summary.len()) / 4; - store.put_with_tokens( - memo_key, - MemoValue::Summary(summary.clone()), - tokens_saved as u64, - ); - tracing::debug!("Memo cache stored for summary: {}", title); - } - - Ok(summary) - } -} diff --git a/vectorless-core/vectorless/src/lib.rs b/vectorless-core/vectorless/src/lib.rs deleted file mode 100644 index d2f75e6e..00000000 --- a/vectorless-core/vectorless/src/lib.rs +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -#![allow(dead_code)] - -//! # Vectorless -//! -//! A Document Understanding Engine for AI. -//! -//! It compiles documents into structured trees of meaning, then dispatches -//! multiple agents to reason through headings, sections, and paragraphs. -//! -//! ## Quick Start -//! -//! ```rust,no_run -//! use vectorless::{EngineBuilder, IngestInput}; -//! -//! #[tokio::main] -//! async fn main() -> Result<(), Box> { -//! let engine = EngineBuilder::new() -//! .with_key("sk-...") -//! .with_model("gpt-4o") -//! .with_endpoint("https://api.openai.com/v1") -//! .build() -//! .await?; -//! -//! // Understand a document -//! let doc = engine.ingest(IngestInput::Path("./report.pdf".into())).await?; -//! println!("{}: {}", doc.name, doc.summary); -//! -//! // Ask a question -//! let answer = engine.ask("What is the total revenue?", &[doc.doc_id.clone()]).await?; -//! println!("{}", answer.content); -//! -//! Ok(()) -//! } -//! ``` - -// ── Modules ────────────────────────────────────────────────────────────────── - -mod agent; -mod client; -mod config; -mod document; -mod error; -mod events; -mod graph; -mod metrics; - -mod index; -mod llm; -mod query; -mod rerank; -mod retrieval; -mod scoring; -mod storage; -mod utils; - -// ── Public API ─────────────────────────────────────────────────────────────── - -// Client -pub use client::{BuildError, Engine, EngineBuilder}; - -// Config -pub use config::Config; - -// Documents (understanding types) -pub use document::{ - Answer, Concept, Document, DocumentInfo, DocumentStructure, DocumentTree, Evidence, - IngestInput, NodeId, ReasoningIndexConfig, ReasoningTrace, StructureNode, TocConfig, - TocEntry, TocNode, TocView, TraceStep, TreeNode, -}; - -// Graph -pub use graph::{DocumentGraph, DocumentGraphNode, EdgeEvidence, GraphEdge, WeightedKeyword}; - -// Events -pub use events::{EventEmitter, IndexEvent, QueryEvent, WorkspaceEvent}; - -// Metrics -pub use metrics::{IndexMetrics, LlmMetricsReport, MetricsReport, RetrievalMetricsReport}; - -// Errors -pub use error::{Error, Result}; - -/// Test-only utilities. -/// -/// **Do not use in production code.** This module exposes helpers for writing -/// integration tests without a real LLM endpoint. 
-#[doc(hidden)] -pub mod __test_support { - pub use crate::client::test_support::*; -} diff --git a/vectorless-core/vectorless/src/llm/client.rs b/vectorless-core/vectorless/src/llm/client.rs deleted file mode 100644 index 3eeb60af..00000000 --- a/vectorless-core/vectorless/src/llm/client.rs +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Unified LLM client with retry and concurrency support. - -use serde::de::DeserializeOwned; -use std::borrow::Cow; -use std::sync::Arc; -use tracing::{debug, instrument}; - -use super::config::LlmConfig; -use super::error::{LlmError, LlmResult}; -use super::executor::LlmExecutor; -use super::fallback::FallbackChain; -use super::throttle::ConcurrencyController; - -/// Unified LLM client. -/// -/// This client provides: -/// - Unified interface for all LLM operations -/// - Automatic retry with exponential backoff -/// - Rate limiting and concurrency control -/// - JSON response parsing -/// - Error classification -/// - Graceful fallback on errors -/// -/// # Example -/// -/// ```rust,no_run -/// use vectorless::llm::{LlmClient, LlmConfig}; -/// -/// # #[tokio::main] -/// # async fn main() -> vectorless::llm::LlmResult<()> { -/// let config = LlmConfig::new("gpt-4o-mini"); -/// let client = LlmClient::new(config); -/// -/// // Simple completion -/// let response = client.complete("You are helpful.", "Hello!").await?; -/// println!("Response: {}", response); -/// -/// // JSON completion -/// #[derive(serde::Deserialize)] -/// struct Answer { -/// answer: String, -/// } -/// let answer: Answer = client.complete_json( -/// "You answer questions in JSON.", -/// "What is 2+2?" -/// ).await?; -/// # Ok(()) -/// # } -/// ``` -#[derive(Clone)] -pub struct LlmClient { - executor: LlmExecutor, -} - -impl std::fmt::Debug for LlmClient { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("LlmClient") - .field("model", &self.executor.config().model) - .field("endpoint", &self.executor.config().endpoint) - .field( - "concurrency", - &self.executor.throttle().map(|c| format!("{:?}", c)), - ) - .field("fallback_enabled", &self.executor.fallback().is_some()) - .finish() - } -} - -impl LlmClient { - /// Create a new LLM client with the given configuration. - pub fn new(config: LlmConfig) -> Self { - Self { - executor: LlmExecutor::new(config), - } - } - - /// Create a client with default configuration. - pub fn with_defaults() -> Self { - Self::new(LlmConfig::default()) - } - - /// Create a client for a specific model. - pub fn for_model(model: impl Into) -> Self { - Self::new(LlmConfig::new(model)) - } - - /// Add concurrency control to the client. - /// - /// # Example - /// - /// ```rust,no_run - /// use vectorless::llm::LlmClient; - /// use vectorless::throttle::{ConcurrencyController, ConcurrencyConfig}; - /// - /// let config = ConcurrencyConfig::new() - /// .with_max_concurrent_requests(10) - /// .with_requests_per_minute(500); - /// - /// let client = LlmClient::for_model("gpt-4o-mini") - /// .with_concurrency(ConcurrencyController::new(config)); - /// ``` - pub fn with_concurrency(mut self, controller: ConcurrencyController) -> Self { - self.executor = self.executor.with_throttle(controller); - self - } - - /// Add concurrency control from an existing Arc. 
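- ///
- /// A minimal sketch of sharing one controller across several clients:
- ///
- /// ```rust,no_run
- /// use std::sync::Arc;
- /// use vectorless::llm::LlmClient;
- /// use vectorless::throttle::{ConcurrencyController, ConcurrencyConfig};
- ///
- /// let shared = Arc::new(ConcurrencyController::new(ConcurrencyConfig::default()));
- /// let a = LlmClient::for_model("gpt-4o").with_shared_concurrency(shared.clone());
- /// let b = LlmClient::for_model("gpt-4o-mini").with_shared_concurrency(shared);
- /// ```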
-    pub fn with_shared_concurrency(mut self, controller: Arc<ConcurrencyController>) -> Self {
-        self.executor = self.executor.with_shared_throttle(controller);
-        self
-    }
-
-    /// Replace the async-openai client with a shared instance (reuses connection pool).
-    pub fn with_shared_openai_client(
-        mut self,
-        client: Arc<async_openai::Client<async_openai::config::OpenAIConfig>>,
-    ) -> Self {
-        self.executor = self.executor.with_openai_client(client);
-        self
-    }
-
-    /// Add fallback chain for error recovery.
-    ///
-    /// # Example
-    ///
-    /// ```rust
-    /// use vectorless::llm::{LlmClient, FallbackChain, FallbackConfig};
-    ///
-    /// let fallback = FallbackConfig::default();
-    /// let client = LlmClient::for_model("gpt-4o")
-    ///     .with_fallback(FallbackChain::new(fallback));
-    ///
-    /// assert!(client.fallback().is_some());
-    /// ```
-    pub fn with_fallback(mut self, chain: FallbackChain) -> Self {
-        self.executor = self.executor.with_fallback(chain);
-        self
-    }
-
-    /// Add fallback chain from an existing Arc.
-    pub fn with_shared_fallback(mut self, chain: Arc<FallbackChain>) -> Self {
-        self.executor = self.executor.with_shared_fallback(chain);
-        self
-    }
-
-    /// Add metrics hub for recording LLM call statistics.
-    pub fn with_shared_metrics(mut self, hub: Arc<crate::metrics::MetricsHub>) -> Self {
-        self.executor = self.executor.with_shared_metrics(hub);
-        self
-    }
-
-    /// Get the configuration.
-    pub fn config(&self) -> &LlmConfig {
-        self.executor.config()
-    }
-
-    /// Get the concurrency controller (if any).
-    pub fn concurrency(&self) -> Option<&ConcurrencyController> {
-        self.executor.throttle()
-    }
-
-    /// Get the fallback chain (if any).
-    pub fn fallback(&self) -> Option<&FallbackChain> {
-        self.executor.fallback()
-    }
-
-    /// Get the underlying executor (for advanced usage).
-    pub fn executor(&self) -> &LlmExecutor {
-        &self.executor
-    }
-
-    /// Complete a prompt with system and user messages.
-    ///
-    /// This method includes:
-    /// - Automatic rate limiting (if configured)
-    /// - Automatic retry with exponential backoff
-    /// - Automatic fallback on persistent errors (if configured)
-    #[instrument(skip(self, system, user), fields(model = %self.executor.config().model))]
-    pub async fn complete(&self, system: &str, user: &str) -> LlmResult<String> {
-        debug!(
-            system_len = system.len(),
-            user_len = user.len(),
-            "Starting LLM completion"
-        );
-        self.executor.complete(system, user).await
-    }
-
-    /// Complete a prompt with custom max tokens.
-    pub async fn complete_with_max_tokens(
-        &self,
-        system: &str,
-        user: &str,
-        max_tokens: u16,
-    ) -> LlmResult<String> {
-        debug!(
-            system_len = system.len(),
-            user_len = user.len(),
-            max_tokens = max_tokens,
-            "Starting LLM completion with max tokens"
-        );
-        self.executor
-            .complete_with_max_tokens(system, user, max_tokens)
-            .await
-    }
-
-    /// Complete a prompt and parse the response as JSON.
-    ///
-    /// This method handles:
-    /// - JSON extraction from markdown code blocks
-    /// - Bracket matching for nested JSON
-    ///
-    /// # Example
-    ///
-    /// ```rust,no_run
-    /// # use vectorless::llm::{LlmClient, LlmConfig};
-    /// # #[tokio::main]
-    /// # async fn main() -> vectorless::llm::LlmResult<()> {
-    /// #[derive(serde::Deserialize)]
-    /// struct TocEntry {
-    ///     title: String,
-    ///     page: usize,
-    /// }
-    ///
-    /// let client = LlmClient::for_model("gpt-4o-mini");
-    /// let entries: Vec<TocEntry> = client.complete_json(
-    ///     "Extract TOC entries as JSON array.",
-    ///     "Chapter 1: Introduction ... 5"
-    /// ).await?;
-    /// # Ok(())
-    /// # }
-    /// ```
-    pub async fn complete_json<T: DeserializeOwned>(
-        &self,
-        system: &str,
-        user: &str,
-    ) -> LlmResult<T> {
-        let response = self.complete(system, user).await?;
-        self.parse_json(&response)
-    }
-
-    /// Complete a prompt and parse the response as JSON with custom max tokens.
-    pub async fn complete_json_with_max_tokens<T: DeserializeOwned>(
-        &self,
-        system: &str,
-        user: &str,
-        max_tokens: u16,
-    ) -> LlmResult<T> {
-        let response = self
-            .complete_with_max_tokens(system, user, max_tokens)
-            .await?;
-        self.parse_json(&response)
-    }
-
-    /// Parse JSON from LLM response.
-    fn parse_json<T: DeserializeOwned>(&self, text: &str) -> LlmResult<T> {
-        let json_text = self.extract_json(text);
-        serde_json::from_str(&json_text).map_err(|e| {
-            LlmError::Parse(format!("Failed to parse JSON: {}. Response: {}", e, text))
-        })
-    }
-
-    /// Extract JSON from text (handles markdown code blocks).
-    fn extract_json<'a>(&self, text: &'a str) -> Cow<'a, str> {
-        let text = text.trim();
-
-        // Try markdown code block first
-        if text.starts_with("```") {
-            // Find the end of the first line (language identifier)
-            if let Some(start) = text.find('\n') {
-                let rest = &text[start + 1..];
-                if let Some(end) = rest.find("```") {
-                    return Cow::Borrowed(rest[..end].trim());
-                }
-            }
-        }
-
-        // Try to find JSON array or object
-        if text.starts_with('[') || text.starts_with('{') {
-            let open = text.chars().next().unwrap();
-            let close = if open == '[' { ']' } else { '}' };
-
-            let mut depth = 0;
-            for (i, ch) in text.char_indices() {
-                match ch {
-                    c if c == open => depth += 1,
-                    c if c == close => {
-                        depth -= 1;
-                        if depth == 0 {
-                            return Cow::Borrowed(&text[..=i]);
-                        }
-                    }
-                    _ => {}
-                }
-            }
-        }
-
-        Cow::Borrowed(text)
-    }
-}
-
-impl Default for LlmClient {
-    fn default() -> Self {
-        Self::with_defaults()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_extract_json_plain() {
-        let client = LlmClient::with_defaults();
-
-        let json = client.extract_json(r#"{"key": "value"}"#);
-        assert_eq!(json, r#"{"key": "value"}"#);
-    }
-
-    #[test]
-    fn test_extract_json_code_block() {
-        let client = LlmClient::with_defaults();
-
-        let json = client.extract_json(
-            r#"```json
-{"key": "value"}
-```"#,
-        );
-        assert_eq!(json, r#"{"key": "value"}"#);
-    }
-
-    #[test]
-    fn test_extract_json_array() {
-        let client = LlmClient::with_defaults();
-
-        let json = client.extract_json(r#"[1, 2, 3]"#);
-        assert_eq!(json, r#"[1, 2, 3]"#);
-    }
-
-    #[test]
-    fn test_extract_json_nested() {
-        let client = LlmClient::with_defaults();
-
-        let json = client.extract_json(r#"{"outer": {"inner": 1}}"#);
-        assert_eq!(json, r#"{"outer": {"inner": 1}}"#);
-    }
-
-    #[test]
-    fn test_client_creation() {
-        let client = LlmClient::for_model("gpt-4o");
-        assert_eq!(client.config().model, "gpt-4o");
-    }
-
-    #[test]
-    fn test_client_with_concurrency() {
-        use crate::llm::throttle::ConcurrencyConfig;
-
-        let controller = ConcurrencyController::new(ConcurrencyConfig::conservative());
-        let client = LlmClient::for_model("gpt-4o-mini").with_concurrency(controller);
-
-        assert!(client.concurrency().is_some());
-    }
-
-    #[test]
-    fn test_client_with_shared_metrics() {
-        use crate::metrics::MetricsHub;
-
-        let hub = MetricsHub::shared();
-        let client = LlmClient::for_model("gpt-4o").with_shared_metrics(hub.clone());
-
-        // Client should still function normally
-        assert_eq!(client.config().model, "gpt-4o");
-        assert!(client.fallback().is_none()); // no fallback added
-        assert!(client.concurrency().is_none()); // no concurrency added
-    }
-}
diff --git
a/vectorless-core/vectorless/src/llm/config.rs b/vectorless-core/vectorless/src/llm/config.rs deleted file mode 100644 index 32685e36..00000000 --- a/vectorless-core/vectorless/src/llm/config.rs +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Runtime LLM configuration types. - -use serde::{Deserialize, Serialize}; -use std::time::Duration; - -/// Runtime LLM client configuration. -/// -/// This is the runtime representation used by [`LlmClient`](super::LlmClient). -/// Created from the config-layer [`LlmConfig`](crate::config::LlmConfig) -/// during pool construction — users never construct this directly. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct LlmConfig { - /// Model name (e.g., "gpt-4o-mini", "gpt-4o"). - #[serde(default)] - pub model: String, - - /// API endpoint URL. - #[serde(default)] - pub endpoint: String, - - /// API key. - #[serde(default)] - pub api_key: Option, - - /// Maximum tokens for response. - #[serde(default = "default_max_tokens")] - pub max_tokens: usize, - - /// Temperature for generation. - #[serde(default = "default_temperature")] - pub temperature: f32, - - /// Retry configuration. - #[serde(default)] - pub retry: RetryConfig, - - /// Per-request timeout. 0 means no timeout (wait indefinitely). - #[serde(default)] - pub request_timeout_secs: u64, -} - -fn default_max_tokens() -> usize { - 2000 -} - -fn default_temperature() -> f32 { - 0.0 -} - -impl Default for LlmConfig { - fn default() -> Self { - Self { - model: String::new(), - endpoint: String::new(), - api_key: None, - max_tokens: default_max_tokens(), - temperature: default_temperature(), - retry: RetryConfig::default(), - request_timeout_secs: 0, - } - } -} - -impl LlmConfig { - /// Create a new config with a specific model. - pub fn new(model: impl Into) -> Self { - Self { - model: model.into(), - ..Self::default() - } - } - - /// Set the model. - pub fn with_model(mut self, model: impl Into) -> Self { - self.model = model.into(); - self - } - - /// Set the endpoint. - pub fn with_endpoint(mut self, endpoint: impl Into) -> Self { - self.endpoint = endpoint.into(); - self - } - - /// Set the API key. - pub fn with_api_key(mut self, api_key: impl Into) -> Self { - self.api_key = Some(api_key.into()); - self - } - - /// Set the max tokens. - pub fn with_max_tokens(mut self, max_tokens: usize) -> Self { - self.max_tokens = max_tokens; - self - } - - /// Set the temperature. - pub fn with_temperature(mut self, temperature: f32) -> Self { - self.temperature = temperature; - self - } - - /// Set the retry configuration. - pub fn with_retry(mut self, retry: RetryConfig) -> Self { - self.retry = retry; - self - } -} - -/// Runtime retry configuration for LLM calls. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RetryConfig { - /// Maximum number of retry attempts (including initial call). - #[serde(default = "default_max_attempts")] - pub max_attempts: usize, - - /// Initial delay before first retry (milliseconds). - #[serde(default = "default_initial_delay_ms")] - pub initial_delay_ms: u64, - - /// Maximum delay between retries (milliseconds). - #[serde(default = "default_max_delay_ms")] - pub max_delay_ms: u64, - - /// Multiplier for exponential backoff. - #[serde(default = "default_multiplier")] - pub multiplier: f64, - - /// Whether to retry on rate limit errors. 
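- /// Defaults to `true`.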
- #[serde(default = "default_true")] - pub retry_on_rate_limit: bool, -} - -fn default_max_attempts() -> usize { - 3 -} -fn default_initial_delay_ms() -> u64 { - 500 -} -fn default_max_delay_ms() -> u64 { - 30000 -} -fn default_multiplier() -> f64 { - 2.0 -} -fn default_true() -> bool { - true -} - -impl Default for RetryConfig { - fn default() -> Self { - Self { - max_attempts: default_max_attempts(), - initial_delay_ms: default_initial_delay_ms(), - max_delay_ms: default_max_delay_ms(), - multiplier: default_multiplier(), - retry_on_rate_limit: default_true(), - } - } -} - -impl RetryConfig { - /// Create a new retry config with defaults. - pub fn new() -> Self { - Self::default() - } - - /// Set the maximum number of attempts. - pub fn with_max_attempts(mut self, max_attempts: usize) -> Self { - self.max_attempts = max_attempts; - self - } - - /// Set the initial delay (milliseconds). - pub fn with_initial_delay(mut self, delay_ms: u64) -> Self { - self.initial_delay_ms = delay_ms; - self - } - - /// Set the maximum delay (milliseconds). - pub fn with_max_delay(mut self, delay_ms: u64) -> Self { - self.max_delay_ms = delay_ms; - self - } - - /// Set the backoff multiplier. - pub fn with_multiplier(mut self, multiplier: f64) -> Self { - self.multiplier = multiplier; - self - } - - /// Set whether to retry on rate limit. - pub fn with_retry_on_rate_limit(mut self, retry: bool) -> Self { - self.retry_on_rate_limit = retry; - self - } - - /// Calculate delay for a given attempt (0-indexed). - pub fn delay_for_attempt(&self, attempt: usize) -> Duration { - let delay_ms = (self.initial_delay_ms as f64) * self.multiplier.powf(attempt as f64); - let delay_ms = delay_ms.min(self.max_delay_ms as f64); - Duration::from_millis(delay_ms as u64) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_retry_delay_calculation() { - let config = RetryConfig::default(); - - // Initial delay is 500ms - assert_eq!(config.delay_for_attempt(0), Duration::from_millis(500)); - - // Second attempt: 500 * 2 = 1000ms - assert_eq!(config.delay_for_attempt(1), Duration::from_millis(1000)); - - // Third attempt: 500 * 4 = 2000ms - assert_eq!(config.delay_for_attempt(2), Duration::from_millis(2000)); - } - - #[test] - fn test_retry_delay_max_cap() { - let config = RetryConfig { - max_delay_ms: 1500, - ..RetryConfig::default() - }; - - // Should cap at max_delay_ms - assert_eq!(config.delay_for_attempt(5), Duration::from_millis(1500)); - } - - #[test] - fn test_llm_config_builder() { - let config = LlmConfig::new("gpt-4o") - .with_max_tokens(1000) - .with_temperature(0.5); - - assert_eq!(config.model, "gpt-4o"); - assert_eq!(config.max_tokens, 1000); - assert!((config.temperature - 0.5).abs() < 0.001); - } -} diff --git a/vectorless-core/vectorless/src/llm/error.rs b/vectorless-core/vectorless/src/llm/error.rs deleted file mode 100644 index 5969cf72..00000000 --- a/vectorless-core/vectorless/src/llm/error.rs +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Unified LLM error types. - -use thiserror::Error; - -/// LLM error types. -#[derive(Debug, Clone, Error)] -pub enum LlmError { - /// API error from the LLM provider. - #[error("LLM API error: {0}")] - Api(String), - - /// Request construction error. - #[error("Request error: {0}")] - Request(String), - - /// Configuration error. - #[error("Configuration error: {0}")] - Config(String), - - /// Response parsing error. 
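-    /// Covers JSON extraction and deserialization failures.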
- #[error("Failed to parse response: {0}")] - Parse(String), - - /// Rate limit exceeded. - #[error("Rate limit exceeded: {0}")] - RateLimit(String), - - /// Request timeout. - #[error("Request timeout: {0}")] - Timeout(String), - - /// No content returned. - #[error("LLM returned no content")] - NoContent, - - /// Retry exhausted. - #[error("Retry exhausted after {attempts} attempts: {last_error}")] - RetryExhausted { - /// Number of attempts made. - attempts: usize, - /// The last error encountered. - last_error: String, - }, -} - -impl LlmError { - /// Check if the error is retryable. - pub fn is_retryable(&self) -> bool { - match self { - LlmError::Api(msg) => { - // Rate limits and temporary failures are retryable - let msg_lower = msg.to_lowercase(); - msg_lower.contains("rate limit") - || msg_lower.contains("429") - || msg_lower.contains("503") - || msg_lower.contains("502") - || msg_lower.contains("timeout") - || msg_lower.contains("overloaded") - } - LlmError::Timeout(_) => true, - LlmError::RateLimit(_) => true, - _ => false, - } - } - - /// Classify an API error message into the appropriate error type. - pub fn from_api_message(msg: &str) -> Self { - let msg_lower = msg.to_lowercase(); - - if msg_lower.contains("rate limit") || msg_lower.contains("429") { - LlmError::RateLimit(msg.to_string()) - } else if msg_lower.contains("timeout") { - LlmError::Timeout(msg.to_string()) - } else { - LlmError::Api(msg.to_string()) - } - } -} - -impl From for LlmError { - fn from(e: async_openai::error::OpenAIError) -> Self { - let msg = e.to_string(); - LlmError::from_api_message(&msg) - } -} - -impl From for LlmError { - fn from(e: serde_json::Error) -> Self { - LlmError::Parse(e.to_string()) - } -} - -impl From for crate::Error { - fn from(e: LlmError) -> Self { - crate::Error::Llm(e.to_string()) - } -} - -impl From for String { - fn from(e: LlmError) -> Self { - e.to_string() - } -} - -/// Specialized result type for LLM operations. -pub type LlmResult = std::result::Result; - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_is_retryable() { - assert!(LlmError::RateLimit("test".to_string()).is_retryable()); - assert!(LlmError::Timeout("test".to_string()).is_retryable()); - assert!(LlmError::Api("rate limit exceeded".to_string()).is_retryable()); - assert!(!LlmError::Config("test".to_string()).is_retryable()); - assert!(!LlmError::Parse("test".to_string()).is_retryable()); - } - - #[test] - fn test_from_api_message() { - let err = LlmError::from_api_message("Rate limit exceeded"); - assert!(matches!(err, LlmError::RateLimit(_))); - - let err = LlmError::from_api_message("Request timeout"); - assert!(matches!(err, LlmError::Timeout(_))); - - let err = LlmError::from_api_message("Internal server error"); - assert!(matches!(err, LlmError::Api(_))); - } -} diff --git a/vectorless-core/vectorless/src/llm/executor.rs b/vectorless-core/vectorless/src/llm/executor.rs deleted file mode 100644 index 409b474e..00000000 --- a/vectorless-core/vectorless/src/llm/executor.rs +++ /dev/null @@ -1,568 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Unified executor coordinating throttle, retry, and fallback. -//! -//! This module provides the `LlmExecutor` which coordinates: -//! - **Throttle** — Rate limiting and concurrency control -//! - **Retry** — Exponential backoff on transient errors -//! - **Fallback** — Model/endpoint degradation on persistent failures -//! -//! # Architecture -//! -//! ```text -//! 
┌─────────────────────────────────────────────────────────────────┐ -//! │ LlmExecutor │ -//! │ │ -//! │ execute() ──▶ [Throttle] ──▶ [API Call] ──▶ [Success/Error] │ -//! │ │ │ │ -//! │ acquire permit do request │ -//! │ │ │ -//! │ ┌──────────┴──────────┐ │ -//! │ ▼ ▼ │ -//! │ [Retry] [Fallback] │ -//! │ │ │ │ -//! │ exponential model/endpoint │ -//! │ backoff degradation │ -//! │ │ -//! └─────────────────────────────────────────────────────────────────┘ -//! ``` -//! -//! # Example -//! -//! ```rust,no_run -//! use vectorless::llm::{LlmExecutor, LlmConfig, FallbackChain, FallbackConfig}; -//! use vectorless::throttle::{ConcurrencyController, ConcurrencyConfig}; -//! -//! # #[tokio::main] -//! # async fn main() -> vectorless::llm::LlmResult<()> { -//! let config = LlmConfig::new("gpt-4o"); -//! let throttle = ConcurrencyController::new(ConcurrencyConfig::default()); -//! let fallback = FallbackChain::new(FallbackConfig::default()); -//! -//! let executor = LlmExecutor::new(config) -//! .with_throttle(throttle) -//! .with_fallback(fallback); -//! -//! let result = executor.complete("You are helpful.", "Hello!").await?; -//! # Ok(()) -//! # } -//! ``` - -use std::sync::Arc; -use std::time::Duration; -use tracing::{debug, info, warn}; - -use async_openai::types::chat::{ - ChatCompletionRequestSystemMessage, ChatCompletionRequestUserMessage, - CreateChatCompletionRequestArgs, -}; - -use super::config::LlmConfig; -use super::error::{LlmError, LlmResult}; -use super::fallback::{FallbackChain, FallbackStep}; -use super::throttle::ConcurrencyController; -use crate::metrics::MetricsHub; - -/// Unified executor for LLM operations. -/// -/// Coordinates throttle, retry, and fallback mechanisms. -#[derive(Clone)] -pub struct LlmExecutor { - /// LLM configuration. - config: LlmConfig, - /// Reusable async-openai client (created once, shared via Arc). - openai_client: Arc>, - /// Throttle controller (optional). - throttle: Option>, - /// Fallback chain (optional). - fallback: Option>, - /// Metrics hub for recording LLM call statistics (optional). - metrics: Option>, -} - -impl std::fmt::Debug for LlmExecutor { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("LlmExecutor") - .field("model", &self.config.model) - .field("endpoint", &self.config.endpoint) - .field("has_throttle", &self.throttle.is_some()) - .field("has_fallback", &self.fallback.is_some()) - .field("has_openai_client", &true) - .field("has_metrics", &self.metrics.is_some()) - .finish() - } -} - -impl LlmExecutor { - /// Create a new executor with the given configuration. - pub fn new(config: LlmConfig) -> Self { - let openai_client = Self::build_openai_client(&config); - Self { - config, - openai_client: Arc::new(openai_client), - throttle: None, - fallback: None, - metrics: None, - } - } - - /// Build the async-openai client from config. - fn build_openai_client( - config: &LlmConfig, - ) -> async_openai::Client { - let api_key = config.api_key.clone().unwrap_or_default(); - let endpoint = if config.endpoint.is_empty() { - "https://api.openai.com/v1".to_string() - } else { - config.endpoint.clone() - }; - let openai_config = async_openai::config::OpenAIConfig::new() - .with_api_key(api_key) - .with_api_base(endpoint); - async_openai::Client::with_config(openai_config) - } - - /// Create an executor with default configuration. - pub fn with_defaults() -> Self { - Self::new(LlmConfig::default()) - } - - /// Create an executor for a specific model. 
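-    ///
-    /// A minimal sketch (assumes endpoint and key are configured via `LlmConfig` elsewhere):
-    ///
-    /// ```rust,no_run
-    /// use vectorless::llm::LlmExecutor;
-    ///
-    /// # #[tokio::main]
-    /// # async fn main() -> vectorless::llm::LlmResult<()> {
-    /// let executor = LlmExecutor::for_model("gpt-4o-mini");
-    /// let reply = executor.complete("You are terse.", "Say hi.").await?;
-    /// println!("{reply}");
-    /// # Ok(())
-    /// # }
-    /// ```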
- pub fn for_model(model: impl Into) -> Self { - Self::new(LlmConfig::new(model)) - } - - /// Add throttle control. - pub fn with_throttle(mut self, controller: ConcurrencyController) -> Self { - self.throttle = Some(Arc::new(controller)); - self - } - - /// Add throttle control from an existing Arc. - pub fn with_shared_throttle(mut self, controller: Arc) -> Self { - self.throttle = Some(controller); - self - } - - /// Add fallback chain. - pub fn with_fallback(mut self, chain: FallbackChain) -> Self { - self.fallback = Some(Arc::new(chain)); - self - } - - /// Add fallback chain from an existing Arc. - pub fn with_shared_fallback(mut self, chain: Arc) -> Self { - self.fallback = Some(chain); - self - } - - /// Add metrics hub for recording LLM call statistics. - pub fn with_shared_metrics(mut self, hub: Arc) -> Self { - self.metrics = Some(hub); - self - } - - /// Replace the async-openai client (used when pool reconfigures clients). - pub fn with_openai_client( - mut self, - client: Arc>, - ) -> Self { - self.openai_client = client; - self - } - - /// Get the configuration. - pub fn config(&self) -> &LlmConfig { - &self.config - } - - /// Get the throttle controller (if any). - pub fn throttle(&self) -> Option<&ConcurrencyController> { - self.throttle.as_deref() - } - - /// Get the fallback chain (if any). - pub fn fallback(&self) -> Option<&FallbackChain> { - self.fallback.as_deref() - } - - /// Execute a completion with unified coordination. - /// - /// This method coordinates: - /// 1. Throttle: Acquire permit before API call - /// 2. Retry: Exponential backoff on transient errors - /// 3. Fallback: Model/endpoint degradation on persistent failures - pub async fn complete(&self, system: &str, user: &str) -> LlmResult { - self.execute_with_context(system, user, None).await - } - - /// Execute a completion with custom max tokens. - pub async fn complete_with_max_tokens( - &self, - system: &str, - user: &str, - max_tokens: u16, - ) -> LlmResult { - self.execute_with_context(system, user, Some(max_tokens)) - .await - } - - /// Internal execution with full coordination. 
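- ///
- /// Loop sketch: acquire a throttle permit, fire the request (with an optional
- /// timeout), retry transient errors with exponential backoff, then step down
- /// the fallback chain (resetting the retry counter per model), aborting
- /// outright once a hard cap on total attempts is hit.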
- async fn execute_with_context( - &self, - system: &str, - user: &str, - max_tokens: Option, - ) -> LlmResult { - let mut attempts = 0; - let mut current_model = self.config.model.clone(); - let mut fallback_history: Vec = vec![]; - let mut total_attempts_including_fallback = 0; - - loop { - attempts += 1; - total_attempts_including_fallback += 1; - - // Safety check: prevent infinite loops - const MAX_TOTAL_ATTEMPTS: usize = 20; - if total_attempts_including_fallback > MAX_TOTAL_ATTEMPTS { - warn!( - total_attempts = total_attempts_including_fallback, - "Exceeded maximum total attempts, aborting" - ); - return Err(LlmError::RetryExhausted { - attempts: total_attempts_including_fallback, - last_error: "Exceeded maximum total attempts including fallbacks".to_string(), - }); - } - - // Step 1: Acquire throttle permit - let _permit = self.acquire_throttle_permit().await; - - debug!( - attempt = attempts, - model = %current_model, - "Executing LLM request" - ); - - // Step 2: Execute the request (with optional timeout) - let request_future = self.do_request(¤t_model, system, user, max_tokens); - let result = if self.config.request_timeout_secs > 0 { - let timeout = Duration::from_secs(self.config.request_timeout_secs); - match tokio::time::timeout(timeout, request_future).await { - Ok(r) => r, - Err(_) => { - warn!( - timeout_secs = self.config.request_timeout_secs, - model = %current_model, - "LLM request timed out" - ); - if let Some(ref metrics) = self.metrics { - metrics.record_llm_timeout(); - } - Err(LlmError::Timeout(format!( - "Request timed out after {}s", - self.config.request_timeout_secs - ))) - } - } - } else { - request_future.await - }; - - match result { - Ok(response) => { - if fallback_history.is_empty() { - debug!( - attempts = attempts, - "LLM request succeeded without fallback" - ); - } else { - info!( - attempts = attempts, - fallback_steps = fallback_history.len(), - "LLM request succeeded after fallback" - ); - } - return Ok(response); - } - Err(error) => { - // Record specific error events - if let Some(ref metrics) = self.metrics { - match &error { - LlmError::RateLimit(_) => metrics.record_llm_rate_limit(), - LlmError::Timeout(_) => metrics.record_llm_timeout(), - _ => {} - } - } - - // Step 3: Check if we should retry - if self.should_retry(&error, attempts) { - let delay = self.retry_delay(attempts); - warn!( - attempt = attempts, - max_attempts = self.config.retry.max_attempts, - delay_ms = delay.as_millis() as u64, - error = %error, - "LLM call failed, retrying..." 
- ); - tokio::time::sleep(delay).await; - continue; - } - - // Step 4: Check if we should fallback - if let Some(ref fallback) = self.fallback { - if fallback.should_fallback(&error) { - let mut fell_back = false; - - // Try next model - if let Some(next_model) = fallback.next_model(¤t_model) { - info!( - from_model = %current_model, - to_model = %next_model, - "Falling back to next model" - ); - if let Some(ref metrics) = self.metrics { - metrics.record_llm_fallback(); - } - fallback.record_fallback( - &mut fallback_history, - current_model.clone(), - Some(next_model.clone()), - self.config.endpoint.clone(), - None, - error.to_string(), - ); - current_model = next_model; - attempts = 0; // Reset retry counter for new model - fell_back = true; - } - - if fell_back { - continue; - } - } - } - - // Step 5: No more retries or fallbacks, return error - warn!( - attempts = attempts, - fallback_steps = fallback_history.len(), - error = %error, - "LLM call failed, no more retries or fallbacks available" - ); - return Err(error); - } - } - } - } - - /// Acquire throttle permit (if configured). - async fn acquire_throttle_permit(&self) -> Option> { - if let Some(ref throttle) = self.throttle { - throttle.acquire().await - } else { - None - } - } - - /// Check if we should retry based on error and attempt count. - fn should_retry(&self, error: &LlmError, attempts: usize) -> bool { - if attempts >= self.config.retry.max_attempts { - return false; - } - - // Use unified retryable check, with rate-limit override - if matches!(error, LlmError::RateLimit(_)) { - self.config.retry.retry_on_rate_limit - } else { - error.is_retryable() - } - } - - /// Calculate retry delay for a given attempt. - fn retry_delay(&self, attempt: usize) -> Duration { - self.config.retry.delay_for_attempt(attempt - 1) - } - - /// Execute the actual API request. - async fn do_request( - &self, - model: &str, - system: &str, - user: &str, - _max_tokens: Option, - ) -> LlmResult { - // Build request — only set max_tokens when explicitly provided, - // letting the API use its own default otherwise. 
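- // NOTE: the override is not currently wired through; `_max_tokens` is
- // accepted but unused (see the commented-out block below).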
- let request = CreateChatCompletionRequestArgs::default() - .model(model) - .messages([ - ChatCompletionRequestSystemMessage::from(system).into(), - ChatCompletionRequestUserMessage::from(user).into(), - ]) - .temperature(self.config.temperature) - .build() - .map_err(|e| LlmError::Request(format!("Failed to build request: {}", e)))?; - - // if let Some(mt) = max_tokens { - // request.max_tokens = Some(mt as u32); - // } - - info!( - "LLM request → endpoint: {}, model: {}, system: {} chars, user: {} chars", - self.config.endpoint, - model, - system.len(), - user.len() - ); - - let request_start = std::time::Instant::now(); - let response = match self.openai_client.chat().create(request).await { - Ok(r) => r, - Err(e) => { - let elapsed = request_start.elapsed(); - if let Some(ref metrics) = self.metrics { - metrics.record_llm_call(0, 0, elapsed.as_millis() as u64, false); - } - let msg = e.to_string(); - return Err(LlmError::from_api_message(&msg)); - } - }; - let request_elapsed = request_start.elapsed(); - - let usage = response.usage.as_ref(); - let prompt_tokens = usage.map(|u| u.prompt_tokens).unwrap_or(0); - let completion_tokens = usage.map(|u| u.completion_tokens).unwrap_or(0); - - let first_choice = response.choices.first(); - - if first_choice.is_none() { - if let Some(ref metrics) = self.metrics { - metrics.record_llm_call( - prompt_tokens as u64, - completion_tokens as u64, - request_elapsed.as_millis() as u64, - false, - ); - } - return Err(LlmError::NoContent); - } - - let choice = first_choice.unwrap(); - let content = choice.message.content.clone().unwrap_or_default(); - - if content.is_empty() { - let has_tool_calls = choice - .message - .tool_calls - .as_ref() - .map_or(false, |t| !t.is_empty()); - let finish_reason = format!("{:?}", choice.finish_reason); - warn!( - elapsed_ms = request_elapsed.as_millis(), - prompt_tokens, - completion_tokens, - has_tool_calls, - finish_reason, - "LLM returned empty content field" - ); - } - - if let Some(ref metrics) = self.metrics { - metrics.record_llm_call( - prompt_tokens as u64, - completion_tokens as u64, - request_elapsed.as_millis() as u64, - true, - ); - } - - if content.is_empty() { - warn!( - elapsed_ms = request_elapsed.as_millis(), - prompt_tokens, completion_tokens, "LLM returned empty response" - ); - } else { - info!( - "LLM response ← {}ms, tokens: {} prompt + {} completion, content: {} chars", - request_elapsed.as_millis(), - prompt_tokens, - completion_tokens, - content.len() - ); - } - - Ok(content) - } -} - -impl Default for LlmExecutor { - fn default() -> Self { - Self::with_defaults() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_executor_creation() { - let executor = LlmExecutor::for_model("gpt-4o"); - assert_eq!(executor.config().model, "gpt-4o"); - assert!(executor.throttle().is_none()); - assert!(executor.fallback().is_none()); - } - - #[test] - fn test_executor_with_throttle() { - use crate::llm::throttle::ConcurrencyConfig; - - let controller = ConcurrencyController::new(ConcurrencyConfig::conservative()); - let executor = LlmExecutor::for_model("gpt-4o-mini").with_throttle(controller); - - assert!(executor.throttle().is_some()); - } - - #[test] - fn test_should_retry() { - let executor = LlmExecutor::with_defaults(); - - // Should retry on timeout - assert!(executor.should_retry(&LlmError::Timeout("test".to_string()), 1)); - - // Should retry on rate limit (if configured) - assert!(executor.should_retry(&LlmError::RateLimit("test".to_string()), 1)); - - // Should not retry on 
config error - assert!(!executor.should_retry(&LlmError::Config("test".to_string()), 1)); - - // Should not retry after max attempts - assert!(!executor.should_retry(&LlmError::Timeout("test".to_string()), 100)); - } - - #[test] - fn test_retry_delay() { - let executor = LlmExecutor::with_defaults(); - - // First retry attempt (attempt 1 -> delay_for_attempt(0)) - let delay = executor.retry_delay(1); - assert_eq!(delay, Duration::from_millis(500)); - } - - #[test] - fn test_executor_with_metrics() { - let hub = MetricsHub::shared(); - let executor = LlmExecutor::for_model("gpt-4o").with_shared_metrics(hub); - - assert!(executor.metrics.is_some()); - } - - #[test] - fn test_executor_without_metrics() { - let executor = LlmExecutor::for_model("gpt-4o"); - assert!(executor.metrics.is_none()); - } -} diff --git a/vectorless-core/vectorless/src/llm/fallback.rs b/vectorless-core/vectorless/src/llm/fallback.rs deleted file mode 100644 index fb6e37cd..00000000 --- a/vectorless-core/vectorless/src/llm/fallback.rs +++ /dev/null @@ -1,378 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Fallback and error recovery for LLM calls. -//! -//! This module provides graceful degradation when LLM API calls fail: -//! - Automatic model switching (e.g., gpt-4o → gpt-4o-mini) -//! - Automatic endpoint switching -//! - Configurable retry and fallback behaviors -//! -//! # Example -//! -//! ```rust,ignore -//! use vectorless::llm::fallback::{FallbackChain, FallbackConfig}; -//! -//! let config = FallbackConfig::default(); -//! let chain = FallbackChain::new(config); -//! -//! // Check if fallback is enabled -//! assert!(chain.is_enabled()); -//! ``` - -use serde::{Deserialize, Serialize}; -use tracing::{debug, info, warn}; - -use super::error::LlmError; -use crate::config::{ - FallbackBehavior, FallbackConfig as ConfigFallbackConfig, OnAllFailedBehavior, -}; - -/// Result from a fallback-aware LLM call. -#[derive(Debug, Clone)] -pub struct FallbackResult { - /// The actual result. - pub result: T, - /// Whether the result came from a fallback model/endpoint. - pub degraded: bool, - /// The model that was ultimately used. - pub model: String, - /// The endpoint that was ultimately used. - pub endpoint: String, - /// History of fallback attempts (for debugging). - pub fallback_history: Vec, -} - -impl FallbackResult { - /// Create a successful result without fallback. - pub fn success(result: T, model: String, endpoint: String) -> Self { - Self { - result, - degraded: false, - model, - endpoint, - fallback_history: Vec::new(), - } - } - - /// Create a result from a fallback. - pub fn from_fallback( - result: T, - model: String, - endpoint: String, - history: Vec, - ) -> Self { - Self { - result, - degraded: true, - model, - endpoint, - fallback_history: history, - } - } -} - -/// A single step in the fallback chain. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct FallbackStep { - /// The model we tried. - pub from_model: String, - /// The model we fell back to (if any). - pub to_model: Option, - /// The endpoint we tried. - pub from_endpoint: String, - /// The endpoint we fell back to (if any). - pub to_endpoint: Option, - /// The reason for fallback. - pub reason: String, -} - -/// Fallback chain manager. -#[derive(Debug, Clone)] -pub struct FallbackChain { - config: FallbackConfig, -} - -/// Runtime fallback configuration (converted from config::FallbackConfig). -#[derive(Debug, Clone)] -pub struct FallbackConfig { - /// Whether fallback is enabled. 
- pub enabled: bool, - /// Fallback models in priority order. - pub models: Vec, - /// Fallback endpoints in priority order. - pub endpoints: Vec, - /// Behavior on rate limit error. - pub on_rate_limit: FallbackBehavior, - /// Behavior on timeout error. - pub on_timeout: FallbackBehavior, - /// Behavior when all attempts fail. - pub on_all_failed: OnAllFailedBehavior, -} - -impl Default for FallbackConfig { - fn default() -> Self { - Self { - enabled: true, - models: vec!["gpt-4o-mini".to_string(), "glm-4-flash".to_string()], - endpoints: vec![], - on_rate_limit: FallbackBehavior::RetryThenFallback, - on_timeout: FallbackBehavior::RetryThenFallback, - on_all_failed: OnAllFailedBehavior::ReturnError, - } - } -} - -impl From for FallbackConfig { - fn from(config: ConfigFallbackConfig) -> Self { - Self { - enabled: config.enabled, - models: config.models, - endpoints: config.endpoints, - on_rate_limit: config.on_rate_limit, - on_timeout: config.on_timeout, - on_all_failed: config.on_all_failed, - } - } -} - -impl FallbackConfig { - /// Create a new fallback config. - pub fn new() -> Self { - Self::default() - } - - /// Disable fallback. - pub fn disabled() -> Self { - Self { - enabled: false, - ..Self::default() - } - } -} - -impl FallbackChain { - /// Create a new fallback chain with the given configuration. - pub fn new(config: FallbackConfig) -> Self { - Self { config } - } - - /// Create a disabled fallback chain (no fallback). - pub fn disabled() -> Self { - Self::new(FallbackConfig::disabled()) - } - - /// Get the configuration. - pub fn config(&self) -> &FallbackConfig { - &self.config - } - - /// Check if fallback is enabled. - pub fn is_enabled(&self) -> bool { - self.config.enabled - } - - /// Determine the appropriate behavior for an error. - pub fn behavior_for_error(&self, error: &LlmError) -> FallbackBehavior { - match error { - LlmError::RateLimit(_) => self.config.on_rate_limit, - LlmError::Timeout(_) => self.config.on_timeout, - _ => FallbackBehavior::Fail, - } - } - - /// Check if an error should trigger fallback. - pub fn should_fallback(&self, error: &LlmError) -> bool { - if !self.config.enabled { - return false; - } - - match self.behavior_for_error(error) { - FallbackBehavior::Fallback | FallbackBehavior::RetryThenFallback => true, - FallbackBehavior::Retry | FallbackBehavior::Fail => false, - } - } - - /// Check if an error should trigger retry. - pub fn should_retry(&self, error: &LlmError) -> bool { - if !self.config.enabled { - return false; - } - - match self.behavior_for_error(error) { - FallbackBehavior::Retry | FallbackBehavior::RetryThenFallback => true, - FallbackBehavior::Fallback | FallbackBehavior::Fail => false, - } - } - - /// Get the next fallback model. 
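- ///
- /// A sketch of the walk-down semantics (mirroring the unit tests below):
- ///
- /// ```rust,ignore
- /// let chain = FallbackChain::new(FallbackConfig {
- ///     models: vec!["gpt-4o".into(), "gpt-4o-mini".into()],
- ///     ..FallbackConfig::default()
- /// });
- /// assert_eq!(chain.next_model("gpt-4o"), Some("gpt-4o-mini".to_string()));
- /// assert_eq!(chain.next_model("gpt-4o-mini"), None); // end of the chain
- /// ```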
- pub fn next_model(&self, current: &str) -> Option { - let models = &self.config.models; - let current_idx = models.iter().position(|m| m == current); - - match current_idx { - // Current model is in the list, try next one - Some(idx) if idx + 1 < models.len() => { - let next = models[idx + 1].clone(); - info!(from = current, to = %next, "Falling back to next model"); - Some(next) - } - // Current model is the last in the list, no more fallbacks - Some(_) => { - warn!( - model = current, - "Already at last fallback model, no more available" - ); - None - } - // Current model not in fallback list, try first fallback - None => { - if !models.is_empty() && models[0] != current { - let next = models[0].clone(); - info!(from = current, to = %next, "Falling back to first fallback model"); - Some(next) - } else { - warn!(model = current, "No more fallback models available"); - None - } - } - } - } - - /// Get the next fallback endpoint. - pub fn next_endpoint(&self, current: &str) -> Option { - let endpoints = &self.config.endpoints; - let current_idx = endpoints.iter().position(|e| e == current); - - match current_idx { - // Current endpoint is in the list, try next one - Some(idx) if idx + 1 < endpoints.len() => { - let next = endpoints[idx + 1].clone(); - info!(from = current, to = %next, "Falling back to next endpoint"); - Some(next) - } - // Current endpoint is the last in the list, no more fallbacks - Some(_) => { - warn!( - endpoint = current, - "Already at last fallback endpoint, no more available" - ); - None - } - // Current endpoint not in fallback list, try first fallback - None => { - if !endpoints.is_empty() && endpoints[0] != current { - let next = endpoints[0].clone(); - info!(from = current, to = %next, "Falling back to first fallback endpoint"); - Some(next) - } else { - debug!(endpoint = current, "No more fallback endpoints available"); - None - } - } - } - } - - /// Record a fallback step. 
- pub fn record_fallback( - &self, - history: &mut Vec, - from_model: String, - to_model: Option, - from_endpoint: String, - to_endpoint: Option, - reason: String, - ) { - let step = FallbackStep { - from_model, - to_model, - from_endpoint, - to_endpoint, - reason, - }; - debug!(?step, "Recording fallback step"); - history.push(step); - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_fallback_config_default() { - let config = FallbackConfig::default(); - assert!(config.enabled); - assert!(!config.models.is_empty()); - } - - #[test] - fn test_fallback_chain_disabled() { - let chain = FallbackChain::disabled(); - assert!(!chain.is_enabled()); - } - - #[test] - fn test_next_model() { - let config = FallbackConfig { - models: vec![ - "gpt-4o".to_string(), - "gpt-4o-mini".to_string(), - "glm-4-flash".to_string(), - ], - ..FallbackConfig::default() - }; - let chain = FallbackChain::new(config); - - // Should get next model in chain - assert_eq!(chain.next_model("gpt-4o"), Some("gpt-4o-mini".to_string())); - assert_eq!( - chain.next_model("gpt-4o-mini"), - Some("glm-4-flash".to_string()) - ); - assert_eq!(chain.next_model("glm-4-flash"), None); - } - - #[test] - fn test_next_model_not_in_list() { - let config = FallbackConfig { - models: vec!["gpt-4o-mini".to_string()], - ..FallbackConfig::default() - }; - let chain = FallbackChain::new(config); - - // Should fall back to first model in list - assert_eq!( - chain.next_model("unknown-model"), - Some("gpt-4o-mini".to_string()) - ); - } - - #[test] - fn test_behavior_for_rate_limit() { - let config = FallbackConfig { - on_rate_limit: FallbackBehavior::Fallback, - ..FallbackConfig::default() - }; - let chain = FallbackChain::new(config); - - let error = LlmError::RateLimit("Rate limited".to_string()); - assert_eq!(chain.behavior_for_error(&error), FallbackBehavior::Fallback); - } - - #[test] - fn test_should_fallback() { - let config = FallbackConfig { - enabled: true, - on_rate_limit: FallbackBehavior::RetryThenFallback, - ..FallbackConfig::default() - }; - let chain = FallbackChain::new(config); - - let error = LlmError::RateLimit("Rate limited".to_string()); - assert!(chain.should_fallback(&error)); - - let chain_disabled = FallbackChain::disabled(); - assert!(!chain_disabled.should_fallback(&error)); - } -} diff --git a/vectorless-core/vectorless/src/llm/memo/mod.rs b/vectorless-core/vectorless/src/llm/memo/mod.rs deleted file mode 100644 index 79c9ae78..00000000 --- a/vectorless-core/vectorless/src/llm/memo/mod.rs +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! LLM Memoization system for caching expensive LLM calls. -//! -//! Provides a caching layer for LLM-generated content, avoiding -//! redundant API calls via content-addressed LRU cache with TTL -//! and optional disk persistence. - -mod store; -mod types; - -pub use store::MemoStore; -pub use types::{MemoKey, MemoValue}; diff --git a/vectorless-core/vectorless/src/llm/memo/store.rs b/vectorless-core/vectorless/src/llm/memo/store.rs deleted file mode 100644 index 2fdfcea4..00000000 --- a/vectorless-core/vectorless/src/llm/memo/store.rs +++ /dev/null @@ -1,679 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Memoization store implementation. -//! -//! Provides an in-memory LRU cache with optional disk persistence. 
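-//!
-//! A usage sketch (`fp` stands in for any content `Fingerprint`):
-//!
-//! ```rust,ignore
-//! let store = MemoStore::new().with_model("gpt-4o-mini");
-//! let key = MemoKey::summary(&fp);
-//! store.put(key.clone(), MemoValue::Summary("cached summary".into()));
-//! assert!(store.contains(&key));
-//! ```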
- -use std::collections::HashMap; -use std::future::Future; -use std::path::Path; -use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; - -use chrono::Duration; -use lru::LruCache; -use parking_lot::RwLock; -use serde::{Deserialize, Serialize}; -use tracing::{debug, info}; - -use super::types::{MemoEntry, MemoKey, MemoOpType, MemoStats, MemoValue}; -use crate::error::Result; -use crate::utils::fingerprint::Fingerprint; - -/// Default TTL for cache entries (7 days). -const DEFAULT_TTL: Duration = Duration::days(7); - -/// Default maximum cache size. -const DEFAULT_MAX_SIZE: usize = 10_000; - -/// Serializable format for memo store persistence. -#[derive(Debug, Clone, Serialize, Deserialize)] -struct MemoStoreData { - /// Format version. - version: u32, - - /// Cache entries. - entries: HashMap, - - /// Statistics. - stats: MemoStats, -} - -/// Lock-free atomic statistics for concurrent access. -#[derive(Debug)] -struct AtomicStats { - hits: AtomicU64, - misses: AtomicU64, - tokens_saved: AtomicU64, -} - -impl AtomicStats { - fn new() -> Self { - Self { - hits: AtomicU64::new(0), - misses: AtomicU64::new(0), - tokens_saved: AtomicU64::new(0), - } - } - - fn record_hit(&self) { - self.hits.fetch_add(1, Ordering::Relaxed); - } - - fn record_miss(&self) { - self.misses.fetch_add(1, Ordering::Relaxed); - } - - fn add_tokens_saved(&self, tokens: u64) { - self.tokens_saved.fetch_add(tokens, Ordering::Relaxed); - } - - fn snapshot(&self) -> (u64, u64, u64) { - ( - self.hits.load(Ordering::Relaxed), - self.misses.load(Ordering::Relaxed), - self.tokens_saved.load(Ordering::Relaxed), - ) - } - - fn load_from(&self, hits: u64, misses: u64, tokens_saved: u64) { - self.hits.store(hits, Ordering::Relaxed); - self.misses.store(misses, Ordering::Relaxed); - self.tokens_saved.store(tokens_saved, Ordering::Relaxed); - } -} - -/// LLM Memoization store. -/// -/// Provides caching for expensive LLM operations with: -/// - LRU eviction policy -/// - TTL-based expiration -/// - Optional disk persistence -/// - Thread-safe access -/// -/// # Example -/// -/// ```rust,ignore -/// let store = MemoStore::new(); -/// -/// let summary = store.get_or_compute( -/// MemoKey::summary(&content_fp), -/// || async { -/// llm.generate_summary(content).await -/// } -/// ).await?; -/// ``` -pub struct MemoStore { - /// LRU cache for entries. - cache: Arc>>, - - /// Lock-free statistics. - stats: Arc, - - /// TTL for entries. - ttl: Duration, - - /// Model identifier for cache keys. - model_id: Option, - - /// Version for cache invalidation. - version: u32, -} - -impl std::fmt::Debug for MemoStore { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.debug_struct("MemoStore") - .field("ttl", &self.ttl) - .field("model_id", &self.model_id) - .field("version", &self.version) - .field("cache_len", &self.cache.read().len()) - .finish() - } -} - -impl Clone for MemoStore { - fn clone(&self) -> Self { - Self { - cache: Arc::clone(&self.cache), - stats: Arc::clone(&self.stats), - ttl: self.ttl, - model_id: self.model_id.clone(), - version: self.version, - } - } -} - -impl MemoStore { - /// Create a new memo store with default size. - pub fn new() -> Self { - Self::with_capacity(DEFAULT_MAX_SIZE) - } - - /// Create a new memo store with specified capacity. 
- pub fn with_capacity(capacity: usize) -> Self { - Self { - cache: Arc::new(RwLock::new(LruCache::new( - std::num::NonZeroUsize::new(capacity) - .unwrap_or(std::num::NonZeroUsize::new(1000).unwrap()), - ))), - stats: Arc::new(AtomicStats::new()), - ttl: DEFAULT_TTL, - model_id: None, - version: 1, - } - } - - /// Set the TTL for cache entries. - pub fn with_ttl(mut self, ttl: Duration) -> Self { - self.ttl = ttl; - self - } - - /// Set the model identifier. - pub fn with_model(mut self, model_id: &str) -> Self { - self.model_id = Some(model_id.to_string()); - self - } - - /// Set the version. - pub fn with_version(mut self, version: u32) -> Self { - self.version = version; - self - } - - /// Get a cached value if present and not expired. - pub fn get(&self, key: &MemoKey) -> Option { - let full_key = self.make_key(key); - let mut cache = self.cache.write(); - - if let Some(entry) = cache.get_mut(&full_key) { - if entry.is_expired(self.ttl) { - cache.pop(&full_key); - return None; - } - entry.record_hit(); - debug!("Memo cache hit for {:?}", key.op_type); - return Some(entry.value.clone()); - } - - None - } - - /// Put a value in the cache. - pub fn put(&self, key: MemoKey, value: MemoValue) { - self.put_with_tokens(key, value, 0); - } - - /// Put a value in the cache with token count. - pub fn put_with_tokens(&self, key: MemoKey, value: MemoValue, tokens_saved: u64) { - let full_key = self.make_key(&key); - let entry = MemoEntry::with_tokens(value, tokens_saved); - - let mut cache = self.cache.write(); - cache.put(full_key, entry); - - debug!("Memo cache put for {:?}", key.op_type); - } - - /// Get a value or compute it if not present. - /// - /// This is the primary method for using the memo store. - /// It will return the cached value if present, or call the - /// provided compute function and cache the result. - pub async fn get_or_compute(&self, key: MemoKey, compute: F) -> Result - where - F: FnOnce() -> Fut, - Fut: Future>, // (value, tokens) - { - // Check cache first (synchronous) - if let Some(value) = self.get(&key) { - self.stats.record_hit(); - return Ok(value); - } - - // Record miss - self.stats.record_miss(); - - // Compute - let (value, tokens) = compute().await?; - - // Cache result - self.put_with_tokens(key.clone(), value.clone(), tokens); - - // Update tokens saved - self.stats.add_tokens_saved(tokens); - - Ok(value) - } - - /// Check if a key exists in the cache. - pub fn contains(&self, key: &MemoKey) -> bool { - let full_key = self.make_key(key); - let cache = self.cache.read(); - cache.contains(&full_key) - } - - /// Remove a key from the cache. - pub fn remove(&self, key: &MemoKey) -> Option { - let full_key = self.make_key(key); - let mut cache = self.cache.write(); - cache.pop(&full_key).map(|e| e.value) - } - - /// Clear all entries from the cache. - pub fn clear(&self) { - let mut cache = self.cache.write(); - cache.clear(); - debug!("Memo cache cleared"); - } - - /// Get the number of entries in the cache. - pub fn len(&self) -> usize { - let cache = self.cache.read(); - cache.len() - } - - /// Check if the cache is empty. - pub fn is_empty(&self) -> bool { - self.len() == 0 - } - - /// Get cache statistics (synchronous, lock-free). - pub fn stats(&self) -> MemoStats { - let (hits, misses, tokens_saved) = self.stats.snapshot(); - MemoStats { - entries: self.len(), - hits, - misses, - tokens_saved, - cost_saved: 0.0, - } - } - - /// Invalidate all entries of a specific operation type. - /// - /// Useful when the algorithm for a specific operation changes. 
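- ///
- /// ```rust,ignore
- /// // e.g. after revising the summary prompt (sketch, `store` in scope):
- /// let removed = store.invalidate_by_op_type(MemoOpType::Summary);
- /// tracing::info!("dropped {} stale summaries", removed);
- /// ```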
-
-    /// Invalidate all entries of a specific operation type.
-    ///
-    /// Useful when the algorithm for a specific operation changes.
-    pub fn invalidate_by_op_type(&self, op_type: MemoOpType) -> usize {
-        let mut cache = self.cache.write();
-        let before = cache.len();
-
-        let keys_to_remove: Vec<String> = cache
-            .iter()
-            .filter_map(|(key, entry)| {
-                let matches = match (&entry.value, op_type) {
-                    (MemoValue::Summary(_), MemoOpType::Summary) => true,
-                    (MemoValue::PilotDecision(_), MemoOpType::PilotDecision) => true,
-                    (MemoValue::QueryAnalysis(_), MemoOpType::QueryAnalysis) => true,
-                    (MemoValue::Extraction(_), MemoOpType::Extraction) => true,
-                    _ => false,
-                };
-                if matches { Some(key.clone()) } else { None }
-            })
-            .collect();
-
-        for key in keys_to_remove {
-            cache.pop(&key);
-        }
-
-        let removed = before - cache.len();
-        if removed > 0 {
-            debug!("Invalidated {} entries for op_type {:?}", removed, op_type);
-        }
-        removed
-    }
-
-    /// Invalidate all entries matching a model ID prefix.
-    ///
-    /// Useful when switching models or when a model's behavior changes.
-    pub fn invalidate_by_model_prefix(&self, prefix: &str) -> usize {
-        let mut cache = self.cache.write();
-        let before = cache.len();
-
-        let should_clear = self
-            .model_id
-            .as_ref()
-            .map(|m| m.starts_with(prefix))
-            .unwrap_or(false);
-
-        if should_clear {
-            cache.clear();
-            let removed = before;
-            debug!(
-                "Invalidated all {} entries (model prefix '{}')",
-                removed, prefix
-            );
-            return removed;
-        }
-
-        0
-    }
-
-    /// Remove expired entries.
-    pub fn prune_expired(&self) -> usize {
-        let mut cache = self.cache.write();
-        let before = cache.len();
-
-        let expired: Vec<String> = cache
-            .iter()
-            .filter(|(_, entry)| entry.is_expired(self.ttl))
-            .map(|(k, _)| k.clone())
-            .collect();
-
-        for key in expired {
-            cache.pop(&key);
-        }
-
-        let removed = before - cache.len();
-        if removed > 0 {
-            debug!("Pruned {} expired memo entries", removed);
-        }
-        removed
-    }
-
-    /// Save the cache to disk.
-    pub async fn save(&self, path: &Path) -> Result<()> {
-        // Prune expired entries before persisting
-        self.prune_expired();
-
-        // Snapshot stats before taking the read lock (parking_lot read
-        // locks are not reentrant, and stats() acquires one via len()).
-        let stats = self.stats();
-        let cache = self.cache.read();
-
-        let entries: HashMap<String, MemoEntry> =
-            cache.iter().map(|(k, v)| (k.clone(), v.clone())).collect();
-
-        let data = MemoStoreData {
-            version: 1,
-            entries,
-            stats,
-        };
-
-        let parent = path
-            .parent()
-            .ok_or_else(|| crate::Error::Parse("Invalid path for memo store".to_string()))?;
-        tokio::fs::create_dir_all(parent).await?;
-
-        let temp_path = path.with_extension("tmp");
-        let json = serde_json::to_vec_pretty(&data)
-            .map_err(|e| crate::Error::Parse(format!("Failed to serialize memo store: {}", e)))?;
-        tokio::fs::write(&temp_path, &json).await?;
-        tokio::fs::rename(&temp_path, path).await?;
-
-        info!(
-            "Saved memo store with {} entries to {:?}",
-            data.entries.len(),
-            path
-        );
-        Ok(())
-    }
-
-    /// Load the cache from disk.
-    pub async fn load(&self, path: &Path) -> Result<()> {
-        if !path.exists() {
-            return Ok(());
-        }
-
-        let bytes = tokio::fs::read(path).await?;
-        let data: MemoStoreData = serde_json::from_slice(&bytes)
-            .map_err(|e| crate::Error::Parse(format!("Failed to deserialize memo store: {}", e)))?;
-
-        let mut cache = self.cache.write();
-
-        for (key, entry) in data.entries {
-            if !entry.is_expired(self.ttl) {
-                cache.put(key, entry);
-            }
-        }
-
-        // Restore stats
-        self.stats
-            .load_from(data.stats.hits, data.stats.misses, data.stats.tokens_saved);
-
-        info!(
-            "Loaded memo store with {} entries from {:?}",
-            cache.len(),
-            path
-        );
-        Ok(())
-    }
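A persistence round-trip sketch (the path is illustrative): expired entries are pruned on `save` and skipped on `load`, while hit/miss counters survive the restart:

```rust
use std::path::Path;

// Save on shutdown; writes go through a .tmp file and an atomic rename.
store.save(Path::new(".cache/memo.json")).await?;

// Load on startup into a store with the same model scoping.
let restored = MemoStore::new().with_model("gpt-4o-mini");
restored.load(Path::new(".cache/memo.json")).await?;
assert_eq!(restored.len(), store.len());
```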
-
-    /// Make a full cache key from a MemoKey.
-    fn make_key(&self, key: &MemoKey) -> String {
-        let mut key_with_context = key.clone();
-        if key_with_context.model_id.is_none() {
-            key_with_context.model_id = self.model_id.clone();
-        }
-        if key_with_context.version == 0 {
-            key_with_context.version = self.version;
-        }
-        key_with_context.fingerprint().to_string()
-    }
-}
-
-impl Default for MemoStore {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-/// A helper for building memo keys with context.
-pub struct MemoKeyBuilder {
-    model_id: Option<String>,
-    version: u32,
-}
-
-impl MemoKeyBuilder {
-    /// Create a new key builder.
-    pub fn new() -> Self {
-        Self {
-            model_id: None,
-            version: 1,
-        }
-    }
-
-    /// Set the model identifier.
-    pub fn with_model(mut self, model_id: &str) -> Self {
-        self.model_id = Some(model_id.to_string());
-        self
-    }
-
-    /// Set the version.
-    pub fn with_version(mut self, version: u32) -> Self {
-        self.version = version;
-        self
-    }
-
-    /// Build a summary key.
-    pub fn summary_key(&self, content_fp: &Fingerprint) -> MemoKey {
-        MemoKey {
-            op_type: super::types::MemoOpType::Summary,
-            input_fp: *content_fp,
-            model_id: self.model_id.clone(),
-            version: self.version,
-            context_fp: Fingerprint::zero(),
-        }
-    }
-
-    /// Build a pilot decision key.
-    pub fn pilot_key(&self, context_fp: &Fingerprint, query_fp: &Fingerprint) -> MemoKey {
-        MemoKey {
-            op_type: super::types::MemoOpType::PilotDecision,
-            input_fp: *query_fp,
-            model_id: self.model_id.clone(),
-            version: self.version,
-            context_fp: *context_fp,
-        }
-    }
-
-    /// Build a query analysis key.
-    pub fn query_analysis_key(&self, query_fp: &Fingerprint) -> MemoKey {
-        MemoKey {
-            op_type: super::types::MemoOpType::QueryAnalysis,
-            input_fp: *query_fp,
-            model_id: self.model_id.clone(),
-            version: self.version,
-            context_fp: Fingerprint::zero(),
-        }
-    }
-
-    /// Build an extraction key.
- pub fn extraction_key(&self, content_fp: &Fingerprint) -> MemoKey { - MemoKey { - op_type: super::types::MemoOpType::Extraction, - input_fp: *content_fp, - model_id: self.model_id.clone(), - version: self.version, - context_fp: Fingerprint::zero(), - } - } -} - -impl Default for MemoKeyBuilder { - fn default() -> Self { - Self::new() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use tempfile::TempDir; - - fn make_test_key() -> MemoKey { - let fp = Fingerprint::from_str("test content"); - MemoKey::summary(&fp) - } - - #[test] - fn test_memo_store_basic() { - let store = MemoStore::new(); - let key = make_test_key(); - - assert!(!store.contains(&key)); - - store.put(key.clone(), MemoValue::Summary("Test summary".to_string())); - - assert!(store.contains(&key)); - - let value = store.get(&key); - assert!(value.is_some()); - assert_eq!(value.unwrap().as_summary(), Some("Test summary")); - } - - #[test] - fn test_memo_store_lru_eviction() { - let store = MemoStore::with_capacity(3); - - for i in 0..5 { - let fp = Fingerprint::from_str(&format!("content {}", i)); - let key = MemoKey::summary(&fp); - store.put(key, MemoValue::Summary(format!("Summary {}", i))); - } - - assert_eq!(store.len(), 3); - } - - #[tokio::test] - async fn test_memo_store_get_or_compute() { - let store = MemoStore::new(); - let key = make_test_key(); - - let call_count = Arc::new(std::sync::atomic::AtomicU64::new(0)); - let count_clone = call_count.clone(); - - // First call should compute - let result = store - .get_or_compute(key.clone(), || { - let c = count_clone.clone(); - async move { - c.fetch_add(1, std::sync::atomic::Ordering::SeqCst); - Ok((MemoValue::Summary("Computed".to_string()), 100)) - } - }) - .await - .unwrap(); - - assert_eq!(result.as_summary(), Some("Computed")); - assert_eq!(call_count.load(std::sync::atomic::Ordering::SeqCst), 1); - - // Second call should use cache - let result2 = store - .get_or_compute(key.clone(), || { - let c = count_clone.clone(); - async move { - c.fetch_add(1, std::sync::atomic::Ordering::SeqCst); - Ok((MemoValue::Summary("Should not be called".to_string()), 100)) - } - }) - .await - .unwrap(); - - assert_eq!(result2.as_summary(), Some("Computed")); - assert_eq!(call_count.load(std::sync::atomic::Ordering::SeqCst), 1); - } - - #[tokio::test] - async fn test_memo_store_persistence() { - let temp = TempDir::new().unwrap(); - let path = temp.path().join("memo.json"); - - let store = MemoStore::new(); - let key = make_test_key(); - - store.put_with_tokens( - key.clone(), - MemoValue::Summary("Test summary".to_string()), - 100, - ); - - // Save - store.save(&path).await.unwrap(); - assert!(path.exists()); - - // Load into new store - let store2 = MemoStore::new(); - store2.load(&path).await.unwrap(); - - assert!(store2.contains(&key)); - let value = store2.get(&key); - assert_eq!(value.unwrap().as_summary(), Some("Test summary")); - } - - #[tokio::test] - async fn test_memo_store_stats() { - let store = MemoStore::new(); - let key = make_test_key(); - - // Miss - store - .get_or_compute(key.clone(), || async { - Ok((MemoValue::Summary("Test".to_string()), 100)) - }) - .await - .unwrap(); - - // Hit - store - .get_or_compute(key.clone(), || async { - Ok((MemoValue::Summary("Should not be called".to_string()), 0)) - }) - .await - .unwrap(); - - let stats = store.stats(); - assert_eq!(stats.misses, 1); - assert_eq!(stats.hits, 1); - assert_eq!(stats.tokens_saved, 100); - } - - #[test] - fn test_memo_key_builder() { - let builder = 
MemoKeyBuilder::new().with_model("gpt-4").with_version(2);
-
-        let fp = Fingerprint::from_str("content");
-        let key = builder.summary_key(&fp);
-
-        assert_eq!(key.model_id, Some("gpt-4".to_string()));
-        assert_eq!(key.version, 2);
-    }
-}
diff --git a/vectorless-core/vectorless/src/llm/memo/types.rs b/vectorless-core/vectorless/src/llm/memo/types.rs
deleted file mode 100644
index a45aed12..00000000
--- a/vectorless-core/vectorless/src/llm/memo/types.rs
+++ /dev/null
@@ -1,414 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Types for the memoization system.
-
-use chrono::{DateTime, Utc};
-use serde::{Deserialize, Serialize};
-
-use crate::utils::fingerprint::Fingerprint;
-
-/// Types of operations that can be memoized.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
-pub enum MemoOpType {
-    /// Node summary generation.
-    Summary,
-
-    /// Pilot navigation decision.
-    PilotDecision,
-
-    /// Query analysis result.
-    QueryAnalysis,
-
-    /// Content extraction result.
-    Extraction,
-
-    /// LLM node evaluation during retrieval.
-    NodeEvaluation,
-
-    /// Sufficiency check result.
-    SufficiencyCheck,
-
-    /// Query complexity detection.
-    ComplexityDetection,
-
-    /// Query decomposition.
-    QueryDecomposition,
-
-    /// Custom operation type.
-    Custom(u8),
-}
-
-impl MemoOpType {
-    /// Get a unique byte identifier for this operation type.
-    pub fn as_byte(&self) -> u8 {
-        match self {
-            MemoOpType::Summary => 0,
-            MemoOpType::PilotDecision => 1,
-            MemoOpType::QueryAnalysis => 2,
-            MemoOpType::Extraction => 3,
-            MemoOpType::NodeEvaluation => 4,
-            MemoOpType::SufficiencyCheck => 5,
-            MemoOpType::ComplexityDetection => 6,
-            MemoOpType::QueryDecomposition => 7,
-            MemoOpType::Custom(n) => 100 + n,
-        }
-    }
-}
-
-/// Key for memoization lookup.
-///
-/// Keys are content-addressed using fingerprints, ensuring that
-/// cache hits only occur when the input is semantically identical.
-#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
-pub struct MemoKey {
-    /// Type of operation being memoized.
-    pub op_type: MemoOpType,
-
-    /// Fingerprint of the input content.
-    pub input_fp: Fingerprint,
-
-    /// Optional model identifier for cache invalidation when model changes.
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub model_id: Option<String>,
-
-    /// Optional version for cache invalidation when algorithm changes.
-    #[serde(default)]
-    pub version: u32,
-
-    /// Additional context fingerprint (e.g., query context for pilot decisions).
-    #[serde(default, skip_serializing_if = "Fingerprint::is_zero")]
-    pub context_fp: Fingerprint,
-}
-
-impl MemoKey {
-    /// Create a key for summary generation.
-    pub fn summary(content_fp: &Fingerprint) -> Self {
-        Self {
-            op_type: MemoOpType::Summary,
-            input_fp: *content_fp,
-            model_id: None,
-            version: 1,
-            context_fp: Fingerprint::zero(),
-        }
-    }
-
-    /// Create a key for summary generation with model and version.
-    pub fn summary_with_model(content_fp: &Fingerprint, model_id: &str, version: u32) -> Self {
-        Self {
-            op_type: MemoOpType::Summary,
-            input_fp: *content_fp,
-            model_id: Some(model_id.to_string()),
-            version,
-            context_fp: Fingerprint::zero(),
-        }
-    }
-
-    /// Create a key for pilot decision.
-    pub fn pilot_decision(context_fp: &Fingerprint, query_fp: &Fingerprint) -> Self {
-        Self {
-            op_type: MemoOpType::PilotDecision,
-            input_fp: *query_fp,
-            model_id: None,
-            version: 1,
-            context_fp: *context_fp,
-        }
-    }
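Because keys are content-addressed, equality of fingerprints, not object identity, decides cache hits. A quick sketch using only the constructors defined in this file:

```rust
let fp = Fingerprint::from_str("section body");

// Identical (op, input, model, version) => identical key fingerprint.
let a = MemoKey::summary_with_model(&fp, "gpt-4o", 1);
let b = MemoKey::summary_with_model(&fp, "gpt-4o", 1);
assert_eq!(a.fingerprint(), b.fingerprint());

// Bumping the version is a deliberate cache-buster.
let c = a.clone().with_version(2);
assert_ne!(b.fingerprint(), c.fingerprint());
```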
-
-    /// Create a key for query analysis.
-    pub fn query_analysis(query_fp: &Fingerprint) -> Self {
-        Self {
-            op_type: MemoOpType::QueryAnalysis,
-            input_fp: *query_fp,
-            model_id: None,
-            version: 1,
-            context_fp: Fingerprint::zero(),
-        }
-    }
-
-    /// Create a key for content extraction.
-    pub fn extraction(content_fp: &Fingerprint) -> Self {
-        Self {
-            op_type: MemoOpType::Extraction,
-            input_fp: *content_fp,
-            model_id: None,
-            version: 1,
-            context_fp: Fingerprint::zero(),
-        }
-    }
-
-    /// Set the model identifier.
-    pub fn with_model(mut self, model_id: &str) -> Self {
-        self.model_id = Some(model_id.to_string());
-        self
-    }
-
-    /// Set the version.
-    pub fn with_version(mut self, version: u32) -> Self {
-        self.version = version;
-        self
-    }
-
-    /// Set the context fingerprint.
-    pub fn with_context(mut self, context_fp: &Fingerprint) -> Self {
-        self.context_fp = *context_fp;
-        self
-    }
-
-    /// Compute a fingerprint of this key for storage.
-    pub fn fingerprint(&self) -> Fingerprint {
-        use crate::utils::fingerprint::Fingerprinter;
-
-        let mut fp = Fingerprinter::new();
-        fp.write_u64(self.op_type.as_byte() as u64);
-        fp.write_fingerprint(&self.input_fp);
-        fp.write_option_str(self.model_id.as_deref());
-        fp.write_u64(self.version as u64);
-        if !self.context_fp.is_zero() {
-            fp.write_fingerprint(&self.context_fp);
-        }
-        fp.into_fingerprint()
-    }
-}
-
-/// Cached value from an LLM operation.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub enum MemoValue {
-    /// Generated summary text.
-    Summary(String),
-
-    /// Pilot navigation decision.
-    PilotDecision(PilotDecisionValue),
-
-    /// Query analysis result.
-    QueryAnalysis(QueryAnalysisValue),
-
-    /// Extracted content.
-    Extraction(serde_json::Value),
-
-    /// Raw text (for custom operations).
-    Text(String),
-
-    /// JSON value (for structured outputs).
-    Json(serde_json::Value),
-}
-
-/// Serializable pilot decision value.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PilotDecisionValue {
-    /// Selected candidate index.
-    pub selected_idx: usize,
-
-    /// Confidence score (0.0 to 1.0).
-    pub confidence: f32,
-
-    /// Reasoning text.
-    pub reasoning: String,
-}
-
-/// Serializable query analysis value.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct QueryAnalysisValue {
-    /// Query complexity score.
-    pub complexity: f32,
-
-    /// Detected intent.
-    pub intent: String,
-
-    /// Suggested strategy.
-    pub strategy: String,
-}
-
-impl MemoValue {
-    /// Get the value as a string summary.
-    pub fn as_summary(&self) -> Option<&str> {
-        match self {
-            MemoValue::Summary(s) => Some(s),
-            _ => None,
-        }
-    }
-
-    /// Get the value as text.
-    pub fn as_text(&self) -> Option<&str> {
-        match self {
-            MemoValue::Text(s) => Some(s),
-            MemoValue::Summary(s) => Some(s),
-            _ => None,
-        }
-    }
-
-    /// Check if this is a summary value.
-    pub fn is_summary(&self) -> bool {
-        matches!(self, MemoValue::Summary(_))
-    }
-}
-
-/// A cached entry in the memo store.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct MemoEntry {
-    /// The cached value.
-    pub value: MemoValue,
-
-    /// When this entry was created.
-    pub created_at: DateTime<Utc>,
-
-    /// When this entry was last accessed.
-    pub last_accessed: DateTime<Utc>,
-
-    /// Number of cache hits.
-    pub hits: u64,
-
-    /// Token cost saved by this cache entry.
-    pub tokens_saved: u64,
-}
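The entry lifecycle in brief: creation stamps both timestamps, hits update `last_accessed`, and expiry compares age against the store's TTL. A minimal sketch against the `MemoEntry` API below:

```rust
let mut entry = MemoEntry::with_tokens(MemoValue::Summary("cached".to_string()), 500);
entry.record_hit();

assert_eq!(entry.hits, 1);
assert_eq!(entry.tokens_saved, 500);
assert!(!entry.is_expired(chrono::Duration::days(7)));
```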
-
-impl MemoEntry {
-    /// Create a new entry.
-    pub fn new(value: MemoValue) -> Self {
-        let now = Utc::now();
-        Self {
-            value,
-            created_at: now,
-            last_accessed: now,
-            hits: 0,
-            tokens_saved: 0,
-        }
-    }
-
-    /// Create a new entry with token count.
-    pub fn with_tokens(value: MemoValue, tokens_saved: u64) -> Self {
-        Self {
-            tokens_saved,
-            ..Self::new(value)
-        }
-    }
-
-    /// Record a cache hit.
-    pub fn record_hit(&mut self) {
-        self.hits += 1;
-        self.last_accessed = Utc::now();
-    }
-
-    /// Check if this entry has expired.
-    pub fn is_expired(&self, ttl: chrono::Duration) -> bool {
-        let now = Utc::now();
-        now - self.created_at > ttl
-    }
-
-    /// Get the age of this entry.
-    pub fn age(&self) -> chrono::Duration {
-        Utc::now() - self.created_at
-    }
-}
-
-/// Statistics for the memo store.
-#[derive(Debug, Clone, Default, Serialize, Deserialize)]
-pub struct MemoStats {
-    /// Total number of cache entries.
-    pub entries: usize,
-
-    /// Total cache hits.
-    pub hits: u64,
-
-    /// Total cache misses.
-    pub misses: u64,
-
-    /// Total tokens saved by cache hits.
-    pub tokens_saved: u64,
-
-    /// Estimated cost saved (in USD).
-    pub cost_saved: f64,
-}
-
-impl MemoStats {
-    /// Calculate the cache hit rate.
-    pub fn hit_rate(&self) -> f64 {
-        let total = self.hits + self.misses;
-        if total == 0 {
-            0.0
-        } else {
-            self.hits as f64 / total as f64
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_memo_key_summary() {
-        let fp = Fingerprint::from_str("test content");
-        let key = MemoKey::summary(&fp);
-
-        assert_eq!(key.op_type, MemoOpType::Summary);
-        assert_eq!(key.input_fp, fp);
-        assert!(key.model_id.is_none());
-    }
-
-    #[test]
-    fn test_memo_key_with_model() {
-        let fp = Fingerprint::from_str("test content");
-        let key = MemoKey::summary(&fp).with_model("gpt-4").with_version(2);
-
-        assert_eq!(key.model_id, Some("gpt-4".to_string()));
-        assert_eq!(key.version, 2);
-    }
-
-    #[test]
-    fn test_memo_key_fingerprint() {
-        let fp = Fingerprint::from_str("test content");
-        let key1 = MemoKey::summary(&fp);
-        let key2 = MemoKey::summary(&fp);
-
-        assert_eq!(key1.fingerprint(), key2.fingerprint());
-
-        let key3 = MemoKey::summary_with_model(&fp, "gpt-4", 1);
-        assert_ne!(key1.fingerprint(), key3.fingerprint());
-    }
-
-    #[test]
-    fn test_memo_entry() {
-        let entry = MemoEntry::new(MemoValue::Summary("Test summary".to_string()));
-
-        assert_eq!(entry.hits, 0);
-        assert!(entry.value.as_summary().is_some());
-    }
-
-    #[test]
-    fn test_memo_entry_hit() {
-        let mut entry = MemoEntry::new(MemoValue::Summary("Test summary".to_string()));
-        entry.record_hit();
-        entry.record_hit();
-
-        assert_eq!(entry.hits, 2);
-    }
-
-    #[test]
-    fn test_memo_stats_hit_rate() {
-        let mut stats = MemoStats::default();
-        stats.hits = 80;
-        stats.misses = 20;
-
-        assert!((stats.hit_rate() - 0.8).abs() < 0.001);
-    }
-
-    #[test]
-    fn test_memo_key_serialization() {
-        let fp = Fingerprint::from_str("test content");
-        let key = MemoKey::summary_with_model(&fp, "gpt-4", 1);
-
-        let json = serde_json::to_string(&key).unwrap();
-        let decoded: MemoKey = serde_json::from_str(&json).unwrap();
-
-        assert_eq!(key, decoded);
-    }
-
-    #[test]
-    fn test_memo_value_serialization() {
-        let value = MemoValue::Summary("Test summary".to_string());
-        let json = serde_json::to_string(&value).unwrap();
-        let decoded: MemoValue = serde_json::from_str(&json).unwrap();
-        assert_eq!(value.as_summary(), decoded.as_summary());
-    }
-}
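For orientation, the one derived statistic: `hit_rate` is hits over total lookups, so a warm store with 80 hits and 20 misses reports 0.8. A worked sketch with the public fields above:

```rust
let stats = MemoStats {
    entries: 42,
    hits: 80,
    misses: 20,
    tokens_saved: 96_000,
    cost_saved: 0.0,
};
// 80 / (80 + 20) = 0.8
assert!((stats.hit_rate() - 0.8).abs() < 1e-9);
```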
diff --git a/vectorless-core/vectorless/src/llm/mod.rs b/vectorless-core/vectorless/src/llm/mod.rs
deleted file mode 100644
index bd65e58a..00000000
--- a/vectorless-core/vectorless/src/llm/mod.rs
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Unified LLM client module.
-//!
-//! This module provides a unified interface for all LLM operations across the codebase:
-//! - **Index** — Document indexing and summarization
-//! - **Retrieval** — Document tree navigation
-//! - **Pilot** — Navigation guidance
-//!
-//! # Architecture
-//!
-//! ```text
-//! ┌─────────────────────────────────────────────────────────────────┐
-//! │                             LlmPool                             │
-//! │                                                                 │
-//! │   ┌─────────────┐     ┌─────────────┐     ┌─────────────┐       │
-//! │   │    index    │     │  retrieval  │     │    pilot    │       │
-//! │   │  LlmClient  │     │  LlmClient  │     │  LlmClient  │       │
-//! │   └──────┬──────┘     └──────┬──────┘     └──────┬──────┘       │
-//! │          │                   │                   │              │
-//! │          └───────────────────┼───────────────────┘              │
-//! │                              │                                  │
-//! │                              ▼                                  │
-//! │                  ┌─────────────────────┐                        │
-//! │                  │    async-openai     │                        │
-//! │                  └─────────────────────┘                        │
-//! └─────────────────────────────────────────────────────────────────┘
-//! ```
-
-mod client;
-pub(crate) mod config;
-mod error;
-mod executor;
-mod fallback;
-pub(crate) mod memo;
-mod pool;
-pub(crate) mod throttle;
-
-pub use client::LlmClient;
-pub use error::LlmResult;
-pub use pool::LlmPool;
diff --git a/vectorless-core/vectorless/src/llm/pool.rs b/vectorless-core/vectorless/src/llm/pool.rs
deleted file mode 100644
index 9acef8ca..00000000
--- a/vectorless-core/vectorless/src/llm/pool.rs
+++ /dev/null
@@ -1,176 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! LLM client pool for managing multiple clients.
-
-use std::sync::Arc;
-
-use super::client::LlmClient;
-use super::config::LlmConfig;
-use super::fallback::{FallbackChain, FallbackConfig};
-use super::throttle::ConcurrencyController;
-use crate::metrics::MetricsHub;
-
-/// Pool of LLM clients for different purposes.
-///
-/// This provides a centralized way to access LLM clients
-/// configured for specific tasks:
-/// - **Index** — Document indexing/summarization (fast, cheap model)
-/// - **Retrieval** — Document navigation (capable model)
-///
-/// # Construction
-///
-/// The pool is built from a [`config::LlmConfig`](crate::config::LlmConfig)
-/// which defines the global credentials and per-slot overrides.
-///
-/// ```rust,ignore
-/// use vectorless::llm::LlmPool;
-///
-/// let pool = LlmPool::from_config(&config.llm, None);
-///
-/// // Use index client for summarization
-/// let summary = pool.index().complete(
-///     "You summarize text concisely.",
-///     "Long text to summarize..."
-/// ).await?;
-/// ```
-#[derive(Debug, Clone)]
-pub struct LlmPool {
-    index: Arc<LlmClient>,
-    retrieval: Arc<LlmClient>,
-}
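A wiring sketch that follows the doc example above (`config` holding the crate's `LlmConfig`, and the `complete` call shown there, are assumed): build the pool once, share the metrics hub, and route summarization through the cheap index slot.

```rust
let hub = MetricsHub::shared();
let pool = LlmPool::from_config(&config.llm, Some(hub.clone()));

// The index slot is tuned for fast, cheap summarization calls.
let summary = pool
    .index()
    .complete("You summarize text concisely.", "Long text to summarize...")
    .await?;
```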
-
-impl LlmPool {
-    /// Create a pool from the unified LLM configuration.
-    ///
-    /// Resolves per-slot model overrides and creates individual
-    /// [`LlmClient`] instances with the appropriate settings.
-    /// When `metrics` is provided, all clients share the same hub
-    /// for unified LLM call statistics.
-    pub fn from_config(
-        config: &crate::config::LlmConfig,
-        metrics: Option<Arc<MetricsHub>>,
-    ) -> Self {
-        let api_key = config.api_key.clone();
-        let endpoint = config.endpoint.clone().unwrap_or_default();
-        let retry = config.retry.to_runtime_config();
-
-        let make_config = |slot: &crate::config::SlotConfig| -> LlmConfig {
-            LlmConfig {
-                model: config.resolve_model(slot),
-                endpoint: endpoint.clone(),
-                api_key: api_key.clone(),
-                max_tokens: slot.max_tokens,
-                temperature: slot.temperature,
-                retry: retry.clone(),
-                request_timeout_secs: 0,
-            }
-        };
-
-        // Build a single shared async-openai client (reuses connection pool)
-        let openai_base = if endpoint.is_empty() {
-            "https://api.openai.com/v1".to_string()
-        } else {
-            endpoint.clone()
-        };
-        let openai_client = Arc::new(async_openai::Client::with_config(
-            async_openai::config::OpenAIConfig::new()
-                .with_api_key(api_key.clone().unwrap_or_default())
-                .with_api_base(openai_base),
-        ));
-
-        // Attach shared throttle controller from config
-        let concurrency_config = config.throttle.to_runtime_config();
-        let controller = Arc::new(ConcurrencyController::new(concurrency_config));
-
-        // Attach shared fallback chain from config
-        let fallback_config: FallbackConfig = config.fallback.clone().into();
-        let fallback_chain = Arc::new(FallbackChain::new(fallback_config));
-
-        let build_client = |slot_config: &crate::config::SlotConfig| {
-            let mut client = LlmClient::new(make_config(slot_config))
-                .with_shared_concurrency(controller.clone())
-                .with_shared_openai_client(openai_client.clone())
-                .with_shared_fallback(fallback_chain.clone());
-            if let Some(ref hub) = metrics {
-                client = client.with_shared_metrics(hub.clone());
-            }
-            Arc::new(client)
-        };
-
-        Self {
-            index: build_client(&config.index),
-            retrieval: build_client(&config.retrieval),
-        }
-    }
-
-    /// Create a pool with default configurations.
-    pub fn from_defaults() -> Self {
-        Self::from_config(&crate::config::LlmConfig::default(), None)
-    }
-
-    /// Get the index client.
-    pub fn index(&self) -> &LlmClient {
-        &self.index
-    }
-
-    /// Get the retrieval client.
-    pub fn retrieval(&self) -> &LlmClient {
-        &self.retrieval
-    }
-}
-
-impl Default for LlmPool {
-    fn default() -> Self {
-        Self::from_defaults()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_pool_from_config() {
-        let config = crate::config::LlmConfig::new("gpt-4o")
-            .with_api_key("sk-test")
-            .with_endpoint("https://api.openai.com/v1")
-            .with_index(crate::config::SlotConfig::fast().with_model("gpt-4o-mini"));
-
-        let pool = LlmPool::from_config(&config, None);
-
-        assert_eq!(pool.index().config().model, "gpt-4o-mini");
-        assert_eq!(pool.retrieval().config().model, "gpt-4o");
-        assert_eq!(pool.index().config().max_tokens, 100);
-    }
-
-    #[test]
-    fn test_pool_from_config_with_metrics() {
-        let config = crate::config::LlmConfig::new("gpt-4o")
-            .with_api_key("sk-test")
-            .with_endpoint("https://api.openai.com/v1");
-
-        let hub = MetricsHub::shared();
-        let pool = LlmPool::from_config(&config, Some(hub.clone()));
-
-        assert!(pool.index().fallback().is_some());
-        assert!(pool.retrieval().fallback().is_some());
-
-        assert_eq!(pool.index().config().model, "gpt-4o");
-        assert_eq!(pool.retrieval().config().model, "gpt-4o");
-    }
-
-    #[test]
-    fn test_pool_shared_metrics_hub() {
-        let config = crate::config::LlmConfig::new("gpt-4o")
-            .with_api_key("sk-test")
-            .with_endpoint("https://api.openai.com/v1");
-
-        let hub = MetricsHub::shared();
-        let _pool = LlmPool::from_config(&config, Some(hub.clone()));
-
-        // Hub is shared with both clients — Arc refcount > 1
-        assert!(Arc::strong_count(&hub) > 1);
-    }
-}
diff --git a/vectorless-core/vectorless/src/llm/throttle.rs b/vectorless-core/vectorless/src/llm/throttle.rs
deleted file mode 100644
index 5de96743..00000000
--- a/vectorless-core/vectorless/src/llm/throttle.rs
+++ /dev/null
@@ -1,259 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Concurrency control for LLM API calls.
-//!
-//! Combines semaphore (concurrency limit) with token-bucket rate limiter (RPM).
-
-use std::num::NonZeroU32;
-use std::sync::Arc;
-
-use governor::{
-    Quota, RateLimiter as GovernorLimiter,
-    clock::{Clock, DefaultClock},
-    state::{InMemoryState, NotKeyed},
-};
-use serde::{Deserialize, Serialize};
-use tokio::sync::{Semaphore, SemaphorePermit};
-use tracing::{debug, trace};
-
-// ============================================================
-// ConcurrencyConfig
-// ============================================================
-
-/// Concurrency control configuration.
-///
-/// Controls how LLM requests are rate-limited and throttled
-/// to avoid overwhelming the API.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ConcurrencyConfig {
-    /// Maximum concurrent LLM API calls.
-    #[serde(default = "default_max_concurrent_requests")]
-    pub max_concurrent_requests: usize,
-
-    /// Rate limit: requests per minute (token bucket).
-    #[serde(default = "default_requests_per_minute")]
-    pub requests_per_minute: usize,
-
-    /// Whether rate limiting is enabled.
-    #[serde(default = "default_true")]
-    pub enabled: bool,
-
-    /// Whether semaphore-based concurrency limiting is enabled.
-    #[serde(default = "default_true")]
-    pub semaphore_enabled: bool,
-}
-
-fn default_max_concurrent_requests() -> usize {
-    10
-}
-fn default_requests_per_minute() -> usize {
-    500
-}
-fn default_true() -> bool {
-    true
-}
-
-impl Default for ConcurrencyConfig {
-    fn default() -> Self {
-        Self {
-            max_concurrent_requests: default_max_concurrent_requests(),
-            requests_per_minute: default_requests_per_minute(),
-            enabled: true,
-            semaphore_enabled: true,
-        }
-    }
-}
-
-impl ConcurrencyConfig {
-    /// Create a new config with defaults.
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Set the maximum concurrent requests.
-    pub fn with_max_concurrent_requests(mut self, max: usize) -> Self {
-        self.max_concurrent_requests = max;
-        self
-    }
-
-    /// Set the requests per minute rate limit.
-    pub fn with_requests_per_minute(mut self, rpm: usize) -> Self {
-        self.requests_per_minute = rpm;
-        self
-    }
-
-    /// Enable or disable rate limiting.
-    pub fn with_enabled(mut self, enabled: bool) -> Self {
-        self.enabled = enabled;
-        self
-    }
-
-    /// Create a config for conservative scenarios.
-    pub fn conservative() -> Self {
-        Self {
-            max_concurrent_requests: 5,
-            requests_per_minute: 100,
-            enabled: true,
-            semaphore_enabled: true,
-        }
-    }
-
-    /// Create a config that disables all limits.
-    pub fn unlimited() -> Self {
-        Self {
-            max_concurrent_requests: usize::MAX,
-            requests_per_minute: usize::MAX,
-            enabled: false,
-            semaphore_enabled: false,
-        }
-    }
-}
-
-// ============================================================
-// ConcurrencyController
-// ============================================================
-
-/// Concurrency controller for LLM API calls.
-///
-/// Combines:
-/// - **Rate Limiter** — Token bucket to limit requests per time period
-/// - **Semaphore** — Limit concurrent requests
-///
-/// The only operation needed by business code is [`acquire()`](ConcurrencyController::acquire).
-#[derive(Clone)]
-pub struct ConcurrencyController {
-    semaphore: Arc<Semaphore>,
-    rate_limiter: Option<Arc<GovernorLimiter<NotKeyed, InMemoryState, DefaultClock>>>,
-    semaphore_enabled: bool,
-}
-
-impl ConcurrencyController {
-    /// Create a new concurrency controller with the given configuration.
-    pub fn new(config: ConcurrencyConfig) -> Self {
-        // Clamp to tokio's permit ceiling so `unlimited()` configs don't panic.
-        let semaphore = Arc::new(Semaphore::new(
-            config.max_concurrent_requests.min(Semaphore::MAX_PERMITS),
-        ));
-        let rate_limiter = if config.enabled {
-            let rpm = NonZeroU32::new(config.requests_per_minute as u32)
-                .unwrap_or_else(|| NonZeroU32::new(1).unwrap());
-            Some(Arc::new(GovernorLimiter::direct(Quota::per_minute(rpm))))
-        } else {
-            None
-        };
-
-        Self {
-            semaphore,
-            rate_limiter,
-            semaphore_enabled: config.semaphore_enabled,
-        }
-    }
-
-    /// Create a controller with default configuration.
-    pub fn with_defaults() -> Self {
-        Self::new(ConcurrencyConfig::default())
-    }
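A call-site sketch: hold the permit across the request, and dropping it releases the slot; the RPM wait happens inside `acquire` itself. Only the APIs defined in this file are used:

```rust
let controller = ConcurrencyController::new(
    ConcurrencyConfig::new()
        .with_max_concurrent_requests(4)
        .with_requests_per_minute(120),
);

// `acquire` returns None when semaphore limiting is disabled; binding
// the Option keeps any permit alive for the duration of the call.
let _permit = controller.acquire().await;
// ... issue the LLM request here ...
```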
-
-    /// Acquire a permit for making an LLM request.
-    ///
-    /// This will:
-    /// 1. Wait for the rate limiter (if enabled)
-    /// 2. Acquire a semaphore permit (if enabled)
-    ///
-    /// The permit is automatically released when dropped.
-    pub async fn acquire(&self) -> Option<SemaphorePermit<'_>> {
-        // Step 1: Wait for rate limiter
-        if let Some(ref limiter) = self.rate_limiter {
-            let clock = DefaultClock::default();
-            loop {
-                match limiter.check() {
-                    Ok(_) => {
-                        trace!("Rate limiter: token acquired");
-                        break;
-                    }
-                    Err(negative) => {
-                        let wait_duration = negative.wait_time_from(clock.now());
-                        trace!(
-                            wait_ms = wait_duration.as_millis() as u64,
-                            "Rate limiter: waiting for token"
-                        );
-                        tokio::time::sleep(wait_duration).await;
-                    }
-                }
-            }
-            debug!("Rate limiter: token acquired");
-        }
-
-        // Step 2: Acquire semaphore permit
-        if self.semaphore_enabled {
-            trace!("Waiting for semaphore permit");
-            let permit = self
-                .semaphore
-                .acquire()
-                .await
-                .expect("semaphore should not be closed");
-            debug!(
-                "Semaphore: permit acquired (available: {})",
-                self.semaphore.available_permits()
-            );
-            Some(permit)
-        } else {
-            None
-        }
-    }
-}
-
-impl std::fmt::Debug for ConcurrencyController {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("ConcurrencyController")
-            .field("available_permits", &self.semaphore.available_permits())
-            .field("has_rate_limiter", &self.rate_limiter.is_some())
-            .field("semaphore_enabled", &self.semaphore_enabled)
-            .finish()
-    }
-}
-
-impl Default for ConcurrencyController {
-    fn default() -> Self {
-        Self::with_defaults()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[tokio::test]
-    async fn test_controller_acquire() {
-        let controller = ConcurrencyController::new(ConcurrencyConfig {
-            max_concurrent_requests: 2,
-            requests_per_minute: 100,
-            enabled: false,
-            semaphore_enabled: true,
-        });
-
-        let permit1 = controller.acquire().await;
-        assert!(permit1.is_some());
-
-        let permit2 = controller.acquire().await;
-        assert!(permit2.is_some());
-
-        drop(permit1);
-    }
-
-    #[test]
-    fn test_controller_creation() {
-        let controller = ConcurrencyController::with_defaults();
-        assert!(controller.semaphore.available_permits() > 0);
-    }
-
-    #[test]
-    fn test_rate_limiter_creation() {
-        let config = ConcurrencyConfig {
-            max_concurrent_requests: 10,
-            requests_per_minute: 100,
-            enabled: true,
-            semaphore_enabled: true,
-        };
-        let controller = ConcurrencyController::new(config);
-        assert!(controller.rate_limiter.is_some());
-    }
-}
diff --git a/vectorless-core/vectorless/src/metrics/hub.rs b/vectorless-core/vectorless/src/metrics/hub.rs
deleted file mode 100644
index c4f70fe4..00000000
--- a/vectorless-core/vectorless/src/metrics/hub.rs
+++ /dev/null
@@ -1,324 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Central metrics hub for unified collection.
-
-use std::sync::Arc;
-
-use super::llm::{LlmMetrics, LlmMetricsReport};
-use super::retrieval::{RetrievalMetrics, RetrievalMetricsReport};
-use crate::config::MetricsConfig;
-
-/// Central metrics hub for unified collection.
-///
-/// Provides a single point for all metrics collection across:
-/// - LLM operations (tokens, latency, cost)
-/// - Retrieval operations (paths, scores, cache)
-///
-/// # Thread Safety
-///
-/// All metrics use atomic operations and are safe to use from multiple threads.
-///
-/// # Example
-///
-/// ```rust
-/// use vectorless::config::MetricsConfig;
-/// use vectorless::metrics::MetricsHub;
-///
-/// let config = MetricsConfig::default();
-/// let hub = MetricsHub::new(config);
-///
-/// // Record LLM call
-/// hub.record_llm_call(100, 50, 150, true);
-///
-/// // Get report
-/// let report = hub.generate_report();
-/// ```
-#[derive(Debug)]
-pub struct MetricsHub {
-    config: MetricsConfig,
-    llm: LlmMetrics,
-    retrieval: RetrievalMetrics,
-}
-
-impl MetricsHub {
-    /// Create a new metrics hub.
-    pub fn new(config: MetricsConfig) -> Self {
-        Self {
-            config,
-            llm: LlmMetrics::new(),
-            retrieval: RetrievalMetrics::new(),
-        }
-    }
-
-    /// Create a new metrics hub with defaults.
-    pub fn with_defaults() -> Self {
-        Self::new(MetricsConfig::default())
-    }
-
-    /// Create an Arc-wrapped metrics hub.
-    pub fn shared() -> Arc<Self> {
-        Arc::new(Self::with_defaults())
-    }
-
-    /// Create an Arc-wrapped metrics hub with config.
-    pub fn shared_with_config(config: MetricsConfig) -> Arc<Self> {
-        Arc::new(Self::new(config))
-    }
-
-    /// Check if metrics are enabled.
-    pub fn is_enabled(&self) -> bool {
-        self.config.enabled
-    }
-
-    /// Get the configuration.
-    pub fn config(&self) -> &MetricsConfig {
-        &self.config
-    }
-
-    // ========================================================================
-    // LLM Metrics
-    // ========================================================================
-
-    /// Record an LLM call.
-    pub fn record_llm_call(
-        &self,
-        input_tokens: u64,
-        output_tokens: u64,
-        latency_ms: u64,
-        success: bool,
-    ) {
-        if !self.config.enabled || !self.config.llm.track_tokens {
-            return;
-        }
-        self.llm.record_call(
-            input_tokens,
-            output_tokens,
-            latency_ms,
-            success,
-            &self.config.llm,
-        );
-    }
-
-    /// Record an LLM rate limit error.
-    pub fn record_llm_rate_limit(&self) {
-        if self.config.enabled {
-            self.llm.record_rate_limit();
-        }
-    }
-
-    /// Record an LLM timeout error.
-    pub fn record_llm_timeout(&self) {
-        if self.config.enabled {
-            self.llm.record_timeout();
-        }
-    }
-
-    /// Record an LLM fallback trigger.
-    pub fn record_llm_fallback(&self) {
-        if self.config.enabled {
-            self.llm.record_fallback();
-        }
-    }
-
-    /// Get LLM metrics report.
-    pub fn llm_report(&self) -> LlmMetricsReport {
-        self.llm.generate_report()
-    }
-
-    // ========================================================================
-    // Retrieval Metrics
-    // ========================================================================
-
-    /// Record a retrieval query.
-    pub fn record_retrieval_query(&self, iterations: u64, nodes_visited: u64, latency_ms: u64) {
-        if !self.config.enabled {
-            return;
-        }
-        self.retrieval.record_query(
-            iterations,
-            nodes_visited,
-            latency_ms,
-            &self.config.retrieval,
-        );
-    }
-
-    /// Record a found path.
-    pub fn record_retrieval_path(&self, length: u64, score: f64) {
-        if !self.config.enabled {
-            return;
-        }
-        self.retrieval
-            .record_path(length, score, &self.config.retrieval);
-    }
-
-    /// Record a cache hit.
-    pub fn record_cache_hit(&self) {
-        if !self.config.enabled || !self.config.retrieval.track_cache {
-            return;
-        }
-        self.retrieval.record_cache_hit(&self.config.retrieval);
-    }
-
-    /// Record a cache miss.
-    pub fn record_cache_miss(&self) {
-        if !self.config.enabled || !self.config.retrieval.track_cache {
-            return;
-        }
-        self.retrieval.record_cache_miss(&self.config.retrieval);
-    }
-
-    /// Record a backtrack.
-    pub fn record_backtrack(&self) {
-        if self.config.enabled {
-            self.retrieval.record_backtrack();
-        }
-    }
-
-    /// Record a sufficiency check.
- pub fn record_sufficiency_check(&self, was_sufficient: bool) { - if self.config.enabled { - self.retrieval.record_sufficiency_check(was_sufficient); - } - } - - /// Get retrieval metrics report. - pub fn retrieval_report(&self) -> RetrievalMetricsReport { - self.retrieval.generate_report() - } - - // ======================================================================== - // General Operations - // ======================================================================== - - /// Reset all metrics. - pub fn reset(&self) { - self.llm.reset(); - self.retrieval.reset(); - } - - /// Generate a complete report. - pub fn generate_report(&self) -> MetricsReport { - MetricsReport { - llm: self.llm_report(), - retrieval: self.retrieval_report(), - } - } -} - -impl Default for MetricsHub { - fn default() -> Self { - Self::with_defaults() - } -} - -/// Complete metrics report. -#[derive(Debug, Clone)] -pub struct MetricsReport { - /// LLM metrics. - pub llm: LlmMetricsReport, - /// Retrieval metrics. - pub retrieval: RetrievalMetricsReport, -} - -impl MetricsReport { - /// Calculate total estimated cost in USD. - pub fn total_cost_usd(&self) -> f64 { - self.llm.estimated_cost_usd - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_metrics_hub_recording() { - let hub = MetricsHub::with_defaults(); - - hub.record_llm_call(100, 50, 150, true); - hub.record_retrieval_query(5, 10, 100); - - let report = hub.generate_report(); - - assert_eq!(report.llm.total_calls, 1); - assert_eq!(report.retrieval.total_queries, 1); - } - - #[test] - fn test_metrics_hub_disabled() { - let config = MetricsConfig::disabled(); - let hub = MetricsHub::new(config); - - hub.record_llm_call(100, 50, 150, true); - - let report = hub.generate_report(); - - assert_eq!(report.llm.total_calls, 0); - } - - #[test] - fn test_metrics_hub_reset() { - let hub = MetricsHub::with_defaults(); - - hub.record_llm_call(100, 50, 150, true); - hub.reset(); - - let report = hub.generate_report(); - assert_eq!(report.llm.total_calls, 0); - } - - #[test] - fn test_llm_metrics_success_and_failure() { - let hub = MetricsHub::with_defaults(); - - hub.record_llm_call(100, 50, 150, true); - hub.record_llm_call(200, 100, 300, true); - hub.record_llm_call(0, 0, 50, false); - - let report = hub.llm_report(); - assert_eq!(report.total_calls, 3); - assert_eq!(report.successful_calls, 2); - assert_eq!(report.failed_calls, 1); - assert!((report.success_rate - 0.666).abs() < 0.01); - assert_eq!(report.total_input_tokens, 300); - assert_eq!(report.total_output_tokens, 150); - } - - #[test] - fn test_llm_error_events() { - let hub = MetricsHub::with_defaults(); - - hub.record_llm_rate_limit(); - hub.record_llm_rate_limit(); - hub.record_llm_timeout(); - hub.record_llm_fallback(); - - let report = hub.llm_report(); - assert_eq!(report.rate_limit_errors, 2); - assert_eq!(report.timeout_errors, 1); - assert_eq!(report.fallback_triggers, 1); - } - - #[test] - fn test_shared_arc_metrics() { - let hub = MetricsHub::shared(); - - let hub2 = hub.clone(); - hub.record_llm_call(100, 50, 100, true); - hub2.record_llm_call(200, 100, 200, true); - - let report = hub.generate_report(); - assert_eq!(report.llm.total_calls, 2); - assert_eq!(report.llm.total_input_tokens, 300); - } - - #[test] - fn test_metrics_report_cost() { - let hub = MetricsHub::with_defaults(); - - hub.record_llm_call(1000, 500, 200, true); - - let report = hub.generate_report(); - assert!(report.total_cost_usd() >= 0.0); - } -} diff --git 
a/vectorless-core/vectorless/src/metrics/index.rs b/vectorless-core/vectorless/src/metrics/index.rs deleted file mode 100644 index 3d1e5569..00000000 --- a/vectorless-core/vectorless/src/metrics/index.rs +++ /dev/null @@ -1,199 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Indexing pipeline metrics. - -use serde::{Deserialize, Serialize}; - -/// Performance metrics for the indexing pipeline. -#[derive(Debug, Clone, Default, Serialize, Deserialize)] -pub struct IndexMetrics { - /// Parse stage duration (ms). - #[serde(default)] - pub parse_time_ms: u64, - - /// Build stage duration (ms). - #[serde(default)] - pub build_time_ms: u64, - - /// Enhance stage duration (ms). - #[serde(default)] - pub enhance_time_ms: u64, - - /// Enrich stage duration (ms). - #[serde(default)] - pub enrich_time_ms: u64, - - /// Optimize stage duration (ms). - #[serde(default)] - pub optimize_time_ms: u64, - - /// Validate stage duration (ms). - #[serde(default)] - pub validate_time_ms: u64, - - /// Split stage duration (ms). - #[serde(default)] - pub split_time_ms: u64, - - /// Reasoning index build duration (ms). - #[serde(default)] - pub reasoning_index_time_ms: u64, - - /// Navigation index build duration (ms). - #[serde(default)] - pub navigation_index_time_ms: u64, - - /// Number of nav entries in navigation index. - #[serde(default)] - pub nav_entries_indexed: usize, - - /// Number of child routes in navigation index. - #[serde(default)] - pub child_routes_indexed: usize, - - /// Number of topics indexed in reasoning index. - #[serde(default)] - pub topics_indexed: usize, - - /// Number of keywords indexed in reasoning index. - #[serde(default)] - pub keywords_indexed: usize, - - /// Total tokens generated (summaries). - #[serde(default)] - pub total_tokens_generated: usize, - - /// Number of LLM calls. - #[serde(default)] - pub llm_calls: usize, - - /// Number of nodes processed. - #[serde(default)] - pub nodes_processed: usize, - - /// Number of summaries generated. - #[serde(default)] - pub summaries_generated: usize, - - /// Number of summaries that failed to generate (LLM error, rate limit, etc.). - #[serde(default)] - pub summaries_failed: usize, - - /// Number of nodes skipped (thinning). - #[serde(default)] - pub nodes_skipped: usize, - - /// Number of nodes merged. - #[serde(default)] - pub nodes_merged: usize, -} - -impl IndexMetrics { - /// Create new metrics with start time. - pub fn new() -> Self { - Self::default() - } - - /// Record parse stage time. - pub fn record_parse(&mut self, duration_ms: u64) { - self.parse_time_ms = duration_ms; - } - - /// Record build stage time. - pub fn record_build(&mut self, duration_ms: u64) { - self.build_time_ms = duration_ms; - } - - /// Record enhance stage time. - pub fn record_enhance(&mut self, duration_ms: u64) { - self.enhance_time_ms = duration_ms; - } - - /// Record enrich stage time. - pub fn record_enrich(&mut self, duration_ms: u64) { - self.enrich_time_ms = duration_ms; - } - - /// Record optimize stage time. - pub fn record_optimize(&mut self, duration_ms: u64) { - self.optimize_time_ms = duration_ms; - } - - /// Record validate stage time. - pub fn record_validate(&mut self, duration_ms: u64) { - self.validate_time_ms = duration_ms; - } - - /// Record split stage time. - pub fn record_split(&mut self, duration_ms: u64) { - self.split_time_ms = duration_ms; - } - - /// Record reasoning index build time. 
- pub fn record_reasoning_index(&mut self, duration_ms: u64, topics: usize, keywords: usize) { - self.reasoning_index_time_ms = duration_ms; - self.topics_indexed = topics; - self.keywords_indexed = keywords; - } - - /// Record navigation index build time. - pub fn record_navigation_index( - &mut self, - duration_ms: u64, - nav_entries: usize, - child_routes: usize, - ) { - self.navigation_index_time_ms = duration_ms; - self.nav_entries_indexed = nav_entries; - self.child_routes_indexed = child_routes; - } - - /// Increment LLM calls. - pub fn increment_llm_calls(&mut self) { - self.llm_calls += 1; - } - - /// Add to tokens generated. - pub fn add_tokens_generated(&mut self, tokens: usize) { - self.total_tokens_generated += tokens; - } - - /// Set nodes processed. - pub fn set_nodes_processed(&mut self, count: usize) { - self.nodes_processed = count; - } - - /// Increment summaries generated. - pub fn increment_summaries(&mut self) { - self.summaries_generated += 1; - } - - /// Add to summaries failed count. - pub fn add_summaries_failed(&mut self, count: usize) { - self.summaries_failed += count; - } - - /// Increment nodes skipped. - pub fn increment_nodes_skipped(&mut self) { - self.nodes_skipped += 1; - } - - /// Increment nodes merged. - pub fn increment_nodes_merged(&mut self) { - self.nodes_merged += 1; - } - - /// Get total time. - pub fn total_time_ms(&self) -> u64 { - self.parse_time_ms - + self.build_time_ms - + self.validate_time_ms - + self.split_time_ms - + self.enhance_time_ms - + self.enrich_time_ms - + self.reasoning_index_time_ms - + self.navigation_index_time_ms - + self.optimize_time_ms - } -} diff --git a/vectorless-core/vectorless/src/metrics/llm.rs b/vectorless-core/vectorless/src/metrics/llm.rs deleted file mode 100644 index 257747ae..00000000 --- a/vectorless-core/vectorless/src/metrics/llm.rs +++ /dev/null @@ -1,207 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! LLM metrics collection. - -use std::sync::atomic::{AtomicU64, Ordering}; - -use crate::config::LlmMetricsConfig; - -/// LLM metrics tracker. -#[derive(Debug, Default)] -pub struct LlmMetrics { - /// Total number of LLM calls. - pub total_calls: AtomicU64, - /// Number of successful calls. - pub successful_calls: AtomicU64, - /// Number of failed calls. - pub failed_calls: AtomicU64, - /// Total input tokens. - pub total_input_tokens: AtomicU64, - /// Total output tokens. - pub total_output_tokens: AtomicU64, - /// Total latency in milliseconds. - pub total_latency_ms: AtomicU64, - /// Estimated cost in micro-dollars. - pub estimated_cost_micros: AtomicU64, - /// Number of rate limit errors. - pub rate_limit_errors: AtomicU64, - /// Number of timeout errors. - pub timeout_errors: AtomicU64, - /// Number of fallback triggers. - pub fallback_triggers: AtomicU64, -} - -impl LlmMetrics { - /// Create new LLM metrics. - pub fn new() -> Self { - Self::default() - } - - /// Record an LLM call. 
- pub fn record_call( - &self, - input_tokens: u64, - output_tokens: u64, - latency_ms: u64, - success: bool, - config: &LlmMetricsConfig, - ) { - self.total_calls.fetch_add(1, Ordering::Relaxed); - - if success { - self.successful_calls.fetch_add(1, Ordering::Relaxed); - } else { - self.failed_calls.fetch_add(1, Ordering::Relaxed); - } - - if config.track_tokens { - self.total_input_tokens - .fetch_add(input_tokens, Ordering::Relaxed); - self.total_output_tokens - .fetch_add(output_tokens, Ordering::Relaxed); - } - - if config.track_latency { - self.total_latency_ms - .fetch_add(latency_ms, Ordering::Relaxed); - } - - if config.track_cost { - let cost = config.calculate_cost(input_tokens, output_tokens); - // Store in micro-dollars for precision - let cost_micros = (cost * 1_000_000.0) as u64; - self.estimated_cost_micros - .fetch_add(cost_micros, Ordering::Relaxed); - } - } - - /// Record a rate limit error. - pub fn record_rate_limit(&self) { - self.rate_limit_errors.fetch_add(1, Ordering::Relaxed); - } - - /// Record a timeout error. - pub fn record_timeout(&self) { - self.timeout_errors.fetch_add(1, Ordering::Relaxed); - } - - /// Record a fallback trigger. - pub fn record_fallback(&self) { - self.fallback_triggers.fetch_add(1, Ordering::Relaxed); - } - - /// Reset all metrics. - pub fn reset(&self) { - self.total_calls.store(0, Ordering::Relaxed); - self.successful_calls.store(0, Ordering::Relaxed); - self.failed_calls.store(0, Ordering::Relaxed); - self.total_input_tokens.store(0, Ordering::Relaxed); - self.total_output_tokens.store(0, Ordering::Relaxed); - self.total_latency_ms.store(0, Ordering::Relaxed); - self.estimated_cost_micros.store(0, Ordering::Relaxed); - self.rate_limit_errors.store(0, Ordering::Relaxed); - self.timeout_errors.store(0, Ordering::Relaxed); - self.fallback_triggers.store(0, Ordering::Relaxed); - } - - /// Generate a report snapshot. - pub fn generate_report(&self) -> LlmMetricsReport { - let total_calls = self.total_calls.load(Ordering::Relaxed); - let successful = self.successful_calls.load(Ordering::Relaxed); - let failed = self.failed_calls.load(Ordering::Relaxed); - let total_latency = self.total_latency_ms.load(Ordering::Relaxed); - - LlmMetricsReport { - total_calls, - successful_calls: successful, - failed_calls: failed, - success_rate: if total_calls > 0 { - successful as f64 / total_calls as f64 - } else { - 0.0 - }, - total_input_tokens: self.total_input_tokens.load(Ordering::Relaxed), - total_output_tokens: self.total_output_tokens.load(Ordering::Relaxed), - total_tokens: self.total_input_tokens.load(Ordering::Relaxed) - + self.total_output_tokens.load(Ordering::Relaxed), - avg_latency_ms: if total_calls > 0 { - total_latency as f64 / total_calls as f64 - } else { - 0.0 - }, - total_latency_ms: total_latency, - estimated_cost_usd: self.estimated_cost_micros.load(Ordering::Relaxed) as f64 - / 1_000_000.0, - rate_limit_errors: self.rate_limit_errors.load(Ordering::Relaxed), - timeout_errors: self.timeout_errors.load(Ordering::Relaxed), - fallback_triggers: self.fallback_triggers.load(Ordering::Relaxed), - } - } -} - -/// LLM metrics report. -#[derive(Debug, Clone)] -pub struct LlmMetricsReport { - /// Total number of LLM calls. - pub total_calls: u64, - /// Number of successful calls. - pub successful_calls: u64, - /// Number of failed calls. - pub failed_calls: u64, - /// Success rate (0.0 - 1.0). - pub success_rate: f64, - /// Total input tokens. - pub total_input_tokens: u64, - /// Total output tokens. 
-    pub total_output_tokens: u64,
-    /// Total tokens (input + output).
-    pub total_tokens: u64,
-    /// Average latency in milliseconds.
-    pub avg_latency_ms: f64,
-    /// Total latency in milliseconds.
-    pub total_latency_ms: u64,
-    /// Estimated cost in USD.
-    pub estimated_cost_usd: f64,
-    /// Number of rate limit errors.
-    pub rate_limit_errors: u64,
-    /// Number of timeout errors.
-    pub timeout_errors: u64,
-    /// Number of fallback triggers.
-    pub fallback_triggers: u64,
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_llm_metrics_recording() {
-        let config = LlmMetricsConfig::default();
-        let metrics = LlmMetrics::new();
-
-        metrics.record_call(100, 50, 150, true, &config);
-        metrics.record_call(200, 100, 300, true, &config);
-        metrics.record_call(100, 0, 0, false, &config);
-
-        let report = metrics.generate_report();
-        assert_eq!(report.total_calls, 3);
-        assert_eq!(report.successful_calls, 2);
-        assert_eq!(report.failed_calls, 1);
-        assert!((report.success_rate - 0.666666).abs() < 0.01);
-        assert_eq!(report.total_input_tokens, 400);
-        assert_eq!(report.total_output_tokens, 150);
-    }
-
-    #[test]
-    fn test_llm_metrics_reset() {
-        let config = LlmMetricsConfig::default();
-        let metrics = LlmMetrics::new();
-
-        metrics.record_call(100, 50, 150, true, &config);
-        metrics.reset();
-
-        let report = metrics.generate_report();
-        assert_eq!(report.total_calls, 0);
-    }
-}
diff --git a/vectorless-core/vectorless/src/metrics/mod.rs b/vectorless-core/vectorless/src/metrics/mod.rs
deleted file mode 100644
index 26ab6411..00000000
--- a/vectorless-core/vectorless/src/metrics/mod.rs
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Unified metrics collection for Vectorless.
-//!
-//! This module provides centralized metrics collection across all components:
-//! - **LLM Metrics** — Token usage, latency, cost
-//! - **Retrieval Metrics** — Paths, scores, iterations, cache
-//!
-//! # Architecture
-//!
-//! ```text
-//! ┌─────────────────────────────────────────────────────────────────┐
-//! │                           MetricsHub                            │
-//! │                                                                 │
-//! │   ┌─────────────┐          ┌──────────────────┐                 │
-//! │   │ LlmMetrics  │          │ RetrievalMetrics │                 │
-//! │   │             │          │                  │                 │
-//! │   │ - tokens    │          │ - paths          │                 │
-//! │   │ - latency   │          │ - scores         │                 │
-//! │   │ - cost      │          │ - cache          │                 │
-//! │   └─────────────┘          └──────────────────┘                 │
-//! │                                                                 │
-//! │   ┌─────────────────────────────────────────────────────────┐   │
-//! │   │                      MetricsReport                      │   │
-//! │   │                                                         │   │
-//! │   │   Aggregated report with all metrics and statistics     │   │
-//! │   └─────────────────────────────────────────────────────────┘   │
-//! └─────────────────────────────────────────────────────────────────┘
-//! ```
-//!
-//! # Example
-//!
-//! ```rust
-//! use vectorless::config::MetricsConfig;
-//! use vectorless::metrics::MetricsHub;
-//!
-//! let config = MetricsConfig::default();
-//! let hub = MetricsHub::new(config);
-//!
-//! // Record LLM call
-//! hub.record_llm_call(100, 50, 150, true);
-//!
-//! // Generate report
-//! let report = hub.generate_report();
-//! println!("Total cost: ${:.4}", report.llm.estimated_cost_usd);
-//!
``` - -mod hub; -mod index; -mod llm; -mod retrieval; - -pub use hub::{MetricsHub, MetricsReport}; -pub use index::IndexMetrics; -pub use llm::LlmMetricsReport; -pub use retrieval::RetrievalMetricsReport; diff --git a/vectorless-core/vectorless/src/metrics/retrieval.rs b/vectorless-core/vectorless/src/metrics/retrieval.rs deleted file mode 100644 index 682250e9..00000000 --- a/vectorless-core/vectorless/src/metrics/retrieval.rs +++ /dev/null @@ -1,263 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Retrieval metrics collection. - -use std::sync::atomic::{AtomicU64, Ordering}; - -use crate::config::RetrievalMetricsConfig; - -/// Retrieval metrics tracker. -#[derive(Debug, Default)] -pub struct RetrievalMetrics { - /// Total number of queries. - pub total_queries: AtomicU64, - /// Total number of search iterations. - pub total_iterations: AtomicU64, - /// Sum of iterations (for average). - pub iterations_sum: AtomicU64, - /// Total number of nodes visited. - pub nodes_visited: AtomicU64, - /// Total number of paths found. - pub paths_found: AtomicU64, - /// Sum of path lengths (for average). - pub path_length_sum: AtomicU64, - /// Sum of path scores stored as scaled integer (multiply by 1_000_000 for actual value). - pub path_score_sum_scaled: AtomicU64, - /// Number of paths with score >= 0.5. - pub high_score_paths: AtomicU64, - /// Number of paths with score < 0.3. - pub low_score_paths: AtomicU64, - /// Number of cache hits. - pub cache_hits: AtomicU64, - /// Number of cache misses. - pub cache_misses: AtomicU64, - /// Total latency in milliseconds. - pub total_latency_ms: AtomicU64, - /// Number of backtracks. - pub backtracks: AtomicU64, - /// Number of sufficiency checks. - pub sufficiency_checks: AtomicU64, - /// Number of times content was sufficient. - pub sufficient_results: AtomicU64, -} - -impl RetrievalMetrics { - /// Create new retrieval metrics. - pub fn new() -> Self { - Self::default() - } - - /// Record a query. - pub fn record_query( - &self, - iterations: u64, - nodes: u64, - latency_ms: u64, - config: &RetrievalMetricsConfig, - ) { - self.total_queries.fetch_add(1, Ordering::Relaxed); - - if config.track_iterations { - self.total_iterations - .fetch_add(iterations, Ordering::Relaxed); - self.iterations_sum.fetch_add(iterations, Ordering::Relaxed); - } - - if config.track_paths { - self.nodes_visited.fetch_add(nodes, Ordering::Relaxed); - } - - self.total_latency_ms - .fetch_add(latency_ms, Ordering::Relaxed); - } - - /// Record a found path. - pub fn record_path(&self, length: u64, score: f64, config: &RetrievalMetricsConfig) { - if !config.track_paths { - return; - } - - self.paths_found.fetch_add(1, Ordering::Relaxed); - self.path_length_sum.fetch_add(length, Ordering::Relaxed); - - if config.track_scores { - let scaled_score = (score * 1_000_000.0) as u64; - self.path_score_sum_scaled - .fetch_add(scaled_score, Ordering::Relaxed); - - if score >= 0.5 { - self.high_score_paths.fetch_add(1, Ordering::Relaxed); - } else if score < 0.3 { - self.low_score_paths.fetch_add(1, Ordering::Relaxed); - } - } - } - - /// Record a cache hit. - pub fn record_cache_hit(&self, config: &RetrievalMetricsConfig) { - if config.track_cache { - self.cache_hits.fetch_add(1, Ordering::Relaxed); - } - } - - /// Record a cache miss. - pub fn record_cache_miss(&self, config: &RetrievalMetricsConfig) { - if config.track_cache { - self.cache_misses.fetch_add(1, Ordering::Relaxed); - } - } - - /// Record a backtrack. 
- pub fn record_backtrack(&self) { - self.backtracks.fetch_add(1, Ordering::Relaxed); - } - - /// Record a sufficiency check. - pub fn record_sufficiency_check(&self, was_sufficient: bool) { - self.sufficiency_checks.fetch_add(1, Ordering::Relaxed); - if was_sufficient { - self.sufficient_results.fetch_add(1, Ordering::Relaxed); - } - } - - /// Reset all metrics. - pub fn reset(&self) { - self.total_queries.store(0, Ordering::Relaxed); - self.total_iterations.store(0, Ordering::Relaxed); - self.iterations_sum.store(0, Ordering::Relaxed); - self.nodes_visited.store(0, Ordering::Relaxed); - self.paths_found.store(0, Ordering::Relaxed); - self.path_length_sum.store(0, Ordering::Relaxed); - self.path_score_sum_scaled.store(0, Ordering::Relaxed); - self.high_score_paths.store(0, Ordering::Relaxed); - self.low_score_paths.store(0, Ordering::Relaxed); - self.cache_hits.store(0, Ordering::Relaxed); - self.cache_misses.store(0, Ordering::Relaxed); - self.total_latency_ms.store(0, Ordering::Relaxed); - self.backtracks.store(0, Ordering::Relaxed); - self.sufficiency_checks.store(0, Ordering::Relaxed); - self.sufficient_results.store(0, Ordering::Relaxed); - } - - /// Generate a report snapshot. - pub fn generate_report(&self) -> RetrievalMetricsReport { - let total_queries = self.total_queries.load(Ordering::Relaxed); - let paths_found = self.paths_found.load(Ordering::Relaxed); - let cache_hits = self.cache_hits.load(Ordering::Relaxed); - let cache_misses = self.cache_misses.load(Ordering::Relaxed); - let total_cache = cache_hits + cache_misses; - let sufficiency_checks = self.sufficiency_checks.load(Ordering::Relaxed); - - RetrievalMetricsReport { - total_queries, - total_iterations: self.total_iterations.load(Ordering::Relaxed), - avg_iterations: if total_queries > 0 { - self.iterations_sum.load(Ordering::Relaxed) as f64 / total_queries as f64 - } else { - 0.0 - }, - nodes_visited: self.nodes_visited.load(Ordering::Relaxed), - paths_found, - avg_path_length: if paths_found > 0 { - self.path_length_sum.load(Ordering::Relaxed) as f64 / paths_found as f64 - } else { - 0.0 - }, - avg_path_score: if paths_found > 0 { - (self.path_score_sum_scaled.load(Ordering::Relaxed) as f64 / 1_000_000.0) - / paths_found as f64 - } else { - 0.0 - }, - high_score_paths: self.high_score_paths.load(Ordering::Relaxed), - low_score_paths: self.low_score_paths.load(Ordering::Relaxed), - cache_hits, - cache_misses, - cache_hit_rate: if total_cache > 0 { - cache_hits as f64 / total_cache as f64 - } else { - 0.0 - }, - total_latency_ms: self.total_latency_ms.load(Ordering::Relaxed), - avg_latency_ms: if total_queries > 0 { - self.total_latency_ms.load(Ordering::Relaxed) as f64 / total_queries as f64 - } else { - 0.0 - }, - backtracks: self.backtracks.load(Ordering::Relaxed), - sufficiency_checks, - sufficiency_rate: if sufficiency_checks > 0 { - self.sufficient_results.load(Ordering::Relaxed) as f64 / sufficiency_checks as f64 - } else { - 0.0 - }, - } - } -} - -/// Retrieval metrics report. -#[derive(Debug, Clone)] -pub struct RetrievalMetricsReport { - /// Total number of queries. - pub total_queries: u64, - /// Total number of iterations. - pub total_iterations: u64, - /// Average iterations per query. - pub avg_iterations: f64, - /// Total nodes visited. - pub nodes_visited: u64, - /// Total paths found. - pub paths_found: u64, - /// Average path length. - pub avg_path_length: f64, - /// Average path score. - pub avg_path_score: f64, - /// Number of high-score paths (>= 0.5). 
-    pub high_score_paths: u64,
-    /// Number of low-score paths (< 0.3).
-    pub low_score_paths: u64,
-    /// Number of cache hits.
-    pub cache_hits: u64,
-    /// Number of cache misses.
-    pub cache_misses: u64,
-    /// Cache hit rate.
-    pub cache_hit_rate: f64,
-    /// Total latency in milliseconds.
-    pub total_latency_ms: u64,
-    /// Average latency per query in milliseconds.
-    pub avg_latency_ms: f64,
-    /// Number of backtracks.
-    pub backtracks: u64,
-    /// Number of sufficiency checks.
-    pub sufficiency_checks: u64,
-    /// Sufficiency rate.
-    pub sufficiency_rate: f64,
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_retrieval_metrics_recording() {
-        let config = RetrievalMetricsConfig::default();
-        let metrics = RetrievalMetrics::new();
-
-        metrics.record_query(5, 10, 100, &config);
-        metrics.record_query(3, 8, 80, &config);
-
-        metrics.record_path(3, 0.8, &config);
-        metrics.record_path(2, 0.2, &config);
-
-        metrics.record_cache_hit(&config);
-        metrics.record_cache_hit(&config);
-        metrics.record_cache_miss(&config);
-
-        let report = metrics.generate_report();
-        assert_eq!(report.total_queries, 2);
-        assert_eq!(report.total_iterations, 8);
-        assert_eq!(report.paths_found, 2);
-        assert!((report.cache_hit_rate - 0.666).abs() < 0.01);
-    }
-}
diff --git a/vectorless-core/vectorless/src/query/mod.rs b/vectorless-core/vectorless/src/query/mod.rs
deleted file mode 100644
index bbe6806d..00000000
--- a/vectorless-core/vectorless/src/query/mod.rs
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Query understanding and planning.
-//!
-//! Analyzes a user's raw query and produces a structured [`QueryPlan`]
-//! for downstream modules (Orchestrator, Worker).
-//!
-//! # Pipeline
-//!
-//! ```text
-//! raw query string
-//!   → extract keywords (from scoring/bm25)
-//!   → LLM query understanding (intent, concepts, complexity)
-//!   → QueryPlan
-//! ```
-//!
-//! LLM understanding is required — this is a pure reasoning engine.
-//! Errors are propagated, not silently degraded.
-
-mod types;
-mod understand;
-
-pub use types::{QueryIntent, QueryPlan};
-
-use crate::llm::LlmClient;
-use crate::scoring::bm25::extract_keywords;
-
-/// Query understanding pipeline.
-///
-/// Produces a [`QueryPlan`] from a raw query string via LLM analysis.
-pub struct QueryPipeline;
-
-impl QueryPipeline {
-    /// Analyze a query and produce a structured plan.
-    ///
-    /// 1. Extract keywords (zero-cost, no LLM)
-    /// 2. LLM deep understanding (intent, concepts, complexity, strategy)
-    ///
-    /// Errors propagate — the caller handles retries or failure.
-    pub async fn understand(query: &str, llm: &LlmClient) -> crate::error::Result<QueryPlan> {
-        let keywords = extract_keywords(query);
-        understand::understand(query, &keywords, llm).await
-    }
-}
diff --git a/vectorless-core/vectorless/src/query/types.rs b/vectorless-core/vectorless/src/query/types.rs
deleted file mode 100644
index f8e025e8..00000000
--- a/vectorless-core/vectorless/src/query/types.rs
+++ /dev/null
@@ -1,114 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Core types for query understanding.
-
-use serde::{Deserialize, Serialize};
-
-/// Query intent classification.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
-pub enum QueryIntent {
-    /// Factoid: "What is the Q3 2024 revenue?"
-    Factual,
-    /// Analytical: "Compare market risk vs operational risk"
-    Analytical,
-    /// Navigational: "Find the section on compliance policy"
-    Navigational,
-    /// Summary: "Summarize the main points of this document"
-    Summary,
-}
-
-impl Default for QueryIntent {
-    fn default() -> Self {
-        Self::Factual
-    }
-}
-
-impl std::fmt::Display for QueryIntent {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            QueryIntent::Factual => write!(f, "factual"),
-            QueryIntent::Analytical => write!(f, "analytical"),
-            QueryIntent::Navigational => write!(f, "navigational"),
-            QueryIntent::Summary => write!(f, "summary"),
-        }
-    }
-}
-
-/// Query complexity estimation.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
-pub enum Complexity {
-    /// Single keyword, simple factoid.
-    Simple,
-    /// Multi-concept, requires synthesis.
-    Moderate,
-    /// Cross-document, comparative, or multi-faceted.
-    Complex,
-}
-
-impl Default for Complexity {
-    fn default() -> Self {
-        Self::Simple
-    }
-}
-
-impl std::fmt::Display for Complexity {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            Complexity::Simple => write!(f, "simple"),
-            Complexity::Moderate => write!(f, "moderate"),
-            Complexity::Complex => write!(f, "complex"),
-        }
-    }
-}
-
-/// A sub-query produced by decomposition.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct SubQuery {
-    /// The sub-query text.
-    pub query: String,
-    /// Intent of this sub-query.
-    pub intent: QueryIntent,
-    /// Pre-identified target documents (if any).
-    pub target_docs: Option<Vec<String>>,
-}
-
-/// A structured query plan — the output of the query understanding pipeline.
-///
-/// Produced by `QueryPipeline::understand()`. Consumed by the Orchestrator
-/// and Worker agents for strategy selection.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct QueryPlan {
-    /// The original raw query string.
-    pub original: String,
-    /// Detected intent.
-    pub intent: QueryIntent,
-    /// Extracted keywords.
-    pub keywords: Vec<String>,
-    /// Key concepts identified by LLM (distinct from keywords).
-    pub key_concepts: Vec<String>,
-    /// Strategy hint for navigation agents.
-    pub strategy_hint: String,
-    /// Estimated complexity.
-    pub complexity: Complexity,
-    /// Rewritten queries (produced by LLM for better matching).
-    pub rewritten: Vec<String>,
-    /// Decomposed sub-queries (for complex/multi-faceted queries).
-    pub sub_queries: Vec<SubQuery>,
-}
-
-impl QueryPlan {
-    /// Produce a minimal fallback plan when no LLM analysis is available.
-    pub fn default_for(query: &str, keywords: Vec<String>) -> Self {
-        Self {
-            original: query.to_string(),
-            intent: QueryIntent::Factual,
-            keywords,
-            key_concepts: Vec::new(),
-            strategy_hint: "focused".to_string(),
-            complexity: Complexity::Simple,
-            rewritten: Vec::new(),
-            sub_queries: Vec::new(),
-        }
-    }
-}
diff --git a/vectorless-core/vectorless/src/query/understand.rs b/vectorless-core/vectorless/src/query/understand.rs
deleted file mode 100644
index 9790e557..00000000
--- a/vectorless-core/vectorless/src/query/understand.rs
+++ /dev/null
@@ -1,246 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! LLM-driven query understanding.
-//!
-//! Uses an LLM call to analyze the query and produce a structured [`QueryPlan`].
-//! Errors propagate — there is no silent keyword-only fallback.
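-//!
-//! Call shape (illustrative sketch; `llm` is an assumed, already-configured client):
-//!
-//! ```rust,ignore
-//! let keywords = extract_keywords("compare market vs operational risk");
-//! let plan = understand("compare market vs operational risk", &keywords, &llm).await?;
-//! println!("intent={}, complexity={}", plan.intent, plan.complexity);
-//! ```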
-
-use serde::Deserialize;
-use tracing::{info, warn};
-
-use crate::llm::LlmClient;
-
-use super::types::{Complexity, QueryIntent, QueryPlan, SubQuery};
-
-/// Structured analysis returned by the LLM.
-#[derive(Deserialize)]
-struct QueryAnalysis {
-    intent: String,
-    key_concepts: Vec<String>,
-    strategy_hint: String,
-    complexity: String,
-    rewritten: Option<String>,
-    sub_queries: Vec<String>,
-}
-
-/// Use the LLM to understand the query and produce a QueryPlan.
-///
-/// Propagates LLM errors — no silent degradation. The caller decides
-/// how to handle failure (retry, abort, etc.).
-pub async fn understand(
-    query: &str,
-    keywords: &[String],
-    llm: &LlmClient,
-) -> crate::error::Result<QueryPlan> {
-    let (system, user) = understand_prompt(query, keywords);
-    info!("Query understanding: calling LLM...");
-    let response = llm.complete(&system, &user).await?;
-
-    if response.trim().is_empty() {
-        warn!("Query understanding: LLM returned empty response");
-        return Err(crate::error::Error::Config(
-            "Query understanding failed: LLM returned an empty response. \
-             Check your API key, model, and endpoint configuration."
-                .to_string(),
-        ));
-    }
-
-    let analysis = parse_analysis(&response).ok_or_else(|| {
-        // Truncate on a char boundary — a raw byte slice at index 300 can
-        // panic on multi-byte UTF-8 responses.
-        let preview: String = response.chars().take(300).collect();
-        crate::error::Error::Config(format!(
-            "Query understanding returned unparseable response ({} bytes): {}",
-            response.len(),
-            preview
-        ))
-    })?;
-
-    info!(
-        intent = %analysis.intent,
-        complexity = %analysis.complexity,
-        concepts = ?analysis.key_concepts,
-        strategy = %analysis.strategy_hint,
-        rewritten = ?analysis.rewritten,
-        "Query understanding complete"
-    );
-    Ok(analysis.into_plan(query, keywords))
-}
-
-/// Parse the LLM's JSON response into a QueryAnalysis.
-fn parse_analysis(response: &str) -> Option<QueryAnalysis> {
-    let trimmed = response.trim();
-
-    // Try to extract JSON from the response (LLM may wrap it in markdown)
-    let json_str = if trimmed.starts_with("```") {
-        // Find the first newline after the opening fence (skips the language tag)
-        let after_fence = if let Some(nl) = trimmed.find('\n') {
-            &trimmed[nl + 1..]
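-            // e.g. for "```json\n{...}\n```" the slice now starts at the
-            // first character after the language-tag line.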
- } else { - trimmed - }; - // Strip the closing fence - let without_end = if let Some(end) = after_fence.rfind("```") { - &after_fence[..end] - } else { - after_fence - }; - without_end.trim() - } else { - trimmed - }; - - match serde_json::from_str(json_str) { - Ok(analysis) => Some(analysis), - Err(e) => { - warn!( - error = %e, - json_len = json_str.len(), - "Query understanding: JSON parse failed" - ); - None - } - } -} - -impl QueryAnalysis { - fn into_plan(self, query: &str, keywords: &[String]) -> QueryPlan { - QueryPlan { - original: query.to_string(), - intent: parse_intent(&self.intent), - keywords: keywords.to_vec(), - key_concepts: self.key_concepts, - strategy_hint: self.strategy_hint, - complexity: parse_complexity(&self.complexity), - rewritten: self.rewritten.into_iter().collect(), - sub_queries: self - .sub_queries - .into_iter() - .map(|sq| SubQuery { - query: sq, - intent: QueryIntent::Factual, - target_docs: None, - }) - .collect(), - } - } -} - -fn parse_intent(s: &str) -> QueryIntent { - match s.to_lowercase().as_str() { - "analytical" | "analysis" | "compare" | "comparison" => QueryIntent::Analytical, - "navigational" | "navigation" | "find" | "locate" => QueryIntent::Navigational, - "summary" | "summarize" | "overview" => QueryIntent::Summary, - _ => QueryIntent::Factual, - } -} - -fn parse_complexity(s: &str) -> Complexity { - match s.to_lowercase().as_str() { - "complex" | "high" => Complexity::Complex, - "moderate" | "medium" => Complexity::Moderate, - _ => Complexity::Simple, - } -} - -/// Build the LLM prompt for query understanding. -fn understand_prompt(query: &str, keywords: &[String]) -> (String, String) { - let system = r#"You are a query analysis engine. Analyze the user's query and respond with a JSON object containing: - -- "intent": one of "factual", "analytical", "navigational", "summary" -- "key_concepts": array of the main concepts/entities in the query (distinct from keywords) -- "strategy_hint": one of "focused" (single-topic), "exploratory" (broad scan), "comparative" (cross-reference), or "summary" (aggregate) -- "complexity": one of "simple", "moderate", "complex" -- "rewritten": optional rewritten version of the query for better retrieval (null if not needed) -- "sub_queries": array of sub-query strings if the query can be decomposed (empty array if not) - -Respond with ONLY the JSON object, no additional text."#; - - let user = format!( - "Query: {}\nExtracted keywords: [{}]", - query, - keywords.join(", ") - ); - - (system.to_string(), user) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parse_intent() { - assert_eq!(parse_intent("factual"), QueryIntent::Factual); - assert_eq!(parse_intent("analytical"), QueryIntent::Analytical); - assert_eq!(parse_intent("analysis"), QueryIntent::Analytical); - assert_eq!(parse_intent("navigational"), QueryIntent::Navigational); - assert_eq!(parse_intent("summary"), QueryIntent::Summary); - assert_eq!(parse_intent("unknown"), QueryIntent::Factual); - } - - #[test] - fn test_parse_complexity() { - assert_eq!(parse_complexity("simple"), Complexity::Simple); - assert_eq!(parse_complexity("moderate"), Complexity::Moderate); - assert_eq!(parse_complexity("complex"), Complexity::Complex); - assert_eq!(parse_complexity("high"), Complexity::Complex); - assert_eq!(parse_complexity("unknown"), Complexity::Simple); - } - - #[test] - fn test_parse_analysis_json() { - let response = 
r#"{"intent":"factual","key_concepts":["revenue","Q3"],"strategy_hint":"focused","complexity":"simple","rewritten":null,"sub_queries":[]}"#; - let analysis = parse_analysis(response).unwrap(); - assert_eq!(analysis.intent, "factual"); - assert_eq!(analysis.key_concepts.len(), 2); - assert!(analysis.rewritten.is_none()); - } - - #[test] - fn test_parse_analysis_markdown_wrapped() { - let response = "```json\n{\"intent\":\"analytical\",\"key_concepts\":[\"risk\"],\"strategy_hint\":\"comparative\",\"complexity\":\"moderate\",\"rewritten\":\"compare risks\",\"sub_queries\":[]}\n```"; - let analysis = parse_analysis(response).unwrap(); - assert_eq!(analysis.intent, "analytical"); - } - - #[test] - fn test_parse_analysis_invalid() { - assert!(parse_analysis("not json").is_none()); - } - - #[test] - fn test_parse_analysis_code_fence_no_newline() { - // Edge case: ```json{"intent":...}``` with no newline after language tag - let response = "```json\n{\"intent\":\"factual\",\"key_concepts\":[\"test\"],\"strategy_hint\":\"focused\",\"complexity\":\"simple\",\"rewritten\":null,\"sub_queries\":[]}\n```"; - let analysis = parse_analysis(response).unwrap(); - assert_eq!(analysis.intent, "factual"); - } - - #[test] - fn test_parse_analysis_code_fence_no_closing() { - // LLM sometimes omits the closing fence - let response = "```json\n{\"intent\":\"summary\",\"key_concepts\":[\"overview\"],\"strategy_hint\":\"summary\",\"complexity\":\"simple\",\"rewritten\":null,\"sub_queries\":[]}"; - let analysis = parse_analysis(response).unwrap(); - assert_eq!(analysis.intent, "summary"); - } - - #[test] - fn test_parse_analysis_keys_starting_with_fence_letters() { - // The old trim_start_matches(|c| 'j' | 's' | 'o' | 'n') would eat - // JSON keys starting with those letters. Verify this works correctly. - let response = r#"{"intent":"navigational","key_concepts":["journal","offset","node"],"strategy_hint":"focused","complexity":"moderate","rewritten":null,"sub_queries":[]}"#; - let analysis = parse_analysis(response).unwrap(); - assert_eq!(analysis.intent, "navigational"); - assert_eq!(analysis.key_concepts, vec!["journal", "offset", "node"]); - } - - #[test] - fn test_default_plan() { - let plan = QueryPlan::default_for("test query", vec!["test".to_string()]); - assert_eq!(plan.original, "test query"); - assert_eq!(plan.intent, QueryIntent::Factual); - assert_eq!(plan.keywords.len(), 1); - assert!(plan.key_concepts.is_empty()); - assert!(plan.sub_queries.is_empty()); - } -} diff --git a/vectorless-core/vectorless/src/rerank/dedup.rs b/vectorless-core/vectorless/src/rerank/dedup.rs deleted file mode 100644 index 8644a932..00000000 --- a/vectorless-core/vectorless/src/rerank/dedup.rs +++ /dev/null @@ -1,216 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Evidence deduplication and quality filtering. - -use std::collections::HashSet; - -use crate::agent::Evidence; - -/// Minimum characters for an evidence item to be considered meaningful. -const MIN_EVIDENCE_CHARS: usize = 50; - -/// Jaccard similarity threshold for content dedup. -const SIMILARITY_THRESHOLD: f64 = 0.8; - -/// Filter low-quality and duplicate evidence. -/// -/// Steps: -/// 1. Drop evidence with no meaningful content (< MIN_EVIDENCE_CHARS) -/// 2. Deduplicate by source overlap (same path in same doc) -/// 3. 
-pub fn dedup(evidence: &[Evidence]) -> Vec<Evidence> {
-    // Step 1: Quality filter
-    let quality: Vec<&Evidence> = evidence
-        .iter()
-        .filter(|e| e.content.len() >= MIN_EVIDENCE_CHARS)
-        .collect();
-
-    // Step 2: Deduplicate by source overlap
-    let mut seen_sources: HashSet<String> = HashSet::new();
-    let source_deduped: Vec<&Evidence> = quality
-        .into_iter()
-        .filter(|e| {
-            let doc_key = e.doc_name.as_deref().unwrap_or("_unknown");
-            let key = format!("{}:{}", doc_key, e.source_path);
-            seen_sources.insert(key)
-        })
-        .collect();
-
-    // Step 3: Deduplicate by content similarity
-    let mut deduped: Vec<Evidence> = Vec::new();
-    for ev in source_deduped {
-        let tokens = tokenize(&ev.content);
-        let dominated = deduped
-            .iter()
-            .any(|existing| jaccard(&tokens, &tokenize(&existing.content)) >= SIMILARITY_THRESHOLD);
-        if !dominated {
-            deduped.push(ev.clone());
-        }
-    }
-
-    deduped
-}
-
-/// Tokenize text into a set of lowercase words.
-fn tokenize(text: &str) -> HashSet<String> {
-    text.to_lowercase()
-        .split_whitespace()
-        .map(|s| s.to_string())
-        .collect()
-}
-
-/// Compute Jaccard similarity between two sets.
-fn jaccard(a: &HashSet<String>, b: &HashSet<String>) -> f64 {
-    if a.is_empty() && b.is_empty() {
-        return 1.0;
-    }
-    let intersection = a.intersection(b).count() as f64;
-    let union = a.union(b).count() as f64;
-    intersection / union
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    fn make_evidence(title: &str, content: &str) -> Evidence {
-        Evidence {
-            source_path: format!("root/{}", title),
-            node_title: title.to_string(),
-            content: content.to_string(),
-            doc_name: Some("doc".to_string()),
-        }
-    }
-
-    #[test]
-    fn test_quality_filter() {
-        let evidence = vec![
-            make_evidence("A", "short"),         // < 50 chars, filtered
-            make_evidence("B", &"x".repeat(60)), // kept
-        ];
-        let result = dedup(&evidence);
-        assert_eq!(result.len(), 1);
-        assert_eq!(result[0].node_title, "B");
-    }
-
-    #[test]
-    fn test_source_dedup() {
-        let evidence = vec![
-            make_evidence(
-                "A",
-                "content A with enough text to pass the quality filter threshold",
-            ),
-            make_evidence(
-                "A",
-                "different content A but same source path that is long enough",
-            ),
-        ];
-        let result = dedup(&evidence);
-        assert_eq!(result.len(), 1);
-    }
-
-    #[test]
-    fn test_content_similarity_dedup() {
-        let base = "This is a piece of evidence about machine learning algorithms and their applications in real world scenarios";
-        let similar = "This is a piece of evidence about machine learning algorithms and their applications in real world";
-        let different =
-            "Completely unrelated content about quantum physics and particle accelerators at CERN";
-        let evidence = vec![
-            make_evidence("A", base),
-            make_evidence("B", similar),   // high similarity, should be deduped
-            make_evidence("C", different), // different, kept
-        ];
-        let result = dedup(&evidence);
-        assert!(result.len() >= 2); // at least A and C
-    }
-
-    #[test]
-    fn test_empty_input() {
-        let result = dedup(&[]);
-        assert!(result.is_empty());
-    }
-
-    #[test]
-    fn test_jaccard_identical() {
-        let a = tokenize("hello world foo");
-        let b = tokenize("hello world foo");
-        assert!((jaccard(&a, &b) - 1.0).abs() < 0.001);
-    }
-
-    #[test]
-    fn test_jaccard_disjoint() {
-        let a = tokenize("aaa bbb");
-        let b = tokenize("ccc ddd");
-        assert!((jaccard(&a, &b)).abs() < 0.001);
-    }
-
-    #[test]
-    fn test_source_dedup_none_doc_name() {
-        // Evidence with doc_name: None should use "_unknown" as doc key,
-        // so same source_path with None doc_name still deduplicates correctly.
-        let evidence = vec![
-            Evidence {
-                source_path: "root/section_a".to_string(),
-                node_title: "A".to_string(),
-                content: "content A with enough text to pass the quality filter threshold"
                    .to_string(),
-                doc_name: None,
-            },
-            Evidence {
-                source_path: "root/section_a".to_string(),
-                node_title: "A2".to_string(),
-                content: "different content but same source path that should be deduped"
                    .to_string(),
-                doc_name: None,
-            },
-        ];
-        let result = dedup(&evidence);
-        assert_eq!(result.len(), 1);
-    }
-
-    #[test]
-    fn test_source_dedup_mixed_doc_name() {
-        // Same source_path but different doc_name should produce different dedup keys,
-        // so both survive source dedup. Content must be sufficiently different too.
-        let evidence = vec![
-            Evidence {
-                source_path: "root/section".to_string(),
-                node_title: "A".to_string(),
-                content: "Revenue for Q4 was twelve million dollars driven by SaaS growth in the enterprise segment".to_string(),
-                doc_name: Some("doc_a".to_string()),
-            },
-            Evidence {
-                source_path: "root/section".to_string(),
-                node_title: "B".to_string(),
-                content: "The encryption module uses AES-256 for data at rest and TLS 1.3 for all network communication".to_string(),
-                doc_name: Some("doc_b".to_string()),
-            },
-        ];
-        let result = dedup(&evidence);
-        assert_eq!(result.len(), 2);
-    }
-
-    #[test]
-    fn test_source_dedup_none_vs_some_doc_name() {
-        // None doc_name ("_unknown") and Some doc_name produce different keys,
-        // so both survive source dedup. Content must be sufficiently different too.
-        let evidence = vec![
-            Evidence {
-                source_path: "root/section".to_string(),
-                node_title: "A".to_string(),
-                content: "The database uses a log-structured merge tree with write-ahead logging for durability".to_string(),
-                doc_name: None,
-            },
-            Evidence {
-                source_path: "root/section".to_string(),
-                node_title: "B".to_string(),
-                content: "Authentication requires Bearer tokens with automatic refresh after twenty-four hours".to_string(),
-                doc_name: Some("doc_x".to_string()),
-            },
-        ];
-        let result = dedup(&evidence);
-        assert_eq!(result.len(), 2);
-    }
-}
diff --git a/vectorless-core/vectorless/src/rerank/mod.rs b/vectorless-core/vectorless/src/rerank/mod.rs
deleted file mode 100644
index bc179ec3..00000000
--- a/vectorless-core/vectorless/src/rerank/mod.rs
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Result reranking — dedup + format.
-//!
-//! Post-processing pipeline that runs after the agent collects raw evidence:
-//!
-//! ```text
-//! agent (collect evidence)
-//!   → rerank::process()
-//!       → dedup (quality filter + dedup)
-//!       → format as answer (no LLM — return original text)
-//!   → Output with final answer
-//! ```
-//!
-//! This is a document retrieval engine. The answer IS the evidence.
-//! No LLM synthesis, no rewriting. Find what you find, return what you find.
-
-pub mod dedup;
-pub mod types;
-
-use tracing::info;
-
-use crate::agent::Evidence;
-use crate::query::QueryIntent;
-use types::RerankOutput;
-
-/// Process agent output through the rerank pipeline.
-///
-/// Deduplicates evidence, then returns the original text as the answer.
-/// No LLM calls — the Worker already retrieved the exact passages.
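-///
-/// Call sketch (illustrative; `evidence` is agent-collected [`Evidence`]):
-///
-/// ```rust,ignore
-/// let out = process("what is the revenue?", &evidence, false, QueryIntent::Factual, 0.8).await?;
-/// println!("{} (confidence {:.2})", out.answer, out.confidence);
-/// ```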
-pub async fn process(
-    _query: &str,
-    evidence: &[Evidence],
-    _multi_doc: bool,
-    intent: QueryIntent,
-    confidence: f32,
-) -> crate::error::Result<RerankOutput> {
-    let deduped = dedup::dedup(evidence);
-    if deduped.is_empty() {
-        info!("No evidence after dedup");
-        return Ok(RerankOutput {
-            answer: String::new(),
-            llm_calls: 0,
-            confidence: 0.0,
-        });
-    }
-
-    info!(
-        evidence = deduped.len(),
-        intent = %intent,
-        "Evidence after dedup"
-    );
-
-    let answer = match intent {
-        QueryIntent::Navigational => format_locations(&deduped),
-        _ => format_evidence_as_answer(&deduped),
-    };
-
-    info!(
-        evidence = deduped.len(),
-        answer_len = answer.len(),
-        confidence,
-        "Rerank complete"
-    );
-
-    Ok(RerankOutput {
-        answer,
-        llm_calls: 0,
-        confidence,
-    })
-}
-
-/// Format evidence as a location listing for navigational queries.
-fn format_locations(evidence: &[Evidence]) -> String {
-    if evidence.is_empty() {
-        return "No matching locations found.".to_string();
-    }
-    let mut result = "Found at:\n".to_string();
-    for e in evidence {
-        let doc = e.doc_name.as_deref().unwrap_or("unknown");
-        result.push_str(&format!(
-            "- **{}** in {} at {}\n",
-            e.node_title, doc, e.source_path
-        ));
-    }
-    result
-}
-
-/// Format collected evidence directly as the answer.
-fn format_evidence_as_answer(evidence: &[Evidence]) -> String {
-    evidence
-        .iter()
-        .map(|e| {
-            let doc = e.doc_name.as_deref().unwrap_or("");
-            if doc.is_empty() {
-                format!("[{}]\n{}", e.node_title, e.content)
-            } else {
-                format!("[{} — {}]\n{}", e.node_title, doc, e.content)
-            }
-        })
-        .collect::<Vec<_>>()
-        .join("\n\n")
-}
diff --git a/vectorless-core/vectorless/src/rerank/types.rs b/vectorless-core/vectorless/src/rerank/types.rs
deleted file mode 100644
index 4b42f351..00000000
--- a/vectorless-core/vectorless/src/rerank/types.rs
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Rerank result types.
-
-/// Output from the rerank pipeline.
-pub struct RerankOutput {
-    /// Final answer (formatted evidence — no LLM synthesis).
-    pub answer: String,
-    /// Number of LLM calls used during post-processing (always 0 — no synthesis).
-    pub llm_calls: u32,
-    /// Confidence score (0.0–1.0) — derived from LLM evaluate() result.
-    pub confidence: f32,
-}
diff --git a/vectorless-core/vectorless/src/retrieval/cache.rs b/vectorless-core/vectorless/src/retrieval/cache.rs
deleted file mode 100644
index ecfce79a..00000000
--- a/vectorless-core/vectorless/src/retrieval/cache.rs
+++ /dev/null
@@ -1,577 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Tiered reasoning cache for the retrieval pipeline.
-//!
-//! Provides three levels of caching to avoid redundant computation:
-//!
-//! - **L1 (Exact)**: Cache full retrieval results keyed by exact query fingerprint.
-//!   Identical queries return instantly.
-//!
-//! - **L2 (Path Pattern)**: Cache navigation decisions for tree paths. If a previous
-//!   query navigated through Section 3.2, a new query about the same section can
-//!   reuse those path cues even when the full query differs.
-//!
-//! - **L3 (Strategy Score)**: Cache node scores from keyword/BM25 strategies.
-//!   Node scores are independent of the query, so they can be shared across
-//!   different queries on the same document.
-
-use std::collections::{HashMap, VecDeque};
-use std::sync::RwLock;
-use std::time::Instant;
-
-use tracing::warn;
-
-use crate::document::NodeId;
-use crate::utils::fingerprint::Fingerprint;
-
-/// A tiered reasoning cache for the retrieval pipeline.
-///
-/// Thread-safe via `RwLock`. Each tier has an independent size limit
-/// with insertion-order eviction.
-pub struct ReasoningCache {
-    /// L1: Exact query → cached candidate list.
-    l1: RwLock<L1Store>,
-    /// L2: Node path pattern → cached navigation cue score.
-    l2: RwLock<L2Store>,
-    /// L3: Node content fingerprint → cached strategy score.
-    l3: RwLock<L3Store>,
-    /// Configuration.
-    config: ReasoningCacheConfig,
-}
-
-/// Configuration for the reasoning cache.
-#[derive(Debug, Clone)]
-pub struct ReasoningCacheConfig {
-    /// Maximum L1 entries (exact query results).
-    pub l1_max: usize,
-    /// Maximum L2 entries (path patterns).
-    pub l2_max: usize,
-    /// Maximum L3 entries (strategy scores).
-    pub l3_max: usize,
-}
-
-impl Default for ReasoningCacheConfig {
-    fn default() -> Self {
-        Self {
-            l1_max: 200,
-            l2_max: 1000,
-            l3_max: 5000,
-        }
-    }
-}
-
-// ---- L1: Exact Query Cache ----
-
-#[derive(Debug, Clone)]
-struct L1Entry {
-    /// Fingerprint of the workspace + document set used for this query.
-    scope_fp: Fingerprint,
-    /// Cached candidate nodes (pre-sorted by score).
-    candidates: Vec<CachedCandidate>,
-    /// Strategy used.
-    strategy: String,
-    /// When cached.
-    created_at: Instant,
-}
-
-/// A cached candidate from a previous retrieval.
-#[derive(Debug, Clone)]
-pub struct CachedCandidate {
-    /// Node ID.
-    pub node_id: NodeId,
-    /// Relevance score.
-    pub score: f32,
-    /// Depth in tree.
-    pub depth: usize,
-}
-
-struct L1Store {
-    entries: HashMap<Fingerprint, L1Entry>,
-    order: VecDeque<Fingerprint>, // Insertion-order eviction — O(1) pop_front
-}
-
-// ---- L2: Path Pattern Cache ----
-
-#[derive(Debug, Clone)]
-struct L2Entry {
-    /// Score for this navigation cue.
-    confidence: f32,
-    /// How many times this path was relevant.
-    hit_count: usize,
-    created_at: Instant,
-}
-
-struct L2Store {
-    entries: HashMap<String, L2Entry>, // Key: "doc_fp:node_path"
-    order: VecDeque<String>,
-}
-
-// ---- L3: Strategy Score Cache ----
-
-#[derive(Debug, Clone)]
-struct L3Entry {
-    /// BM25/Keyword score.
-    score: f32,
-    /// Which strategy produced this score.
-    strategy: String,
-    created_at: Instant,
-}
-
-struct L3Store {
-    entries: HashMap<Fingerprint, L3Entry>, // Key: node content fingerprint
-    order: VecDeque<Fingerprint>,
-}
-
-// ---- Public API ----
-
-impl ReasoningCache {
-    /// Create a new reasoning cache with default configuration.
-    pub fn new() -> Self {
-        Self::with_config(ReasoningCacheConfig::default())
-    }
-
-    /// Create with custom configuration.
-    pub fn with_config(config: ReasoningCacheConfig) -> Self {
-        Self {
-            l1: RwLock::new(L1Store {
-                entries: HashMap::new(),
-                order: VecDeque::new(),
-            }),
-            l2: RwLock::new(L2Store {
-                entries: HashMap::new(),
-                order: VecDeque::new(),
-            }),
-            l3: RwLock::new(L3Store {
-                entries: HashMap::new(),
-                order: VecDeque::new(),
-            }),
-            config,
-        }
-    }
-
-    // ============ L1: Exact Query ============
-
-    /// Look up an exact query result.
-    ///
-    /// Returns cached candidates if the same query was executed before
-    /// on the same document scope.
-    pub fn l1_get(&self, query: &str, scope_fp: &Fingerprint) -> Option<Vec<CachedCandidate>> {
-        let query_fp = Fingerprint::from_str(query);
-        let l1 = read_lock(&self.l1)?;
-        let entry = l1.entries.get(&query_fp)?;
-        // Scope must match (same document set)
-        if &entry.scope_fp != scope_fp {
-            return None;
-        }
-        Some(entry.candidates.clone())
-    }
-
-    /// Store an L1 result.
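-    ///
-    /// Round-trip sketch (illustrative; mirrors the tests below):
-    ///
-    /// ```rust,ignore
-    /// let cache = ReasoningCache::new();
-    /// let scope = Fingerprint::from_str("doc-set");
-    /// cache.l1_store("q", scope, vec![], "keyword".into());
-    /// assert!(cache.l1_get("q", &scope).is_some());
-    /// ```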
-    pub fn l1_store(
-        &self,
-        query: &str,
-        scope_fp: Fingerprint,
-        candidates: Vec<CachedCandidate>,
-        strategy: String,
-    ) {
-        let query_fp = Fingerprint::from_str(query);
-        if let Ok(mut l1) = self.l1.write() {
-            if l1.entries.len() >= self.config.l1_max {
-                Self::evict_lru_fingerprint(&mut l1);
-            }
-            l1.entries.insert(
-                query_fp,
-                L1Entry {
-                    scope_fp,
-                    candidates,
-                    strategy,
-                    created_at: Instant::now(),
-                },
-            );
-            l1.order.push_back(query_fp);
-        }
-    }
-
-    // ============ L2: Path Pattern ============
-
-    /// Look up a cached navigation confidence for a document + node path.
-    ///
-    /// If a previous query successfully navigated through this path,
-    /// return the confidence score.
-    pub fn l2_get(&self, doc_key: &str, node_path: &str) -> Option<f32> {
-        let key = format!("{}:{}", doc_key, node_path);
-        let l2 = read_lock(&self.l2)?;
-        let entry = l2.entries.get(&key)?;
-        Some(entry.confidence)
-    }
-
-    /// Record a successful navigation through a path.
-    ///
-    /// Call this after retrieval confirms a path was relevant.
-    pub fn l2_record(&self, doc_key: &str, node_path: &str, confidence: f32) {
-        let key = format!("{}:{}", doc_key, node_path);
-        if let Ok(mut l2) = self.l2.write() {
-            if let Some(entry) = l2.entries.get_mut(&key) {
-                // Update running average
-                entry.hit_count += 1;
-                entry.confidence =
-                    entry.confidence + (confidence - entry.confidence) / entry.hit_count as f32;
-            } else {
-                if l2.entries.len() >= self.config.l2_max {
-                    Self::evict_lru_string(&mut l2);
-                }
-                l2.entries.insert(
-                    key.clone(),
-                    L2Entry {
-                        confidence,
-                        hit_count: 1,
-                        created_at: Instant::now(),
-                    },
-                );
-                l2.order.push_back(key);
-            }
-        }
-    }
-
-    /// Get top-N path hints for a document, sorted by confidence.
-    ///
-    /// Useful for bootstrapping new queries on a known document.
-    pub fn l2_top_paths(&self, doc_key: &str, n: usize) -> Vec<(String, f32)> {
-        let prefix = format!("{}:", doc_key);
-        let l2 = match read_lock(&self.l2) {
-            Some(guard) => guard,
-            None => return Vec::new(),
-        };
-
-        let mut paths: Vec<(String, f32)> = l2
-            .entries
-            .iter()
-            .filter(|(k, _)| k.starts_with(&prefix))
-            .map(|(k, v)| (k[prefix.len()..].to_string(), v.confidence))
-            .collect();
-        paths.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
-        paths.truncate(n);
-        paths
-    }
-
-    // ============ L3: Strategy Score ============
-
-    /// Look up a cached strategy score for a node.
-    ///
-    /// Node scores from keyword/BM25 are content-dependent but
-    /// query-independent, so they can be shared across queries.
-    pub fn l3_get(&self, node_content_fp: &Fingerprint) -> Option<(f32, String)> {
-        let l3 = read_lock(&self.l3)?;
-        let entry = l3.entries.get(node_content_fp)?;
-        Some((entry.score, entry.strategy.clone()))
-    }
-
-    /// Store a strategy score for a node.
-    pub fn l3_store(&self, node_content_fp: Fingerprint, score: f32, strategy: String) {
-        if let Ok(mut l3) = self.l3.write() {
-            if l3.entries.len() >= self.config.l3_max {
-                Self::evict_lru_fingerprint_l3(&mut l3);
-            }
-            l3.entries.insert(
-                node_content_fp,
-                L3Entry {
-                    score,
-                    strategy,
-                    created_at: Instant::now(),
-                },
-            );
-            l3.order.push_back(node_content_fp);
-        }
-    }
-
-    // ============ Stats ============
-
-    /// Get cache statistics.
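-    ///
-    /// Illustrative:
-    ///
-    /// ```rust,ignore
-    /// let stats = cache.stats();
-    /// println!("L1={} L2={} L3={}", stats.l1_entries, stats.l2_entries, stats.l3_entries);
-    /// ```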
-    pub fn stats(&self) -> ReasoningCacheStats {
-        let (l1_count, l2_count, l3_count) = (
-            read_lock(&self.l1).map(|g| g.entries.len()).unwrap_or(0),
-            read_lock(&self.l2).map(|g| g.entries.len()).unwrap_or(0),
-            read_lock(&self.l3).map(|g| g.entries.len()).unwrap_or(0),
-        );
-        ReasoningCacheStats {
-            l1_entries: l1_count,
-            l2_entries: l2_count,
-            l3_entries: l3_count,
-        }
-    }
-
-    /// Clear all cache tiers.
-    pub fn clear(&self) {
-        if let Ok(mut l1) = self.l1.write() {
-            l1.entries.clear();
-            l1.order.clear();
-        }
-        if let Ok(mut l2) = self.l2.write() {
-            l2.entries.clear();
-            l2.order.clear();
-        }
-        if let Ok(mut l3) = self.l3.write() {
-            l3.entries.clear();
-            l3.order.clear();
-        }
-    }
-
-    // ============ Eviction helpers ============
-
-    fn evict_lru_fingerprint(l1: &mut L1Store) {
-        if let Some(old) = l1.order.pop_front() {
-            l1.entries.remove(&old);
-        }
-    }
-
-    fn evict_lru_string(l2: &mut L2Store) {
-        if let Some(old) = l2.order.pop_front() {
-            l2.entries.remove(&old);
-        }
-    }
-
-    fn evict_lru_fingerprint_l3(l3: &mut L3Store) {
-        if let Some(old) = l3.order.pop_front() {
-            l3.entries.remove(&old);
-        }
-    }
-}
-
-impl Default for ReasoningCache {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-/// Read from a RwLock, recovering from poison by taking the guard anyway.
-///
-/// A poisoned lock means another thread panicked while holding it — the data
-/// is still valid, just potentially in an inconsistent state. For a cache,
-/// returning possibly stale data is preferable to failing outright.
-fn read_lock<T>(lock: &RwLock<T>) -> Option<std::sync::RwLockReadGuard<'_, T>> {
-    match lock.read() {
-        Ok(guard) => Some(guard),
-        Err(poisoned) => {
-            warn!("ReasoningCache: recovering from poisoned lock");
-            Some(poisoned.into_inner())
-        }
-    }
-}
-
-/// Cache statistics.
-#[derive(Debug, Clone)]
-pub struct ReasoningCacheStats {
-    /// L1 entries (exact query results).
-    pub l1_entries: usize,
-    /// L2 entries (path patterns).
-    pub l2_entries: usize,
-    /// L3 entries (strategy scores).
- pub l3_entries: usize, -} - -#[cfg(test)] -mod tests { - use super::*; - - fn make_node_id(n: usize) -> NodeId { - let mut arena = indextree::Arena::new(); - NodeId(arena.new_node(n)) - } - - #[test] - fn test_l1_store_and_retrieve() { - let cache = ReasoningCache::new(); - let scope = Fingerprint::from_str("doc1"); - - let candidates = vec![CachedCandidate { - node_id: make_node_id(1), - score: 0.9, - depth: 2, - }]; - - cache.l1_store("what is rust?", scope, candidates.clone(), "keyword".into()); - let result = cache.l1_get("what is rust?", &scope); - assert!(result.is_some()); - assert_eq!(result.unwrap().len(), 1); - } - - #[test] - fn test_l1_miss_different_scope() { - let cache = ReasoningCache::new(); - let scope1 = Fingerprint::from_str("doc1"); - let scope2 = Fingerprint::from_str("doc2"); - - let candidates = vec![CachedCandidate { - node_id: make_node_id(1), - score: 0.9, - depth: 2, - }]; - - cache.l1_store("query", scope1, candidates, "keyword".into()); - assert!(cache.l1_get("query", &scope2).is_none()); - } - - #[test] - fn test_l2_record_and_get() { - let cache = ReasoningCache::new(); - - cache.l2_record("doc1", "3.2", 0.8); - let score = cache.l2_get("doc1", "3.2"); - assert!(score.is_some()); - assert!((score.unwrap() - 0.8).abs() < 0.01); - } - - #[test] - fn test_l2_running_average() { - let cache = ReasoningCache::new(); - - cache.l2_record("doc1", "3.2", 0.8); - cache.l2_record("doc1", "3.2", 0.6); - let score = cache.l2_get("doc1", "3.2").unwrap(); - // Running average: 0.8 + (0.6 - 0.8) / 2 = 0.7 - assert!((score - 0.7).abs() < 0.01); - } - - #[test] - fn test_l2_top_paths() { - let cache = ReasoningCache::new(); - - cache.l2_record("doc1", "3.1", 0.5); - cache.l2_record("doc1", "3.2", 0.9); - cache.l2_record("doc1", "2.1", 0.7); - - let top = cache.l2_top_paths("doc1", 2); - assert_eq!(top.len(), 2); - assert!((top[0].1 - 0.9).abs() < 0.01); // 3.2 is highest - } - - #[test] - fn test_l3_store_and_retrieve() { - let cache = ReasoningCache::new(); - let fp = Fingerprint::from_str("some node content"); - - cache.l3_store(fp, 0.85, "bm25".into()); - let (score, strategy) = cache.l3_get(&fp).unwrap(); - assert!((score - 0.85).abs() < 0.01); - assert_eq!(strategy, "bm25"); - } - - #[test] - fn test_clear() { - let cache = ReasoningCache::new(); - let scope = Fingerprint::from_str("doc1"); - - cache.l1_store("q", scope, vec![], "kw".into()); - cache.l2_record("doc1", "1", 0.5); - cache.l3_store(Fingerprint::from_str("c"), 0.5, "kw".into()); - - cache.clear(); - - let stats = cache.stats(); - assert_eq!(stats.l1_entries, 0); - assert_eq!(stats.l2_entries, 0); - assert_eq!(stats.l3_entries, 0); - } - - #[test] - fn test_l1_lru_eviction() { - let config = ReasoningCacheConfig { - l1_max: 2, - ..Default::default() - }; - let cache = ReasoningCache::with_config(config); - let scope = Fingerprint::from_str("doc"); - - cache.l1_store("q1", scope, vec![], "kw".into()); - cache.l1_store("q2", scope, vec![], "kw".into()); - cache.l1_store("q3", scope, vec![], "kw".into()); // evicts q1 - - assert!(cache.l1_get("q1", &scope).is_none()); - assert!(cache.l1_get("q2", &scope).is_some()); - assert!(cache.l1_get("q3", &scope).is_some()); - } - - #[test] - fn test_l2_lru_eviction() { - let config = ReasoningCacheConfig { - l2_max: 2, - ..Default::default() - }; - let cache = ReasoningCache::with_config(config); - - cache.l2_record("doc", "1", 0.5); - cache.l2_record("doc", "2", 0.6); - cache.l2_record("doc", "3", 0.7); // evicts "doc:1" - - assert!(cache.l2_get("doc", "1").is_none()); 
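-        // Eviction is insertion-ordered: "doc:1" was the oldest entry, so it was dropped.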
- assert!(cache.l2_get("doc", "2").is_some()); - assert!(cache.l2_get("doc", "3").is_some()); - } - - #[test] - fn test_l3_lru_eviction() { - let config = ReasoningCacheConfig { - l3_max: 2, - ..Default::default() - }; - let cache = ReasoningCache::with_config(config); - - let fp1 = Fingerprint::from_str("content_a"); - let fp2 = Fingerprint::from_str("content_b"); - let fp3 = Fingerprint::from_str("content_c"); - - cache.l3_store(fp1, 0.5, "kw".into()); - cache.l3_store(fp2, 0.6, "kw".into()); - cache.l3_store(fp3, 0.7, "kw".into()); // evicts fp1 - - assert!(cache.l3_get(&fp1).is_none()); - assert!(cache.l3_get(&fp2).is_some()); - assert!(cache.l3_get(&fp3).is_some()); - } - - #[test] - fn test_poisoned_lock_recovery() { - let cache = ReasoningCache::new(); - - // Verify normal operation: store and retrieve still works - let scope = Fingerprint::from_str("doc"); - cache.l1_store("query", scope, vec![], "kw".into()); - - let scope2 = Fingerprint::from_str("doc2"); - cache.l1_store("q2", scope2, vec![], "kw".into()); - assert!(cache.l1_get("q2", &scope2).is_some()); - - // Verify stats still works (internally uses read_lock) - let stats = cache.stats(); - assert!(stats.l1_entries >= 1); - } - - #[test] - fn test_poisoned_lock_read_recovery() { - use std::sync::Arc; - use std::thread; - - // Create a cache and populate it - let cache = Arc::new(ReasoningCache::new()); - let scope = Fingerprint::from_str("doc"); - cache.l1_store("query", scope, vec![], "kw".into()); - - // Poison the lock from another thread - let cache_clone = Arc::clone(&cache); - let handle = thread::spawn(move || { - // This will poison the L1 lock - let _guard = cache_clone.l1.write().unwrap(); - panic!("intentional panic to poison lock"); - }); - - // Wait for the panicking thread to finish - let _ = handle.join(); - - // The lock is now poisoned. Our read_lock() should recover from it. - // l1_get uses read_lock internally - let result = cache.l1_get("query", &scope); - // Should still return data (recovered from poison) - assert!(result.is_some()); - } -} diff --git a/vectorless-core/vectorless/src/retrieval/dispatcher.rs b/vectorless-core/vectorless/src/retrieval/dispatcher.rs deleted file mode 100644 index e92766fb..00000000 --- a/vectorless-core/vectorless/src/retrieval/dispatcher.rs +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Retrieval dispatcher — the single entry point for all query operations. -//! -//! All queries go through the Orchestrator. There is no separate Worker path. -//! The Orchestrator internally decides whether to run the full analysis phase -//! based on user intent: -//! -//! - **User specified doc_ids** → Orchestrator skips analysis, spawns N Workers -//! directly (N=1 is a normal case, not special). -//! - **User unspecified (workspace)** → Orchestrator analyzes DocCards, selects -//! relevant docs, then spawns Workers. -//! -//! Post-processing (synthesis, dedup, rerank) is always unified through the -//! Orchestrator's output — never duplicated in Worker. - -use tracing::info; - -use crate::agent::config::{AgentConfig, Scope, WorkspaceContext}; -use crate::agent::orchestrator::Orchestrator; -use crate::agent::{Agent, EventEmitter, Output}; -use crate::error::{Error, Result}; -use crate::llm::LlmClient; -use crate::query::QueryPipeline; - -/// Dispatch a query to the Orchestrator. -/// -/// This is the single entry point from the client layer into the retrieval system. 
-/// It always goes through the Orchestrator — never directly to Worker. -/// -/// Flow: -/// 1. Query understanding via LLM (produces [`QueryPlan`]) -/// 2. Orchestrator dispatch (uses QueryPlan for strategy) -/// -/// - `Scope::Specified(docs)` → Orchestrator skips analysis, dispatches all docs directly. -/// - `Scope::Workspace(ws)` → Orchestrator runs full flow (analyze → dispatch → fuse → synthesize). -pub async fn dispatch( - query: &str, - scope: Scope<'_>, - config: &AgentConfig, - llm: &LlmClient, - emitter: &EventEmitter, -) -> Result { - let (ws, skip_analysis) = match scope { - Scope::Specified(docs) => { - info!( - docs = docs.len(), - "Dispatch (user-specified, skip analysis)" - ); - (WorkspaceContext::new(docs), true) - } - Scope::Workspace(ws) => { - info!(docs = ws.doc_count(), "Dispatch (workspace, full flow)"); - (ws, false) - } - }; - - // Step 1: Query understanding — LLM analyzes intent, concepts, complexity. - // This is required. "Model fails, we fail." — errors propagate. - info!("Starting query understanding..."); - let query_plan = QueryPipeline::understand(query, llm).await?; - - // Step 2: Dispatch to Orchestrator with the query plan. - let orchestrator = Orchestrator::new( - query, - &ws, - config.clone(), - llm.clone(), - emitter.clone(), - skip_analysis, - query_plan, - ); - orchestrator - .run() - .await - .map_err(|e| Error::Retrieval(e.to_string())) -} diff --git a/vectorless-core/vectorless/src/retrieval/mod.rs b/vectorless-core/vectorless/src/retrieval/mod.rs deleted file mode 100644 index bab04971..00000000 --- a/vectorless-core/vectorless/src/retrieval/mod.rs +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Retrieval dispatch layer — the entry point for all query operations. -//! -//! This module sits between the client API and the agent execution layer. -//! It is responsible for: -//! -//! - **Dispatching** queries to the appropriate agent path (Worker vs Orchestrator) -//! - **Post-processing** agent output into client-facing results -//! - **Caching** query results (L1 exact, L2 path patterns, L3 strategy scores) -//! - **Streaming** retrieval events for async progress monitoring -//! -//! Call flow: -//! ```text -//! client → retrieval::dispatch() -//! ├── User specified doc_ids → parallel N × Worker -//! └── Workspace scope → Orchestrator (analyze → spawn → fusion) -//! ``` - -mod cache; -pub mod dispatcher; -pub mod postprocessor; -pub mod stream; -mod types; - -pub use stream::{RetrieveEvent, RetrieveEventReceiver}; -pub use types::{ReasoningChain, RetrieveResponse, SufficiencyLevel}; diff --git a/vectorless-core/vectorless/src/retrieval/postprocessor.rs b/vectorless-core/vectorless/src/retrieval/postprocessor.rs deleted file mode 100644 index fddc8c5e..00000000 --- a/vectorless-core/vectorless/src/retrieval/postprocessor.rs +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Post-processing of agent output into client-facing results. -//! -//! Converts raw agent [`Output`] into one or more [`QueryResultItem`]s. -//! When evidence comes from multiple documents (distinct `doc_name` values), -//! results are split by document so the caller can see per-doc attribution. 
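-//!
-//! Shape of the transformation (illustrative; `output` is an agent [`Output`]):
-//!
-//! ```rust,ignore
-//! let items = to_results(&output, "doc-1");
-//! for item in &items {
-//!     println!("{}: {} evidence item(s)", item.doc_id, item.evidence.len());
-//! }
-//! ```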
-
-use std::collections::BTreeMap;
-
-use crate::agent::config::{Evidence, Metrics, Output};
-use crate::client::{Confidence, EvidenceItem, QueryMetrics, QueryResultItem};
-
-/// Convert agent output to query result items, split by document.
-///
-/// Groups evidence by `doc_name` and creates one `QueryResultItem` per document.
-/// For single-document queries (all evidence has the same or no `doc_name`),
-/// returns a single item with the given `doc_id`.
-///
-/// The synthesized answer is shared across all items (it was produced from
-/// cross-document evidence). Each item gets its own subset of evidence.
-pub fn to_results(output: &Output, doc_id: &str) -> Vec<QueryResultItem> {
-    if output.evidence.is_empty() {
-        return vec![empty_item(doc_id, &output.answer, output.confidence)];
-    }
-
-    // Group evidence by doc_name
-    let groups = group_by_doc(&output.evidence);
-
-    if groups.len() <= 1 {
-        // Single doc — return one item
-        return vec![build_item(
-            doc_id,
-            &output.answer,
-            output.confidence,
-            &output.evidence,
-            &output.metrics,
-        )];
-    }
-
-    // Multi-doc — one item per document
-    groups
-        .into_iter()
-        .map(|(name, refs)| {
-            let did = name.as_deref().unwrap_or(doc_id);
-            let evidence: Vec<Evidence> = refs.iter().map(|e| (*e).clone()).collect();
-            build_item(
-                did,
-                &output.answer,
-                output.confidence,
-                &evidence,
-                &output.metrics,
-            )
-        })
-        .collect()
-}
-
-/// Group evidence by `doc_name`, preserving order.
-fn group_by_doc(evidence: &[Evidence]) -> BTreeMap<Option<String>, Vec<&Evidence>> {
-    let mut groups: BTreeMap<Option<String>, Vec<&Evidence>> = BTreeMap::new();
-    for ev in evidence {
-        groups.entry(ev.doc_name.clone()).or_default().push(ev);
-    }
-    groups
-}
-
-/// Build a single enriched result item.
-fn build_item(
-    doc_id: &str,
-    answer: &str,
-    confidence: Confidence,
-    evidence: &[Evidence],
-    metrics: &Metrics,
-) -> QueryResultItem {
-    let node_ids: Vec<String> = evidence.iter().map(|e| e.source_path.clone()).collect();
-    let evidence_items: Vec<EvidenceItem> = evidence
-        .iter()
-        .map(|e| EvidenceItem {
-            title: e.node_title.clone(),
-            path: e.source_path.clone(),
-            content: e.content.clone(),
-            doc_name: e.doc_name.clone(),
-        })
-        .collect();
-
-    let content = if answer.is_empty() {
-        evidence
-            .iter()
-            .map(|e| format!("## {}\n{}", e.node_title, e.content))
-            .collect::<Vec<_>>()
-            .join("\n\n---\n\n")
-    } else {
-        answer.to_string()
-    };
-
-    let evidence_count = evidence.len();
-
-    QueryResultItem {
-        doc_id: doc_id.to_string(),
-        node_ids,
-        content,
-        evidence: evidence_items,
-        metrics: Some(QueryMetrics {
-            llm_calls: metrics.llm_calls,
-            rounds_used: metrics.rounds_used,
-            nodes_visited: metrics.nodes_visited,
-            evidence_count,
-            evidence_chars: metrics.evidence_chars,
-        }),
-        confidence,
-    }
-}
-
-/// Build an empty result item (no evidence).
-fn empty_item(doc_id: &str, answer: &str, confidence: Confidence) -> QueryResultItem {
-    let content = if answer.is_empty() {
-        String::new()
-    } else {
-        answer.to_string()
-    };
-    QueryResultItem {
-        doc_id: doc_id.to_string(),
-        node_ids: Vec::new(),
-        content,
-        evidence: Vec::new(),
-        metrics: None,
-        confidence,
-    }
-}
diff --git a/vectorless-core/vectorless/src/retrieval/stream.rs b/vectorless-core/vectorless/src/retrieval/stream.rs
deleted file mode 100644
index 33aa75b7..00000000
--- a/vectorless-core/vectorless/src/retrieval/stream.rs
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Streaming retrieval events.
-//!
-//! When `RetrieveOptions::streaming` is enabled, retrieval emits
-//! 
[`RetrieveEvent`]s incrementally as the pipeline progresses through -//! its stages (Analyze → Plan → Search → Evaluate). -//! -//! # Example -//! -//! ```rust,ignore -//! let options = RetrieveOptions::new().with_streaming(true); -//! let rx = client.query_stream(&tree, "query", &options).await?; -//! -//! while let Some(event) = rx.recv().await { -//! match event { -//! RetrieveEvent::Started { query, .. } => println!("Started: {query}"), -//! RetrieveEvent::StageCompleted { stage, .. } => println!("Done: {stage}"), -//! RetrieveEvent::Completed { response } => { -//! println!("Confidence: {}", response.confidence); -//! break; -//! } -//! RetrieveEvent::Error { message } => { -//! eprintln!("Error: {message}"); -//! break; -//! } -//! _ => {} -//! } -//! } -//! ``` - -use tokio::sync::mpsc; - -use super::types::{RetrieveResponse, SufficiencyLevel}; - -/// Events emitted during streaming retrieval. -/// -/// Each event represents a meaningful milestone in the retrieval pipeline. -/// The stream always terminates with either [`Completed`](RetrieveEvent::Completed) -/// or [`Error`](RetrieveEvent::Error). -#[derive(Debug, Clone)] -pub enum RetrieveEvent { - /// Retrieval pipeline started. - Started { - /// The query string. - query: String, - /// Planned retrieval strategy name. - strategy: String, - }, - - /// A pipeline stage completed. - StageCompleted { - /// Stage name (analyze, plan, search, evaluate). - stage: String, - /// Time spent in this stage (ms). - elapsed_ms: u64, - }, - - /// A node was visited during tree traversal. - NodeVisited { - /// Node ID. - node_id: String, - /// Node title. - title: String, - /// Relevance score (0.0 - 1.0). - score: f32, - }, - - /// Relevant content was found. - ContentFound { - /// Node ID. - node_id: String, - /// Node title. - title: String, - /// Short preview of the content. - preview: String, - /// Relevance score. - score: f32, - }, - - /// Pipeline is backtracking to an earlier stage. - Backtracking { - /// Stage backtracking from. - from: String, - /// Stage backtracking to. - to: String, - /// Reason for backtracking. - reason: String, - }, - - /// Sufficiency check result. - SufficiencyCheck { - /// Sufficiency level. - level: SufficiencyLevel, - /// Total tokens collected so far. - tokens: usize, - }, - - /// Retrieval completed successfully with final results. - Completed { - /// The full retrieval response. - response: RetrieveResponse, - }, - - /// An error occurred during retrieval. - Error { - /// Error message. - message: String, - }, -} - -/// Sender half for streaming retrieval events. -pub(crate) type RetrieveEventSender = mpsc::Sender; - -/// Receiver half for streaming retrieval events. -pub type RetrieveEventReceiver = mpsc::Receiver; - -/// Create a bounded channel for streaming retrieval events. -/// -/// The bound defaults to 64 events. The sender will apply backpressure -/// when the receiver cannot keep up, preventing unbounded memory growth. -pub(crate) fn channel(bound: usize) -> (RetrieveEventSender, RetrieveEventReceiver) { - mpsc::channel(bound) -} - -/// Default channel bound for streaming events. -pub const DEFAULT_STREAM_BOUND: usize = 64; diff --git a/vectorless-core/vectorless/src/retrieval/types.rs b/vectorless-core/vectorless/src/retrieval/types.rs deleted file mode 100644 index f654f319..00000000 --- a/vectorless-core/vectorless/src/retrieval/types.rs +++ /dev/null @@ -1,193 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! 
-//! Core types for the retrieval system.
-
-use serde::{Deserialize, Serialize};
-
-/// Re-export [`SufficiencyLevel`] from the document module.
-pub use crate::document::SufficiencyLevel;
-
-/// Complete retrieval response.
-#[derive(Debug, Clone)]
-pub struct RetrieveResponse {
-    /// Retrieved results.
-    pub results: Vec<RetrievalResult>,
-
-    /// Aggregated content.
-    pub content: String,
-
-    /// Overall confidence score.
-    pub confidence: f32,
-
-    /// Whether information is sufficient.
-    pub is_sufficient: bool,
-
-    /// Strategy that was used.
-    pub strategy_used: String,
-
-    /// Reasoning chain explaining how results were found.
-    pub reasoning_chain: ReasoningChain,
-
-    /// Total tokens used.
-    pub tokens_used: usize,
-}
-
-impl Default for RetrieveResponse {
-    fn default() -> Self {
-        Self {
-            results: Vec::new(),
-            content: String::new(),
-            confidence: 0.0,
-            is_sufficient: false,
-            strategy_used: String::new(),
-            reasoning_chain: ReasoningChain::default(),
-            tokens_used: 0,
-        }
-    }
-}
-
-impl RetrieveResponse {
-    /// Create a new empty response.
-    #[must_use]
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Check if there are any results.
-    #[must_use]
-    pub fn is_empty(&self) -> bool {
-        self.results.is_empty()
-    }
-
-    /// Get the number of results.
-    #[must_use]
-    pub fn len(&self) -> usize {
-        self.results.len()
-    }
-}
-
-/// A single retrieval result.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct RetrievalResult {
-    /// Node ID in the tree.
-    pub node_id: Option<String>,
-
-    /// Node title.
-    pub title: String,
-
-    /// Node content (if included).
-    pub content: Option<String>,
-
-    /// Node summary (if included).
-    pub summary: Option<String>,
-
-    /// Relevance score (0.0 - 1.0).
-    pub score: f32,
-
-    /// Depth in the tree.
-    pub depth: usize,
-
-    /// Page range (for PDFs).
-    pub page_range: Option<(usize, usize)>,
-}
-
-impl RetrievalResult {
-    /// Create a new retrieval result.
-    #[must_use]
-    pub fn new(title: impl Into<String>) -> Self {
-        Self {
-            node_id: None,
-            title: title.into(),
-            content: None,
-            summary: None,
-            score: 1.0,
-            depth: 0,
-            page_range: None,
-        }
-    }
-
-    /// Set the node ID.
-    #[must_use]
-    pub fn with_node_id(mut self, id: impl Into<String>) -> Self {
-        self.node_id = Some(id.into());
-        self
-    }
-
-    /// Set the content.
-    #[must_use]
-    pub fn with_content(mut self, content: impl Into<String>) -> Self {
-        self.content = Some(content.into());
-        self
-    }
-
-    /// Set the summary.
-    #[must_use]
-    pub fn with_summary(mut self, summary: impl Into<String>) -> Self {
-        self.summary = Some(summary.into());
-        self
-    }
-
-    /// Set the score.
-    #[must_use]
-    pub fn with_score(mut self, score: f32) -> Self {
-        self.score = score;
-        self
-    }
-
-    /// Set the depth.
-    #[must_use]
-    pub fn with_depth(mut self, depth: usize) -> Self {
-        self.depth = depth;
-        self
-    }
-
-    /// Set the page range.
-    #[must_use]
-    pub fn with_page_range(mut self, start: usize, end: usize) -> Self {
-        self.page_range = Some((start, end));
-        self
-    }
-}
-
-/// Complete reasoning chain for a retrieval operation.
-///
-/// Provides an ordered, auditable trace of every decision the engine made
-/// from query analysis through final evaluation.
-#[derive(Debug, Clone, Default, Serialize, Deserialize)]
-pub struct ReasoningChain {
-    /// Ordered reasoning steps.
-    pub steps: Vec<ReasoningStep>,
-}
-
-impl ReasoningChain {
-    /// Create an empty reasoning chain.
-    #[must_use]
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Append a reasoning step.
-    pub fn push(&mut self, step: ReasoningStep) {
-        self.steps.push(step);
-    }
-
-    /// Number of reasoning steps.
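-    ///
-    /// Illustrative sketch:
-    ///
-    /// ```rust,ignore
-    /// let mut chain = ReasoningChain::new();
-    /// chain.push(ReasoningStep { reasoning: "matched keyword 'revenue'".into() });
-    /// assert_eq!(chain.len(), 1);
-    /// ```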
-    /// Number of reasoning steps.
-    #[must_use]
-    pub fn len(&self) -> usize {
-        self.steps.len()
-    }
-
-    /// Whether the chain is empty.
-    #[must_use]
-    pub fn is_empty(&self) -> bool {
-        self.steps.is_empty()
-    }
-}
-
-/// A single step in the reasoning chain.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct ReasoningStep {
-    /// Human-readable explanation of the decision.
-    pub reasoning: String,
-}
diff --git a/vectorless-core/vectorless/src/scoring/bm25.rs b/vectorless-core/vectorless/src/scoring/bm25.rs
deleted file mode 100644
index 8bc20085..00000000
--- a/vectorless-core/vectorless/src/scoring/bm25.rs
+++ /dev/null
@@ -1,690 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! BM25 scoring module using the `bm25` crate.
-//!
-//! This module provides:
-//! - Per-field weighting for document scoring
-//! - Configurable length normalization
-//! - IDF caching for efficient scoring
-//! - Query expansion support
-
-use bm25::{
-    DefaultTokenizer, Embedder, EmbedderBuilder, Language, ScoredDocument, Scorer, Tokenizer,
-};
-
-/// Field weights for BM25 scoring.
-///
-/// Different document fields can have different importance.
-/// For example, title matches are typically more important than content matches.
-#[derive(Debug, Clone, Copy)]
-pub struct FieldWeights {
-    /// Weight for title field matches.
-    pub title: f32,
-    /// Weight for summary field matches.
-    pub summary: f32,
-    /// Weight for content field matches.
-    pub content: f32,
-}
-
-impl Default for FieldWeights {
-    fn default() -> Self {
-        Self {
-            title: 2.0,
-            summary: 1.5,
-            content: 1.0,
-        }
-    }
-}
-
-/// BM25 parameters for fine-tuning.
-#[derive(Debug, Clone, Copy)]
-pub struct Bm25Params {
-    /// Term frequency saturation parameter (k1).
-    /// Controls how quickly term frequency saturates.
-    /// Typical value: 1.2
-    pub k1: f32,
-    /// Length normalization parameter (b).
-    /// Controls how much document length affects scoring.
-    /// - 0.0: No length normalization
-    /// - 1.0: Full length normalization
-    /// Typical value: 0.75
-    pub b: f32,
-    /// Average document length.
-    /// If not known, can be estimated or set to 1.0 with b=0.
-    pub avgdl: f32,
-}
-
-impl Default for Bm25Params {
-    fn default() -> Self {
-        Self {
-            k1: 1.2,
-            b: 0.75,
-            avgdl: 100.0,
-        }
-    }
-}
-
-/// A document with multiple fields for scoring.
-#[derive(Debug, Clone)]
-pub struct FieldDocument<K> {
-    /// Document identifier.
-    pub id: K,
-    /// Title field.
-    pub title: String,
-    /// Summary field.
-    pub summary: String,
-    /// Content field.
-    pub content: String,
-}
-
-impl<K> FieldDocument<K> {
-    /// Create a new field document.
-    pub fn new(id: K, title: String, summary: String, content: String) -> Self {
-        Self {
-            id,
-            title,
-            summary,
-            content,
-        }
-    }
-
-    /// Get combined text for embedding.
-    fn combined_text(&self) -> String {
-        format!("{} {} {}", self.title, self.summary, self.content)
-    }
-}
-
-/// Key for field-specific document storage.
-#[derive(Debug, Clone, Hash, Eq, PartialEq)]
-struct FieldKey<K> {
-    doc_id: K,
-    field: Field,
-}
-
-#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)]
-enum Field {
-    Title,
-    Summary,
-    Content,
-}
-
-/// BM25 engine with per-field weighting support.
-///
-/// This wraps the `bm25` crate's Embedder and Scorer to provide:
-/// - Per-field weighting
-/// - Configurable parameters
-/// - IDF caching (handled internally by Scorer)
-pub struct Bm25Engine<K> {
-    /// The embedder for creating sparse vectors.
-    embedder: Embedder,
-    /// The scorer for scoring documents (combined text).
-    scorer: Scorer<K>,
-    /// Field-specific scorers for weighted scoring.
-    title_scorer: Scorer<K>,
-    summary_scorer: Scorer<K>,
-    content_scorer: Scorer<K>,
-    /// Field weights.
-    weights: FieldWeights,
-    /// Document count.
-    doc_count: usize,
-    /// Whether the engine has been fitted to a corpus.
-    fitted: bool,
-}
-
-impl<K> Bm25Engine<K> {
-    /// Create a new BM25 engine with default parameters.
-    pub fn new() -> Self {
-        Self::with_params(Bm25Params::default())
-    }
-
-    /// Create a BM25 engine with custom parameters.
-    pub fn with_params(params: Bm25Params) -> Self {
-        let embedder = EmbedderBuilder::with_avgdl(params.avgdl)
-            .k1(params.k1)
-            .b(params.b)
-            .language_mode(Language::English)
-            .build();
-
-        Self {
-            embedder,
-            scorer: Scorer::new(),
-            title_scorer: Scorer::new(),
-            summary_scorer: Scorer::new(),
-            content_scorer: Scorer::new(),
-            weights: FieldWeights::default(),
-            doc_count: 0,
-            fitted: false,
-        }
-    }
-
-    /// Create a BM25 engine fitted to a corpus.
-    ///
-    /// This calculates the true average document length from the corpus.
-    pub fn fit_to_corpus(documents: &[FieldDocument<K>]) -> Self {
-        // Collect owned strings first
-        let corpus: Vec<String> = documents.iter().map(|d| d.combined_text()).collect();
-        let corpus_refs: Vec<&str> = corpus.iter().map(|s| s.as_str()).collect();
-
-        let embedder = EmbedderBuilder::with_fit_to_corpus(Language::English, &corpus_refs).build();
-
-        let mut engine = Self {
-            embedder,
-            scorer: Scorer::new(),
-            title_scorer: Scorer::new(),
-            summary_scorer: Scorer::new(),
-            content_scorer: Scorer::new(),
-            weights: FieldWeights::default(),
-            doc_count: 0,
-            fitted: true,
-        };
-
-        // Index all documents
-        for doc in documents {
-            engine.upsert(doc);
-        }
-
-        engine
-    }
-
-    /// Set field weights.
-    pub fn with_weights(mut self, weights: FieldWeights) -> Self {
-        self.weights = weights;
-        self
-    }
-
-    /// Set language for tokenization.
-    pub fn with_language(mut self, language: Language) -> Self {
-        self.embedder = EmbedderBuilder::with_avgdl(self.embedder.avgdl())
-            .language_mode(language)
-            .build();
-        self
-    }
-
-    /// Get the average document length.
-    pub fn avgdl(&self) -> f32 {
-        self.embedder.avgdl()
-    }
-
-    /// Check if the engine has been fitted to a corpus.
-    pub fn is_fitted(&self) -> bool {
-        self.fitted
-    }
-
-    /// Upsert a document into the index.
-    ///
-    /// This stores embeddings for each field separately for weighted scoring.
-    pub fn upsert(&mut self, document: &FieldDocument<K>) {
-        let id = &document.id;
-
-        // Embed and store each field separately
-        let title_emb = self.embedder.embed(&document.title);
-        let summary_emb = self.embedder.embed(&document.summary);
-        let content_emb = self.embedder.embed(&document.content);
-
-        self.title_scorer.upsert(id, title_emb);
-        self.summary_scorer.upsert(id, summary_emb);
-        self.content_scorer.upsert(id, content_emb);
-
-        // Also store combined embedding for basic search
-        let combined = self.embedder.embed(&document.combined_text());
-        self.scorer.upsert(id, combined);
-
-        self.doc_count += 1;
-    }
-
-    /// Remove a document from the index.
-    pub fn remove(&mut self, id: &K) {
-        self.scorer.remove(id);
-        self.title_scorer.remove(id);
-        self.summary_scorer.remove(id);
-        self.content_scorer.remove(id);
-        self.doc_count = self.doc_count.saturating_sub(1);
-    }
-
-    /// Get the number of indexed documents.
-    pub fn len(&self) -> usize {
-        self.doc_count
-    }
-
-    /// Check if the index is empty.
-    pub fn is_empty(&self) -> bool {
-        self.doc_count == 0
-    }
-
-    /// Score a single document against a query.
-    ///
-    /// Returns None if the document is not in the index.
-    pub fn score(&self, id: &K, query: &str) -> Option<f32> {
-        let query_emb = self.embedder.embed(query);
-
-        // Score each field
-        let title_score = self.title_scorer.score(id, &query_emb)?;
-        let summary_score = self.summary_scorer.score(id, &query_emb)?;
-        let content_score = self.content_scorer.score(id, &query_emb)?;
-
-        // Weighted combination
-        let total_weight = self.weights.title + self.weights.summary + self.weights.content;
-        let weighted_score = (title_score * self.weights.title
-            + summary_score * self.weights.summary
-            + content_score * self.weights.content)
-            / total_weight;
-
-        Some(weighted_score)
-    }
-
-    /// Search for documents matching a query.
-    ///
-    /// Returns documents sorted by score (descending).
-    pub fn search(&self, query: &str, limit: usize) -> Vec<ScoredDocument<K>> {
-        let query_emb = self.embedder.embed(query);
-        self.scorer
-            .matches(&query_emb)
-            .into_iter()
-            .take(limit)
-            .collect()
-    }
-
-    /// Search with per-field weighting.
-    ///
-    /// This is slower but provides more accurate weighted scores.
-    pub fn search_weighted(&self, query: &str, limit: usize) -> Vec<(K, f32)> {
-        let query_emb = self.embedder.embed(query);
-
-        // Get all document IDs from the main scorer
-        let all_results = self.scorer.matches(&query_emb);
-
-        let mut scored: Vec<(K, f32)> = all_results
-            .into_iter()
-            .filter_map(|scored_doc| {
-                let id = scored_doc.id;
-
-                // Get per-field scores
-                let title_score = self.title_scorer.score(&id, &query_emb)?;
-                let summary_score = self.summary_scorer.score(&id, &query_emb)?;
-                let content_score = self.content_scorer.score(&id, &query_emb)?;
-
-                let total_weight = self.weights.title + self.weights.summary + self.weights.content;
-                let weighted_score = (title_score * self.weights.title
-                    + summary_score * self.weights.summary
-                    + content_score * self.weights.content)
-                    / total_weight;
-
-                Some((id, weighted_score))
-            })
-            .collect();
-
-        scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
-        scored.truncate(limit);
-        scored
-    }
-
-    /// Extract keywords from a query (tokenize and filter).
-    pub fn tokenize(&self, text: &str) -> Vec<String> {
-        let tokenizer = DefaultTokenizer::builder()
-            .language_mode(Language::English)
-            .normalization(true)
-            .stopwords(true)
-            .stemming(true)
-            .build();
-        tokenizer.tokenize(text)
-    }
-
-    /// Get the underlying embedder.
-    pub fn embedder(&self) -> &Embedder {
-        &self.embedder
-    }
-
-    /// Get mutable access to the embedder.
-    pub fn embedder_mut(&mut self) -> &mut Embedder {
-        &mut self.embedder
-    }
-}
-
-impl<K> Default for Bm25Engine<K> {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-/// Query expansion result from LLM.
-#[derive(Debug, Clone)]
-pub struct ExpandedQuery {
-    /// Original query.
-    pub original: String,
-    /// Expanded terms.
-    pub expansions: Vec<String>,
-    /// Combined query (original + expansions).
-    pub combined: String,
-}
-
-impl ExpandedQuery {
-    /// Create a new expanded query.
-    pub fn new(original: String, expansions: Vec<String>) -> Self {
-        let combined = format!("{} {}", original, expansions.join(" "));
-        Self {
-            original,
-            expansions,
-            combined,
-        }
-    }
-}
-
-/// Query expander trait for LLM-based expansion.
-#[async_trait::async_trait]
-pub trait QueryExpander: Send + Sync {
-    /// Expand a query with related terms.
-    async fn expand(&self, query: &str) -> ExpandedQuery;
-}
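As a reviewer aid, a trivial implementor of the trait just above, backed by a static synonym table instead of an LLM. `SynonymExpander` is hypothetical; only the trait and `ExpandedQuery` come from the deleted module.

```rust
// Sketch: a minimal, non-LLM QueryExpander for tests or offline use.
struct SynonymExpander;

#[async_trait::async_trait]
impl QueryExpander for SynonymExpander {
    async fn expand(&self, query: &str) -> ExpandedQuery {
        // Expand with hard-coded related terms; a real expander would call an LLM.
        let expansions = if query.contains("rust") {
            vec!["cargo".to_string(), "crate".to_string()]
        } else {
            Vec::new()
        };
        ExpandedQuery::new(query.to_string(), expansions)
    }
}
```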
-
-/// Common English stop words for keyword filtering.
-pub const STOPWORDS: &[&str] = &[
-    "a",
-    "an",
-    "the",
-    "is",
-    "are",
-    "was",
-    "were",
-    "be",
-    "been",
-    "being",
-    "have",
-    "has",
-    "had",
-    "do",
-    "does",
-    "did",
-    "will",
-    "would",
-    "could",
-    "should",
-    "may",
-    "might",
-    "must",
-    "shall",
-    "can",
-    "need",
-    "dare",
-    "ought",
-    "used",
-    "to",
-    "of",
-    "in",
-    "for",
-    "on",
-    "with",
-    "at",
-    "by",
-    "from",
-    "as",
-    "into",
-    "through",
-    "during",
-    "before",
-    "after",
-    "above",
-    "below",
-    "between",
-    "under",
-    "again",
-    "further",
-    "then",
-    "once",
-    "here",
-    "there",
-    "when",
-    "where",
-    "why",
-    "how",
-    "all",
-    "each",
-    "few",
-    "more",
-    "most",
-    "other",
-    "some",
-    "such",
-    "no",
-    "nor",
-    "not",
-    "only",
-    "own",
-    "same",
-    "so",
-    "than",
-    "too",
-    "very",
-    "just",
-    "and",
-    "but",
-    "if",
-    "or",
-    "because",
-    "until",
-    "while",
-    "about",
-    "what",
-    "which",
-    "who",
-    "whom",
-    "this",
-    "that",
-    "these",
-    "those",
-    "i",
-    "me",
-    "my",
-    "myself",
-    "we",
-    "our",
-    "ours",
-    "ourselves",
-    "you",
-    "your",
-    "yours",
-    "yourself",
-    "yourselves",
-    "he",
-    "him",
-    "his",
-    "himself",
-    "she",
-    "her",
-    "hers",
-    "herself",
-    "it",
-    "its",
-    "itself",
-    "they",
-    "them",
-    "their",
-    "theirs",
-    "themselves",
-];
-
-/// Extract keywords from a query string, filtering stop words.
-///
-/// This is a simple keyword extraction that:
-/// - Converts to lowercase
-/// - Splits on non-alphanumeric characters
-/// - Filters out stop words
-/// - Requires minimum length of 2 characters
-#[must_use]
-pub fn extract_keywords(query: &str) -> Vec<String> {
-    query
-        .to_lowercase()
-        .split(|c: char| !c.is_alphanumeric())
-        .filter(|s| {
-            let s = *s;
-            !s.is_empty() && s.len() > 1 && !STOPWORDS.contains(&s)
-        })
-        .map(String::from)
-        .collect()
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_bm25_engine_creation() {
-        let engine: Bm25Engine<u32> = Bm25Engine::new();
-        assert!(engine.is_empty());
-        assert!(!engine.is_fitted());
-    }
-
-    #[test]
-    fn test_bm25_engine_fit_to_corpus() {
-        let docs = vec![
-            FieldDocument::new(
-                1u32,
-                "Rust Programming".to_string(),
-                "About Rust".to_string(),
-                "Rust is a systems programming language.".to_string(),
-            ),
-            FieldDocument::new(
-                2u32,
-                "Python Guide".to_string(),
-                "About Python".to_string(),
-                "Python is a scripting language.".to_string(),
-            ),
-        ];
-
-        let engine = Bm25Engine::fit_to_corpus(&docs);
-        assert!(engine.is_fitted());
-        assert_eq!(engine.len(), 2);
-    }
-
-    #[test]
-    fn test_bm25_search() {
-        let docs = vec![
-            FieldDocument::new(
-                1u32,
-                "Rust Programming".to_string(),
-                "About Rust".to_string(),
-                "Rust is a systems programming language with memory safety.".to_string(),
-            ),
-            FieldDocument::new(
-                2u32,
-                "Python Guide".to_string(),
-                "About Python".to_string(),
-                "Python is a scripting language for data science.".to_string(),
-            ),
-            FieldDocument::new(
-                3u32,
-                "Rust Memory Safety".to_string(),
-                "Memory in Rust".to_string(),
-                "Rust provides guaranteed memory safety without garbage collection.".to_string(),
-            ),
-        ];
-
-        let engine = Bm25Engine::fit_to_corpus(&docs);
-        let results = engine.search("rust memory", 10);
-
-        assert!(!results.is_empty());
-        // Documents about Rust should rank higher
-        assert!(results.iter().any(|r| r.id == 1 || r.id == 3));
-    }
-
-    #[test]
-    fn test_bm25_weighted_search() {
-        let docs = vec![
-            FieldDocument::new(
-                1u32,
-                "Rust Programming".to_string(),
-                "About memory safety".to_string(),
-                "Content about other things.".to_string(),
-            ),
-            FieldDocument::new(
-                2u32,
- "Other Language".to_string(), - "About other things".to_string(), - "Rust memory safety is important.".to_string(), - ), - ]; - - let engine = Bm25Engine::fit_to_corpus(&docs).with_weights(FieldWeights { - title: 3.0, - summary: 2.0, - content: 1.0, - }); - - let results = engine.search_weighted("rust", 10); - - // Doc 1 has "Rust" in title, should rank higher - assert_eq!(results.first().map(|(id, _)| *id), Some(1u32)); - } - - #[test] - fn test_bm25_score() { - let docs = vec![FieldDocument::new( - 1u32, - "Rust Programming".to_string(), - "About Rust".to_string(), - "Rust is a systems programming language.".to_string(), - )]; - - let engine = Bm25Engine::fit_to_corpus(&docs); - let score = engine.score(&1u32, "rust programming"); - - assert!(score.is_some()); - assert!(score.unwrap() > 0.0); - } - - #[test] - fn test_bm25_tokenize() { - let engine: Bm25Engine = Bm25Engine::new(); - let tokens = engine.tokenize("What is the Rust programming language?"); - - // Should filter stop words and stem - assert!(tokens.contains(&"rust".to_string())); - assert!(tokens.contains(&"program".to_string())); // stemmed - assert!(!tokens.contains(&"what".to_string())); // stop word - assert!(!tokens.contains(&"the".to_string())); // stop word - } - - #[test] - fn test_bm25_remove() { - let docs = vec![FieldDocument::new( - 1u32, - "Rust".to_string(), - "About Rust".to_string(), - "Rust content.".to_string(), - )]; - - let mut engine = Bm25Engine::fit_to_corpus(&docs); - assert_eq!(engine.len(), 1); - - engine.remove(&1u32); - assert!(engine.is_empty()); - } - - #[test] - fn test_field_weights_default() { - let weights = FieldWeights::default(); - assert!((weights.title - 2.0).abs() < f32::EPSILON); - assert!((weights.summary - 1.5).abs() < f32::EPSILON); - assert!((weights.content - 1.0).abs() < f32::EPSILON); - } - - #[test] - fn test_bm25_params_default() { - let params = Bm25Params::default(); - assert!((params.k1 - 1.2).abs() < f32::EPSILON); - assert!((params.b - 0.75).abs() < f32::EPSILON); - assert!((params.avgdl - 100.0).abs() < f32::EPSILON); - } - - #[test] - fn test_expanded_query() { - let expanded = ExpandedQuery::new( - "rust".to_string(), - vec!["programming".to_string(), "language".to_string()], - ); - - assert_eq!(expanded.original, "rust"); - assert_eq!(expanded.expansions.len(), 2); - assert_eq!(expanded.combined, "rust programming language"); - } -} diff --git a/vectorless-core/vectorless/src/scoring/mod.rs b/vectorless-core/vectorless/src/scoring/mod.rs deleted file mode 100644 index eac4e435..00000000 --- a/vectorless-core/vectorless/src/scoring/mod.rs +++ /dev/null @@ -1,8 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Scoring utilities — keyword extraction via BM25. - -pub mod bm25; - -pub use bm25::extract_keywords; diff --git a/vectorless-core/vectorless/src/storage/backend/file.rs b/vectorless-core/vectorless/src/storage/backend/file.rs deleted file mode 100644 index ab461fe7..00000000 --- a/vectorless-core/vectorless/src/storage/backend/file.rs +++ /dev/null @@ -1,293 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! File system storage backend. - -use std::fs; -use std::path::{Path, PathBuf}; -use std::sync::RwLock; - -use tracing::debug; - -use super::StorageBackend; -use crate::Error; -use crate::error::Result; - -/// File system storage backend. -/// -/// Stores each key-value pair as a separate file in a directory. 
-/// The key is used as the filename (with `.bin` extension).
-///
-/// # Structure
-///
-/// ```text
-/// workspace/
-/// ├── doc-1.bin          # Document 1
-/// ├── doc-2.bin          # Document 2
-/// ├── meta.bin           # Metadata index
-/// └── .workspace.lock    # Lock file
-/// ```
-///
-/// # Thread Safety
-///
-/// Uses `RwLock` for thread-safe operations on the directory listing cache.
-#[derive(Debug)]
-pub struct FileBackend {
-    /// Root directory for storage.
-    root: PathBuf,
-    /// Cached directory listing (refreshed on miss).
-    cache: RwLock<Option<Vec<String>>>,
-}
-
-impl FileBackend {
-    /// Create a new file backend at the given path.
-    ///
-    /// Creates the directory if it doesn't exist.
-    pub fn new(path: impl Into<PathBuf>) -> Result<Self> {
-        let root = path.into();
-        fs::create_dir_all(&root).map_err(Error::Io)?;
-
-        Ok(Self {
-            root,
-            cache: RwLock::new(None),
-        })
-    }
-
-    /// Open an existing file backend.
-    ///
-    /// Creates the directory if it doesn't exist.
-    pub fn open(path: impl Into<PathBuf>) -> Result<Self> {
-        Self::new(path)
-    }
-
-    /// Get the root path.
-    pub fn root(&self) -> &Path {
-        &self.root
-    }
-
-    /// Convert a key to a file path.
-    fn key_to_path(&self, key: &str) -> PathBuf {
-        // Sanitize key to prevent path traversal
-        let sanitized = key.replace("..", "_").replace(['/', '\\', ':'], "_");
-        self.root.join(format!("{}.bin", sanitized))
-    }
-
-    /// Refresh the directory listing cache.
-    fn refresh_cache(&self) -> Result<Vec<String>> {
-        let entries: Vec<String> = fs::read_dir(&self.root)
-            .map_err(Error::Io)?
-            .filter_map(|entry| entry.ok())
-            .filter_map(|entry| {
-                let path = entry.path();
-                if path.extension()?.to_str()? == "bin" {
-                    path.file_stem()?.to_str().map(|s| s.to_string())
-                } else {
-                    None
-                }
-            })
-            .collect();
-
-        // Update cache
-        if let Ok(mut cache) = self.cache.write() {
-            *cache = Some(entries.clone());
-        }
-
-        Ok(entries)
-    }
-
-    /// Get cached keys or refresh cache.
-    fn get_keys(&self) -> Result<Vec<String>> {
-        // Try to read from cache first
-        if let Ok(cache) = self.cache.read() {
-            if let Some(ref keys) = *cache {
-                return Ok(keys.clone());
-            }
-        }
-
-        // Refresh cache
-        self.refresh_cache()
-    }
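For reference, a minimal sketch of driving this backend through the `StorageBackend` trait; the workspace path and keys are illustrative, and the atomicity note refers to the temp-file-plus-rename `put` implementation that continues just below.

```rust
// Sketch: basic round trip through the file backend (illustrative values).
let backend = FileBackend::new("./workspace")?;
backend.put("doc-1", b"tree bytes")?; // written atomically: temp file + rename
assert!(backend.exists("doc-1")?);
assert_eq!(backend.get("doc-1")?, Some(b"tree bytes".to_vec()));
backend.delete("doc-1")?;
```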
-    /// Invalidate the cache.
-    pub fn invalidate_cache(&self) {
-        if let Ok(mut cache) = self.cache.write() {
-            *cache = None;
-        }
-    }
-}
-
-impl StorageBackend for FileBackend {
-    fn get(&self, key: &str) -> Result<Option<Vec<u8>>> {
-        let path = self.key_to_path(key);
-
-        if !path.exists() {
-            return Ok(None);
-        }
-
-        let data = fs::read(&path).map_err(Error::Io)?;
-        debug!("Read {} bytes from {}", data.len(), key);
-
-        Ok(Some(data))
-    }
-
-    fn put(&self, key: &str, value: &[u8]) -> Result<()> {
-        let path = self.key_to_path(key);
-
-        // Use atomic write (temp file + rename)
-        let temp_path = path.with_extension("tmp");
-
-        fs::write(&temp_path, value).map_err(Error::Io)?;
-        fs::rename(&temp_path, &path).map_err(Error::Io)?;
-
-        // Invalidate cache
-        self.invalidate_cache();
-
-        debug!("Wrote {} bytes to {}", value.len(), key);
-        Ok(())
-    }
-
-    fn delete(&self, key: &str) -> Result<bool> {
-        let path = self.key_to_path(key);
-
-        if !path.exists() {
-            return Ok(false);
-        }
-
-        fs::remove_file(&path).map_err(Error::Io)?;
-
-        // Invalidate cache
-        self.invalidate_cache();
-
-        debug!("Deleted {}", key);
-        Ok(true)
-    }
-
-    fn exists(&self, key: &str) -> Result<bool> {
-        let path = self.key_to_path(key);
-        Ok(path.exists())
-    }
-
-    fn keys(&self) -> Result<Vec<String>> {
-        self.get_keys()
-    }
-
-    fn len(&self) -> Result<usize> {
-        Ok(self.get_keys()?.len())
-    }
-
-    fn clear(&self) -> Result<()> {
-        let keys = self.get_keys()?;
-
-        for key in &keys {
-            let path = self.key_to_path(key);
-            if path.exists() {
-                fs::remove_file(&path).map_err(Error::Io)?;
-            }
-        }
-
-        // Clear cache
-        if let Ok(mut cache) = self.cache.write() {
-            *cache = None;
-        }
-
-        debug!("Cleared {} entries", keys.len());
-        Ok(())
-    }
-
-    fn backend_name(&self) -> &'static str {
-        "file"
-    }
-
-    fn batch_put(&self, items: &[(&str, &[u8])]) -> Result<()> {
-        for (key, value) in items {
-            self.put(key, value)?;
-        }
-        Ok(())
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use tempfile::TempDir;
-
-    #[test]
-    fn test_file_backend_basic() {
-        let temp = TempDir::new().unwrap();
-        let backend = FileBackend::new(temp.path()).unwrap();
-
-        // Put and get
-        backend.put("key1", b"value1").unwrap();
-        let value = backend.get("key1").unwrap();
-        assert_eq!(value, Some(b"value1".to_vec()));
-
-        // Exists
-        assert!(backend.exists("key1").unwrap());
-        assert!(!backend.exists("key2").unwrap());
-
-        // Delete
-        assert!(backend.delete("key1").unwrap());
-        assert!(!backend.exists("key1").unwrap());
-        assert!(!backend.delete("key1").unwrap()); // Already deleted
-    }
-
-    #[test]
-    fn test_file_backend_keys() {
-        let temp = TempDir::new().unwrap();
-        let backend = FileBackend::new(temp.path()).unwrap();
-
-        backend.put("key1", b"v1").unwrap();
-        backend.put("key2", b"v2").unwrap();
-        backend.put("key3", b"v3").unwrap();
-
-        let keys = backend.keys().unwrap();
-        assert_eq!(keys.len(), 3);
-        assert!(keys.contains(&"key1".to_string()));
-    }
-
-    #[test]
-    fn test_file_backend_clear() {
-        let temp = TempDir::new().unwrap();
-        let backend = FileBackend::new(temp.path()).unwrap();
-
-        backend.put("key1", b"v1").unwrap();
-        backend.put("key2", b"v2").unwrap();
-
-        backend.clear().unwrap();
-
-        assert!(backend.is_empty().unwrap());
-    }
-
-    #[test]
-    fn test_file_backend_batch() {
-        let temp = TempDir::new().unwrap();
-        let backend = FileBackend::new(temp.path()).unwrap();
-
-        let items: Vec<(&str, &[u8])> = vec![
-            ("k1", b"v1".as_slice()),
-            ("k2", b"v2".as_slice()),
-            ("k3", b"v3".as_slice()),
-        ];
-
-        backend.batch_put(&items).unwrap();
-
-        let results = backend.batch_get(&["k1", "k2", "k3", "k4"]).unwrap();
-
-        assert_eq!(results.len(), 4);
-        assert!(results[0].is_some());
-        assert!(results[3].is_none());
-    }
-
-    #[test]
-    fn test_file_backend_key_sanitization() {
-        let temp = TempDir::new().unwrap();
-        let backend = FileBackend::new(temp.path()).unwrap();
-
-        // Keys with special characters should be sanitized
-        backend.put("../etc/passwd", b"malicious").unwrap();
-        backend.put("path/to/file", b"nested").unwrap();
-
-        // Both should be stored safely
-        assert!(backend.exists("../etc/passwd").unwrap());
-        assert!(backend.exists("path/to/file").unwrap());
    }
-}
diff --git a/vectorless-core/vectorless/src/storage/backend/memory.rs b/vectorless-core/vectorless/src/storage/backend/memory.rs
deleted file mode 100644
index 4844f8e2..00000000
--- a/vectorless-core/vectorless/src/storage/backend/memory.rs
+++ /dev/null
@@ -1,181 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! In-memory storage backend (for testing).
-
-use std::collections::HashMap;
-use std::sync::RwLock;
-
-use super::StorageBackend;
-use crate::error::Result;
-
-/// In-memory storage backend.
-///
-/// Stores all data in a `HashMap`. Useful for testing and scenarios
-/// where persistence is not required.
-///
-/// # Thread Safety
-///
-/// Uses `RwLock` for thread-safe access to the internal map.
-#[derive(Debug, Default)]
-pub struct MemoryBackend {
-    /// Internal storage.
-    data: RwLock<HashMap<String, Vec<u8>>>,
-}
-
-impl MemoryBackend {
-    /// Create a new in-memory backend.
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Create a new in-memory backend with pre-seeded data.
-    pub fn with_data(data: HashMap<String, Vec<u8>>) -> Self {
-        Self {
-            data: RwLock::new(data),
-        }
-    }
-}
-
-impl StorageBackend for MemoryBackend {
-    fn get(&self, key: &str) -> Result<Option<Vec<u8>>> {
-        let data = self
-            .data
-            .read()
-            .map_err(|_| crate::Error::Cache("Memory backend lock poisoned".to_string()))?;
-        Ok(data.get(key).cloned())
-    }
-
-    fn put(&self, key: &str, value: &[u8]) -> Result<()> {
-        let mut data = self
-            .data
-            .write()
-            .map_err(|_| crate::Error::Cache("Memory backend lock poisoned".to_string()))?;
-        data.insert(key.to_string(), value.to_vec());
-        Ok(())
-    }
-
-    fn delete(&self, key: &str) -> Result<bool> {
-        let mut data = self
-            .data
-            .write()
-            .map_err(|_| crate::Error::Cache("Memory backend lock poisoned".to_string()))?;
-        Ok(data.remove(key).is_some())
-    }
-
-    fn exists(&self, key: &str) -> Result<bool> {
-        let data = self
-            .data
-            .read()
-            .map_err(|_| crate::Error::Cache("Memory backend lock poisoned".to_string()))?;
-        Ok(data.contains_key(key))
-    }
-
-    fn keys(&self) -> Result<Vec<String>> {
-        let data = self
-            .data
-            .read()
-            .map_err(|_| crate::Error::Cache("Memory backend lock poisoned".to_string()))?;
-        Ok(data.keys().cloned().collect())
-    }
-
-    fn len(&self) -> Result<usize> {
-        let data = self
-            .data
-            .read()
-            .map_err(|_| crate::Error::Cache("Memory backend lock poisoned".to_string()))?;
-        Ok(data.len())
-    }
-
-    fn clear(&self) -> Result<()> {
-        let mut data = self
-            .data
-            .write()
-            .map_err(|_| crate::Error::Cache("Memory backend lock poisoned".to_string()))?;
-        data.clear();
-        Ok(())
-    }
-
-    fn batch_put(&self, items: &[(&str, &[u8])]) -> Result<()> {
-        let mut data = self
-            .data
-            .write()
-            .map_err(|_| crate::Error::Cache("Memory backend lock poisoned".to_string()))?;
-        for (key, value) in items {
-            data.insert(key.to_string(), value.to_vec());
-        }
-        Ok(())
-    }
-
-    fn backend_name(&self) -> &'static str {
-        "memory"
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_memory_backend_basic() {
-        
let backend = MemoryBackend::new(); - - // Put and get - backend.put("key1", b"value1").unwrap(); - let value = backend.get("key1").unwrap(); - assert_eq!(value, Some(b"value1".to_vec())); - - // Non-existent key - let missing = backend.get("missing").unwrap(); - assert!(missing.is_none()); - } - - #[test] - fn test_memory_backend_delete() { - let backend = MemoryBackend::new(); - - backend.put("key1", b"value1").unwrap(); - assert!(backend.exists("key1").unwrap()); - - let deleted = backend.delete("key1").unwrap(); - assert!(deleted); - assert!(!backend.exists("key1").unwrap()); - - // Delete non-existent - let not_deleted = backend.delete("missing").unwrap(); - assert!(!not_deleted); - } - - #[test] - fn test_memory_backend_keys() { - let backend = MemoryBackend::new(); - - backend.put("key1", b"v1").unwrap(); - backend.put("key2", b"v2").unwrap(); - backend.put("key3", b"v3").unwrap(); - - let keys = backend.keys().unwrap(); - assert_eq!(keys.len(), 3); - } - - #[test] - fn test_memory_backend_clear() { - let backend = MemoryBackend::new(); - - backend.put("key1", b"v1").unwrap(); - backend.put("key2", b"v2").unwrap(); - - backend.clear().unwrap(); - assert!(backend.is_empty().unwrap()); - } - - #[test] - fn test_memory_backend_with_data() { - let mut initial = HashMap::new(); - initial.insert("k1".to_string(), b"v1".to_vec()); - initial.insert("k2".to_string(), b"v2".to_vec()); - - let backend = MemoryBackend::with_data(initial); - assert_eq!(backend.len().unwrap(), 2); - } -} diff --git a/vectorless-core/vectorless/src/storage/backend/mod.rs b/vectorless-core/vectorless/src/storage/backend/mod.rs deleted file mode 100644 index a8bc8053..00000000 --- a/vectorless-core/vectorless/src/storage/backend/mod.rs +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Storage backend abstraction. -//! -//! This module provides a trait-based abstraction for different storage backends, -//! allowing the workspace to work with various storage systems: -//! -//! - **FileBackend**: File system storage (default) -//! - **MemoryBackend**: In-memory storage (for testing) -//! -//! # Example -//! -//! ```rust,ignore -//! use vectorless::storage::backend::{StorageBackend, FileBackend}; -//! -//! let backend = FileBackend::new("./workspace"); -//! -//! // Store data -//! backend.put("doc-1", b"document data")?; -//! -//! // Retrieve data -//! let data = backend.get("doc-1")?; -//! -//! // List all keys -//! let keys = backend.keys()?; -//! ``` - -mod file; -mod memory; -mod trait_def; - -pub use file::FileBackend; -pub use trait_def::StorageBackend; diff --git a/vectorless-core/vectorless/src/storage/backend/trait_def.rs b/vectorless-core/vectorless/src/storage/backend/trait_def.rs deleted file mode 100644 index 782bdac0..00000000 --- a/vectorless-core/vectorless/src/storage/backend/trait_def.rs +++ /dev/null @@ -1,113 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Storage backend trait definition. - -use std::fmt::Debug; - -use crate::error::Result; - -/// Storage backend trait for abstracting different storage systems. -/// -/// This trait provides a simple key-value interface for document storage. -/// Implementations can use different underlying storage systems: -/// -/// - File system -/// - In-memory (for testing) -/// - Database (SQLite, RocksDB, etc.) -/// - Cloud storage (S3, etc.) 
-///
-/// # Thread Safety
-///
-/// All implementations must be `Send + Sync` to support concurrent access.
-pub trait StorageBackend: Debug + Send + Sync {
-    /// Get a value by key.
-    ///
-    /// Returns `None` if the key doesn't exist.
-    fn get(&self, key: &str) -> Result<Option<Vec<u8>>>;
-
-    /// Store a value with the given key.
-    ///
-    /// Overwrites any existing value.
-    fn put(&self, key: &str, value: &[u8]) -> Result<()>;
-
-    /// Delete a value by key.
-    ///
-    /// Returns `true` if the value was deleted, `false` if it didn't exist.
-    fn delete(&self, key: &str) -> Result<bool>;
-
-    /// Check if a key exists.
-    fn exists(&self, key: &str) -> Result<bool>;
-
-    /// List all keys in the storage.
-    fn keys(&self) -> Result<Vec<String>>;
-
-    /// Get the number of entries in storage.
-    fn len(&self) -> Result<usize>;
-
-    /// Check if storage is empty.
-    fn is_empty(&self) -> Result<bool> {
-        Ok(self.len()? == 0)
-    }
-
-    /// Clear all entries from storage.
-    fn clear(&self) -> Result<()>;
-
-    // ========================================================================
-    // Batch operations (optional, default implementations)
-    // ========================================================================
-
-    /// Get multiple values by keys.
-    ///
-    /// Returns a vector of options, one for each key.
-    fn batch_get(&self, keys: &[&str]) -> Result<Vec<Option<Vec<u8>>>> {
-        keys.iter().map(|k| self.get(k)).collect()
-    }
-
-    /// Store multiple key-value pairs.
-    ///
-    /// Default implementation calls `put` for each item.
-    fn batch_put(&self, items: &[(&str, &[u8])]) -> Result<()> {
-        for (key, value) in items {
-            self.put(key, value)?;
-        }
-        Ok(())
-    }
-
-    /// Delete multiple keys.
-    ///
-    /// Returns the number of keys that were actually deleted.
-    fn batch_delete(&self, keys: &[&str]) -> Result<usize> {
-        let mut count = 0;
-        for key in keys {
-            if self.delete(key)? {
-                count += 1;
-            }
-        }
-        Ok(count)
-    }
-
-    // ========================================================================
-    // Metadata operations
-    // ========================================================================
-
-    /// Get storage backend name.
-    fn backend_name(&self) -> &'static str;
-
-    /// Get storage statistics.
-    fn stats(&self) -> StorageStats {
-        StorageStats {
-            backend: self.backend_name().to_string(),
-            entries: self.len().unwrap_or(0),
-        }
-    }
-}
-
-/// Storage statistics.
-#[derive(Debug, Clone)]
-pub struct StorageStats {
-    /// Backend name.
-    pub backend: String,
-    /// Number of entries.
-    pub entries: usize,
-}
diff --git a/vectorless-core/vectorless/src/storage/cache.rs b/vectorless-core/vectorless/src/storage/cache.rs
deleted file mode 100644
index 70f7d4a9..00000000
--- a/vectorless-core/vectorless/src/storage/cache.rs
+++ /dev/null
@@ -1,381 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Document cache with LRU eviction policy.
-//!
-//! This module provides a thread-safe LRU cache for loaded documents,
-//! allowing efficient reuse of loaded document data while limiting memory usage.
-//!
-//! # Metrics
-//!
-//! The cache tracks:
-//! - Hits: Number of successful cache lookups
-//! - Misses: Number of failed cache lookups
-//! - Evictions: Number of entries evicted due to capacity
-//! - Utilization: Current usage as percentage of capacity
-
-use std::num::NonZeroUsize;
-use std::sync::Mutex;
-use std::sync::atomic::{AtomicU64, Ordering};
-
-use lru::LruCache;
-
-use super::persistence::PersistedDocument;
-use crate::Error;
-use crate::error::Result;
-
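To make the metrics flow concrete while reviewing the deletion, a short sketch of typical cache interaction. `load_doc` is hypothetical shorthand for however a `PersistedDocument` is obtained; everything else is the deleted API.

```rust
// Sketch: put/get with the hit/miss/eviction counters described above.
let cache = DocumentCache::with_capacity(2);
cache.put("doc-1".to_string(), load_doc("doc-1")?)?; // load_doc is hypothetical
match cache.get("doc-1")? {
    Some(_doc) => { /* hit: counted in cache.hits() */ }
    None => { /* miss: counted in cache.misses() */ }
}
println!("hit rate: {:.2}", cache.hit_rate());
```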
-/// Default cache size (number of documents).
-const DEFAULT_CACHE_SIZE: usize = 100;
-
-/// A thread-safe LRU cache for documents.
-///
-/// Uses interior mutability via `Mutex` for safe concurrent access.
-/// The cache automatically evicts least-recently-used entries when full.
-///
-/// # Metrics
-///
-/// The cache maintains atomic counters for:
-/// - **hits**: Successful cache lookups
-/// - **misses**: Failed cache lookups (document not in cache)
-/// - **evictions**: Entries removed due to capacity limits
-#[derive(Debug)]
-pub struct DocumentCache {
-    /// Inner cache protected by Mutex.
-    inner: Mutex<LruCache<String, PersistedDocument>>,
-    /// Maximum capacity.
-    capacity: usize,
-    /// Number of cache hits.
-    hits: AtomicU64,
-    /// Number of cache misses.
-    misses: AtomicU64,
-    /// Number of cache evictions.
-    evictions: AtomicU64,
-}
-
-impl DocumentCache {
-    /// Create a new cache with default capacity (100 documents).
-    #[must_use]
-    pub fn new() -> Self {
-        Self::with_capacity(DEFAULT_CACHE_SIZE)
-    }
-
-    /// Create a new cache with custom capacity.
-    ///
-    /// # Panics
-    ///
-    /// This function does not panic, but capacities below 1 are normalized to 1.
-    #[must_use]
-    pub fn with_capacity(capacity: usize) -> Self {
-        let capacity = capacity.max(1);
-        let non_zero = NonZeroUsize::new(capacity)
-            .unwrap_or_else(|| NonZeroUsize::new(DEFAULT_CACHE_SIZE).expect("default is non-zero"));
-
-        Self {
-            inner: Mutex::new(LruCache::new(non_zero)),
-            capacity,
-            hits: AtomicU64::new(0),
-            misses: AtomicU64::new(0),
-            evictions: AtomicU64::new(0),
-        }
-    }
-
-    /// Get a document from the cache.
-    ///
-    /// Returns `None` if the document is not in the cache.
-    /// Updates the access order (moves to most-recently-used).
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if the cache lock is poisoned.
-    pub fn get(&self, id: &str) -> Result<Option<PersistedDocument>> {
-        let mut cache = self.lock()?;
-        let result = cache.get(id).cloned();
-
-        // Update metrics
-        if result.is_some() {
-            self.hits.fetch_add(1, Ordering::Relaxed);
-        } else {
-            self.misses.fetch_add(1, Ordering::Relaxed);
-        }
-
-        Ok(result)
-    }
-
-    /// Check if a document is in the cache.
-    pub fn contains(&self, id: &str) -> bool {
-        self.lock().map(|cache| cache.contains(id)).unwrap_or(false)
-    }
-
-    /// Put a document into the cache.
-    ///
-    /// If the cache is full, evicts the least-recently-used entry.
-    /// Returns the previous value stored under `id`, if any.
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if the cache lock is poisoned.
-    pub fn put(&self, id: String, doc: PersistedDocument) -> Result<Option<PersistedDocument>> {
-        let mut cache = self.lock()?;
-
-        // Track capacity before put to detect eviction
-        let was_full = cache.len() >= self.capacity;
-
-        let previous = cache.put(id, doc);
-
-        // An eviction happens only when a full cache admits a new key;
-        // replacing an existing key is not an eviction.
-        if was_full && previous.is_none() {
-            self.evictions.fetch_add(1, Ordering::Relaxed);
-        }
-
-        Ok(previous)
-    }
-
-    /// Remove a document from the cache.
-    ///
-    /// Returns the removed document if it was in the cache.
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if the cache lock is poisoned.
-    pub fn remove(&self, id: &str) -> Result<Option<PersistedDocument>> {
-        let mut cache = self.lock()?;
-        Ok(cache.pop(id))
-    }
-
-    /// Clear all entries from the cache.
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if the cache lock is poisoned.
-    pub fn clear(&self) -> Result<()> {
-        let mut cache = self.lock()?;
-        cache.clear();
-        Ok(())
-    }
-
-    /// Get the number of entries currently in the cache.
-    pub fn len(&self) -> usize {
-        self.lock().map(|cache| cache.len()).unwrap_or(0)
-    }
-
-    /// Check if the cache is empty.
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
-    }
-
-    /// Get the maximum capacity of the cache.
-    pub fn capacity(&self) -> usize {
-        self.capacity
-    }
-
-    /// Get cache utilization (0.0 to 1.0).
-    pub fn utilization(&self) -> f64 {
-        let len = self.len();
-        if self.capacity == 0 {
-            return 0.0;
-        }
-        len as f64 / self.capacity as f64
-    }
-
-    /// Get all document IDs currently in the cache.
-    ///
-    /// # Errors
-    ///
-    /// Returns an error if the cache lock is poisoned.
-    pub fn keys(&self) -> Result<Vec<String>> {
-        let cache = self.lock()?;
-        Ok(cache.iter().map(|(k, _)| k.clone()).collect())
-    }
-
-    /// Get cache statistics including metrics.
-    pub fn stats(&self) -> CacheStats {
-        CacheStats {
-            len: self.len(),
-            capacity: self.capacity,
-            utilization: self.utilization(),
-            hits: self.hits.load(Ordering::Relaxed),
-            misses: self.misses.load(Ordering::Relaxed),
-            evictions: self.evictions.load(Ordering::Relaxed),
-        }
-    }
-
-    /// Get the number of cache hits.
-    pub fn hits(&self) -> u64 {
-        self.hits.load(Ordering::Relaxed)
-    }
-
-    /// Get the number of cache misses.
-    pub fn misses(&self) -> u64 {
-        self.misses.load(Ordering::Relaxed)
-    }
-
-    /// Get the number of cache evictions.
-    pub fn evictions(&self) -> u64 {
-        self.evictions.load(Ordering::Relaxed)
-    }
-
-    /// Get the cache hit rate (0.0 to 1.0).
-    pub fn hit_rate(&self) -> f64 {
-        let hits = self.hits.load(Ordering::Relaxed);
-        let misses = self.misses.load(Ordering::Relaxed);
-        let total = hits + misses;
-        if total == 0 {
-            0.0
-        } else {
-            hits as f64 / total as f64
-        }
-    }
-
-    /// Reset all metrics counters to zero.
-    pub fn reset_metrics(&self) {
-        self.hits.store(0, Ordering::Relaxed);
-        self.misses.store(0, Ordering::Relaxed);
-        self.evictions.store(0, Ordering::Relaxed);
-    }
-
-    /// Lock the inner cache.
-    fn lock(&self) -> Result<std::sync::MutexGuard<'_, LruCache<String, PersistedDocument>>> {
-        self.inner
-            .lock()
-            .map_err(|_| Error::Cache("Cache lock poisoned".to_string()))
-    }
-}
-
-impl Default for DocumentCache {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-/// Cache statistics including metrics.
-#[derive(Debug, Clone, Copy)]
-pub struct CacheStats {
-    /// Number of entries in cache.
-    pub len: usize,
-    /// Maximum capacity.
-    pub capacity: usize,
-    /// Utilization (0.0 to 1.0).
-    pub utilization: f64,
-    /// Number of cache hits.
-    pub hits: u64,
-    /// Number of cache misses.
-    pub misses: u64,
-    /// Number of cache evictions.
- pub evictions: u64, -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::document::DocumentTree; - use crate::storage::{DocumentMeta, PersistedDocument}; - - fn create_test_doc(id: &str) -> PersistedDocument { - let meta = DocumentMeta::new(id, "Test Doc", "md"); - let tree = DocumentTree::new("Root", "Content"); - PersistedDocument::new(meta, tree) - } - - #[test] - fn test_cache_basic() { - let cache = DocumentCache::with_capacity(3); - - // Add documents - let doc1 = create_test_doc("doc1"); - let doc2 = create_test_doc("doc2"); - - cache.put("doc1".to_string(), doc1.clone()).unwrap(); - cache.put("doc2".to_string(), doc2.clone()).unwrap(); - - assert_eq!(cache.len(), 2); - assert!(cache.contains("doc1")); - assert!(cache.contains("doc2")); - } - - #[test] - fn test_cache_get() { - let cache = DocumentCache::with_capacity(3); - let doc = create_test_doc("doc1"); - - cache.put("doc1".to_string(), doc).unwrap(); - - let retrieved = cache.get("doc1").unwrap(); - assert!(retrieved.is_some()); - assert_eq!(retrieved.unwrap().meta.id, "doc1"); - - let missing = cache.get("missing").unwrap(); - assert!(missing.is_none()); - } - - #[test] - fn test_cache_eviction() { - let cache = DocumentCache::with_capacity(2); - - cache - .put("doc1".to_string(), create_test_doc("doc1")) - .unwrap(); - cache - .put("doc2".to_string(), create_test_doc("doc2")) - .unwrap(); - cache - .put("doc3".to_string(), create_test_doc("doc3")) - .unwrap(); - - // doc1 should be evicted (least recently used) - assert!(!cache.contains("doc1")); - assert!(cache.contains("doc2")); - assert!(cache.contains("doc3")); - } - - #[test] - fn test_cache_remove() { - let cache = DocumentCache::new(); - - cache - .put("doc1".to_string(), create_test_doc("doc1")) - .unwrap(); - assert!(cache.contains("doc1")); - - let removed = cache.remove("doc1").unwrap(); - assert!(removed.is_some()); - assert!(!cache.contains("doc1")); - - let not_found = cache.remove("missing").unwrap(); - assert!(not_found.is_none()); - } - - #[test] - fn test_cache_clear() { - let cache = DocumentCache::new(); - - cache - .put("doc1".to_string(), create_test_doc("doc1")) - .unwrap(); - cache - .put("doc2".to_string(), create_test_doc("doc2")) - .unwrap(); - - assert_eq!(cache.len(), 2); - - cache.clear().unwrap(); - - assert!(cache.is_empty()); - } - - #[test] - fn test_cache_utilization() { - let cache = DocumentCache::with_capacity(10); - - assert_eq!(cache.utilization(), 0.0); - - cache - .put("doc1".to_string(), create_test_doc("doc1")) - .unwrap(); - assert!((cache.utilization() - 0.1).abs() < 0.01); - - cache - .put("doc2".to_string(), create_test_doc("doc2")) - .unwrap(); - assert!((cache.utilization() - 0.2).abs() < 0.01); - } -} diff --git a/vectorless-core/vectorless/src/storage/codec.rs b/vectorless-core/vectorless/src/storage/codec.rs deleted file mode 100644 index 4c7e864e..00000000 --- a/vectorless-core/vectorless/src/storage/codec.rs +++ /dev/null @@ -1,245 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Codec abstraction for compression and decompression. -//! -//! This module provides a codec trait for compressing/decompressing data, -//! with implementations for: -//! -//! - **Identity**: No compression (pass-through) -//! - **Gzip**: Standard gzip compression -//! -//! # Example -//! -//! ```rust,ignore -//! use vectorless::storage::codec::{Codec, GzipCodec}; -//! -//! let codec = GzipCodec::new(6); -//! -//! let data = b"some data to compress"; -//! 
let compressed = codec.encode(data)?;
-//! let decompressed = codec.decode(&compressed)?;
-//!
-//! assert_eq!(data.as_slice(), decompressed.as_slice());
-//! ```
-
-use std::fmt::Debug;
-use std::io::{Read, Write};
-
-use flate2::Compression;
-use flate2::read::GzDecoder;
-use flate2::write::GzEncoder;
-
-use crate::Error;
-use crate::error::Result;
-
-/// Codec trait for compression/decompression.
-pub trait Codec: Debug + Send + Sync {
-    /// Encode (compress) data.
-    fn encode(&self, data: &[u8]) -> Result<Vec<u8>>;
-
-    /// Decode (decompress) data.
-    fn decode(&self, data: &[u8]) -> Result<Vec<u8>>;
-
-    /// Get the codec name.
-    fn name(&self) -> &'static str;
-}
-
-/// Identity codec (no compression).
-///
-/// Passes data through unchanged.
-#[derive(Debug, Clone, Copy, Default)]
-pub struct IdentityCodec;
-
-impl IdentityCodec {
-    /// Create a new identity codec.
-    pub fn new() -> Self {
-        Self::default()
-    }
-}
-
-impl Codec for IdentityCodec {
-    fn encode(&self, data: &[u8]) -> Result<Vec<u8>> {
-        Ok(data.to_vec())
-    }
-
-    fn decode(&self, data: &[u8]) -> Result<Vec<u8>> {
-        Ok(data.to_vec())
-    }
-
-    fn name(&self) -> &'static str {
-        "identity"
-    }
-}
-
-/// Gzip codec.
-///
-/// Uses the `flate2` crate for gzip compression.
-#[derive(Debug, Clone)]
-pub struct GzipCodec {
-    /// Compression level (0-9).
-    level: u32,
-}
-
-impl GzipCodec {
-    /// Create a new gzip codec with the given compression level.
-    ///
-    /// Level is clamped to 0-9:
-    /// - 0: No compression
-    /// - 1: Fastest compression
-    /// - 6: Default (good balance)
-    /// - 9: Best compression (slowest)
-    pub fn new(level: u32) -> Self {
-        Self {
-            level: level.clamp(0, 9),
-        }
-    }
-
-    /// Create a codec with fast compression (level 1).
-    pub fn fast() -> Self {
-        Self::new(1)
-    }
-
-    /// Create a codec with default compression (level 6).
-    pub fn default_level() -> Self {
-        Self::new(6)
-    }
-
-    /// Create a codec with best compression (level 9).
-    pub fn best() -> Self {
-        Self::new(9)
-    }
-}
-
-impl Default for GzipCodec {
-    fn default() -> Self {
-        Self::default_level()
-    }
-}
-
-impl Codec for GzipCodec {
-    fn encode(&self, data: &[u8]) -> Result<Vec<u8>> {
-        let mut encoder = GzEncoder::new(Vec::new(), Compression::new(self.level));
-        encoder
-            .write_all(data)
-            .map_err(|e| Error::Parse(format!("Gzip encode error: {}", e)))?;
-        encoder
-            .finish()
-            .map_err(|e| Error::Parse(format!("Gzip finish error: {}", e)))
-    }
-
-    fn decode(&self, data: &[u8]) -> Result<Vec<u8>> {
-        let mut decoder = GzDecoder::new(data);
-        let mut decoded = Vec::new();
-        decoder
-            .read_to_end(&mut decoded)
-            .map_err(|e| Error::Parse(format!("Gzip decode error: {}", e)))?;
-        Ok(decoded)
-    }
-
-    fn name(&self) -> &'static str {
-        "gzip"
-    }
-}
-
-/// Create a codec from configuration.
-pub fn codec_from_config(
-    enabled: bool,
-    algorithm: crate::config::CompressionAlgorithm,
-    level: u32,
-) -> Box<dyn Codec> {
-    if !enabled {
-        return Box::new(IdentityCodec::new());
-    }
-
-    match algorithm {
-        crate::config::CompressionAlgorithm::Gzip => Box::new(GzipCodec::new(level)),
-        crate::config::CompressionAlgorithm::Zstd => {
-            // Zstd not implemented yet, fallback to gzip
-            // TODO: Add zstd support when needed
-            Box::new(GzipCodec::new(level))
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_identity_codec() {
-        let codec = IdentityCodec::new();
-        let data = b"test data";
-
-        let encoded = codec.encode(data).unwrap();
-        let decoded = codec.decode(&encoded).unwrap();
-
-        assert_eq!(data.as_slice(), decoded.as_slice());
-        assert_eq!(codec.name(), "identity");
-    }
-
-    #[test]
-    fn test_gzip_codec_basic() {
-        let codec = GzipCodec::default();
-        let data = b"Hello, World! This is a test string for compression.";
-
-        let encoded = codec.encode(data).unwrap();
-        let decoded = codec.decode(&encoded).unwrap();
-
-        assert_eq!(data.as_slice(), decoded.as_slice());
-        assert_eq!(codec.name(), "gzip");
-
-        // Compressed should be smaller for repetitive data
-        // Note: For very small data, gzip overhead might make it larger
-        let repetitive = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA";
-        let compressed = codec.encode(repetitive).unwrap();
-        assert!(compressed.len() < repetitive.len());
-    }
-
-    #[test]
-    fn test_gzip_codec_levels() {
-        let data = b"This is test data that should compress well. ".repeat(100);
-        let data = data.into_iter().map(|b| b as u8).collect::<Vec<u8>>();
-
-        let codec_fast = GzipCodec::fast();
-        let codec_best = GzipCodec::best();
-
-        let compressed_fast = codec_fast.encode(&data).unwrap();
-        let compressed_best = codec_best.encode(&data).unwrap();
-
-        // Both should decompress to the same data
-        assert_eq!(codec_fast.decode(&compressed_fast).unwrap(), data);
-        assert_eq!(codec_best.decode(&compressed_best).unwrap(), data);
-
-        // Best compression should be smaller or equal
-        assert!(compressed_best.len() <= compressed_fast.len());
-    }
-
-    #[test]
-    fn test_gzip_empty_data() {
-        let codec = GzipCodec::default();
-        let data = b"";
-
-        let encoded = codec.encode(data).unwrap();
-        let decoded = codec.decode(&encoded).unwrap();
-
-        assert!(decoded.is_empty());
-    }
-
-    #[test]
-    fn test_codec_from_config() {
-        use crate::config::CompressionAlgorithm;
-
-        // Disabled compression
-        let codec = codec_from_config(false, CompressionAlgorithm::Gzip, 6);
-        let data = b"test";
-        let encoded = codec.encode(data).unwrap();
-        assert_eq!(encoded, data);
-
-        // Enabled compression
-        let codec = codec_from_config(true, CompressionAlgorithm::Gzip, 6);
-        let encoded = codec.encode(data).unwrap();
-        let decoded = codec.decode(&encoded).unwrap();
-        assert_eq!(decoded, data);
-    }
-}
diff --git a/vectorless-core/vectorless/src/storage/lock.rs b/vectorless-core/vectorless/src/storage/lock.rs
deleted file mode 100644
index feb484ba..00000000
--- a/vectorless-core/vectorless/src/storage/lock.rs
+++ /dev/null
@@ -1,280 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! File locking for workspace safety.
-//!
-//! Provides cross-process file locking to prevent data corruption
-//! when multiple processes access the same workspace.
-
-// File locking inherently requires unsafe FFI calls
-#![allow(unsafe_code)]
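For reference while reviewing, a sketch of how the lock defined below guarded a workspace. The lock-file name mirrors the `.workspace.lock` entry shown in the file backend's directory layout; the path is illustrative.

```rust
// Sketch: exclusive writer, shared readers (illustrative path).
let lock = FileLock::try_lock("./workspace/.workspace.lock", true)?;
// ... exclusive access to the workspace ...
lock.unlock(); // or simply let it drop

// Multiple shared (read) locks can coexist:
let _r1 = FileLock::try_lock("./workspace/.workspace.lock", false)?;
let _r2 = FileLock::try_lock("./workspace/.workspace.lock", false)?;
```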
-
-use std::fs::{File, OpenOptions};
-use std::path::Path;
-
-use crate::Error;
-use crate::error::Result;
-
-/// A file lock that is automatically released when dropped.
-///
-/// Uses `flock` on Unix and `LockFileEx` on Windows.
-#[derive(Debug)]
-pub struct FileLock {
-    /// The locked file handle.
-    file: Option<File>,
-    /// Path to the lock file (for debugging).
-    path: std::path::PathBuf,
-    /// Whether the lock is held exclusively.
-    exclusive: bool,
-}
-
-impl FileLock {
-    /// Try to acquire a file lock.
-    ///
-    /// # Arguments
-    ///
-    /// * `path` - Path to the lock file (will be created if it doesn't exist)
-    /// * `exclusive` - If true, acquires an exclusive (write) lock; otherwise a shared (read) lock
-    ///
-    /// # Errors
-    ///
-    /// Returns `Error::WorkspaceLocked` if the lock is held by another process.
-    pub fn try_lock(path: impl Into<std::path::PathBuf>, exclusive: bool) -> Result<Self> {
-        let path = path.into();
-
-        // Ensure parent directory exists
-        if let Some(parent) = path.parent() {
-            std::fs::create_dir_all(parent).map_err(Error::Io)?;
-        }
-
-        // Open or create the lock file
-        let file = OpenOptions::new()
-            .read(true)
-            .write(true)
-            .create(true)
-            .truncate(false)
-            .open(&path)
-            .map_err(Error::Io)?;
-
-        // Try to acquire the lock
-        #[cfg(unix)]
-        {
-            let fd = std::os::unix::io::AsRawFd::as_raw_fd(&file);
-
-            let result = if exclusive {
-                // LOCK_EX | LOCK_NB
-                unsafe { libc::flock(fd, 0x02 | 0x04) }
-            } else {
-                // LOCK_SH | LOCK_NB
-                unsafe { libc::flock(fd, 0x01 | 0x04) }
-            };
-
-            if result != 0 {
-                return Err(Error::WorkspaceLocked);
-            }
-
-            Ok(Self {
-                file: Some(file),
-                path,
-                exclusive,
-            })
-        }
-
-        #[cfg(windows)]
-        {
-            use std::os::windows::fs::OpenOptionsExt;
-            use windows_sys::Win32::Storage::FileSystem::{
-                LOCKFILE_EXCLUSIVE_LOCK, LOCKFILE_FAIL_IMMEDIATELY, LockFileEx,
-            };
-
-            let handle = std::os::windows::io::AsRawHandle::as_raw_handle(&file);
-
-            let mut overlapped = std::mem::MaybeUninit::zeroed();
-            let result = unsafe {
-                LockFileEx(
-                    handle,
-                    if exclusive {
-                        LOCKFILE_EXCLUSIVE_LOCK
-                    } else {
-                        0
-                    } | LOCKFILE_FAIL_IMMEDIATELY,
-                    0,
-                    0xFFFFFFFF,
-                    0xFFFFFFFF,
-                    overlapped.as_mut_ptr(),
-                )
-            };
-
-            if result == 0 {
-                return Err(Error::WorkspaceLocked);
-            }
-
-            Ok(Self {
-                file: Some(file),
-                path,
-                exclusive,
-            })
-        }
-
-        #[cfg(not(any(unix, windows)))]
-        {
-            // Fallback: No file locking available
-            // Just keep the file open, which provides some protection
-            Ok(Self {
-                file: Some(file),
-                path,
-                exclusive,
-            })
-        }
-    }
-
-    /// Try to acquire a lock without blocking.
-    ///
-    /// Returns `Ok(FileLock)` if the lock was acquired, or `Ok(None)` if it would block.
-    pub fn try_lock_no_wait(
-        path: impl Into<std::path::PathBuf>,
-        exclusive: bool,
-    ) -> Result<Option<Self>> {
-        match Self::try_lock(&path.into(), exclusive) {
-            Ok(lock) => Ok(Some(lock)),
-            Err(Error::WorkspaceLocked) => Ok(None),
-            Err(e) => Err(e),
-        }
-    }
-
-    /// Check if the lock file is locked by another process.
-    ///
-    /// This is useful for checking without acquiring a lock.
-    pub fn is_locked(path: impl Into<std::path::PathBuf>) -> bool {
-        Self::try_lock(&path.into(), false).is_err()
-    }
-
-    /// Release the lock.
-    pub fn unlock(mut self) {
-        if let Some(file) = self.file.take() {
-            // File will be unlocked when dropped
-            drop(file);
-        }
-    }
-
-    /// Get the lock file path.
-    pub fn path(&self) -> &Path {
-        &self.path
-    }
-
-    /// Check if this is an exclusive lock.
-    pub fn is_exclusive(&self) -> bool {
-        self.exclusive
-    }
-}
-
-impl Drop for FileLock {
-    fn drop(&mut self) {
-        if let Some(file) = self.file.take() {
-            // File descriptor closed, lock automatically released
-            drop(file);
-        }
-    }
-}
-
-/// A scoped lock guard that releases the lock when dropped.
-///
-/// This is useful for ensuring the lock is released even on panic.
-pub struct ScopedLock {
-    lock: Option<FileLock>,
-}
-
-impl ScopedLock {
-    /// Acquire a scoped lock.
-    pub fn new(path: impl Into<std::path::PathBuf>, exclusive: bool) -> Result<Self> {
-        let lock = FileLock::try_lock(path, exclusive)?;
-        Ok(Self { lock: Some(lock) })
-    }
-
-    /// Release the lock early.
-    pub fn release(mut self) {
-        if let Some(lock) = self.lock.take() {
-            lock.unlock();
-        }
-    }
-}
-
-impl Drop for ScopedLock {
-    fn drop(&mut self) {
-        // Lock automatically released when FileLock is dropped
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    use tempfile::TempDir;
-
-    #[test]
-    fn test_file_lock_acquire_release() {
-        let temp = TempDir::new().unwrap();
-        let lock_path = temp.path().join("test.lock");
-
-        let lock = FileLock::try_lock(&lock_path, true).unwrap();
-        assert!(lock.is_exclusive());
-
-        // Should be able to unlock
-        lock.unlock();
-    }
-
-    #[test]
-    fn test_file_lock_conflict() {
-        let temp = TempDir::new().unwrap();
-        let lock_path = temp.path().join("conflict.lock");
-
-        // Acquire exclusive lock
-        let _lock1 = FileLock::try_lock(&lock_path, true).unwrap();
-
-        // Try to acquire another exclusive lock - should fail
-        let result = FileLock::try_lock(&lock_path, true);
-        assert!(matches!(result, Err(Error::WorkspaceLocked)));
-    }
-
-    #[test]
-    fn test_file_lock_shared() {
-        let temp = TempDir::new().unwrap();
-        let lock_path = temp.path().join("shared.lock");
-
-        // Acquire shared lock
-        let lock1 = FileLock::try_lock(&lock_path, false).unwrap();
-        assert!(!lock1.is_exclusive());
-
-        // Should be able to acquire another shared lock
-        let lock2 = FileLock::try_lock(&lock_path, false).unwrap();
-        assert!(!lock2.is_exclusive());
-
-        // But exclusive lock should fail
-        let result = FileLock::try_lock(&lock_path, true);
-        assert!(matches!(result, Err(Error::WorkspaceLocked)));
-
-        lock1.unlock();
-        lock2.unlock();
-    }
-
-    #[test]
-    fn test_scoped_lock() {
-        let temp = TempDir::new().unwrap();
-        let lock_path = temp.path().join("scoped.lock");
-
-        {
-            let _scoped = ScopedLock::new(&lock_path, true).unwrap();
-            // Lock held here
-
-            // Another lock should fail
-            let result = FileLock::try_lock(&lock_path, true);
-            assert!(matches!(result, Err(Error::WorkspaceLocked)));
-        }
-        // Lock released here
-
-        // Now should succeed
-        let _lock = FileLock::try_lock(&lock_path, true).unwrap();
-    }
-}
diff --git a/vectorless-core/vectorless/src/storage/migration.rs b/vectorless-core/vectorless/src/storage/migration.rs
deleted file mode 100644
index 1711169b..00000000
--- a/vectorless-core/vectorless/src/storage/migration.rs
+++ /dev/null
@@ -1,385 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Version migration system for persisted data.
-//!
-//! This module provides a framework for migrating data between format versions.
-//! When the data format changes, migrations can automatically upgrade older data.
-//!
-//! # Example
-//!
-//! ```rust,ignore
-//! use vectorless::storage::migration::{Migration, Migrator, MigrationContext};
-//!
-//! // Define a migration from v1 to v2
-//! struct V1ToV2;
-//!
-//! impl Migration for V1ToV2 {
-//!     fn from_version(&self) -> u32 { 1 }
-//!     fn to_version(&self) -> u32 { 2 }
-//!     fn migrate(&self, data: &[u8], ctx: &MigrationContext) -> Result<Vec<u8>> {
-//!         // Transform data from v1 to v2 format
-//!         // ...
-//!     }
-//! }
-//!
-//! // Register migrations
-//! let mut migrator = Migrator::new();
-//! migrator.register(Box::new(V1ToV2));
-//!
-//! // Migrate data
-//! let migrated = migrator.migrate(data, 1, 2)?;
-//! ```
-
-use std::collections::HashMap;
-
-use tracing::{debug, info, warn};
-
-use crate::Error;
-use crate::error::Result;
-
-/// Current data format version.
-pub const CURRENT_VERSION: u32 = 1;
-
-/// Migration context providing additional information for migrations.
-#[derive(Debug, Clone)]
-pub struct MigrationContext {
-    /// Source version.
-    pub from_version: u32,
-    /// Target version.
-    pub to_version: u32,
-    /// Additional metadata.
-    pub metadata: HashMap<String, String>,
-}
-
-impl MigrationContext {
-    /// Create a new migration context.
-    pub fn new(from_version: u32, to_version: u32) -> Self {
-        Self {
-            from_version,
-            to_version,
-            metadata: HashMap::new(),
-        }
-    }
-
-    /// Add metadata.
-    pub fn with_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
-        self.metadata.insert(key.into(), value.into());
-        self
-    }
-}
-
-/// Trait for data migrations.
-///
-/// A migration transforms data from one version to the next.
-pub trait Migration: Send + Sync {
-    /// Get the source version this migration applies to.
-    fn from_version(&self) -> u32;
-
-    /// Get the target version this migration produces.
-    fn to_version(&self) -> u32;
-
-    /// Get a human-readable description of this migration.
-    fn description(&self) -> &str;
-
-    /// Perform the migration.
-    ///
-    /// # Arguments
-    ///
-    /// * `data` - The data to migrate
-    /// * `ctx` - Migration context with additional information
-    ///
-    /// # Returns
-    ///
-    /// The migrated data in the new format.
-    fn migrate(&self, data: &[u8], ctx: &MigrationContext) -> Result<Vec<u8>>;
-
-    /// Check if this migration can be applied to the given data.
-    ///
-    /// Default implementation always returns true.
-    fn can_migrate(&self, _data: &[u8]) -> bool {
-        true
-    }
-}
-
-/// Migration registry and executor.
-pub struct Migrator {
-    /// Registered migrations, keyed by (from_version, to_version).
-    migrations: HashMap<(u32, u32), Box<dyn Migration>>,
-}
-
-impl Default for Migrator {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-
-impl std::fmt::Debug for Migrator {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("Migrator")
-            .field("migration_count", &self.migrations.len())
-            .finish()
-    }
-}
-
-impl Migrator {
-    /// Create a new migrator.
-    pub fn new() -> Self {
-        Self {
-            migrations: HashMap::new(),
-        }
-    }
-
-    /// Register a migration.
-    pub fn register(&mut self, migration: Box<dyn Migration>) {
-        let key = (migration.from_version(), migration.to_version());
-        debug!("Registering migration: v{} -> v{}", key.0, key.1);
-        self.migrations.insert(key, migration);
-    }
-
-    /// Check if a migration path exists between two versions.
-    pub fn can_migrate(&self, from_version: u32, to_version: u32) -> bool {
-        if from_version == to_version {
-            return true;
-        }
-
-        // Check if we have a direct migration
-        if self.migrations.contains_key(&(from_version, to_version)) {
-            return true;
-        }
-
-        // Check if we have a path through intermediate versions
-        self.find_migration_path(from_version, to_version).is_some()
-    }
-
-    /// Find a migration path between two versions.
-    ///
-    /// Returns a sequence of version numbers to migrate through.
-    /// Find a migration path between two versions.
-    ///
-    /// Returns a sequence of version numbers to migrate through.
-    fn find_migration_path(&self, from_version: u32, to_version: u32) -> Option<Vec<u32>> {
-        if from_version == to_version {
-            return Some(vec![from_version]);
-        }
-
-        // Simple BFS to find a path
-        use std::collections::{HashSet, VecDeque};
-
-        let mut visited: HashSet<u32> = HashSet::new();
-        let mut queue: VecDeque<u32> = VecDeque::new();
-        let mut parent: HashMap<u32, u32> = HashMap::new();
-
-        queue.push_back(from_version);
-        visited.insert(from_version);
-
-        while let Some(current) = queue.pop_front() {
-            // Find all migrations from current version
-            for ((from, to), _) in &self.migrations {
-                if *from == current && !visited.contains(to) {
-                    visited.insert(*to);
-                    parent.insert(*to, current);
-                    queue.push_back(*to);
-
-                    if *to == to_version {
-                        // Reconstruct path
-                        let mut path = vec![to_version];
-                        let mut v = to_version;
-                        while let Some(&p) = parent.get(&v) {
-                            if p == from_version {
-                                path.push(p);
-                                break;
-                            }
-                            path.push(p);
-                            v = p;
-                        }
-                        path.reverse();
-                        return Some(path);
-                    }
-                }
-            }
-        }
-
-        None
-    }
-
-    /// Migrate data from one version to another.
-    ///
-    /// If a direct migration exists, it will be used.
-    /// Otherwise, the migrator will try to find a path through intermediate versions.
-    pub fn migrate(&self, data: &[u8], from_version: u32, to_version: u32) -> Result<Vec<u8>> {
-        if from_version == to_version {
-            return Ok(data.to_vec());
-        }
-
-        // Find migration path
-        let path = self
-            .find_migration_path(from_version, to_version)
-            .ok_or_else(|| {
-                Error::VersionMismatch(format!(
-                    "No migration path from v{} to v{}",
-                    from_version, to_version
-                ))
-            })?;
-
-        if path.len() < 2 {
-            return Ok(data.to_vec());
-        }
-
-        info!(
-            "Migrating data from v{} to v{} via path: {:?}",
-            from_version, to_version, path
-        );
-
-        let mut current_data = data.to_vec();
-        let mut current_version = from_version;
-
-        for next_version in path.iter().skip(1) {
-            let key = (current_version, *next_version);
-            let migration = self.migrations.get(&key).ok_or_else(|| {
-                Error::VersionMismatch(format!(
-                    "Missing migration from v{} to v{}",
-                    current_version, next_version
-                ))
-            })?;
-
-            let ctx = MigrationContext::new(current_version, *next_version);
-
-            debug!(
-                "Applying migration: v{} -> v{} ({})",
-                current_version,
-                next_version,
-                migration.description()
-            );
-
-            current_data = migration.migrate(&current_data, &ctx)?;
-            current_version = *next_version;
-        }
-
-        Ok(current_data)
-    }
-
-    /// Get the list of registered migrations.
-    pub fn list_migrations(&self) -> Vec<(u32, u32, &str)> {
-        self.migrations
-            .values()
-            .map(|m| (m.from_version(), m.to_version(), m.description()))
-            .collect()
-    }
-}
-
-// ============================================================================
-// Built-in migrations
-// ============================================================================
-
-/// Placeholder migration for future versions.
-/// This is a template that can be copied for actual migrations.
-#[derive(Debug)]
-pub struct PlaceholderMigration {
-    from: u32,
-    to: u32,
-}
-
-impl PlaceholderMigration {
-    /// Create a new placeholder migration.
- pub fn new(from: u32, to: u32) -> Self { - Self { from, to } - } -} - -impl Migration for PlaceholderMigration { - fn from_version(&self) -> u32 { - self.from - } - - fn to_version(&self) -> u32 { - self.to - } - - fn description(&self) -> &str { - "Placeholder migration (no-op)" - } - - fn migrate(&self, data: &[u8], _ctx: &MigrationContext) -> Result> { - warn!( - "Using placeholder migration from v{} to v{} - no changes made", - self.from, self.to - ); - Ok(data.to_vec()) - } -} - -/// Create a default migrator with all built-in migrations registered. -pub fn default_migrator() -> Migrator { - Migrator::new() - // Add migrations as needed when versions change - // migrator.register(Box::new(V1ToV2::new())); -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_migration_context() { - let ctx = MigrationContext::new(1, 2).with_metadata("key", "value"); - - assert_eq!(ctx.from_version, 1); - assert_eq!(ctx.to_version, 2); - assert_eq!(ctx.metadata.get("key"), Some(&"value".to_string())); - } - - #[test] - fn test_migrator_no_migration_needed() { - let migrator = Migrator::new(); - let data = b"test data"; - - let result = migrator.migrate(data, 1, 1).unwrap(); - assert_eq!(result, data); - } - - #[test] - fn test_migrator_no_path() { - let migrator = Migrator::new(); - let data = b"test data"; - - let result = migrator.migrate(data, 1, 2); - assert!(result.is_err()); - } - - #[test] - fn test_migrator_with_placeholder() { - let mut migrator = Migrator::new(); - migrator.register(Box::new(PlaceholderMigration::new(1, 2))); - - assert!(migrator.can_migrate(1, 2)); - assert!(!migrator.can_migrate(1, 3)); - - let data = b"test data"; - let result = migrator.migrate(data, 1, 2).unwrap(); - assert_eq!(result, data); - } - - #[test] - fn test_migrator_path_finding() { - let mut migrator = Migrator::new(); - migrator.register(Box::new(PlaceholderMigration::new(1, 2))); - migrator.register(Box::new(PlaceholderMigration::new(2, 3))); - - assert!(migrator.can_migrate(1, 3)); - - let path = migrator.find_migration_path(1, 3).unwrap(); - assert_eq!(path, vec![1, 2, 3]); - - let data = b"test data"; - let result = migrator.migrate(data, 1, 3).unwrap(); - assert_eq!(result, data); - } - - #[test] - fn test_list_migrations() { - let mut migrator = Migrator::new(); - migrator.register(Box::new(PlaceholderMigration::new(1, 2))); - migrator.register(Box::new(PlaceholderMigration::new(2, 3))); - - let list = migrator.list_migrations(); - assert_eq!(list.len(), 2); - } -} diff --git a/vectorless-core/vectorless/src/storage/mod.rs b/vectorless-core/vectorless/src/storage/mod.rs deleted file mode 100644 index ca7c27f3..00000000 --- a/vectorless-core/vectorless/src/storage/mod.rs +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Storage module for persisting document indices. -//! -//! This module provides: -//! - **Workspace** — An async directory-based document collection manager with LRU cache -//! - **Persistence** — Save/load document trees and metadata with atomic writes -//! - **Cache** — LRU cache for loaded documents -//! - **Lock** — File locking for multi-process safety -//! - **Backend** — Storage backend abstraction (file, memory, etc.) -//! -//! # Example -//! -//! ```rust,no_run -//! use vectorless::storage::{Workspace, PersistedDocument, DocumentMeta}; -//! use vectorless::document::DocumentTree; -//! -//! # #[tokio::main] -//! # async fn main() -> vectorless::error::Result<()> { -//! 
// Create a workspace
-//! let workspace = Workspace::new("./my_workspace").await?;
-//!
-//! // Add a document
-//! let meta = DocumentMeta::new("doc-1", "My Document", "md");
-//! let tree = DocumentTree::new("Root", "Content");
-//! let doc = PersistedDocument::new(meta, tree);
-//! workspace.add(&doc).await?;
-//!
-//! // Load it back (uses LRU cache)
-//! let loaded = workspace.load_and_cache("doc-1").await?.unwrap();
-//! # Ok(())
-//! # }
-//! ```
-
-pub mod backend;
-pub mod cache;
-pub mod codec;
-pub mod lock;
-pub mod migration;
-mod persistence;
-pub mod workspace;
-
-// Re-export main types
-pub use persistence::{DocumentMeta, PageContent, PersistedDocument};
-pub use workspace::Workspace;
diff --git a/vectorless-core/vectorless/src/storage/persistence.rs b/vectorless-core/vectorless/src/storage/persistence.rs
deleted file mode 100644
index 38700925..00000000
--- a/vectorless-core/vectorless/src/storage/persistence.rs
+++ /dev/null
@@ -1,877 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Persistence utilities for saving and loading document indices.
-//!
-//! # Features
-//!
-//! - **Atomic writes**: Write to temp file, then rename for crash safety
-//! - **Checksum verification**: SHA-256 checksums for data integrity
-//! - **Version header**: Format version for future migrations
-
-use serde::{Deserialize, Serialize};
-use sha2::{Digest, Sha256};
-use std::fs::File;
-use std::io::{BufReader, BufWriter, Write};
-use std::path::{Path, PathBuf};
-
-use crate::Error;
-use crate::document::{DocumentTree, NavigationIndex, ReasoningIndex};
-use crate::error::Result;
-
-/// Current format version for persisted documents.
-const FORMAT_VERSION: u32 = 1;
-
-/// Current schema version for `PersistedDocument`.
-///
-/// Increment this when the document structure changes in a
-/// backward-incompatible way (e.g. field renames, new required fields).
-/// Old documents will be detected and logged as stale on load.
-const SCHEMA_VERSION: u32 = 1;
-
-/// Metadata for a persisted document.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DocumentMeta {
-    /// Unique document identifier.
-    pub id: String,
-
-    /// Document name/title.
-    pub name: String,
-
-    /// Document format (md, pdf, etc.).
-    pub format: String,
-
-    /// Source file path.
-    pub source_path: Option<PathBuf>,
-
-    /// Document description.
-    pub description: Option<String>,
-
-    /// Page count (for PDFs).
-    pub page_count: Option<usize>,
-
-    /// Line count (for text files).
-    pub line_count: Option<usize>,
-
-    /// Creation timestamp.
-    pub created_at: chrono::DateTime<chrono::Utc>,
-
-    /// Last modified timestamp.
-    pub modified_at: chrono::DateTime<chrono::Utc>,
-
-    // === Processing State (for incremental updates) ===
-    /// Content fingerprint for change detection.
-    #[serde(
-        default,
-        skip_serializing_if = "crate::utils::fingerprint::Fingerprint::is_zero"
-    )]
-    pub content_fingerprint: crate::utils::fingerprint::Fingerprint,
-
-    /// Logic fingerprint (hash of pipeline configuration used to produce this document).
-    /// If the pipeline config changes, a full reprocess is needed even if content didn't change.
-    #[serde(
-        default,
-        skip_serializing_if = "crate::utils::fingerprint::Fingerprint::is_zero"
-    )]
-    pub logic_fingerprint: crate::utils::fingerprint::Fingerprint,
-
-    /// Processing version (incremented when algorithm changes).
-    #[serde(default)]
-    pub processing_version: u32,
-
-    /// Node count in the tree.
-    #[serde(default)]
-    pub node_count: usize,
-
-    /// Total tokens in summaries.
-    #[serde(default)]
-    pub total_summary_tokens: usize,
-
-    /// LLM model used for processing.
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub processing_model: Option<String>,
-
-    /// Last processing duration in milliseconds.
-    #[serde(default)]
-    pub processing_duration_ms: u64,
-}
-
-impl DocumentMeta {
-    /// Create new document metadata.
-    pub fn new(id: impl Into<String>, name: impl Into<String>, format: impl Into<String>) -> Self {
-        let now = chrono::Utc::now();
-        Self {
-            id: id.into(),
-            name: name.into(),
-            format: format.into(),
-            source_path: None,
-            description: None,
-            page_count: None,
-            line_count: None,
-            created_at: now,
-            modified_at: now,
-            content_fingerprint: crate::utils::fingerprint::Fingerprint::zero(),
-            logic_fingerprint: crate::utils::fingerprint::Fingerprint::zero(),
-            processing_version: 0,
-            node_count: 0,
-            total_summary_tokens: 0,
-            processing_model: None,
-            processing_duration_ms: 0,
-        }
-    }
-
-    /// Set the source path.
-    pub fn with_source_path(mut self, path: impl Into<PathBuf>) -> Self {
-        self.source_path = Some(path.into());
-        self
-    }
-
-    /// Set the description.
-    pub fn with_description(mut self, desc: impl Into<String>) -> Self {
-        self.description = Some(desc.into());
-        self
-    }
-
-    /// Set the content fingerprint.
-    pub fn with_fingerprint(mut self, fp: crate::utils::fingerprint::Fingerprint) -> Self {
-        self.content_fingerprint = fp;
-        self
-    }
-
-    /// Set the logic fingerprint.
-    pub fn with_logic_fingerprint(mut self, fp: crate::utils::fingerprint::Fingerprint) -> Self {
-        self.logic_fingerprint = fp;
-        self
-    }
-
-    /// Set the processing version.
-    pub fn with_processing_version(mut self, version: u32) -> Self {
-        self.processing_version = version;
-        self
-    }
-
-    /// Set the processing model.
-    pub fn with_processing_model(mut self, model: impl Into<String>) -> Self {
-        self.processing_model = Some(model.into());
-        self
-    }
-
-    /// Update processing statistics.
-    pub fn update_processing_stats(
-        &mut self,
-        node_count: usize,
-        summary_tokens: usize,
-        duration_ms: u64,
-    ) {
-        self.node_count = node_count;
-        self.total_summary_tokens = summary_tokens;
-        self.processing_duration_ms = duration_ms;
-        self.modified_at = chrono::Utc::now();
-    }
-
-    /// Mark as processed with given fingerprint and version.
-    pub fn mark_processed(
-        &mut self,
-        fp: crate::utils::fingerprint::Fingerprint,
-        version: u32,
-        model: Option<&str>,
-    ) {
-        self.content_fingerprint = fp;
-        self.processing_version = version;
-        self.processing_model = model.map(|s| s.to_string());
-        self.modified_at = chrono::Utc::now();
-    }
-
-    /// Check if the document needs reprocessing.
-    pub fn needs_reprocessing(
-        &self,
-        current_fp: &crate::utils::fingerprint::Fingerprint,
-        current_version: u32,
-    ) -> bool {
-        // Never processed
-        if self.processing_version == 0 {
-            return true;
-        }
-
-        // Algorithm version changed
-        if self.processing_version < current_version {
-            return true;
-        }
-
-        // Content changed
-        if &self.content_fingerprint != current_fp {
-            return true;
-        }
-
-        false
-    }
-}
-
-/// A persisted document index containing tree and metadata.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PersistedDocument {
-    /// Schema version — incremented on backward-incompatible changes.
-    /// Old documents default to `0` via serde when the field is absent.
-    #[serde(default)]
-    pub schema_version: u32,
-
-    /// Document metadata.
-    pub meta: DocumentMeta,
-
-    /// The document tree structure.
-    pub tree: DocumentTree,
-
-    /// Per-page content (for PDFs).
-    #[serde(default)]
-    pub pages: Vec<PageContent>,
-
-    /// Pre-computed reasoning index for retrieval acceleration.
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub reasoning_index: Option<ReasoningIndex>,
-
-    /// Navigation index for Agent-based retrieval.
-    #[serde(default, skip_serializing_if = "Option::is_none")]
-    pub navigation_index: Option<NavigationIndex>,
-
-    /// Key concepts extracted from the document.
-    #[serde(default, skip_serializing_if = "Vec::is_empty")]
-    pub concepts: Vec<crate::document::Concept>,
-}
-
-impl PersistedDocument {
-    /// Create a new persisted document.
-    pub fn new(meta: DocumentMeta, tree: DocumentTree) -> Self {
-        Self {
-            schema_version: SCHEMA_VERSION,
-            meta,
-            tree,
-            pages: Vec::new(),
-            reasoning_index: None,
-            navigation_index: None,
-            concepts: Vec::new(),
-        }
-    }
-
-    /// Add page content.
-    pub fn add_page(&mut self, page: usize, content: impl Into<String>) {
-        self.pages.push(PageContent {
-            page,
-            content: content.into(),
-        });
-    }
-}
-
-/// Content for a single page.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct PageContent {
-    /// Page number (1-based).
-    pub page: usize,
-
-    /// Page text content.
-    pub content: String,
-}
-
-/// Wrapper for persisted data with checksum.
-#[derive(Debug, Serialize, Deserialize)]
-struct PersistedWrapper {
-    /// Format version.
-    version: u32,
-    /// SHA-256 checksum of the payload.
-    checksum: String,
-    /// The actual data as raw JSON value (avoids re-serialization drift).
-    payload: serde_json::Value,
-}
-
-/// Options for save/load operations.
-#[derive(Debug, Clone)]
-pub struct PersistenceOptions {
-    /// Use atomic writes (temp file + rename).
-    pub atomic_writes: bool,
-    /// Verify checksums on load.
-    pub verify_checksum: bool,
-}
-
-impl Default for PersistenceOptions {
-    fn default() -> Self {
-        Self {
-            atomic_writes: true,
-            verify_checksum: true,
-        }
-    }
-}
-
-impl PersistenceOptions {
-    /// Create new options with defaults.
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Set atomic writes option.
-    pub fn with_atomic_writes(mut self, enabled: bool) -> Self {
-        self.atomic_writes = enabled;
-        self
-    }
-
-    /// Set checksum verification option.
-    pub fn with_verify_checksum(mut self, enabled: bool) -> Self {
-        self.verify_checksum = enabled;
-        self
-    }
-}
-
-/// Calculate SHA-256 checksum of data.
-fn calculate_checksum(data: &[u8]) -> String {
-    let mut hasher = Sha256::new();
-    hasher.update(data);
-    format!("{:x}", hasher.finalize())
-}
-
-/// Save a document to a JSON file with atomic write and checksum.
-///
-/// # Atomic Write
-///
-/// When `atomic_writes` is enabled (default), this function:
-/// 1. Writes to a temporary file (`.tmp` suffix)
-/// 2. Renames temp file to target (atomic on most filesystems)
-///
-/// This prevents data corruption if the process crashes during write.
-///
-/// # Errors
-///
-/// Returns an error if:
-/// - Serialization fails
-/// - Cannot create temp file
-/// - Write fails
-/// - Rename fails
-pub fn save_document(path: &Path, doc: &PersistedDocument) -> Result<()> {
-    save_document_with_options(path, doc, &PersistenceOptions::default())
-}
-
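Taken together, the options and helpers above give an atomic, checksum-verified round trip. A minimal sketch of the intended call pattern (all names are from this file; it assumes a writable `ws/` directory and a `?`-compatible error context):

```rust
use std::path::Path;

let meta = DocumentMeta::new("doc-1", "My Document", "md")
    .with_description("Example document");
let tree = DocumentTree::new("Root", "Content");
let doc = PersistedDocument::new(meta, tree);

// Atomic write: temp file + rename, wrapped with a SHA-256 checksum.
save_document(Path::new("ws/doc-1.json"), &doc)?;

// Load re-checks FORMAT_VERSION and the checksum before deserializing.
let loaded = load_document(Path::new("ws/doc-1.json"))?;
assert_eq!(loaded.meta.id, "doc-1");

// Verification can be disabled, e.g. for trusted local reads.
let opts = PersistenceOptions::new().with_verify_checksum(false);
let _fast = load_document_with_options(Path::new("ws/doc-1.json"), &opts)?;
```

-/// Save a document with custom options.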
-pub fn save_document_with_options( - path: &Path, - doc: &PersistedDocument, - options: &PersistenceOptions, -) -> Result<()> { - // Serialize to serde_json::Value first (avoids HashMap key ordering drift) - let payload_value = - serde_json::to_value(doc).map_err(|e| Error::Serialization(e.to_string()))?; - - // Calculate checksum on the Value's canonical bytes - let payload_bytes = - serde_json::to_vec(&payload_value).map_err(|e| Error::Serialization(e.to_string()))?; - let checksum = calculate_checksum(&payload_bytes); - - // Create wrapper - let wrapper = PersistedWrapper { - version: FORMAT_VERSION, - checksum, - payload: payload_value, - }; - - // Serialize wrapper - let json = - serde_json::to_string_pretty(&wrapper).map_err(|e| Error::Serialization(e.to_string()))?; - - if options.atomic_writes { - // Atomic write: write to temp file, then rename - let temp_path = path.with_extension("tmp"); - - // Ensure parent directory exists - if let Some(parent) = path.parent() { - std::fs::create_dir_all(parent).map_err(Error::Io)?; - } - - // Write to temp file - { - let file = File::create(&temp_path).map_err(Error::Io)?; - let mut writer = BufWriter::new(file); - writer.write_all(json.as_bytes()).map_err(Error::Io)?; - writer.flush().map_err(Error::Io)?; - } - - // Atomic rename - std::fs::rename(&temp_path, path).map_err(Error::Io)?; - } else { - // Direct write (not atomic) - std::fs::write(path, json).map_err(Error::Io)?; - } - - Ok(()) -} - -/// Load a document from a JSON file with checksum verification. -/// -/// # Checksum Verification -/// -/// When `verify_checksum` is enabled (default), this function: -/// 1. Reads the file -/// 2. Parses the wrapper -/// 3. Re-serializes the payload -/// 4. Verifies the checksum matches -/// -/// # Errors -/// -/// Returns an error if: -/// - File doesn't exist -/// - Parse fails -/// - Checksum mismatch -/// - Version mismatch (future: migration) -pub fn load_document(path: &Path) -> Result { - load_document_with_options(path, &PersistenceOptions::default()) -} - -/// Load a document with custom options. 
-pub fn load_document_with_options( - path: &Path, - options: &PersistenceOptions, -) -> Result { - if !path.exists() { - return Err(Error::DocumentNotFound(path.display().to_string())); - } - - let file = File::open(path).map_err(Error::Io)?; - let reader = BufReader::new(file); - - // Parse wrapper (payload is serde_json::Value) - let wrapper: PersistedWrapper = serde_json::from_reader(reader) - .map_err(|e| Error::Parse(format!("Failed to parse document: {}", e)))?; - - // Check version - if wrapper.version != FORMAT_VERSION { - return Err(Error::Parse(format!( - "Unsupported format version: {} (expected {})", - wrapper.version, FORMAT_VERSION - ))); - } - - // Verify checksum if enabled - if options.verify_checksum { - let payload_bytes = serde_json::to_vec(&wrapper.payload) - .map_err(|e| Error::Serialization(e.to_string()))?; - - let expected_checksum = calculate_checksum(&payload_bytes); - - if wrapper.checksum != expected_checksum { - return Err(Error::Parse(format!( - "Checksum mismatch: expected {}, got {}", - expected_checksum, wrapper.checksum - ))); - } - } - - // Deserialize Value to target type - let doc: PersistedDocument = serde_json::from_value(wrapper.payload) - .map_err(|e| Error::Parse(format!("Failed to deserialize document: {}", e)))?; - - // Check schema version — warn on stale documents, fail on future versions - if doc.schema_version == 0 { - tracing::warn!( - doc_id = %doc.meta.id, - "Document was created before schema versioning — consider re-indexing" - ); - } else if doc.schema_version > SCHEMA_VERSION { - return Err(Error::Parse(format!( - "Document schema version {} is newer than supported {} — please upgrade vectorless", - doc.schema_version, SCHEMA_VERSION - ))); - } - - Ok(doc) -} - -/// Save the workspace index (metadata for all documents). -pub fn save_index(path: &Path, entries: &[DocumentMeta]) -> Result<()> { - save_index_with_options(path, entries, &PersistenceOptions::default()) -} - -/// Save the workspace index with custom options. -pub fn save_index_with_options( - path: &Path, - entries: &[DocumentMeta], - options: &PersistenceOptions, -) -> Result<()> { - // Serialize to serde_json::Value first - let payload_value = - serde_json::to_value(entries).map_err(|e| Error::Serialization(e.to_string()))?; - - let payload_bytes = - serde_json::to_vec(&payload_value).map_err(|e| Error::Serialization(e.to_string()))?; - - let checksum = calculate_checksum(&payload_bytes); - - let wrapper = PersistedWrapper { - version: FORMAT_VERSION, - checksum, - payload: payload_value, - }; - - let json = - serde_json::to_string_pretty(&wrapper).map_err(|e| Error::Serialization(e.to_string()))?; - - if options.atomic_writes { - let temp_path = path.with_extension("tmp"); - - // Ensure parent directory exists - if let Some(parent) = path.parent() { - std::fs::create_dir_all(parent).map_err(Error::Io)?; - } - - // Write to temp file - { - let file = File::create(&temp_path).map_err(Error::Io)?; - let mut writer = BufWriter::new(file); - writer.write_all(json.as_bytes()).map_err(Error::Io)?; - writer.flush().map_err(Error::Io)?; - } - - // Atomic rename - std::fs::rename(&temp_path, path).map_err(Error::Io)?; - } else { - std::fs::write(path, json).map_err(Error::Io)?; - } - - Ok(()) -} - -/// Load the workspace index. -pub fn load_index(path: &Path) -> Result> { - load_index_with_options(path, &PersistenceOptions::default()) -} - -/// Load the workspace index with custom options. 
-pub fn load_index_with_options( - path: &Path, - options: &PersistenceOptions, -) -> Result> { - if !path.exists() { - return Ok(Vec::new()); - } - - let file = File::open(path).map_err(Error::Io)?; - let reader = BufReader::new(file); - - let wrapper: PersistedWrapper = serde_json::from_reader(reader) - .map_err(|e| Error::Parse(format!("Failed to parse index: {}", e)))?; - - // Check version - if wrapper.version != FORMAT_VERSION { - return Err(Error::Parse(format!( - "Unsupported format version: {} (expected {})", - wrapper.version, FORMAT_VERSION - ))); - } - - // Verify checksum if enabled - if options.verify_checksum { - let payload_bytes = serde_json::to_vec(&wrapper.payload) - .map_err(|e| Error::Serialization(e.to_string()))?; - - let expected_checksum = calculate_checksum(&payload_bytes); - - if wrapper.checksum != expected_checksum { - return Err(Error::Parse(format!( - "Checksum mismatch: expected {}, got {}", - expected_checksum, wrapper.checksum - ))); - } - } - - // Deserialize Value to target type - let entries: Vec = serde_json::from_value(wrapper.payload) - .map_err(|e| Error::Parse(format!("Failed to deserialize index: {}", e)))?; - - Ok(entries) -} - -// ============================================================================ -// Bytes-based serialization (for StorageBackend integration) -// ============================================================================ - -/// Serialize a document to bytes (JSON with checksum wrapper). -/// -/// This is useful for storage backends that work with byte arrays. -pub fn save_document_to_bytes(doc: &PersistedDocument) -> Result> { - // Serialize to serde_json::Value first - let payload_value = - serde_json::to_value(doc).map_err(|e| Error::Serialization(e.to_string()))?; - - // Calculate checksum on the Value's canonical bytes - let payload_bytes = - serde_json::to_vec(&payload_value).map_err(|e| Error::Serialization(e.to_string()))?; - let checksum = calculate_checksum(&payload_bytes); - - // Create wrapper - let wrapper = PersistedWrapper { - version: FORMAT_VERSION, - checksum, - payload: payload_value, - }; - - // Serialize wrapper - serde_json::to_vec(&wrapper).map_err(|e| Error::Serialization(e.to_string())) -} - -/// Deserialize a document from bytes. -/// -/// Verifies checksum by default. -pub fn load_document_from_bytes(data: &[u8]) -> Result { - load_document_from_bytes_with_options(data, true) -} - -/// Deserialize a document from bytes with optional checksum verification. 
-pub fn load_document_from_bytes_with_options( - data: &[u8], - verify_checksum: bool, -) -> Result { - // Parse wrapper (payload is serde_json::Value) - let wrapper: PersistedWrapper = serde_json::from_slice(data) - .map_err(|e| Error::Parse(format!("Failed to parse document: {}", e)))?; - - // Check version - if wrapper.version != FORMAT_VERSION { - return Err(Error::VersionMismatch(format!( - "Expected version {}, got {}", - FORMAT_VERSION, wrapper.version - ))); - } - - // Verify checksum if enabled - if verify_checksum { - let payload_bytes = serde_json::to_vec(&wrapper.payload) - .map_err(|e| Error::Serialization(e.to_string()))?; - - let expected_checksum = calculate_checksum(&payload_bytes); - - if wrapper.checksum != expected_checksum { - return Err(Error::ChecksumMismatch(format!( - "Expected {}, got {}", - expected_checksum, wrapper.checksum - ))); - } - } - - // Deserialize Value to target type - let doc: PersistedDocument = serde_json::from_value(wrapper.payload) - .map_err(|e| Error::Parse(format!("Failed to deserialize document: {}", e)))?; - - // Check schema version - if doc.schema_version == 0 { - tracing::warn!( - doc_id = %doc.meta.id, - "Document was created before schema versioning — consider re-indexing" - ); - } else if doc.schema_version > SCHEMA_VERSION { - return Err(Error::Parse(format!( - "Document schema version {} is newer than supported {} — please upgrade vectorless", - doc.schema_version, SCHEMA_VERSION - ))); - } - - Ok(doc) -} - -/// Serialize an index to bytes. -pub fn save_index_to_bytes(entries: &[DocumentMeta]) -> Result> { - let payload_value = - serde_json::to_value(entries).map_err(|e| Error::Serialization(e.to_string()))?; - - let payload_bytes = - serde_json::to_vec(&payload_value).map_err(|e| Error::Serialization(e.to_string()))?; - let checksum = calculate_checksum(&payload_bytes); - - let wrapper = PersistedWrapper { - version: FORMAT_VERSION, - checksum, - payload: payload_value, - }; - - serde_json::to_vec(&wrapper).map_err(|e| Error::Serialization(e.to_string())) -} - -/// Deserialize an index from bytes. -pub fn load_index_from_bytes(data: &[u8]) -> Result> { - load_index_from_bytes_with_options(data, true) -} - -/// Deserialize an index from bytes with optional checksum verification. 
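For backends that store raw bytes rather than files, the same version-plus-checksum wrapper applies. A small sketch of the bytes-based round trip (names from this file; error handling elided):

```rust
let meta = DocumentMeta::new("doc-1", "My Document", "md");
let doc = PersistedDocument::new(meta, DocumentTree::new("Root", "Content"));

// Same wrapper as the file-based API, but produced in memory.
let bytes = save_document_to_bytes(&doc)?;
let restored = load_document_from_bytes(&bytes)?; // verifies checksum by default
assert_eq!(restored.meta.id, "doc-1");

// Skip verification when the caller has already validated integrity.
let _restored = load_document_from_bytes_with_options(&bytes, false)?;
```
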
-pub fn load_index_from_bytes_with_options( - data: &[u8], - verify_checksum: bool, -) -> Result> { - let wrapper: PersistedWrapper = serde_json::from_slice(data) - .map_err(|e| Error::Parse(format!("Failed to parse index: {}", e)))?; - - // Check version - if wrapper.version != FORMAT_VERSION { - return Err(Error::VersionMismatch(format!( - "Expected version {}, got {}", - FORMAT_VERSION, wrapper.version - ))); - } - - // Verify checksum if enabled - if verify_checksum { - let payload_bytes = serde_json::to_vec(&wrapper.payload) - .map_err(|e| Error::Serialization(e.to_string()))?; - - let expected_checksum = calculate_checksum(&payload_bytes); - - if wrapper.checksum != expected_checksum { - return Err(Error::ChecksumMismatch(format!( - "Expected {}, got {}", - expected_checksum, wrapper.checksum - ))); - } - } - - // Deserialize Value to target type - let entries: Vec = serde_json::from_value(wrapper.payload) - .map_err(|e| Error::Parse(format!("Failed to deserialize index: {}", e)))?; - - Ok(entries) -} - -#[cfg(test)] -mod tests { - use super::*; - use tempfile::TempDir; - - fn create_test_doc(id: &str) -> PersistedDocument { - let meta = DocumentMeta::new(id, "Test Doc", "md"); - let tree = DocumentTree::new("Root", "Content"); - PersistedDocument::new(meta, tree) - } - - #[test] - fn test_save_and_load_document() { - let temp = TempDir::new().unwrap(); - let path = temp.path().join("test.json"); - - let doc = create_test_doc("doc-1"); - save_document(&path, &doc).unwrap(); - - let loaded = load_document(&path).unwrap(); - assert_eq!(loaded.meta.id, "doc-1"); - assert_eq!(loaded.meta.name, "Test Doc"); - } - - #[test] - fn test_atomic_write() { - let temp = TempDir::new().unwrap(); - let path = temp.path().join("atomic.json"); - - let doc = create_test_doc("doc-atomic"); - let options = PersistenceOptions::new().with_atomic_writes(true); - save_document_with_options(&path, &doc, &options).unwrap(); - - // Temp file should not exist after save - assert!(!path.with_extension("tmp").exists()); - - let loaded = load_document(&path).unwrap(); - assert_eq!(loaded.meta.id, "doc-atomic"); - } - - #[test] - fn test_checksum_verification() { - let temp = TempDir::new().unwrap(); - let path = temp.path().join("checksum.json"); - - let doc = create_test_doc("doc-checksum"); - save_document(&path, &doc).unwrap(); - - // Corrupt the file - let content = std::fs::read_to_string(&path).unwrap(); - let corrupted = content.replace("doc-checksum", "doc-corrupted"); - std::fs::write(&path, corrupted).unwrap(); - - // Load should fail with checksum error - let result = load_document(&path); - assert!(result.is_err()); - let err = result.unwrap_err(); - assert!(matches!(err, Error::Parse(_))); - } - - #[test] - fn test_checksum_disabled() { - let temp = TempDir::new().unwrap(); - let path = temp.path().join("no-checksum.json"); - - let doc = create_test_doc("doc-no-check"); - save_document(&path, &doc).unwrap(); - - // Load with checksum disabled should succeed - let options = PersistenceOptions::new().with_verify_checksum(false); - let result = load_document_with_options(&path, &options); - assert!(result.is_ok()); - let loaded = result.unwrap(); - assert_eq!(loaded.meta.id, "doc-no-check"); - - // Now corrupt the checksum field specifically - let content = std::fs::read_to_string(&path).unwrap(); - // Change the checksum value but keep the payload intact - let payload_value = serde_json::to_value(&doc).unwrap(); - let corrupted = content.replace( - 
&calculate_checksum(&serde_json::to_vec(&payload_value).unwrap()), - "0000000000000000000000000000000000000000000000000000000000000000", - ); - std::fs::write(&path, corrupted).unwrap(); - - // Load with checksum disabled should still succeed - let result = load_document_with_options(&path, &options); - assert!(result.is_ok()); - - // Load with checksum enabled should fail - let options_enabled = PersistenceOptions::new().with_verify_checksum(true); - let result = load_document_with_options(&path, &options_enabled); - assert!(result.is_err()); - } - - #[test] - fn test_load_nonexistent() { - let result = load_document(Path::new("/nonexistent/path.json")); - assert!(result.is_err()); - assert!(result.unwrap_err().is_not_found()); - } - - #[test] - fn test_save_and_load_index() { - let temp = TempDir::new().unwrap(); - let path = temp.path().join("meta.bin"); - - let mut entries = Vec::new(); - entries.push(DocumentMeta::new("doc-1", "Doc 1", "md")); - entries.push(DocumentMeta::new("doc-2", "Doc 2", "pdf")); - - save_index(&path, &entries).unwrap(); - - let loaded = load_index(&path).unwrap(); - assert_eq!(loaded.len(), 2); - assert_eq!(loaded[0].id, "doc-1"); - assert_eq!(loaded[1].format, "pdf"); - } - - #[test] - fn test_load_empty_index() { - let temp = TempDir::new().unwrap(); - let path = temp.path().join("nonexistent.json"); - - let loaded = load_index(&path).unwrap(); - assert!(loaded.is_empty()); - } - - #[test] - fn test_checksum_calculation() { - let data1 = b"test data"; - let data2 = b"test data"; - let data3 = b"different data"; - - let checksum1 = calculate_checksum(data1); - let checksum2 = calculate_checksum(data2); - let checksum3 = calculate_checksum(data3); - - assert_eq!(checksum1, checksum2); - assert_ne!(checksum1, checksum3); - assert_eq!(checksum1.len(), 64); // SHA-256 produces 64 hex chars - } -} diff --git a/vectorless-core/vectorless/src/storage/workspace.rs b/vectorless-core/vectorless/src/storage/workspace.rs deleted file mode 100644 index 936fc815..00000000 --- a/vectorless-core/vectorless/src/storage/workspace.rs +++ /dev/null @@ -1,666 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Async workspace management for document collections. -//! -//! This module provides the primary workspace implementation for document -//! persistence, using async I/O for integration with runtimes like Tokio. -//! -//! # Features -//! -//! - **Async I/O** - All operations are async for non-blocking performance -//! - **LRU Cache** - Automatic caching with configurable size -//! - **Thread-Safe** - Fully thread-safe with `Arc` -//! - **Pluggable Backend** - Use file storage, in-memory, or custom backends -//! -//! # Example -//! -//! ```rust,ignore -//! use vectorless::storage::Workspace; -//! -//! #[tokio::main] -//! async fn main() -> Result<()> { -//! let workspace = Workspace::new("./workspace").await?; -//! -//! // Add a document -//! workspace.add(&doc).await?; -//! -//! // Load with caching -//! let loaded = workspace.load_and_cache("doc-1").await?; -//! -//! Ok(()) -//! } -//! 
```
-
-use std::collections::HashMap;
-use std::path::PathBuf;
-use std::sync::Arc;
-
-use serde::{Deserialize, Serialize};
-use tokio::sync::RwLock;
-use tracing::{debug, info, warn};
-
-use super::backend::{FileBackend, StorageBackend};
-use super::cache::DocumentCache;
-use super::persistence::{PersistedDocument, load_document_from_bytes, save_document_to_bytes};
-use crate::Error;
-use crate::error::Result;
-
-const META_KEY: &str = "meta";
-const CATALOG_KEY: &str = "catalog";
-const DEFAULT_CACHE_SIZE: usize = 100;
-
-/// Lightweight metadata entry for the async workspace index.
-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct DocumentMetaEntry {
-    /// Document ID.
-    pub id: String,
-    /// Document name/title.
-    pub doc_name: String,
-    /// Document description.
-    #[serde(default)]
-    pub doc_description: Option<String>,
-    /// Document type (pdf, md, etc.).
-    pub doc_type: String,
-    /// Source file path.
-    #[serde(default)]
-    pub path: Option<String>,
-    /// Page count (for PDFs).
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub page_count: Option<usize>,
-    /// Line count (for markdown).
-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub line_count: Option<usize>,
-}
-
-/// Options for async workspace creation.
-#[derive(Debug, Clone)]
-pub struct WorkspaceOptions {
-    /// LRU cache size (default: 100).
-    pub cache_size: usize,
-}
-
-impl Default for WorkspaceOptions {
-    fn default() -> Self {
-        Self {
-            cache_size: DEFAULT_CACHE_SIZE,
-        }
-    }
-}
-
-impl WorkspaceOptions {
-    /// Create new options with defaults.
-    pub fn new() -> Self {
-        Self::default()
-    }
-
-    /// Set the cache size.
-    pub fn with_cache_size(mut self, size: usize) -> Self {
-        self.cache_size = size;
-        self
-    }
-}
-
-/// Inner state for the async workspace.
-struct WorkspaceInner {
-    /// Storage backend.
-    backend: Arc<dyn StorageBackend>,
-    /// Root path (for file-based backends).
-    root: Option<PathBuf>,
-    /// Document metadata index.
-    meta_index: HashMap<String, DocumentMetaEntry>,
-    /// DocCard catalog — lightweight document summaries for Orchestrator analysis.
-    catalog: HashMap<String, crate::document::DocCard>,
-    /// LRU cache for loaded documents.
-    cache: DocumentCache,
-    /// Cross-document relationship graph (cached).
-    document_graph: Option<crate::graph::DocumentGraph>,
-}
-
-/// An async workspace for managing indexed documents.
-///
-/// Uses `tokio::sync::RwLock` for async-safe concurrent access.
-/// All operations are async and can be safely called from multiple tasks.
-///
-/// # Thread Safety
-///
-/// The async workspace is fully thread-safe and can be cloned cheaply
-/// (it uses `Arc` internally).
-#[derive(Clone)]
-pub struct Workspace {
-    inner: Arc<RwLock<WorkspaceInner>>,
-}
-
-impl std::fmt::Debug for Workspace {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("Workspace").finish()
-    }
-}
-
-impl Workspace {
-    /// Create a new async workspace with a storage backend.
-    pub async fn with_backend(backend: Arc<dyn StorageBackend>) -> Result<Self> {
-        Self::with_backend_and_options(backend, WorkspaceOptions::default()).await
-    }
-
-    /// Create an async workspace with backend and options.
-    pub async fn with_backend_and_options(
-        backend: Arc<dyn StorageBackend>,
-        options: WorkspaceOptions,
-    ) -> Result<Self> {
-        let mut inner = WorkspaceInner {
-            backend,
-            root: None,
-            meta_index: HashMap::new(),
-            catalog: HashMap::new(),
-            cache: DocumentCache::with_capacity(options.cache_size),
-            document_graph: None,
-        };
-
-        Self::load_meta_index(&mut inner)?;
-        Self::load_catalog_index(&mut inner)?;
-
-        Ok(Self {
-            inner: Arc::new(RwLock::new(inner)),
-        })
-    }
-
-    /// Create a new file-based async workspace at the given path.
-    pub async fn new(path: impl Into<PathBuf>) -> Result<Self> {
-        Self::with_options(path, WorkspaceOptions::default()).await
-    }
-
-    /// Create a new async workspace with custom cache size.
-    pub async fn with_cache_size(path: impl Into<PathBuf>, cache_size: usize) -> Result<Self> {
-        Self::with_options(
-            path,
-            WorkspaceOptions {
-                cache_size,
-                ..Default::default()
-            },
-        )
-        .await
-    }
-
-    /// Create a new async workspace with custom options.
-    pub async fn with_options(path: impl Into<PathBuf>, options: WorkspaceOptions) -> Result<Self> {
-        let root = path.into();
-        let backend = Arc::new(FileBackend::new(&root)?);
-
-        let mut inner = WorkspaceInner {
-            backend,
-            root: Some(root),
-            meta_index: HashMap::new(),
-            catalog: HashMap::new(),
-            cache: DocumentCache::with_capacity(options.cache_size),
-            document_graph: None,
-        };
-
-        Self::load_meta_index(&mut inner)?;
-        Self::load_catalog_index(&mut inner)?;
-
-        Ok(Self {
-            inner: Arc::new(RwLock::new(inner)),
-        })
-    }
-
-    /// Get the workspace root path (if file-based).
-    pub async fn path(&self) -> Option<PathBuf> {
-        let inner = self.inner.read().await;
-        inner.root.clone()
-    }
-
-    /// List all document IDs in the workspace.
-    pub async fn list_documents(&self) -> Vec<String> {
-        let inner = self.inner.read().await;
-        inner.meta_index.keys().cloned().collect()
-    }
-
-    /// Get metadata for a document.
-    pub async fn get_meta(&self, id: &str) -> Option<DocumentMetaEntry> {
-        let inner = self.inner.read().await;
-        inner.meta_index.get(id).cloned()
-    }
-
-    /// Check if a document exists.
-    pub async fn contains(&self, id: &str) -> bool {
-        let inner = self.inner.read().await;
-        inner.meta_index.contains_key(id)
-    }
-
-    /// Add a document to the workspace.
-    pub async fn add(&self, doc: &PersistedDocument) -> Result<()> {
-        let mut inner = self.inner.write().await;
-
-        let doc_id = doc.meta.id.clone();
-        let key = Self::doc_key(&doc_id);
-
-        // Serialize and save via backend
-        let bytes = save_document_to_bytes(doc)?;
-        inner.backend.put(&key, &bytes)?;
-
-        // Update meta index
-        let meta_entry = DocumentMetaEntry {
-            id: doc_id.clone(),
-            doc_name: doc.meta.name.clone(),
-            doc_description: doc.meta.description.clone(),
-            doc_type: doc.meta.format.clone(),
-            path: doc
-                .meta
-                .source_path
-                .as_ref()
-                .map(|p| p.to_string_lossy().to_string()),
-            page_count: if doc.pages.is_empty() {
-                None
-            } else {
-                Some(doc.pages.len())
-            },
-            line_count: doc.meta.line_count,
-        };
-
-        inner.meta_index.insert(doc_id.clone(), meta_entry);
-        Self::save_meta_index(&inner)?;
-
-        // Update catalog with DocCard
-        if let Some(card) = doc
-            .navigation_index
-            .as_ref()
-            .and_then(|nav| nav.doc_card().cloned())
-        {
-            inner.catalog.insert(doc_id.clone(), card);
-            Self::save_catalog_index(&inner)?;
-        }
-
-        // Remove from cache if present
-        let _ = inner.cache.remove(&doc_id);
-
-        info!("Saved document {} to async workspace", doc_id);
-
-        // Invalidate document graph since documents changed
-        inner.document_graph = None;
-
-        Ok(())
-    }
-
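A compact sketch of the add/load/remove lifecycle described above (all names are from this file; it assumes a Tokio context and a writable `./workspace` directory):

```rust
let workspace = Workspace::new("./workspace").await?;

let meta = DocumentMeta::new("doc-1", "My Document", "md");
let doc = PersistedDocument::new(meta, DocumentTree::new("Root", "Content"));

workspace.add(&doc).await?; // persists bytes + updates the meta index
assert!(workspace.contains("doc-1").await);

// Warm the LRU cache on first load; later loads hit the cache.
let loaded = workspace.load_and_cache("doc-1").await?;
assert!(loaded.is_some());

workspace.remove("doc-1").await?; // also evicts the cache and catalog entries
```
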
-    /// Load a document from the workspace.
-    ///
-    /// Uses LRU cache: returns cached version if available,
-    /// otherwise loads from backend and caches it.
-    pub async fn load(&self, id: &str) -> Result<Option<PersistedDocument>> {
-        // First check if document exists (read lock)
-        {
-            let inner = self.inner.read().await;
-            if !inner.meta_index.contains_key(id) {
-                return Ok(None);
-            }
-
-            // Check LRU cache
-            if let Some(cached) = inner.cache.get(id)? {
-                debug!("Cache hit for document {}", id);
-                return Ok(Some(cached));
-            }
-        }
-
-        // Load from backend (need read lock for backend access)
-        let inner = self.inner.read().await;
-        let key = Self::doc_key(id);
-
-        match inner.backend.get(&key)? {
-            Some(bytes) => {
-                let doc = load_document_from_bytes(&bytes)?;
-
-                // Note: We can't modify the cache with only a read lock
-                // For now, we return the document without caching
-                // A more sophisticated implementation would use a separate cache structure
-
-                debug!("Loaded document {} from backend", id);
-                Ok(Some(doc))
-            }
-            None => {
-                warn!("Document {} in meta index but not in backend", id);
-                Ok(None)
-            }
-        }
-    }
-
-    /// Load a document and cache it (requires write lock for caching).
-    pub async fn load_and_cache(&self, id: &str) -> Result<Option<PersistedDocument>> {
-        // First check if document exists (read lock)
-        {
-            let inner = self.inner.read().await;
-            if !inner.meta_index.contains_key(id) {
-                return Ok(None);
-            }
-
-            // Check LRU cache
-            if let Some(cached) = inner.cache.get(id)? {
-                debug!("Cache hit for document {}", id);
-                return Ok(Some(cached));
-            }
-        }
-
-        // Load from backend and cache (write lock)
-        let inner = self.inner.write().await;
-        let key = Self::doc_key(id);
-
-        match inner.backend.get(&key)? {
-            Some(bytes) => {
-                let doc = load_document_from_bytes(&bytes)?;
-
-                // Add to cache
-                inner.cache.put(id.to_string(), doc.clone())?;
-
-                debug!("Loaded and cached document {}", id);
-                Ok(Some(doc))
-            }
-            None => {
-                warn!("Document {} in meta index but not in backend", id);
-                Ok(None)
-            }
-        }
-    }
-
-    /// Remove a document from the workspace.
-    pub async fn remove(&self, id: &str) -> Result<bool> {
-        let mut inner = self.inner.write().await;
-
-        if !inner.meta_index.contains_key(id) {
-            return Ok(false);
-        }
-
-        let key = Self::doc_key(id);
-        inner.backend.delete(&key)?;
-
-        inner.meta_index.remove(id);
-
-        // Remove from cache and catalog
-        let _ = inner.cache.remove(id);
-        inner.catalog.remove(id);
-
-        Self::save_meta_index(&inner)?;
-        Self::save_catalog_index(&inner)?;
-
-        info!("Removed document {} from async workspace", id);
-
-        // Invalidate document graph since documents changed
-        inner.document_graph = None;
-
-        Ok(true)
-    }
-
-    /// Get the number of documents in the workspace.
-    pub async fn len(&self) -> usize {
-        let inner = self.inner.read().await;
-        inner.meta_index.len()
-    }
-
-    /// Check if the workspace is empty.
-    pub async fn is_empty(&self) -> bool {
-        let inner = self.inner.read().await;
-        inner.meta_index.is_empty()
-    }
-
-    /// Find a document ID by its source path.
-    ///
-    /// Returns the first document whose `source_path` matches.
-    /// Used for incremental indexing to check if a file has already been indexed.
-    pub async fn find_by_source_path(&self, path: &std::path::Path) -> Option<String> {
-        let target = path.to_string_lossy().to_string();
-        let inner = self.inner.read().await;
-        for entry in inner.meta_index.values() {
-            if entry.path.as_deref() == Some(target.as_str()) {
-                return Some(entry.id.clone());
-            }
-        }
-        None
-    }
-
-    /// Get the number of items currently in the LRU cache.
-    pub async fn cache_len(&self) -> usize {
-        let inner = self.inner.read().await;
-        inner.cache.len()
-    }
-
-    /// Get cache utilization (0.0 to 1.0).
-    pub async fn cache_utilization(&self) -> f64 {
-        let inner = self.inner.read().await;
-        inner.cache.utilization()
-    }
-
-    /// Get cache statistics.
-    pub async fn cache_stats(&self) -> super::cache::CacheStats {
-        let inner = self.inner.read().await;
-        inner.cache.stats()
-    }
-
-    /// Clear the LRU cache.
-    pub async fn clear_cache(&self) -> Result<()> {
-        let inner = self.inner.write().await;
-        inner.cache.clear()?;
-        debug!("Cleared async document cache");
-        Ok(())
-    }
-
-    // =========================================================================
-    // Document Graph Methods
-    // =========================================================================
-
-    /// Storage key for the document graph.
-    const GRAPH_KEY: &'static str = "_graph";
-
-    /// Get the document graph, loading from backend if not cached.
-    pub async fn get_graph(&self) -> Result<Option<crate::graph::DocumentGraph>> {
-        // Check cache first
-        {
-            let inner = self.inner.read().await;
-            if inner.document_graph.is_some() {
-                return Ok(inner.document_graph.clone());
-            }
-        }
-
-        // Load from backend
-        let inner = self.inner.read().await;
-        match inner.backend.get(Self::GRAPH_KEY)? {
-            Some(bytes) => {
-                let graph: crate::graph::DocumentGraph =
-                    serde_json::from_slice(&bytes).map_err(|e| {
-                        crate::Error::Serialization(format!("Failed to deserialize graph: {}", e))
-                    })?;
-                debug!("Loaded document graph from backend");
-                Ok(Some(graph))
-            }
-            None => Ok(None),
-        }
-    }
-
-    /// Persist the document graph to the backend.
-    pub async fn set_graph(&self, graph: &crate::graph::DocumentGraph) -> Result<()> {
-        let mut inner = self.inner.write().await;
-        let bytes = serde_json::to_vec(graph).map_err(|e| {
-            crate::Error::Serialization(format!("Failed to serialize graph: {}", e))
-        })?;
-        inner.backend.put(Self::GRAPH_KEY, &bytes)?;
-        inner.document_graph = Some(graph.clone());
-        info!(
-            "Persisted document graph ({} nodes, {} edges)",
-            graph.node_count(),
-            graph.edge_count()
-        );
-        Ok(())
-    }
-
-    /// Invalidate the cached document graph (e.g. after add/remove).
-    pub async fn invalidate_graph(&self) -> Result<()> {
-        let mut inner = self.inner.write().await;
-        inner.document_graph = None;
-        // Also remove from backend so stale graphs don't persist
-        let _ = inner.backend.delete(Self::GRAPH_KEY);
-        debug!("Invalidated document graph cache");
-        Ok(())
-    }
-
-    /// Get the storage key for a document.
-    fn doc_key(id: &str) -> String {
-        id.to_string()
-    }
-
-    /// Load the meta index from backend.
-    fn load_meta_index(inner: &mut WorkspaceInner) -> Result<()> {
-        match inner.backend.get(META_KEY)? {
-            Some(bytes) => {
-                let meta: HashMap<String, DocumentMetaEntry> = serde_json::from_slice(&bytes)
-                    .map_err(|e| Error::Parse(format!("Failed to parse meta index: {}", e)))?;
-                inner.meta_index = meta;
-                info!(
-                    "Loaded {} document(s) from async workspace index",
-                    inner.meta_index.len()
-                );
-            }
-            None => {
-                // Try to rebuild from existing keys
-                Self::rebuild_meta_index(inner)?;
-            }
-        }
-        Ok(())
-    }
-
-    /// Save the meta index to backend.
-    fn save_meta_index(inner: &WorkspaceInner) -> Result<()> {
-        let bytes = serde_json::to_vec_pretty(&inner.meta_index)
-            .map_err(|e| Error::Parse(format!("Failed to serialize meta index: {}", e)))?;
-        inner.backend.put(META_KEY, &bytes)?;
-        Ok(())
-    }
-
-    /// Load the DocCard catalog from backend.
-    fn load_catalog_index(inner: &mut WorkspaceInner) -> Result<()> {
-        match inner.backend.get(CATALOG_KEY)? {
-            Some(bytes) => {
-                let catalog: HashMap<String, crate::document::DocCard> =
-                    serde_json::from_slice(&bytes).map_err(|e| {
-                        Error::Parse(format!("Failed to parse catalog index: {}", e))
-                    })?;
-                inner.catalog = catalog;
-                info!("Loaded DocCard catalog: {} entries", inner.catalog.len());
-            }
-            None => {
-                // Rebuild from existing documents
-                Self::rebuild_catalog(inner)?;
-            }
-        }
-        Ok(())
-    }
-
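The graph accessors above follow a read-through-cache pattern: check the in-memory copy, fall back to the backend, and invalidate on mutation. A sketch of the intended usage; `build_graph` is a hypothetical helper standing in for whatever constructs the `DocumentGraph`, which this file does not define:

```rust
let workspace = Workspace::new("./workspace").await?;

if workspace.get_graph().await?.is_none() {
    let graph = build_graph(&workspace).await?; // hypothetical constructor
    workspace.set_graph(&graph).await?;         // persists under the "_graph" key
}

// add()/remove() already reset the in-memory copy; this also drops the
// persisted one so a stale graph cannot be reloaded later.
workspace.invalidate_graph().await?;
```

-    /// Save the DocCard catalog to backend.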
- fn save_catalog_index(inner: &WorkspaceInner) -> Result<()> { - let bytes = serde_json::to_vec_pretty(&inner.catalog) - .map_err(|e| Error::Parse(format!("Failed to serialize catalog: {}", e)))?; - inner.backend.put(CATALOG_KEY, &bytes)?; - Ok(()) - } - - /// Rebuild the DocCard catalog from existing documents. - fn rebuild_catalog(inner: &mut WorkspaceInner) -> Result<()> { - let keys = inner.backend.keys()?; - let reserved = ["meta", "_graph", "catalog"]; - let doc_keys: Vec<_> = keys - .iter() - .filter(|k| !reserved.contains(&k.as_str())) - .collect(); - - for key in doc_keys { - if let Some(bytes) = inner.backend.get(key)? { - if let Ok(doc) = load_document_from_bytes(&bytes) { - if let Some(card) = doc - .navigation_index - .as_ref() - .and_then(|nav| nav.doc_card().cloned()) - { - inner.catalog.insert(doc.meta.id.clone(), card); - } - } - } - } - - if !inner.catalog.is_empty() { - Self::save_catalog_index(inner)?; - info!("Rebuilt DocCard catalog: {} entries", inner.catalog.len()); - } - - Ok(()) - } - - /// Get all DocCards from the catalog. - pub async fn list_catalog(&self) -> Vec<(String, crate::document::DocCard)> { - let inner = self.inner.read().await; - inner - .catalog - .iter() - .map(|(id, card)| (id.clone(), card.clone())) - .collect() - } - - /// Get a single DocCard by document ID. - pub async fn get_doc_card(&self, id: &str) -> Option { - let inner = self.inner.read().await; - inner.catalog.get(id).cloned() - } - - /// Rebuild the meta index from existing documents. - fn rebuild_meta_index(inner: &mut WorkspaceInner) -> Result<()> { - let keys = inner.backend.keys()?; - let reserved = ["meta", "_graph", "catalog"]; - let doc_keys: Vec<_> = keys - .iter() - .filter(|k| !reserved.contains(&k.as_str())) - .collect(); - - for key in doc_keys { - if let Some(bytes) = inner.backend.get(key)? { - if let Ok(doc) = load_document_from_bytes(&bytes) { - let doc_id = doc.meta.id.clone(); - let meta_entry = DocumentMetaEntry { - id: doc_id.clone(), - doc_name: doc.meta.name, - doc_description: doc.meta.description, - doc_type: doc.meta.format, - path: doc - .meta - .source_path - .as_ref() - .map(|p| p.to_string_lossy().to_string()), - page_count: if doc.pages.is_empty() { - None - } else { - Some(doc.pages.len()) - }, - line_count: doc.meta.line_count, - }; - inner.meta_index.insert(doc_id, meta_entry); - } - } - } - - if !inner.meta_index.is_empty() { - Self::save_meta_index(inner)?; - info!( - "Rebuilt async index from {} document(s)", - inner.meta_index.len() - ); - } - - Ok(()) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use crate::document::DocumentTree; - - fn create_test_doc(id: &str) -> PersistedDocument { - let meta = super::super::persistence::DocumentMeta::new(id, "Test Doc", "md"); - let tree = DocumentTree::new("Root", "Content"); - PersistedDocument::new(meta, tree) - } -} diff --git a/vectorless-core/vectorless/src/utils/fingerprint.rs b/vectorless-core/vectorless/src/utils/fingerprint.rs deleted file mode 100644 index d7b8a988..00000000 --- a/vectorless-core/vectorless/src/utils/fingerprint.rs +++ /dev/null @@ -1,496 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Fingerprint system for content and subtree identification. -//! -//! This module provides a robust fingerprinting system for content identification, -//! enabling precise change detection at both content and subtree levels. -//! -//! # Key Features -//! -//! - **Content Fingerprint**: Hash of node content (title + text) -//! 
- **Subtree Fingerprint**: Recursive hash including all descendants
-//! - **Stable Serialization**: Type-tagged hashing for consistent results
-//!
-//! # Usage
-//!
-//! ```rust,ignore
-//! use vectorless::fingerprint::{Fingerprint, Fingerprinter};
-//!
-//! // Create a fingerprint from content
-//! let fp = Fingerprinter::new()
-//!     .with_str("Hello, world!")
-//!     .into_fingerprint();
-//!
-//! // Compare fingerprints
-//! if old_fp == new_fp {
-//!     // Content unchanged
-//! }
-//! ```
-
-use base64::prelude::*;
-use blake2::digest::typenum;
-use blake2::{Blake2b, Digest};
-use serde::{Deserialize, Serialize};
-use std::hash::{Hash, Hasher};
-
-/// A 128-bit fingerprint for content identification.
-///
-/// Uses BLAKE2b-128 for fast, collision-resistant hashing.
-/// Displayed as base64 for compact representation.
-#[derive(Clone, Copy, PartialEq, Eq)]
-pub struct Fingerprint(pub [u8; 16]);
-
-impl Fingerprint {
-    /// Create a fingerprint from raw bytes.
-    pub fn new(bytes: [u8; 16]) -> Self {
-        Self(bytes)
-    }
-
-    /// Create a fingerprint from a byte slice (hashes the slice).
-    pub fn from_bytes(data: &[u8]) -> Self {
-        let mut hasher = Blake2b::<typenum::U16>::default();
-        hasher.update(data);
-        Self(hasher.finalize().into())
-    }
-
-    /// Create a fingerprint from a string.
-    pub fn from_str(s: &str) -> Self {
-        Self::from_bytes(s.as_bytes())
-    }
-
-    /// Encode fingerprint to base64 string.
-    pub fn to_base64(self) -> String {
-        BASE64_STANDARD.encode(self.0)
-    }
-
-    /// Decode fingerprint from base64 string.
-    pub fn from_base64(s: &str) -> Result<Self, FingerprintError> {
-        let bytes = BASE64_STANDARD
-            .decode(s)
-            .map_err(|e| FingerprintError::InvalidBase64(e.to_string()))?;
-        let bytes: [u8; 16] = bytes
-            .try_into()
-            .map_err(|e: Vec<u8>| FingerprintError::InvalidLength(e.len()))?;
-        Ok(Self(bytes))
-    }
-
-    /// Get the raw bytes.
-    pub fn as_bytes(&self) -> &[u8] {
-        &self.0
-    }
-
-    /// Check if this is a zero/null fingerprint.
-    pub fn is_zero(&self) -> bool {
-        self.0 == [0u8; 16]
-    }
-
-    /// Create a zero/null fingerprint (for uninitialized state).
-    pub fn zero() -> Self {
-        Self([0u8; 16])
-    }
-}
-
-impl std::fmt::Display for Fingerprint {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        for byte in &self.0 {
-            write!(f, "{:02x}", byte)?;
-        }
-        Ok(())
-    }
-}
-
-impl std::fmt::Debug for Fingerprint {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Fingerprint({})", self)
-    }
-}
-
-impl Hash for Fingerprint {
-    fn hash<H: Hasher>(&self, state: &mut H) {
-        // Fingerprint is already evenly distributed, use first 8 bytes
-        state.write(&self.0[..8]);
-    }
-}
-
-impl Serialize for Fingerprint {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::Serializer,
-    {
-        serializer.serialize_str(&self.to_base64())
-    }
-}
-
-impl<'de> Deserialize<'de> for Fingerprint {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: serde::Deserializer<'de>,
-    {
-        let s = String::deserialize(deserializer)?;
-        Self::from_base64(&s).map_err(serde::de::Error::custom)
-    }
-}
-
-impl Default for Fingerprint {
-    fn default() -> Self {
-        Self::zero()
-    }
-}
-
-/// Error type for fingerprint operations.
-#[derive(Debug, thiserror::Error)]
-pub enum FingerprintError {
-    /// Invalid base64 encoding.
-    #[error("Invalid base64: {0}")]
-    InvalidBase64(String),
-
-    /// Invalid fingerprint length.
-    #[error("Invalid fingerprint length: {0}")]
-    InvalidLength(usize),
-
-    /// Serialization error.
-    #[error("Serialization error: {0}")]
-    Serialization(String),
-}
-
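A concrete version of the builder usage sketched in the module docs, mirroring the unit tests at the bottom of this file:

```rust
// Identical input sequences produce identical 128-bit digests.
let fp1 = Fingerprinter::new()
    .with_str("title")
    .with_str("content")
    .into_fingerprint();
let fp2 = Fingerprinter::new()
    .with_str("title")
    .with_str("content")
    .into_fingerprint();
assert_eq!(fp1, fp2);

// Type tags + length prefixes keep field boundaries unambiguous,
// so ("ab", "c") and ("a", "bc") hash differently.
let fp3 = Fingerprinter::new().with_str("ab").with_str("c").into_fingerprint();
let fp4 = Fingerprinter::new().with_str("a").with_str("bc").into_fingerprint();
assert_ne!(fp3, fp4);

// Compact persistence via base64 round-trips exactly.
let decoded = Fingerprint::from_base64(&fp1.to_base64()).unwrap();
assert_eq!(fp1, decoded);
```
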
- #[error("Serialization error: {0}")] - Serialization(String), -} - -/// Builder for creating fingerprints. -/// -/// Provides a fluent API for incrementally building fingerprints -/// from multiple values. -/// -/// # Example -/// -/// ```rust,ignore -/// let fp = Fingerprinter::new() -/// .with_str("title") -/// .with_str("content") -/// .with_usize(42) -/// .into_fingerprint(); -/// ``` -#[derive(Clone)] -pub struct Fingerprinter { - hasher: Blake2b, -} - -impl Fingerprinter { - /// Create a new fingerprinter. - pub fn new() -> Self { - Self { - hasher: Blake2b::::default(), - } - } - - /// Finalize and produce the fingerprint. - pub fn into_fingerprint(self) -> Fingerprint { - Fingerprint(self.hasher.finalize().into()) - } - - /// Add a string to the hash. - pub fn with_str(mut self, s: &str) -> Self { - self.write_str(s); - self - } - - /// Add a string to the hash (mutable). - pub fn write_str(&mut self, s: &str) { - self.write_type_tag("s"); - self.write_varlen_bytes(s.as_bytes()); - } - - /// Add bytes to the hash. - pub fn with_bytes(mut self, bytes: &[u8]) -> Self { - self.write_bytes(bytes); - self - } - - /// Add bytes to the hash (mutable). - pub fn write_bytes(&mut self, bytes: &[u8]) { - self.write_type_tag("b"); - self.write_varlen_bytes(bytes); - } - - /// Add a usize to the hash. - pub fn with_usize(mut self, n: usize) -> Self { - self.write_usize(n); - self - } - - /// Add a usize to the hash (mutable). - pub fn write_usize(&mut self, n: usize) { - self.write_type_tag("u"); - self.hasher.update((n as u64).to_le_bytes()); - } - - /// Add a u64 to the hash. - pub fn with_u64(mut self, n: u64) -> Self { - self.write_u64(n); - self - } - - /// Add a u64 to the hash (mutable). - pub fn write_u64(&mut self, n: u64) { - self.write_type_tag("u8"); - self.hasher.update(n.to_le_bytes()); - } - - /// Add an i64 to the hash. - pub fn with_i64(mut self, n: i64) -> Self { - self.write_i64(n); - self - } - - /// Add an i64 to the hash (mutable). - pub fn write_i64(&mut self, n: i64) { - self.write_type_tag("i8"); - self.hasher.update(n.to_le_bytes()); - } - - /// Add a bool to the hash. - pub fn with_bool(mut self, b: bool) -> Self { - self.write_bool(b); - self - } - - /// Add a bool to the hash (mutable). - pub fn write_bool(&mut self, b: bool) { - self.write_type_tag(if b { "t" } else { "f" }); - } - - /// Add an optional string to the hash. - pub fn with_option_str(mut self, opt: Option<&str>) -> Self { - self.write_option_str(opt); - self - } - - /// Add an optional string to the hash (mutable). - pub fn write_option_str(&mut self, opt: Option<&str>) { - match opt { - Some(s) => { - self.write_type_tag("some"); - self.write_str(s); - } - None => { - self.write_type_tag("none"); - } - } - } - - /// Add another fingerprint to the hash. - pub fn with_fingerprint(mut self, fp: &Fingerprint) -> Self { - self.write_fingerprint(fp); - self - } - - /// Add another fingerprint to the hash (mutable). - pub fn write_fingerprint(&mut self, fp: &Fingerprint) { - self.write_type_tag("fp"); - self.hasher.update(&fp.0); - } - - /// Add raw bytes directly (no type tag). 
- pub fn write_raw(&mut self, bytes: &[u8]) { - self.hasher.update(bytes); - } - - // Internal helpers - - fn write_type_tag(&mut self, tag: &str) { - self.hasher.update(tag.as_bytes()); - self.hasher.update(b";"); - } - - fn write_varlen_bytes(&mut self, bytes: &[u8]) { - self.hasher.update((bytes.len() as u32).to_le_bytes()); - self.hasher.update(bytes); - } -} - -/// Node fingerprint containing both content and subtree fingerprints. -/// -/// This enables precise change detection: -/// - If `content_fp` changes, the node's content was modified -/// - If `subtree_fp` changes, the node or its descendants were modified -/// - If `content_fp` is same but `subtree_fp` changed, only descendants changed -#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] -pub struct NodeFingerprint { - /// Fingerprint of this node's content (title + text). - pub content: Fingerprint, - - /// Fingerprint of the entire subtree (including this node). - /// Computed recursively from all descendants. - pub subtree: Fingerprint, -} - -impl NodeFingerprint { - /// Create a new node fingerprint. - pub fn new(content: Fingerprint, subtree: Fingerprint) -> Self { - Self { content, subtree } - } - - /// Create a fingerprint for a leaf node (content == subtree). - pub fn leaf(content: Fingerprint) -> Self { - Self { - content, - subtree: content, - } - } - - /// Create a zero/null fingerprint. - pub fn zero() -> Self { - Self { - content: Fingerprint::zero(), - subtree: Fingerprint::zero(), - } - } - - /// Check if this is a zero fingerprint. - pub fn is_zero(&self) -> bool { - self.content.is_zero() && self.subtree.is_zero() - } - - /// Check if content changed compared to another fingerprint. - pub fn content_changed(&self, other: &Self) -> bool { - self.content != other.content - } - - /// Check if subtree changed compared to another fingerprint. - pub fn subtree_changed(&self, other: &Self) -> bool { - self.subtree != other.subtree - } - - /// Check if only descendants changed (content same, subtree different). 
- pub fn only_descendants_changed(&self, other: &Self) -> bool { - self.content == other.content && self.subtree != other.subtree - } -} - -impl Default for NodeFingerprint { - fn default() -> Self { - Self::zero() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_fingerprint_from_str() { - let fp1 = Fingerprint::from_str("hello"); - let fp2 = Fingerprint::from_str("hello"); - let fp3 = Fingerprint::from_str("world"); - - assert_eq!(fp1, fp2); - assert_ne!(fp1, fp3); - } - - #[test] - fn test_fingerprint_base64_roundtrip() { - let fp = Fingerprint::from_str("test content"); - let encoded = fp.to_base64(); - let decoded = Fingerprint::from_base64(&encoded).unwrap(); - assert_eq!(fp, decoded); - } - - #[test] - fn test_fingerprinter_chaining() { - let fp1 = Fingerprinter::new() - .with_str("title") - .with_str("content") - .into_fingerprint(); - - let fp2 = Fingerprinter::new() - .with_str("title") - .with_str("content") - .into_fingerprint(); - - let fp3 = Fingerprinter::new() - .with_str("title") - .with_str("different") - .into_fingerprint(); - - assert_eq!(fp1, fp2); - assert_ne!(fp1, fp3); - } - - #[test] - fn test_fingerprinter_types() { - let fp1 = Fingerprinter::new() - .with_str("test") - .with_usize(42) - .with_bool(true) - .into_fingerprint(); - - let fp2 = Fingerprinter::new() - .with_str("test") - .with_usize(42) - .with_bool(true) - .into_fingerprint(); - - let fp3 = Fingerprinter::new() - .with_str("test") - .with_usize(43) // different number - .with_bool(true) - .into_fingerprint(); - - assert_eq!(fp1, fp2); - assert_ne!(fp1, fp3); - } - - #[test] - fn test_node_fingerprint() { - let content = Fingerprint::from_str("content"); - let subtree = Fingerprint::from_str("subtree"); - - let fp = NodeFingerprint::new(content, subtree); - - assert!(!fp.is_zero()); - assert_eq!(fp.content, content); - assert_eq!(fp.subtree, subtree); - } - - #[test] - fn test_node_fingerprint_change_detection() { - let old = NodeFingerprint::new( - Fingerprint::from_str("content"), - Fingerprint::from_str("subtree"), - ); - - // Same content, different subtree - let new1 = NodeFingerprint::new( - Fingerprint::from_str("content"), - Fingerprint::from_str("different"), - ); - assert!(new1.only_descendants_changed(&old)); - assert!(!new1.content_changed(&old)); - assert!(new1.subtree_changed(&old)); - - // Different content - let new2 = NodeFingerprint::new( - Fingerprint::from_str("different"), - Fingerprint::from_str("subtree"), - ); - assert!(!new2.only_descendants_changed(&old)); - assert!(new2.content_changed(&old)); - } - - #[test] - fn test_fingerprint_serialization() { - let fp = Fingerprint::from_str("test serialization"); - let json = serde_json::to_string(&fp).unwrap(); - let decoded: Fingerprint = serde_json::from_str(&json).unwrap(); - assert_eq!(fp, decoded); - } - - #[test] - fn test_node_fingerprint_serialization() { - let fp = NodeFingerprint::new( - Fingerprint::from_str("content"), - Fingerprint::from_str("subtree"), - ); - let json = serde_json::to_string(&fp).unwrap(); - let decoded: NodeFingerprint = serde_json::from_str(&json).unwrap(); - assert_eq!(fp, decoded); - } -} diff --git a/vectorless-core/vectorless/src/utils/mod.rs b/vectorless-core/vectorless/src/utils/mod.rs deleted file mode 100644 index 472bed71..00000000 --- a/vectorless-core/vectorless/src/utils/mod.rs +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Utility functions and helpers. -//! -//! 
This module provides common utilities used across the codebase:
-//!
-//! - **Token estimation** — Fast and accurate token counting (tiktoken-based)
-//! - **Fingerprint** — BLAKE2b content hashing for change detection
-//! - **Validation** — Pre-index source validation (file, content, bytes)
-
-pub mod fingerprint;
-mod token;
-pub mod validation;
-
-pub use token::estimate_tokens;
-pub use validation::{validate_bytes, validate_content, validate_file};
diff --git a/vectorless-core/vectorless/src/utils/token.rs b/vectorless-core/vectorless/src/utils/token.rs
deleted file mode 100644
index 9e23ea85..00000000
--- a/vectorless-core/vectorless/src/utils/token.rs
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Unified token estimation module.
-//!
-//! Provides accurate token counting using tiktoken's cl100k_base encoding;
-//! every estimate goes through tiktoken (there is no character-based fallback).
-
-use std::sync::OnceLock;
-use tiktoken_rs::CoreBPE;
-
-/// Global BPE encoder instance (cl100k_base is used by GPT-4, GPT-3.5-turbo, text-embedding-ada-002)
-static BPE: OnceLock<CoreBPE> = OnceLock::new();
-
-/// Get or initialize the BPE encoder.
-fn get_bpe() -> &'static CoreBPE {
-    BPE.get_or_init(|| {
-        tiktoken_rs::cl100k_base().expect("Failed to initialize cl100k_base tokenizer")
-    })
-}
-
-/// Estimate token count for a text using tiktoken.
-///
-/// This uses the cl100k_base encoding which is used by:
-/// - GPT-4
-/// - GPT-3.5-turbo
-/// - GPT-4o
-/// - GPT-4o-mini
-/// - text-embedding-ada-002
-/// - text-embedding-3-small/large
-///
-/// # Example
-///
-/// ```
-/// use vectorless::estimate_tokens;
-///
-/// assert_eq!(estimate_tokens(""), 0);
-/// assert!(estimate_tokens("hello world") > 0);
-/// ```
-pub fn estimate_tokens(text: &str) -> usize {
-    if text.is_empty() {
-        return 0;
-    }
-
-    // Use tiktoken for accurate counting
-    get_bpe().encode_with_special_tokens(text).len()
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_estimate_tokens_empty() {
-        assert_eq!(estimate_tokens(""), 0);
-    }
-
-    #[test]
-    fn test_estimate_tokens_simple() {
-        // "hello world" should be 2 tokens with tiktoken
-        let count = estimate_tokens("hello world");
-        assert!(count >= 2, "Expected at least 2 tokens, got {}", count);
-    }
-}
diff --git a/vectorless-core/vectorless/src/utils/validation.rs b/vectorless-core/vectorless/src/utils/validation.rs
deleted file mode 100644
index e5b1b64b..00000000
--- a/vectorless-core/vectorless/src/utils/validation.rs
+++ /dev/null
@@ -1,195 +0,0 @@
-// Copyright (c) 2026 vectorless developers
-// SPDX-License-Identifier: Apache-2.0
-
-//! Source validation utilities for indexing.
-
-use std::path::Path;
-
-use crate::document::DocumentFormat;
-use crate::error::{Error, Result};
-
-/// Maximum file size before emitting a warning (100 MB).
-const LARGE_FILE_THRESHOLD: usize = 100 * 1024 * 1024;
-
-/// Result of validating a source before indexing.
-#[derive(Debug, Clone)]
-pub struct SourceValidation {
-    /// Whether the source is valid for indexing.
-    pub valid: bool,
-
-    /// Validation errors (prevent indexing).
-    pub errors: Vec<String>,
-
-    /// Validation warnings (non-blocking).
-    pub warnings: Vec<String>,
-}
-
-impl SourceValidation {
-    fn valid() -> Self {
-        Self {
-            valid: true,
-            errors: vec![],
-            warnings: vec![],
-        }
-    }
-
-    fn invalid(errors: Vec<String>) -> Self {
-        Self {
-            valid: false,
-            errors,
-            warnings: vec![],
-        }
-    }
-
-    fn with_warnings(mut self, warnings: Vec<String>) -> Self {
-        self.warnings = warnings;
-        self
-    }
-}
-
-/// Validate a file path for indexing.
-///
-/// Checks: exists, readable, supported format, size.
-pub fn validate_file(path: &Path) -> Result<SourceValidation> {
-    if !path.exists() {
-        return Ok(SourceValidation::invalid(vec![format!(
-            "File not found: {}",
-            path.display()
-        )]));
-    }
-
-    let metadata = std::fs::metadata(path)
-        .map_err(|e| Error::Parse(format!("Cannot read file metadata: {}", e)))?;
-
-    let size = metadata.len() as usize;
-    let mut warnings = Vec::new();
-
-    if size > LARGE_FILE_THRESHOLD {
-        warnings.push(format!(
-            "Large file ({}MB) may take longer to index",
-            size / (1024 * 1024)
-        ));
-    }
-
-    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
-    if DocumentFormat::from_extension(ext).is_none() {
-        return Ok(
-            SourceValidation::invalid(vec![format!("Unsupported format: .{}", ext)])
-                .with_warnings(warnings),
-        );
-    }
-
-    Ok(SourceValidation::valid().with_warnings(warnings))
-}
-
-/// Validate content string for indexing.
-///
-/// Checks: non-empty.
-pub fn validate_content(content: &str, _format: DocumentFormat) -> SourceValidation {
-    let mut errors = Vec::new();
-
-    if content.trim().is_empty() {
-        errors.push("Content is empty".to_string());
-    }
-
-    if errors.is_empty() {
-        SourceValidation::valid()
-    } else {
-        SourceValidation::invalid(errors)
-    }
-}
-
-/// Validate binary data for indexing.
-///
-/// Checks: non-empty, PDF magic number.
-pub fn validate_bytes(data: &[u8], format: DocumentFormat) -> SourceValidation {
-    let mut errors = Vec::new();
-
-    if data.is_empty() {
-        errors.push("Byte data is empty".to_string());
-    }
-
-    // PDF magic number check
-    if format == DocumentFormat::Pdf && !data.is_empty() {
-        if !data.starts_with(b"%PDF") {
-            errors.push("Data does not appear to be a valid PDF (missing %PDF header)".to_string());
-        }
-    }
-
-    if errors.is_empty() {
-        SourceValidation::valid()
-    } else {
-        SourceValidation::invalid(errors)
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_validate_file_missing() {
-        let result = validate_file(Path::new("./nonexistent.md")).unwrap();
-        assert!(!result.valid);
-        assert!(result.errors[0].contains("not found"));
-    }
-
-    #[test]
-    fn test_validate_file_unsupported_format() {
-        let tmp = std::env::temp_dir().join("vectorless_test_validate.dat");
-        std::fs::write(&tmp, b"data").unwrap();
-        let result = validate_file(&tmp).unwrap();
-        assert!(!result.valid);
-        assert!(result.errors[0].contains("Unsupported"));
-        let _ = std::fs::remove_file(&tmp);
-    }
-
-    #[test]
-    fn test_validate_file_valid() {
-        let tmp = std::env::temp_dir().join("vectorless_test_validate.md");
-        std::fs::write(&tmp, b"# Hello").unwrap();
-        let result = validate_file(&tmp).unwrap();
-        assert!(result.valid);
-        assert!(result.errors.is_empty());
-        let _ = std::fs::remove_file(&tmp);
-    }
-
-    #[test]
-    fn test_validate_content_empty() {
-        let result = validate_content("  \n  ", DocumentFormat::Markdown);
-        assert!(!result.valid);
-        assert!(result.errors[0].contains("empty"));
-    }
-
-    #[test]
-    fn test_validate_content_valid() {
-        let result = validate_content("# Hello", DocumentFormat::Markdown);
-        assert!(result.valid);
-    }
-
-    #[test]
-    fn test_validate_bytes_empty() {
-        let result = 
validate_bytes(&[], DocumentFormat::Pdf); - assert!(!result.valid); - assert!(result.errors[0].contains("empty")); - } - - #[test] - fn test_validate_bytes_invalid_pdf() { - let result = validate_bytes(b"not a pdf", DocumentFormat::Pdf); - assert!(!result.valid); - assert!(result.errors[0].contains("PDF")); - } - - #[test] - fn test_validate_bytes_valid_pdf() { - let result = validate_bytes(b"%PDF-1.4 some content", DocumentFormat::Pdf); - assert!(result.valid); - } - - #[test] - fn test_validate_bytes_valid_markdown() { - let result = validate_bytes(b"# Hello", DocumentFormat::Markdown); - assert!(result.valid); - } -} From 9d76eba52818710f5c07dca7757afb14b882f001 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Thu, 23 Apr 2026 10:25:55 +0800 Subject: [PATCH 25/28] refactor(docs): update project structure documentation with fine-grained crates Update CLAUDE.md to reflect the new architecture with 17 fine-grained Rust crates instead of the previous monolithic structure. Add detailed tree view of the new crate organization and dependency layers showing compilation isolation benefits. Remove the fix_imports.py script that was used for the crate splitting process as it's no longer needed. Update development workflow instructions to reflect the new multi-crate structure and add information about cargo test counts and specific crate building commands. --- CLAUDE.md | 78 +++++++++++++++++---------- vectorless-core/fix_imports.py | 96 ---------------------------------- 2 files changed, 51 insertions(+), 123 deletions(-) delete mode 100644 vectorless-core/fix_imports.py diff --git a/CLAUDE.md b/CLAUDE.md index 6a335513..f3934322 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -10,32 +10,52 @@ Vectorless is a Document Understanding Engine for AI written in Rust. 
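The facade crate that anchors the restructured tree below is essentially a re-export shim. As a minimal sketch of the idea (the re-export list is abridged from the `vectorless-engine/src/lib.rs` diff in PATCH 26 later in this series; the `mod engine` layout is an assumption), the whole crate boils down to:

```rust
// vectorless-engine/src/lib.rs (illustrative sketch, not the actual file).
// The facade holds no business logic: it re-exports the public API from the
// leaf crates so downstream users and the PyO3 bindings depend on one crate.
pub use vectorless_config::Config;
pub use vectorless_document::{Answer, Concept, DocumentInfo, DocumentTree};
pub use vectorless_error::{Error, Result};
pub use vectorless_metrics::MetricsReport;

mod engine; // Engine / EngineBuilder implementation (assumed module name)
pub use engine::{Engine, EngineBuilder};
```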
## Project Structure -Cargo workspace with 2 crates + pure Python SDK: - -- `vectorless-core/` - Rust crates - - `vectorless/` - Core engine - - `src/client/` - Client API (EngineBuilder, Engine) - facade layer, no business logic - - `src/document/` - Document data structures (Document, DocumentTree, NavigationIndex, ReasoningIndex) - - `src/index/` - Compile pipeline (8-stage, checkpointing, incremental update) - - `src/retrieval/` - Retrieval dispatch layer (preprocessing, dispatch, postprocessing, cache, streaming) - - `src/query/` - Query understanding and planning (intent classification, rewrite, decomposition) - - `src/agent/` - Retrieval execution (Worker: doc navigation, Orchestrator: supervisor loop + multi-doc fusion) - - `src/rerank/` - Result reranking and answer synthesis (dedup, scoring, fusion, synthesis) - - `src/scoring/` - Scoring and ranking strategies (BM25, relevance scoring, score combination) - - `src/llm/` - LLM client (connection pool, memo/caching, throttle/rate-limiting, fallback) - - `src/storage/` - Persistence (Workspace, LRU cache, backend abstraction file/memory) - - `src/graph/` - Cross-document relationship graph - - `src/metrics/` - Metrics collection and reporting - - `src/events/` - Event system for progress monitoring - - `src/config/` - Configuration types and validation - - `src/error.rs` - Unified error types - - `src/utils/` - Utility functions (token counting, fingerprinting, validation) - - `examples/` - Rust examples (legacy, no new additions) - - `vectorless-py/` - PyO3 bindings (compiled into Python native module) +Cargo workspace with 17 fine-grained Rust crates + pure Python SDK: + +``` +vectorless-core/ +├── vectorless-error/ # Error types (Result, Error enum) +├── vectorless-document/ # Document types (Document, Tree, NavigationIndex, ReasoningIndex) +├── vectorless-config/ # Configuration hub (aggregates all config types) +├── vectorless-utils/ # Utilities (fingerprinting, token counting, validation) +├── vectorless-scoring/ # Scoring (BM25, keyword extraction) +├── vectorless-graph/ # Cross-document relationship graph +├── vectorless-events/ # Event system for progress monitoring +├── vectorless-metrics/ # Metrics collection and reporting +├── vectorless-llm/ # LLM client (pool, memo/cache, throttle, fallback) +├── vectorless-storage/ # Persistence (Workspace, LRU cache, file/memory backends) +├── vectorless-query/ # Query understanding (intent classification, rewrite) +├── vectorless-index/ # Compile pipeline (10-stage, checkpointing, incremental update) +├── vectorless-agent/ # Retrieval execution (Worker navigation + Orchestrator fusion) +├── vectorless-retrieval/ # Retrieval dispatch layer (dispatcher, cache, streaming) +├── vectorless-rerank/ # Result reranking (dedup, BM25 scoring, fusion) +├── vectorless-engine/ # Facade (Engine, EngineBuilder) — re-exports public API +└── vectorless-py/ # PyO3 bindings (compiled into Python native module) +``` + - `vectorless/` - Pure Python SDK (high-level wrappers, CLI, config loading, integrations) - `examples/` - Python examples (primary, for Python ecosystem) - `docs/` - Docusaurus documentation site +### Dependency Layers + +``` +Layer 0: error · document · utils · scoring (no workspace deps) +Layer 1: graph · events · config · metrics (depends on Layer 0) +Layer 2: llm · storage (depends on Layer 0–1) +Layer 3: query (depends on Layer 0–2) +Layer 4: index · agent (depends on Layer 0–3) +Layer 5: retrieval · rerank (depends on Layer 0–4) +Layer 6: engine (facade) · vectorless-py (bindings) 
(depends on all)
+```
+
+### Compilation Isolation
+
+Changing one module recompiles only that crate plus the facade layers above it:
+- Change `agent` → agent, retrieval, rerank, engine, py recompile; index/llm/storage are untouched
+- Change `llm` → llm and the layers above it recompile; index/agent/stage are not recompiled
+- Change `document` → everything recompiles (core types; expected behavior)
+
 ### Retrieval Call Flow
 
 ```
@@ -55,10 +75,13 @@ Engine.ask()
 ```bash
 # Build (workspace)
 cargo build              # Build all crates
-cargo test               # Run tests
+cargo test               # Run tests (488 tests across all crates)
 cargo clippy             # Lint
 cargo fmt                # Format code
 
+# Build specific crate (fast — only that crate + dependents)
+cargo build -p vectorless-agent
+
 # Python SDK
 pip install -e .         # Install in editable mode (from project root, uses maturin)
 
@@ -147,8 +170,9 @@ When uncertain whether an operation is safe, **default to asking user confirmati
 
 ## Common Development Workflow
 
-1. **Adding features**: Implement in appropriate `vectorless-core/vectorless/src/` module, add tests
+1. **Adding features**: Implement in the appropriate `vectorless-core/vectorless-*/` crate, add tests
 2. **Fixing bugs**: Add failing test case first, fix and ensure tests pass
-3. **Python bindings**: Update `vectorless-core/vectorless-py/src/lib.rs` (PyO3) when Rust APIs change
-4. **Python SDK**: Update `vectorless/` when API surface changes
-5. **Committing code**: Use semantic commit messages, format: `type(scope): description`
+3. **Adding crates**: New modules get their own crate under `vectorless-core/`, add to workspace Cargo.toml
+4. **Python bindings**: Update `vectorless-core/vectorless-py/src/lib.rs` (PyO3) when Rust APIs change
+5. **Python SDK**: Update `vectorless/` when API surface changes
+6. **Committing code**: Use semantic commit messages, format: `type(scope): description`
diff --git a/vectorless-core/fix_imports.py b/vectorless-core/fix_imports.py
deleted file mode 100644
index ce2c7aea..00000000
--- a/vectorless-core/fix_imports.py
+++ /dev/null
@@ -1,96 +0,0 @@
-#!/usr/bin/env python3
-"""Fix crate:: imports for the split crates.
-
-For each crate, self-references (crate::SELF_MODULE::) stay as crate::.
-External references (crate::OTHER_MODULE::) become vectorless_other::.
-Also handles bare `crate::Error` -> `vectorless_error::Error`.
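-
-Illustration (hypothetical lines, using vectorless-utils whose self module is
-"utils"):
-
-    crate::utils::fingerprint::Fingerprint  stays as-is  (self-reference)
-    crate::document::DocumentFormat         -> vectorless_document::DocumentFormat
-    crate::Error                            -> vectorless_error::Error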
-""" -import os -import re -import sys - -# Mapping: crate_dir -> (self_module, [external_deps]) -CRATES = { - "vectorless-error": ("error", []), - "vectorless-document": ("document", []), - "vectorless-config": ("config", []), - "vectorless-utils": ("utils", ["error", "document"]), - "vectorless-scoring": ("scoring", []), - "vectorless-graph": ("graph", ["document"]), - "vectorless-events": ("events", ["error", "document"]), - "vectorless-metrics": ("metrics", ["config", "error"]), - "vectorless-llm": ("llm", ["config", "error", "metrics", "utils"]), - "vectorless-storage": ("storage", ["config", "document", "error", "utils"]), - "vectorless-query": ("query", ["error", "llm", "scoring"]), - "vectorless-index": ("index", ["config", "document", "error", "llm", "metrics", "scoring", "storage", "utils"]), - "vectorless-agent": ("agent", ["document", "error", "llm", "query", "scoring"]), - "vectorless-retrieval": ("retrieval", ["agent", "document", "error", "llm", "query", "storage", "utils"]), - "vectorless-rerank": ("rerank", ["agent", "error", "query"]), - "vectorless-engine": ("client", ["agent", "config", "document", "error", "events", "index", "llm", "metrics", "retrieval", "rerank", "storage"]), -} - -MODULE_TO_CRATE = { - "error": "vectorless_error", - "document": "vectorless_document", - "config": "vectorless_config", - "utils": "vectorless_utils", - "scoring": "vectorless_scoring", - "graph": "vectorless_graph", - "events": "vectorless_events", - "metrics": "vectorless_metrics", - "llm": "vectorless_llm", - "storage": "vectorless_storage", - "query": "vectorless_query", - "index": "vectorless_index", - "agent": "vectorless_agent", - "retrieval": "vectorless_retrieval", - "rerank": "vectorless_rerank", - "client": "vectorless_engine", -} - -BASE = "/home/ztgx/Desktop/vectorless/vectorless-core" - -def fix_file(filepath, self_module): - with open(filepath, 'r') as f: - content = f.read() - - original = content - - # Replace crate::OTHER_MODULE:: with vectorless_other:: - # But keep crate::SELF_MODULE:: as crate::SELF_MODULE:: - for module, crate_name in MODULE_TO_CRATE.items(): - if module == self_module: - continue - # Match crate::module:: (with word boundary to avoid partial matches) - pattern = r'crate::' + re.escape(module) + r'::' - replacement = crate_name + '::' - content = re.sub(pattern, replacement, content) - - # Replace bare crate::Error (without any module prefix) with vectorless_error::Error - # But only if self_module is not "error" - if self_module != "error": - # Match "crate::Error" that isn't followed by :: (i.e., not crate::error::) - content = re.sub(r'crate::Error(?!::)', 'vectorless_error::Error', content) - # Match "crate::Result" -> "vectorless_error::Result" - content = re.sub(r'crate::Result(?!::)', 'vectorless_error::Result', content) - - if content != original: - with open(filepath, 'w') as f: - f.write(content) - return True - return False - -changed_files = 0 -for crate_dir, (self_module, deps) in CRATES.items(): - src_dir = os.path.join(BASE, crate_dir, "src") - if not os.path.isdir(src_dir): - continue - for root, dirs, files in os.walk(src_dir): - for fname in files: - if fname.endswith('.rs'): - fpath = os.path.join(root, fname) - if fix_file(fpath, self_module): - changed_files += 1 - print(f" Fixed: {os.path.relpath(fpath, BASE)}") - -print(f"\nTotal files changed: {changed_files}") From 9ea317cdf88f2d009d66b08c9ad96c84bda8a846 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Thu, 23 Apr 2026 10:27:51 +0800 Subject: [PATCH 26/28] 
refactor(config): reorder imports in types module Move DocumentGraphConfig import to maintain consistent ordering and improve code organization. fix(engine): format ConcurrencyConfig initialization Properly format the ConcurrencyConfig initialization across multiple lines to improve readability. refactor(lib): consolidate DocumentTree export Move DocumentTree export to correct location in engine lib to avoid duplicate exports and maintain proper module structure. refactor(python): format graph module imports Reformat imports in python graph module to follow consistent multi-line style for better readability. --- vectorless-core/vectorless-config/src/types/mod.rs | 2 +- vectorless-core/vectorless-engine/src/engine.rs | 4 +++- vectorless-core/vectorless-engine/src/lib.rs | 2 +- vectorless-core/vectorless-py/src/graph.rs | 4 +++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/vectorless-core/vectorless-config/src/types/mod.rs b/vectorless-core/vectorless-config/src/types/mod.rs index 3a840489..717b137d 100644 --- a/vectorless-core/vectorless-config/src/types/mod.rs +++ b/vectorless-core/vectorless-config/src/types/mod.rs @@ -11,7 +11,6 @@ mod storage; use serde::{Deserialize, Serialize}; -pub use vectorless_graph::DocumentGraphConfig; pub use indexer::IndexerConfig; pub use llm_pool::{ FallbackBehavior, FallbackConfig, LlmConfig, OnAllFailedBehavior, RetryConfig, SlotConfig, @@ -20,6 +19,7 @@ pub use llm_pool::{ pub use metrics::{LlmMetricsConfig, MetricsConfig, RetrievalMetricsConfig}; pub use retrieval::RetrievalConfig; pub use storage::{CompressionAlgorithm, StorageConfig}; +pub use vectorless_graph::DocumentGraphConfig; /// Main configuration for vectorless. /// diff --git a/vectorless-core/vectorless-engine/src/engine.rs b/vectorless-core/vectorless-engine/src/engine.rs index 7ca039c1..e1833f43 100644 --- a/vectorless-core/vectorless-engine/src/engine.rs +++ b/vectorless-core/vectorless-engine/src/engine.rs @@ -705,7 +705,9 @@ impl Engine { enable_synonym_expansion: options.enable_synonym_expansion, ..ReasoningIndexConfig::default() }, - concurrency: vectorless_llm::throttle::ConcurrencyConfig::from(&self.config.llm.throttle), + concurrency: vectorless_llm::throttle::ConcurrencyConfig::from( + &self.config.llm.throttle, + ), ..Default::default() } } diff --git a/vectorless-core/vectorless-engine/src/lib.rs b/vectorless-core/vectorless-engine/src/lib.rs index 80abbe6d..d6f656c1 100644 --- a/vectorless-core/vectorless-engine/src/lib.rs +++ b/vectorless-core/vectorless-engine/src/lib.rs @@ -109,6 +109,7 @@ pub use vectorless_document::DocumentFormat; // ============================================================ pub use vectorless_config::Config; +pub use vectorless_document::DocumentTree; pub use vectorless_document::{ Answer, Concept, DocumentInfo, Evidence, IngestInput, ReasoningTrace, TraceStep, }; @@ -118,4 +119,3 @@ pub use vectorless_graph::{ DocumentGraph, DocumentGraphNode, EdgeEvidence, GraphEdge, WeightedKeyword, }; pub use vectorless_metrics::{LlmMetricsReport, MetricsReport, RetrievalMetricsReport}; -pub use vectorless_document::DocumentTree; diff --git a/vectorless-core/vectorless-py/src/graph.rs b/vectorless-core/vectorless-py/src/graph.rs index 556731fc..affba45a 100644 --- a/vectorless-core/vectorless-py/src/graph.rs +++ b/vectorless-core/vectorless-py/src/graph.rs @@ -5,7 +5,9 @@ use pyo3::prelude::*; -use ::vectorless_engine::{DocumentGraph, DocumentGraphNode, EdgeEvidence, GraphEdge, WeightedKeyword}; +use ::vectorless_engine::{ + DocumentGraph, 
DocumentGraphNode, EdgeEvidence, GraphEdge, WeightedKeyword,
+};
 
 /// A keyword with weight from document analysis.
 #[pyclass(name = "WeightedKeyword")]
From 2af0f2979c05d5c0684aa392be96feccdf3b71f7 Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Thu, 23 Apr 2026 11:00:26 +0800
Subject: [PATCH 27/28] feat: remove all example files and documentation

Remove the entire examples directory, including each example's README.md
and main.py:

- batch_indexing, document_management, error_handling
- index_directory, index_metrics, indexing
- pdf_indexing, session_walkthrough
---
 examples/batch_indexing/README.md      |  28 --
 examples/batch_indexing/main.py        | 180 --------
 examples/document_management/README.md |  28 --
 examples/document_management/main.py   | 132 ------
 examples/error_handling/README.md      |  33 --
 examples/error_handling/main.py        | 107 -----
 examples/index_directory/main.py       |  99 -----
 examples/index_metrics/README.md       |  42 --
 examples/index_metrics/main.py         | 233 ----------
 examples/indexing/README.md            |  15 -
 examples/indexing/main.py              | 130 ------
 examples/pdf_indexing/README.md        |  27 --
 examples/pdf_indexing/main.py          | 123 ------
 examples/session_walkthrough/README.md |  32 --
 examples/session_walkthrough/main.py   | 589 ------------------------
 15 files changed, 1798 deletions(-)
 delete mode 100644 examples/batch_indexing/README.md
 delete mode 100644 examples/batch_indexing/main.py
 delete mode 100644 examples/document_management/README.md
 delete mode 100644 examples/document_management/main.py
 delete mode 100644 examples/error_handling/README.md
 delete mode 100644 examples/error_handling/main.py
 delete mode 100644 examples/index_directory/main.py
 delete mode 100644 examples/index_metrics/README.md
 delete mode 100644 examples/index_metrics/main.py
 delete mode 100644 examples/indexing/README.md
 delete mode 100644 examples/indexing/main.py
 delete mode 100644 examples/pdf_indexing/README.md
 delete mode 100644 examples/pdf_indexing/main.py
 delete mode 100644 examples/session_walkthrough/README.md
 delete mode 100644 examples/session_walkthrough/main.py

diff --git a/examples/batch_indexing/README.md b/examples/batch_indexing/README.md
deleted file mode 100644
index 41e87fae..00000000
--- a/examples/batch_indexing/README.md
+++ /dev/null
@@ -1,28 +0,0 @@
-# Batch Indexing Example
-
-Demonstrates indexing multiple documents at once using:
-- `from_paths` -- explicit list of file paths
-- `from_dir` -- all supported files in a directory
-- `from_bytes` -- raw in-memory content
-
-Also shows cross-document querying with `with_doc_ids`.
- -## Setup - -```bash -pip install vectorless -``` - -## Run - -```bash -python main.py -``` - -## Environment Variables - -| Variable | Description | Default | -|------------------------|----------------------|-----------| -| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | -| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | -| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/batch_indexing/main.py b/examples/batch_indexing/main.py deleted file mode 100644 index c68b3626..00000000 --- a/examples/batch_indexing/main.py +++ /dev/null @@ -1,180 +0,0 @@ -""" -Batch indexing example -- demonstrates indexing multiple documents at once -using from_paths, from_dir, and from_bytes. - -Usage: - pip install vectorless - python main.py -""" - -import asyncio -import os - -from vectorless import ( - Engine, - IndexContext, - IndexOptions, - QueryContext, - VectorlessError, -) - -# --- Configuration --- -API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") -MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") -ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -# Sample documents for demonstration -DOCS = { - "alpha.md": """\ -# Alpha Report - -## Summary - -Alpha is a distributed key-value store designed for low-latency reads. -It uses a log-structured merge tree for storage. - -## Architecture - -Write requests go through a write-ahead log, then are buffered in memory. -When the buffer is full, it is flushed to disk as an immutable SSTable. -""", - "beta.md": """\ -# Beta Report - -## Summary - -Beta is a stream processing engine that consumes events from Kafka topics -and applies real-time transformations using a DAG-based execution model. - -## Performance - -Beta processes up to 2 million events per second per node on commodity hardware. -""", - "gamma.md": """\ -# Gamma Report - -## Summary - -Gamma is a feature store that bridges the gap between offline feature -computation and online serving. Features are computed in Spark and served -via a low-latency gRPC endpoint. - -## Integration - -Gamma integrates with Alpha for feature metadata storage and Beta for -real-time feature updates. -""", -} - - -def write_sample_docs(base_dir: str) -> list[str]: - """Write sample markdown files and return their paths.""" - paths = [] - for name, content in DOCS.items(): - path = os.path.join(base_dir, name) - with open(path, "w") as f: - f.write(content) - paths.append(path) - return paths - - -async def main() -> None: - engine = Engine( - api_key=API_KEY, - model=MODEL, - endpoint=ENDPOINT, - ) - - # Create a temp directory with sample documents - docs_dir = "./batch_docs" - os.makedirs(docs_dir, exist_ok=True) - paths = write_sample_docs(docs_dir) - - # ---- 1. Index multiple files at once via from_paths ---- - print("=" * 50) - print(" from_paths -- index a list of files") - print("=" * 50) - - ctx = IndexContext.from_paths(paths) - result = await engine.index(ctx) - - print(f" Indexed {len(result.items)} document(s)") - for item in result.items: - print(f" - {item.name} ({item.doc_id[:8]}...)") - if result.has_failures(): - for f in result.failed: - print(f" ! Failed: {f.source} -- {f.error}") - print() - - doc_ids = [item.doc_id for item in result.items] - - # ---- 2. Query across all batch-indexed documents ---- - print("=" * 50) - print(" Query across multiple documents") - print("=" * 50) - - answer = await engine.query( - QueryContext( - "Which system processes the most events per second?" 
- ).with_doc_ids(doc_ids) - ) - for item in answer.items: - print(f" [{item.doc_id[:8]}...] score={item.score:.2f}") - print(f" {item.content[:200]}...") - print() - - # ---- 3. Index a directory via from_dir ---- - print("=" * 50) - print(" from_dir -- index all supported files in a directory") - print("=" * 50) - - # Clear first so we see fresh results - await engine.clear() - - ctx = IndexContext.from_dir(docs_dir).with_options( - IndexOptions(generate_summaries=True, generate_description=True) - ) - result = await engine.index(ctx) - - print(f" Indexed {len(result.items)} document(s)") - for item in result.items: - desc = item.description[:80] if item.description else "N/A" - print(f" - {item.name}: {desc}...") - print() - - # ---- 4. Index from raw bytes via from_bytes ---- - print("=" * 50) - print(" from_bytes -- index in-memory content") - print("=" * 50) - - md_bytes = b"""# Delta Notes - -## Key Points - -- Delta uses CRDTs for conflict-free replication. -- Writes are locally committed then asynchronously propagated. -- Read repair ensures eventual consistency across all replicas. -""" - - ctx = IndexContext.from_bytes(md_bytes, "markdown").with_name("delta") - result = await engine.index(ctx) - - print(f" Indexed: {result.doc_id}") - print() - - # ---- Cleanup ---- - print("=" * 50) - print(" Cleanup") - print("=" * 50) - - removed = await engine.clear() - print(f" Removed {removed} document(s)") - - # Remove temp files - for p in paths: - os.remove(p) - os.rmdir(docs_dir) - print(f" Cleaned up {docs_dir}/") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/document_management/README.md b/examples/document_management/README.md deleted file mode 100644 index e41148e0..00000000 --- a/examples/document_management/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# Document Management Example - -Demonstrates CRUD operations on indexed documents: - -- `engine.list()` -- list all documents -- `engine.exists(doc_id)` -- check if a document exists -- `engine.remove(doc_id)` -- remove a single document -- `engine.clear()` -- remove all documents - -## Setup - -```bash -pip install vectorless -``` - -## Run - -```bash -python main.py -``` - -## Environment Variables - -| Variable | Description | Default | -|------------------------|----------------------|-----------| -| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | -| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | -| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/document_management/main.py b/examples/document_management/main.py deleted file mode 100644 index 5d206a89..00000000 --- a/examples/document_management/main.py +++ /dev/null @@ -1,132 +0,0 @@ -""" -Document management example -- demonstrates CRUD operations on indexed documents: -list, exists, remove, and clear. - -Usage: - pip install vectorless - python main.py -""" - -import asyncio -import os - -from vectorless import ( - Engine, - IndexContext, - QueryContext, - VectorlessError, -) - -# --- Configuration --- -API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") -MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") -ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -# Sample documents -SAMPLE_A = """\ -# Project Alpha - -## Overview - -Project Alpha is a next-generation database engine written in Rust. -It supports ACID transactions and serializable isolation. 
- -## Features - -- MVCC concurrency control -- B-tree and LSM storage engines -- Query planner with cost-based optimization -""" - -SAMPLE_B = """\ -# Project Beta - -## Overview - -Project Beta is a web framework for building real-time applications. -It uses WebSocket-based communication and server-side rendering. - -## Features - -- Hot module reloading -- Built-in authentication middleware -- Automatic code splitting -""" - - -async def main() -> None: - engine = Engine( - api_key=API_KEY, - model=MODEL, - endpoint=ENDPOINT, - ) - - # ---- Index two documents ---- - print("Indexing two documents...") - - result_a = await engine.index( - IndexContext.from_content(SAMPLE_A, "markdown").with_name("alpha") - ) - doc_id_a = result_a.doc_id - print(f" A: {doc_id_a}") - - result_b = await engine.index( - IndexContext.from_content(SAMPLE_B, "markdown").with_name("beta") - ) - doc_id_b = result_b.doc_id - print(f" B: {doc_id_b}") - print() - - # ---- list() -- show all indexed documents ---- - print("--- list() ---") - docs = await engine.list() - for doc in docs: - pages = f", pages={doc.page_count}" if doc.page_count else "" - lines = f", lines={doc.line_count}" if doc.line_count else "" - print(f" {doc.name} id={doc.id[:8]}... format={doc.format}{pages}{lines}") - print(f" Total: {len(docs)} document(s)\n") - - # ---- exists() -- check if a document is indexed ---- - print("--- exists() ---") - for did, label in [(doc_id_a, "A"), (doc_id_b, "B"), ("nonexistent-id", "?")]: - found = await engine.exists(did) - print(f" {label}: exists={found}") - print() - - # ---- Query a specific document ---- - print("--- query(doc_id_a) ---") - answer = await engine.query( - QueryContext("What storage engines does Alpha support?").with_doc_ids([doc_id_a]) - ) - item = answer.single() - if item: - print(f" Score: {item.score:.2f}") - print(f" Answer: {item.content[:200]}...\n") - - # ---- remove() -- delete a single document ---- - print("--- remove(doc_id_a) ---") - removed = await engine.remove(doc_id_a) - print(f" Removed A: {removed}") - - # Verify it's gone - exists_a = await engine.exists(doc_id_a) - print(f" exists(A) after removal: {exists_a}") - print() - - # ---- list() again -- only B should remain ---- - print("--- list() after removal ---") - docs = await engine.list() - for doc in docs: - print(f" {doc.name} id={doc.id[:8]}...") - print(f" Total: {len(docs)} document(s)\n") - - # ---- clear() -- remove all remaining documents ---- - print("--- clear() ---") - cleared = await engine.clear() - print(f" Cleared {cleared} document(s)") - - docs = await engine.list() - print(f" Remaining: {len(docs)} document(s)") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/error_handling/README.md b/examples/error_handling/README.md deleted file mode 100644 index 2424d618..00000000 --- a/examples/error_handling/README.md +++ /dev/null @@ -1,33 +0,0 @@ -# Error Handling Example - -Demonstrates how to catch and inspect `VectorlessError` exceptions: - -- Invalid format strings -- Invalid indexing modes -- Querying non-existent documents -- Batch indexing with partial failures -- Engine creation with invalid credentials - -The `VectorlessError` exception provides: -- `kind` -- error category (`"config"`, `"not_found"`, `"parse"`, `"llm"`, etc.) 
-- `message` -- human-readable error description - -## Setup - -```bash -pip install vectorless -``` - -## Run - -```bash -python main.py -``` - -## Environment Variables - -| Variable | Description | Default | -|------------------------|----------------------|-----------| -| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | -| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | -| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/error_handling/main.py b/examples/error_handling/main.py deleted file mode 100644 index 22099e3d..00000000 --- a/examples/error_handling/main.py +++ /dev/null @@ -1,107 +0,0 @@ -""" -Error handling example -- demonstrates catching and inspecting VectorlessError. - -Usage: - pip install vectorless - python main.py -""" - -import asyncio -import os - -from vectorless import ( - Engine, - IndexContext, - IndexOptions, - QueryContext, - VectorlessError, -) - -# --- Configuration --- -API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") -MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") -ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) - -async def main() -> None: - engine = Engine( - api_key=API_KEY, - model=MODEL, - endpoint=ENDPOINT, - ) - - # ---- 1. Invalid format ---- - print("--- Invalid format in from_bytes ---") - try: - ctx = IndexContext.from_bytes(b"hello", "xml") - except VectorlessError as e: - print(f" Caught VectorlessError:") - print(f" kind: {e.kind}") - print(f" message: {e.message}") - print(f" repr: {repr(e)}") - print() - - # ---- 2. Invalid indexing mode ---- - print("--- Invalid indexing mode ---") - try: - opts = IndexOptions(mode="bad_mode") - except VectorlessError as e: - print(f" Caught VectorlessError:") - print(f" kind: {e.kind}") - print(f" message: {e.message}") - print() - - # ---- 3. Query a non-existent document ---- - print("--- Query non-existent document ---") - try: - await engine.query( - QueryContext("What is this?").with_doc_ids(["does-not-exist"]) - ) - except VectorlessError as e: - print(f" Caught VectorlessError:") - print(f" kind: {e.kind}") - print(f" message: {e.message}") - print() - - # ---- 4. Index with partial failure in batch ---- - print("--- Batch indexing with mixed results ---") - good = IndexContext.from_content("# Real Doc\n\nThis is valid content.", "markdown") - - result = await engine.index(good.with_name("good_doc")) - if result.has_failures(): - for f in result.failed: - print(f" Failed: {f.source} -- {f.error}") - else: - print(f" Success: {result.doc_id}") - - # Inspect individual items - for item in result.items: - print(f" Item: {item.name} ({item.format})") - if item.metrics: - m = item.metrics - print(f" Total time: {m.total_time_ms} ms, LLM calls: {m.llm_calls}") - print() - - # ---- 5. 
Engine creation with bad credentials ---- - print("--- Engine with invalid credentials ---") - try: - bad_engine = Engine( - api_key="sk-invalid-key-12345", - model="gpt-4o", - ) - # Try to use it -- the error will surface on the first LLM call - await bad_engine.index( - IndexContext.from_content("# Test\n", "markdown").with_name("fail_test") - ) - except VectorlessError as e: - print(f" Caught VectorlessError:") - print(f" kind: {e.kind}") - print(f" message: {e.message[:120]}...") - print() - - # ---- Cleanup ---- - await engine.clear() - print("Done.") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/index_directory/main.py b/examples/index_directory/main.py deleted file mode 100644 index 08b1c3bd..00000000 --- a/examples/index_directory/main.py +++ /dev/null @@ -1,99 +0,0 @@ -""" -Directory indexing example — recursively index all documents in a directory. - -Usage: - python index_directory.py /path/to/docs - python index_directory.py /path/to/docs --no-recursive - -Environment variables: - LLM_API_KEY — Your LLM API key (required) - LLM_MODEL — Model name (default: google/gemini-3-flash-preview) - LLM_ENDPOINT — API endpoint (default: http://localhost:4000/api/v1) -""" - -import argparse -import asyncio -import os - -from vectorless import Engine, IndexContext, QueryContext - - -async def main(): - parser = argparse.ArgumentParser(description="Index a directory of documents") - parser.add_argument("directory", help="Directory path to index") - parser.add_argument( - "--no-recursive", - action="store_true", - help="Only scan top-level files (default: recursive)", - ) - args = parser.parse_args() - - # Build engine - api_key = os.environ.get("LLM_API_KEY", "sk-or-v1-...") - model = os.environ.get("LLM_MODEL", "google/gemini-3-flash-preview") - endpoint = os.environ.get("LLM_ENDPOINT", "http://localhost:4000/api/v1") - - engine = Engine( - api_key=api_key, - model=model, - endpoint=endpoint, - ) - - recursive = not args.no_recursive - - # Index directory - ctx = IndexContext.from_dir(args.directory, recursive=recursive) - - if ctx.is_empty(): - print(f"No supported files found in: {args.directory}") - return - - print(f"{'Recursively scanning' if recursive else 'Scanning top-level files in'}: {args.directory}") - print(f"Found files to index") - - result = await engine.index(ctx) - - print(f"\nIndexed {len(result.items)} document(s):") - for item in result.items: - print(f" {item.name} ({item.doc_id})") - if item.metrics: - print(f" nodes: {item.metrics.nodes_processed}, time: {item.metrics.total_time_ms}ms") - - if result.has_failures(): - print("\nFailed:") - for f in result.failed: - print(f" {f.source} — {f.error}") - - # Query across all indexed documents - query = "What is this about?" 
- print(f'\nQuerying: "{query}"') - - answer = await engine.query(QueryContext(query)) - for item in answer.items: - print(f" [{item.doc_id} score={item.score:.2f}]") - preview = item.content[:200] - print(f" {preview}") - if len(item.content) > 200: - print(" ...") - - # Metrics report - report = engine.metrics_report() - print("\nMetrics:") - print( - f" LLM: {report.llm.total_calls} calls, " - f"{report.llm.total_tokens} tokens, " - f"${report.llm.estimated_cost_usd:.4f}" - ) - print( - f" Retrieval: {report.retrieval.total_queries} queries, " - f"avg score {report.retrieval.avg_path_score:.2f}" - ) - - # Cleanup - docs = await engine.list() - for doc in docs: - await engine.remove(doc.id) - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/index_metrics/README.md b/examples/index_metrics/README.md deleted file mode 100644 index 78bdd552..00000000 --- a/examples/index_metrics/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# IndexMetrics Example - -Demonstrates how to inspect detailed indexing pipeline metrics via `IndexMetrics`. - -`IndexMetrics` is attached to each `IndexItem` and provides: - -| Field | Description | -|------------------------|----------------------------------------------| -| `total_time_ms` | Total indexing time | -| `parse_time_ms` | Document parsing stage duration | -| `build_time_ms` | Tree building stage duration | -| `enhance_time_ms` | Summary/enhancement stage duration | -| `nodes_processed` | Number of tree nodes processed | -| `summaries_generated` | Successfully generated summaries | -| `summaries_failed` | Failed summary generations | -| `llm_calls` | Total LLM API calls made | -| `total_tokens_generated` | Total tokens produced by the LLM | -| `topics_indexed` | Topics added to the reasoning index | -| `keywords_indexed` | Keywords added to the reasoning index | - -This example compares documents indexed with and without summaries enabled -to show how `IndexOptions` affect pipeline stages and LLM usage. - -## Setup - -```bash -pip install vectorless -``` - -## Run - -```bash -python main.py -``` - -## Environment Variables - -| Variable | Description | Default | -|------------------------|----------------------|-----------| -| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | -| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | -| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/index_metrics/main.py b/examples/index_metrics/main.py deleted file mode 100644 index bfea4cf0..00000000 --- a/examples/index_metrics/main.py +++ /dev/null @@ -1,233 +0,0 @@ -""" -IndexMetrics example -- demonstrates inspecting detailed indexing pipeline metrics. - -IndexMetrics exposes timing, node processing, LLM usage, and reasoning index -statistics for each indexed document. This example compares two documents with -different IndexOptions to show how options affect the pipeline. - -Usage: - pip install vectorless - python main.py -""" - -import asyncio -import os - -from vectorless import ( - Engine, - IndexContext, - IndexItem, - IndexMetrics, - IndexOptions, - VectorlessError, -) - -# --- Configuration --- -API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") -MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") -ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -# --- Sample documents with varying complexity --- -SIMPLE_DOC = """\ -# Quick Note - -This is a short note about caching strategies. -Redis is commonly used as an in-memory cache. 
-""" - -COMPLEX_DOC = """\ -# Distributed Systems Design Guide - -## Consensus - -Raft is a consensus algorithm designed to be easy to understand. -It elects a leader via randomized timeouts and replicates log entries -to a majority of followers before committing them. - -## Replication - -State machine replication ensures that all replicas execute the same -commands in the same order. Primary-backup replication is simpler but -provides lower availability during leader failover. - -## Partitioning - -Consistent hashing distributes keys across nodes with minimal -remapping when the cluster size changes. Virtual nodes improve balance -when the key space is small. - -## Failure Detection - -Phi accrual failure detection treats failure as a continuous suspicion -level rather than a binary alive/dead state. This reduces false -positives during transient network issues. -""" - - -def print_pipeline_breakdown(m: IndexMetrics) -> None: - """Print a breakdown of pipeline stages and their percentages.""" - total = m.total_time_ms - if total == 0: - print(" (no timing data)") - return - - parse_pct = m.parse_time_ms / total * 100 - build_pct = m.build_time_ms / total * 100 - enhance_pct = m.enhance_time_ms / total * 100 - other_pct = max(0, 100 - parse_pct - build_pct - enhance_pct) - - print(f" Parse: {m.parse_time_ms:>5} ms ({parse_pct:5.1f}%)") - print(f" Build: {m.build_time_ms:>5} ms ({build_pct:5.1f}%)") - print(f" Enhance: {m.enhance_time_ms:>5} ms ({enhance_pct:5.1f}%)") - print(f" Other: {total - m.parse_time_ms - m.build_time_ms - m.enhance_time_ms:>5} ms ({other_pct:5.1f}%)") - - -def print_llm_stats(m: IndexMetrics) -> None: - """Print LLM utilization statistics.""" - print(f" LLM calls: {m.llm_calls}") - print(f" Tokens generated: {m.total_tokens_generated}") - if m.llm_calls > 0: - avg_tokens = m.total_tokens_generated / m.llm_calls - print(f" Avg tokens/call: {avg_tokens:.0f}") - - -def print_summary_stats(m: IndexMetrics) -> None: - """Print summary generation success/failure.""" - total = m.summaries_generated + m.summaries_failed - print(f" Summaries ok: {m.summaries_generated}") - print(f" Summaries failed: {m.summaries_failed}") - if total > 0: - success_rate = m.summaries_generated / total * 100 - print(f" Success rate: {success_rate:.1f}%") - - -def print_reasoning_index(m: IndexMetrics) -> None: - """Print reasoning index statistics.""" - print(f" Nodes processed: {m.nodes_processed}") - print(f" Topics indexed: {m.topics_indexed}") - print(f" Keywords indexed: {m.keywords_indexed}") - - -def print_full_report(item: IndexItem) -> None: - """Print a full metrics report for an indexed item.""" - m = item.metrics - print(f" Document: {item.name} ({item.format})") - if m is None: - print(" (no metrics)") - return - - print(f" Total time: {m.total_time_ms} ms") - print(f" repr: {repr(m)}") - - print() - print(" Pipeline stages:") - print_pipeline_breakdown(m) - - print() - print(" LLM usage:") - print_llm_stats(m) - - print() - print(" Summary generation:") - print_summary_stats(m) - - print() - print(" Reasoning index:") - print_reasoning_index(m) - - -async def main() -> None: - engine = Engine( - api_key=API_KEY, - model=MODEL, - endpoint=ENDPOINT, - ) - - # ================================================================ - # 1. 
Index a simple document WITHOUT summaries
-    # ================================================================
-    print("=" * 55)
-    print(" Run 1: Simple doc, summaries OFF")
-    print("=" * 55)
-
-    opts_no_summary = IndexOptions(
-        generate_summaries=False,
-        generate_description=False,
-    )
-    result = await engine.index(
-        IndexContext.from_content(SIMPLE_DOC, "markdown")
-        .with_name("simple_no_summary")
-        .with_options(opts_no_summary)
-    )
-    item = result.items[0]
-    print_full_report(item)
-    doc_id_1 = item.doc_id
-    m_off = item.metrics
-    print()
-
-    # ================================================================
-    # 2. Index the same simple document WITH summaries
-    # ================================================================
-    print("=" * 55)
-    print(" Run 2: Simple doc, summaries ON")
-    print("=" * 55)
-
-    opts_with_summary = IndexOptions(
-        generate_summaries=True,
-        generate_description=True,
-    )
-    result = await engine.index(
-        IndexContext.from_content(SIMPLE_DOC, "markdown")
-        .with_name("simple_with_summary")
-        .with_options(opts_with_summary)
-    )
-    item = result.items[0]
-    print_full_report(item)
-    doc_id_2 = item.doc_id
-    m_on = item.metrics
-    print()
-
-    # ================================================================
-    # 3. Compare: summaries OFF vs ON for the simple doc
-    # ================================================================
-    print("=" * 55)
-    print(" Comparison: summaries OFF vs ON")
-    print("=" * 55)
-    # Both runs indexed the same content, so their metrics compare directly.
-    if m_off and m_on:
-        print(f" LLM calls: {m_off.llm_calls} -> {m_on.llm_calls}")
-        print(f" Summaries: {m_off.summaries_generated} -> {m_on.summaries_generated}")
-        print(f" Total time: {m_off.total_time_ms} ms -> {m_on.total_time_ms} ms")
-    print()
-
-    # ================================================================
-    # 4. Index a complex document WITH summaries
-    # ================================================================
-    print("=" * 55)
-    print(" Run 3: Complex doc, summaries ON")
-    print("=" * 55)
-
-    result = await engine.index(
-        IndexContext.from_content(COMPLEX_DOC, "markdown")
-        .with_name("complex_with_summary")
-        .with_options(opts_with_summary)
-    )
-    item = result.items[0]
-    print_full_report(item)
-    doc_id_3 = item.doc_id
-    print()
-
-    # ================================================================
-    # 5. Summary table
-    # ================================================================
-    print("=" * 55)
-    print(" Comparison table")
-    print("=" * 55)
-
-    docs = await engine.list()
-    for doc in docs:
-        print(f" {doc.name:<30} id={doc.id[:8]}...")
-        if doc.description:
-            print(f" description: {doc.description[:80]}")
-
-    # ================================================================
-    # Cleanup
-    # ================================================================
-    print()
-    cleared = await engine.clear()
-    print(f"Cleaned up {cleared} document(s).")
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/examples/indexing/README.md b/examples/indexing/README.md
deleted file mode 100644
index dc60506f..00000000
--- a/examples/indexing/README.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Indexing Example
-
-Demonstrates the full Vectorless workflow: index, query, list, graph, cleanup.
-
-## Setup
-
-```bash
-pip install vectorless
-```
-
-## Run
-
-```bash
-python main.py
-```
diff --git a/examples/indexing/main.py b/examples/indexing/main.py
deleted file mode 100644
index f2adce3b..00000000
--- a/examples/indexing/main.py
+++ /dev/null
@@ -1,130 +0,0 @@
-"""
-Indexing example — demonstrates the full Vectorless workflow.
- -Usage: - pip install vectorless - python main.py -""" - -import asyncio -import os -from vectorless import Engine, IndexContext, IndexOptions, QueryContext - -# os is used only for removing the sample file - -# --- Configuration --- -# Replace with your own credentials -API_KEY = "sk-..." -MODEL = "gpt-4o" - - -async def main(): - # --- 1. Create engine --- - engine = Engine( - api_key=API_KEY, - model=MODEL, - ) - print("Engine created\n") - - # --- 2. Index from text --- - print("--- Index from text ---") - result = await engine.index( - IndexContext.from_content( - """# Architecture Guide - -## Overview - -Vectorless is a reasoning-native document intelligence engine. -It uses hierarchical semantic trees instead of vector embeddings. - -## Key Concepts - -- **Semantic Tree**: Documents are parsed into a tree of sections. -- **LLM Navigation**: Queries are resolved by traversing the tree. -- **No Vectors**: No embeddings, no similarity search, no vector DB. -""", - "markdown", - ).with_name("architecture") - ) - doc_id = result.doc_id - print(f" Indexed: {doc_id}") - print(f" Items: {result.total()}\n") - - # --- 3. Index from file --- - print("--- Index from file ---") - # Write a sample file first - sample_path = "./sample_report.md" - with open(sample_path, "w") as f: - f.write("""# Q4 Financial Report - -## Revenue - -Total revenue for Q4 was $12.3M, up 15% from Q3. -SaaS subscriptions accounted for $8.1M, consulting for $4.2M. - -## Costs - -Operating costs were $9.8M, including $3.2M in engineering salaries. -Marketing spend was reduced by 8% to $1.5M. - -## Outlook - -Projected Q1 revenue is $13.5M based on current pipeline. -""") - - result = await engine.index(IndexContext.from_path(sample_path)) - file_doc_id = result.doc_id - print(f" Indexed: {file_doc_id}\n") - os.remove(sample_path) - - # --- 4. Index with options --- - print("--- Index with options (summaries + description) ---") - result = await engine.index( - IndexContext.from_content( - "# API Reference\n\n## GET /users\n\nList all users.\n\n## POST /users\n\nCreate a user.", - "markdown", - ) - .with_name("api_ref") - .with_options(IndexOptions(generate_summaries=True, generate_description=True)), - ) - print(f" Indexed: {result.doc_id}\n") - - # --- 5. Query --- - print("--- Query ---") - answer = await engine.query( - QueryContext("What was the total revenue?").with_doc_ids([file_doc_id]) - ) - item = answer.single() - if item: - print(f" Score: {item.score:.2f}") - print(f" Answer: {item.content[:200]}\n") - - # --- 6. List documents --- - print("--- List documents ---") - docs = await engine.list() - for doc in docs: - desc = f" — {doc.description}" if doc.description else "" - print(f" {doc.name} ({doc.id[:8]}...){desc}") - print() - - # --- 7. Document graph --- - print("--- Document graph ---") - graph = await engine.get_graph() - if graph: - print(f" Nodes: {graph.node_count()}, Edges: {graph.edge_count()}") - for doc_id in graph.doc_ids(): - node = graph.get_node(doc_id) - if node: - neighbors = graph.get_neighbors(doc_id) - kw = ", ".join(k.keyword for k in node.top_keywords[:3]) - print(f" {node.title}: keywords=[{kw}], neighbors={len(neighbors)}") - print() - - # --- 8. 
Cleanup --- - print("--- Cleanup ---") - removed = await engine.clear() - print(f" Removed {removed} document(s)") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/pdf_indexing/README.md b/examples/pdf_indexing/README.md deleted file mode 100644 index cfee9a95..00000000 --- a/examples/pdf_indexing/README.md +++ /dev/null @@ -1,27 +0,0 @@ -# PDF Indexing Example - -Demonstrates indexing a PDF file, inspecting indexing metrics, and querying. - -## Setup - -```bash -pip install vectorless -``` - -## Run - -```bash -# Use the sample PDF from the repository -python main.py - -# Or specify your own PDF file -python main.py /path/to/document.pdf -``` - -## Environment Variables - -| Variable | Description | Default | -|------------------------|----------------------|-----------| -| `VECTORLESS_API_KEY` | LLM API key | `sk-...` | -| `VECTORLESS_MODEL` | LLM model name | `gpt-4o` | -| `VECTORLESS_ENDPOINT` | Custom API endpoint | `None` | diff --git a/examples/pdf_indexing/main.py b/examples/pdf_indexing/main.py deleted file mode 100644 index c1e36727..00000000 --- a/examples/pdf_indexing/main.py +++ /dev/null @@ -1,123 +0,0 @@ -""" -PDF indexing example -- demonstrates indexing PDF files and inspecting metrics. - -Usage: - pip install vectorless - python main.py [path/to/file.pdf] - -If no path is given, uses the sample PDF in the repository. -""" - -import asyncio -import os -import sys - -from vectorless import ( - Engine, - IndexContext, - IndexItem, - IndexMetrics, - IndexOptions, - QueryContext, - VectorlessError, -) - -# --- Configuration --- -API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") -MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") -ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -# Resolve the sample PDF path relative to the repo root -SAMPLE_PDF = os.path.join( - os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), - "samples", - "Docker_Cheat_Sheet.pdf", -) - - -def print_separator(title: str) -> None: - print(f"\n{'=' * 40}") - print(f" {title}") - print(f"{'=' * 40}") - - -def print_metrics(item: IndexItem) -> None: - """Pretty-print indexing metrics for a single item.""" - m: IndexMetrics | None = item.metrics - if m is None: - print(" (no metrics available)") - return - - print(f" Total time: {m.total_time_ms:>6} ms") - print(f" Parse time: {m.parse_time_ms:>6} ms") - print(f" Build time: {m.build_time_ms:>6} ms") - print(f" Enhance time: {m.enhance_time_ms:>6} ms") - print(f" Nodes processed: {m.nodes_processed:>6}") - print(f" Summaries ok: {m.summaries_generated:>6}") - print(f" Summaries failed: {m.summaries_failed:>6}") - print(f" LLM calls: {m.llm_calls:>6}") - print(f" Tokens generated: {m.total_tokens_generated:>6}") - print(f" Topics indexed: {m.topics_indexed:>6}") - print(f" Keywords indexed: {m.keywords_indexed:>6}") - - -async def main() -> None: - pdf_path = sys.argv[1] if len(sys.argv) > 1 else SAMPLE_PDF - - if not os.path.isfile(pdf_path): - print(f"Error: file not found: {pdf_path}") - sys.exit(1) - - engine = Engine( - api_key=API_KEY, - model=MODEL, - endpoint=ENDPOINT, - ) - - # ---- Index with description + summaries enabled ---- - print_separator("Indexing PDF") - - options = IndexOptions(generate_summaries=True, generate_description=True) - ctx = IndexContext.from_path(pdf_path).with_options(options) - - try: - result = await engine.index(ctx) - except VectorlessError as e: - print(f"Indexing failed: [{e.kind}] {e.message}") - return - - if result.has_failures(): - for f in 
result.failed: - print(f" Failed: {f.source} -- {f.error}") - return - - doc_id = result.doc_id - print(f" doc_id: {doc_id}") - - for item in result.items: - print(f"\n Item: {item.name} ({item.format})") - if item.page_count is not None: - print(f" Pages: {item.page_count}") - if item.description: - print(f" Description: {item.description[:120]}...") - print_metrics(item) - - # ---- Query the PDF ---- - print_separator("Query") - - answer = await engine.query( - QueryContext("What is this document about?").with_doc_ids([doc_id]) - ) - item = answer.single() - if item: - print(f" Score: {item.score:.2f}") - print(f" Nodes: {item.node_ids}") - print(f" Content: {item.content[:300]}...") - - # ---- Cleanup ---- - print_separator("Cleanup") - removed = await engine.clear() - print(f" Removed {removed} document(s)") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/examples/session_walkthrough/README.md b/examples/session_walkthrough/README.md deleted file mode 100644 index 17174a07..00000000 --- a/examples/session_walkthrough/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Session API Walkthrough - -Demonstrates the full high-level Vectorless Python API using the `Session` and `SyncSession` classes. - -## What it covers - -| # | Topic | API | -|---|-------|-----| -| 1 | Session creation | `Session()`, `from_env()`, `from_config_file()` | -| 2 | Indexing sources | `index(content=)`, `index(path=)`, `index(bytes_data=)`, `index(directory=)` | -| 3 | Batch indexing | `index_batch(paths, jobs=N)` | -| 4 | Querying | `ask(question, doc_ids=)`, `ask(question, workspace_scope=True)` | -| 5 | Streaming query | `query_stream()` async iterator | -| 6 | Document management | `list_documents()`, `document_exists()`, `remove_document()`, `clear_all()` | -| 7 | Document graph | `get_graph()` nodes, edges, keywords | -| 8 | Event callbacks | `EventEmitter` with `@on_index` / `@on_query` decorators | -| 9 | Metrics | `metrics_report()` | -| 10 | Sync API | `SyncSession` (no async/await) | - -## Setup - -```bash -pip install vectorless -export VECTORLESS_API_KEY="sk-..." -export VECTORLESS_MODEL="gpt-4o" -``` - -## Run - -```bash -python main.py -``` diff --git a/examples/session_walkthrough/main.py b/examples/session_walkthrough/main.py deleted file mode 100644 index f94e5bb6..00000000 --- a/examples/session_walkthrough/main.py +++ /dev/null @@ -1,589 +0,0 @@ -""" -Session API walkthrough -- demonstrates the full high-level Vectorless API. - -This example uses the Session class (recommended entry point) to cover: - 1. Session creation (constructor / from_env / from_config_file) - 2. Indexing from various sources (content, path, directory, bytes) - 3. Batch indexing with concurrency control - 4. Querying with doc_ids and workspace scope - 5. Streaming query with real-time events - 6. Document management (list, exists, remove, clear) - 7. Cross-document relationship graph - 8. Event callbacks for progress monitoring - 9. Metrics reporting - 10. SyncSession (synchronous API, no async/await) - -Usage: - export VECTORLESS_API_KEY="sk-..." 
- export VECTORLESS_MODEL="gpt-4o" - pip install vectorless - python main.py -""" - -import asyncio -import os -import tempfile - -from vectorless import ( - Session, - SyncSession, - EventEmitter, - VectorlessError, -) -from vectorless.events import IndexEventType, QueryEventType - - -# ────────────────────────────────────────────────────────────────── -# Sample documents used throughout the example -# ────────────────────────────────────────────────────────────────── - -ARCHITECTURE_DOC = """\ -# Vectorless Architecture - -## Overview - -Vectorless is a reasoning-native document intelligence engine. -It uses hierarchical semantic trees instead of vector embeddings. - -## Key Concepts - -- **Semantic Tree**: Documents are parsed into a tree of sections. -- **LLM Navigation**: Queries are resolved by traversing the tree. -- **No Vectors**: No embeddings, no similarity search, no vector DB. - -## Retrieval Flow - -Engine.query() - -> query/understand() -> QueryPlan - -> Orchestrator dispatches Workers - -> Workers navigate document trees - -> rerank -> synthesis -> answer -""" - -FINANCE_DOC = """\ -# Q4 Financial Report - -## Revenue - -Total revenue for Q4 was $12.3M, up 15% from Q3. -SaaS subscriptions accounted for $8.1M, consulting for $4.2M. - -## Costs - -Operating costs were $9.8M, including $3.2M in engineering salaries. -Marketing spend was reduced by 8% to $1.5M. - -## Outlook - -Projected Q1 revenue is $13.5M based on current pipeline. -""" - -SECURITY_DOC = """\ -# Security Policy - -## Authentication - -All API requests require a Bearer token in the Authorization header. -Tokens expire after 24 hours and must be refreshed. - -## Data Encryption - -Data at rest is encrypted using AES-256. Data in transit uses TLS 1.3. - -## Audit Logging - -All access to sensitive data is logged and retained for 90 days. -""" - - -# ────────────────────────────────────────────────────────────────── -# Helper: set up a temp directory with sample files -# ────────────────────────────────────────────────────────────────── - -def create_sample_directory() -> tuple[str, list[str]]: - """Create a temp directory with sample documents. Returns (dir, paths).""" - tmpdir = tempfile.mkdtemp(prefix="vectorless_walkthrough_") - docs = { - "architecture.md": ARCHITECTURE_DOC, - "finance.md": FINANCE_DOC, - "security.md": SECURITY_DOC, - } - paths = [] - for name, content in docs.items(): - path = os.path.join(tmpdir, name) - with open(path, "w") as f: - f.write(content) - paths.append(path) - return tmpdir, paths - - -def cleanup_directory(tmpdir: str) -> None: - """Remove all files in the temp directory.""" - for fname in os.listdir(tmpdir): - os.remove(os.path.join(tmpdir, fname)) - os.rmdir(tmpdir) - - -# ────────────────────────────────────────────────────────────────── -# Section 1: Session Creation -# ────────────────────────────────────────────────────────────────── - -async def demo_session_creation() -> Session: - """Demonstrate different ways to create a Session.""" - print("=" * 60) - print(" 1. 
Session Creation") - print("=" * 60) - - # Option A: Constructor with explicit credentials - api_key = os.environ.get("VECTORLESS_API_KEY", "sk-...") - model = os.environ.get("VECTORLESS_MODEL", "gpt-4o") - endpoint = os.environ.get("VECTORLESS_ENDPOINT") - - session = Session(api_key=api_key, model=model, endpoint=endpoint) - print(f" Created: {session}") - - # Option B: from environment variables - # session = Session.from_env() - - # Option C: from a config file - # session = Session.from_config_file("~/.vectorless/config.toml") - - # Option D: with an EventEmitter for progress callbacks - # events = EventEmitter() - # session = Session(api_key=api_key, model=model, events=events) - - print() - return session - - -# ────────────────────────────────────────────────────────────────── -# Section 2: Indexing from Various Sources -# ────────────────────────────────────────────────────────────────── - -async def demo_indexing(session: Session, tmpdir: str, paths: list[str]) -> dict[str, str]: - """Demonstrate indexing from content, path, directory, and bytes.""" - print("=" * 60) - print(" 2. Indexing") - print("=" * 60) - - doc_ids: dict[str, str] = {} - - # --- 2a. Index from in-memory content --- - print(" [content] Indexing from string...") - result = await session.index( - content=ARCHITECTURE_DOC, - format="markdown", - name="architecture", - ) - doc_ids["architecture"] = result.doc_id # type: ignore[assignment] - print(f" doc_id: {result.doc_id}") - print(f" items: {result.total()}") - - # --- 2b. Index from a file path --- - print(" [path] Indexing from file path...") - result = await session.index(path=paths[1], name="finance") - doc_ids["finance"] = result.doc_id # type: ignore[assignment] - print(f" doc_id: {result.doc_id}") - - # --- 2c. Index from raw bytes --- - print(" [bytes] Indexing from raw bytes...") - result = await session.index( - bytes_data=SECURITY_DOC.encode("utf-8"), - format="markdown", - name="security", - ) - doc_ids["security"] = result.doc_id # type: ignore[assignment] - print(f" doc_id: {result.doc_id}") - - # --- 2d. Index a directory --- - print(" [dir] Indexing a directory...") - # Clear first to see fresh results - await session.clear_all() - - result = await session.index(directory=tmpdir, name="all_docs") - print(f" doc_id: {result.doc_id}") - print(f" items: {len(result.items)}") - for item in result.items: - print(f" - {item.name} ({item.doc_id[:8]}...)") - doc_ids[item.name] = item.doc_id - - print() - return doc_ids - - -# ────────────────────────────────────────────────────────────────── -# Section 3: Batch Indexing with Concurrency -# ────────────────────────────────────────────────────────────────── - -async def demo_batch_indexing(session: Session, paths: list[str]) -> list[str]: - """Demonstrate batch indexing with concurrent jobs.""" - print("=" * 60) - print(" 3. Batch Indexing (concurrency=2)") - print("=" * 60) - - # Clear to start fresh - await session.clear_all() - - results = await session.index_batch( - paths, - mode="default", - jobs=2, # max 2 concurrent indexing operations - force=False, - ) - - doc_ids = [] - for r in results: - print(f" {r.doc_id[:8]}... 
({len(r.items)} items)") - for item in r.items: - doc_ids.append(item.doc_id) - - print(f" Batch indexed {len(results)} file(s), {len(doc_ids)} document(s) total") - print() - return doc_ids - - -# ────────────────────────────────────────────────────────────────── -# Section 4: Querying -# ────────────────────────────────────────────────────────────────── - -async def demo_querying(session: Session, doc_ids: list[str]) -> None: - """Demonstrate querying with doc_ids and workspace scope.""" - print("=" * 60) - print(" 4. Querying") - print("=" * 60) - - # --- Query specific documents --- - print(" [ask] Query specific documents...") - response = await session.ask( - "What was the total revenue for Q4?", - doc_ids=doc_ids[:2], # limit to first two docs - ) - - result = response.single() - if result: - print(f" Score: {result.score:.2f}") - print(f" Confidence: {result.confidence:.2f}") - print(f" Answer: {result.content[:150]}...") - if result.evidence: - print(f" Evidence: {len(result.evidence)} item(s)") - for ev in result.evidence[:2]: - print(f" - {ev.title}: {ev.content[:80]}...") - if result.metrics: - print(f" LLM calls: {result.metrics.llm_calls}") - print(f" Nodes: {result.metrics.nodes_visited}") - - # --- Query across all documents --- - print() - print(" [workspace_scope] Query across entire workspace...") - response = await session.ask( - "How is data encrypted?", - workspace_scope=True, - ) - for item in response.items: - print(f" [{item.doc_id[:8]}...] score={item.score:.2f}") - print(f" {item.content[:120]}...") - - # --- Query with timeout --- - print() - print(" [timeout] Query with 30s timeout...") - try: - response = await session.ask( - "What is the retrieval flow?", - doc_ids=doc_ids, - timeout_secs=30, - ) - if response.single(): - print(f" Answer: {response.single().content[:150]}...") - except VectorlessError as e: - print(f" Error: {e}") - - print() - - -# ────────────────────────────────────────────────────────────────── -# Section 5: Streaming Query -# ────────────────────────────────────────────────────────────────── - -async def demo_streaming(session: Session, doc_ids: list[str]) -> None: - """Demonstrate streaming query with real-time events.""" - print("=" * 60) - print(" 5. Streaming Query") - print("=" * 60) - - stream = await session.query_stream( - "What are the key concepts?", - doc_ids=doc_ids[:1], - ) - - event_count = 0 - async for event in stream: - event_count += 1 - event_type = event.get("type", "unknown") - # Print a compact summary of each event - if event_type == "completed": - results = event.get("results", []) - print(f" [{event_count}] completed — {len(results)} result(s)") - elif event_type == "error": - print(f" [{event_count}] error — {event.get('message', '')}") - else: - print(f" [{event_count}] {event_type}") - - # The final result is available after iteration completes - if stream.result: - final = stream.result - item = final.single() - if item: - print(f" Final answer: {item.content[:150]}...") - - print() - - -# ────────────────────────────────────────────────────────────────── -# Section 6: Document Management -# ────────────────────────────────────────────────────────────────── - -async def demo_document_management(session: Session, doc_ids: list[str]) -> None: - """Demonstrate list, exists, remove, and clear.""" - print("=" * 60) - print(" 6. 
Document Management") - print("=" * 60) - - # --- List all documents --- - docs = await session.list_documents() - print(f" Listed {len(docs)} document(s):") - for doc in docs: - pages = f", pages={doc.page_count}" if doc.page_count else "" - print(f" {doc.name} id={doc.id[:8]}... format={doc.format}{pages}") - - # --- Check existence --- - if doc_ids: - exists = await session.document_exists(doc_ids[0]) - print(f"\n exists({doc_ids[0][:8]}...): {exists}") - - # --- Remove a document --- - if len(doc_ids) > 1: - removed = await session.remove_document(doc_ids[1]) - print(f" remove({doc_ids[1][:8]}...): {removed}") - - # Verify removal - exists_after = await session.document_exists(doc_ids[1]) - print(f" exists after removal: {exists_after}") - - # --- List again --- - docs = await session.list_documents() - print(f"\n After removal: {len(docs)} document(s)") - - print() - - -# ────────────────────────────────────────────────────────────────── -# Section 7: Cross-Document Relationship Graph -# ────────────────────────────────────────────────────────────────── - -async def demo_graph(session: Session) -> None: - """Demonstrate the cross-document relationship graph.""" - print("=" * 60) - print(" 7. Document Graph") - print("=" * 60) - - graph = await session.get_graph() - - if graph is None or graph.is_empty(): - print(" Graph is empty (no documents or no relationships found)") - print() - return - - print(f" Nodes: {graph.node_count()}, Edges: {graph.edge_count()}") - - for did in graph.doc_ids(): - node = graph.get_node(did) - if node: - keywords = ", ".join(k.keyword for k in node.top_keywords[:5]) - neighbors = graph.get_neighbors(did) - print(f" {node.title}") - print(f" format: {node.format}, nodes: {node.node_count}") - print(f" keywords: [{keywords}]") - print(f" neighbors: {len(neighbors)}") - for edge in neighbors[:3]: - target = graph.get_node(edge.target_doc_id) - target_name = target.title if target else edge.target_doc_id[:8] - weight_str = f"weight={edge.weight:.2f}" - evidence_str = "" - if edge.evidence: - evidence_str = f", shared_keywords={edge.evidence.shared_keyword_count}" - print(f" -> {target_name} ({weight_str}{evidence_str})") - - print() - - -# ────────────────────────────────────────────────────────────────── -# Section 8: Event Callbacks -# ────────────────────────────────────────────────────────────────── - -async def demo_events() -> None: - """Demonstrate event callbacks with EventEmitter.""" - print("=" * 60) - print(" 8. 
Event Callbacks") - print("=" * 60) - - events = EventEmitter() - - @events.on_index - def on_index_event(event): - if event.event_type == IndexEventType.STARTED: - print(f" [INDEX] Started: {event.path or event.message}") - elif event.event_type == IndexEventType.COMPLETE: - print(f" [INDEX] Complete: {event.doc_id or event.message}") - elif event.event_type == IndexEventType.ERROR: - print(f" [INDEX] Error: {event.message}") - - @events.on_query - def on_query_event(event): - if event.event_type == QueryEventType.STARTED: - print(f" [QUERY] Started: {event.query}") - elif event.event_type == QueryEventType.COMPLETE: - print(f" [QUERY] Complete: {event.total_results} result(s)") - - # Create a session with the event emitter - api_key = os.environ.get("VECTORLESS_API_KEY", "sk-...") - model = os.environ.get("VECTORLESS_MODEL", "gpt-4o") - session = Session(api_key=api_key, model=model, events=events) - - # Index and query — events fire automatically - await session.index(content=ARCHITECTURE_DOC, format="markdown", name="demo_events") - await session.ask("What are the key concepts?", workspace_scope=True) - - await session.clear_all() - print() - - -# ────────────────────────────────────────────────────────────────── -# Section 9: Metrics -# ────────────────────────────────────────────────────────────────── - -async def demo_metrics(session: Session) -> None: - """Demonstrate metrics reporting.""" - print("=" * 60) - print(" 9. Metrics Report") - print("=" * 60) - - report = session.metrics_report() - if report: - # The report contains llm and retrieval subsections - if hasattr(report, "llm"): - llm = report.llm - print(f" LLM Metrics:") - print(f" Total calls: {getattr(llm, 'total_calls', 'N/A')}") - print(f" Total tokens: {getattr(llm, 'total_tokens', 'N/A')}") - print(f" Cache hit rate: {getattr(llm, 'cache_hit_rate', 'N/A')}") - if hasattr(report, "retrieval"): - ret = report.retrieval - print(f" Retrieval Metrics:") - print(f" Total queries: {getattr(ret, 'total_queries', 'N/A')}") - print(f" Avg latency: {getattr(ret, 'avg_latency_ms', 'N/A')} ms") - else: - print(" No metrics available") - - print() - - -# ────────────────────────────────────────────────────────────────── -# Section 10: SyncSession (Synchronous API) -# ────────────────────────────────────────────────────────────────── - -def demo_sync_session() -> None: - """Demonstrate the synchronous Session (no async/await needed).""" - print("=" * 60) - print(" 10. SyncSession (no async/await)") - print("=" * 60) - - api_key = os.environ.get("VECTORLESS_API_KEY", "sk-...") - model = os.environ.get("VECTORLESS_MODEL", "gpt-4o") - - # Can also use: SyncSession.from_env() - with SyncSession(api_key=api_key, model=model) as session: - # Index from content - result = session.index( - content=FINANCE_DOC, - format="markdown", - name="sync_demo", - ) - print(f" Indexed: {result.doc_id}") - - # Query - response = session.ask( - "What was the total revenue?", - doc_ids=[result.doc_id], # type: ignore[list-item] - ) - item = response.single() - if item: - print(f" Answer: {item.content[:150]}...") - - # Cleanup - session.clear_all() - print(" Cleaned up") - - print() - - -# ────────────────────────────────────────────────────────────────── -# Main -# ────────────────────────────────────────────────────────────────── - -async def main() -> None: - print() - print(" Vectorless — Session API Walkthrough") - print(" " + "-" * 38) - print() - - # 1. 
Create session - session = await demo_session_creation() - - # Set up sample directory - tmpdir, paths = create_sample_directory() - - try: - # 2. Indexing - doc_id_map = await demo_indexing(session, tmpdir, paths) - all_doc_ids = list(doc_id_map.values()) - - # 3. Batch indexing (clears and re-indexes) - batch_doc_ids = await demo_batch_indexing(session, paths) - all_doc_ids = batch_doc_ids if batch_doc_ids else all_doc_ids - - # 4. Querying - if all_doc_ids: - await demo_querying(session, all_doc_ids) - - # 5. Streaming query - if all_doc_ids: - await demo_streaming(session, all_doc_ids) - - # 6. Document management - await demo_document_management(session, all_doc_ids) - - # 7. Graph - await demo_graph(session) - - # 8. Events (creates its own session) - await demo_events() - - # 9. Metrics - await demo_metrics(session) - - finally: - # Cleanup - await session.clear_all() - cleanup_directory(tmpdir) - print("=" * 60) - print(" Cleanup complete.") - print("=" * 60) - - # 10. SyncSession (separate, runs synchronously) - demo_sync_session() - - print(" Done.") - - -if __name__ == "__main__": - asyncio.run(main()) From 05e5fc9e6410396eb9156a68b9b0ab438cdcc262 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Thu, 23 Apr 2026 11:14:01 +0800 Subject: [PATCH 28/28] feat: add single-document reasoning challenge example Add a comprehensive example demonstrating advanced document indexing and querying capabilities. The example includes a realistic technical report about quantum computing research with complex inter-lab dependencies, financial data, and technical specifications. The challenge demonstrates the engine's ability to handle queries requiring deep navigation through the document tree, cross-referencing details across distant sections, and extracting information from nested structures rather than surface-level keyword matching. Includes five challenge questions that test: - Cross-referencing device characterization needs with equipment specs - Tracing dependency chains between research milestones - Calculating impacts from distributed data points - Complex multi-step reasoning across document sections --- examples/single_doc_challenge.py | 225 +++++++++++++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 examples/single_doc_challenge.py diff --git a/examples/single_doc_challenge.py b/examples/single_doc_challenge.py new file mode 100644 index 00000000..10a55fe6 --- /dev/null +++ b/examples/single_doc_challenge.py @@ -0,0 +1,225 @@ +# Copyright (c) 2026 vectorless developers +# SPDX-License-Identifier: Apache-2.0 + +"""Single-document reasoning challenge. + +Indexes a realistic technical document and asks questions that require +the engine to navigate deep into the tree, cross-reference details +across distant sections, and extract information buried in nested +structures — not surface-level keyword matches. + +```bash +LLM_API_KEY=sk-xxx LLM_MODEL=gpt-4o \ + LLM_ENDPOINT=https://api.openai.com/v1 \ + python examples/single_doc_challenge.py +``` +""" + +import asyncio +import os + +from vectorless import Engine + +# A research report with information scattered across sections. +# The answers to the challenge questions require connecting dots +# from different parts of the document, not simple keyword lookup. +REPORT = """ +# Quantum Computing Division — Annual Research Report 2025 + +## Executive Summary + +The Quantum Computing Division achieved several milestones in fiscal year 2025. 
+Total division revenue reached $47.2M, representing 23% year-over-year growth. +The division employed 312 staff across four research labs as of December 2025. +Headcount grew by 18% during the year, with the majority of new hires in the +error correction and cryogenics teams. + +The board approved a $200M capital investment program spanning 2025-2028. +Phase 1 ($52M) was fully deployed in 2025, primarily in dilution refrigerator +procurement and cleanroom expansion at the Zurich facility. + +## Research Labs + +### Lab A — Superconducting Qubits (Zurich) + +Lab A focuses on transmon qubit design and fabrication. The lab operates +two dilution refrigerators: FR-01 (purchased 2023, 20mK base temperature) +and FR-02 (commissioned Q3 2025, 15mK base temperature). FR-02 was the +single largest capital expenditure in 2025 at $8.7M. + +Current qubit specifications: +- Qubit count: 127 (FR-01: 64, FR-02: 63) +- Average T1 coherence time: 142 microseconds (up from 98μs in 2024) +- Average T2 coherence time: 89 microseconds +- Single-qubit gate fidelity: 99.92% +- Two-qubit gate fidelity: 99.67% +- Readout fidelity: 99.81% + +The 2025 coherence improvement was primarily driven by the transition from +aluminum to tantalum transmon junctions, which reduced two-level system (TLS) +defect density by 40%. + +### Lab B — Topological Qubits (Tokyo) + +Lab B pursues Majorana-based topological qubits using semiconductor-superconductor +nanowires. The team fabricated 12 nanowire devices during 2025, of which 3 +demonstrated measurable topological gap. This is a significant improvement +over 2024 when only 1 device out of 8 showed the gap. + +The topological gap measurement protocol requires the device temperature to +remain below 20mK throughout the 48-hour characterization cycle. Only FR-02 +in Zurich meets this requirement, so Lab B ships devices to Zurich for final +characterization — creating a logistical dependency between the two labs. + +Key metric: topological gap size averaged 0.35meV across successful devices, +compared to the theoretical target of 0.5meV. The gap-to-target ratio improved +from 48% in 2024 to 70% in 2025. + +### Lab C — Quantum Error Correction (Cambridge) + +Lab C develops surface code error correction protocols. In 2025, the team +achieved a critical milestone: below-threshold error correction on a 17-qubit +surface code patch, reducing logical error rate from 2.1×10⁻² to 3.4×10⁻³ +per correction cycle. + +The threshold simulations used Lab A's measured gate fidelities as input +parameters. The below-threshold result was only possible after Lab A's T1 +coherence improvement from 98μs to 142μs — the simulation models showed +that the 98μs regime was above the error correction threshold for the 17-qubit +code, making the Lab A / Lab C dependency critical. + +Lab C also developed a new decoder algorithm called "Cascade" that reduces +classical processing latency from 1.2μs to 0.4μs per syndrome extraction cycle. +This decoder runs on an FPGA co-processor board that was custom-designed by +Lab D. + +### Lab D — Control Systems (Boston) + +Lab D designs and manufactures the classical control electronics for all qubit +types. The flagship product is the QCS-4 control system, capable of driving +up to 256 qubit channels with 14-bit DAC resolution and sub-nanosecond timing +precision. + +In 2025, Lab D delivered 4 QCS-4 units to Lab A and 2 units to Lab B. +Lab C received a modified QCS-4 variant with the integrated FPGA decoder +co-processor. 
The FPGA decoder board is a custom design: Xilinx Ultrascale+ +XCU26 FPGA, 400k logic cells, running at 350MHz. Lab D is the sole source +for this board — there is no commercial equivalent. + +A notable incident occurred in August 2025 when a firmware bug in the QCS-4 +DAC calibration routine caused systematic phase errors in two-qubit gate +operations. The bug was traced to an integer overflow in the calibration LUT +when operating above 4.2 GHz. The issue affected Lab A's FR-01 for 11 days +before a patched firmware was deployed. During this period, Lab A's measured +two-qubit gate fidelity temporarily dropped to 97.31%. + +## Financial Summary + +| Category | 2024 | 2025 | Change | +|----------|------|------|--------| +| Revenue | $38.4M | $47.2M | +23% | +| R&D Expense | $31.6M | $38.9M | +23% | +| Capital Expenditure | $18.2M | $52.0M | +186% | +| Staff Count (Dec) | 264 | 312 | +18% | +| Patents Filed | 14 | 19 | +36% | + +Revenue breakdown by source: +- Government contracts: $19.8M (42%) +- Enterprise partnerships: $15.3M (32%) +- IP licensing: $8.6M (18%) +- Consulting services: $3.5M (8%) + +The $52M capital expenditure in 2025 included: +- FR-02 dilution refrigerator (Zurich): $8.7M +- Cleanroom expansion (Zurich): $14.2M +- Nanowire fabrication equipment (Tokyo): $6.1M +- FPGA development and QCS-4 production (Boston): $9.4M +- General infrastructure and IT: $13.6M + +## Outlook for 2026 + +Priority goals for 2026: +1. Scale to 256 superconducting qubits by Q3 (requires a third dilution + refrigerator, procurement estimated at $9-11M) +2. Achieve topological gap above 0.45meV (requires device process improvement) +3. Demonstrate below-threshold error correction on a 49-qubit surface code + (requires both 256-qubit hardware AND the Cascade decoder scaling to + larger code distances) +4. File 25+ patents +5. 
Grow revenue to $60M
+"""
+
+CHALLENGE_QUESTIONS = [
+    # Requires: cross-reference Lab B's device characterization needs with
+    # Lab A's FR-02 specs, then connect to the CapEx table for FR-02 cost
+    "How much did the only refrigerator capable of characterizing Lab B's devices cost, and where is it located?",
+    # Requires: trace Lab C's below-threshold result -> depends on Lab A's T1
+    # improvement -> depends on tantalum junction transition
+    "What specific materials change in another lab made Lab C's error correction milestone possible?",
+    # Requires: find the firmware bug in Lab D section, then look at the
+    # Lab A FR-01 qubit count, then compute the impact window
+    "How many qubits were affected by the firmware bug, and for how many days?",
+    # Requires: Lab B gap/target ratio (70%) * theoretical target (0.5meV)
+    # -> actual gap = 0.35meV, compare with 2026 goal of 0.45meV -> 0.10meV short
+    "What is the gap between Lab B's current topological gap achievement and the 2026 target, in meV?",
+    # Requires: trace the dependency chain: 256-qubit goal -> need FR-03 ->
+    # cost $9-11M -> government contracts are largest revenue source at $19.8M
+    "If the 2026 qubit scaling goal requires a new refrigerator, can the largest revenue source category alone cover its estimated cost?",
+]
+
+
+async def main() -> None:
+    print("=== Single-Document Reasoning Challenge ===\n")
+
+    api_key = os.environ.get("LLM_API_KEY", "sk-...")
+    model = os.environ.get("LLM_MODEL", "gpt-4o")
+    endpoint = os.environ.get("LLM_ENDPOINT", "https://api.openai.com/v1")
+
+    engine = Engine(api_key=api_key, model=model, endpoint=endpoint)
+
+    doc_name = "qc_report_2025"
+
+    # Reuse the document if a previous run already ingested it
+    doc_id = None
+    docs = await engine.list_documents()
+    for doc in docs:
+        if doc.name == doc_name:
+            doc_id = doc.doc_id
+            print(f"Document already ingested, reusing: {doc_id}\n")
+            break
+
+    if doc_id is None:
+        print("Ingesting research report...")
+        from vectorless._core import IndexContext
+
+        ctx = IndexContext.from_content(REPORT, "markdown").with_name(doc_name)
+        result = await engine.ingest(ctx)
+        doc_id = result.doc_id
+        print(f"  doc_id: {doc_id}\n")
+
+    # Challenge queries
+    for i, question in enumerate(CHALLENGE_QUESTIONS, 1):
+        print(f"Q{i}: {question}")
+
+        try:
+            answer = await engine.ask(question, doc_ids=[doc_id])
+            if not answer.content:
+                print("  (no answer found)\n")
+            else:
+                lines = answer.content.split("\n")
+                for line in lines[:3]:
+                    print(f"  {line}")
+                remaining = len(lines) - 3
+                if remaining > 0:
+                    print(f"  ... ({remaining} more lines)")
+                print(f"  confidence: {answer.confidence:.2f}\n")
+        except Exception as e:
+            print(f"  error: {e}\n")
+
+    # Uncomment to remove the document after testing:
+    # await engine.forget(doc_id)
+    # print("Cleaned up.")
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
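+
+
+# Expected answers, worked out by hand from the figures in REPORT above.
+# A reviewer's crib for spot-checking the engine's output, not anything
+# the engine produces; every value below simply restates the report.
+#
+#   Q1: $8.7M, FR-02 in Zurich (the only fridge holding <20mK for the
+#       48-hour characterization cycle Lab B needs)
+#   Q2: Lab A's switch from aluminum to tantalum transmon junctions, which
+#       raised T1 from 98μs to 142μs and pushed Lab C below threshold
+#   Q3: 64 qubits (FR-01) for 11 days
+#   Q4: 0.45meV target - 0.35meV achieved = 0.10meV remaining
+#   Q5: yes; government contracts ($19.8M) alone cover the $9-11M estimate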