From 959b0b34b4a226ede3e84bd205f7d36a7a90a6fd Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 10:23:44 +0800 Subject: [PATCH 01/21] refactor(rust): remove redundant clippy attributes and cleanup module docs Remove unnecessary clippy allow attributes that were making the codebase too permissive with linting rules. Clean up redundant module documentation while preserving the main crate description. --- rust/src/lib.rs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 59756a37..b0212f19 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,13 +1,6 @@ // Copyright (c) 2026 vectorless developers // SPDX-License-Identifier: Apache-2.0 -//! # Vectorless - -// Clippy: allow specific lints that are too noisy for this project -#![allow(clippy::iter_over_hash_type)] -#![allow(clippy::large_enum_variant)] -#![allow(clippy::manual_unwrap_or_default)] - //! # Vectorless //! //! An ultra-performant reasoning-native document intelligence engine for AI. 
From d71e501b603d0bc73e7f74ce9a5a760ab1994512 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 10:52:56 +0800 Subject: [PATCH 02/21] refactor(rust): remove unused fields and methods from index context - Remove is_path, is_content, and is_bytes methods from IndexSource enum - Remove line_count field from IndexedDocument struct - Remove with_line_count method from IndexedDocument implementation - Remove add_page and is_loaded methods from IndexedDocument implementation - Clean up unused functionality in client module --- rust/src/client/index_context.rs | 17 ----------------- rust/src/client/types.rs | 23 ----------------------- 2 files changed, 40 deletions(-) diff --git a/rust/src/client/index_context.rs b/rust/src/client/index_context.rs index 989252b8..223d94ff 100644 --- a/rust/src/client/index_context.rs +++ b/rust/src/client/index_context.rs @@ -65,23 +65,6 @@ pub(crate) enum IndexSource { }, } -impl IndexSource { - /// Check if this is a path source. - pub fn is_path(&self) -> bool { - matches!(self, IndexSource::Path(_)) - } - - /// Check if this is a content source. - pub fn is_content(&self) -> bool { - matches!(self, IndexSource::Content { .. }) - } - - /// Check if this is a bytes source. - pub fn is_bytes(&self) -> bool { - matches!(self, IndexSource::Bytes { .. }) - } -} - // ============================================================ // Index Context // ============================================================ diff --git a/rust/src/client/types.rs b/rust/src/client/types.rs index 4ab82590..a2c94a54 100644 --- a/rust/src/client/types.rs +++ b/rust/src/client/types.rs @@ -37,9 +37,6 @@ pub struct IndexedDocument { /// Page count (for PDFs). pub page_count: Option, - /// Line count (for text files). - pub line_count: Option, - /// The document tree structure. 
pub tree: Option, @@ -63,7 +60,6 @@ impl IndexedDocument { description: None, source_path: None, page_count: None, - line_count: None, tree: None, pages: Vec::new(), metrics: None, @@ -95,12 +91,6 @@ impl IndexedDocument { self } - /// Set the line count. - pub fn with_line_count(mut self, count: usize) -> Self { - self.line_count = Some(count); - self - } - /// Set the document tree. pub fn with_tree(mut self, tree: DocumentTree) -> Self { self.tree = Some(tree); @@ -112,19 +102,6 @@ impl IndexedDocument { self.metrics = Some(metrics); self } - - /// Add a page content. - pub fn add_page(&mut self, page: usize, content: impl Into) { - self.pages.push(PageContent { - page, - content: content.into(), - }); - } - - /// Check if the tree is loaded. - pub fn is_loaded(&self) -> bool { - self.tree.is_some() - } } /// Content for a single page. From 5e4324383e43bac8cb3a24cf2d1b8b98f05b2725 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 11:26:12 +0800 Subject: [PATCH 03/21] refactor(client): remove unused indexer configuration and add validation - Remove IndexerConfig struct and related methods that were no longer used - Add validation checks using new validation utilities for file, content and bytes - Replace direct file existence checks with comprehensive validation - Update tests to reflect configuration removal feat(utils): add validation module for source validation - Create new validation module with SourceValidation struct - Implement validate_file, validate_content, and validate_bytes functions - Add proper error handling and warning messages for various validation scenarios - Include PDF magic number checking and file size warnings refactor(client): simplify workspace client by removing unused features - Remove WorkspaceClientConfig struct and related methods - Delete unused batch_remove functionality - Remove stats, len, and is_empty methods that were not being used - Clean up tests to match simplified interface refactor(client): 
streamline retriever client by removing deprecated methods - Remove RetrieverClientConfig struct and associated configuration methods - Delete find_similar and get_node_context methods that were unused - Remove query method that was replaced by newer implementation - Update clone implementation to match simplified structure refactor(index_context): add directory existence check with warning - Add validation to check if directory exists before processing - Log warning when directory is not found instead of panicking - Maintain backward compatibility for existing functionality --- rust/src/client/index_context.rs | 4 + rust/src/client/indexer.rs | 172 ++++------------------ rust/src/client/retriever.rs | 242 +------------------------------ rust/src/client/workspace.rs | 90 +----------- rust/src/utils/mod.rs | 2 + rust/src/utils/token.rs | 47 ------ rust/src/utils/validation.rs | 196 +++++++++++++++++++++++++ 7 files changed, 231 insertions(+), 522 deletions(-) create mode 100644 rust/src/utils/validation.rs diff --git a/rust/src/client/index_context.rs b/rust/src/client/index_context.rs index 223d94ff..1ba10bac 100644 --- a/rust/src/client/index_context.rs +++ b/rust/src/client/index_context.rs @@ -154,6 +154,10 @@ impl IndexContext { let dir = dir.into(); let supported_extensions = ["md", "pdf"]; + if !dir.exists() { + tracing::warn!("Directory not found: {}", dir.display()); + } + let mut sources = Vec::new(); Self::collect_files(&dir, &supported_extensions, recursive, &mut sources); diff --git a/rust/src/client/indexer.rs b/rust/src/client/indexer.rs index 4d9dd6f0..a441a1ed 100644 --- a/rust/src/client/indexer.rs +++ b/rust/src/client/indexer.rs @@ -49,32 +49,6 @@ pub(crate) struct IndexerClient { /// Event emitter. events: EventEmitter, - - /// Configuration. - config: IndexerConfig, -} - -/// Indexer configuration. -#[derive(Debug, Clone)] -pub struct IndexerConfig { - /// Minimum content tokens required to generate a summary. 
- pub min_summary_tokens: usize, - - /// Whether to generate IDs by default. - pub generate_ids: bool, - - /// Whether to generate descriptions by default. - pub generate_descriptions: bool, -} - -impl Default for IndexerConfig { - fn default() -> Self { - Self { - min_summary_tokens: 20, - generate_ids: true, - generate_descriptions: false, - } - } } impl IndexerClient { @@ -83,7 +57,6 @@ impl IndexerClient { Self { executor_factory: Arc::new(PipelineExecutor::new), events: EventEmitter::new(), - config: IndexerConfig::default(), } } @@ -93,7 +66,6 @@ impl IndexerClient { Self { executor_factory: Arc::new(move || PipelineExecutor::with_llm((*client).clone())), events: EventEmitter::new(), - config: IndexerConfig::default(), } } @@ -103,25 +75,6 @@ impl IndexerClient { self } - /// Create with configuration. - pub fn with_config(mut self, config: IndexerConfig) -> Self { - self.config = config; - self - } - - /// Create from an executor factory function. - pub(crate) fn from_factory( - factory: Arc PipelineExecutor + Send + Sync>, - events: EventEmitter, - config: IndexerConfig, - ) -> Self { - Self { - executor_factory: factory, - events, - config, - } - } - /// Index a document from an index context. 
pub async fn index( &self, @@ -166,8 +119,15 @@ impl IndexerClient { ) -> Result { let path = path.canonicalize().unwrap_or_else(|_| path.to_path_buf()); - if !path.exists() { - return Err(Error::Parse(format!("File not found: {}", path.display()))); + // Validate file before indexing + let validation = crate::utils::validate_file(&path)?; + if !validation.valid { + return Err(Error::Parse( + validation.errors.first().cloned().unwrap_or_else(|| "Invalid file".to_string()), + )); + } + for warning in &validation.warnings { + tracing::warn!("{}", warning); } // Emit start event @@ -206,6 +166,14 @@ impl IndexerClient { options: &IndexOptions, existing_tree: Option<&crate::DocumentTree>, ) -> Result { + // Validate content before indexing + let validation = crate::utils::validate_content(content, format); + if !validation.valid { + return Err(Error::Parse( + validation.errors.first().cloned().unwrap_or_else(|| "Invalid content".to_string()), + )); + } + self.events.emit_index(IndexEvent::Started { path: name.unwrap_or("content").to_string(), }); @@ -235,6 +203,14 @@ impl IndexerClient { options: &IndexOptions, existing_tree: Option<&crate::DocumentTree>, ) -> Result { + // Validate bytes before indexing + let validation = crate::utils::validate_bytes(bytes, format); + if !validation.valid { + return Err(Error::Parse( + validation.errors.first().cloned().unwrap_or_else(|| "Invalid bytes".to_string()), + )); + } + self.events.emit_index(IndexEvent::Started { path: name.unwrap_or("bytes").to_string(), }); @@ -259,15 +235,6 @@ impl IndexerClient { self.build_indexed_document(doc_id, result, format, name, None) } - /// Build pipeline options from client options. - fn build_pipeline_options( - &self, - options: &IndexOptions, - format: DocumentFormat, - ) -> PipelineOptions { - self.build_pipeline_options_with_existing(options, format, None) - } - /// Build pipeline options with optional existing tree for incremental updates. 
fn build_pipeline_options_with_existing( &self, @@ -352,63 +319,6 @@ impl IndexerClient { .ok_or_else(|| Error::Parse(format!("Unsupported format: {}", ext))) } - /// Validate a document before indexing. - /// - /// # Errors - /// - /// Returns an error if the file doesn't exist or is not readable. - pub fn validate(&self, path: impl AsRef) -> Result { - let path = path.as_ref(); - - if !path.exists() { - return Ok(ValidationResult { - valid: false, - errors: vec![format!("File not found: {}", path.display())], - warnings: vec![], - format: None, - estimated_size: 0, - }); - } - - let metadata = std::fs::metadata(path) - .map_err(|e| Error::Parse(format!("Cannot read file metadata: {}", e)))?; - - let estimated_size = metadata.len() as usize; - let mut warnings = Vec::new(); - - // Check file size - if estimated_size > 100 * 1024 * 1024 { - warnings.push("Large file (>100MB) may take longer to index".to_string()); - } - - // Detect format - let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); - let format = DocumentFormat::from_extension(ext); - - if format.is_none() { - return Ok(ValidationResult { - valid: false, - errors: vec![format!("Unsupported format: {}", ext)], - warnings, - format: None, - estimated_size, - }); - } - - Ok(ValidationResult { - valid: true, - errors: vec![], - warnings, - format, - estimated_size, - }) - } - - /// Convert IndexedDocument to PersistedDocument for storage. - pub fn to_persisted(&self, doc: IndexedDocument) -> PersistedDocument { - self.to_persisted_with_options(doc, &PipelineOptions::default()) - } - /// Convert IndexedDocument to PersistedDocument, storing fingerprints from pipeline options. pub fn to_persisted_with_options( &self, @@ -466,30 +376,10 @@ impl Clone for IndexerClient { Self { executor_factory: Arc::clone(&self.executor_factory), events: self.events.clone(), - config: self.config.clone(), } } } -/// Document validation result. 
-#[derive(Debug, Clone)] -pub(crate) struct ValidationResult { - /// Whether the document is valid for indexing. - pub valid: bool, - - /// Validation errors (prevents indexing). - pub errors: Vec, - - /// Validation warnings (non-blocking). - pub warnings: Vec, - - /// Detected document format. - pub format: Option, - - /// Estimated file size in bytes. - pub estimated_size: usize, -} - #[cfg(test)] mod tests { use super::*; @@ -498,16 +388,6 @@ mod tests { fn test_indexer_client_creation() { let executor = PipelineExecutor::new(); let client = IndexerClient::new(executor); - assert_eq!(client.config.min_summary_tokens, 20); - } - - #[test] - fn test_validate_missing_file() { - let executor = PipelineExecutor::new(); - let client = IndexerClient::new(executor); - - let result = client.validate("./nonexistent.md").unwrap(); - assert!(!result.valid); - assert!(!result.errors.is_empty()); + let _ = client; } } diff --git a/rust/src/client/retriever.rs b/rust/src/client/retriever.rs index 29c0e0d4..c870a65a 100644 --- a/rust/src/client/retriever.rs +++ b/rust/src/client/retriever.rs @@ -24,11 +24,10 @@ use tracing::info; use super::events::{EventEmitter, QueryEvent}; use super::types::QueryResultItem; use crate::config::Config; -use crate::document::{DocumentTree, NodeId, ReasoningIndex}; +use crate::document::{DocumentTree, ReasoningIndex}; use crate::error::{Error, Result}; -use crate::retrieval::content::ContentAggregatorConfig; use crate::retrieval::stream::RetrieveEventReceiver; -use crate::retrieval::{RetrievalResult, RetrieveOptions, RetrieveResponse}; +use crate::retrieval::{RetrieveOptions, RetrieveResponse}; /// Document retrieval client. /// @@ -47,33 +46,6 @@ pub(crate) struct RetrieverClient { default_options: RetrieveOptions, } -/// Retriever configuration. -#[derive(Debug, Clone)] -pub(crate) struct RetrieverClientConfig { - /// Default top_k for retrieval. - pub default_top_k: usize, - - /// Default token budget. 
- pub default_token_budget: usize, - - /// Content aggregator config. - pub content_config: Option, - - /// Enable result caching. - pub enable_cache: bool, -} - -impl Default for RetrieverClientConfig { - fn default() -> Self { - Self { - default_top_k: 5, - default_token_budget: 4000, - content_config: None, - enable_cache: true, - } - } -} - impl RetrieverClient { /// Create a new retriever client. pub fn new(retriever: crate::retrieval::PipelineRetriever, config: Arc) -> Self { @@ -91,44 +63,6 @@ impl RetrieverClient { self } - /// Create with configuration. - pub fn with_config(mut self, config: RetrieverClientConfig) -> Self { - self.default_options = RetrieveOptions::new() - .with_top_k(config.default_top_k) - .with_max_tokens(config.default_token_budget) - .with_enable_cache(config.enable_cache); - self - } - - /// Create from existing retriever Arc. - pub(crate) fn from_arc( - retriever: Arc, - config: Arc, - events: EventEmitter, - ) -> Self { - Self { - retriever, - config, - events, - default_options: RetrieveOptions::default(), - } - } - - /// Query a document tree. - /// - /// # Errors - /// - /// Returns an error if the retrieval pipeline fails. - pub async fn query( - &self, - tree: &DocumentTree, - question: &str, - options: &RetrieveOptions, - ) -> Result { - self.query_with_reasoning_index(tree, question, options, None) - .await - } - /// Query a document tree with optional reasoning index for fast-path lookup. /// /// # Errors @@ -269,165 +203,6 @@ impl RetrieverClient { score: response.confidence, } } - - /// Get similar nodes to a given node. - /// - /// Uses tree structure and content to find similar nodes. 
- pub fn find_similar( - &self, - tree: &DocumentTree, - node_id: NodeId, - top_k: usize, - ) -> Result> { - let mut results = Vec::new(); - - // Get the target node's content for comparison - let target_content = tree - .get(node_id) - .map(|n| n.content.clone()) - .unwrap_or_default(); - - if target_content.is_empty() { - return Ok(results); - } - - // Extract keywords from target content - let target_keywords = self.extract_keywords(&target_content); - - // Search all nodes for similarity - let root = tree.root(); - let mut stack = vec![root]; - - while let Some(current_id) = stack.pop() { - if current_id == node_id { - // Skip the target node itself - stack.extend(tree.children(current_id)); - continue; - } - - if let Some(node) = tree.get(current_id) { - let node_keywords = self.extract_keywords(&node.content); - let similarity = self.calculate_similarity(&target_keywords, &node_keywords); - - if similarity > 0.3 { - results.push( - RetrievalResult::new(&node.title) - .with_node_id(format!("{:?}", current_id)) - .with_content(node.content.clone()) - .with_score(similarity) - .with_depth(tree.depth(current_id)), - ); - } - } - - stack.extend(tree.children(current_id)); - } - - // Sort by score and take top_k - results.sort_by(|a, b| { - b.score - .partial_cmp(&a.score) - .unwrap_or(std::cmp::Ordering::Equal) - }); - results.truncate(top_k); - - Ok(results) - } - - /// Extract keywords from content. - fn extract_keywords(&self, content: &str) -> Vec { - content - .to_lowercase() - .split_whitespace() - .filter(|w| w.len() > 3) - .take(20) - .map(|s| s.to_string()) - .collect() - } - - /// Calculate similarity between keyword sets. 
- fn calculate_similarity(&self, set1: &[String], set2: &[String]) -> f32 { - if set1.is_empty() || set2.is_empty() { - return 0.0; - } - - let set1_set: std::collections::HashSet<_> = set1.iter().collect(); - let set2_set: std::collections::HashSet<_> = set2.iter().collect(); - - let intersection = set1_set.intersection(&set2_set).count(); - let union = set1_set.union(&set2_set).count(); - - intersection as f32 / union as f32 - } - - /// Get node context (ancestors and siblings). - /// - /// Returns the node's ancestors up to the specified depth, - /// along with sibling nodes at each level. - pub fn get_node_context( - &self, - tree: &DocumentTree, - node_id: NodeId, - ancestor_depth: usize, - ) -> Result { - let mut ancestors = Vec::new(); - let mut siblings = Vec::new(); - - // Get ancestors - let mut current_id = Some(node_id); - let mut depth = 0; - - while let Some(id) = current_id { - if depth >= ancestor_depth { - break; - } - - if let Some(node) = tree.get(id) { - ancestors.push( - RetrievalResult::new(&node.title) - .with_node_id(format!("{:?}", id)) - .with_depth(tree.depth(id)), - ); - - // Get siblings at this level - if let Some(parent_id) = tree.parent(id) { - for child_id in tree.children(parent_id) { - if child_id != id { - if let Some(sibling) = tree.get(child_id) { - siblings.push( - RetrievalResult::new(&sibling.title) - .with_node_id(format!("{:?}", child_id)) - .with_depth(tree.depth(child_id)), - ); - } - } - } - } - } - - current_id = tree.parent(id); - depth += 1; - } - - // Get the target node - let target = tree.get(node_id).map(|n| { - RetrievalResult::new(&n.title) - .with_node_id(format!("{:?}", node_id)) - .with_content(n.content.clone()) - .with_depth(tree.depth(node_id)) - }); - - Ok(NodeContext { - target, - ancestors, - siblings, - }) - } - - /// Get the underlying retriever Arc. 
- pub(crate) fn inner(&self) -> Arc { - Arc::clone(&self.retriever) - } } impl Clone for RetrieverClient { @@ -441,19 +216,6 @@ impl Clone for RetrieverClient { } } -/// Node context information. -#[derive(Debug, Clone)] -pub(crate) struct NodeContext { - /// The target node. - pub target: Option, - - /// Ancestor nodes (ordered from parent to root). - pub ancestors: Vec, - - /// Sibling nodes at each ancestor level. - pub siblings: Vec, -} - #[cfg(test)] mod tests { use super::*; diff --git a/rust/src/client/workspace.rs b/rust/src/client/workspace.rs index 061533ed..d700e064 100644 --- a/rust/src/client/workspace.rs +++ b/rust/src/client/workspace.rs @@ -49,28 +49,6 @@ pub(crate) struct WorkspaceClient { /// Event emitter. events: EventEmitter, - - /// Configuration. - config: WorkspaceClientConfig, -} - -/// Workspace client configuration. -#[derive(Debug, Clone)] -pub(crate) struct WorkspaceClientConfig { - /// Auto-save interval in seconds (None = disabled). - pub auto_save_interval: Option, - - /// Enable verbose logging. - pub verbose: bool, -} - -impl Default for WorkspaceClientConfig { - fn default() -> Self { - Self { - auto_save_interval: None, - verbose: false, - } - } } impl WorkspaceClient { @@ -79,7 +57,6 @@ impl WorkspaceClient { Self { workspace: Arc::new(workspace), events: EventEmitter::new(), - config: WorkspaceClientConfig::default(), } } @@ -89,21 +66,6 @@ impl WorkspaceClient { self } - /// Create with configuration. - pub fn with_config(mut self, config: WorkspaceClientConfig) -> Self { - self.config = config; - self - } - - /// Create from an existing workspace Arc. - pub(crate) fn from_arc(workspace: Arc, events: EventEmitter) -> Self { - Self { - workspace, - events, - config: WorkspaceClientConfig::default(), - } - } - /// Save a document to the workspace. /// /// # Errors @@ -221,32 +183,6 @@ impl WorkspaceClient { })) } - /// Remove multiple documents from the workspace. - /// - /// Returns the number of documents successfully removed. 
- /// - /// # Errors - /// - /// Returns an error if the workspace write fails. - pub async fn batch_remove(&self, doc_ids: &[&str]) -> Result { - let mut removed = 0; - - for doc_id in doc_ids { - if self.workspace.remove(doc_id).await? { - removed += 1; - self.events.emit_workspace(WorkspaceEvent::Removed { - doc_id: doc_id.to_string(), - }); - } - } - - if removed > 0 { - info!("Batch removed {} documents", removed); - } - - Ok(removed) - } - /// Clear all documents from the workspace. /// /// Returns the number of documents removed. @@ -271,23 +207,6 @@ impl WorkspaceClient { Ok(count) } - /// Get workspace statistics. - pub async fn stats(&self) -> Result { - Ok(WorkspaceStats { - document_count: self.workspace.len().await, - }) - } - - /// Get the number of documents in the workspace. - pub async fn len(&self) -> usize { - self.workspace.len().await - } - - /// Check if the workspace is empty. - pub async fn is_empty(&self) -> bool { - self.workspace.is_empty().await - } - /// Get the underlying workspace Arc (for advanced use). pub(crate) fn inner(&self) -> Arc { Arc::clone(&self.workspace) @@ -309,11 +228,4 @@ impl WorkspaceClient { pub async fn set_graph(&self, graph: &crate::graph::DocumentGraph) -> Result<()> { self.workspace.set_graph(graph).await } -} - -/// Workspace statistics. -#[derive(Debug, Clone)] -pub(crate) struct WorkspaceStats { - /// Number of documents in the workspace. 
- pub document_count: usize, -} +} \ No newline at end of file diff --git a/rust/src/utils/mod.rs b/rust/src/utils/mod.rs index c6fd9b17..624c707d 100644 --- a/rust/src/utils/mod.rs +++ b/rust/src/utils/mod.rs @@ -13,5 +13,7 @@ pub mod fingerprint; mod format; mod timing; mod token; +pub mod validation; pub use token::estimate_tokens; +pub use validation::{validate_bytes, validate_content, validate_file}; diff --git a/rust/src/utils/token.rs b/rust/src/utils/token.rs index 390f20cf..9e23ea85 100644 --- a/rust/src/utils/token.rs +++ b/rust/src/utils/token.rs @@ -46,33 +46,6 @@ pub fn estimate_tokens(text: &str) -> usize { get_bpe().encode_with_special_tokens(text).len() } -/// Estimate token count with a simple character-based approximation. -/// -/// This is faster but less accurate. Use when you don't need exact counts. -/// Approximation: ~4 characters per token for English text. -/// -/// # Example -/// -/// ``` -/// use vectorless::estimate_tokens_fast; -/// -/// assert_eq!(estimate_tokens_fast(""), 0); -/// assert_eq!(estimate_tokens_fast("hi"), 1); // 2 chars -> 1 token min -/// assert_eq!(estimate_tokens_fast("hello world"), 3); // 11 chars / 4 = 2.75 -> 3 -/// ``` -pub fn estimate_tokens_fast(text: &str) -> usize { - if text.is_empty() { - return 0; - } - // Use ceiling division for better accuracy - (text.len() + 3) / 4 -} - -/// Count tokens in multiple texts. 
-pub fn estimate_tokens_batch(texts: &[&str]) -> usize { - texts.iter().map(|t| estimate_tokens(t)).sum() -} - #[cfg(test)] mod tests { use super::*; @@ -88,24 +61,4 @@ mod tests { let count = estimate_tokens("hello world"); assert!(count >= 2, "Expected at least 2 tokens, got {}", count); } - - #[test] - fn test_estimate_tokens_fast_empty() { - assert_eq!(estimate_tokens_fast(""), 0); - } - - #[test] - fn test_estimate_tokens_fast_simple() { - assert_eq!(estimate_tokens_fast("hi"), 1); // 2 chars, (2+3)/4 = 1 - assert_eq!(estimate_tokens_fast("hello world"), 3); // 11 chars, (11+3)/4 = 3 - assert_eq!(estimate_tokens_fast(&"a".repeat(100)), 25); // 100 chars, (100+3)/4 = 25 - } - - #[test] - fn test_estimate_tokens_batch() { - let texts = vec!["hello", "world"]; - let batch_count = estimate_tokens_batch(&texts); - let individual_count = estimate_tokens("hello") + estimate_tokens("world"); - assert_eq!(batch_count, individual_count); - } } diff --git a/rust/src/utils/validation.rs b/rust/src/utils/validation.rs new file mode 100644 index 00000000..fc18aee5 --- /dev/null +++ b/rust/src/utils/validation.rs @@ -0,0 +1,196 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! Source validation utilities for indexing. + +use std::path::Path; + +use crate::error::{Error, Result}; +use crate::index::parse::DocumentFormat; + +/// Maximum file size before emitting a warning (100 MB). +const LARGE_FILE_THRESHOLD: usize = 100 * 1024 * 1024; + +/// Result of validating a source before indexing. +#[derive(Debug, Clone)] +pub struct SourceValidation { + /// Whether the source is valid for indexing. + pub valid: bool, + + /// Validation errors (prevents indexing). + pub errors: Vec, + + /// Validation warnings (non-blocking). 
+ pub warnings: Vec, +} + +impl SourceValidation { + fn valid() -> Self { + Self { + valid: true, + errors: vec![], + warnings: vec![], + } + } + + fn invalid(errors: Vec) -> Self { + Self { + valid: false, + errors, + warnings: vec![], + } + } + + fn with_warnings(mut self, warnings: Vec) -> Self { + self.warnings = warnings; + self + } +} + +/// Validate a file path for indexing. +/// +/// Checks: exists, readable, supported format, size. +pub fn validate_file(path: &Path) -> Result { + if !path.exists() { + return Ok(SourceValidation::invalid(vec![format!( + "File not found: {}", + path.display() + )])); + } + + let metadata = std::fs::metadata(path) + .map_err(|e| Error::Parse(format!("Cannot read file metadata: {}", e)))?; + + let size = metadata.len() as usize; + let mut warnings = Vec::new(); + + if size > LARGE_FILE_THRESHOLD { + warnings.push(format!( + "Large file ({}MB) may take longer to index", + size / (1024 * 1024) + )); + } + + let ext = path.extension().and_then(|e| e.to_str()).unwrap_or(""); + if DocumentFormat::from_extension(ext).is_none() { + return Ok(SourceValidation::invalid(vec![format!( + "Unsupported format: .{}", + ext + )]) + .with_warnings(warnings)); + } + + Ok(SourceValidation::valid().with_warnings(warnings)) +} + +/// Validate content string for indexing. +/// +/// Checks: non-empty. +pub fn validate_content(content: &str, _format: DocumentFormat) -> SourceValidation { + let mut errors = Vec::new(); + + if content.trim().is_empty() { + errors.push("Content is empty".to_string()); + } + + if errors.is_empty() { + SourceValidation::valid() + } else { + SourceValidation::invalid(errors) + } +} + +/// Validate binary data for indexing. +/// +/// Checks: non-empty, PDF magic number. 
+pub fn validate_bytes(data: &[u8], format: DocumentFormat) -> SourceValidation { + let mut errors = Vec::new(); + + if data.is_empty() { + errors.push("Byte data is empty".to_string()); + } + + // PDF magic number check + if format == DocumentFormat::Pdf && !data.is_empty() { + if !data.starts_with(b"%PDF") { + errors.push("Data does not appear to be a valid PDF (missing %PDF header)".to_string()); + } + } + + if errors.is_empty() { + SourceValidation::valid() + } else { + SourceValidation::invalid(errors) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_validate_file_missing() { + let result = validate_file(Path::new("./nonexistent.md")).unwrap(); + assert!(!result.valid); + assert!(result.errors[0].contains("not found")); + } + + #[test] + fn test_validate_file_unsupported_format() { + let tmp = std::env::temp_dir().join("vectorless_test_validate.dat"); + std::fs::write(&tmp, b"data").unwrap(); + let result = validate_file(&tmp).unwrap(); + assert!(!result.valid); + assert!(result.errors[0].contains("Unsupported")); + let _ = std::fs::remove_file(&tmp); + } + + #[test] + fn test_validate_file_valid() { + let tmp = std::env::temp_dir().join("vectorless_test_validate.md"); + std::fs::write(&tmp, b"# Hello").unwrap(); + let result = validate_file(&tmp).unwrap(); + assert!(result.valid); + assert!(result.errors.is_empty()); + let _ = std::fs::remove_file(&tmp); + } + + #[test] + fn test_validate_content_empty() { + let result = validate_content(" \n ", DocumentFormat::Markdown); + assert!(!result.valid); + assert!(result.errors[0].contains("empty")); + } + + #[test] + fn test_validate_content_valid() { + let result = validate_content("# Hello", DocumentFormat::Markdown); + assert!(result.valid); + } + + #[test] + fn test_validate_bytes_empty() { + let result = validate_bytes(&[], DocumentFormat::Pdf); + assert!(!result.valid); + assert!(result.errors[0].contains("empty")); + } + + #[test] + fn test_validate_bytes_invalid_pdf() { + let result 
= validate_bytes(b"not a pdf", DocumentFormat::Pdf); + assert!(!result.valid); + assert!(result.errors[0].contains("PDF")); + } + + #[test] + fn test_validate_bytes_valid_pdf() { + let result = validate_bytes(b"%PDF-1.4 some content", DocumentFormat::Pdf); + assert!(result.valid); + } + + #[test] + fn test_validate_bytes_valid_markdown() { + let result = validate_bytes(b"# Hello", DocumentFormat::Markdown); + assert!(result.valid); + } +} From 5f4cee786a411f17b411443b06a5e6d12e105c13 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 11:58:10 +0800 Subject: [PATCH 04/21] refactor: remove workspace parameter from Engine constructor BREAKING CHANGE: The workspace parameter has been removed from the Engine constructor across all examples and implementations. The engine now uses an automatic workspace path based on the current working directory. The workspace is now automatically determined as: - Linux/macOS: ~/.vectorless/workspaces/{cwd_hash}/ - Windows: %APPDATA%\vectorless\workspaces\{cwd_hash}\ This change affects both Python and Rust implementations, removing the need for explicit workspace configuration while maintaining isolated workspaces for different projects. 
--- examples/batch_indexing/main.py | 3 -- examples/document_management/main.py | 3 -- examples/error_handling/main.py | 4 -- examples/index_directory/main.py | 1 - examples/index_metrics/main.py | 3 -- examples/indexing/main.py | 2 - examples/pdf_indexing/main.py | 3 -- python/src/lib.rs | 10 +---- rust/examples/events.rs | 1 - rust/examples/flow.rs | 1 - rust/examples/graph.rs | 1 - rust/examples/index_directory.rs | 1 - rust/examples/index_incremental.rs | 1 - rust/examples/index_pdf.rs | 1 - rust/examples/index_single.rs | 1 - rust/examples/indexing.rs | 1 - rust/src/client/builder.rs | 62 +++------------------------- rust/src/client/engine.rs | 9 ++-- rust/src/client/mod.rs | 3 +- rust/src/config/merge.rs | 4 +- rust/src/config/types/storage.rs | 62 +++++++++++++++++++++++++++- rust/src/lib.rs | 1 - 22 files changed, 77 insertions(+), 101 deletions(-) diff --git a/examples/batch_indexing/main.py b/examples/batch_indexing/main.py index 7d6d03cb..c68b3626 100644 --- a/examples/batch_indexing/main.py +++ b/examples/batch_indexing/main.py @@ -22,8 +22,6 @@ API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -WORKSPACE = "./workspace" - # Sample documents for demonstration DOCS = { "alpha.md": """\ @@ -81,7 +79,6 @@ def write_sample_docs(base_dir: str) -> list[str]: async def main() -> None: engine = Engine( - workspace=WORKSPACE, api_key=API_KEY, model=MODEL, endpoint=ENDPOINT, diff --git a/examples/document_management/main.py b/examples/document_management/main.py index f5d72360..972d9a44 100644 --- a/examples/document_management/main.py +++ b/examples/document_management/main.py @@ -21,8 +21,6 @@ API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -WORKSPACE = "./workspace" - # Sample documents SAMPLE_A = """\ # Project Alpha @@ -57,7 +55,6 
@@ async def main() -> None: engine = Engine( - workspace=WORKSPACE, api_key=API_KEY, model=MODEL, endpoint=ENDPOINT, diff --git a/examples/error_handling/main.py b/examples/error_handling/main.py index 993814a6..832ad360 100644 --- a/examples/error_handling/main.py +++ b/examples/error_handling/main.py @@ -21,12 +21,9 @@ API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -WORKSPACE = "./workspace" - async def main() -> None: engine = Engine( - workspace=WORKSPACE, api_key=API_KEY, model=MODEL, endpoint=ENDPOINT, @@ -88,7 +85,6 @@ async def main() -> None: print("--- Engine with invalid credentials ---") try: bad_engine = Engine( - workspace=WORKSPACE + "_bad", api_key="sk-invalid-key-12345", model="gpt-4o", ) diff --git a/examples/index_directory/main.py b/examples/index_directory/main.py index f2446215..08b1c3bd 100644 --- a/examples/index_directory/main.py +++ b/examples/index_directory/main.py @@ -34,7 +34,6 @@ async def main(): endpoint = os.environ.get("LLM_ENDPOINT", "http://localhost:4000/api/v1") engine = Engine( - workspace="./workspace_directory_example", api_key=api_key, model=model, endpoint=endpoint, diff --git a/examples/index_metrics/main.py b/examples/index_metrics/main.py index 3bff91cb..bfea4cf0 100644 --- a/examples/index_metrics/main.py +++ b/examples/index_metrics/main.py @@ -26,8 +26,6 @@ API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -WORKSPACE = "./workspace" - # --- Sample documents with varying complexity --- SIMPLE_DOC = """\ # Quick Note @@ -139,7 +137,6 @@ def print_full_report(item: IndexItem) -> None: async def main() -> None: engine = Engine( - workspace=WORKSPACE, api_key=API_KEY, model=MODEL, endpoint=ENDPOINT, diff --git a/examples/indexing/main.py b/examples/indexing/main.py index fd507fdd..fe2e2824 
100644 --- a/examples/indexing/main.py +++ b/examples/indexing/main.py @@ -16,13 +16,11 @@ # Replace with your own credentials API_KEY = "sk-..." MODEL = "gpt-4o" -WORKSPACE = "./workspace" async def main(): # --- 1. Create engine --- engine = Engine( - workspace=WORKSPACE, api_key=API_KEY, model=MODEL, ) diff --git a/examples/pdf_indexing/main.py b/examples/pdf_indexing/main.py index e79b6db5..d194c474 100644 --- a/examples/pdf_indexing/main.py +++ b/examples/pdf_indexing/main.py @@ -26,8 +26,6 @@ API_KEY = os.environ.get("VECTORLESS_API_KEY", "sk-...") MODEL = os.environ.get("VECTORLESS_MODEL", "gpt-4o") ENDPOINT = os.environ.get("VECTORLESS_ENDPOINT", None) -WORKSPACE = "./workspace" - # Resolve the sample PDF path relative to the repo root SAMPLE_PDF = os.path.join( os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), @@ -70,7 +68,6 @@ async def main() -> None: sys.exit(1) engine = Engine( - workspace=WORKSPACE, api_key=API_KEY, model=MODEL, endpoint=ENDPOINT, diff --git a/python/src/lib.rs b/python/src/lib.rs index 640b1024..5138daed 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -1484,7 +1484,6 @@ fn run_metrics_report(engine: Arc) -> PyMetricsReport { /// from vectorless import Engine, IndexContext, QueryContext /// /// engine = Engine( -/// workspace="./data", /// api_key="sk-...", /// model="gpt-4o", /// ) @@ -1507,7 +1506,6 @@ impl PyEngine { /// Create a new Engine. /// /// Args: - /// workspace: Path to the workspace directory. /// config_path: Path to configuration file (optional). /// api_key: **Required**. LLM API key. /// model: **Required**. LLM model name. @@ -1516,9 +1514,8 @@ impl PyEngine { /// Raises: /// VectorlessError: If engine creation fails. 
#[new] - #[pyo3(signature = (workspace=None, config_path=None, api_key=None, model=None, endpoint=None))] + #[pyo3(signature = (config_path=None, api_key=None, model=None, endpoint=None))] fn new( - workspace: Option, config_path: Option, api_key: Option, model: Option, @@ -1537,9 +1534,6 @@ impl PyEngine { if let Some(path) = &config_path { builder = builder.with_config_path(path); } - if let Some(ws) = &workspace { - builder = builder.with_workspace(ws); - } if let Some(m) = &model { builder = builder.with_model(m); } @@ -1661,7 +1655,7 @@ impl PyEngine { /// ```python /// from vectorless import Engine, IndexContext, QueryContext /// -/// engine = Engine(workspace="./data", api_key="sk-...", model="gpt-4o") +/// engine = Engine(api_key="sk-...", model="gpt-4o") /// result = await engine.index(IndexContext.from_path("./report.pdf")) /// answer = await engine.query(QueryContext("What is the revenue?").with_doc_id(result.doc_id)) /// print(answer.single().content) diff --git a/rust/examples/events.rs b/rust/examples/events.rs index a0cefbb2..c12f56d0 100644 --- a/rust/examples/events.rs +++ b/rust/examples/events.rs @@ -108,7 +108,6 @@ async fn main() -> Result<(), Box> { // 2. 
Create engine with events println!("Step 2: Creating engine with event emitter..."); let engine = EngineBuilder::new() - .with_workspace("./workspace_events_example") .with_key(&api_key) .with_model(&model) .with_endpoint(&endpoint) diff --git a/rust/examples/flow.rs b/rust/examples/flow.rs index 758ddbe3..8babb615 100644 --- a/rust/examples/flow.rs +++ b/rust/examples/flow.rs @@ -69,7 +69,6 @@ async fn main() -> vectorless::Result<()> { println!("Step 1: Creating Vectorless client..."); let engine = EngineBuilder::new() - .with_workspace("./workspace_flow_example") .with_key(&api_key) .with_model(&model) .with_endpoint(&endpoint) diff --git a/rust/examples/graph.rs b/rust/examples/graph.rs index 940bf7ee..5fccd084 100644 --- a/rust/examples/graph.rs +++ b/rust/examples/graph.rs @@ -34,7 +34,6 @@ async fn main() -> vectorless::Result<()> { // 1. Create engine let engine = EngineBuilder::new() - .with_workspace("./workspace_graph_example") .with_key(&api_key) .with_model(&model) .build() diff --git a/rust/examples/index_directory.rs b/rust/examples/index_directory.rs index 289cb8a2..5b1681f6 100644 --- a/rust/examples/index_directory.rs +++ b/rust/examples/index_directory.rs @@ -38,7 +38,6 @@ async fn main() -> vectorless::Result<()> { .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); let engine = EngineBuilder::new() - .with_workspace("./workspace_directory_example") .with_key(&api_key) .with_model(&model) .with_endpoint(&endpoint) diff --git a/rust/examples/index_incremental.rs b/rust/examples/index_incremental.rs index b85a01e9..6500a992 100644 --- a/rust/examples/index_incremental.rs +++ b/rust/examples/index_incremental.rs @@ -28,7 +28,6 @@ async fn main() -> vectorless::Result<()> { .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); let engine = EngineBuilder::new() - .with_workspace("./workspace_incremental_example") .with_key(&api_key) .with_model(&model) .with_endpoint(&endpoint) diff --git a/rust/examples/index_pdf.rs 
b/rust/examples/index_pdf.rs index d8d8b57c..0f9ae607 100644 --- a/rust/examples/index_pdf.rs +++ b/rust/examples/index_pdf.rs @@ -62,7 +62,6 @@ async fn main() -> vectorless::Result<()> { ); let engine = EngineBuilder::new() - .with_workspace("./workspace_pdf_example") .with_key(&api_key) .with_model(&model) .with_endpoint(&endpoint) diff --git a/rust/examples/index_single.rs b/rust/examples/index_single.rs index 623b4cb3..edaa2460 100644 --- a/rust/examples/index_single.rs +++ b/rust/examples/index_single.rs @@ -28,7 +28,6 @@ async fn main() -> vectorless::Result<()> { .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); let engine = EngineBuilder::new() - .with_workspace("./workspace_single_example") .with_key(&api_key) .with_model(&model) .with_endpoint(&endpoint) diff --git a/rust/examples/indexing.rs b/rust/examples/indexing.rs index ee77e5f2..fe78c254 100644 --- a/rust/examples/indexing.rs +++ b/rust/examples/indexing.rs @@ -28,7 +28,6 @@ async fn main() -> vectorless::Result<()> { .unwrap_or_else(|_| "http://localhost:4000/api/v1".to_string()); let engine = EngineBuilder::new() - .with_workspace("./workspace_batch_example") .with_key(&api_key) .with_model(&model) .with_endpoint(&endpoint) diff --git a/rust/src/client/builder.rs b/rust/src/client/builder.rs index d042d6aa..84856fed 100644 --- a/rust/src/client/builder.rs +++ b/rust/src/client/builder.rs @@ -24,7 +24,6 @@ //! # #[tokio::main] //! # async fn main() -> Result<(), vectorless::BuildError> { //! let engine = EngineBuilder::new() -//! .with_workspace("./data") //! .with_key("sk-...") //! .with_model("gpt-4o") //! .build() @@ -41,7 +40,6 @@ //! # #[tokio::main] //! # async fn main() -> Result<(), vectorless::BuildError> { //! let engine = EngineBuilder::new() -//! .with_workspace("./data") //! .with_key("sk-...") //! .with_model("deepseek-chat") //! .with_endpoint("https://api.deepseek.com/v1") @@ -51,8 +49,6 @@ //! # } //! 
``` -use std::path::PathBuf; - use crate::config::{Config, ConfigLoader, RetrievalConfig}; use crate::memo::MemoStore; use crate::retrieval::PipelineRetriever; @@ -74,7 +70,6 @@ use super::events::EventEmitter; /// # #[tokio::main] /// # async fn main() -> Result<(), vectorless::BuildError> { /// let client = EngineBuilder::new() -/// .with_workspace("./my_workspace") /// .with_key("sk-...") /// .with_model("gpt-4o") /// .build() @@ -84,11 +79,8 @@ use super::events::EventEmitter; /// ``` #[derive(Debug)] pub struct EngineBuilder { - /// Workspace path. - workspace: Option, - /// Configuration file path. - config_path: Option, + config_path: Option, /// Custom configuration. config: Option, @@ -126,7 +118,6 @@ impl EngineBuilder { #[must_use] pub fn new() -> Self { Self { - workspace: None, config_path: None, config: None, retrieval_config: None, @@ -145,36 +136,11 @@ impl EngineBuilder { // Basic Configuration // ============================================================ - /// Set the workspace path for document persistence. - /// - /// The workspace stores indexed documents and metadata. - /// If not set, defaults to `./workspace` or the value in config. - /// - /// # Example - /// - /// ```rust,no_run - /// use vectorless::client::EngineBuilder; - /// - /// # #[tokio::main] - /// # async fn main() -> Result<(), vectorless::BuildError> { - /// let engine = EngineBuilder::new() - /// .with_workspace("./data") - /// .build() - /// .await?; - /// # Ok(()) - /// # } - /// ``` - #[must_use] - pub fn with_workspace(mut self, path: impl Into) -> Self { - self.workspace = Some(path.into()); - self - } - /// Set the configuration file path. /// /// The file must be a valid TOML configuration. No auto-detection is performed. 
#[must_use] - pub fn with_config_path(mut self, path: impl Into) -> Self { + pub fn with_config_path(mut self, path: impl Into) -> Self { self.config_path = Some(path.into()); self } @@ -222,7 +188,8 @@ impl EngineBuilder { /// .with_model("gpt-4o"); /// /// let engine = EngineBuilder::new() - /// .with_workspace("./data") + /// .with_key("sk-...") + /// .with_model("gpt-4o") /// .with_memo_store(memo_store) /// .build() /// .await?; @@ -249,7 +216,6 @@ impl EngineBuilder { /// # #[tokio::main] /// # async fn main() -> Result<(), vectorless::BuildError> { /// let engine = EngineBuilder::new() - /// .with_workspace("./data") /// .with_key("sk-...") /// .build() /// .await?; @@ -274,7 +240,6 @@ impl EngineBuilder { /// # #[tokio::main] /// # async fn main() -> Result<(), vectorless::BuildError> { /// let engine = EngineBuilder::new() - /// .with_workspace("./data") /// .with_model("gpt-4o-mini") /// .build() /// .await?; @@ -299,7 +264,6 @@ impl EngineBuilder { /// # #[tokio::main] /// # async fn main() -> Result<(), vectorless::BuildError> { /// let engine = EngineBuilder::new() - /// .with_workspace("./data") /// .with_model("deepseek-chat") /// .with_endpoint("https://api.deepseek.com/v1") /// .build() @@ -375,7 +339,6 @@ impl EngineBuilder { /// # #[tokio::main] /// # async fn main() -> Result<(), vectorless::BuildError> { /// let engine = EngineBuilder::new() - /// .with_workspace("./data") /// .with_key("sk-...") /// .with_model("gpt-4o") /// .build() @@ -442,13 +405,8 @@ impl EngineBuilder { return Err(BuildError::MissingModel); } - // Open workspace: prefer explicit path, fallback to config - let workspace_path = self - .workspace - .as_ref() - .unwrap_or(&config.storage.workspace_dir); - - let workspace = Workspace::new(workspace_path) + // Open workspace from config + let workspace = Workspace::new(&config.storage.workspace_dir) .await .map_err(|e| BuildError::Workspace(e.to_string()))?; @@ -547,18 +505,10 @@ mod tests { #[test] fn test_builder_defaults() { 
let builder = EngineBuilder::new(); - assert!(builder.workspace.is_none()); assert!(!builder.fast_mode); assert!(!builder.precise_mode); } - #[test] - fn test_builder_with_workspace() { - let builder = EngineBuilder::new().with_workspace("./test_workspace"); - - assert_eq!(builder.workspace, Some(PathBuf::from("./test_workspace"))); - } - #[test] fn test_builder_with_key() { let builder = EngineBuilder::new().with_key("sk-test-key"); diff --git a/rust/src/client/engine.rs b/rust/src/client/engine.rs index 94cbcfb4..85addf0e 100644 --- a/rust/src/client/engine.rs +++ b/rust/src/client/engine.rs @@ -17,7 +17,8 @@ //! # #[tokio::main] //! # async fn main() -> Result<(), Box> { //! let engine = EngineBuilder::new() -//! .with_workspace("./data") +//! .with_key("sk-...") +//! .with_model("gpt-4o") //! .build() //! .await?; //! @@ -147,7 +148,8 @@ impl Engine { /// # #[tokio::main] /// # async fn main() -> Result<(), Box> { /// let engine = EngineBuilder::new() - /// .with_workspace("./data") + /// .with_key("sk-...") + /// .with_model("gpt-4o") /// .build() /// .await?; /// @@ -380,7 +382,8 @@ impl Engine { /// # #[tokio::main] /// # async fn main() -> Result<(), Box> { /// let engine = EngineBuilder::new() - /// .with_workspace("./data") + /// .with_key("sk-...") + /// .with_model("gpt-4o") /// .build() /// .await?; /// diff --git a/rust/src/client/mod.rs b/rust/src/client/mod.rs index 286e0511..e64c182d 100644 --- a/rust/src/client/mod.rs +++ b/rust/src/client/mod.rs @@ -18,7 +18,8 @@ //! # async fn main() -> Result<(), Box> { //! // Create a client with default settings //! let client = EngineBuilder::new() -//! .with_workspace("./my_workspace") +//! .with_key("sk-...") +//! .with_model("gpt-4o") //! .build() //! .await?; //! 
diff --git a/rust/src/config/merge.rs b/rust/src/config/merge.rs index c6d995a0..7e524aad 100644 --- a/rust/src/config/merge.rs +++ b/rust/src/config/merge.rs @@ -217,9 +217,7 @@ impl Merge for ContentAggregatorConfig { impl Merge for StorageConfig { fn merge(&mut self, other: &Self, strategy: MergeStrategy) { - if strategy == MergeStrategy::Replace - || self.workspace_dir == std::path::PathBuf::from("./workspace") - { + if strategy == MergeStrategy::Replace { self.workspace_dir = other.workspace_dir.clone(); } } diff --git a/rust/src/config/types/storage.rs b/rust/src/config/types/storage.rs index b50e86e6..00b9b7ea 100644 --- a/rust/src/config/types/storage.rs +++ b/rust/src/config/types/storage.rs @@ -36,7 +36,50 @@ pub struct StorageConfig { } fn default_workspace_dir() -> PathBuf { - PathBuf::from("./workspace") + default_workspace_path_for_cwd() +} + +/// Compute the default workspace path for the current working directory. +/// +/// Returns a platform-appropriate path: +/// - **Linux/macOS**: `~/.vectorless/workspaces/{cwd_hash}/` +/// - **Windows**: `%LOCALAPPDATA%\.vectorless\workspaces\{cwd_hash}\` +/// +/// where `cwd_hash` is a hex-encoded 64-bit hash (at least 12, usually 16 hex +/// characters) of the current working directory. This ensures different +/// projects automatically get isolated workspaces. +/// +/// # Environment variable resolution order +/// +/// | Platform | Primary | Fallback | Last resort | +/// |----------|-----------------|---------------------|-------------| +/// | Unix | `$HOME` | — | `"."` | +/// | Windows | `%LOCALAPPDATA%`| `%APPDATA%` | `"."` | +pub fn default_workspace_path_for_cwd() -> PathBuf { + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let base_dir = if cfg!(windows) { + // Windows: prefer %LOCALAPPDATA% (e.g. C:\Users\xxx\AppData\Local) + // then %APPDATA% (e.g.
C:\Users\xxx\AppData\Roaming) + std::env::var("LOCALAPPDATA") + .or_else(|_| std::env::var("APPDATA")) + .map(PathBuf::from) + .unwrap_or_else(|_| PathBuf::from(".")) + } else { + // Unix (Linux, macOS): use $HOME + std::env::var("HOME") + .map(PathBuf::from) + .unwrap_or_else(|_| PathBuf::from(".")) + }; + + let cwd = std::env::current_dir().unwrap_or_else(|_| PathBuf::from(".")); + + let mut hasher = DefaultHasher::new(); + cwd.to_string_lossy().hash(&mut hasher); + let hash = format!("{:012x}", hasher.finish()); + + base_dir.join(".vectorless").join("workspaces").join(hash) } fn default_cache_size() -> usize { @@ -580,7 +623,22 @@ mod tests { #[test] fn test_storage_config_defaults() { let config = StorageConfig::default(); - assert_eq!(config.workspace_dir, PathBuf::from("./workspace")); + // Default workspace should be under .vectorless/workspaces/ on every + // platform (below $HOME on Unix, %LOCALAPPDATA% or %APPDATA% on Windows) + let path_str = config.workspace_dir.to_string_lossy(); + if cfg!(windows) { + assert!( + path_str.contains("vectorless"), + "expected ...\\vectorless\\workspaces\\..., got {:?}", + config.workspace_dir, + ); + } else { + assert!( + path_str.contains(".vectorless"), + "expected ~/.vectorless/workspaces/..., got {:?}", + config.workspace_dir, + ); + } assert_eq!(config.cache_size, 100); assert!(config.atomic_writes); assert!(config.file_lock); diff --git a/rust/src/lib.rs b/rust/src/lib.rs index b0212f19..28285ed6 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -17,7 +17,6 @@ //! #[tokio::main] //! async fn main() -> Result<(), Box> { //! let client = EngineBuilder::new() -//! .with_workspace("./workspace") //! .with_key("sk-...") //! .with_model("gpt-4o") //!
.build() From 5e05d6a71af65c761a1c6e75eac463d114cbb0f1 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 12:03:24 +0800 Subject: [PATCH 05/21] refactor(utils): remove unused format and timing modules, add fingerprint annotation Remove the format.rs and timing.rs utility modules as they are no longer used in the codebase. The format module contained text formatting utilities like truncate, number formatting, and byte formatting functions. The timing module provided Timer utilities for performance measurement. Also add dead_code attribute to allow unused code in fingerprint module for future use. --- rust/src/utils/fingerprint.rs | 2 + rust/src/utils/format.rs | 212 ---------------------------------- rust/src/utils/mod.rs | 8 +- rust/src/utils/timing.rs | 159 ------------------------- 4 files changed, 5 insertions(+), 376 deletions(-) delete mode 100644 rust/src/utils/format.rs delete mode 100644 rust/src/utils/timing.rs diff --git a/rust/src/utils/fingerprint.rs b/rust/src/utils/fingerprint.rs index d7b8a988..99000059 100644 --- a/rust/src/utils/fingerprint.rs +++ b/rust/src/utils/fingerprint.rs @@ -28,6 +28,8 @@ //! } //! ``` +#![allow(dead_code)] // Allow unused code in this module for now + use base64::prelude::*; use blake2::digest::typenum; use blake2::{Blake2b, Digest}; diff --git a/rust/src/utils/format.rs b/rust/src/utils/format.rs deleted file mode 100644 index 95ceea07..00000000 --- a/rust/src/utils/format.rs +++ /dev/null @@ -1,212 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Text formatting utilities. - -/// Truncate text to a maximum length with ellipsis. 
-/// -/// # Example -/// -/// ``` -/// use vectorless::utils::truncate; -/// -/// assert_eq!(truncate("hello world", 8), "hello..."); -/// assert_eq!(truncate("hi", 10), "hi"); -/// ``` -pub fn truncate(text: &str, max_len: usize) -> String { - if text.len() <= max_len { - return text.to_string(); - } - - if max_len <= 3 { - return ".".repeat(max_len); - } - - format!("{}...", &text[..max_len - 3]) -} - -/// Truncate text to a maximum length, respecting word boundaries. -pub fn truncate_words(text: &str, max_len: usize) -> String { - if text.len() <= max_len { - return text.to_string(); - } - - if max_len <= 3 { - return ".".repeat(max_len); - } - - // Find a good break point - let truncated = &text[..max_len - 3]; - - // Try to break at a word boundary - if let Some(last_space) = truncated.rfind(' ') { - if last_space > max_len / 2 { - return format!("{}...", &truncated[..last_space]); - } - } - - format!("{}...", truncated) -} - -/// Format a number with thousand separators. -/// -/// # Example -/// -/// ``` -/// use vectorless::utils::format_number; -/// -/// assert_eq!(format_number(1000), "1,000"); -/// assert_eq!(format_number(1234567), "1,234,567"); -/// ``` -pub fn format_number(n: usize) -> String { - let s = n.to_string(); - let mut result = String::new(); - let chars: Vec = s.chars().collect(); - - for (i, c) in chars.iter().enumerate() { - if i > 0 && (chars.len() - i) % 3 == 0 { - result.push(','); - } - result.push(*c); - } - - result -} - -/// Format bytes for human-readable display. 
-/// -/// # Example -/// -/// ``` -/// use vectorless::utils::format_bytes; -/// -/// assert_eq!(format_bytes(500), "500 B"); -/// assert_eq!(format_bytes(1024), "1.0 KB"); -/// assert_eq!(format_bytes(1536), "1.5 KB"); -/// assert_eq!(format_bytes(1048576), "1.0 MB"); -/// ``` -pub fn format_bytes(bytes: usize) -> String { - const KB: usize = 1024; - const MB: usize = KB * 1024; - const GB: usize = MB * 1024; - - if bytes >= GB { - format!("{:.1} GB", bytes as f64 / GB as f64) - } else if bytes >= MB { - format!("{:.1} MB", bytes as f64 / MB as f64) - } else if bytes >= KB { - format!("{:.1} KB", bytes as f64 / KB as f64) - } else { - format!("{} B", bytes) - } -} - -/// Format a percentage. -/// -/// # Example -/// -/// ``` -/// use vectorless::utils::format_percent; -/// -/// assert_eq!(format_percent(0.5), "50.0%"); -/// assert_eq!(format_percent(0.123), "12.3%"); -/// ``` -pub fn format_percent(value: f32) -> String { - format!("{:.1}%", value * 100.0) -} - -/// Clean whitespace in text (collapse multiple spaces, trim). -pub fn clean_whitespace(text: &str) -> String { - text.split_whitespace().collect::>().join(" ") -} - -/// Indent each line of text. -pub fn indent(text: &str, spaces: usize) -> String { - let indent_str = " ".repeat(spaces); - text.lines() - .map(|line| format!("{}{}", indent_str, line)) - .collect::>() - .join("\n") -} - -/// Count words in text. -pub fn word_count(text: &str) -> usize { - text.split_whitespace().count() -} - -/// Count lines in text. 
-pub fn line_count(text: &str) -> usize { - if text.is_empty() { - return 0; - } - text.chars().filter(|&c| c == '\n').count() + 1 -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_truncate() { - assert_eq!(truncate("hello", 10), "hello"); - assert_eq!(truncate("hello world", 8), "hello..."); - assert_eq!(truncate("hi", 3), "hi"); - } - - #[test] - fn test_truncate_words() { - // "hello world foo" with max_len=12: - // truncated = "hello wor" (9 chars), last_space at 5 - // 5 > 12/2 is false, so no word boundary break - assert_eq!(truncate_words("hello world foo", 12), "hello wor..."); - // Word boundary break happens when space is past halfway - assert_eq!(truncate_words("hello world foo bar", 15), "hello world..."); - assert_eq!(truncate_words("hello", 10), "hello"); - } - - #[test] - fn test_format_number() { - assert_eq!(format_number(100), "100"); - assert_eq!(format_number(1000), "1,000"); - assert_eq!(format_number(1234567), "1,234,567"); - } - - #[test] - fn test_format_bytes() { - assert_eq!(format_bytes(500), "500 B"); - assert_eq!(format_bytes(1024), "1.0 KB"); - assert_eq!(format_bytes(1536), "1.5 KB"); - assert_eq!(format_bytes(1048576), "1.0 MB"); - } - - #[test] - fn test_format_percent() { - assert_eq!(format_percent(0.5), "50.0%"); - assert_eq!(format_percent(1.0), "100.0%"); - } - - #[test] - fn test_clean_whitespace() { - assert_eq!(clean_whitespace(" hello world "), "hello world"); - assert_eq!(clean_whitespace("single"), "single"); - } - - #[test] - fn test_indent() { - assert_eq!(indent("hello\nworld", 2), " hello\n world"); - } - - #[test] - fn test_word_count() { - assert_eq!(word_count("hello world"), 2); - assert_eq!(word_count(" hello world "), 2); - assert_eq!(word_count(""), 0); - } - - #[test] - fn test_line_count() { - assert_eq!(line_count("hello\nworld"), 2); - assert_eq!(line_count("single"), 1); - assert_eq!(line_count(""), 0); - } -} diff --git a/rust/src/utils/mod.rs b/rust/src/utils/mod.rs index 
624c707d..472bed71 100644 --- a/rust/src/utils/mod.rs +++ b/rust/src/utils/mod.rs @@ -5,13 +5,11 @@ //! //! This module provides common utilities used across the codebase: //! -//! - **Token estimation** — Fast and accurate token counting -//! - **Timing** — Performance measurement utilities -//! - **Format** — Text and number formatting utilities +//! - **Token estimation** — Fast and accurate token counting (tiktoken-based) +//! - **Fingerprint** — BLAKE2b content hashing for change detection +//! - **Validation** — Pre-index source validation (file, content, bytes) pub mod fingerprint; -mod format; -mod timing; mod token; pub mod validation; diff --git a/rust/src/utils/timing.rs b/rust/src/utils/timing.rs deleted file mode 100644 index f133f484..00000000 --- a/rust/src/utils/timing.rs +++ /dev/null @@ -1,159 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Timing and performance measurement utilities. - -use std::time::{Duration, Instant}; - -/// A simple timing guard that records elapsed time on drop. -/// -/// # Example -/// -/// ```rust -/// use vectorless::utils::Timer; -/// -/// let timer = Timer::start("indexing"); -/// // ... do work ... -/// drop(timer); // Logs elapsed time -/// ``` -#[derive(Debug)] -pub struct Timer { - label: String, - start: Instant, - log_on_drop: bool, -} - -impl Timer { - /// Create and start a new timer. - pub fn start(label: impl Into) -> Self { - Self { - label: label.into(), - start: Instant::now(), - log_on_drop: true, - } - } - - /// Create a silent timer (doesn't log on drop). - pub fn silent() -> Self { - Self { - label: String::new(), - start: Instant::now(), - log_on_drop: false, - } - } - - /// Get the elapsed time without stopping. - pub fn elapsed(&self) -> Duration { - self.start.elapsed() - } - - /// Get elapsed time in milliseconds. - pub fn elapsed_ms(&self) -> u64 { - self.elapsed().as_millis() as u64 - } - - /// Get elapsed time in seconds. 
- pub fn elapsed_secs(&self) -> f64 { - self.elapsed().as_secs_f64() - } - - /// Stop the timer and return the elapsed duration. - pub fn stop(self) -> Duration { - let elapsed = self.elapsed(); - if self.log_on_drop { - tracing::debug!( - "{} completed in {:.2}ms", - self.label, - elapsed.as_secs_f64() * 1000.0 - ); - } - elapsed - } - - /// Stop the timer and return elapsed milliseconds. - pub fn stop_ms(self) -> u64 { - self.stop().as_millis() as u64 - } - - /// Disable logging on drop. - pub fn silent_on_drop(mut self) -> Self { - self.log_on_drop = false; - self - } - - /// Reset the timer. - pub fn reset(&mut self) { - self.start = Instant::now(); - } -} - -impl Drop for Timer { - fn drop(&mut self) { - if self.log_on_drop { - let elapsed = self.elapsed(); - tracing::debug!( - "{} completed in {:.2}ms", - self.label, - elapsed.as_secs_f64() * 1000.0 - ); - } - } -} - -/// Format a duration for human-readable display. -pub fn format_duration(duration: Duration) -> String { - let total_ms = duration.as_millis(); - - if total_ms < 1000 { - format!("{}ms", total_ms) - } else if total_ms < 60_000 { - format!("{:.2}s", duration.as_secs_f64()) - } else { - let secs = duration.as_secs(); - let mins = secs / 60; - let remaining_secs = secs % 60; - format!("{}m {}s", mins, remaining_secs) - } -} - -/// Format a duration as a compact string. 
-pub fn format_duration_compact(duration: Duration) -> String { - let total_ms = duration.as_millis(); - - if total_ms < 1000 { - format!("{}ms", total_ms) - } else if total_ms < 60_000 { - format!("{:.1}s", duration.as_secs_f64()) - } else { - let mins = duration.as_secs() / 60; - let secs = duration.as_secs() % 60; - format!("{}:{:02}", mins, secs) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_timer_elapsed() { - let timer = Timer::silent(); - std::thread::sleep(std::time::Duration::from_millis(10)); - let elapsed = timer.elapsed(); - assert!(elapsed.as_millis() >= 10); - } - - #[test] - fn test_format_duration() { - assert_eq!(format_duration(Duration::from_millis(500)), "500ms"); - assert_eq!(format_duration(Duration::from_millis(1500)), "1.50s"); - assert_eq!(format_duration(Duration::from_secs(90)), "1m 30s"); - } - - #[test] - fn test_format_duration_compact() { - assert_eq!(format_duration_compact(Duration::from_millis(500)), "500ms"); - assert_eq!(format_duration_compact(Duration::from_millis(1500)), "1.5s"); - assert_eq!(format_duration_compact(Duration::from_secs(90)), "1:30"); - } -} From 03820a0bfdd63dae609f62e4df31f7b7f0c042d4 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 12:44:03 +0800 Subject: [PATCH 06/21] refactor(rust): migrate to centralized LlmPool configuration system - Replace individual LLM client configurations with centralized LlmPoolConfig - Update EngineBuilder to apply overrides to LlmPoolConfig instead of legacy config sections - Introduce LlmConfigs conversion from LlmPoolConfig for backward compatibility - Remove deprecated IndexerClient::new method and associated tests - Update validation logic to resolve API keys from multiple sources including new LlmPoolConfig - Replace direct LLM client instantiation with pool-based approach using pool.index() and pool.retrieval() - Remove redundant test cases that are no longer applicable - Add dead code allowance attribute to library 
root --- rust/src/client/builder.rs | 86 ++++++++++++++------------ rust/src/client/indexer.rs | 20 ------ rust/src/config/mod.rs | 2 +- rust/src/config/types/mod.rs | 2 +- rust/src/document/tree.rs | 25 -------- rust/src/index/incremental/detector.rs | 21 ------- rust/src/lib.rs | 1 + rust/src/llm/config.rs | 35 ++++++++++- rust/src/llm/mod.rs | 3 +- rust/src/llm/pool.rs | 24 ------- rust/src/retrieval/content/scorer.rs | 20 ------ rust/src/retrieval/pilot/scorer.rs | 13 ---- rust/src/utils/fingerprint.rs | 2 - 13 files changed, 86 insertions(+), 168 deletions(-) diff --git a/rust/src/client/builder.rs b/rust/src/client/builder.rs index 84856fed..ca2e5f24 100644 --- a/rust/src/client/builder.rs +++ b/rust/src/client/builder.rs @@ -364,24 +364,31 @@ impl EngineBuilder { config.retrieval = retrieval_config; } - // Apply individual overrides + // Apply individual overrides to LlmPoolConfig (primary) + legacy config (compat) if let Some(api_key) = self.api_key { - // Set API key for both retrieval and index + config.llm.api_key = Some(api_key.clone()); + // Legacy compat config.retrieval.api_key = Some(api_key.clone()); config.summary.api_key = Some(api_key); - // Also set LLM pool config - if config.llm.index.api_key.is_none() { - config.llm.index.api_key = config.summary.api_key.clone(); - } - if config.llm.retrieval.api_key.is_none() { - config.llm.retrieval.api_key = config.summary.api_key.clone(); - } } if let Some(model) = self.model { + // Apply model to pool slots + if config.llm.index.model.is_empty() { + config.llm.index.model = model.clone(); + } + if config.llm.retrieval.model.is_empty() { + config.llm.retrieval.model = model.clone(); + } + if config.llm.pilot.model.is_empty() { + config.llm.pilot.model = model.clone(); + } + // Legacy compat config.retrieval.model = model.clone(); config.summary.model = model; } if let Some(endpoint) = self.endpoint { + config.llm.endpoint = Some(endpoint.clone()); + // Legacy compat config.retrieval.endpoint = 
endpoint.clone(); config.summary.endpoint = endpoint; } @@ -398,10 +405,22 @@ impl EngineBuilder { } // Validate required settings - if config.summary.api_key.is_none() && config.retrieval.api_key.is_none() { + let resolved_key = config + .llm + .api_key + .as_ref() + .or_else(|| config.llm.retrieval.api_key.as_ref()) + .or_else(|| config.summary.api_key.as_ref()) + .or_else(|| config.retrieval.api_key.as_ref()); + if resolved_key.is_none() { return Err(BuildError::MissingApiKey); } - if config.retrieval.model.is_empty() { + let retrieval_model = if config.llm.retrieval.model.is_empty() { + &config.retrieval.model + } else { + &config.llm.retrieval.model + }; + if retrieval_model.is_empty() { return Err(BuildError::MissingModel); } @@ -410,38 +429,27 @@ impl EngineBuilder { .await .map_err(|e| BuildError::Workspace(e.to_string()))?; - // Create indexer client with LLM-enabled factory if API key is available - let indexer = if let Some(api_key) = config.summary.api_key.clone() { - let llm_config = crate::llm::LlmConfig::new(&config.summary.model) - .with_endpoint(config.summary.endpoint.clone()) - .with_api_key(api_key) - .with_max_tokens(config.summary.max_tokens) - .with_temperature(config.summary.temperature); - - let llm_client = crate::llm::LlmClient::new(llm_config); - crate::client::indexer::IndexerClient::with_llm(llm_client) - } else { - crate::client::indexer::IndexerClient::new(crate::index::PipelineExecutor::new()) + // Build LlmPool from config.llm — centralizes all LLM client creation + let llm_configs: crate::llm::LlmConfigs = config.llm.clone().into(); + let pool = { + let controller = crate::throttle::ConcurrencyController::new( + crate::throttle::ConcurrencyConfig::new() + .with_max_concurrent_requests(config.concurrency.max_concurrent_requests) + .with_requests_per_minute(config.concurrency.requests_per_minute) + .with_enabled(config.concurrency.enabled), + ); + crate::llm::LlmPool::new(llm_configs).with_concurrency(controller) }; - // Create 
pipeline retriever with config + // Indexer uses pool.index() + let indexer = + crate::client::indexer::IndexerClient::with_llm(pool.index().clone()); + + // Retriever uses pool.retrieval() let retrieval_config = config.retrieval.clone(); let mut retriever = PipelineRetriever::new().with_max_iterations(retrieval_config.search.max_iterations); - - // Resolve API key: retrieval config first, then summary config - let retrieval_api_key = retrieval_config - .api_key - .clone() - .or_else(|| config.summary.api_key.clone()) - .ok_or(BuildError::MissingApiKey)?; - - let llm_config = crate::llm::LlmConfig::new(&retrieval_config.model) - .with_endpoint(retrieval_config.endpoint.clone()) - .with_api_key(retrieval_api_key) - .with_temperature(retrieval_config.temperature); - let llm_client = crate::llm::LlmClient::new(llm_config); - retriever = retriever.with_llm_client(llm_client); + retriever = retriever.with_llm_client(pool.retrieval().clone()); // Configure content aggregator if enabled if retrieval_config.content.enabled { @@ -455,7 +463,7 @@ impl EngineBuilder { } else { // Create default memo store with model from config let memo_store = MemoStore::new() - .with_model(&retrieval_config.model) + .with_model(retrieval_model) .with_version(1); retriever = retriever.with_memo_store(memo_store); } diff --git a/rust/src/client/indexer.rs b/rust/src/client/indexer.rs index a441a1ed..2ccd5715 100644 --- a/rust/src/client/indexer.rs +++ b/rust/src/client/indexer.rs @@ -52,14 +52,6 @@ pub(crate) struct IndexerClient { } impl IndexerClient { - /// Create a new indexer client with a default pipeline executor. - pub fn new(_executor: PipelineExecutor) -> Self { - Self { - executor_factory: Arc::new(PipelineExecutor::new), - events: EventEmitter::new(), - } - } - /// Create with an LLM-enabled pipeline. 
pub fn with_llm(client: LlmClient) -> Self { let client = Arc::new(client); @@ -379,15 +371,3 @@ impl Clone for IndexerClient { } } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_indexer_client_creation() { - let executor = PipelineExecutor::new(); - let client = IndexerClient::new(executor); - let _ = client; - } -} diff --git a/rust/src/config/mod.rs b/rust/src/config/mod.rs index af96c518..f2bedd85 100644 --- a/rust/src/config/mod.rs +++ b/rust/src/config/mod.rs @@ -14,6 +14,6 @@ mod validator; pub(crate) use loader::ConfigLoader; pub(crate) use types::{ CacheConfig, CompressionAlgorithm, ConcurrencyConfig, Config, FallbackBehavior, FallbackConfig, - IndexerConfig, LlmConfig, LlmMetricsConfig, MetricsConfig, OnAllFailedBehavior, + IndexerConfig, LlmClientConfig, LlmConfig, LlmMetricsConfig, LlmPoolConfig, MetricsConfig, OnAllFailedBehavior, PilotMetricsConfig, RetrievalConfig, RetrievalMetricsConfig, SufficiencyConfig, SummaryConfig, }; diff --git a/rust/src/config/types/mod.rs b/rust/src/config/types/mod.rs index 32634a60..8ca3b434 100644 --- a/rust/src/config/types/mod.rs +++ b/rust/src/config/types/mod.rs @@ -23,7 +23,7 @@ pub(crate) use content::ContentAggregatorConfig; pub(crate) use fallback::{FallbackBehavior, FallbackConfig, OnAllFailedBehavior}; pub(crate) use indexer::IndexerConfig; pub(crate) use llm::{LlmConfig, SummaryConfig}; -pub(crate) use llm_pool::LlmPoolConfig; +pub(crate) use llm_pool::{LlmClientConfig, LlmPoolConfig}; pub(crate) use metrics::{ LlmMetricsConfig, MetricsConfig, PilotMetricsConfig, RetrievalMetricsConfig, }; diff --git a/rust/src/document/tree.rs b/rust/src/document/tree.rs index e0ca6a59..a23b8c0e 100644 --- a/rust/src/document/tree.rs +++ b/rust/src/document/tree.rs @@ -820,31 +820,6 @@ mod tests { assert!(children.contains(&child2)); } - #[test] - fn test_children_with_refs_includes_resolved_references() { - let mut tree = DocumentTree::new("Root", "root content"); - let section1 = 
tree.add_child(tree.root(), "Section 1", "content 1"); - let section2 = tree.add_child(tree.root(), "Section 2", "content 2"); - let appendix = tree.add_child(tree.root(), "Appendix A", "appendix content"); - - // Add a resolved reference from Section 1 to Appendix A - let refs = vec![NodeReference::resolved( - "see Appendix A".to_string(), - "A".to_string(), - RefType::Appendix, - 10, - appendix, - 0.9, - )]; - tree.set_references(section1, refs); - - // section1's children_with_refs should include appendix as a reference target - let children = tree.children_with_refs(section1); - // section1 has no direct children, but has a resolved reference to appendix - assert_eq!(children.len(), 1); - assert!(children.contains(&appendix)); - } - #[test] fn test_children_with_refs_deduplicates() { let mut tree = DocumentTree::new("Root", "root content"); diff --git a/rust/src/index/incremental/detector.rs b/rust/src/index/incremental/detector.rs index c69e653e..23107bb1 100644 --- a/rust/src/index/incremental/detector.rs +++ b/rust/src/index/incremental/detector.rs @@ -594,27 +594,6 @@ mod tests { assert!(detector.needs_reindex_by_hash("doc1", "new content")); } - #[test] - fn test_detect_changes() { - let detector = ChangeDetector::new(); - - // Create two simple trees - let mut tree1 = DocumentTree::new("Root", ""); - let child1 = tree1.add_child(tree1.root(), "Section 1", "Content 1"); - tree1.add_child(tree1.root(), "Section 2", "Content 2"); - - let mut tree2 = DocumentTree::new("Root", ""); - tree2.add_child(tree2.root(), "Section 1", "Content 1"); // Same - tree2.add_child(tree2.root(), "Section 2", "Content modified"); // Changed - tree2.add_child(tree2.root(), "Section 3", "Content 3"); // New - - let changes = detector.detect_changes(&tree1, &tree2); - - assert!(!changes.is_empty()); - assert!(!changes.added.is_empty()); // Section 3 added - assert!(!changes.modified.is_empty()); // Section 2 modified - } - #[test] fn test_change_set() { let mut changes = 
ChangeSet::new(); diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 28285ed6..368569f3 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -1,5 +1,6 @@ // Copyright (c) 2026 vectorless developers // SPDX-License-Identifier: Apache-2.0 +#![allow(dead_code)] //! # Vectorless //! diff --git a/rust/src/llm/config.rs b/rust/src/llm/config.rs index 882ca828..7be140a1 100644 --- a/rust/src/llm/config.rs +++ b/rust/src/llm/config.rs @@ -248,9 +248,42 @@ impl Default for LlmConfigs { } // ============================================================================ -// Conversion from old config types (for backward compatibility) +// Conversion from config types // ============================================================================ +impl From for LlmConfigs { + fn from(pool: crate::config::LlmPoolConfig) -> Self { + // Resolve shared values before moving individual client configs + let default_api_key = pool.api_key.clone(); + let default_endpoint = pool.endpoint.clone(); + + fn to_llm_config( + client: crate::config::LlmClientConfig, + default_api_key: &Option, + default_endpoint: &Option, + ) -> LlmConfig { + LlmConfig { + model: client.model, + endpoint: if client.endpoint.is_empty() { + default_endpoint.clone().unwrap_or_default() + } else { + client.endpoint + }, + api_key: client.api_key.or_else(|| default_api_key.clone()), + max_tokens: client.max_tokens, + temperature: client.temperature, + retry: RetryConfig::default(), + } + } + + Self { + index: to_llm_config(pool.index, &default_api_key, &default_endpoint), + retrieval: to_llm_config(pool.retrieval, &default_api_key, &default_endpoint), + pilot: to_llm_config(pool.pilot, &default_api_key, &default_endpoint), + } + } +} + impl From for LlmConfig { fn from(old: crate::config::LlmConfig) -> Self { Self { diff --git a/rust/src/llm/mod.rs b/rust/src/llm/mod.rs index 6d23e3dd..84fca4f2 100644 --- a/rust/src/llm/mod.rs +++ b/rust/src/llm/mod.rs @@ -72,6 +72,7 @@ mod pool; mod retry; pub use 
client::LlmClient; -pub use config::LlmConfig; +pub use config::LlmConfigs; pub use error::LlmResult; pub use executor::LlmExecutor; +pub use pool::LlmPool; diff --git a/rust/src/llm/pool.rs b/rust/src/llm/pool.rs index 51b07ff3..d7ddf637 100644 --- a/rust/src/llm/pool.rs +++ b/rust/src/llm/pool.rs @@ -166,20 +166,6 @@ impl LlmPool { _ => None, } } - - /// Create a pool with a single model for all purposes. - /// - /// Useful for testing or simple deployments. - pub fn single_model(model: impl Into) -> Self { - let config = super::config::LlmConfig::new(model); - let client = Arc::new(LlmClient::new(config)); - Self { - index: client.clone(), - retrieval: client.clone(), - pilot: client, - concurrency: None, - } - } } impl Default for LlmPool { @@ -214,16 +200,6 @@ mod tests { assert!(pool.get("navigate").is_some()); } - #[test] - fn test_single_model_pool() { - let pool = LlmPool::single_model("gpt-4o-mini"); - - // All clients should use the same model - assert_eq!(pool.index().config().model, "gpt-4o-mini"); - assert_eq!(pool.retrieval().config().model, "gpt-4o-mini"); - assert_eq!(pool.pilot().config().model, "gpt-4o-mini"); - } - #[test] fn test_pool_with_concurrency() { use crate::throttle::ConcurrencyConfig; diff --git a/rust/src/retrieval/content/scorer.rs b/rust/src/retrieval/content/scorer.rs index 8597c0a1..2f0e66e3 100644 --- a/rust/src/retrieval/content/scorer.rs +++ b/rust/src/retrieval/content/scorer.rs @@ -325,26 +325,6 @@ mod tests { assert!(!keywords.contains(&"the".to_string())); // stopword } - #[test] - fn test_keyword_score() { - let scorer = RelevanceScorer::new( - "vectorless architecture", - ScoringStrategyConfig::KeywordOnly, - ); - - let chunk = ContentChunk::new( - make_test_node_id(), - "Test".to_string(), - "Vectorless has a unique architecture for document retrieval.".to_string(), - 0, - ); - - let ctx = ScoringContext::default(); - let score = scorer.compute_keyword_score(&chunk.content); - - assert!(score > 0.5); // Should match 
both keywords - } - #[test] fn test_density_score() { // High density content diff --git a/rust/src/retrieval/pilot/scorer.rs b/rust/src/retrieval/pilot/scorer.rs index b612a23b..6bf8cedb 100644 --- a/rust/src/retrieval/pilot/scorer.rs +++ b/rust/src/retrieval/pilot/scorer.rs @@ -345,19 +345,6 @@ mod tests { assert!(score > 0.0); } - #[test] - fn test_hybrid_scoring() { - let ctx = ScoringContext::with_strategy("test query", ScoringStrategy::Hybrid); - - let keyword_score = ctx.keyword_overlap("test query content"); - let bm25_score = ctx.bm25_field_score("test query content"); - let hybrid = ctx.keyword_overlap("test query content") * 0.4 - + ctx.bm25_field_score("test query content") * 0.6; - - // Hybrid should be between keyword and bm25 scores (roughly) - assert!(hybrid > 0.0); - } - #[test] fn test_scorer_creation() { let scorer = NodeScorer::for_query("test query"); diff --git a/rust/src/utils/fingerprint.rs b/rust/src/utils/fingerprint.rs index 99000059..d7b8a988 100644 --- a/rust/src/utils/fingerprint.rs +++ b/rust/src/utils/fingerprint.rs @@ -28,8 +28,6 @@ //! } //! 
``` -#![allow(dead_code)] // Allow unused code in this module for now - use base64::prelude::*; use blake2::digest::typenum; use blake2::{Blake2b, Digest}; From 071c5fd2ba937f506c2c9bbb59377bd3b9ab03b4 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 13:05:43 +0800 Subject: [PATCH 07/21] feat(pipeline): inject shared LLM client into pipeline context - Add llm_client field to PipelineOrchestrator to store shared client - Implement with_llm_client method to set the shared client - Inject the LLM client into IndexContext during pipeline execution - Update tracing info to reflect additional context injection --- rust/src/index/pipeline/executor.rs | 3 ++- rust/src/index/pipeline/orchestrator.rs | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/rust/src/index/pipeline/executor.rs b/rust/src/index/pipeline/executor.rs index 1538c7b3..cee63645 100644 --- a/rust/src/index/pipeline/executor.rs +++ b/rust/src/index/pipeline/executor.rs @@ -81,8 +81,9 @@ impl PipelineExecutor { /// 7. `reasoning_index` - Build pre-computed reasoning index /// 8. `optimize` - Optimize tree pub fn with_llm(client: LlmClient) -> Self { - tracing::info!("PipelineExecutor::with_llm — cloning client to ParseStage + EnhanceStage"); + tracing::info!("PipelineExecutor::with_llm — cloning client to ParseStage + EnhanceStage + context"); let orchestrator = PipelineOrchestrator::new() + .with_llm_client(client.clone()) .stage_with_priority(ParseStage::with_llm_client(client.clone()), 10) .stage_with_priority(BuildStage::new(), 20) .stage_with_priority(ValidateStage::new(), 22) diff --git a/rust/src/index/pipeline/orchestrator.rs b/rust/src/index/pipeline/orchestrator.rs index 892497f2..3ab3de82 100644 --- a/rust/src/index/pipeline/orchestrator.rs +++ b/rust/src/index/pipeline/orchestrator.rs @@ -93,6 +93,8 @@ pub struct ExecutionGroup { pub struct PipelineOrchestrator { /// Registered stages with metadata. 
stages: Vec, + /// Shared LLM client injected into pipeline context. + llm_client: Option, } impl Default for PipelineOrchestrator { @@ -104,7 +106,16 @@ impl Default for PipelineOrchestrator { impl PipelineOrchestrator { /// Create a new empty orchestrator. pub fn new() -> Self { - Self { stages: Vec::new() } + Self { + stages: Vec::new(), + llm_client: None, + } + } + + /// Set the shared LLM client (injected into pipeline context). + pub fn with_llm_client(mut self, client: crate::llm::LlmClient) -> Self { + self.llm_client = Some(client); + self } /// Add a stage with default priority (100). @@ -452,6 +463,10 @@ impl PipelineOrchestrator { let mut opts = options; let existing_tree = opts.existing_tree.take(); let mut ctx = IndexContext::new(input, opts); + // Inject shared LLM client into context for stages that need it (e.g. ReasoningIndexStage) + if let Some(client) = self.llm_client.take() { + ctx = ctx.with_llm_client(client); + } if let Some(tree) = existing_tree { ctx = ctx.with_existing_tree(tree); } From df2557aa439f1e0dece19b65e1b1dc4b235ab19f Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 13:11:10 +0800 Subject: [PATCH 08/21] refactor(rust): consolidate EventEmitter imports to use crate::events Remove the intermediate events module re-exports and update all imports to directly reference crate::events::EventEmitter instead of using relative paths like super::events::EventEmitter. This change simplifies the module structure by eliminating the unnecessary events submodule in the client directory and ensures consistent import paths across all client modules. 
--- rust/src/client/builder.rs | 2 +- rust/src/client/engine.rs | 2 +- rust/src/client/events.rs | 7 ------- rust/src/client/indexer.rs | 2 +- rust/src/client/mod.rs | 5 ++--- rust/src/client/retriever.rs | 2 +- rust/src/client/workspace.rs | 2 +- 7 files changed, 7 insertions(+), 15 deletions(-) delete mode 100644 rust/src/client/events.rs diff --git a/rust/src/client/builder.rs b/rust/src/client/builder.rs index ca2e5f24..06519d8f 100644 --- a/rust/src/client/builder.rs +++ b/rust/src/client/builder.rs @@ -55,7 +55,7 @@ use crate::retrieval::PipelineRetriever; use crate::storage::Workspace; use super::engine::Engine; -use super::events::EventEmitter; +use crate::events::EventEmitter; /// Builder for creating a [`Engine`] client. /// diff --git a/rust/src/client/engine.rs b/rust/src/client/engine.rs index 85addf0e..e43b345a 100644 --- a/rust/src/client/engine.rs +++ b/rust/src/client/engine.rs @@ -51,7 +51,7 @@ use crate::retrieval::{PipelineRetriever, RetrieveEventReceiver}; use crate::storage::{PersistedDocument, Workspace}; use crate::{DocumentTree, Error}; -use super::events::EventEmitter; +use crate::events::EventEmitter; use super::index_context::{IndexContext, IndexSource}; use super::indexer::IndexerClient; use super::query_context::{QueryContext, QueryScope}; diff --git a/rust/src/client/events.rs b/rust/src/client/events.rs deleted file mode 100644 index 433498ee..00000000 --- a/rust/src/client/events.rs +++ /dev/null @@ -1,7 +0,0 @@ -// Copyright (c) 2026 vectorless developers -// SPDX-License-Identifier: Apache-2.0 - -//! Re-export shim — event types and emitter live in the top-level -//! [`events`](crate::events) module. 
- -pub use crate::events::{Event, EventEmitter, IndexEvent, QueryEvent, WorkspaceEvent}; diff --git a/rust/src/client/indexer.rs b/rust/src/client/indexer.rs index 2ccd5715..693000b3 100644 --- a/rust/src/client/indexer.rs +++ b/rust/src/client/indexer.rs @@ -34,7 +34,7 @@ use crate::index::{ use crate::llm::LlmClient; use crate::storage::{DocumentMeta, PersistedDocument}; -use super::events::{EventEmitter, IndexEvent}; +use crate::events::{EventEmitter, IndexEvent}; use super::index_context::IndexSource; use super::types::{IndexOptions, IndexedDocument}; diff --git a/rust/src/client/mod.rs b/rust/src/client/mod.rs index e64c182d..8f2131b2 100644 --- a/rust/src/client/mod.rs +++ b/rust/src/client/mod.rs @@ -67,7 +67,6 @@ mod builder; mod engine; -pub mod events; mod index_context; mod indexer; mod query_context; @@ -90,10 +89,10 @@ pub use index_context::IndexContext; pub use query_context::QueryContext; // ============================================================ -// Events +// Events (re-export from crate::events) // ============================================================ -pub use events::EventEmitter; +pub use crate::events::EventEmitter; // ============================================================ // Result & Info Types diff --git a/rust/src/client/retriever.rs b/rust/src/client/retriever.rs index c870a65a..f1b38a4d 100644 --- a/rust/src/client/retriever.rs +++ b/rust/src/client/retriever.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use tracing::info; -use super::events::{EventEmitter, QueryEvent}; +use crate::events::{EventEmitter, QueryEvent}; use super::types::QueryResultItem; use crate::config::Config; use crate::document::{DocumentTree, ReasoningIndex}; diff --git a/rust/src/client/workspace.rs b/rust/src/client/workspace.rs index d700e064..8432bb78 100644 --- a/rust/src/client/workspace.rs +++ b/rust/src/client/workspace.rs @@ -30,7 +30,7 @@ use tracing::{debug, info}; use crate::error::Result; use crate::storage::{PersistedDocument, Workspace}; 
-use super::events::{EventEmitter, WorkspaceEvent}; +use crate::events::{EventEmitter, WorkspaceEvent}; use super::types::DocumentInfo; /// Workspace management client. From 32d81bbc8b3a2d5a22a246eebfc6406cbe12cd78 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 13:31:28 +0800 Subject: [PATCH 09/21] refactor(index-context): consolidate directory indexing methods into single function BREAKING CHANGE: The IndexContext::from_dir_recursive method has been removed and replaced with a boolean parameter in the from_dir method. - Replace separate from_dir and from_dir_recursive methods with unified from_dir(path, recursive) method - Update Python binding to use the new unified method signature - Update documentation examples to reflect the new API - Rename test function to match new naming convention --- python/src/lib.rs | 6 +----- rust/examples/index_directory.rs | 9 ++------- rust/src/client/index_context.rs | 26 ++++++++++---------------- rust/src/lib.rs | 5 +---- 4 files changed, 14 insertions(+), 32 deletions(-) diff --git a/python/src/lib.rs b/python/src/lib.rs index 5138daed..bd027bb7 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -226,11 +226,7 @@ impl PyIndexContext { #[staticmethod] #[pyo3(signature = (path, recursive=false))] fn from_dir(path: String, recursive: bool) -> Self { - let inner = if recursive { - IndexContext::from_dir_recursive(&path) - } else { - IndexContext::from_dir(&path) - }; + let inner = IndexContext::from_dir(&path, recursive); Self { inner } } diff --git a/rust/examples/index_directory.rs b/rust/examples/index_directory.rs index 5b1681f6..922c38a5 100644 --- a/rust/examples/index_directory.rs +++ b/rust/examples/index_directory.rs @@ -46,13 +46,8 @@ async fn main() -> vectorless::Result<()> { .map_err(|e| vectorless::Error::Config(e.to_string()))?; // Index directory - let ctx = if recursive { - println!("Recursively indexing: {}", dir); - IndexContext::from_dir_recursive(dir) - } else { - 
println!("Indexing top-level files in: {}", dir); - IndexContext::from_dir(dir) - }; + println!("{}indexing: {}", if recursive { "Recursively " } else { "" }, dir); + let ctx = IndexContext::from_dir(dir, recursive); if ctx.is_empty() { println!("No supported files found in: {}", dir); diff --git a/rust/src/client/index_context.rs b/rust/src/client/index_context.rs index 1ba10bac..1ee324f1 100644 --- a/rust/src/client/index_context.rs +++ b/rust/src/client/index_context.rs @@ -30,10 +30,10 @@ //! use vectorless::client::IndexContext; //! //! // Non-recursive (top-level only) -//! let ctx = IndexContext::from_dir("./documents"); +//! let ctx = IndexContext::from_dir("./documents", false); //! //! // Recursive (includes subdirectories) -//! let ctx = IndexContext::from_dir_recursive("./documents"); +//! let ctx = IndexContext::from_dir("./documents", true); //! ``` use std::path::PathBuf; @@ -93,7 +93,7 @@ pub(crate) enum IndexSource { /// ).await?; /// /// // Entire directory -/// let result = engine.index(IndexContext::from_dir("./docs")).await?; +/// let result = engine.index(IndexContext::from_dir("./docs", false)).await?; /// # Ok(()) /// # } /// ``` @@ -135,18 +135,12 @@ impl IndexContext { /// Create from a directory path. /// - /// Indexes all supported files in the directory (non-recursive). + /// Indexes all supported files in the directory. /// Supported extensions: `.md`, `.pdf`. - pub fn from_dir(dir: impl Into) -> Self { - Self::scan_dir(dir, false) - } - - /// Create from a directory path with recursive scanning. /// - /// Recursively indexes all supported files in the directory and its - /// subdirectories. Supported extensions: `.md`, `.pdf`. - pub fn from_dir_recursive(dir: impl Into) -> Self { - Self::scan_dir(dir, true) + /// Set `recursive` to `true` to include subdirectories. + pub fn from_dir(dir: impl Into, recursive: bool) -> Self { + Self::scan_dir(dir, recursive) } /// Internal: scan a directory for supported document files. 
@@ -340,7 +334,7 @@ mod tests { } #[test] - fn test_from_dir_recursive() { + fn test_from_dir_with_recursive() { // Create a temp directory structure: // tmp/ // a.md @@ -357,11 +351,11 @@ mod tests { std::fs::write(tmp.join("sub/deep/ignore.dat"), b"xxx").unwrap(); // Non-recursive: only top-level - let ctx = IndexContext::from_dir(&tmp); + let ctx = IndexContext::from_dir(&tmp, false); assert_eq!(ctx.len(), 1); // only a.md // Recursive: all levels - let ctx = IndexContext::from_dir_recursive(&tmp); + let ctx = IndexContext::from_dir(&tmp, true); assert_eq!(ctx.len(), 3); // a.md, b.md, c.pdf let _ = std::fs::remove_dir_all(&tmp); diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 368569f3..8f9d85de 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -77,8 +77,5 @@ pub use graph::DocumentGraph; // Event types pub use events::{EventEmitter, IndexEvent, QueryEvent, WorkspaceEvent}; -// Index metrics -pub use metrics::IndexMetrics; - // Runtime metrics reports -pub use metrics::{LlmMetricsReport, MetricsReport, PilotMetricsReport, RetrievalMetricsReport}; +pub use metrics::{IndexMetrics, LlmMetricsReport, MetricsReport, PilotMetricsReport, RetrievalMetricsReport}; From d08dd9ca3c65c161c76a3a10c69ecc7ed405d60f Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 14:05:53 +0800 Subject: [PATCH 10/21] feat: add source_path field and update query context API - Add source_path field to IndexItem and DocumentInfo structs - Replace with_doc_id method with with_doc_ids method that accepts a vector of document IDs - Update documentation and examples to reflect the new API - Modify query_stream to work with the new Documents scope type - Add getter methods for source_path in Python bindings --- python/src/lib.rs | 32 +++++++++++++----------- python/vectorless/__init__.py | 2 +- rust/examples/advanced.rs | 2 +- rust/examples/events.rs | 2 +- rust/examples/flow.rs | 2 +- rust/src/client/engine.rs | 25 +++++++++++++------ rust/src/client/mod.rs | 2 +- 
rust/src/client/query_context.rs | 42 ++++++++++++-------------------- rust/src/client/types.rs | 13 ++++++++++ rust/src/client/workspace.rs | 2 ++ rust/src/lib.rs | 2 +- 11 files changed, 73 insertions(+), 53 deletions(-) diff --git a/python/src/lib.rs b/python/src/lib.rs index bd027bb7..bca68335 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -306,13 +306,13 @@ impl PyIndexContext { /// from vectorless import QueryContext, StrategyPreference /// /// # Force keyword-only (fastest, no LLM calls during search) -/// ctx = QueryContext("revenue").with_doc_id(doc_id).with_strategy(StrategyPreference.KEYWORD) +/// ctx = QueryContext("revenue").with_doc_ids([doc_id]).with_strategy(StrategyPreference.KEYWORD) /// /// # Force LLM-guided navigation (most accurate, uses more tokens) -/// ctx = QueryContext("explain the architecture").with_doc_id(doc_id).with_strategy(StrategyPreference.LLM) +/// ctx = QueryContext("explain the architecture").with_doc_ids([doc_id]).with_strategy(StrategyPreference.LLM) /// /// # Force hybrid (BM25 + LLM refinement) -/// ctx = QueryContext("growth trends").with_doc_id(doc_id).with_strategy(StrategyPreference.HYBRID) +/// ctx = QueryContext("growth trends").with_doc_ids([doc_id]).with_strategy(StrategyPreference.HYBRID) /// ``` #[pyclass(name = "StrategyPreference", skip_from_py_object)] #[derive(Clone)] @@ -380,8 +380,8 @@ impl PyStrategyPreference { /// ```python /// from vectorless import QueryContext /// -/// # Query a single document -/// ctx = QueryContext("What is the total revenue?").with_doc_id(doc_id) +/// # Query specific documents +/// ctx = QueryContext("What is the total revenue?").with_doc_ids([doc_id]) /// /// # Query multiple documents /// ctx = QueryContext("What is the architecture?").with_doc_ids(["doc-1", "doc-2"]) @@ -404,13 +404,7 @@ impl PyQueryContext { } } - /// Set scope to a single document. 
- fn with_doc_id(&self, doc_id: String) -> Self { - let ctx = self.inner.clone().with_doc_id(&doc_id); - Self { inner: ctx } - } - - /// Set scope to multiple documents. + /// Set scope to specific documents. fn with_doc_ids(&self, doc_ids: Vec) -> Self { let ctx = self.inner.clone().with_doc_ids(doc_ids); Self { inner: ctx } @@ -1083,6 +1077,11 @@ impl PyIndexItem { self.inner.description.as_deref() } + #[getter] + fn source_path(&self) -> Option<&str> { + self.inner.source_path.as_deref() + } + #[getter] fn page_count(&self) -> Option { self.inner.page_count @@ -1195,6 +1194,11 @@ impl PyDocumentInfo { self.inner.description.as_deref() } + #[getter] + fn source_path(&self) -> Option<&str> { + self.inner.source_path.as_deref() + } + #[getter] fn page_count(&self) -> Option { self.inner.page_count @@ -1489,7 +1493,7 @@ fn run_metrics_report(engine: Arc) -> PyMetricsReport { /// doc_id = result.doc_id /// /// # Query -/// answer = await engine.query(QueryContext("What is the revenue?").with_doc_id(doc_id)) +/// answer = await engine.query(QueryContext("What is the revenue?").with_doc_ids([doc_id])) /// print(answer.single().content) /// ``` #[pyclass(name = "Engine")] @@ -1653,7 +1657,7 @@ impl PyEngine { /// /// engine = Engine(api_key="sk-...", model="gpt-4o") /// result = await engine.index(IndexContext.from_path("./report.pdf")) -/// answer = await engine.query(QueryContext("What is the revenue?").with_doc_id(result.doc_id)) +/// answer = await engine.query(QueryContext("What is the revenue?").with_doc_ids([result.doc_id])) /// print(answer.single().content) /// ``` #[pymodule] diff --git a/python/vectorless/__init__.py b/python/vectorless/__init__.py index c046ed90..7acc99bf 100644 --- a/python/vectorless/__init__.py +++ b/python/vectorless/__init__.py @@ -16,7 +16,7 @@ doc_id = result.doc_id # Query - answer = await engine.query(QueryContext("What is the revenue?").with_doc_id(doc_id)) + answer = await engine.query(QueryContext("What is the 
revenue?").with_doc_ids([doc_id])) print(answer.single().content) """ diff --git a/rust/examples/advanced.rs b/rust/examples/advanced.rs index 1316a68d..2df75f9b 100644 --- a/rust/examples/advanced.rs +++ b/rust/examples/advanced.rs @@ -55,7 +55,7 @@ async fn main() -> vectorless::Result<()> { // Query let result = client - .query(QueryContext::new("What features does Vectorless provide?").with_doc_id(&doc_id)) + .query(QueryContext::new("What features does Vectorless provide?").with_doc_ids(vec![doc_id.clone()])) .await?; println!("Query: What features does Vectorless provide?"); if let Some(item) = result.single() { diff --git a/rust/examples/events.rs b/rust/examples/events.rs index c12f56d0..51398da8 100644 --- a/rust/examples/events.rs +++ b/rust/examples/events.rs @@ -127,7 +127,7 @@ async fn main() -> Result<(), Box> { // 4. Query with events println!("Step 4: Querying (with events)..."); let result = engine - .query(QueryContext::new("What is vectorless?").with_doc_id(&doc_id)) + .query(QueryContext::new("What is vectorless?").with_doc_ids(vec![doc_id.clone()])) .await?; if let Some(item) = result.single() { println!(" ✓ Found result ({} chars)", item.content.len()); diff --git a/rust/examples/flow.rs b/rust/examples/flow.rs index 8babb615..57d92891 100644 --- a/rust/examples/flow.rs +++ b/rust/examples/flow.rs @@ -111,7 +111,7 @@ async fn main() -> vectorless::Result<()> { println!(" Query: \"{}\"", query); match engine - .query(QueryContext::new(query).with_doc_id(&doc_id)) + .query(QueryContext::new(query).with_doc_ids(vec![doc_id.clone()])) .await { Ok(result) => { diff --git a/rust/src/client/engine.rs b/rust/src/client/engine.rs index e43b345a..fd701c39 100644 --- a/rust/src/client/engine.rs +++ b/rust/src/client/engine.rs @@ -28,7 +28,7 @@ //! //! // Query //! let result = engine.query( -//! QueryContext::new("What is this?").with_doc_id(doc_id) +//! QueryContext::new("What is this?").with_doc_ids(vec![doc_id.to_string()]) //! ).await?; //! //! 
println!("Found: {}", result.content); @@ -277,6 +277,12 @@ impl Engine { doc.description.clone(), doc.page_count, ) + .with_source_path( + doc.source_path + .as_ref() + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_default(), + ) .with_metrics_opt(metrics); let persisted = self .indexer @@ -328,6 +334,12 @@ impl Engine { doc.description.clone(), doc.page_count, ) + .with_source_path( + doc.source_path + .as_ref() + .map(|p| p.to_string_lossy().to_string()) + .unwrap_or_default(), + ) .with_metrics_opt(metrics); let persisted = self .indexer @@ -390,7 +402,7 @@ impl Engine { /// // Single document /// let result = engine.query( /// QueryContext::new("What is the total revenue?") - /// .with_doc_id("doc-123") + /// .with_doc_ids(vec!["doc-123".to_string()]) /// ).await?; /// /// if let Some(item) = result.single() { @@ -470,13 +482,13 @@ impl Engine { /// Returns a [`RetrieveEventReceiver`] that yields [`RetrieveEvent`](crate::retrieval::RetrieveEvent)s /// as the retrieval pipeline progresses through each stage. /// - /// Only supports single-document scope (via `with_doc_id`). + /// Only supports single-document scope (via `with_doc_ids` with one ID). pub async fn query_stream(&self, ctx: QueryContext) -> Result { let doc_id = match &ctx.scope { - QueryScope::Single(id) => id.clone(), + QueryScope::Documents(ids) if ids.len() == 1 => ids[0].clone(), _ => { return Err(Error::Config( - "query_stream requires a single doc_id".to_string(), + "query_stream requires a single doc_id via with_doc_ids".to_string(), )); } }; @@ -584,8 +596,7 @@ impl Engine { /// Resolve QueryScope into a list of document IDs. 
async fn resolve_scope(&self, scope: &QueryScope) -> Result> { match scope { - QueryScope::Single(id) => Ok(vec![id.clone()]), - QueryScope::Multiple(ids) => Ok(ids.clone()), + QueryScope::Documents(ids) => Ok(ids.clone()), QueryScope::Workspace => { let docs = self.list().await?; if docs.is_empty() { diff --git a/rust/src/client/mod.rs b/rust/src/client/mod.rs index 8f2131b2..f1c80750 100644 --- a/rust/src/client/mod.rs +++ b/rust/src/client/mod.rs @@ -29,7 +29,7 @@ //! //! // Query the document //! let result = client.query( -//! QueryContext::new("What is this?").with_doc_id(doc_id) +//! QueryContext::new("What is this?").with_doc_ids(vec![doc_id.to_string()]) //! ).await?; //! if let Some(item) = result.single() { //! println!("{}", item.content); diff --git a/rust/src/client/query_context.rs b/rust/src/client/query_context.rs index 991acf4d..bb32d05a 100644 --- a/rust/src/client/query_context.rs +++ b/rust/src/client/query_context.rs @@ -4,20 +4,16 @@ //! Query context for the Engine API. //! //! [`QueryContext`] encapsulates all parameters for a query operation, -//! supporting single document, multiple documents, or entire workspace queries. +//! supporting specific documents or entire workspace queries. //! //! # Example //! //! ```rust //! use vectorless::client::QueryContext; //! -//! // Query a single document +//! // Query specific documents //! let ctx = QueryContext::new("What is the total revenue?") -//! .with_doc_id("doc-abc123"); -//! -//! // Query multiple documents -//! let ctx = QueryContext::new("What is the architecture?") -//! .with_doc_ids(vec!["doc-1", "doc-2"]); +//! .with_doc_ids(vec!["doc-1".to_string()]); //! //! // Query entire workspace //! let ctx = QueryContext::new("Explain the algorithm"); @@ -29,19 +25,16 @@ use crate::retrieval::{RetrieveOptions, StrategyPreference}; /// Query scope — determines which documents to search. #[derive(Debug, Clone)] pub(crate) enum QueryScope { - /// Query a single document. 
- Single(String), - /// Query multiple specific documents. - Multiple(Vec), + /// Query specific documents. + Documents(Vec), /// Query all documents in the workspace. Workspace, } /// Context for a query operation. /// -/// Supports three scopes: -/// - **Single document** — via `with_doc_id()` -/// - **Multiple documents** — via `with_doc_ids()` +/// Supports two scopes: +/// - **Specific documents** — via `with_doc_ids()` /// - **Entire workspace** — default when no scope is set /// /// # Convenience @@ -82,15 +75,12 @@ impl QueryContext { } } - /// Set scope to a single document. - pub fn with_doc_id(mut self, doc_id: impl Into) -> Self { - self.scope = QueryScope::Single(doc_id.into()); - self - } - - /// Set scope to multiple documents. + /// Set scope to specific documents. + /// + /// Pass a single ID or multiple IDs to restrict the query + /// to those documents only. pub fn with_doc_ids(mut self, doc_ids: Vec) -> Self { - self.scope = QueryScope::Multiple(doc_ids); + self.scope = QueryScope::Documents(doc_ids); self } @@ -180,14 +170,14 @@ mod tests { #[test] fn test_single_doc_scope() { - let ctx = QueryContext::new("test").with_doc_id("doc-1"); - assert!(matches!(ctx.scope, QueryScope::Single(ref id) if id == "doc-1")); + let ctx = QueryContext::new("test").with_doc_ids(vec!["doc-1".to_string()]); + assert!(matches!(ctx.scope, QueryScope::Documents(ref ids) if ids == &["doc-1".to_string()])); } #[test] fn test_multi_doc_scope() { let ctx = QueryContext::new("test").with_doc_ids(vec!["a".into(), "b".into()]); - assert!(matches!(ctx.scope, QueryScope::Multiple(ref ids) if ids.len() == 2)); + assert!(matches!(ctx.scope, QueryScope::Documents(ref ids) if ids.len() == 2)); } #[test] @@ -199,7 +189,7 @@ mod tests { #[test] fn test_builder_options() { let ctx = QueryContext::new("test") - .with_doc_id("doc-1") + .with_doc_ids(vec!["doc-1".to_string()]) .with_max_tokens(4000) .with_include_reasoning(false) .with_depth_limit(5); diff --git 
a/rust/src/client/types.rs b/rust/src/client/types.rs index a2c94a54..5c638846 100644 --- a/rust/src/client/types.rs +++ b/rust/src/client/types.rs @@ -303,6 +303,8 @@ pub struct IndexItem { pub format: DocumentFormat, /// Document description (from root summary). pub description: Option, + /// Source file path (if indexed from a file). + pub source_path: Option, /// Page count (for PDFs). pub page_count: Option, /// Indexing pipeline metrics (timing, LLM usage, node stats). @@ -323,11 +325,18 @@ impl IndexItem { name: name.into(), format, description, + source_path: None, page_count, metrics: None, } } + /// Set the source file path. + pub fn with_source_path(mut self, path: impl Into) -> Self { + self.source_path = Some(path.into()); + self + } + /// Set the indexing metrics. pub fn with_metrics(mut self, metrics: IndexMetrics) -> Self { self.metrics = Some(metrics); @@ -443,6 +452,9 @@ pub struct DocumentInfo { /// Document description. pub description: Option, + /// Source file path. + pub source_path: Option, + /// Page count (for PDFs). pub page_count: Option, @@ -458,6 +470,7 @@ impl DocumentInfo { name: name.into(), format: String::new(), description: None, + source_path: None, page_count: None, line_count: None, } diff --git a/rust/src/client/workspace.rs b/rust/src/client/workspace.rs index 8432bb78..7a27d1d6 100644 --- a/rust/src/client/workspace.rs +++ b/rust/src/client/workspace.rs @@ -154,6 +154,7 @@ impl WorkspaceClient { name: meta.doc_name, format: meta.doc_type, description: meta.doc_description, + source_path: meta.path, page_count: meta.page_count, line_count: meta.line_count, }); @@ -178,6 +179,7 @@ impl WorkspaceClient { name: meta.doc_name, format: meta.doc_type, description: meta.doc_description, + source_path: meta.path, page_count: meta.page_count, line_count: meta.line_count, })) diff --git a/rust/src/lib.rs b/rust/src/lib.rs index 8f9d85de..f9912e17 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -27,7 +27,7 @@ //! 
let doc_id = result.doc_id().unwrap(); //! //! let result = client.query( -//! QueryContext::new("What is this about?").with_doc_id(doc_id) +//! QueryContext::new("What is this about?").with_doc_ids(vec![doc_id.to_string()]) //! ).await?; //! println!("{}", result.content); //! From 04ff69927d82a2f4f34e149682497b0c9824897e Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 14:07:35 +0800 Subject: [PATCH 11/21] refactor(docs): update QueryContext API usage to use with_doc_ids instead of with_doc_id BREAKING CHANGE: Replace all instances of .with_doc_id() with .with_doc_ids() method in documentation examples and code samples to support multiple document IDs. - Updated README.md example - Updated blog post example - Updated quick-query documentation - Updated PDF support documentation - Updated getting started guide - Updated intro documentation - Updated search algorithms documentation - Updated strategies documentation - Updated Python SDK documentation - Updated Rust SDK documentation - Updated example files (document_management, error_handling, indexing, pdf_indexing) - Removed deprecated retrieval type exports from rust/src/lib.rs --- README.md | 2 +- docs/blog/2026-04-12-welcome/index.mdx | 2 +- docs/docs/examples/quick-query.mdx | 6 +++--- docs/docs/features/pdf-support.mdx | 2 +- docs/docs/getting-started.mdx | 4 ++-- docs/docs/intro.mdx | 2 +- docs/docs/retrieval/search-algorithms.mdx | 2 +- docs/docs/retrieval/strategies.mdx | 6 +++--- docs/docs/sdk/python.mdx | 4 ++-- docs/docs/sdk/rust.mdx | 2 +- examples/document_management/main.py | 2 +- examples/error_handling/main.py | 2 +- examples/indexing/main.py | 2 +- examples/pdf_indexing/main.py | 2 +- rust/src/lib.rs | 5 ----- 15 files changed, 20 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index 2dd54431..e908924c 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ async fn main() -> vectorless::Result<()> { // Query let result = engine.query( - 
QueryContext::new("What is the total revenue?").with_doc_id(doc_id) + QueryContext::new("What is the total revenue?").with_doc_ids(vec![doc_id.to_string()]) ).await?; println!("Answer: {}", result.content); diff --git a/docs/blog/2026-04-12-welcome/index.mdx b/docs/blog/2026-04-12-welcome/index.mdx index e9de1b1d..61a2c63f 100644 --- a/docs/blog/2026-04-12-welcome/index.mdx +++ b/docs/blog/2026-04-12-welcome/index.mdx @@ -63,7 +63,7 @@ async fn main() -> vectorless::Result<()> { let doc_id = result.doc_id().unwrap(); let result = engine.query( - QueryContext::new("What is the total revenue?").with_doc_id(doc_id) + QueryContext::new("What is the total revenue?").with_doc_ids(vec![doc_id.to_string()]) ).await?; println!("{}", result.content); diff --git a/docs/docs/examples/quick-query.mdx b/docs/docs/examples/quick-query.mdx index fd172a5f..27a6600a 100644 --- a/docs/docs/examples/quick-query.mdx +++ b/docs/docs/examples/quick-query.mdx @@ -28,7 +28,7 @@ async def main(): # 3. Simple keyword query answer = await engine.query( QueryContext("revenue") - .with_doc_id(doc_id) + .with_doc_ids([doc_id]) .with_strategy(StrategyPreference.KEYWORD) ) print(f"Keyword result: {answer.single().content[:200]}") @@ -36,7 +36,7 @@ async def main(): # 4. Complex reasoning query answer = await engine.query( QueryContext("What are the main factors affecting performance?") - .with_doc_id(doc_id) + .with_doc_ids([doc_id]) .with_strategy(StrategyPreference.HYBRID) ) print(f"Score: {answer.single().score:.2f}") @@ -72,7 +72,7 @@ async fn main() -> vectorless::Result<()> { // 3. 
Query with hybrid strategy let answer = engine.query( QueryContext::new("What are the main factors affecting performance?") - .with_doc_id(&doc_id) + .with_doc_ids(vec![doc_id.clone()]) ).await?; if let Some(item) = answer.single() { diff --git a/docs/docs/features/pdf-support.mdx b/docs/docs/features/pdf-support.mdx index 96a683a2..3be02fd0 100644 --- a/docs/docs/features/pdf-support.mdx +++ b/docs/docs/features/pdf-support.mdx @@ -19,7 +19,7 @@ doc_id = result.doc_id # Query the PDF answer = await engine.query( - QueryContext("What is discussed on page 5?").with_doc_id(doc_id) + QueryContext("What is discussed on page 5?").with_doc_ids([doc_id]) ) print(answer.single().content) ``` diff --git a/docs/docs/getting-started.mdx b/docs/docs/getting-started.mdx index 60c27ea0..dd4e00e3 100644 --- a/docs/docs/getting-started.mdx +++ b/docs/docs/getting-started.mdx @@ -38,7 +38,7 @@ async def main(): # Query the document answer = await engine.query( - QueryContext("What is the total revenue?").with_doc_id(doc_id) + QueryContext("What is the total revenue?").with_doc_ids([doc_id]) ) print(answer.single().content) @@ -85,7 +85,7 @@ async fn main() -> vectorless::Result<()> { let doc_id = result.doc_id().unwrap(); let result = engine.query( - QueryContext::new("What is the total revenue?").with_doc_id(doc_id) + QueryContext::new("What is the total revenue?").with_doc_ids(vec![doc_id.to_string()]) ).await?; if let Some(item) = result.single() { diff --git a/docs/docs/intro.mdx b/docs/docs/intro.mdx index 2e65e88a..a2f95ba2 100644 --- a/docs/docs/intro.mdx +++ b/docs/docs/intro.mdx @@ -65,7 +65,7 @@ async fn main() -> vectorless::Result<()> { let doc_id = result.doc_id().unwrap(); let result = engine.query( - QueryContext::new("What is the total revenue?").with_doc_id(doc_id) + QueryContext::new("What is the total revenue?").with_doc_ids(vec![doc_id.to_string()]) ).await?; println!("{}", result.content); diff --git a/docs/docs/retrieval/search-algorithms.mdx 
b/docs/docs/retrieval/search-algorithms.mdx index 370c762c..a0f45242 100644 --- a/docs/docs/retrieval/search-algorithms.mdx +++ b/docs/docs/retrieval/search-algorithms.mdx @@ -72,7 +72,7 @@ from vectorless import QueryContext ctx = ( QueryContext("complex multi-hop question") - .with_doc_id(doc_id) + .with_doc_ids([doc_id]) .with_depth_limit(10) # Max tree traversal depth .with_max_tokens(4000) # Max tokens in result ) diff --git a/docs/docs/retrieval/strategies.mdx b/docs/docs/retrieval/strategies.mdx index e60cd3e1..fbb9c98a 100644 --- a/docs/docs/retrieval/strategies.mdx +++ b/docs/docs/retrieval/strategies.mdx @@ -23,7 +23,7 @@ Fast TF-IDF/BM25 matching against the pre-computed reasoning index. No LLM calls ```python from vectorless import QueryContext, StrategyPreference -ctx = QueryContext("revenue").with_doc_id(doc_id).with_strategy( +ctx = QueryContext("revenue").with_doc_ids([doc_id]).with_strategy( StrategyPreference.KEYWORD ) ``` @@ -38,7 +38,7 @@ Use when: LLM-powered tree navigation with full contextual understanding. The LLM sees the table of contents, node summaries, and makes navigation decisions at each level. ```python -ctx = QueryContext("Explain the relationship between architecture and performance").with_doc_id(doc_id).with_strategy( +ctx = QueryContext("Explain the relationship between architecture and performance").with_doc_ids([doc_id]).with_strategy( StrategyPreference.LLM ) ``` @@ -53,7 +53,7 @@ Use when: Two-phase retrieval: BM25 pre-filter followed by LLM refinement. Combines the speed of keyword matching with the accuracy of LLM reasoning. 
```python -ctx = QueryContext("What are the growth trends?").with_doc_id(doc_id).with_strategy( +ctx = QueryContext("What are the growth trends?").with_doc_ids([doc_id]).with_strategy( StrategyPreference.HYBRID ) ``` diff --git a/docs/docs/sdk/python.mdx b/docs/docs/sdk/python.mdx index a4306d68..fe84565b 100644 --- a/docs/docs/sdk/python.mdx +++ b/docs/docs/sdk/python.mdx @@ -81,7 +81,7 @@ from vectorless import QueryContext, StrategyPreference answer = await engine.query( QueryContext("What is the total revenue?") - .with_doc_id(doc_id) + .with_doc_ids([doc_id]) .with_strategy(StrategyPreference.HYBRID) ) @@ -113,7 +113,7 @@ answer = await engine.query( ```python answer = await engine.query( QueryContext("Explain the architecture") - .with_doc_id(doc_id) + .with_doc_ids([doc_id]) .with_max_tokens(4000) # Max tokens in result .with_include_reasoning(True) # Include reasoning chain .with_depth_limit(10) # Max traversal depth diff --git a/docs/docs/sdk/rust.mdx b/docs/docs/sdk/rust.mdx index 768368b1..f49bac6e 100644 --- a/docs/docs/sdk/rust.mdx +++ b/docs/docs/sdk/rust.mdx @@ -57,7 +57,7 @@ use vectorless::StrategyPreference; let result = engine.query( QueryContext::new("What is the total revenue?") - .with_doc_id(doc_id) + .with_doc_ids(vec![doc_id.to_string()]) .with_strategy(StrategyPreference::ForceHybrid) .with_max_tokens(4000) .with_include_reasoning(true) diff --git a/examples/document_management/main.py b/examples/document_management/main.py index 972d9a44..5d206a89 100644 --- a/examples/document_management/main.py +++ b/examples/document_management/main.py @@ -95,7 +95,7 @@ async def main() -> None: # ---- Query a specific document ---- print("--- query(doc_id_a) ---") answer = await engine.query( - QueryContext("What storage engines does Alpha support?").with_doc_id(doc_id_a) + QueryContext("What storage engines does Alpha support?").with_doc_ids([doc_id_a]) ) item = answer.single() if item: diff --git a/examples/error_handling/main.py 
b/examples/error_handling/main.py index 832ad360..22099e3d 100644 --- a/examples/error_handling/main.py +++ b/examples/error_handling/main.py @@ -54,7 +54,7 @@ async def main() -> None: print("--- Query non-existent document ---") try: await engine.query( - QueryContext("What is this?").with_doc_id("does-not-exist") + QueryContext("What is this?").with_doc_ids(["does-not-exist"]) ) except VectorlessError as e: print(f" Caught VectorlessError:") diff --git a/examples/indexing/main.py b/examples/indexing/main.py index fe2e2824..f2adce3b 100644 --- a/examples/indexing/main.py +++ b/examples/indexing/main.py @@ -92,7 +92,7 @@ async def main(): # --- 5. Query --- print("--- Query ---") answer = await engine.query( - QueryContext("What was the total revenue?").with_doc_id(file_doc_id) + QueryContext("What was the total revenue?").with_doc_ids([file_doc_id]) ) item = answer.single() if item: diff --git a/examples/pdf_indexing/main.py b/examples/pdf_indexing/main.py index d194c474..c1e36727 100644 --- a/examples/pdf_indexing/main.py +++ b/examples/pdf_indexing/main.py @@ -105,7 +105,7 @@ async def main() -> None: print_separator("Query") answer = await engine.query( - QueryContext("What is this document about?").with_doc_id(doc_id) + QueryContext("What is this document about?").with_doc_ids([doc_id]) ) item = answer.single() if item: diff --git a/rust/src/lib.rs b/rust/src/lib.rs index f9912e17..780e1f09 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -57,11 +57,6 @@ pub use client::{ QueryResultItem, }; -// Retrieval types -pub use retrieval::StrategyPreference; -pub use retrieval::pipeline::SearchAlgorithm; -pub use retrieval::QueryComplexity; - // Error types pub use error::{Error, Result}; From ececf5124bd5148f458e4ea7dbb2775cb327f977 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 14:12:11 +0800 Subject: [PATCH 12/21] refactor(python): remove StrategyPreference class and related methods - Remove unused StrategyPreference import from 
lib.rs - Delete entire PyStrategyPreference implementation including all constants (AUTO, KEYWORD, LLM, HYBRID, CROSS_DOCUMENT, PAGE_RANGE) - Remove with_strategy method from PyQueryContext - Remove StrategyPreference class registration in _vectorless module - Update Python bindings to exclude StrategyPreference from exports - Clean up related documentation and type references --- python/src/lib.rs | 88 ----------------------------------- python/vectorless/__init__.py | 2 - 2 files changed, 90 deletions(-) diff --git a/python/src/lib.rs b/python/src/lib.rs index bca68335..c0649759 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -9,7 +9,6 @@ use pyo3_async_runtimes::tokio::future_into_py; use std::sync::Arc; use tokio::runtime::Runtime; -use ::vectorless::StrategyPreference; use ::vectorless::client::{ DocumentFormat, DocumentInfo, Engine, EngineBuilder, FailedItem, IndexContext, IndexItem, IndexMode, IndexOptions, IndexResult, QueryContext, QueryResult, QueryResultItem, @@ -294,83 +293,6 @@ impl PyIndexContext { } } -// ============================================================ -// StrategyPreference -// ============================================================ - -/// Retrieval strategy preference. -/// -/// Controls how the engine searches the document tree. 
-/// -/// ```python -/// from vectorless import QueryContext, StrategyPreference -/// -/// # Force keyword-only (fastest, no LLM calls during search) -/// ctx = QueryContext("revenue").with_doc_ids([doc_id]).with_strategy(StrategyPreference.KEYWORD) -/// -/// # Force LLM-guided navigation (most accurate, uses more tokens) -/// ctx = QueryContext("explain the architecture").with_doc_ids([doc_id]).with_strategy(StrategyPreference.LLM) -/// -/// # Force hybrid (BM25 + LLM refinement) -/// ctx = QueryContext("growth trends").with_doc_ids([doc_id]).with_strategy(StrategyPreference.HYBRID) -/// ``` -#[pyclass(name = "StrategyPreference", skip_from_py_object)] -#[derive(Clone)] -pub struct PyStrategyPreference { - inner: StrategyPreference, -} - -#[pymethods] -impl PyStrategyPreference { - /// Auto-select based on query complexity (default). - #[classattr] - const AUTO: PyStrategyPreference = PyStrategyPreference { - inner: StrategyPreference::Auto, - }; - - /// Force keyword-based strategy (fast, no LLM during search). - #[classattr] - const KEYWORD: PyStrategyPreference = PyStrategyPreference { - inner: StrategyPreference::ForceKeyword, - }; - - /// Force LLM-guided navigation (deep reasoning). - #[classattr] - const LLM: PyStrategyPreference = PyStrategyPreference { - inner: StrategyPreference::ForceLlm, - }; - - /// Force hybrid strategy (BM25 + LLM refinement). - #[classattr] - const HYBRID: PyStrategyPreference = PyStrategyPreference { - inner: StrategyPreference::ForceHybrid, - }; - - /// Force cross-document strategy (multi-document retrieval). - #[classattr] - const CROSS_DOCUMENT: PyStrategyPreference = PyStrategyPreference { - inner: StrategyPreference::ForceCrossDocument, - }; - - /// Force page-range strategy (filter by page range). 
- #[classattr] - const PAGE_RANGE: PyStrategyPreference = PyStrategyPreference { - inner: StrategyPreference::ForcePageRange, - }; - - fn __repr__(&self) -> String { - let name = match self.inner { - StrategyPreference::Auto => "AUTO", - StrategyPreference::ForceKeyword => "KEYWORD", - StrategyPreference::ForceLlm => "LLM", - StrategyPreference::ForceHybrid => "HYBRID", - StrategyPreference::ForceCrossDocument => "CROSS_DOCUMENT", - StrategyPreference::ForcePageRange => "PAGE_RANGE", - }; - format!("StrategyPreference.{}", name) - } -} - // ============================================================ // QueryContext // ============================================================ @@ -434,15 +356,6 @@ impl PyQueryContext { Self { inner: ctx } } - /// Set the retrieval strategy. - /// - /// Args: - /// strategy: A StrategyPreference constant, e.g. StrategyPreference.LLM. - fn with_strategy(&self, strategy: &PyStrategyPreference) -> Self { - let ctx = self.inner.clone().with_strategy(strategy.inner); - Self { inner: ctx } - } - fn __repr__(&self) -> String { "QueryContext(...)".to_string() } @@ -1665,7 +1578,6 @@ fn _vectorless(m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; diff --git a/python/vectorless/__init__.py b/python/vectorless/__init__.py index 7acc99bf..7f6c0604 100644 --- a/python/vectorless/__init__.py +++ b/python/vectorless/__init__.py @@ -30,7 +30,6 @@ QueryContext, QueryResult, QueryResultItem, - StrategyPreference, DocumentInfo, DocumentGraph, DocumentGraphNode, @@ -52,7 +51,6 @@ "QueryContext", "QueryResult", "QueryResultItem", - "StrategyPreference", "DocumentInfo", "DocumentGraph", "DocumentGraphNode", From 3115ac92d4c7ff8788624e07684345337ac94905 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 14:20:00 +0800 Subject: [PATCH 13/21] refactor(engine): add TODO comment for parallelizing document 
queries Add a TODO comment to consider parallelizing queries across documents when multiple document IDs are provided, with a concurrency limit. --- rust/src/client/engine.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/rust/src/client/engine.rs b/rust/src/client/engine.rs index fd701c39..99bcf14c 100644 --- a/rust/src/client/engine.rs +++ b/rust/src/client/engine.rs @@ -435,6 +435,7 @@ impl Engine { let mut items = Vec::with_capacity(doc_ids.len()); let mut failed = Vec::new(); + // TODO: if doc_ids.len() > 1, consider parallelizing queries across documents (with concurrency limit) for doc_id in doc_ids { let (tree, reasoning_index) = match self.get_structure(&doc_id).await { Ok((t, ri)) => (t, ri), From 935a23851c90bca04d2f34eb2deaf74369977761 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 14:27:23 +0800 Subject: [PATCH 14/21] docs(readme): remove workspace parameter from Engine initialization examples - Remove workspace="./data" parameter from Python Engine constructor in README.md - Remove workspace="./data" parameter from Rust EngineBuilder in README.md - Remove workspace="./data" parameter from Python example in python/README.md - Remove workspace="./data" parameter from quick start example in python/__init__.py --- README.md | 2 -- python/README.md | 1 - python/vectorless/__init__.py | 2 +- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index e908924c..d74ea257 100644 --- a/README.md +++ b/README.md @@ -33,7 +33,6 @@ from vectorless import Engine, IndexContext async def main(): # Create engine — api_key and model are required engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", ) @@ -63,7 +62,6 @@ use vectorless::client::{EngineBuilder, IndexContext, QueryContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { let engine = EngineBuilder::new() - .with_workspace("./data") .with_key("sk-...") .with_model("gpt-4o") .build() diff --git a/python/README.md 
b/python/README.md index 97cf79cd..f10eeb6b 100644 --- a/python/README.md +++ b/python/README.md @@ -17,7 +17,6 @@ from vectorless import Engine, IndexContext async def main(): # Create engine — api_key and model are required engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", ) diff --git a/python/vectorless/__init__.py b/python/vectorless/__init__.py index 7f6c0604..d1170ce5 100644 --- a/python/vectorless/__init__.py +++ b/python/vectorless/__init__.py @@ -8,7 +8,7 @@ from vectorless import Engine, IndexContext, QueryContext # Create engine - engine = Engine(workspace="./data", api_key="sk-...", model="gpt-4o") + engine = Engine(api_key="sk-...", model="gpt-4o") # Index a document ctx = IndexContext.from_path("./report.pdf") From 17cdf8bd9808a6bdae14677d90158a083d39c9ba Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 14:34:31 +0800 Subject: [PATCH 15/21] refactor(client): remove unused EventEmitter re-export Removed the EventEmitter re-export from client module as it was no longer needed and caused unnecessary coupling to events module. 
--- rust/src/client/mod.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/rust/src/client/mod.rs b/rust/src/client/mod.rs index f1c80750..ce00ff34 100644 --- a/rust/src/client/mod.rs +++ b/rust/src/client/mod.rs @@ -88,12 +88,6 @@ pub use engine::Engine; pub use index_context::IndexContext; pub use query_context::QueryContext; -// ============================================================ -// Events (re-export from crate::events) -// ============================================================ - -pub use crate::events::EventEmitter; - // ============================================================ // Result & Info Types // ============================================================ From f9ed739796fdf2eec0df1e447babed6732841d56 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 14:44:24 +0800 Subject: [PATCH 16/21] docs: remove deprecated workspace parameter and strategy preferences - Remove deprecated `.with_workspace()` parameter from Engine initialization in all examples and documentation - Remove deprecated `StrategyPreference` enum usage from query contexts - Update all code examples to use default auto-selection behavior instead of explicit strategy selection - Clean up import statements by removing unused StrategyPreference imports - Update retrieval strategies documentation to reflect automatic strategy selection as default behavior BREAKING CHANGE: The workspace parameter and explicit strategy preferences are no longer required as they are now handled automatically. 
--- docs/blog/2026-04-12-welcome/index.mdx | 2 -- docs/docs/examples/batch-indexing.mdx | 2 -- docs/docs/examples/multi-document.mdx | 5 +---- docs/docs/examples/quick-query.mdx | 6 +----- docs/docs/features/cross-document-graph.mdx | 8 +++----- docs/docs/features/pdf-support.mdx | 2 +- docs/docs/getting-started.mdx | 3 --- docs/docs/intro.mdx | 2 -- docs/docs/retrieval/strategies.mdx | 20 ++++++-------------- docs/docs/sdk/python.mdx | 17 +---------------- docs/docs/sdk/rust.mdx | 1 - docs/src/pages/index.tsx | 1 - 12 files changed, 13 insertions(+), 56 deletions(-) diff --git a/docs/blog/2026-04-12-welcome/index.mdx b/docs/blog/2026-04-12-welcome/index.mdx index 61a2c63f..686655cd 100644 --- a/docs/blog/2026-04-12-welcome/index.mdx +++ b/docs/blog/2026-04-12-welcome/index.mdx @@ -29,7 +29,6 @@ from vectorless import Engine, IndexContext async def main(): engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", ) @@ -53,7 +52,6 @@ use vectorless::{EngineBuilder, IndexContext, QueryContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { let engine = EngineBuilder::new() - .with_workspace("./data") .with_key("sk-...") .with_model("gpt-4o") .build() diff --git a/docs/docs/examples/batch-indexing.mdx b/docs/docs/examples/batch-indexing.mdx index ea7b23db..847e738b 100644 --- a/docs/docs/examples/batch-indexing.mdx +++ b/docs/docs/examples/batch-indexing.mdx @@ -14,7 +14,6 @@ from vectorless import Engine, IndexContext, IndexOptions async def main(): engine = Engine( - workspace="./workspace", api_key="sk-...", model="gpt-4o", ) @@ -53,7 +52,6 @@ use vectorless::client::{Engine, EngineBuilder, IndexContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { let engine = EngineBuilder::new() - .with_workspace("./workspace") .with_key("sk-...") .with_model("gpt-4o") .build() diff --git a/docs/docs/examples/multi-document.mdx b/docs/docs/examples/multi-document.mdx index 6b6c404b..11ddc458 100644 --- 
a/docs/docs/examples/multi-document.mdx +++ b/docs/docs/examples/multi-document.mdx @@ -12,12 +12,11 @@ Query across multiple indexed documents using the cross-document strategy with g import asyncio from vectorless import ( Engine, IndexContext, QueryContext, - IndexOptions, StrategyPreference + IndexOptions, ) async def main(): engine = Engine( - workspace="./workspace", api_key="sk-...", model="gpt-4o", ) @@ -44,7 +43,6 @@ async def main(): result = await engine.query( QueryContext("Compare quarterly revenue trends") .with_doc_ids(doc_ids) - .with_strategy(StrategyPreference.CROSS_DOCUMENT) ) for item in result.items: @@ -54,7 +52,6 @@ async def main(): # Or query entire workspace result = await engine.query( QueryContext("What documents discuss risk factors?") - .with_workspace() ) print(f"\nFound in {len(result.items)} document(s)") diff --git a/docs/docs/examples/quick-query.mdx b/docs/docs/examples/quick-query.mdx index 27a6600a..07f66390 100644 --- a/docs/docs/examples/quick-query.mdx +++ b/docs/docs/examples/quick-query.mdx @@ -10,12 +10,11 @@ This example demonstrates the basic index-and-query workflow with both Python an ```python import asyncio -from vectorless import Engine, IndexContext, QueryContext, StrategyPreference +from vectorless import Engine, IndexContext, QueryContext async def main(): # 1. 
Create engine engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", ) @@ -29,7 +28,6 @@ async def main(): answer = await engine.query( QueryContext("revenue") .with_doc_ids([doc_id]) - .with_strategy(StrategyPreference.KEYWORD) ) print(f"Keyword result: {answer.single().content[:200]}") @@ -37,7 +35,6 @@ async def main(): answer = await engine.query( QueryContext("What are the main factors affecting performance?") .with_doc_ids([doc_id]) - .with_strategy(StrategyPreference.HYBRID) ) print(f"Score: {answer.single().score:.2f}") print(f"Hybrid result: {answer.single().content[:200]}") @@ -58,7 +55,6 @@ use vectorless::StrategyPreference; async fn main() -> vectorless::Result<()> { // 1. Create engine let engine = EngineBuilder::new() - .with_workspace("./data") .with_key("sk-...") .with_model("gpt-4o") .build() diff --git a/docs/docs/features/cross-document-graph.mdx b/docs/docs/features/cross-document-graph.mdx index e87c7d29..1ac22fbe 100644 --- a/docs/docs/features/cross-document-graph.mdx +++ b/docs/docs/features/cross-document-graph.mdx @@ -40,15 +40,13 @@ When using the cross-document strategy, the graph boosts scores for connected do 4. 
Re-rank the merged result set ```python -from vectorless import Engine, QueryContext, StrategyPreference +from vectorless import Engine, QueryContext -engine = Engine(workspace="./data", api_key="sk-...", model="gpt-4o") +engine = Engine(api_key="sk-...", model="gpt-4o") # Query across all documents with graph boosting result = await engine.query( - QueryContext("Compare the approaches").with_strategy( - StrategyPreference.CROSS_DOCUMENT - ) + QueryContext("Compare the approaches") ) ``` diff --git a/docs/docs/features/pdf-support.mdx b/docs/docs/features/pdf-support.mdx index 3be02fd0..48682f22 100644 --- a/docs/docs/features/pdf-support.mdx +++ b/docs/docs/features/pdf-support.mdx @@ -11,7 +11,7 @@ Vectorless supports PDF documents with full page-level tracking and hierarchical ```python from vectorless import Engine, IndexContext -engine = Engine(workspace="./data", api_key="sk-...", model="gpt-4o") +engine = Engine(api_key="sk-...", model="gpt-4o") # Index a PDF result = await engine.index(IndexContext.from_path("./report.pdf")) diff --git a/docs/docs/getting-started.mdx b/docs/docs/getting-started.mdx index dd4e00e3..14f541a6 100644 --- a/docs/docs/getting-started.mdx +++ b/docs/docs/getting-started.mdx @@ -26,7 +26,6 @@ from vectorless import Engine, IndexContext, QueryContext async def main(): # Create an engine engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", ) @@ -49,7 +48,6 @@ asyncio.run(main()) ```python engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", endpoint="https://api.your-provider.com/v1", @@ -75,7 +73,6 @@ use vectorless::{EngineBuilder, IndexContext, QueryContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { let engine = EngineBuilder::new() - .with_workspace("./data") .with_key("sk-...") .with_model("gpt-4o") .build() diff --git a/docs/docs/intro.mdx b/docs/docs/intro.mdx index a2f95ba2..88fa23fb 100644 --- a/docs/docs/intro.mdx +++ b/docs/docs/intro.mdx @@ -28,7 +28,6 @@ from 
vectorless import Engine, IndexContext async def main(): engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", ) @@ -55,7 +54,6 @@ use vectorless::{EngineBuilder, IndexContext, QueryContext}; #[tokio::main] async fn main() -> vectorless::Result<()> { let engine = EngineBuilder::new() - .with_workspace("./data") .with_key("sk-...") .with_model("gpt-4o") .build() diff --git a/docs/docs/retrieval/strategies.mdx b/docs/docs/retrieval/strategies.mdx index fbb9c98a..e718220a 100644 --- a/docs/docs/retrieval/strategies.mdx +++ b/docs/docs/retrieval/strategies.mdx @@ -21,11 +21,9 @@ Vectorless provides five retrieval strategies, each designed for different query Fast TF-IDF/BM25 matching against the pre-computed reasoning index. No LLM calls during search. ```python -from vectorless import QueryContext, StrategyPreference +from vectorless import QueryContext -ctx = QueryContext("revenue").with_doc_ids([doc_id]).with_strategy( - StrategyPreference.KEYWORD -) +ctx = QueryContext("revenue").with_doc_ids([doc_id]) ``` Use when: @@ -38,9 +36,7 @@ Use when: LLM-powered tree navigation with full contextual understanding. The LLM sees the table of contents, node summaries, and makes navigation decisions at each level. ```python -ctx = QueryContext("Explain the relationship between architecture and performance").with_doc_ids([doc_id]).with_strategy( - StrategyPreference.LLM -) +ctx = QueryContext("Explain the relationship between architecture and performance").with_doc_ids([doc_id]) ``` Use when: @@ -53,9 +49,7 @@ Use when: Two-phase retrieval: BM25 pre-filter followed by LLM refinement. Combines the speed of keyword matching with the accuracy of LLM reasoning. ```python -ctx = QueryContext("What are the growth trends?").with_doc_ids([doc_id]).with_strategy( - StrategyPreference.HYBRID -) +ctx = QueryContext("What are the growth trends?").with_doc_ids([doc_id]) ``` The recommended default for most queries. 
Fast pre-filtering reduces the number of nodes sent to the LLM, keeping token costs manageable while maintaining high accuracy. @@ -65,16 +59,14 @@ The recommended default for most queries. Fast pre-filtering reduces the number Searches across multiple indexed documents and aggregates results. Uses the cross-document relationship graph for score boosting. ```python -ctx = QueryContext("Compare the architectures").with_strategy( - StrategyPreference.CROSS_DOCUMENT -) +ctx = QueryContext("Compare the architectures") ``` When a high-confidence result is found in one document, neighbor documents in the graph receive a score boost, surfacing related content across the workspace. ## Auto Selection -When using `StrategyPreference.AUTO` (default), the engine analyzes query complexity and selects the appropriate strategy: +By default, the engine analyzes query complexity and automatically selects the appropriate strategy: - Simple keyword queries → Keyword strategy - Complex reasoning queries → Hybrid strategy diff --git a/docs/docs/sdk/python.mdx b/docs/docs/sdk/python.mdx index fe84565b..bc3ae0ba 100644 --- a/docs/docs/sdk/python.mdx +++ b/docs/docs/sdk/python.mdx @@ -20,7 +20,6 @@ The `Engine` is the main entry point. It requires an LLM API key and model name. 
from vectorless import Engine engine = Engine( - workspace="./data", # Local directory for indexed data api_key="sk-...", # LLM API key model="gpt-4o", # LLM model name endpoint=None, # Optional: custom API endpoint @@ -77,12 +76,11 @@ result = await engine.index( ### Single Document ```python -from vectorless import QueryContext, StrategyPreference +from vectorless import QueryContext answer = await engine.query( QueryContext("What is the total revenue?") .with_doc_ids([doc_id]) - .with_strategy(StrategyPreference.HYBRID) ) if answer.single(): @@ -104,7 +102,6 @@ answer = await engine.query( ```python answer = await engine.query( QueryContext("What documents discuss performance?") - .with_workspace() ) ``` @@ -117,7 +114,6 @@ answer = await engine.query( .with_max_tokens(4000) # Max tokens in result .with_include_reasoning(True) # Include reasoning chain .with_depth_limit(10) # Max traversal depth - .with_strategy(StrategyPreference.LLM) ) ``` @@ -162,14 +158,3 @@ if graph: | `include_text` | `bool` | `True` | Include node text | | `generate_ids` | `bool` | `True` | Generate node IDs | | `enable_synonym_expansion` | `bool` | `True` | LLM synonym expansion | - -### StrategyPreference - -| Constant | Description | -|----------|-------------| -| `StrategyPreference.AUTO` | Auto-select based on query complexity | -| `StrategyPreference.KEYWORD` | Fast keyword matching | -| `StrategyPreference.LLM` | LLM-guided navigation | -| `StrategyPreference.HYBRID` | BM25 + LLM refinement | -| `StrategyPreference.CROSS_DOCUMENT` | Multi-document retrieval | -| `StrategyPreference.PAGE_RANGE` | Page-scoped retrieval | diff --git a/docs/docs/sdk/rust.mdx b/docs/docs/sdk/rust.mdx index f49bac6e..2136cafa 100644 --- a/docs/docs/sdk/rust.mdx +++ b/docs/docs/sdk/rust.mdx @@ -19,7 +19,6 @@ vectorless = "0.1" use vectorless::client::{Engine, EngineBuilder}; let engine = EngineBuilder::new() - .with_workspace("./data") .with_key("sk-...") .with_model("gpt-4o") 
.with_endpoint("https://api.openai.com/v1") // optional diff --git a/docs/src/pages/index.tsx b/docs/src/pages/index.tsx index 932e1f0f..3ecc47c7 100644 --- a/docs/src/pages/index.tsx +++ b/docs/src/pages/index.tsx @@ -49,7 +49,6 @@ from vectorless import Engine, IndexContext async def main(): engine = Engine( - workspace="./data", api_key="sk-...", model="gpt-4o", ) From e310de69723788f2413d143b7602870a7513ecf0 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 15:07:48 +0800 Subject: [PATCH 17/21] feat(index): add pipeline checkpoint functionality for resumable indexing - Add workspace_dir field to Engine struct to store workspace root directory - Introduce source_hash in IndexContext to track content changes for checkpoint validation - Implement checkpoint loading and saving mechanism in PipelineOrchestrator - Add checkpoint validation logic using SHA-256 hash of source content - Enable pipeline resumption from saved checkpoints when available - Skip already completed stages during resumed execution - Clear checkpoints upon successful pipeline completion - Support both file path and content hashing for checkpoint validation --- rust/src/client/engine.rs | 8 ++ rust/src/index/pipeline/context.rs | 21 ++++++ rust/src/index/pipeline/orchestrator.rs | 99 +++++++++++++++++++++++++ 3 files changed, 128 insertions(+) diff --git a/rust/src/client/engine.rs b/rust/src/client/engine.rs index 99bcf14c..ebb302f4 100644 --- a/rust/src/client/engine.rs +++ b/rust/src/client/engine.rs @@ -85,6 +85,9 @@ pub struct Engine { /// Workspace client for persistence. workspace: Option, + /// Workspace root directory (for checkpoint path). + workspace_dir: Option, + /// Event emitter. 
events: EventEmitter, @@ -106,6 +109,7 @@ impl Engine { events: EventEmitter, ) -> Result { let config = Arc::new(config); + let workspace_dir = Some(std::path::PathBuf::from(&config.storage.workspace_dir)); // Attach event emitter to indexer let indexer = indexer.with_events(events.clone()); @@ -124,6 +128,7 @@ impl Engine { indexer, retriever, workspace: Some(workspace_client), + workspace_dir, events, metrics_hub: Arc::new(MetricsHub::with_defaults()), }) @@ -615,6 +620,7 @@ impl Engine { format: crate::index::parse::DocumentFormat, ) -> PipelineOptions { use crate::index::SummaryStrategy; + let checkpoint_dir = self.workspace_dir.as_ref().map(|p| p.join("checkpoints")); PipelineOptions { mode: match format { crate::index::parse::DocumentFormat::Markdown => crate::index::IndexMode::Markdown, @@ -627,6 +633,7 @@ impl Engine { SummaryStrategy::none() }, generate_description: options.generate_description, + checkpoint_dir, ..Default::default() } } @@ -754,6 +761,7 @@ impl Clone for Engine { indexer: self.indexer.clone(), retriever: self.retriever.clone(), workspace: self.workspace.clone(), + workspace_dir: self.workspace_dir.clone(), events: self.events.clone(), metrics_hub: Arc::clone(&self.metrics_hub), } diff --git a/rust/src/index/pipeline/context.rs b/rust/src/index/pipeline/context.rs index 21e61ddb..502d241e 100644 --- a/rust/src/index/pipeline/context.rs +++ b/rust/src/index/pipeline/context.rs @@ -227,6 +227,9 @@ pub struct IndexContext { /// Source file path (if from file). pub source_path: Option, + /// SHA-256 hash of source content for checkpoint validation. + pub source_hash: String, + /// Parsed raw nodes. pub raw_nodes: Vec, @@ -268,12 +271,14 @@ pub struct IndexContext { impl IndexContext { /// Create a new context from input. 
pub fn new(input: IndexInput, options: PipelineOptions) -> Self { + let source_hash = Self::compute_source_hash(&input); Self { doc_id: uuid::Uuid::new_v4().to_string(), input, format: DocumentFormat::Markdown, name: String::new(), source_path: None, + source_hash, raw_nodes: Vec::new(), tree: None, options, @@ -289,6 +294,22 @@ impl IndexContext { } } + /// Compute SHA-256 hash of the source (the path for files, the content otherwise). + fn compute_source_hash(input: &IndexInput) -> String { + use sha2::{Sha256, Digest}; + let hash = match input { + IndexInput::File(path) => { + // Hash the file path as proxy — actual content may not be readable yet + // (the parse stage reads it). Note that this only detects a changed path: + // an in-place edit to the same file produces the same hash. + Sha256::digest(path.to_string_lossy().as_bytes()) + } + IndexInput::Content { content, .. } => Sha256::digest(content.as_bytes()), + IndexInput::Bytes { data, .. } => Sha256::digest(data), + }; + format!("{:x}", hash) + } + /// Set the document ID. 
pub fn with_doc_id(mut self, doc_id: impl Into<String>) -> Self { self.doc_id = doc_id.into(); diff --git a/rust/src/index/pipeline/orchestrator.rs b/rust/src/index/pipeline/orchestrator.rs index 3ab3de82..95ace65a 100644 --- a/rust/src/index/pipeline/orchestrator.rs +++ b/rust/src/index/pipeline/orchestrator.rs @@ -31,6 +31,7 @@ use crate::error::Result; use super::super::PipelineOptions; use super::super::stages::IndexStage; +use super::checkpoint::{CheckpointContextData, CheckpointManager, PipelineCheckpoint}; use super::context::{IndexContext, IndexInput, PipelineResult, StageResult}; use super::policy::FailurePolicy; @@ -471,6 +472,40 @@ impl PipelineOrchestrator { ctx = ctx.with_existing_tree(tree); } + // Try to resume from checkpoint + if let Some(ref checkpoint_dir) = ctx.options.checkpoint_dir { + let manager = CheckpointManager::new(checkpoint_dir); + if let Some(checkpoint) = manager.load(&ctx.doc_id) { + if CheckpointManager::is_valid_for_resume( + &checkpoint, + &ctx.source_hash, + ctx.options.processing_version, + &ctx.options.logic_fingerprint().to_string(), + ) { + info!( + "Resuming from checkpoint: {} stages already completed", + checkpoint.completed_stages.len() + ); + // Restore context data from checkpoint + ctx.raw_nodes = checkpoint.context_data.raw_nodes; + if let Some(tree) = checkpoint.context_data.tree { + ctx.tree = Some(tree); + } + ctx.metrics = checkpoint.context_data.metrics; + ctx.page_count = checkpoint.context_data.page_count; + ctx.line_count = checkpoint.context_data.line_count; + ctx.description = checkpoint.context_data.description; + // Mark completed stages as done + for stage_name in &checkpoint.completed_stages { + ctx.stage_results + .insert(stage_name.clone(), StageResult::success(stage_name)); + } + } else { + info!("Checkpoint exists but invalid, starting fresh"); + } + } + } + // Execute each group for (group_idx, group) in groups.iter().enumerate() { if group.parallel { @@ -487,6 +522,19 @@ impl PipelineOrchestrator { } if 
group.parallel && group.stage_indices.len() == 2 { + // Check if all stages in this group are already completed (from checkpoint) + let all_completed = group.stage_indices.iter().all(|&idx| { + let name = self.stages[idx].stage.name(); + ctx.stage_results.contains_key(name) + }); + if all_completed { + let names: Vec<&str> = group.stage_indices.iter() + .map(|&i| self.stages[i].stage.name()) + .collect(); + info!("Skipping already completed parallel group: {:?}", names); + continue; + } + // === Parallel execution for 2-stage groups === // One stage gets the main ctx (mutates tree), the other // gets a cloned snapshot (read-only). Results are merged back. @@ -581,6 +629,13 @@ impl PipelineOrchestrator { for &idx in &group.stage_indices { let entry = &mut self.stages[idx]; let stage_name = entry.stage.name().to_string(); + + // Skip stages already completed (from checkpoint resume) + if ctx.stage_results.contains_key(&stage_name) { + info!("Skipping already completed stage: {}", stage_name); + continue; + } + let policy = entry.stage.failure_policy(); info!( @@ -604,12 +659,17 @@ impl PipelineOrchestrator { ); } else { error!("Stage {} failed, stopping pipeline: {}", stage_name, e); + // Save checkpoint before returning error + Self::save_checkpoint(&ctx); return Err(e); } } } } } + + // Save checkpoint after each group completes + Self::save_checkpoint(&ctx); } let total_duration = total_start.elapsed().as_millis() as u64; @@ -618,10 +678,49 @@ impl PipelineOrchestrator { total_duration, ctx.name ); + // Clear checkpoint on successful completion + if let Some(ref checkpoint_dir) = ctx.options.checkpoint_dir { + let manager = CheckpointManager::new(checkpoint_dir); + if let Err(e) = manager.clear(&ctx.doc_id) { + warn!("Failed to clear checkpoint for {}: {}", ctx.doc_id, e); + } + } + // Finalize result Ok(ctx.finalize()) } + /// Save a checkpoint of the current pipeline state. 
+ fn save_checkpoint(ctx: &IndexContext) { + let checkpoint_dir = match ctx.options.checkpoint_dir { + Some(ref dir) => dir.clone(), + None => return, + }; + + let completed_stages: Vec<String> = ctx.stage_results.keys().cloned().collect(); + let checkpoint = PipelineCheckpoint { + doc_id: ctx.doc_id.clone(), + source_hash: ctx.source_hash.clone(), + processing_version: ctx.options.processing_version, + config_fingerprint: ctx.options.logic_fingerprint().to_string(), + completed_stages, + context_data: CheckpointContextData { + raw_nodes: ctx.raw_nodes.clone(), + tree: ctx.tree.clone(), + metrics: ctx.metrics.clone(), + page_count: ctx.page_count, + line_count: ctx.line_count, + description: ctx.description.clone(), + }, + timestamp: chrono::Utc::now(), + }; + + let manager = CheckpointManager::new(checkpoint_dir); + if let Err(e) = manager.save(&ctx.doc_id, &checkpoint) { + warn!("Failed to save checkpoint for {}: {}", ctx.doc_id, e); + } + } + /// Get list of stage names in execution order. pub fn stage_names(&self) -> Result> { let order = self.resolve_order()?; From 893bda94871e45f6b46f53528bd4614aeacd0c4d Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 15:29:53 +0800 Subject: [PATCH 18/21] feat(python-sdk): update SDK with QueryContext and enhanced indexing features - Change description in pyproject.toml to reflect reasoning-native approach - Rename Python bindings to Vectorless Python SDK - Add QueryContext class for structured querying with method chaining - Update Engine.query() to accept QueryContext instead of raw parameters - Rename IndexContext.from_file() to from_path() and from_files() to from_paths() - Add from_dir() method with recursive parameter option - Add IndexItem class with document metadata properties - Add source_path property to DocumentInfo class - Update Engine initialization to remove workspace parameter - Revise documentation comments to match new API structure - Update quick start example to use new QueryContext 
pattern --- pyproject.toml | 2 +- python/README.md | 65 ++++++++++++++++++++++++----------- python/vectorless/__init__.py | 7 ++-- 3 files changed, 50 insertions(+), 24 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8bc47032..95819b08 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "maturin" [project] name = "vectorless" version = "0.1.6" -description = "Hierarchical document intelligence without vectors" +description = "Reasoning-native document intelligence engine for AI" readme = "README.md" requires-python = ">=3.9" license = { text = "Apache-2.0" } diff --git a/python/README.md b/python/README.md index f10eeb6b..4ca5fa40 100644 --- a/python/README.md +++ b/python/README.md @@ -1,6 +1,6 @@ -# Vectorless Python Bindings +# Vectorless Python SDK -Python bindings for [vectorless](https://github.com/vectorlessflow/vectorless) - a hierarchical document intelligence engine. +Python bindings for [vectorless](https://github.com/vectorlessflow/vectorless) — a reasoning-native document intelligence engine for AI. ## Installation @@ -12,7 +12,7 @@ pip install vectorless ```python import asyncio -from vectorless import Engine, IndexContext +from vectorless import Engine, IndexContext, QueryContext async def main(): # Create engine — api_key and model are required @@ -22,12 +22,14 @@ async def main(): ) # Index a document - result = await engine.index(IndexContext.from_file("./report.pdf")) + result = await engine.index(IndexContext.from_path("./report.pdf")) doc_id = result.doc_id print(f"Indexed: {doc_id}") # Query the document - result = await engine.query(doc_id, "What is the total revenue?") + result = await engine.query( + QueryContext("What is the total revenue?").with_doc_ids([doc_id]) + ) item = result.single() print(f"Answer: {item.content}") print(f"Score: {item.score:.2f}") @@ -52,7 +54,6 @@ The main entry point for vectorless. 
class Engine: def __init__( self, - workspace: str | None = None, config_path: str | None = None, api_key: str | None = None, model: str | None = None, @@ -60,7 +61,7 @@ class Engine: ): ... async def index(self, ctx: IndexContext) -> IndexResult: ... - async def query(self, doc_id: str | list[str], question: str) -> QueryResult: ... + async def query(self, ctx: QueryContext) -> QueryResult: ... async def list(self) -> list[DocumentInfo]: ... async def remove(self, doc_id: str) -> bool: ... async def clear(self) -> int: ... @@ -75,13 +76,13 @@ Context for indexing documents. ```python class IndexContext: @staticmethod - def from_file(path: str, name: str | None = None) -> IndexContext: ... + def from_path(path: str, name: str | None = None) -> IndexContext: ... @staticmethod - def from_files(paths: list[str]) -> IndexContext: ... + def from_paths(paths: list[str]) -> IndexContext: ... @staticmethod - def from_dir(path: str) -> IndexContext: ... + def from_dir(path: str, recursive: bool = True) -> IndexContext: ... @staticmethod def from_content( @@ -105,16 +106,19 @@ class IndexContext: - `"markdown"` / `"md"` - Markdown content - `"pdf"` - PDF documents -### IndexOptions +### QueryContext + +Context for querying documents. ```python -class IndexOptions: - def __init__( - self, - mode: str = "default", - summaries: bool = False, - description: bool = False, - ): ... +class QueryContext: + def __init__(self, query: str): ... + + def with_doc_ids(self, doc_ids: list[str]) -> QueryContext: ... + def with_workspace(self) -> QueryContext: ... + def with_max_tokens(self, tokens: int) -> QueryContext: ... + def with_include_reasoning(self, include: bool) -> QueryContext: ... + def with_depth_limit(self, depth: int) -> QueryContext: ... ``` ### IndexResult @@ -159,6 +163,26 @@ class QueryResultItem: def node_ids(self) -> list[str]: ... ``` +### IndexItem + +```python +class IndexItem: + @property + def doc_id(self) -> str: ... + @property + def name(self) -> str: ... 
+ @property + def format(self) -> str: ... + @property + def description(self) -> str | None: ... + @property + def source_path(self) -> str | None: ... + @property + def page_count(self) -> int | None: ... + @property + def metrics(self) -> IndexMetrics | None: ... +``` + ### DocumentInfo ```python @@ -172,6 +196,8 @@ class DocumentInfo: @property def description(self) -> str | None: ... @property + def source_path(self) -> str | None: ... + @property def page_count(self) -> int | None: ... @property def line_count(self) -> int | None: ... @@ -195,8 +221,7 @@ class VectorlessError(Exception): # Install maturin pip install maturin -# Build and install -cd python +# Build and install (from project root) maturin develop # Run tests diff --git a/python/vectorless/__init__.py b/python/vectorless/__init__.py index d1170ce5..a7f599ad 100644 --- a/python/vectorless/__init__.py +++ b/python/vectorless/__init__.py @@ -1,8 +1,9 @@ """ -Vectorless - Hierarchical document intelligence without vectors. +Vectorless - Reasoning-native document intelligence engine for AI. -A document intelligence engine that uses tree-based understanding -instead of vector databases for accurate, explainable retrieval. +An ultra-performant reasoning-native document intelligence engine +that transforms documents into rich semantic trees and uses LLMs to +intelligently traverse the hierarchy for accurate, explainable retrieval. 
Quick Start: from vectorless import Engine, IndexContext, QueryContext From 97698aba9d60ac9c7a5324c6d15d6e5436d300ab Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 15:33:57 +0800 Subject: [PATCH 19/21] feat(rust): update project description to better reflect AI focus - Change description from "Hierarchical, reasoning-native document intelligence engine" to "Reasoning-native document intelligence engine for AI" - Removes hierarchical aspect and adds clearer AI context --- rust/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/Cargo.toml b/rust/Cargo.toml index e52d251f..d6984f6e 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -3,7 +3,7 @@ name = "vectorless" version.workspace = true edition.workspace = true authors.workspace = true -description = "Hierarchical, reasoning-native document intelligence engine" +description = "Reasoning-native document intelligence engine for AI" license.workspace = true repository.workspace = true homepage.workspace = true From 0582f2b3fa6b86570fb8d6d664b1f8b447e32009 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 15:35:15 +0800 Subject: [PATCH 20/21] chore(release): bump workspace and package versions - Update workspace package version from 0.1.27 to 0.1.28 in Cargo.toml - Update vectorless package version from 0.1.6 to 0.1.7 in pyproject.toml --- Cargo.toml | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ef9c22b5..1a626bab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["rust", "python"] resolver = "2" [workspace.package] -version = "0.1.27" +version = "0.1.28" edition = "2024" authors = ["zTgx "] license = "Apache-2.0" diff --git a/pyproject.toml b/pyproject.toml index 95819b08..f752a6ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "vectorless" -version = "0.1.6" +version = "0.1.7" 
description = "Reasoning-native document intelligence engine for AI" readme = "README.md" requires-python = ">=3.9" From 1ae34d86f9aecfb1a8e371fbd7954f9311650dc9 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Wed, 15 Apr 2026 15:52:58 +0800 Subject: [PATCH 21/21] chore(release): add GitHub Actions workflow for automated releases Add release workflow that triggers on version tags and handles: - Publishing Rust crate to crates.io - Publishing Python package to PyPI using maturin - Creating GitHub Releases with auto-generated notes - Using trusted publishers for PyPI authentication --- .github/workflows/release.yml | 56 +++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) create mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..c7f5c576 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,56 @@ +name: Release + +on: + push: + tags: + - 'v*' + +env: + CARGO_TERM_COLOR: always + +jobs: + # Publish Rust crate to crates.io + publish-crates: + name: Publish to crates.io + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - name: Publish vectorless crate + run: cargo publish -p vectorless + env: + CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }} + + # Publish Python package to PyPI + publish-pypi: + name: Publish to PyPI + runs-on: ubuntu-latest + permissions: + id-token: write # Trusted Publishers + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: pyo3/maturin-action@v1 + with: + command: publish + args: --skip-existing + maturin-version: latest + + # Create GitHub Release + github-release: + name: GitHub Release + runs-on: ubuntu-latest + needs: [publish-crates, publish-pypi] + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + - name: Extract version from tag + id: version + run: echo 
"VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + tag_name: ${{ github.ref_name }} + name: Release ${{ steps.version.outputs.VERSION }} + generate_release_notes: true