diff --git a/README.md b/README.md index 8e2fd6a..f9f9afc 100644 --- a/README.md +++ b/README.md @@ -168,6 +168,8 @@ agent → adapter → daemon spawn chain. All are optional. | `CONTINUUM_PRELOAD_MODEL` | unset | Set to `1`, `true`, `yes`, or `on` to load semantic search at daemon startup instead of lazily. | | `CONTINUUM_IDLE_MINUTES` | `30` | Idle minutes before the daemon exits (`0` = never). | | `CONTINUUM_MAX_FILE_KIB` | `2048` | Largest file size indexed, in KiB. | +| `CONTINUUM_MAX_FILES` | `50000` | Files indexed per pass (`0` = unlimited). Caps memory on huge trees. | +| `CONTINUUM_ALLOW_LARGE_ROOT` | unset | Set truthy to auto-index even when the workspace root is a drive root or home directory. | | `CONTINUUM_DEBOUNCE_MS` | `300` | Filesystem-watch debounce window. | ## MCP tools diff --git a/crates/continuum-daemon/src/main.rs b/crates/continuum-daemon/src/main.rs index 34bf7de..637a571 100644 --- a/crates/continuum-daemon/src/main.rs +++ b/crates/continuum-daemon/src/main.rs @@ -6,7 +6,7 @@ mod lifecycle; mod mcp; mod tools; -use std::path::PathBuf; +use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicU8, AtomicUsize, Ordering}; use std::sync::Arc; use std::time::{Duration, Instant}; @@ -92,22 +92,35 @@ async fn main() -> Result<()> { // blocks startup on a model download. let semantic = Arc::new(continuum_search::SemanticEngine::new()); - // Index in the background so the daemon serves immediately; navigation - // tools return progressively richer results as the scan completes. 
- { - let graph = graph.clone(); - let semantic = semantic.clone(); - let root = ws.root_path(); - let ws_snapshot = ws.clone(); - tokio::spawn(async move { - let n = continuum_indexer::index_workspace(&root, graph.clone(), semantic).await; - tracing::info!("initial index complete: {n} files"); - ws_snapshot.write_snapshot(&graph.read().await.snapshot()); - }); - } - let _watcher = - continuum_indexer::start_watcher(ws.root_path(), graph.clone(), semantic.clone()) - .map_err(|e| anyhow::anyhow!("start file watcher: {e}"))?; + // A workspace rooted at a drive/filesystem root or the user's home directory + // would walk an enormous tree and exhaust memory. Refuse to auto-index (and + // to recursively watch) such a root; the daemon still serves memory and + // on-demand text search. CONTINUUM_ALLOW_LARGE_ROOT=1 overrides. + let _watcher = if let Some(reason) = unsafe_index_root(&ws.root_path()) { + tracing::warn!( + "skipping automatic indexing: {reason}. Open a project subdirectory, \ + or set CONTINUUM_ALLOW_LARGE_ROOT=1 to override." + ); + None + } else { + // Index in the background so the daemon serves immediately; navigation + // tools return progressively richer results as the scan completes. 
+ { + let graph = graph.clone(); + let semantic = semantic.clone(); + let root = ws.root_path(); + let ws_snapshot = ws.clone(); + tokio::spawn(async move { + let n = continuum_indexer::index_workspace(&root, graph.clone(), semantic).await; + tracing::info!("initial index complete: {n} files"); + ws_snapshot.write_snapshot(&graph.read().await.snapshot()); + }); + } + Some( + continuum_indexer::start_watcher(ws.root_path(), graph.clone(), semantic.clone()) + .map_err(|e| anyhow::anyhow!("start file watcher: {e}"))?, + ) + }; let listener = TcpListener::bind("127.0.0.1:0") .await @@ -306,10 +319,41 @@ pub(crate) fn maybe_start_semantic_load(daemon: &Arc) { ); } -fn semantic_preload_enabled() -> bool { - std::env::var("CONTINUUM_PRELOAD_MODEL") +/// A human-readable reason if `root` is too broad to auto-index — a filesystem +/// root or the user's home directory — or `None` when it is safe (or the +/// `CONTINUUM_ALLOW_LARGE_ROOT` escape hatch is set). `root` is already +/// canonicalized by [`Workspace::resolve`], as is the home directory here, so +/// the comparison is exact. +fn unsafe_index_root(root: &Path) -> Option<String> { + if env_flag("CONTINUUM_ALLOW_LARGE_ROOT") { + return None; + } + if root.parent().is_none() { + return Some(format!("{} is a filesystem root", root.display())); + } + if home_dir().is_some_and(|home| home == root) { + return Some(format!("{} is your home directory", root.display())); + } + None +} + +/// The user's home directory, canonicalized to match a resolved workspace root. +fn home_dir() -> Option<PathBuf> { + std::env::var_os("USERPROFILE") + .or_else(|| std::env::var_os("HOME")) + .map(PathBuf::from) + .and_then(|p| p.canonicalize().ok()) +} + +/// Whether an environment variable is set to a truthy value. 
+fn env_flag(name: &str) -> bool { + std::env::var(name) .ok() - .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "yes" | "on")) + .is_some_and(|v| matches!(v.as_str(), "1" | "true" | "yes" | "on")) +} + +fn semantic_preload_enabled() -> bool { + env_flag("CONTINUUM_PRELOAD_MODEL") } /// Validate the Continuum handshake, then serve MCP for the connection's life. diff --git a/crates/continuum-indexer/src/lib.rs b/crates/continuum-indexer/src/lib.rs index b32a6af..2448e40 100644 --- a/crates/continuum-indexer/src/lib.rs +++ b/crates/continuum-indexer/src/lib.rs @@ -20,15 +20,49 @@ pub use textsearch::search_text; pub use watcher::start_watcher; /// Directory names never descended into during indexing. +/// +/// Covers version-control metadata, dependency stores, build output, language +/// caches, and the bulky home-directory trees (`AppData`, `Library`) that a +/// misaimed workspace root would otherwise drag in. These are the safety net +/// over `.gitignore`/`.ignore`, which many of these directories lack. const SKIP_DIRS: &[&str] = &[ + // Version control ".git", - "target", - "node_modules", + ".svn", + ".hg", + // Continuum's own state ".continuum", + // Dependency stores + "node_modules", + "vendor", + "Pods", + // Build output + "target", "dist", "build", + "out", + // Python environments and caches ".venv", + "venv", "__pycache__", + ".mypy_cache", + ".pytest_cache", + ".tox", + // JS/TS framework output + ".next", + ".nuxt", + "coverage", + // Language / tool caches + ".cache", + ".cargo", + ".rustup", + ".npm", + ".gradle", + ".m2", + // Editor / OS home-directory bloat + ".idea", + "AppData", + "Library", ]; /// Full one-shot index of a workspace. Returns the number of files indexed. @@ -186,15 +220,41 @@ pub(crate) fn is_skipped_path(root: &Path, path: &Path) -> bool { .any(|name| SKIP_DIRS.contains(&name)) } +/// Hard ceiling on files pulled into one index pass — override with +/// `CONTINUUM_MAX_FILES` (`0` disables the cap). 
Keeps a workspace rooted at a +/// huge tree (a home directory, a drive root) from exhausting memory: every +/// indexed file's symbols, BM25 tokens, and embeddings live in RAM. +static MAX_FILES: std::sync::LazyLock<usize> = std::sync::LazyLock::new(|| { + match std::env::var("CONTINUUM_MAX_FILES") + .ok() + .and_then(|v| v.parse().ok()) + { + Some(0) => usize::MAX, + Some(n) => n, + None => 50_000, + } +}); + fn collect_source_files(root: &Path) -> Vec<PathBuf> { - ignore::WalkBuilder::new(root) + let cap = *MAX_FILES; + let mut files = Vec::new(); + for entry in ignore::WalkBuilder::new(root) .require_git(false) .filter_entry(|entry| !is_skipped_dir(entry)) .build() .filter_map(|entry| entry.ok()) .filter(|entry| entry.file_type().is_some_and(|t| t.is_file())) - .map(|entry| entry.path().to_path_buf()) - .collect() + { + if files.len() >= cap { + tracing::warn!( + "file cap ({cap}) reached; indexing truncated — narrow the workspace \ + or raise CONTINUUM_MAX_FILES (0 disables the cap)" + ); + break; + } + files.push(entry.path().to_path_buf()); + } + files } #[cfg(test)] @@ -213,6 +273,14 @@ mod tests { root, Path::new("/workspace/pkg/node_modules/a.js") )); + assert!(is_skipped_path( + root, + Path::new("/workspace/.venv/lib/site-packages/a.py") + )); + assert!(is_skipped_path( + root, + Path::new("/workspace/AppData/Local/cache/a.ts") + )); assert!(!is_skipped_path(root, Path::new("/workspace/src/lib.rs"))); } }