diff --git a/Cargo.lock b/Cargo.lock index 0e6c873d..741cc2e6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -484,6 +484,7 @@ dependencies = [ "itoa", "rustversion", "ryu", + "serde", "static_assertions", ] diff --git a/crates/ecp-cli/src/commands/group/matching.rs b/crates/ecp-cli/src/commands/group/matching.rs index c98ceb2a..044b3304 100644 --- a/crates/ecp-cli/src/commands/group/matching.rs +++ b/crates/ecp-cli/src/commands/group/matching.rs @@ -166,6 +166,26 @@ fn is_excluded(contract_id: &str, cfg: &GroupConfig) -> bool { false } +/// Normalize a `contract_id` into a BM25-safe token stream for BOTH index and +/// query sides. Beyond `:` (field-qualifier syntax), Tantivy's query parser +/// treats `+ - && || ! ( ) { } [ ] ^ " ~ * ? \ /` as operators — so a REST +/// path-param like `http:GET:/users/{id}` parsed verbatim raises a SyntaxError, +/// and `bm25_search` silently returns no match for EVERY path-param contract. +/// Mapping any non-alphanumeric (keeping `_`) to a space matches the default +/// tokenizer's word splitting and keeps the two sides symmetric. +fn normalize_for_bm25(contract_id: &str) -> String { + contract_id + .chars() + .map(|c| { + if c.is_alphanumeric() || c == '_' { + c + } else { + ' ' + } + }) + .collect() +} + fn build_bm25_schema() -> (Schema, tantivy::schema::Field, tantivy::schema::Field) { let mut builder = Schema::builder(); // `contract_id` is queried, never retrieved from the doc store (results read @@ -196,9 +216,7 @@ fn build_bm25_index(index_dir: &Path, kept: &[&StoredContract]) -> Result<(), St .map_err(|e| format!("acquire contracts writer: {e}"))?; for c in kept { - // Escape `:` → space so Tantivy's query parser doesn't treat - // `http:GET:/users` as field-qualified sub-queries. - let escaped = c.inner.contract_id.replace(':', " "); + let escaped = normalize_for_bm25(&c.inner.contract_id); let mut doc = tantivy::TantivyDocument::default(); doc.add_text(contract_id_field, &escaped); doc.add_text(uid_field, &c.inner.symbol_uid); @@ -247,8 +265,8 @@ fn bm25_search( contract_id: &str, limit: usize, ) -> Vec<(String, f32)> { - // Escape `:` same way as at index time. - let escaped = contract_id.replace(':', " "); + // Normalize the same way as at index time. + let escaped = normalize_for_bm25(contract_id); let Ok(query) = parser.parse_query(&escaped) else { return Vec::new(); }; @@ -274,3 +292,90 @@ fn bm25_search( } results } + +#[cfg(test)] +mod tests { + use super::normalize_for_bm25; + use tantivy::query::QueryParser; + use tantivy::schema::{Schema, TEXT}; + use tantivy::Index; + + fn parses(raw: &str) -> bool { + let mut b = Schema::builder(); + let f = b.add_text_field("contract_id", TEXT); + let index = Index::create_in_ram(b.build()); + let parser = QueryParser::for_index(&index, vec![f]); + parser.parse_query(&normalize_for_bm25(raw)).is_ok() + } + + #[test] + fn normalize_for_bm25_path_param_contract_parses() { + // Regression: `{id}` is Tantivy query syntax; verbatim parse raised a + // SyntaxError so every REST path-param contract silently matched nothing. + assert!(parses("http:GET:/users/{id}")); + assert!(parses( + "http:DELETE:/api/v2/orders/{orderId}/items/{itemId}" + )); + assert!(parses("grpc:UserService:GetUser")); + } + + #[test] + fn normalize_for_bm25_strips_query_operators_to_spaces() { + // All Tantivy operator chars map to spaces; alphanumerics and `_` survive. + let n = normalize_for_bm25("http:GET:/users/{id}?a=1&b=2"); + assert_eq!(n, "http GET users id a 1 b 2"); + assert!(!n.contains('{') && !n.contains('}') && !n.contains(':') && !n.contains('/')); + } + + /// End-to-end: a path-param consumer must BM25-match its same-id provider in + /// another repo. Before the normalize fix the `{id}` raised a parse error and + /// this returned no match — so this exercises the real index→query pipeline, + /// which is what "index/query symmetry" actually has to guarantee. + #[test] + fn bm25_matches_path_param_contract_across_repos() { + use super::{build_bm25_index, open_bm25_searcher}; + use crate::commands::group::types::{ + ContractRole, ContractType, ExtractedContract, StoredContract, SymbolRef, + }; + + fn contract(repo: &str, role: ContractRole, uid: &str) -> StoredContract { + StoredContract { + repo: repo.to_string(), + inner: ExtractedContract { + contract_id: "http:GET:/users/{id}".to_string(), + contract_type: ContractType::Http, + role, + symbol_uid: uid.to_string(), + symbol_ref: SymbolRef { + file_path: format!("{repo}/api.rs"), + name: "handler".to_string(), + }, + confidence: 1.0, + service: None, + meta: vec![], + }, + } + } + + let provider = contract("svc-a", ContractRole::Provider, "uid-provider"); + let consumer = contract("svc-b", ContractRole::Consumer, "uid-consumer"); + let kept = [&provider, &consumer]; + + let dir = tempfile::tempdir().unwrap(); + let index_dir = dir.path().join("contracts_index"); + build_bm25_index(&index_dir, &kept).unwrap(); + let (searcher, parser, uid_field) = open_bm25_searcher(&index_dir).unwrap(); + + let hits = super::bm25_search( + &searcher, + &parser, + uid_field, + &consumer.inner.contract_id, + 16, + ); + assert!( + hits.iter().any(|(uid, _)| uid == "uid-provider"), + "path-param consumer should BM25-match the provider; got {hits:?}" + ); + } +} diff --git a/crates/ecp-core/Cargo.toml b/crates/ecp-core/Cargo.toml index d87cbfdc..6a7ce2a7 100644 --- a/crates/ecp-core/Cargo.toml +++ b/crates/ecp-core/Cargo.toml @@ -34,7 +34,7 @@ phf = { version = "0.14", features = ["macros"] } xxhash-rust = { version = "0.8", features = ["xxh3"] } memmap2 = "0.9.10" smallvec = "1.13" -compact_str = "0.9" +compact_str = { version = "0.9", features = ["serde"] } [target.'cfg(unix)'.dependencies] nix = { version = "0.31", features = ["process", "signal"] } diff --git a/crates/ecp-core/src/cypher/executor.rs b/crates/ecp-core/src/cypher/executor.rs index 80e1f2ca..ee040a87 100644 --- a/crates/ecp-core/src/cypher/executor.rs +++ b/crates/ecp-core/src/cypher/executor.rs @@ -701,7 +701,7 @@ fn eval_return_expr( Ok(v.clone()) } else if let Some(&idx) = b.node_vars.get(var) { Ok(match graph.mnode(idx) { - Some(m) => Value::Str(m.name(&graph).to_string()), + Some(m) => Value::Str(m.name(&graph).into()), None => Value::Null, }) } else { @@ -744,8 +744,8 @@ fn eval_return_item_rich( }; return Value::NodeRef { idx, - name: m.name(&graph).to_string(), - kind: m.kind().as_str().to_string(), + name: m.name(&graph).into(), + kind: m.kind().as_str().into(), file_path: m.file_path(&graph).unwrap_or("").to_string(), }; } @@ -949,10 +949,10 @@ fn eval_scalar_funcall(name: &str, args: &[Expr], b: &Binding, graph: Gv<'_>) -> return Value::Null; }; if let Some(e) = graph.overlay_edge(eidx) { - return Value::Str(e.rel_type.as_str().to_string()); + return Value::Str(e.rel_type.as_str().into()); } let e = &graph.edges[eidx as usize]; - Value::Str(RelType::from(&e.rel_type).as_str().to_string()) + Value::Str(RelType::from(&e.rel_type).as_str().into()) } "ID" => { // id(n) — args[0] must be a Var bound to a node. @@ -973,7 +973,7 @@ fn eval_scalar_funcall(name: &str, args: &[Expr], b: &Binding, graph: Gv<'_>) -> return Value::Null; }; match graph.mnode(idx) { - Some(m) => Value::List(vec![Value::Str(m.kind().as_str().to_string())]), + Some(m) => Value::List(vec![Value::Str(m.kind().as_str().into())]), None => Value::Null, } } @@ -1448,9 +1448,9 @@ fn node_prop_no_cache(node_idx: u32, prop: &str, graph: Gv<'_>) -> Value { // symbol has no metas yet and takes the sparse defaults. let fm = m.fm_idx(node_idx); match prop { - "filePath" => Value::Str(m.file_path(&graph).unwrap_or("").to_string()), + "filePath" => Value::Str(m.file_path(&graph).unwrap_or("").into()), "ownerClass" => match m.owner_class(&graph) { - Some(oc) => Value::Str(oc.to_string()), + Some(oc) => Value::Str(oc.into()), None => Value::Null, }, "line" | "startLine" => Value::Int(m.start_line() as i64), @@ -1610,7 +1610,7 @@ fn eval_expr( } if let Some(&idx) = b.node_vars.get(var) { return Ok(match graph.mnode(idx) { - Some(m) => Value::Str(m.name(&graph).to_string()), + Some(m) => Value::Str(m.name(&graph).into()), None => Value::Null, }); } @@ -2047,7 +2047,7 @@ fn lit_to_value(l: &Literal) -> Value { Literal::Bool(b) => Value::Bool(*b), Literal::Int(i) => Value::Int(*i), Literal::Float(f) => Value::Float(*f), - Literal::Str(s) => Value::Str(s.clone()), + Literal::Str(s) => Value::Str(s.as_str().into()), Literal::List(xs) => Value::List(xs.iter().map(lit_to_value).collect()), } } @@ -2108,8 +2108,8 @@ fn prop_value( reason, } => match prop { "confidence" => Value::Float(*confidence as f64), - "reason" => Value::Str(reason.clone()), - "rel_type" => Value::Str(format!("{rel_type:?}")), + "reason" => Value::Str(reason.as_str().into()), + "rel_type" => Value::Str(format!("{rel_type:?}").into()), _ => Value::Null, }, // Scalar: only bare var reference makes sense; . returns Null. @@ -2129,16 +2129,16 @@ fn prop_value( if let Some(e) = graph.overlay_edge(edge_idx) { return match prop { "confidence" => Value::Float(e.confidence as f64), - "reason" => Value::Str("l1-overlay".to_string()), - "rel_type" => Value::Str(e.rel_type.as_str().to_string()), + "reason" => Value::Str("l1-overlay".into()), + "rel_type" => Value::Str(e.rel_type.as_str().into()), _ => Value::Null, }; } let e = &graph.edges[edge_idx as usize]; return match prop { "confidence" => Value::Float(e.confidence.to_native() as f64), - "reason" => Value::Str(e.reason.resolve(&graph.string_pool).to_string()), - "rel_type" => Value::Str(RelType::from(&e.rel_type).as_str().to_string()), + "reason" => Value::Str(e.reason.resolve(&graph.string_pool).into()), + "rel_type" => Value::Str(RelType::from(&e.rel_type).as_str().into()), _ => Value::Null, }; } @@ -2159,19 +2159,19 @@ fn node_prop_value(node_idx: u32, prop: &str, graph: Gv<'_>, cache: &mut Content }; let fm = m.fm_idx(node_idx); match prop { - "name" => Value::Str(m.name(&graph).to_string()), + "name" => Value::Str(m.name(&graph).into()), // u64 uid stored as i64 bits — no allocation per row. "uid" => Value::Int(m.uid() as i64), "ownerClass" => match m.owner_class(&graph) { - Some(oc) => Value::Str(oc.to_string()), + Some(oc) => Value::Str(oc.into()), None => Value::Null, }, - "kind" => Value::Str(m.kind().as_str().to_string()), + "kind" => Value::Str(m.kind().as_str().into()), // 1-based, matching impact/find/inspect output (see Node::start_line). // `span.0` is the raw 0-based tree-sitter row; never expose it as `line`. "line" | "startLine" => Value::Int(m.start_line() as i64), "endLine" => Value::Int(m.end_line() as i64), - "filePath" => Value::Str(m.file_path(&graph).unwrap_or("").to_string()), + "filePath" => Value::Str(m.file_path(&graph).unwrap_or("").into()), "content" => { let slice = match &m { MNode::Base(n) => { @@ -2206,7 +2206,7 @@ fn node_prop_value(node_idx: u32, prop: &str, graph: Gv<'_>, cache: &mut Content .unwrap_or_default() } }; - Value::Str(slice) + Value::Str(slice.into()) } // ── FunctionMeta flag properties ──────────────────────────────────── // FunctionMeta is sparse (only Function/Method/Constructor nodes). @@ -2293,7 +2293,7 @@ fn archived_fm_decorators(graph: Gv<'_>, node_idx: u32) -> Value { .map(|d| { let s = d.resolve(&graph.string_pool); let normalized = s.strip_prefix('@').unwrap_or(s); - Value::Str(normalized.to_string()) + Value::Str(normalized.into()) }) .collect(), Err(_) => vec![], @@ -3778,7 +3778,7 @@ mod tests { assert_eq!(result.columns, vec!["a.content"]); assert_eq!(result.rows.len(), 1); let content = match &result.rows[0][0] { - Value::Str(s) => s.clone(), + Value::Str(s) => s.to_string(), other => panic!("expected Str, got {other:?}"), }; // span (0,0,2,1) covers "function hello() {\n return 42;\n}" @@ -3971,7 +3971,7 @@ mod tests { .rows .iter() .map(|row| match &row[0] { - Value::Str(s) => s.clone(), + Value::Str(s) => s.to_string(), other => panic!("expected Str, got {other:?}"), }) .collect(); @@ -3995,7 +3995,7 @@ mod tests { .rows .iter() .map(|row| match &row[0] { - Value::Str(s) => s.clone(), + Value::Str(s) => s.to_string(), other => panic!("expected Str, got {other:?}"), }) .collect(); diff --git a/crates/ecp-core/src/cypher/value.rs b/crates/ecp-core/src/cypher/value.rs index 82895f99..8b31ab64 100644 --- a/crates/ecp-core/src/cypher/value.rs +++ b/crates/ecp-core/src/cypher/value.rs @@ -1,4 +1,5 @@ use crate::graph::RelType; +use compact_str::CompactString; #[derive(Debug, Clone, PartialEq)] pub enum Value { @@ -6,14 +7,20 @@ pub enum Value { Bool(bool), Int(i64), Float(f64), - Str(String), + /// `CompactString` inlines values ≤24 bytes with no heap allocation. The + /// dominant result strings — symbol `kind` ("Function", "Class"…) and + /// `rel_type` ("Calls", "Implements"…) — always fit, so a query returning + /// N edges drops N heap allocs for those columns. + Str(CompactString), List(Vec), /// Reference to a graph node. CLI side resolves `.name`/`.kind`/`.filePath` - /// for human-readable serialization. + /// for human-readable serialization. `name`/`kind` inline as `CompactString` + /// (kind always; realistic symbol names usually); `file_path` stays `String` + /// since paths routinely exceed the inline budget. NodeRef { idx: u32, - name: String, - kind: String, + name: CompactString, + kind: CompactString, file_path: String, }, EdgeRef { diff --git a/crates/ecp-core/tests/cypher_where_label_test.rs b/crates/ecp-core/tests/cypher_where_label_test.rs index f1b259f3..ccfe23bc 100644 --- a/crates/ecp-core/tests/cypher_where_label_test.rs +++ b/crates/ecp-core/tests/cypher_where_label_test.rs @@ -95,7 +95,7 @@ fn names_returned(cypher_query: &str) -> Vec { .rows .iter() .map(|r| match &r[0] { - cypher::Value::Str(s) => s.clone(), + cypher::Value::Str(s) => s.to_string(), v => panic!("expected Str, got {v:?}"), }) .collect()