Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

115 changes: 110 additions & 5 deletions crates/ecp-cli/src/commands/group/matching.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,26 @@ fn is_excluded(contract_id: &str, cfg: &GroupConfig) -> bool {
false
}

/// Normalize a `contract_id` into a BM25-safe token stream for BOTH index and
/// query sides. Beyond `:` (field-qualifier syntax), Tantivy's query parser
/// treats `+ - && || ! ( ) { } [ ] ^ " ~ * ? \ /` as operators — so a REST
/// path-param like `http:GET:/users/{id}` parsed verbatim raises a SyntaxError,
/// and `bm25_search` silently returns no match for EVERY path-param contract.
/// Mapping any non-alphanumeric (keeping `_`) to a space matches the default
/// tokenizer's word splitting and keeps the two sides symmetric.
fn normalize_for_bm25(contract_id: &str) -> String {
contract_id
.chars()
.map(|c| {
if c.is_alphanumeric() || c == '_' {
c
} else {
' '
}
})
.collect()
}

fn build_bm25_schema() -> (Schema, tantivy::schema::Field, tantivy::schema::Field) {
let mut builder = Schema::builder();
// `contract_id` is queried, never retrieved from the doc store (results read
Expand Down Expand Up @@ -196,9 +216,7 @@ fn build_bm25_index(index_dir: &Path, kept: &[&StoredContract]) -> Result<(), St
.map_err(|e| format!("acquire contracts writer: {e}"))?;

for c in kept {
// Escape `:` → space so Tantivy's query parser doesn't treat
// `http:GET:/users` as field-qualified sub-queries.
let escaped = c.inner.contract_id.replace(':', " ");
let escaped = normalize_for_bm25(&c.inner.contract_id);
let mut doc = tantivy::TantivyDocument::default();
doc.add_text(contract_id_field, &escaped);
doc.add_text(uid_field, &c.inner.symbol_uid);
Expand Down Expand Up @@ -247,8 +265,8 @@ fn bm25_search(
contract_id: &str,
limit: usize,
) -> Vec<(String, f32)> {
// Escape `:` same way as at index time.
let escaped = contract_id.replace(':', " ");
// Normalize the same way as at index time.
let escaped = normalize_for_bm25(contract_id);
let Ok(query) = parser.parse_query(&escaped) else {
return Vec::new();
};
Expand All @@ -274,3 +292,90 @@ fn bm25_search(
}
results
}

#[cfg(test)]
mod tests {
use super::normalize_for_bm25;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn parses(raw: &str) -> bool {
let mut b = Schema::builder();
let f = b.add_text_field("contract_id", TEXT);
let index = Index::create_in_ram(b.build());
let parser = QueryParser::for_index(&index, vec![f]);
parser.parse_query(&normalize_for_bm25(raw)).is_ok()
}

#[test]
fn normalize_for_bm25_path_param_contract_parses() {
// Regression: `{id}` is Tantivy query syntax; verbatim parse raised a
// SyntaxError so every REST path-param contract silently matched nothing.
assert!(parses("http:GET:/users/{id}"));
assert!(parses(
"http:DELETE:/api/v2/orders/{orderId}/items/{itemId}"
));
assert!(parses("grpc:UserService:GetUser"));
}

#[test]
fn normalize_for_bm25_strips_query_operators_to_spaces() {
// All Tantivy operator chars map to spaces; alphanumerics and `_` survive.
let n = normalize_for_bm25("http:GET:/users/{id}?a=1&b=2");
assert_eq!(n, "http GET users id a 1 b 2");
assert!(!n.contains('{') && !n.contains('}') && !n.contains(':') && !n.contains('/'));
}

/// End-to-end: a path-param consumer must BM25-match its same-id provider in
/// another repo. Before the normalize fix the `{id}` raised a parse error and
/// this returned no match — so this exercises the real index→query pipeline,
/// which is what "index/query symmetry" actually has to guarantee.
#[test]
fn bm25_matches_path_param_contract_across_repos() {
use super::{build_bm25_index, open_bm25_searcher};
use crate::commands::group::types::{
ContractRole, ContractType, ExtractedContract, StoredContract, SymbolRef,
};

fn contract(repo: &str, role: ContractRole, uid: &str) -> StoredContract {
StoredContract {
repo: repo.to_string(),
inner: ExtractedContract {
contract_id: "http:GET:/users/{id}".to_string(),
contract_type: ContractType::Http,
role,
symbol_uid: uid.to_string(),
symbol_ref: SymbolRef {
file_path: format!("{repo}/api.rs"),
name: "handler".to_string(),
},
confidence: 1.0,
service: None,
meta: vec![],
},
}
}

let provider = contract("svc-a", ContractRole::Provider, "uid-provider");
let consumer = contract("svc-b", ContractRole::Consumer, "uid-consumer");
let kept = [&provider, &consumer];

let dir = tempfile::tempdir().unwrap();
let index_dir = dir.path().join("contracts_index");
build_bm25_index(&index_dir, &kept).unwrap();
let (searcher, parser, uid_field) = open_bm25_searcher(&index_dir).unwrap();

let hits = super::bm25_search(
&searcher,
&parser,
uid_field,
&consumer.inner.contract_id,
16,
);
assert!(
hits.iter().any(|(uid, _)| uid == "uid-provider"),
"path-param consumer should BM25-match the provider; got {hits:?}"
);
}
}
2 changes: 1 addition & 1 deletion crates/ecp-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ phf = { version = "0.14", features = ["macros"] }
xxhash-rust = { version = "0.8", features = ["xxh3"] }
memmap2 = "0.9.10"
smallvec = "1.13"
compact_str = "0.9"
compact_str = { version = "0.9", features = ["serde"] }

[target.'cfg(unix)'.dependencies]
nix = { version = "0.31", features = ["process", "signal"] }
Expand Down
50 changes: 25 additions & 25 deletions crates/ecp-core/src/cypher/executor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,7 @@ fn eval_return_expr(
Ok(v.clone())
} else if let Some(&idx) = b.node_vars.get(var) {
Ok(match graph.mnode(idx) {
Some(m) => Value::Str(m.name(&graph).to_string()),
Some(m) => Value::Str(m.name(&graph).into()),
None => Value::Null,
})
} else {
Expand Down Expand Up @@ -744,8 +744,8 @@ fn eval_return_item_rich(
};
return Value::NodeRef {
idx,
name: m.name(&graph).to_string(),
kind: m.kind().as_str().to_string(),
name: m.name(&graph).into(),
kind: m.kind().as_str().into(),
file_path: m.file_path(&graph).unwrap_or("").to_string(),
};
}
Expand Down Expand Up @@ -949,10 +949,10 @@ fn eval_scalar_funcall(name: &str, args: &[Expr], b: &Binding, graph: Gv<'_>) ->
return Value::Null;
};
if let Some(e) = graph.overlay_edge(eidx) {
return Value::Str(e.rel_type.as_str().to_string());
return Value::Str(e.rel_type.as_str().into());
}
let e = &graph.edges[eidx as usize];
Value::Str(RelType::from(&e.rel_type).as_str().to_string())
Value::Str(RelType::from(&e.rel_type).as_str().into())
}
"ID" => {
// id(n) — args[0] must be a Var bound to a node.
Expand All @@ -973,7 +973,7 @@ fn eval_scalar_funcall(name: &str, args: &[Expr], b: &Binding, graph: Gv<'_>) ->
return Value::Null;
};
match graph.mnode(idx) {
Some(m) => Value::List(vec![Value::Str(m.kind().as_str().to_string())]),
Some(m) => Value::List(vec![Value::Str(m.kind().as_str().into())]),
None => Value::Null,
}
}
Expand Down Expand Up @@ -1448,9 +1448,9 @@ fn node_prop_no_cache(node_idx: u32, prop: &str, graph: Gv<'_>) -> Value {
// symbol has no metas yet and takes the sparse defaults.
let fm = m.fm_idx(node_idx);
match prop {
"filePath" => Value::Str(m.file_path(&graph).unwrap_or("").to_string()),
"filePath" => Value::Str(m.file_path(&graph).unwrap_or("").into()),
"ownerClass" => match m.owner_class(&graph) {
Some(oc) => Value::Str(oc.to_string()),
Some(oc) => Value::Str(oc.into()),
None => Value::Null,
},
"line" | "startLine" => Value::Int(m.start_line() as i64),
Expand Down Expand Up @@ -1610,7 +1610,7 @@ fn eval_expr(
}
if let Some(&idx) = b.node_vars.get(var) {
return Ok(match graph.mnode(idx) {
Some(m) => Value::Str(m.name(&graph).to_string()),
Some(m) => Value::Str(m.name(&graph).into()),
None => Value::Null,
});
}
Expand Down Expand Up @@ -2047,7 +2047,7 @@ fn lit_to_value(l: &Literal) -> Value {
Literal::Bool(b) => Value::Bool(*b),
Literal::Int(i) => Value::Int(*i),
Literal::Float(f) => Value::Float(*f),
Literal::Str(s) => Value::Str(s.clone()),
Literal::Str(s) => Value::Str(s.as_str().into()),
Literal::List(xs) => Value::List(xs.iter().map(lit_to_value).collect()),
}
}
Expand Down Expand Up @@ -2108,8 +2108,8 @@ fn prop_value(
reason,
} => match prop {
"confidence" => Value::Float(*confidence as f64),
"reason" => Value::Str(reason.clone()),
"rel_type" => Value::Str(format!("{rel_type:?}")),
"reason" => Value::Str(reason.as_str().into()),
"rel_type" => Value::Str(format!("{rel_type:?}").into()),
_ => Value::Null,
},
// Scalar: only bare var reference makes sense; <var>.<prop> returns Null.
Expand All @@ -2129,16 +2129,16 @@ fn prop_value(
if let Some(e) = graph.overlay_edge(edge_idx) {
return match prop {
"confidence" => Value::Float(e.confidence as f64),
"reason" => Value::Str("l1-overlay".to_string()),
"rel_type" => Value::Str(e.rel_type.as_str().to_string()),
"reason" => Value::Str("l1-overlay".into()),
"rel_type" => Value::Str(e.rel_type.as_str().into()),
_ => Value::Null,
};
}
let e = &graph.edges[edge_idx as usize];
return match prop {
"confidence" => Value::Float(e.confidence.to_native() as f64),
"reason" => Value::Str(e.reason.resolve(&graph.string_pool).to_string()),
"rel_type" => Value::Str(RelType::from(&e.rel_type).as_str().to_string()),
"reason" => Value::Str(e.reason.resolve(&graph.string_pool).into()),
"rel_type" => Value::Str(RelType::from(&e.rel_type).as_str().into()),
_ => Value::Null,
};
}
Expand All @@ -2159,19 +2159,19 @@ fn node_prop_value(node_idx: u32, prop: &str, graph: Gv<'_>, cache: &mut Content
};
let fm = m.fm_idx(node_idx);
match prop {
"name" => Value::Str(m.name(&graph).to_string()),
"name" => Value::Str(m.name(&graph).into()),
// u64 uid stored as i64 bits — no allocation per row.
"uid" => Value::Int(m.uid() as i64),
"ownerClass" => match m.owner_class(&graph) {
Some(oc) => Value::Str(oc.to_string()),
Some(oc) => Value::Str(oc.into()),
None => Value::Null,
},
"kind" => Value::Str(m.kind().as_str().to_string()),
"kind" => Value::Str(m.kind().as_str().into()),
// 1-based, matching impact/find/inspect output (see Node::start_line).
// `span.0` is the raw 0-based tree-sitter row; never expose it as `line`.
"line" | "startLine" => Value::Int(m.start_line() as i64),
"endLine" => Value::Int(m.end_line() as i64),
"filePath" => Value::Str(m.file_path(&graph).unwrap_or("").to_string()),
"filePath" => Value::Str(m.file_path(&graph).unwrap_or("").into()),
"content" => {
let slice = match &m {
MNode::Base(n) => {
Expand Down Expand Up @@ -2206,7 +2206,7 @@ fn node_prop_value(node_idx: u32, prop: &str, graph: Gv<'_>, cache: &mut Content
.unwrap_or_default()
}
};
Value::Str(slice)
Value::Str(slice.into())
}
// ── FunctionMeta flag properties ────────────────────────────────────
// FunctionMeta is sparse (only Function/Method/Constructor nodes).
Expand Down Expand Up @@ -2293,7 +2293,7 @@ fn archived_fm_decorators(graph: Gv<'_>, node_idx: u32) -> Value {
.map(|d| {
let s = d.resolve(&graph.string_pool);
let normalized = s.strip_prefix('@').unwrap_or(s);
Value::Str(normalized.to_string())
Value::Str(normalized.into())
})
.collect(),
Err(_) => vec![],
Expand Down Expand Up @@ -3778,7 +3778,7 @@ mod tests {
assert_eq!(result.columns, vec!["a.content"]);
assert_eq!(result.rows.len(), 1);
let content = match &result.rows[0][0] {
Value::Str(s) => s.clone(),
Value::Str(s) => s.to_string(),
other => panic!("expected Str, got {other:?}"),
};
// span (0,0,2,1) covers "function hello() {\n return 42;\n}"
Expand Down Expand Up @@ -3971,7 +3971,7 @@ mod tests {
.rows
.iter()
.map(|row| match &row[0] {
Value::Str(s) => s.clone(),
Value::Str(s) => s.to_string(),
other => panic!("expected Str, got {other:?}"),
})
.collect();
Expand All @@ -3995,7 +3995,7 @@ mod tests {
.rows
.iter()
.map(|row| match &row[0] {
Value::Str(s) => s.clone(),
Value::Str(s) => s.to_string(),
other => panic!("expected Str, got {other:?}"),
})
.collect();
Expand Down
15 changes: 11 additions & 4 deletions crates/ecp-core/src/cypher/value.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
use crate::graph::RelType;
use compact_str::CompactString;

#[derive(Debug, Clone, PartialEq)]
pub enum Value {
Null,
Bool(bool),
Int(i64),
Float(f64),
Str(String),
/// `CompactString` inlines values ≤24 bytes with no heap allocation. The
/// dominant result strings — symbol `kind` ("Function", "Class"…) and
/// `rel_type` ("Calls", "Implements"…) — always fit, so a query returning
/// N edges drops N heap allocs for those columns.
Str(CompactString),
List(Vec<Value>),
/// Reference to a graph node. CLI side resolves `.name`/`.kind`/`.filePath`
/// for human-readable serialization.
/// for human-readable serialization. `name`/`kind` inline as `CompactString`
/// (kind always; realistic symbol names usually); `file_path` stays `String`
/// since paths routinely exceed the inline budget.
NodeRef {
idx: u32,
name: String,
kind: String,
name: CompactString,
kind: CompactString,
file_path: String,
},
EdgeRef {
Expand Down
2 changes: 1 addition & 1 deletion crates/ecp-core/tests/cypher_where_label_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ fn names_returned(cypher_query: &str) -> Vec<String> {
.rows
.iter()
.map(|r| match &r[0] {
cypher::Value::Str(s) => s.clone(),
cypher::Value::Str(s) => s.to_string(),
v => panic!("expected Str, got {v:?}"),
})
.collect()
Expand Down
Loading