diff --git a/libs/openant-core/parsers/c/call_graph_builder.py b/libs/openant-core/parsers/c/call_graph_builder.py index 9de6fae..893ea2d 100644 --- a/libs/openant-core/parsers/c/call_graph_builder.py +++ b/libs/openant-core/parsers/c/call_graph_builder.py @@ -307,9 +307,18 @@ def _resolve_call(self, call_name: str, caller_file: str) -> Optional[str]: func_data = self.functions.get(func_id, {}) fname = func_data.get('name', '') base_name = fname.split('::')[-1] if '::' in fname else fname - # A static function in an included header has internal linkage - # and is not callable from the including file. - if base_name == call_name and self._is_visible_from(func_id, caller_file): + # A function in an INCLUDED header is callable from the including + # TU regardless of `static`: a `static inline` header helper (the + # C header-inline idiom) is copied into every includer. The + # cross-file static-linkage rejection in _is_visible_from is + # correct only for a .c TU, so it must not gate an included-header + # candidate. (Repo-wide and prototype fallbacks below stay strict.) + header_candidate = func_data.get('file_path', '').endswith( + ('.h', '.hpp', '.hxx', '.hh') + ) + if base_name == call_name and ( + header_candidate or self._is_visible_from(func_id, caller_file) + ): return func_id # 3. Unique name match across entire repo diff --git a/libs/openant-core/parsers/c/function_extractor.py b/libs/openant-core/parsers/c/function_extractor.py index cae5860..5fbe3e2 100644 --- a/libs/openant-core/parsers/c/function_extractor.py +++ b/libs/openant-core/parsers/c/function_extractor.py @@ -66,6 +66,10 @@ def __init__(self, repo_path: str): self.macros: Dict[str, List[Dict]] = {} self.macro_aliases: Dict[str, str] = {} # e.g. OPENSSL_malloc -> CRYPTO_malloc self.prototypes: Dict[str, Dict] = {} # function name -> declaration info + # class/struct name -> list of direct base-class names, for inheritance + # walks in member dispatch (bug [30]). Populated from the + # base_class_clause of each class_specifier/struct_specifier. + self.class_bases: Dict[str, List[str]] = {} self.c_parser = Parser(C_LANGUAGE) self.cpp_parser = Parser(CPP_LANGUAGE) @@ -148,11 +152,18 @@ def _extract_identifier_from_declarator(self, node, source: bytes) -> Optional[s if node.type == 'qualified_identifier': return self._node_text(node, source) - # template_function + # operator overload (operator+, operator==, operator[], ...) + if node.type == 'operator_name': + return self._node_text(node, source) + + # conversion operator (operator int, operator MyType) + if node.type == 'operator_cast': + return self._node_text(node, source) + + # template_function: g — keep the template arguments so an + # explicit specialization does not collide with the primary template. if node.type == 'template_function': - name_node = node.child_by_field_name('name') - if name_node: - return self._extract_identifier_from_declarator(name_node, source) + return self._node_text(node, source) # reference_declarator (C++ int& func()) if node.type == 'reference_declarator': @@ -170,7 +181,8 @@ def _extract_identifier_from_declarator(self, node, source: bytes) -> Optional[s if child.type in ('identifier', 'field_identifier', 'qualified_identifier', 'function_declarator', 'pointer_declarator', 'parenthesized_declarator', 'destructor_name', - 'template_function', 'reference_declarator'): + 'template_function', 'reference_declarator', + 'operator_name', 'operator_cast'): result = self._extract_identifier_from_declarator(child, source) if result: return result @@ -218,6 +230,62 @@ def _get_parameters(self, node, source: bytes) -> List[str]: params.append('...') return params + @staticmethod + def _signature_discriminator(parameters: List[str]) -> str: + """A deterministic, COLON-FREE signature string for the parameter list. + + Colon-free is mandatory: the func_id is split on ':' downstream + (`core/diff_filter.py`, `utilities/agentic_enhancer/repository_index.py` + rsplit on ':' to recover the file), so any colon a C++ type carries + (`std::string`, a ternary default arg) is mapped to '.' here. + """ + joined = ','.join(p.strip() for p in parameters) + joined = ' '.join(joined.split()) # collapse newlines/runs of whitespace + return joined.replace(':', '.') + + @staticmethod + def _same_signature(a: dict, b: dict) -> bool: + """Two definitions share a signature iff their parameter lists match. + + Identical params => a redefinition of the SAME function under a different + preprocessor branch (#ifdef/#else) or an ODR-duplicate. Different params + => a genuine C++ overload. + """ + return a.get('parameters', []) == b.get('parameters', []) + + def _store_function(self, func_id: str, func_data: dict) -> None: + """Store a function, disambiguating same-(file,name) collisions instead + of silently overwriting (Bug 1: #ifdef/#else stubs and C++ overloads + collapsed onto one node — see reachability-bugs.md / bug-3482.md). + + Collision-only: when func_id is free, store byte-identically (the + hardcoded `path:name` id literals across the suite depend on unique names + keeping the plain `path:name` id). On a collision: + - SAME signature -> keep ONE node, prefer the larger body. The #else + stub is shorter than the real implementation regardless of which + preprocessor arm tree-sitter emits first, so this is order- and + direction-independent. + - DIFFERENT signature -> a genuine overload; both are real. Mint a + distinct key by folding the colon-free signature into the func_id. + The `name` field is left bare (`area`, not `area(int,int)`) because + the call-graph builder resolves calls by `name` (functions_by_name), + so both overloads stay findable. (Contrast bug [39], which folds the + template-arg discriminator into the NAME — correct there because a + template call is written `g`; an overload call is written bare.) + """ + existing = self.functions.get(func_id) + if existing is None: + self.functions[func_id] = func_data + return + if self._same_signature(existing, func_data): + if len(func_data.get('code', '')) > len(existing.get('code', '')): + self.functions[func_id] = func_data + return + overload_id = f"{func_id}({self._signature_discriminator(func_data.get('parameters', []))})" + prior = self.functions.get(overload_id) + if prior is None or len(func_data.get('code', '')) > len(prior.get('code', '')): + self.functions[overload_id] = func_data + def _find_function_declarator(self, node): """Find the function_declarator within a declarator tree.""" if node.type == 'function_declarator': @@ -257,10 +325,15 @@ def _classify_function(self, name: str, file_path: str, is_static: bool, return 'main' if is_cpp and class_name: - if name == class_name: - return 'constructor' - if name.startswith('~'): + # Compare the UNqualified leaves: an out-of-line definition arrives + # as a qualified name (e.g. 'Foo::Foo', 'Foo::~Foo') whose whole + # string never equals the bare class_name. + unqualified = name.rsplit('::', 1)[-1] + class_leaf = class_name.rsplit('::', 1)[-1] + if unqualified.startswith('~'): return 'destructor' + if unqualified == class_leaf: + return 'constructor' return 'method' if '__attribute__((constructor))' in code: @@ -289,6 +362,27 @@ def _get_class_name_from_qualified(self, name: str) -> Optional[str]: return '::'.join(parts[:-1]) return None + def _extract_base_classes(self, record_node, source: bytes) -> List[str]: + """Return the direct base-class names of a class/struct specifier. + + tree-sitter represents inheritance as a `base_class_clause` child of the + class_specifier/struct_specifier (sibling of name and body). The clause + holds one `type_identifier` per base, interleaved with optional + access_specifier / `virtual` / `,` tokens; we collect only the simple + `type_identifier` bases. Qualified or templated bases (ns::Base, + Base) are NOT recorded — matching the same-name keying used for + class_name elsewhere, so the inheritance walk stays sound (an unknown + base simply yields no edge rather than a wrong one). + """ + bases: List[str] = [] + for child in record_node.children: + if child.type != 'base_class_clause': + continue + for sub in child.children: + if sub.type == 'type_identifier': + bases.append(self._node_text(sub, source)) + return bases + def _extract_includes(self, tree, source: bytes) -> List[str]: """Extract #include directives from a file.""" includes = [] @@ -355,23 +449,38 @@ def _extract_functions_from_tree(self, tree, source: bytes, file_path: str, """Extract all function definitions from a parsed tree.""" is_header = os.path.splitext(file_path)[1].lower() in ('.h', '.hpp', '.hxx', '.hh') - # Iterative traversal with explicit stack carrying (node, namespace_prefix) - stack = [(tree.root_node, '')] + # Iterative traversal with explicit stack carrying + # (node, namespace_prefix, class_context). namespace_prefix is the + # qualified prefix used to build the func_id; class_context is the + # enclosing class/struct/union name (or None) used for metadata so a + # namespace qualifier is never mistaken for a class qualifier. + stack = [(tree.root_node, '', None)] + + # struct/union member functions are C++ methods exactly like class + # members; only `class_specifier` was special-cased originally. + record_specifiers = ('class_specifier', 'struct_specifier', 'union_specifier') while stack: - node, namespace_prefix = stack.pop() + node, namespace_prefix, class_context = stack.pop() if node.type == 'function_definition': self._process_function_node(node, source, relative_path, - is_cpp, is_header, namespace_prefix) + is_cpp, is_header, namespace_prefix, + class_context) elif node.type == 'declaration' and not is_header: - # Skip standalone declarations in .c files (prototypes only) - pass + # Standalone declarations in .c/.cpp files are prototypes, EXCEPT + # a declaration whose initializer is a lambda — a named callable. + if is_cpp: + self._process_lambda_declaration(node, source, relative_path, + namespace_prefix) elif node.type == 'declaration' and is_header: # In headers, track prototypes for call resolution self._process_declaration_node(node, source, relative_path) + if is_cpp: + self._process_lambda_declaration(node, source, relative_path, + namespace_prefix) elif node.type == 'namespace_definition' and is_cpp: ns_name_node = node.child_by_field_name('name') @@ -380,42 +489,62 @@ def _extract_functions_from_tree(self, tree, source: bytes, file_path: str, body_node = node.child_by_field_name('body') if body_node: for child in reversed(body_node.children): - stack.append((child, new_prefix)) + # class_context unchanged: a namespace is NOT a class. + stack.append((child, new_prefix, class_context)) continue # Don't walk children again - elif node.type == 'class_specifier' and is_cpp: + elif node.type in record_specifiers and is_cpp: class_name_node = node.child_by_field_name('name') if class_name_node: class_name = self._node_text(class_name_node, source) new_prefix = f"{namespace_prefix}{class_name}::" + # Record direct base classes for the inheritance walk in + # member dispatch (bug [30]). Keyed by the bare class name + # (matching the class_name stored on each method). + bases = self._extract_base_classes(node, source) + if bases: + existing = self.class_bases.setdefault(class_name, []) + for base in bases: + if base not in existing: + existing.append(base) body_node = node.child_by_field_name('body') if body_node: for child in reversed(body_node.children): if child.type == 'function_definition': self._process_function_node( child, source, relative_path, - is_cpp, is_header, new_prefix + is_cpp, is_header, new_prefix, class_name ) elif child.type == 'access_specifier': pass else: - stack.append((child, new_prefix)) + stack.append((child, new_prefix, class_name)) continue else: for child in reversed(node.children): - stack.append((child, namespace_prefix)) + stack.append((child, namespace_prefix, class_context)) def _process_function_node(self, node, source: bytes, relative_path: str, is_cpp: bool, is_header: bool, - namespace_prefix: str = '') -> None: + namespace_prefix: str = '', + class_context: Optional[str] = None) -> None: """Process a single function_definition node.""" name = self._get_function_name(node, source) if not name: return full_name = namespace_prefix + name if namespace_prefix and '::' not in name else name - class_name = self._get_class_name_from_qualified(full_name) + # class_name comes from the lexical enclosing class/struct/union + # (class_context) OR from a qualifier written in the source name + # (out-of-line def: Foo::method). A namespace qualifier in + # namespace_prefix must NOT become a class_name. + if class_context is not None: + class_name = class_context + elif '::' in name: + class_name = self._get_class_name_from_qualified(name) + else: + class_name = None code = self._node_text(node, source) start_line = node.start_point[0] + 1 # tree-sitter is 0-indexed @@ -447,7 +576,7 @@ def _process_function_node(self, node, source: bytes, relative_path: str, 'class_name': class_name, } - self.functions[func_id] = func_data + self._store_function(func_id, func_data) self.stats['total_functions'] += 1 if is_static: @@ -457,6 +586,52 @@ def _process_function_node(self, node, source: bytes, relative_path: str, self.stats['by_type'][unit_type] = self.stats['by_type'].get(unit_type, 0) + 1 + def _process_lambda_declaration(self, node, source: bytes, relative_path: str, + namespace_prefix: str = '') -> None: + """Extract a named lambda from a declaration (auto f = [](){...};). + + A lambda is a `lambda_expression` initializer inside an + `init_declarator`; the declaration node is otherwise skipped by the + traversal, so the callable would never be recorded as a unit. + """ + # A declaration may declare several init_declarators. + stack = list(node.children) + while stack: + child = stack.pop() + if child.type != 'init_declarator': + stack.extend(child.children) + continue + + value_node = child.child_by_field_name('value') + if value_node is None or value_node.type != 'lambda_expression': + continue + + decl_node = child.child_by_field_name('declarator') + name = (self._extract_identifier_from_declarator(decl_node, source) + if decl_node is not None else None) + if not name: + continue + + full_name = (namespace_prefix + name + if namespace_prefix and '::' not in name else name) + func_id = f"{relative_path}:{full_name}" + self._store_function(func_id, { + 'name': full_name, + 'file_path': relative_path, + 'start_line': node.start_point[0] + 1, + 'end_line': node.end_point[0] + 1, + 'code': self._node_text(node, source), + 'parameters': self._get_parameters(value_node, source), + 'return_type': 'auto', + 'is_static': False, + 'is_exported': False, + 'is_inline': False, + 'unit_type': 'lambda', + 'class_name': None, + }) + self.stats['total_functions'] += 1 + self.stats['by_type']['lambda'] = self.stats['by_type'].get('lambda', 0) + 1 + def _process_declaration_node(self, node, source: bytes, relative_path: str) -> None: """Process a declaration node in a header to track prototypes.""" # Look for function declarations (prototypes) @@ -550,6 +725,7 @@ def export(self) -> Dict: 'macros': self.macros, 'macro_aliases': self.macro_aliases, 'prototypes': self.prototypes, + 'class_bases': self.class_bases, 'statistics': self.stats, } diff --git a/libs/openant-core/tests/parsers/c/test_c_collision_discriminator.py b/libs/openant-core/tests/parsers/c/test_c_collision_discriminator.py new file mode 100644 index 0000000..9459fb0 --- /dev/null +++ b/libs/openant-core/tests/parsers/c/test_c_collision_discriminator.py @@ -0,0 +1,108 @@ +"""Bug 1 regression: same-(file,name) C/C++ functions must not collapse to one +`func_id`. Two trigger families, two policies (see reachability-bugs.md Bug 1): + + * C++ overloads — `area(int)` and `area(int,int)` are BOTH real. They must + survive as two distinct func_ids, and each must keep its OWN call edges + (the conflation bug routed a call to overload-1 into overload-2's callee). + The discriminator is folded into the func_id KEY only; the `name` field + stays bare (`area`) so name-based call resolution still finds both. + * `#ifdef`/`#else` redefinition — same name, same signature, different body. + Only one is the compiled implementation; keep a SINGLE node and prefer the + larger body (the `#else` stub is shorter regardless of source order). + +Plus a regression guard: a uniquely-named function's id is byte-identical to the +pre-fix `path:name` form (the collision-only contract — non-colliding ids never +change, so the 299 hardcoded id literals across the suite are untouched). +""" +import sys +import tempfile +from pathlib import Path + +import pytest + +_CORE = Path(__file__).resolve().parents[3] +if str(_CORE) not in sys.path: + sys.path.insert(0, str(_CORE)) + +pytest.importorskip("tree_sitter_c") + +from parsers.c.function_extractor import FunctionExtractor # noqa: E402 +from parsers.c.call_graph_builder import CallGraphBuilder # noqa: E402 + + +def _extract(filename: str, source: str) -> dict: + repo = Path(tempfile.mkdtemp()).resolve() + fp = repo / filename + fp.write_text(source) + ex = FunctionExtractor(str(repo)) + ex.process_file(fp) + return ex.functions + + +def test_cpp_overloads_both_survive(): + fns = _extract("ovl.cpp", + "int area(int s){return s*s;}\n" + "int area(int w,int h){return w*h;}\n") + areas = [fid for fid, d in fns.items() if d["name"] == "area"] + assert len(areas) == 2, f"both overloads must survive; got {areas}" + assert len({fid for fid in areas}) == 2, "overload func_ids must be distinct" + + +def test_overload_ids_are_colon_free_in_name_part(): + # The discriminator must not introduce a colon into the name part of the id + # (downstream diff_filter / repository_index rsplit on ':'). + fns = _extract("ovl.cpp", + "int f(int a){return a;}\n" + "int f(const char* s,int n){return n;}\n") + for fid, d in fns.items(): + if d["name"] != "f": + continue + name_part = fid.split(":", 1)[1] # everything after the path colon + assert ":" not in name_part, f"colon leaked into discriminated id: {fid!r}" + + +def test_ifdef_else_keeps_larger_body_not_stub(): + # tree-sitter parses BOTH preprocessor arms; the #else stub comes last and, + # pre-fix, overwrote the real implementation. Prefer the larger body. + src = ( + "#ifdef FEATURE\n" + "int run(int n){int t=0;for(int i=0;i