diff --git a/libs/openant-core/parsers/c/call_graph_builder.py b/libs/openant-core/parsers/c/call_graph_builder.py index 9de6fae..9da5400 100644 --- a/libs/openant-core/parsers/c/call_graph_builder.py +++ b/libs/openant-core/parsers/c/call_graph_builder.py @@ -110,6 +110,11 @@ def __init__(self, extractor_output: Dict, options: Optional[Dict] = None): self.macros = extractor_output.get('macros', {}) self.macro_aliases = extractor_output.get('macro_aliases', {}) self.prototypes = extractor_output.get('prototypes', {}) + # class_name -> [direct base-class name, ...] for the inheritance walk in + # member dispatch (bug [30]). Defaults to {} when the extractor output + # predates base-class extraction, so resolution degrades to the [51] + # same-type behavior rather than erroring. + self.class_bases: Dict[str, List[str]] = extractor_output.get('class_bases', {}) self.repo_path = extractor_output.get('repository', '') self.max_depth = options.get('max_depth', 3) @@ -121,6 +126,10 @@ def __init__(self, extractor_output: Dict, options: Optional[Dict] = None): # Indexes for faster lookup self.functions_by_name: Dict[str, List[str]] = {} self.functions_by_file: Dict[str, List[str]] = {} + # class_name -> {base_method_name -> [func_id, ...]} for member dispatch. + # Scoped per (class, method) so a receiver-typed call resolves only to a + # method actually declared on that class, never a sibling/free function. + self.methods_by_class: Dict[str, Dict[str, List[str]]] = {} # Include map: file -> set of included header files self.include_map: Dict[str, Set[str]] = {} @@ -153,6 +162,13 @@ def _build_indexes(self) -> None: self.functions_by_file[file_path] = [] self.functions_by_file[file_path].append(func_id) + # Index methods by their declaring class for receiver-type dispatch. 
+ class_name = func_data.get('class_name') + if class_name and name: + method_base = name.split('::')[-1] if '::' in name else name + self.methods_by_class.setdefault(class_name, {}) \ + .setdefault(method_base, []).append(func_id) + # Build include map for file_path, inc_list in self.includes.items(): self.include_map[file_path] = set() @@ -191,15 +207,25 @@ def _extract_calls_from_code(self, code: str, caller_id: str) -> Set[str]: except Exception: return self._extract_calls_regex(code, caller_id) + # Receiver static types inferred from local declarations in this body, + # used to resolve member calls (w.compute() / w->compute()) to the + # method on the receiver's known type. + local_var_types = self._extract_local_var_types(tree.root_node, code_bytes) + stack = [tree.root_node] while stack: node = stack.pop() if node.type == 'call_expression': func_node = node.child_by_field_name('function') if func_node: - call_name = self._extract_call_name(func_node, code_bytes) + call_name, receiver = self._extract_call_name_and_receiver( + func_node, code_bytes + ) if call_name: - resolved = self._resolve_call(call_name, caller_file) + receiver_type = local_var_types.get(receiver) if receiver else None + resolved = self._resolve_call(call_name, caller_file, + receiver_type=receiver_type, + is_member=func_node.type == 'field_expression') if resolved: calls.add(resolved) # A function passed by name as an argument (e.g. @@ -232,6 +258,81 @@ def _extract_callback_args(self, call_node, source: bytes, caller_file: str) -> found.add(resolved) return found + def _extract_call_name_and_receiver(self, node, source: bytes): + """Return (call_name, receiver_identifier) for a call's function child. + + receiver_identifier is the bare identifier text of a member-call receiver + (the `w` in `w.compute()` / `w->compute()`) when it is a simple + identifier, else None. The call_name is identical to what + _extract_call_name returns, so non-member calls are unaffected. 
+ """ + if node.type == 'field_expression': + receiver = None + arg = node.child_by_field_name('argument') + if arg is not None and arg.type == 'identifier': + receiver = source[arg.start_byte:arg.end_byte].decode( + 'utf-8', errors='replace') + # _extract_call_name declines field_expression (no false free-function + # edges); the member name is recovered here from the `field` child and + # resolved ONLY through typed/same-file member dispatch in _resolve_call. + field = node.child_by_field_name('field') + name = None + if field is not None: + name = source[field.start_byte:field.end_byte].decode( + 'utf-8', errors='replace') + if not name.isidentifier(): + name = None + return name, receiver + return self._extract_call_name(node, source), None + + def _extract_local_var_types(self, root, source: bytes) -> Dict[str, str]: + """Map local variable name -> declared type name within a function body. + + Walks `declaration` nodes and records the (type_identifier, variable) + pairs for both plain declarations (`Widget w;`) and pointer declarations + (`Widget* w = ...;`). Only simple type_identifier types are recorded; + anything else (templates, qualified types, multiple declarators we can't + cleanly attribute) is skipped so callers fall back to base-name + resolution rather than risk a wrong-type edge. + """ + var_types: Dict[str, str] = {} + stack = [root] + while stack: + node = stack.pop() + if node.type == 'declaration': + type_node = node.child_by_field_name('type') + if type_node is not None and type_node.type == 'type_identifier': + type_name = source[type_node.start_byte:type_node.end_byte] \ + .decode('utf-8', errors='replace') + # A declaration can hold several declarators (Widget a, b;); + # attribute the type to every variable name we extract. 
+ for child in node.children: + var_name = self._declared_var_name(child, source) + if var_name: + var_types[var_name] = type_name + stack.extend(reversed(node.children)) + return var_types + + def _declared_var_name(self, node, source: bytes) -> Optional[str]: + """Extract the declared variable identifier from a declarator subtree. + + Handles the plain identifier (`w`), pointer_declarator (`* w`) and + init_declarator (`* w = ...` / `w = ...`) shapes. Returns None for nodes + that are not a variable declarator (e.g. the type node, `;`). + """ + if node.type == 'identifier': + return source[node.start_byte:node.end_byte].decode('utf-8', errors='replace') + if node.type in ('pointer_declarator', 'init_declarator', 'reference_declarator'): + inner = node.child_by_field_name('declarator') + if inner is not None: + return self._declared_var_name(inner, source) + # init_declarator with no declarator field: scan children. + for child in node.children: + name = self._declared_var_name(child, source) + if name: + return name + return None + def _extract_call_name(self, node, source: bytes) -> Optional[str]: """Extract the function name from a call_expression's function child.""" text = source[node.start_byte:node.end_byte].decode('utf-8', errors='replace') @@ -274,8 +375,101 @@ def _is_visible_from(self, func_id: str, caller_file: str) -> bool: return True return not func_data.get('is_static', False) - def _resolve_call(self, call_name: str, caller_file: str) -> Optional[str]: - """Resolve a function call name to a function ID.""" + def _resolve_same_file(self, call_name: str, caller_file: str) -> Optional[str]: + """Resolve a call to a user-defined function in the same file, if any.""" + same_file_funcs = self.functions_by_file.get(caller_file, []) + for func_id in same_file_funcs: + func_data = self.functions.get(func_id, {}) + fname = func_data.get('name', '') + base_name = fname.split('::')[-1] if '::' in fname else fname + if base_name == call_name: + return func_id 
+ return None + + def _resolve_method_on_class(self, class_name: str, call_name: str, + caller_file: str) -> Optional[str]: + """Resolve call_name to a method DIRECTLY declared on class_name (same file). + + Returns the func_id of a method named call_name declared on class_name and + defined in caller_file, else None. No inheritance — this is the single-hop + lookup the walk in _resolve_member_call composes over the base chain. + """ + by_method = self.methods_by_class.get(class_name) + if not by_method: + return None + for func_id in by_method.get(call_name, []): + func_data = self.functions.get(func_id, {}) + if func_data.get('file_path', '') == caller_file: + return func_id + return None + + def _resolve_member_call(self, call_name: str, caller_file: str, + receiver_type: str) -> Optional[str]: + """Resolve a member call to the method on the receiver's STATIC type, + walking UP the base-class chain to the first ancestor that defines it. + + Sound static-type floor (bug [30]): start at the receiver's declared type + and return its own method if it defines call_name; otherwise walk up its + direct base classes (BFS, cycle-guarded) and resolve to the FIRST ancestor + that declares call_name in the same file. The walk STOPS at the first + definer, so a derived override resolves to itself, never an ancestor. + + Deliberately does NOT link derived overrides of an ancestor's virtual + method (a documented non-goal that would create false edges): a call via a + Base* receiver resolves to Base's method only — the static-type floor. + + Same-file only: if no class on the chain defines call_name in this + translation unit, returns None so the caller falls back to base-name + resolution (never a wrong-type / unrelated-free-function edge). + """ + visited: Set[str] = set() + queue: List[str] = [receiver_type] + while queue: + cls = queue.pop(0) + if cls in visited: + continue + visited.add(cls) + # First definer on the chain wins (own type before ancestors). 
+ match = self._resolve_method_on_class(cls, call_name, caller_file) + if match: + return match + for base in self.class_bases.get(cls, []): + if base not in visited: + queue.append(base) + return None + + def _resolve_call(self, call_name: str, caller_file: str, + receiver_type: Optional[str] = None, + is_member: bool = False) -> Optional[str]: + """Resolve a function call name to a function ID. + + When receiver_type is given (a member call like w.compute() whose receiver + w has a known same-file type), resolve to that type's method FIRST. If + that fails, fall back to base-name resolution (same-file only if is_member). + """ + if receiver_type: + member_match = self._resolve_member_call(call_name, caller_file, + receiver_type) + if member_match: + return member_match + + # A user-defined function in the SAME FILE shadows any stdlib/builtin + # of the same name, so it must be checked BEFORE the stdlib filter. + # Scope is deliberately same-file only: a genuine stdlib call (no + # same-file definition) still falls through to _is_stdlib below, so we + # never wrongly link a real stdlib call (e.g. printf/open) to an + # unrelated same-named user function in another file. + same_file_user_func = self._resolve_same_file(call_name, caller_file) + if same_file_user_func: + return same_file_user_func + + # A member call (obj->m() / obj.m()) whose receiver type is unknown or + # whose chain defines no such method resolves same-file only: declining + # here keeps the field-expression precision guarantee (never an edge to + # an unrelated cross-file free function of the same name). + if is_member: + return None + if self._is_stdlib(call_name): return None @@ -288,13 +482,9 @@ def _resolve_call(self, call_name: str, caller_file: str) -> Optional[str]: return result # 1.
Same-file functions - same_file_funcs = self.functions_by_file.get(caller_file, []) - for func_id in same_file_funcs: - func_data = self.functions.get(func_id, {}) - fname = func_data.get('name', '') - base_name = fname.split('::')[-1] if '::' in fname else fname - if base_name == call_name: - return func_id + same_file_match = self._resolve_same_file(call_name, caller_file) + if same_file_match: + return same_file_match # 2. Functions in included headers included_files = self.include_map.get(caller_file, set()) @@ -392,10 +582,13 @@ def _extract_calls_regex(self, code: str, caller_id: str) -> Set[str]: if func_name in ('if', 'while', 'for', 'switch', 'return', 'sizeof', 'typeof', 'alignof', 'offsetof', 'case', 'else'): continue - if not self._is_stdlib(func_name): - resolved = self._resolve_call(func_name, caller_file) - if resolved: - calls.add(resolved) + # No _is_stdlib gate here: _resolve_call applies the same-file-first + # rule and the stdlib filter internally, so a user function whose + # name collides with a builtin still resolves (same leak as the + # tree-sitter path otherwise). + resolved = self._resolve_call(func_name, caller_file) + if resolved: + calls.add(resolved) return calls diff --git a/libs/openant-core/parsers/c/function_extractor.py b/libs/openant-core/parsers/c/function_extractor.py index cae5860..973e61e 100644 --- a/libs/openant-core/parsers/c/function_extractor.py +++ b/libs/openant-core/parsers/c/function_extractor.py @@ -66,6 +66,10 @@ def __init__(self, repo_path: str): self.macros: Dict[str, List[Dict]] = {} self.macro_aliases: Dict[str, str] = {} # e.g. OPENSSL_malloc -> CRYPTO_malloc self.prototypes: Dict[str, Dict] = {} # function name -> declaration info + # class/struct name -> list of direct base-class names, for inheritance + # walks in member dispatch (bug [30]). Populated from the + # base_class_clause of each class_specifier/struct_specifier. 
+ self.class_bases: Dict[str, List[str]] = {} self.c_parser = Parser(C_LANGUAGE) self.cpp_parser = Parser(CPP_LANGUAGE) @@ -148,11 +152,18 @@ def _extract_identifier_from_declarator(self, node, source: bytes) -> Optional[s if node.type == 'qualified_identifier': return self._node_text(node, source) - # template_function + # operator overload (operator+, operator==, operator[], ...) + if node.type == 'operator_name': + return self._node_text(node, source) + + # conversion operator (operator int, operator MyType) + if node.type == 'operator_cast': + return self._node_text(node, source) + + # template_function: g<int> — keep the template arguments so an + # explicit specialization does not collide with the primary template. if node.type == 'template_function': - name_node = node.child_by_field_name('name') - if name_node: - return self._extract_identifier_from_declarator(name_node, source) + return self._node_text(node, source) # reference_declarator (C++ int& func()) if node.type == 'reference_declarator': @@ -170,7 +181,8 @@ def _extract_identifier_from_declarator(self, node, source: bytes) -> Optional[s if child.type in ('identifier', 'field_identifier', 'qualified_identifier', 'function_declarator', 'pointer_declarator', 'parenthesized_declarator', 'destructor_name', - 'template_function', 'reference_declarator'): + 'template_function', 'reference_declarator', + 'operator_name', 'operator_cast'): result = self._extract_identifier_from_declarator(child, source) if result: return result @@ -257,10 +269,15 @@ def _classify_function(self, name: str, file_path: str, is_static: bool, return 'main' if is_cpp and class_name: - if name == class_name: - return 'constructor' - if name.startswith('~'): + # Compare the UNqualified leaves: an out-of-line definition arrives + # as a qualified name (e.g. 'Foo::Foo', 'Foo::~Foo') whose whole + # string never equals the bare class_name.
+ unqualified = name.rsplit('::', 1)[-1] + class_leaf = class_name.rsplit('::', 1)[-1] + if unqualified.startswith('~'): return 'destructor' + if unqualified == class_leaf: + return 'constructor' return 'method' if '__attribute__((constructor))' in code: @@ -289,6 +306,27 @@ def _get_class_name_from_qualified(self, name: str) -> Optional[str]: return '::'.join(parts[:-1]) return None + def _extract_base_classes(self, record_node, source: bytes) -> List[str]: + """Return the direct base-class names of a class/struct specifier. + + tree-sitter represents inheritance as a `base_class_clause` child of the + class_specifier/struct_specifier (sibling of name and body). The clause + holds one `type_identifier` per base, interleaved with optional + access_specifier / `virtual` / `,` tokens; we collect only the simple + `type_identifier` bases. Qualified or templated bases (ns::Base, + Base<T>) are NOT recorded — matching the same-name keying used for + class_name elsewhere, so the inheritance walk stays sound (an unknown + base simply yields no edge rather than a wrong one). + """ + bases: List[str] = [] + for child in record_node.children: + if child.type != 'base_class_clause': + continue + for sub in child.children: + if sub.type == 'type_identifier': + bases.append(self._node_text(sub, source)) + return bases + def _extract_includes(self, tree, source: bytes) -> List[str]: """Extract #include directives from a file.""" includes = [] @@ -355,23 +393,38 @@ def _extract_functions_from_tree(self, tree, source: bytes, file_path: str, """Extract all function definitions from a parsed tree.""" is_header = os.path.splitext(file_path)[1].lower() in ('.h', '.hpp', '.hxx', '.hh') - # Iterative traversal with explicit stack carrying (node, namespace_prefix) - stack = [(tree.root_node, '')] + # Iterative traversal with explicit stack carrying + # (node, namespace_prefix, class_context).
namespace_prefix is the + # qualified prefix used to build the func_id; class_context is the + # enclosing class/struct/union name (or None) used for metadata so a + # namespace qualifier is never mistaken for a class qualifier. + stack = [(tree.root_node, '', None)] + + # struct/union member functions are C++ methods exactly like class + # members; only `class_specifier` was special-cased originally. + record_specifiers = ('class_specifier', 'struct_specifier', 'union_specifier') while stack: - node, namespace_prefix = stack.pop() + node, namespace_prefix, class_context = stack.pop() if node.type == 'function_definition': self._process_function_node(node, source, relative_path, - is_cpp, is_header, namespace_prefix) + is_cpp, is_header, namespace_prefix, + class_context) elif node.type == 'declaration' and not is_header: - # Skip standalone declarations in .c files (prototypes only) - pass + # Standalone declarations in .c/.cpp files are prototypes, EXCEPT + # a declaration whose initializer is a lambda — a named callable. + if is_cpp: + self._process_lambda_declaration(node, source, relative_path, + namespace_prefix) elif node.type == 'declaration' and is_header: # In headers, track prototypes for call resolution self._process_declaration_node(node, source, relative_path) + if is_cpp: + self._process_lambda_declaration(node, source, relative_path, + namespace_prefix) elif node.type == 'namespace_definition' and is_cpp: ns_name_node = node.child_by_field_name('name') @@ -380,42 +433,62 @@ def _extract_functions_from_tree(self, tree, source: bytes, file_path: str, body_node = node.child_by_field_name('body') if body_node: for child in reversed(body_node.children): - stack.append((child, new_prefix)) + # class_context unchanged: a namespace is NOT a class. 
+ stack.append((child, new_prefix, class_context)) continue # Don't walk children again - elif node.type == 'class_specifier' and is_cpp: + elif node.type in record_specifiers and is_cpp: class_name_node = node.child_by_field_name('name') if class_name_node: class_name = self._node_text(class_name_node, source) new_prefix = f"{namespace_prefix}{class_name}::" + # Record direct base classes for the inheritance walk in + # member dispatch (bug [30]). Keyed by the bare class name + # (matching the class_name stored on each method). + bases = self._extract_base_classes(node, source) + if bases: + existing = self.class_bases.setdefault(class_name, []) + for base in bases: + if base not in existing: + existing.append(base) body_node = node.child_by_field_name('body') if body_node: for child in reversed(body_node.children): if child.type == 'function_definition': self._process_function_node( child, source, relative_path, - is_cpp, is_header, new_prefix + is_cpp, is_header, new_prefix, class_name ) elif child.type == 'access_specifier': pass else: - stack.append((child, new_prefix)) + stack.append((child, new_prefix, class_name)) continue else: for child in reversed(node.children): - stack.append((child, namespace_prefix)) + stack.append((child, namespace_prefix, class_context)) def _process_function_node(self, node, source: bytes, relative_path: str, is_cpp: bool, is_header: bool, - namespace_prefix: str = '') -> None: + namespace_prefix: str = '', + class_context: Optional[str] = None) -> None: """Process a single function_definition node.""" name = self._get_function_name(node, source) if not name: return full_name = namespace_prefix + name if namespace_prefix and '::' not in name else name - class_name = self._get_class_name_from_qualified(full_name) + # class_name comes from the lexical enclosing class/struct/union + # (class_context) OR from a qualifier written in the source name + # (out-of-line def: Foo::method). 
A namespace qualifier in + # namespace_prefix must NOT become a class_name. + if class_context is not None: + class_name = class_context + elif '::' in name: + class_name = self._get_class_name_from_qualified(name) + else: + class_name = None code = self._node_text(node, source) start_line = node.start_point[0] + 1 # tree-sitter is 0-indexed @@ -457,6 +530,52 @@ def _process_function_node(self, node, source: bytes, relative_path: str, self.stats['by_type'][unit_type] = self.stats['by_type'].get(unit_type, 0) + 1 + def _process_lambda_declaration(self, node, source: bytes, relative_path: str, + namespace_prefix: str = '') -> None: + """Extract a named lambda from a declaration (auto f = [](){...};). + + A lambda is a `lambda_expression` initializer inside an + `init_declarator`; the declaration node is otherwise skipped by the + traversal, so the callable would never be recorded as a unit. + """ + # A declaration may declare several init_declarators. + stack = list(node.children) + while stack: + child = stack.pop() + if child.type != 'init_declarator': + stack.extend(child.children) + continue + + value_node = child.child_by_field_name('value') + if value_node is None or value_node.type != 'lambda_expression': + continue + + decl_node = child.child_by_field_name('declarator') + name = (self._extract_identifier_from_declarator(decl_node, source) + if decl_node is not None else None) + if not name: + continue + + full_name = (namespace_prefix + name + if namespace_prefix and '::' not in name else name) + func_id = f"{relative_path}:{full_name}" + self.functions[func_id] = { + 'name': full_name, + 'file_path': relative_path, + 'start_line': node.start_point[0] + 1, + 'end_line': node.end_point[0] + 1, + 'code': self._node_text(node, source), + 'parameters': self._get_parameters(value_node, source), + 'return_type': 'auto', + 'is_static': False, + 'is_exported': False, + 'is_inline': False, + 'unit_type': 'lambda', + 'class_name': None, + } + 
self.stats['total_functions'] += 1 + self.stats['by_type']['lambda'] = self.stats['by_type'].get('lambda', 0) + 1 + def _process_declaration_node(self, node, source: bytes, relative_path: str) -> None: """Process a declaration node in a header to track prototypes.""" # Look for function declarations (prototypes) @@ -550,6 +669,7 @@ def export(self) -> Dict: 'macros': self.macros, 'macro_aliases': self.macro_aliases, 'prototypes': self.prototypes, + 'class_bases': self.class_bases, 'statistics': self.stats, } diff --git a/libs/openant-core/parsers/c/unit_generator.py b/libs/openant-core/parsers/c/unit_generator.py index fcca506..455bd16 100644 --- a/libs/openant-core/parsers/c/unit_generator.py +++ b/libs/openant-core/parsers/c/unit_generator.py @@ -237,6 +237,7 @@ def create_unit(self, func_id: str, func_data: Dict) -> Dict: 'metadata': { 'is_static': func_data.get('is_static', False), 'is_exported': func_data.get('is_exported', True), + 'is_inline': func_data.get('is_inline', False), 'return_type': func_data.get('return_type', ''), 'parameters': func_data.get('parameters', []), 'generator': 'c_unit_generator.py', @@ -306,6 +307,7 @@ def generate_analyzer_output(self) -> Dict: 'endLine': func_data.get('end_line', 0), 'isStatic': func_data.get('is_static', False), 'isExported': func_data.get('is_exported', True), + 'isInline': func_data.get('is_inline', False), 'returnType': func_data.get('return_type', ''), 'parameters': func_data.get('parameters', []), 'className': func_data.get('class_name'), diff --git a/libs/openant-core/tests/parsers/c/test_c_schema_completeness.py b/libs/openant-core/tests/parsers/c/test_c_schema_completeness.py new file mode 100644 index 0000000..bfd2190 --- /dev/null +++ b/libs/openant-core/tests/parsers/c/test_c_schema_completeness.py @@ -0,0 +1,151 @@ +"""Schema-completeness contract test for the C parser (BUG 29 family guard). 
+ +BUG 29 was a *field drift*: the function extractor produced `is_inline` in its +per-function `func_data`, but the unit generator's create_unit() silently +dropped it when assembling the unit's metadata. The root family is +"producer/consumer field-contract not schema-enforced" — a textual, per-field +review misses the next drop. This test makes the contract explicit and +machine-checked. + +Design: + * FIELD_CONTRACT maps each extractor-produced metadata key that the unit + MUST expose -> the dotted location where create_unit() should place it. + * A small reusable `get_path(obj, dotted)` walks that location. + * One parametrized test drives the REAL extractor + REAL UnitGenerator on a + function exercising every flag (static, inline) and asserts each contracted + field is present at its location and round-trips the producer's value. + +To add a future field to the contract: extend FIELD_CONTRACT. If create_unit() +forgets to carry it, this test fails — no per-field hand audit needed. +""" + +import sys +from pathlib import Path + +import pytest + +_CORE_ROOT = Path(__file__).resolve().parents[3] +sys.path.insert(0, str(_CORE_ROOT)) + +from parsers.c.function_extractor import FunctionExtractor +from parsers.c.unit_generator import UnitGenerator + + +# Source exercising the contracted flags: `static inline` => is_static=True, +# is_inline=True, is_exported=False, plus return_type/parameters populated. +CONTRACT_SRC = "static inline int add(int a, int b) {\n return a + b;\n}\n" +TARGET_ID = "m.c:add" + +# extractor func_data key -> dotted location in the assembled unit. +# This is the field-contract the consumer (create_unit) must honor. 
+FIELD_CONTRACT = { + "is_static": "metadata.is_static", + "is_exported": "metadata.is_exported", + "is_inline": "metadata.is_inline", + "return_type": "metadata.return_type", + "parameters": "metadata.parameters", + "unit_type": "unit_type", + "name": "code.primary_origin.function_name", + "file_path": "code.primary_origin.file_path", + "class_name": "code.primary_origin.class_name", + "start_line": "code.primary_origin.start_line", + "end_line": "code.primary_origin.end_line", +} + +_MISSING = object() + + +def get_path(obj, dotted): + """Walk a dotted path through nested dicts; return _MISSING if any hop absent.""" + cur = obj + for part in dotted.split("."): + if not isinstance(cur, dict) or part not in cur: + return _MISSING + cur = cur[part] + return cur + + +@pytest.fixture(scope="module") +def extracted_and_unit(tmp_path_factory): + tmp_path = tmp_path_factory.mktemp("c_schema") + (tmp_path / "m.c").write_text(CONTRACT_SRC) + + extractor = FunctionExtractor(str(tmp_path)) + functions = extractor.extract_all(files=["m.c"])["functions"] + assert TARGET_ID in functions, f"target not extracted; got {list(functions)}" + + gen = UnitGenerator({ + "repository": str(tmp_path), + "functions": functions, + "call_graph": {}, + "reverse_call_graph": {}, + }) + unit = gen.create_unit(TARGET_ID, functions[TARGET_ID]) + return functions[TARGET_ID], unit + + +# Parallel consumer: generate_analyzer_output() re-emits a camelCase schema. +# It is the SECOND place BUG-29's field could drift; guard it too. +# extractor snake_case key -> analyzer_output camelCase key. 
+ANALYZER_CONTRACT = { + "name": "name", + "unit_type": "unitType", + "code": "code", + "file_path": "filePath", + "start_line": "startLine", + "end_line": "endLine", + "is_static": "isStatic", + "is_exported": "isExported", + "is_inline": "isInline", + "return_type": "returnType", + "parameters": "parameters", + "class_name": "className", +} + + +@pytest.mark.parametrize("producer_key,camel_key", sorted(ANALYZER_CONTRACT.items())) +def test_analyzer_output_contract_carried(extracted_and_unit, producer_key, camel_key): + """generate_analyzer_output() must re-emit every contracted field (camelCase).""" + func_data, _unit = extracted_and_unit + gen = UnitGenerator({ + "repository": "", + "functions": {TARGET_ID: func_data}, + "call_graph": {}, + "reverse_call_graph": {}, + }) + out = gen.generate_analyzer_output()["functions"][TARGET_ID] + assert camel_key in out, ( + f"field '{producer_key}' dropped from analyzer_output: " + f"expected camelCase key '{camel_key}'; keys = {sorted(out)}" + ) + assert out[camel_key] == func_data[producer_key] + + +@pytest.mark.parametrize("producer_key,unit_location", sorted(FIELD_CONTRACT.items())) +def test_field_contract_carried(extracted_and_unit, producer_key, unit_location): + """Each extractor-produced field must appear at its contracted unit location.""" + func_data, unit = extracted_and_unit + + assert producer_key in func_data, ( + f"contract assumes extractor produces '{producer_key}', but it did not " + f"(producer keys = {sorted(func_data)})" + ) + + value = get_path(unit, unit_location) + assert value is not _MISSING, ( + f"field '{producer_key}' dropped at unit assembly: " + f"expected at unit location '{unit_location}'" + ) + assert value == func_data[producer_key], ( + f"field '{producer_key}' did not round-trip: " + f"producer={func_data[producer_key]!r} unit={value!r}" + ) + + +def test_drift_prone_keys_stay_in_contract(): + """Self-check: the BUG-29 drift-prone key must remain in both contract maps + so this suite 
cannot silently go toothless if someone deletes the entry + (parity with the PHP schema test). If `is_inline` is dropped from a map, the + field-drift guard above evaporates -- fail loudly instead.""" + assert "is_inline" in FIELD_CONTRACT, "is_inline dropped from FIELD_CONTRACT -> guard disabled" + assert ANALYZER_CONTRACT.get("is_inline") == "isInline", "is_inline->isInline dropped from ANALYZER_CONTRACT -> guard disabled" diff --git a/libs/openant-core/tests/parsers/c/test_call_graph_builder_dispatch.py b/libs/openant-core/tests/parsers/c/test_call_graph_builder_dispatch.py new file mode 100644 index 0000000..1b6f645 --- /dev/null +++ b/libs/openant-core/tests/parsers/c/test_call_graph_builder_dispatch.py @@ -0,0 +1,396 @@ +"""Regression tests for the C/C++ call graph builder — member-dispatch (bug [51]). + +Bug report (C member-dispatch): + `Widget w; w.compute();` (and `Widget* w = ...; w->compute();`) should resolve + to `Widget::compute` (the method on w's known type), but the current resolver + discards the receiver and resolves `compute` base-name-first — linking a free + function `compute` (or, with two classes, the first-defined sibling method). + +Root cause: + parsers/c/call_graph_builder.py `_extract_call_name` returns ONLY the field + name for a `field_expression`, dropping the receiver. `_resolve_call` then has + no type context and resolves the bare base name (first/unique same-file match). + +Fix scope: SAME-FILE only. The receiver's static type is inferred from a local +variable declaration in the caller's body; if the type is unknown or has no such +method in the same translation unit, resolution FALLS BACK to the existing +base-name behavior (no false edge). + +Each test builds a real FunctionExtractor output from temp C++ source and runs the +real CallGraphBuilder pipeline, then asserts on the resulting edges. The RED cases +are constructed so the base-name-first resolver picks the WRONG target pre-fix. 
+""" + +import sys +import tempfile +from pathlib import Path + +_CORE_ROOT = Path(__file__).resolve().parents[3] +sys.path.insert(0, str(_CORE_ROOT)) + +from parsers.c.call_graph_builder import CallGraphBuilder +from parsers.c.function_extractor import FunctionExtractor + + +def _extract(source: str, filename: str = "main.cpp") -> dict: + """Write source to a temp repo, run the real extractor, return its output.""" + tmpdir = tempfile.mkdtemp() + src_path = Path(tmpdir) / filename + src_path.write_text(source, encoding="utf-8") + extractor = FunctionExtractor(tmpdir) + return extractor.extract_all([filename]) + + +def _build(source: str, filename: str = "main.cpp") -> CallGraphBuilder: + builder = CallGraphBuilder(_extract(source, filename)) + builder.build_call_graph() + return builder + + +# --------------------------------------------------------------------------- # +# RED1: value-receiver member dispatch (Widget w; w.compute();) +# A free `compute` is present, so the base-name-first resolver mislinks to it +# pre-fix. Post-fix it must resolve to Widget::compute (the method on w's type). 
# --------------------------------------------------------------------------- #

_VALUE_SRC = """
void compute() { }
class Widget {
public:
    void compute() { }
    void run() {
        Widget w;
        w.compute();
    }
};
"""


def test_value_receiver_member_call_resolves_to_method():
    """A by-value receiver call must link to the method on the receiver's type.

    The fixture declares a free `compute`, a `Widget::compute` method and a
    `Widget::run` method whose body does `Widget w; w.compute();`. The edge
    list of `Widget::run` must contain the method and must not contain the
    same-named free function.
    """
    graph = _build(_VALUE_SRC)
    caller_id = "main.cpp:Widget::run"
    method_id = "main.cpp:Widget::compute"
    free_id = "main.cpp:compute"
    callees = graph.call_graph.get(caller_id, [])

    # Positive half: the receiver-typed edge has to be present.
    assert method_id in callees, (
        "Member call w.compute() did not resolve to the method on w's type.\n"
        f" Expected edge to: {method_id}\n"
        f" Got: {callees}\n"
        f" Functions: {list(graph.functions)}"
    )
    # Negative half: the free function of the same base name stays unlinked.
    assert free_id not in callees, (
        "Member call w.compute() wrongly linked to the free function compute.\n"
        f" Got: {callees}"
    )


# --------------------------------------------------------------------------- #
# RED1b: pointer-receiver member dispatch (Widget* w = ...; w->compute();)
# --------------------------------------------------------------------------- #

_PTR_SRC = """
void compute() { }
class Widget {
public:
    void compute() { }
};
Widget* acquire();
void run() {
    Widget* w = acquire();
    w->compute();
}
"""


def test_pointer_receiver_member_call_resolves_to_method():
    """A pointer receiver call (`w->compute()`) must link to Widget::compute.

    Same shape as the by-value case, but the caller is the free function
    `run` and the receiver is declared as `Widget* w = acquire();`.
    """
    graph = _build(_PTR_SRC)
    caller_id = "main.cpp:run"
    method_id = "main.cpp:Widget::compute"
    free_id = "main.cpp:compute"
    callees = graph.call_graph.get(caller_id, [])

    assert method_id in callees, (
        "Member call w->compute() did not resolve to the method on w's type.\n"
        f" Expected edge to: {method_id}\n"
        f" Got: {callees}\n"
        f" Functions: {list(graph.functions)}"
    )
    assert free_id not in callees, (
        "Member call w->compute() wrongly linked to the free function compute.\n"
        f" Got: {callees}"
    )


# --------------------------------------------------------------------------- #
# PRECISION NEG1: unknown receiver type must fall back to base-name resolution
# (and never invent a type-dispatch edge).
# `x` is not declared in run(); the
# only same-file `compute` is a free function — fallback links it, as today.
# --------------------------------------------------------------------------- #

_UNKNOWN_RECV_SRC = """
void compute() { }
void run() {
    x.compute();
}
"""


def test_unknown_receiver_falls_back_to_base_name():
    """A member call on an undeclared receiver resolves by base name only.

    `run` calls `x.compute()` without declaring `x`; the caller's edge list
    must be exactly the single free function `compute`.
    """
    graph = _build(_UNKNOWN_RECV_SRC)
    caller_id = "main.cpp:run"
    free_id = "main.cpp:compute"
    observed = graph.call_graph.get(caller_id, [])
    assert observed == [free_id], (
        "Unknown-receiver member call did not resolve as the base-name fallback.\n"
        f" Expected: [{free_id}]\n"
        f" Got: {graph.call_graph.get(caller_id)}"
    )


# --------------------------------------------------------------------------- #
# PRECISION NEG2: plain free-function call (no receiver) unchanged
# --------------------------------------------------------------------------- #

_FREE_SRC = """
void helper() { }
void run() {
    helper();
}
"""


def test_plain_free_function_call_unchanged():
    """A receiver-less call `helper()` yields exactly one edge, to `helper`."""
    graph = _build(_FREE_SRC)
    caller_id = "main.cpp:run"
    callee_id = "main.cpp:helper"
    observed = graph.call_graph.get(caller_id, [])
    assert observed == [callee_id], (
        "Plain free-function call regressed.\n"
        f" Expected: [{callee_id}]\n"
        f" Got: {graph.call_graph.get(caller_id)}"
    )


# --------------------------------------------------------------------------- #
# PRECISION NEG3: two classes each with compute; B defined FIRST so the
# base-name-first resolver would pick B::compute pre-fix. `A a; a.compute()`
# must resolve to A::compute, NOT the sibling B::compute.
# --------------------------------------------------------------------------- #

_TWO_CLASS_SRC = """
class B {
public:
    void compute() { }
};
class A {
public:
    void compute() { }
};
void run() {
    A a;
    a.compute();
}
"""


def test_member_call_resolves_to_correct_class_not_sibling():
    """`A a; a.compute();` must link to A::compute and not to B::compute.

    Both classes declare `compute`, with B appearing first in the fixture.
    """
    graph = _build(_TWO_CLASS_SRC)
    caller_id = "main.cpp:run"
    wanted_id = "main.cpp:A::compute"
    sibling_id = "main.cpp:B::compute"
    callees = graph.call_graph.get(caller_id, [])

    assert wanted_id in callees, (
        f"a.compute() did not resolve to A::compute.\n Got: {callees}"
    )
    assert sibling_id not in callees, (
        "a.compute() wrongly linked to the sibling (first-defined) B::compute.\n"
        f" Got: {callees}"
    )


# =========================================================================== #
# Bug [30]: virtual / inherited member dispatch — inheritance walk.
#
# `Base* b = ...; b->compute();` where compute is defined on Base (or an
# ancestor) and the receiver's STATICALLY-DECLARED type does NOT define it
# directly. Resolution must walk UP the base-class chain to the first
# ancestor that defines compute. This is the SOUND FLOOR: it resolves to the
# static type's method (or its nearest ancestor's), and deliberately does NOT
# link every derived override (a documented non-goal that creates false
# edges). Same-file only; no ancestor defining the method => no edge.
# =========================================================================== #


# --------------------------------------------------------------------------- #
# RED1 (inherited, non-virtual): Derived doesn't define compute -> walk up to
# Base::compute. A free `compute` is present so the pre-fix base-name-first
# resolver mislinks; pre-[30] the receiver-type path also fails (Derived has
# no compute) and falls back to the free function.
# --------------------------------------------------------------------------- #

_INHERITED_SRC = """
void compute() { }
struct Base {
    void compute() { }
};
struct Derived : Base {
    void run() {
        Derived* d = nullptr;
        d->compute();
    }
};
"""


def test_inherited_member_call_walks_up_to_base():
    """A call through `Derived*` must link to the inherited Base::compute.

    `Derived` declares no `compute` of its own; the fixture also has a free
    `compute`, which must not appear among the caller's edges.
    """
    graph = _build(_INHERITED_SRC)
    caller_id = "main.cpp:Derived::run"
    ancestor_id = "main.cpp:Base::compute"
    free_id = "main.cpp:compute"
    callees = graph.call_graph.get(caller_id, [])

    assert ancestor_id in callees, (
        "d->compute() did not walk up to the ancestor that defines compute.\n"
        f" Expected edge to: {ancestor_id}\n"
        f" Got: {callees}\n"
        f" Functions: {list(graph.functions)}"
    )
    assert free_id not in callees, (
        "d->compute() wrongly linked to the free function compute.\n"
        f" Got: {callees}"
    )


# --------------------------------------------------------------------------- #
# RED2 (virtual, static-type floor): Base declares+defines virtual compute;
# Derived overrides it. Called via a Base* receiver -> resolve to the STATIC
# type's method, Base::compute (the sound floor). Derived::compute is
# intentionally NOT also linked (documented non-goal: no over-approximation).
# --------------------------------------------------------------------------- #

_VIRTUAL_SRC = """
struct Base {
    virtual void compute() { }
};
struct Derived : Base {
    void compute() override { }
};
void run() {
    Base* b = nullptr;
    b->compute();
}
"""


def test_virtual_member_call_resolves_to_static_type_floor():
    """A virtual call through `Base*` links to Base::compute only.

    The override `Derived::compute` exists in the fixture and must be absent
    from the caller's edges.
    """
    graph = _build(_VIRTUAL_SRC)
    caller_id = "main.cpp:run"
    static_id = "main.cpp:Base::compute"
    override_id = "main.cpp:Derived::compute"
    callees = graph.call_graph.get(caller_id, [])

    assert static_id in callees, (
        "b->compute() (Base* receiver) did not resolve to the static type's "
        f"method Base::compute.\n Expected edge to: {static_id}\n Got: {callees}"
    )
    # Documented FLOOR: the derived override is NOT linked from a Base* call.
    assert override_id not in callees, (
        "b->compute() over-approximated: linked the derived override "
        f"{override_id}. The [30] floor links only the static type's method.\n"
        f" Got: {callees}"
    )


# --------------------------------------------------------------------------- #
# PRECISION NEG1: Derived DOES define compute (override) called via Derived* ->
# resolves to Derived::compute (its own), NOT Base::compute. The walk stops at
# the first definer (the receiver's own type).
# --------------------------------------------------------------------------- #

_OVERRIDE_SRC = """
struct Base {
    virtual void compute() { }
};
struct Derived : Base {
    void compute() override { }
    void run() {
        Derived* d = nullptr;
        d->compute();
    }
};
"""


def test_override_resolves_to_own_method_not_ancestor():
    """A call through `Derived*` links to Derived::compute, not Base::compute.

    `Derived` overrides `compute`, so the receiver's own type is the first
    definer and the ancestor's method must not be linked.
    """
    graph = _build(_OVERRIDE_SRC)
    caller_id = "main.cpp:Derived::run"
    own_id = "main.cpp:Derived::compute"
    ancestor_id = "main.cpp:Base::compute"
    callees = graph.call_graph.get(caller_id, [])

    assert own_id in callees, (
        "d->compute() (Derived* receiver, Derived overrides) did not resolve to "
        f"its own Derived::compute.\n Got: {callees}"
    )
    assert ancestor_id not in callees, (
        "d->compute() walked past its own override to Base::compute. The walk "
        f"must stop at the first definer.\n Got: {callees}"
    )


# --------------------------------------------------------------------------- #
# PRECISION NEG2: no ancestor defines the method -> NO type-dispatch edge, and
# no mislink to an unrelated free `compute`. Receiver type is known (Derived)
# but neither Derived nor its base Base defines `compute`; a free compute
# exists. The inheritance walk must return None (fall back). Fallback then
# links the free function (unchanged base-name behavior), but crucially the
# walk must NOT have invented a Base::compute / Derived::compute edge.
+# --------------------------------------------------------------------------- # + +_NO_ANCESTOR_DEF_SRC = """ +void compute() { } +struct Base { + void other() { } +}; +struct Derived : Base { + void run() { + Derived* d = nullptr; + d->compute(); + } +}; +""" + + +def test_no_ancestor_defines_method_no_false_edge(): + builder = _build(_NO_ANCESTOR_DEF_SRC) + caller = "main.cpp:Derived::run" + edges = builder.call_graph.get(caller, []) + base_compute = "main.cpp:Base::compute" + derived_compute = "main.cpp:Derived::compute" + # The walk must not fabricate a method edge for a method no ancestor defines. + assert base_compute not in edges and derived_compute not in edges, ( + f"Inheritance walk fabricated a method edge for an undefined method.\n" + f" Got: {edges}" + ) + + +# --------------------------------------------------------------------------- # +# PRECISION NEG3: cycle in (malformed) inheritance -> the BFS terminates and +# does not crash. A: B, B: A, neither defines compute; a call via A* must not +# hang/recurse forever and must produce no fabricated method edge. +# --------------------------------------------------------------------------- # + +_CYCLE_SRC = """ +struct B; +struct A : B { + void run() { + A* a = nullptr; + a->compute(); + } +}; +struct B : A { +}; +""" + + +def test_inheritance_cycle_terminates_no_crash(): + # Must not raise / hang. + builder = _build(_CYCLE_SRC) + caller = "main.cpp:A::run" + edges = builder.call_graph.get(caller, []) + assert "main.cpp:A::compute" not in edges + assert "main.cpp:B::compute" not in edges diff --git a/libs/openant-core/tests/parsers/c/test_call_graph_builder_u1.py b/libs/openant-core/tests/parsers/c/test_call_graph_builder_u1.py new file mode 100644 index 0000000..5249fb5 --- /dev/null +++ b/libs/openant-core/tests/parsers/c/test_call_graph_builder_u1.py @@ -0,0 +1,178 @@ +"""Regression tests for the C call graph builder — builtin-name-collision leak. 
+ +Bug report (BUG-NEW-2026-06-02-c-builtin_filter_leak, OpenAnt base 601e588): + A call to a *user-defined* function whose name collides with a C + stdlib/POSIX builtin (e.g. `close`, which is in STDLIB_FUNCTIONS) produces + NO edge in the call graph, because `_resolve_call` short-circuits with + `return None` on `_is_stdlib(call_name)` BEFORE it ever consults the + same-file user-function table. The callee is then falsely "isolated" / + unreachable. + +Root cause: + parsers/c/call_graph_builder.py `_resolve_call` — the `_is_stdlib` filter + runs first; the same-file lookup (step 1) is never reached for a colliding + name. + +Fix scope decision (see report): the pre-check is SCOPED to same-file +user-defined functions only. A genuine stdlib call (e.g. a real `printf` with +NO same-file definition) must still resolve to None — we must NOT route a +colliding name through a global cross-file single-match that could wrongly +link a real stdlib call to an unrelated same-named user function elsewhere. +""" + +import sys +from pathlib import Path + +_CORE_ROOT = Path(__file__).resolve().parents[3] +sys.path.insert(0, str(_CORE_ROOT)) + +from parsers.c.call_graph_builder import CallGraphBuilder + + +def _make_extractor_output() -> dict: + """User function `close` (collides with POSIX stdlib) + a caller, same file. + + Mirrors function_extractor output shape: each function id is + `:` and func_data carries name / file_path / code. 
+ """ + file_path = "main.c" + return { + "repository": "/tmp/fake", + "functions": { + f"{file_path}:close": { + "name": "close", + "file_path": file_path, + "code": "void close(int x) {\n}\n", + }, + f"{file_path}:caller": { + "name": "caller", + "file_path": file_path, + "code": "void caller(void) {\n close(3);\n}\n", + }, + }, + } + + +def test_user_function_colliding_with_stdlib_name_gets_edge(): + """caller() calls a same-file user `close` — the edge must exist.""" + builder = CallGraphBuilder(_make_extractor_output()) + builder.build_call_graph() + + caller = "main.c:caller" + callee = "main.c:close" + + assert callee in builder.call_graph[caller], ( + f"Forward call graph missing edge to same-file user function whose " + f"name collides with a stdlib builtin.\n" + f" Expected: {caller} -> {callee}\n" + f" Got: {builder.call_graph[caller]}" + ) + + +def test_user_function_colliding_with_stdlib_name_reverse_edge(): + """The colliding-name callee must list its caller in the reverse graph.""" + builder = CallGraphBuilder(_make_extractor_output()) + builder.build_call_graph() + + caller = "main.c:caller" + callee = "main.c:close" + + callers = builder.reverse_call_graph.get(callee, []) + assert caller in callers, ( + f"Reverse call graph missing caller for colliding-name user function.\n" + f" Expected to contain: {caller}\n" + f" Got: {callers}" + ) + + +def _make_real_stdlib_output() -> dict: + """A genuine stdlib call with NO same-file user definition. + + `printf` is a real stdlib call here and there is NO user-defined `printf` + anywhere — it must still resolve to nothing (no spurious edge). 
+ """ + file_path = "real.c" + return { + "repository": "/tmp/fake", + "functions": { + f"{file_path}:greet": { + "name": "greet", + "file_path": file_path, + "code": 'void greet(void) {\n printf("hi");\n}\n', + }, + }, + } + + +def test_real_stdlib_call_still_filtered(): + """A real stdlib call with no same-file user def must produce NO edge.""" + builder = CallGraphBuilder(_make_real_stdlib_output()) + builder.build_call_graph() + + caller = "real.c:greet" + assert builder.call_graph[caller] == [], ( + f"Real stdlib call should not resolve to any edge.\n" + f" Got: {builder.call_graph[caller]}" + ) + + +def _make_cross_file_stdlib_output() -> dict: + """SCOPE guard: a real stdlib call in file A must NOT link to a same-named + user function defined in an UNRELATED file B (no include relationship). + + File A (`a.c`) calls `open(...)` — a genuine stdlib call. File B (`b.c`) + happens to define a user function `open`. With NO include linking them, + the call in A must NOT be wired to B's `open`. 
+ """ + return { + "repository": "/tmp/fake", + "functions": { + "a.c:user_a": { + "name": "user_a", + "file_path": "a.c", + "code": "void user_a(void) {\n open(0);\n}\n", + }, + "b.c:open": { + "name": "open", + "file_path": "b.c", + "code": "void open(int x) {\n}\n", + }, + }, + } + + +def test_cross_file_stdlib_not_wrongly_linked(): + """A real stdlib call must not be linked to an unrelated same-named user + function in another (non-included) file.""" + builder = CallGraphBuilder(_make_cross_file_stdlib_output()) + builder.build_call_graph() + + caller = "a.c:user_a" + assert builder.call_graph[caller] == [], ( + f"Real stdlib call wrongly linked across files to an unrelated " + f"same-named user function.\n" + f" Got: {builder.call_graph[caller]}" + ) + + +def test_regex_fallback_resolves_colliding_user_function(): + """Sibling site: the regex fallback (_extract_calls_regex) must also resolve + a same-file user function whose name collides with a stdlib builtin, instead + of dropping it via an _is_stdlib pre-gate.""" + builder = CallGraphBuilder(_make_extractor_output()) + caller_id = "main.c:caller" + calls = builder._extract_calls_regex("close(3);", caller_id) + assert "main.c:close" in calls, ( + f"Regex fallback dropped the colliding-name same-file user function.\n" + f" Got: {calls}" + ) + + +def test_regex_fallback_still_filters_real_stdlib(): + """Sibling scope guard: the regex fallback must still drop a genuine stdlib + call with no same-file user definition.""" + builder = CallGraphBuilder(_make_real_stdlib_output()) + calls = builder._extract_calls_regex('printf("hi");', "real.c:greet") + assert calls == set(), ( + f"Regex fallback wrongly resolved a real stdlib call.\n Got: {calls}" + ) diff --git a/libs/openant-core/tests/parsers/c/test_function_extractor_u2.py b/libs/openant-core/tests/parsers/c/test_function_extractor_u2.py new file mode 100644 index 0000000..d92d9a9 --- /dev/null +++ 
b/libs/openant-core/tests/parsers/c/test_function_extractor_u2.py @@ -0,0 +1,150 @@ +"""Regression tests for the C/C++ FunctionExtractor — U2 blind-fix batch. + +Seven confirmed bugs (OpenAnt base 601e588 / 2e78d6a), all reproduced on the +real extractor (`FunctionExtractor(...).process_file(...)`): + + [15] out-of-line C++ constructor `Foo::Foo` recorded unit_type='method' + instead of 'constructor' (qualified name compared whole to class_name). + [14] out-of-line C++ destructor `Foo::~Foo` recorded 'method' not + 'destructor' (same qualified-vs-unqualified comparison bug). + [32] C++ struct member fn dropped class_name / unit_type='function' — the + tree walk only special-cased `class_specifier`, not `struct_specifier`. + [33] file-scope C++ lambda (`auto f = [](){...}`) never extracted — it lives + in a `declaration`/`init_declarator`/`lambda_expression`, not a + `function_definition`. + [35] C++ operator overload (`operator+`) never extracted — the + `operator_name` declarator node was unhandled, name resolved to None. + [39] explicit template specialization `g` collided with the primary + template `g` on func_id and silently overwrote it (template args + dropped from the id). + [40] free function inside a `namespace` wrongly carried class_name= + — namespace `::` qualifier conflated with a class qualifier. 
+""" + +import sys +import tempfile +from pathlib import Path + +_CORE_ROOT = Path(__file__).resolve().parents[3] +sys.path.insert(0, str(_CORE_ROOT)) + +from parsers.c.function_extractor import FunctionExtractor + + +def _extract(filename: str, source: str) -> dict: + """Run the real extractor on a temp source file; return the functions dict.""" + repo = Path(tempfile.mkdtemp()).resolve() + fp = repo / filename + fp.write_text(source) + ex = FunctionExtractor(str(repo)) + ex.process_file(fp) + return ex.functions + + +def _find(functions: dict, predicate): + for fid, data in functions.items(): + if predicate(fid, data): + return data + return None + + +# ---------------------------------------------------------------- [15] ctor +def test_outofline_constructor_classified_constructor(): + functions = _extract("ctor.cpp", "Foo::Foo() { }\n") + data = _find(functions, lambda fid, d: d["name"] == "Foo::Foo") + assert data is not None, f"ctor not extracted; got {list(functions)}" + assert data["unit_type"] == "constructor", ( + f"expected constructor, got {data['unit_type']!r}" + ) + + +# ---------------------------------------------------------------- [14] dtor +def test_outofline_destructor_classified_destructor(): + functions = _extract("dtor.cpp", "Foo::~Foo() { }\n") + data = _find(functions, lambda fid, d: d["name"] == "Foo::~Foo") + assert data is not None, f"dtor not extracted; got {list(functions)}" + assert data["unit_type"] == "destructor", ( + f"expected destructor, got {data['unit_type']!r}" + ) + + +# ------------------------------------------------------------ [32] struct member +def test_struct_member_method_metadata(): + functions = _extract("m.cpp", "struct Point {\n int dist() { return 0; }\n};\n") + data = _find(functions, lambda fid, d: d["name"].endswith("dist")) + assert data is not None, f"struct member not extracted; got {list(functions)}" + assert data["unit_type"] == "method", ( + f"expected method, got {data['unit_type']!r}" + ) + assert 
data["class_name"] == "Point", ( + f"expected class_name Point, got {data['class_name']!r}" + ) + + +# ------------------------------------------------------------ [32b] union member +def test_union_member_method_metadata(): + functions = _extract("u.cpp", "union U {\n int tag() { return 0; }\n};\n") + data = _find(functions, lambda fid, d: d["name"].endswith("tag")) + assert data is not None, f"union member not extracted; got {list(functions)}" + assert data["unit_type"] == "method", ( + f"expected method, got {data['unit_type']!r}" + ) + assert data["class_name"] == "U", ( + f"expected class_name U, got {data['class_name']!r}" + ) + + +# ---------------------------------------------------------------- [33] lambda +def test_file_scope_lambda_extracted(): + functions = _extract( + "m.cpp", "int ctrl() { return 0; }\nauto f = [](int x){ return x + 1; };\n" + ) + data = _find(functions, lambda fid, d: d["name"] == "f") + assert data is not None, f"lambda 'f' not extracted; got {list(functions)}" + + +# ---------------------------------------------------------------- [35] operator +def test_operator_overload_extracted(): + functions = _extract( + "m.cpp", + "int ctrl() { return 0; }\n\nclass V {\npublic:\n" + " int operator+(int x) { return x + 1; }\n};\n", + ) + data = _find(functions, lambda fid, d: "operator" in d["name"]) + assert data is not None, f"operator+ not extracted; got {list(functions)}" + assert data["class_name"] == "V", ( + f"expected class_name V, got {data['class_name']!r}" + ) + + +# ------------------------------------------------------------ [39] template spec +def test_template_specialization_distinct_from_primary(): + functions = _extract( + "m.cpp", + "int control(){return 1;}\n" + "template T g(T x){return x;}\n" + "template<> int g(int x){return x+1;}\n", + ) + # Both the primary template `g` and the specialization `g` must survive. 
+ spec = _find(functions, lambda fid, d: "g" in fid or "g" in d["name"]) + primary = _find( + functions, + lambda fid, d: (fid.endswith(":g") or d["name"] == "g"), + ) + assert spec is not None, f"g specialization absent; got {list(functions)}" + assert primary is not None, f"primary g absent; got {list(functions)}" + + +# ------------------------------------------------------------ [40] namespace free fn +def test_namespace_free_function_no_class_name(): + functions = _extract( + "m.cpp", "namespace ns {\nint freefunc(int x) {\n return x;\n}\n}\n" + ) + data = _find(functions, lambda fid, d: d["name"].endswith("freefunc")) + assert data is not None, f"freefunc not extracted; got {list(functions)}" + assert data["class_name"] is None, ( + f"expected class_name None for namespace free fn, got {data['class_name']!r}" + ) + assert data["unit_type"] != "method", ( + f"namespace free fn must not be a method, got {data['unit_type']!r}" + ) diff --git a/libs/openant-core/tests/parsers/c/test_unit_generator_u3.py b/libs/openant-core/tests/parsers/c/test_unit_generator_u3.py new file mode 100644 index 0000000..f7456e2 --- /dev/null +++ b/libs/openant-core/tests/parsers/c/test_unit_generator_u3.py @@ -0,0 +1,72 @@ +"""Regression test for BUG 29 (c schema_field_drift in unit_generator.py). + +Bug report (OpenAnt blind mining, 2026-06-04): + The C function extractor computes and stores `is_inline` for every function + (function_extractor._process_function_node, func_data['is_inline']), but the + unit generator's create_unit() never copies it into the assembled unit's + `metadata` block. Sibling boolean flags `is_static` and `is_exported` ARE + carried there, so `is_inline` is silently dropped at unit assembly. + +Reproduction: + Source `m.c` = 'inline int add(int a, int b) {\\n return a + b;\\n}\\n' + target = m.c:add, check = metadata, expected is_inline == True. 
+ +This test drives the REAL extractor to produce func_data, feeds a call-graph- +shaped dict to the REAL UnitGenerator, and asserts the unit exposes is_inline +at the same location as its sibling flags (unit['metadata']). +""" + +import sys +from pathlib import Path + +import pytest + +_CORE_ROOT = Path(__file__).resolve().parents[3] +sys.path.insert(0, str(_CORE_ROOT)) + +from parsers.c.function_extractor import FunctionExtractor +from parsers.c.unit_generator import UnitGenerator + + +INLINE_SRC = "inline int add(int a, int b) {\n return a + b;\n}\n" + + +def _extract_functions(tmp_path: Path) -> dict: + """Run the real C function extractor on an inline function and return func_data map.""" + src_file = tmp_path / "m.c" + src_file.write_text(INLINE_SRC) + + extractor = FunctionExtractor(str(tmp_path)) + result = extractor.extract_all(files=["m.c"]) + return result["functions"] + + +def _call_graph_data(functions: dict, repo: Path) -> dict: + """Shape the extractor output as call-graph data that UnitGenerator consumes.""" + return { + "repository": str(repo), + "functions": functions, + "call_graph": {}, + "reverse_call_graph": {}, + } + + +def test_extractor_produces_is_inline(tmp_path): + """Sanity: the producer actually emits is_inline=True for an inline function.""" + functions = _extract_functions(tmp_path) + assert "m.c:add" in functions, f"add not extracted; got {list(functions)}" + assert functions["m.c:add"]["is_inline"] is True + + +def test_unit_exposes_is_inline(tmp_path): + """BUG 29: the assembled unit must carry is_inline alongside its sibling flags.""" + functions = _extract_functions(tmp_path) + gen = UnitGenerator(_call_graph_data(functions, tmp_path)) + unit = gen.create_unit("m.c:add", functions["m.c:add"]) + + # Sibling flags live in unit['metadata']; is_inline must too. 
+ assert "is_inline" in unit["metadata"], ( + "is_inline dropped at unit assembly; " + f"metadata keys = {sorted(unit['metadata'])}" + ) + assert unit["metadata"]["is_inline"] is True