Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 189 additions & 8 deletions libs/openant-core/parsers/ruby/call_graph_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,12 +190,60 @@ def _extract_calls_from_code(self, code: str, caller_id: str) -> Set[str]:
except Exception:
return self._extract_calls_regex(code, caller_id)

# First pass: collect local-variable names (assignment LHS) and
# method-object bindings (`m = method(:sym)` / proc / lambda). These
# inform parenless-call precision [BUG 1] and `.call` resolution [BUG 31].
#
# SINGLE-UNCONDITIONAL-ASSIGNMENT GUARD for method-object bindings: a
# binding is kept ONLY when the variable is assigned exactly once AND at
# the method/program top level (a direct statement of the body, not
# nested inside an `if`/`unless`/`while`/`case`/`begin`/block). A var
# assigned 2+ times (last-write-wins) or bound conditionally is a
# "maybe" binding; resolving its `.call` would assert a maybe as
# definite, so we drop it and let `m.call` go unresolved (no edge).
# `local_vars` (for parenless precision) is NOT narrowed -- any
# assignment target is still a local variable for the bare-identifier
# guard, regardless of how many times / where it is bound.
local_vars: Set[str] = set()
assign_counts: Dict[str, int] = {}
top_level_binding: Dict[str, str] = {}
stack = [tree.root_node]
while stack:
node = stack.pop()
if node.type == 'assignment':
lhs = node.children[0] if node.children else None
if lhs is not None and lhs.type == 'identifier':
var_name = self._node_str(lhs, code_bytes)
local_vars.add(var_name)
assign_counts[var_name] = assign_counts.get(var_name, 0) + 1
if self._is_top_level_statement(node):
bound = self._method_object_target(node, code_bytes)
if bound:
top_level_binding[var_name] = bound
stack.extend(reversed(node.children))

# Keep a method-object binding only if it is single + unconditional:
# exactly one assignment to that name, and that binding is top-level.
method_object_bindings: Dict[str, str] = {
name: bound
for name, bound in top_level_binding.items()
if assign_counts.get(name, 0) == 1
}

# Second pass: resolve calls and bare (parenless) identifier calls.
stack = [tree.root_node]
while stack:
node = stack.pop()
if node.type == 'call':
resolved = self._resolve_call_node(node, code_bytes, caller_file,
caller_class, caller_module, caller_method)
caller_class, caller_module, caller_method,
method_object_bindings)
if resolved:
calls.add(resolved)
elif node.type == 'identifier':
resolved = self._resolve_bare_identifier(
node, code_bytes, caller_file, caller_class, local_vars
)
if resolved:
calls.add(resolved)
elif node.type == 'super':
Expand All @@ -209,15 +257,98 @@ def _extract_calls_from_code(self, code: str, caller_id: str) -> Set[str]:

return calls

def _node_str(self, node, source: bytes) -> str:
return source[node.start_byte:node.end_byte].decode('utf-8', errors='replace')

@staticmethod
def _is_top_level_statement(node) -> bool:
"""True if `node` is a direct statement of a method/program body.

Top-level means the parent is the method body (`body_statement`) or the
file program node. Anything nested in an `if`/`then`/`else`/`while`/
`case`/`begin`/block has some other parent, so it is conditional/looped
and not a single-unconditional binding.
"""
parent = node.parent
return parent is not None and parent.type in ('body_statement', 'program')

def _method_object_target(self, assignment_node, source: bytes) -> Optional[str]:
"""If RHS is method(:sym)/proc/lambda binding a named method, return the name.

Tracks `m = method(:helper)` so a later `m.call` resolves to `helper`.
"""
rhs = assignment_node.children[-1] if assignment_node.children else None
if rhs is None or rhs.type != 'call':
return None
callee = None
arg_list = None
for child in rhs.children:
if child.type == 'identifier' and callee is None:
callee = self._node_str(child, source)
elif child.type == 'argument_list':
arg_list = child
if callee != 'method' or arg_list is None:
return None
for arg in arg_list.children:
if arg.type in ('simple_symbol', 'symbol'):
sym = self._node_str(arg, source)
return sym.lstrip(':')
return None

def _resolve_bare_identifier(self, node, source: bytes, caller_file: str,
caller_class: Optional[str],
local_vars: Set[str]) -> Optional[str]:
"""Resolve a bare (parenless) identifier used as a call.

Precision guard: only treat a bare identifier as a call when its name
is a KNOWN user function, it is NOT a Ruby builtin, and it is NOT a
local variable / assignment target in this method body. We also skip
identifiers that are a structural part of a call/method/assignment
(their own handlers cover those) to avoid double-counting.
"""
parent = node.parent
if parent is not None and parent.type in (
'call', 'method', 'singleton_method', 'assignment', 'method_call',
):
return None
name = self._node_str(node, source)
if name in local_vars:
return None
if self._is_builtin(name):
return None
if name not in self.functions_by_name:
return None
return self._resolve_simple_call(name, caller_file, caller_class)

def _resolve_call_node(self, node, source: bytes, caller_file: str,
caller_class: Optional[str],
caller_module: Optional[str] = None,
caller_method: Optional[str] = None) -> Optional[str]:
caller_method: Optional[str] = None,
method_object_bindings: Optional[Dict[str, str]] = None
) -> Optional[str]:
"""Resolve a tree-sitter call node to a function ID."""
# `super(args)` is a call node whose head is a `super` node, not an
# identifier method name -- resolve it against the superclass.
if any(c.type == 'super' for c in node.children):
return self._resolve_super_call(caller_file, caller_class, caller_method)

method_object_bindings = method_object_bindings or {}

# Method-object call: `m = method(:helper)` then `m.call` resolves to the
# bound target [BUG 31]. The positional heuristic below mis-parses a
# lowercase-variable receiver (it captures the var as the method name),
# so detect this precisely via tree-sitter's named receiver/method fields.
recv_field = node.child_by_field_name('receiver')
meth_field = node.child_by_field_name('method')
if recv_field is not None and meth_field is not None:
recv_text = source[recv_field.start_byte:recv_field.end_byte].decode(
'utf-8', errors='replace')
meth_text = source[meth_field.start_byte:meth_field.end_byte].decode(
'utf-8', errors='replace')
if meth_text == 'call' and recv_text in method_object_bindings:
target_name = method_object_bindings[recv_text]
return self._resolve_simple_call(target_name, caller_file, caller_class)

# Extract method name
method_name = None
receiver = None
Expand Down Expand Up @@ -262,7 +393,16 @@ def _resolve_call_node(self, node, source: bytes, caller_file: str,
return self._resolve_class_call(receiver, target, caller_file)
return None

# A user method may share a name with a Ruby builtin (e.g. render, log,
# open). Don't let the builtin filter drop a call that resolves to a
# user-defined function visible in scope. Scope = same class or same
# file ONLY (no global cross-file single-match) so a genuine builtin
# call isn't wired to an unrelated same-named user method elsewhere.
if self._is_builtin(method_name):
if receiver is None and self._has_scoped_user_function(
method_name, caller_file, caller_class
):
return self._resolve_simple_call(method_name, caller_file, caller_class)
return None

# self.method(...) - same class
Expand Down Expand Up @@ -431,6 +571,25 @@ def _resolve_module_call(self, module_name: str, method_name: str,
fallback = func_id
return fallback

def _has_scoped_user_function(self, name: str, caller_file: str,
caller_class: Optional[str]) -> bool:
"""True if `name` is a user function visible in the caller's scope.

Scope is deliberately narrow (same class or same file). This lets a
user method that shadows a builtin name resolve, without globally
rescuing a genuine builtin call into an unrelated same-named method.
"""
if caller_class:
class_key = f"{caller_file}:{caller_class}"
for func_id in self.methods_by_class.get(class_key, []):
if self.functions.get(func_id, {}).get('name') == name:
return True
for func_id in self.functions_by_file.get(caller_file, []):
func_data = self.functions.get(func_id, {})
if func_data.get('name') == name and not func_data.get('class_name'):
return True
return False

def _resolve_self_call(self, method_name: str, caller_file: str,
caller_class: str) -> Optional[str]:
"""Resolve a self.method() call within a class."""
Expand Down Expand Up @@ -459,17 +618,39 @@ def _resolve_class_call(self, class_name: str, method_name: str,
if func_data.get('name') == method_name:
return func_id

# Check all files for the class
# Cross-file fallback. The previous behaviour linked the FIRST same-named
# class found ANYWHERE, which is unsound: it wired callers to unrelated
# files (and is wrong under same-name class collisions). Scope it: only
# accept a cross-file class when the caller's file require/require_relative
# resolves to the file that defines the class.
for key, func_ids in self.methods_by_class.items():
if key.endswith(f":{class_name}"):
for func_id in func_ids:
func_data = self.functions.get(func_id, {})
if func_data.get('name') == method_name:
return func_id
if not key.endswith(f":{class_name}"):
continue
key_file = key.rsplit(':', 1)[0]
if not self._file_is_required_by(key_file, caller_file):
continue
for func_id in func_ids:
func_data = self.functions.get(func_id, {})
if func_data.get('name') == method_name:
return func_id

# The receiver may be a module (e.g. Utils.helper for a module_function).
return self._resolve_module_call(class_name, method_name, caller_file)

def _file_is_required_by(self, target_file: str, caller_file: str) -> bool:
"""True if caller_file has a require/require_relative resolving to target_file."""
target_stem = target_file.replace('\\', '/').rsplit('/', 1)[-1]
if target_stem.endswith('.rb'):
target_stem = target_stem[:-len('.rb')]
file_imports = self.imports.get(caller_file, {})
for import_name, import_type in file_imports.items():
if import_type not in ('require', 'require_relative'):
continue
import_basename = import_name.replace('\\', '/').rsplit('/', 1)[-1]
if import_basename == target_stem:
return True
return False

def _extract_calls_regex(self, code: str, caller_id: str) -> Set[str]:
"""Fallback regex-based call extraction for unparseable code."""
calls = set()
Expand Down
43 changes: 39 additions & 4 deletions libs/openant-core/parsers/ruby/function_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,15 @@ def _classify_function(self, func_name: str, class_name: Optional[str],
"""
path_lower = file_path.lower()

if func_name == 'initialize':
return 'constructor'

# is_singleton must be checked BEFORE the initialize/constructor branch:
# `def self.initialize` is a class-level singleton method, not the
# instance constructor (Ruby's constructor is the instance `initialize`).
if is_singleton:
return 'singleton_method'

if func_name == 'initialize':
return 'constructor'

# Non-public methods inside a class are helpers/callbacks, never
# route handlers or callbacks-by-name, regardless of the enclosing
# class. This must precede the controller branch so private params
Expand Down Expand Up @@ -281,6 +284,16 @@ def _extract_imports(self, tree, source: bytes) -> Dict[str, str]:

return imports

def _body_node(self, node):
"""Return a node's body (`body` field, else a `body_statement` child)."""
body_node = node.child_by_field_name('body')
if body_node is None:
for child in node.children:
if child.type == 'body_statement':
body_node = child
break
return body_node

def _extract_functions_from_tree(self, tree, source: bytes, file_path: Path,
relative_path: str) -> None:
"""Extract all method definitions from a parsed tree.
Expand All @@ -303,12 +316,27 @@ def _extract_functions_from_tree(self, tree, source: bytes, file_path: Path,
node, source, relative_path, class_name, module_name,
is_singleton=False, visibility=vis_state[0],
)
# A nested `def` lives in the method's body -- keep traversing so
# methods defined inside another method are not lost. Nested defs
# inherit the enclosing class but default to public visibility.
body_node = self._body_node(node)
if body_node:
nested_vis = ['public']
for child in reversed(body_node.children):
stack.append((child, class_name, module_name, nested_vis))
continue

elif node.type == 'singleton_method':
self._process_method_node(
node, source, relative_path, class_name, module_name,
is_singleton=True, visibility=vis_state[0],
)
body_node = self._body_node(node)
if body_node:
nested_vis = ['public']
for child in reversed(body_node.children):
stack.append((child, class_name, module_name, nested_vis))
continue

elif node.type == 'alias':
# `alias new old` keyword form.
Expand Down Expand Up @@ -364,7 +392,13 @@ def _extract_functions_from_tree(self, tree, source: bytes, file_path: Path,
elif node.type == 'class':
# Extract class name
name_node = node.child_by_field_name('name')
new_class_name = self._node_text(name_node, source) if name_node else None
local_class_name = self._node_text(name_node, source) if name_node else None
# Compose with the enclosing class so a nested class keeps its
# outer qualifier, e.g. `Outer::Inner`.
if local_class_name and class_name:
new_class_name = f"{class_name}::{local_class_name}"
else:
new_class_name = local_class_name

# Extract superclass
superclass = None
Expand Down Expand Up @@ -533,6 +567,7 @@ def _emit_synthetic_method(self, node, source: bytes, relative_path: str,
self.stats['standalone_functions'] += 1
self.stats['by_type'][unit_type] = self.stats['by_type'].get(unit_type, 0) + 1


def _process_method_node(self, node, source: bytes, relative_path: str,
class_name: Optional[str], module_name: Optional[str],
is_singleton: bool, visibility: str = 'public') -> None:
Expand Down
Loading
Loading