Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions libs/openant-core/parsers/php/function_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -482,7 +482,7 @@ def _process_function_node(self, node, source: bytes, relative_path: str,
'unit_type': unit_type,
}

self.functions[func_id] = func_data
self._store_function(func_id, func_data)
self.stats['total_functions'] += 1

if class_name:
Expand Down Expand Up @@ -524,7 +524,7 @@ def _process_closure_node(self, node, source: bytes, relative_path: str,

func_id = f"{relative_path}:{qualified_name}"

self.functions[func_id] = {
self._store_function(func_id, {
'name': name,
'qualified_name': qualified_name,
'file_path': relative_path,
Expand All @@ -536,11 +536,35 @@ def _process_closure_node(self, node, source: bytes, relative_path: str,
'parameters': parameters,
'is_static': False,
'unit_type': 'closure',
}
})
self.stats['total_functions'] += 1
self.stats['standalone_functions'] += 1
self.stats['by_type']['closure'] = self.stats['by_type'].get('closure', 0) + 1

def _store_function(self, func_id: str, func_data: dict) -> str:
"""Insert a function unit, keeping BOTH on a same-(file,name) collision.

PHP forbids plain redefinition (fatal), so legal duplicates arise only
from mutually-exclusive conditional branches — an `if/else` or a
defensive double `if(!function_exists('x')){...}` — where which branch
is live is environment-dependent and the EARLIER branch may be the one
that runs. Keying solely on `qualified_name` let the later branch
overwrite the earlier (a silent false negative). Disambiguate
DETERMINISTICALLY by source line (`#L<line>`); the earlier-in-source
unit keeps the clean id. Mirrors the Python extractor's `_store_function`.
"""
if func_id not in self.functions:
self.functions[func_id] = func_data
return func_id
line = func_data.get('start_line', 0)
unique_id = f"{func_id}#L{line}"
n = 2
while unique_id in self.functions:
unique_id = f"{func_id}#L{line}.{n}"
n += 1
self.functions[unique_id] = func_data
return unique_id

def _extract_module_level_unit(self, tree, source: bytes,
relative_path: str) -> None:
"""Synthesise a `module_level` unit for top-level procedural statements.
Expand Down
33 changes: 30 additions & 3 deletions libs/openant-core/parsers/ruby/function_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,7 +512,7 @@ def _emit_synthetic_method(self, node, source: bytes, relative_path: str,
qualified_name = name

func_id = f"{relative_path}:{qualified_name}"
self.functions[func_id] = {
self._store_function(func_id, {
'name': name,
'qualified_name': qualified_name,
'file_path': relative_path,
Expand All @@ -525,14 +525,41 @@ def _emit_synthetic_method(self, node, source: bytes, relative_path: str,
'is_singleton': False,
'visibility': visibility,
'unit_type': unit_type,
}
})
self.stats['total_functions'] += 1
if class_name:
self.stats['total_methods'] += 1
else:
self.stats['standalone_functions'] += 1
self.stats['by_type'][unit_type] = self.stats['by_type'].get(unit_type, 0) + 1


def _store_function(self, func_id: str, func_data: dict) -> str:
"""Insert a function unit, keeping BOTH on a same-(file,name) collision.

Ruby `def` executes when reached, so same-name defs in mutually-exclusive
conditional branches (`if/else`) are both runtime-reachable depending on
the condition, and the EARLIER branch may be the live one. A plain
keep-last store let the later (possibly dead) branch overwrite the
earlier (possibly live) one — a silent false negative, confirmed against
the real `ruby` interpreter. Keep BOTH via a deterministic `#L<line>`
suffix (earlier-in-source keeps the clean id), mirroring the Python
extractor's `_store_function`. Unconditional reopening is last-wins at
runtime; keeping both there is the same benign tradeoff Python accepts.
Collision-only: a unique name keeps its byte-identical `path:name` id.
"""
if func_id not in self.functions:
self.functions[func_id] = func_data
return func_id
line = func_data.get('start_line', 0)
unique_id = f"{func_id}#L{line}"
n = 2
while unique_id in self.functions:
unique_id = f"{func_id}#L{line}.{n}"
n += 1
self.functions[unique_id] = func_data
return unique_id

def _process_method_node(self, node, source: bytes, relative_path: str,
class_name: Optional[str], module_name: Optional[str],
is_singleton: bool, visibility: str = 'public') -> None:
Expand Down Expand Up @@ -575,7 +602,7 @@ def _process_method_node(self, node, source: bytes, relative_path: str,
'unit_type': unit_type,
}

self.functions[func_id] = func_data
self._store_function(func_id, func_data)
self.stats['total_functions'] += 1

if class_name:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
"""Bug 1 (PHP): same-name functions defined in mutually-exclusive conditional
branches must NOT silently collapse to the last-in-source one.

Verified harmful (independent + judge, real `php` interpreter): for
if (...) { function k(){ A } } else { function k(){ B } }
and the defensive double `function_exists` guard, the live definition can be the
EARLIER branch, but the extractor kept only the later (else) one — a silent
false negative for a SAST tool. Both branches are env-dependently reachable, so
the fix keeps BOTH (the `#L<line>` disambiguation the Python extractor already
uses), not prefer-first/larger. Collision-only: a unique name keeps its clean id.
"""
import sys
import tempfile
from pathlib import Path

_CORE_ROOT = Path(__file__).resolve().parents[3]
sys.path.insert(0, str(_CORE_ROOT))

from parsers.php.function_extractor import FunctionExtractor


def _extract(php_source: str, filename: str = "cond.php") -> dict:
repo = tempfile.mkdtemp()
Path(repo, filename).write_text(php_source)
return FunctionExtractor(repo).extract_all([filename])


def _named(out: dict, name: str):
return [fid for fid, d in out["functions"].items() if d.get("name") == name]


def test_conditional_ifelse_keeps_both_branches():
src = (
"<?php\n"
"if (defined('X')) {\n"
" function k() { return if_branch(); }\n"
"} else {\n"
" function k() { return else_branch(); }\n"
"}\n"
)
out = _extract(src)
ks = _named(out, "k")
assert len(ks) == 2, f"both conditional branches of k() must be kept; got {ks}"
bodies = " ".join(out["functions"][f]["code"] for f in ks)
assert "if_branch" in bodies and "else_branch" in bodies, f"a branch was dropped: {bodies}"


def test_double_function_exists_guard_keeps_both():
src = (
"<?php\n"
"if (!function_exists('h')) {\n"
" function h() { return real(); }\n"
"}\n"
"if (!function_exists('h')) {\n"
" function h() { return fallback(); }\n"
"}\n"
)
out = _extract(src)
hs = _named(out, "h")
assert len(hs) == 2, f"both guarded defs of h() must be kept; got {hs}"


def test_unique_name_id_unchanged():
out = _extract("<?php\nfunction solo() { return 1; }\n", filename="u.php")
ids = _named(out, "solo")
assert ids == ["u.php:solo"], f"unique-name id must stay byte-identical; got {ids}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
"""Bug 1 (Ruby): same-name defs in mutually-exclusive conditional branches must
NOT collapse to the last-in-source one.

Verified harmful (independent + judge, real `ruby` interpreter): for
if COND then def k; A end else def k; B end end
only the taken branch's `def` executes — and it may be the EARLIER (if) branch,
but the extractor kept only the later (else) one. Both branches are
env-dependently reachable, so the fix keeps BOTH via the `#L<line>`
disambiguation the Python extractor already uses. Collision-only: a unique name
keeps its clean id. (Unconditional reopening stays correct — last-wins — and is
simply also kept-both, the same tradeoff Python accepts.)
"""
import sys
from pathlib import Path

_CORE_ROOT = Path(__file__).resolve().parents[3]
sys.path.insert(0, str(_CORE_ROOT))

from parsers.ruby.function_extractor import FunctionExtractor


def _extract(tmp_path: Path, filename: str, source: str) -> dict:
(tmp_path / filename).write_text(source)
return FunctionExtractor(str(tmp_path)).extract_all([filename])


def _named(out: dict, name: str):
return [fid for fid, d in out["functions"].items() if d.get("name") == name]


def test_conditional_ifelse_keeps_both_branches(tmp_path):
src = (
"if ENV['X']\n"
" def k; if_branch; end\n"
"else\n"
" def k; else_branch; end\n"
"end\n"
)
out = _extract(tmp_path, "cond.rb", src)
ks = _named(out, "k")
assert len(ks) == 2, f"both conditional branches of k must be kept; got {ks}"
bodies = " ".join(out["functions"][f]["code"] for f in ks)
assert "if_branch" in bodies and "else_branch" in bodies, f"a branch was dropped: {bodies}"


def test_unique_name_id_unchanged(tmp_path):
out = _extract(tmp_path, "u.rb", "def solo; 1; end\n")
ids = _named(out, "solo")
assert ids == ["u.rb:solo"], f"unique-name id must stay byte-identical; got {ids}"