diff --git a/libs/openant-core/parsers/php/function_extractor.py b/libs/openant-core/parsers/php/function_extractor.py index 582a689..4e68b9f 100644 --- a/libs/openant-core/parsers/php/function_extractor.py +++ b/libs/openant-core/parsers/php/function_extractor.py @@ -482,7 +482,7 @@ def _process_function_node(self, node, source: bytes, relative_path: str, 'unit_type': unit_type, } - self.functions[func_id] = func_data + self._store_function(func_id, func_data) self.stats['total_functions'] += 1 if class_name: @@ -524,7 +524,7 @@ def _process_closure_node(self, node, source: bytes, relative_path: str, func_id = f"{relative_path}:{qualified_name}" - self.functions[func_id] = { + self._store_function(func_id, { 'name': name, 'qualified_name': qualified_name, 'file_path': relative_path, @@ -536,11 +536,35 @@ def _process_closure_node(self, node, source: bytes, relative_path: str, 'parameters': parameters, 'is_static': False, 'unit_type': 'closure', - } + }) self.stats['total_functions'] += 1 self.stats['standalone_functions'] += 1 self.stats['by_type']['closure'] = self.stats['by_type'].get('closure', 0) + 1 + def _store_function(self, func_id: str, func_data: dict) -> str: + """Insert a function unit, keeping BOTH on a same-(file,name) collision. + + PHP forbids plain redefinition (fatal), so legal duplicates arise only + from mutually-exclusive conditional branches — an `if/else` or a + defensive double `if(!function_exists('x')){...}` — where which branch + is live is environment-dependent and the EARLIER branch may be the one + that runs. Keying solely on `qualified_name` let the later branch + overwrite the earlier (a silent false negative). Disambiguate + DETERMINISTICALLY by source line (`#L`); the earlier-in-source + unit keeps the clean id. Mirrors the Python extractor's `_store_function`. + """ + if func_id not in self.functions: + self.functions[func_id] = func_data + return func_id + line = func_data.get('start_line', 0) + unique_id = f"{func_id}#L{line}" + n = 2 + while unique_id in self.functions: + unique_id = f"{func_id}#L{line}.{n}" + n += 1 + self.functions[unique_id] = func_data + return unique_id + def _extract_module_level_unit(self, tree, source: bytes, relative_path: str) -> None: """Synthesise a `module_level` unit for top-level procedural statements. diff --git a/libs/openant-core/parsers/ruby/function_extractor.py b/libs/openant-core/parsers/ruby/function_extractor.py index 4ee6131..6225795 100644 --- a/libs/openant-core/parsers/ruby/function_extractor.py +++ b/libs/openant-core/parsers/ruby/function_extractor.py @@ -512,7 +512,7 @@ def _emit_synthetic_method(self, node, source: bytes, relative_path: str, qualified_name = name func_id = f"{relative_path}:{qualified_name}" - self.functions[func_id] = { + self._store_function(func_id, { 'name': name, 'qualified_name': qualified_name, 'file_path': relative_path, @@ -525,7 +525,7 @@ def _emit_synthetic_method(self, node, source: bytes, relative_path: str, 'is_singleton': False, 'visibility': visibility, 'unit_type': unit_type, - } + }) self.stats['total_functions'] += 1 if class_name: self.stats['total_methods'] += 1 @@ -533,6 +533,33 @@ def _emit_synthetic_method(self, node, source: bytes, relative_path: str, self.stats['standalone_functions'] += 1 self.stats['by_type'][unit_type] = self.stats['by_type'].get(unit_type, 0) + 1 + + def _store_function(self, func_id: str, func_data: dict) -> str: + """Insert a function unit, keeping BOTH on a same-(file,name) collision. + + Ruby `def` executes when reached, so same-name defs in mutually-exclusive + conditional branches (`if/else`) are both runtime-reachable depending on + the condition, and the EARLIER branch may be the live one. A plain + keep-last store let the later (possibly dead) branch overwrite the + earlier (possibly live) one — a silent false negative, confirmed against + the real `ruby` interpreter. Keep BOTH via a deterministic `#L` + suffix (earlier-in-source keeps the clean id), mirroring the Python + extractor's `_store_function`. Unconditional reopening is last-wins at + runtime; keeping both there is the same benign tradeoff Python accepts. + Collision-only: a unique name keeps its byte-identical `path:name` id. + """ + if func_id not in self.functions: + self.functions[func_id] = func_data + return func_id + line = func_data.get('start_line', 0) + unique_id = f"{func_id}#L{line}" + n = 2 + while unique_id in self.functions: + unique_id = f"{func_id}#L{line}.{n}" + n += 1 + self.functions[unique_id] = func_data + return unique_id + def _process_method_node(self, node, source: bytes, relative_path: str, class_name: Optional[str], module_name: Optional[str], is_singleton: bool, visibility: str = 'public') -> None: @@ -575,7 +602,7 @@ def _process_method_node(self, node, source: bytes, relative_path: str, 'unit_type': unit_type, } - self.functions[func_id] = func_data + self._store_function(func_id, func_data) self.stats['total_functions'] += 1 if class_name: diff --git a/libs/openant-core/tests/parsers/php/test_php_conditional_collision.py b/libs/openant-core/tests/parsers/php/test_php_conditional_collision.py new file mode 100644 index 0000000..ed698d9 --- /dev/null +++ b/libs/openant-core/tests/parsers/php/test_php_conditional_collision.py @@ -0,0 +1,66 @@ +"""Bug 1 (PHP): same-name functions defined in mutually-exclusive conditional +branches must NOT silently collapse to the last-in-source one. + +Verified harmful (independent + judge, real `php` interpreter): for + if (...) { function k(){ A } } else { function k(){ B } } +and the defensive double `function_exists` guard, the live definition can be the +EARLIER branch, but the extractor kept only the later (else) one — a silent +false negative for a SAST tool. Both branches are env-dependently reachable, so +the fix keeps BOTH (the `#L` disambiguation the Python extractor already +uses), not prefer-first/larger. Collision-only: a unique name keeps its clean id. +""" +import sys +import tempfile +from pathlib import Path + +_CORE_ROOT = Path(__file__).resolve().parents[3] +sys.path.insert(0, str(_CORE_ROOT)) + +from parsers.php.function_extractor import FunctionExtractor + + +def _extract(php_source: str, filename: str = "cond.php") -> dict: + repo = tempfile.mkdtemp() + Path(repo, filename).write_text(php_source) + return FunctionExtractor(repo).extract_all([filename]) + + +def _named(out: dict, name: str): + return [fid for fid, d in out["functions"].items() if d.get("name") == name] + + +def test_conditional_ifelse_keeps_both_branches(): + src = ( + "` +disambiguation the Python extractor already uses. Collision-only: a unique name +keeps its clean id. (Unconditional reopening stays correct — last-wins — and is +simply also kept-both, the same tradeoff Python accepts.) +""" +import sys +from pathlib import Path + +_CORE_ROOT = Path(__file__).resolve().parents[3] +sys.path.insert(0, str(_CORE_ROOT)) + +from parsers.ruby.function_extractor import FunctionExtractor + + +def _extract(tmp_path: Path, filename: str, source: str) -> dict: + (tmp_path / filename).write_text(source) + return FunctionExtractor(str(tmp_path)).extract_all([filename]) + + +def _named(out: dict, name: str): + return [fid for fid, d in out["functions"].items() if d.get("name") == name] + + +def test_conditional_ifelse_keeps_both_branches(tmp_path): + src = ( + "if ENV['X']\n" + " def k; if_branch; end\n" + "else\n" + " def k; else_branch; end\n" + "end\n" + ) + out = _extract(tmp_path, "cond.rb", src) + ks = _named(out, "k") + assert len(ks) == 2, f"both conditional branches of k must be kept; got {ks}" + bodies = " ".join(out["functions"][f]["code"] for f in ks) + assert "if_branch" in bodies and "else_branch" in bodies, f"a branch was dropped: {bodies}" + + +def test_unique_name_id_unchanged(tmp_path): + out = _extract(tmp_path, "u.rb", "def solo; 1; end\n") + ids = _named(out, "solo") + assert ids == ["u.rb:solo"], f"unique-name id must stay byte-identical; got {ids}"