diff --git a/libs/openant-core/parsers/javascript/typescript_analyzer.js b/libs/openant-core/parsers/javascript/typescript_analyzer.js index baf219c..30336f4 100644 --- a/libs/openant-core/parsers/javascript/typescript_analyzer.js +++ b/libs/openant-core/parsers/javascript/typescript_analyzer.js @@ -219,6 +219,68 @@ class TypeScriptAnalyzer { /** * Extract all functions/methods from a source file */ + // Module-level FunctionDeclarations: top-level OR nested only inside blocks / + // control-flow statements (if/else, try/catch, for/while, switch, bare {}) — + // NOT inside another function/method/accessor (their text already rides inside + // the parent unit) and NOT inside a class `static {}` block (block-scoped to + // the initializer, callable nowhere). `getFunctions()` returned only top-level + // declarations, so block-scoped functions were silently dropped from both the + // inventory and the call graph. Returns [{node, id}] with COLLISION-ONLY + // `#L` disambiguation (sibling-block same-name functions are both + // runtime-reachable, so keep both; unique names keep the plain `path:name` + // id). Both the inventory and call-graph builders iterate this same list so + // their keys match exactly (the `len(callGraph) === len(functions)` lockstep). + // The named FunctionDeclaration nodes that are module-level (top-level OR only + // block-nested), excluding function-nested and static-block-scoped ones. Used + // by the inventory builder, the call-graph builder, AND the resolver — all + // three must see the same set so a block function is a unit, has its own + // edges, and is resolvable as a call target. + _moduleLevelFunctionNodes(sourceFile) { + const STOP = new Set([ + ts.SyntaxKind.FunctionDeclaration, + ts.SyntaxKind.FunctionExpression, + ts.SyntaxKind.ArrowFunction, + ts.SyntaxKind.MethodDeclaration, + ts.SyntaxKind.Constructor, + ts.SyntaxKind.GetAccessor, + ts.SyntaxKind.SetAccessor, + ts.SyntaxKind.ClassStaticBlockDeclaration, + ]); + const isModuleLevel = (fn) => { + for (let p = fn.getParent(); p; p = p.getParent()) { + if (STOP.has(p.getKind())) return false; + } + return true; + }; + return sourceFile + .getDescendantsOfKind(ts.SyntaxKind.FunctionDeclaration) + .filter((fn) => fn.getName() && isModuleLevel(fn)); + } + + _moduleLevelFunctionEntries(sourceFile, relativePath) { + const fns = this._moduleLevelFunctionNodes(sourceFile); + const countByName = {}; + for (const fn of fns) { + const n = fn.getName(); + countByName[n] = (countByName[n] || 0) + 1; + } + const seen = new Set(); + return fns.map((fn) => { + const name = fn.getName(); + let id = `${relativePath}:${name}`; + if (countByName[name] > 1) { + // collision-only: keep both, deterministic by source line (never by + // emit order). Colon-free `#L` suffix survives the downstream + // `split(':')[0]` (file) / `rsplit(':',1)[-1]` (name) id contracts. + let uid = `${id}#L${fn.getStartLineNumber()}`; + while (seen.has(uid)) uid = `${uid}.x`; + seen.add(uid); + id = uid; + } + return { node: fn, id }; + }); + } + extractFunctionsFromFile(sourceFile) { // Always emit POSIX-style relative paths so functionId values are // stable across platforms (Python downstream consumers and dataset @@ -227,13 +289,13 @@ class TypeScriptAnalyzer { path.relative(this.repoPath, sourceFile.getFilePath()), ); - // Extract function declarations - for (const func of sourceFile.getFunctions()) { + // Extract function declarations (top-level + module-level block-scoped). + for (const { node: func, id: functionId } of this._moduleLevelFunctionEntries( + sourceFile, + relativePath, + )) { const name = func.getName(); - if (!name) continue; - const code = func.getFullText(); - const functionId = `${relativePath}:${name}`; this.functions[functionId] = { name: name, code: code, @@ -1249,12 +1311,13 @@ class TypeScriptAnalyzer { path.relative(this.repoPath, sourceFile.getFilePath()), ); - // Analyze function declarations - for (const func of sourceFile.getFunctions()) { - const name = func.getName(); - if (!name) continue; - - const callerId = `${relativePath}:${name}`; + // Analyze function declarations (same enumeration + id scheme as the + // inventory builder, so callGraph keys match functions keys exactly — + // a block function must get its REAL outgoing edges, not a backstop []). + for (const { node: func, id: callerId } of this._moduleLevelFunctionEntries( + sourceFile, + relativePath, + )) { this.callGraph[callerId] = this.extractCallsFromFunction( func, relativePath, @@ -1529,9 +1592,10 @@ function extractSingleFunction(filePath, functionRef) { } } - // 2. Try standalone function declarations + // 2. Try standalone function declarations (incl. module-level block-scoped, + // so a call TO a function defined inside an if/try/for block resolves). if (!foundFunction) { - for (const func of sourceFile.getFunctions()) { + for (const func of this._moduleLevelFunctionNodes(sourceFile)) { if (func.getName() === functionName) { foundFunction = { node: func, diff --git a/libs/openant-core/tests/parsers/javascript/test_block_scoped_functions.py b/libs/openant-core/tests/parsers/javascript/test_block_scoped_functions.py new file mode 100644 index 0000000..ee19b87 --- /dev/null +++ b/libs/openant-core/tests/parsers/javascript/test_block_scoped_functions.py @@ -0,0 +1,141 @@ +"""Bug: block-scoped function declarations are silently dropped from BOTH the +function inventory and the call graph. + +The analyzer enumerated functions via `sourceFile.getFunctions()` (top-level +only) in extractFunctionsFromFile AND buildCallGraphForFile, so a function +declared inside any block (if/else, try/catch, for/while/switch, bare {}) was +invisible — never a unit, never a reachability node, never an entry point. For a +SAST tool a vulnerable function gated behind `if (process.env.X)` or in a +`catch` fallback is then never analyzed. + +Investigated independent + expert + judge. Settled scope (verified against the +`node` runtime): + * MODULE-LEVEL block functions (no function-like ancestor) MUST be surfaced. + * Functions nested inside another function/method stay omitted (their text + rides inside the parent unit). + * Functions inside a `static {}` class block stay omitted (uncallable). + * Both loops patched in lockstep so callGraph keys == functions keys (the + backstop must not mask a missing-edge gap). + * Sibling-block same-name functions are both runtime-reachable -> keep BOTH + via a `#L` suffix (collision-only; unique names keep the clean id). +""" +import json +import shutil +import subprocess +from pathlib import Path + +import pytest + +PARSERS_JS_DIR = Path(__file__).parent.parent.parent.parent / "parsers" / "javascript" +NODE_MODULES = PARSERS_JS_DIR / "node_modules" + +pytestmark = pytest.mark.skipif( + not shutil.which("node") or not NODE_MODULES.exists(), + reason="Node.js or JS parser npm dependencies not available", +) + + +def _analyze(tmp_path, source, filename="a.js"): + repo = tmp_path / "r" + repo.mkdir(exist_ok=True) + fp = repo / filename + fp.write_text(source) + cmd = ["node", str(PARSERS_JS_DIR / "typescript_analyzer.js"), str(repo), str(fp)] + res = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + assert res.returncode == 0, f"analyzer failed:\n{res.stderr}" + return json.loads(res.stdout) + + +def _names(out): + return sorted(k.split(":", 1)[1] for k in out["functions"]) + + +# --- the core bug: module-level block functions surfaced in BOTH maps -------- + +@pytest.mark.parametrize("wrap", [ + "if (c) { %s }", + "if (c) {} else { %s }", + "try { %s } catch (e) {}", + "try {} catch (e) { %s }", + "for (let i=0;i<1;i++) { %s }", + "while (c) { %s }", + "switch (x) { case 1: { %s } }", + "{ %s }", +]) +def test_block_scoped_function_is_extracted(tmp_path, wrap): + src = "function topLevel(){ return 1; }\n" + wrap % "function vuln(){ return sink(); }" + out = _analyze(tmp_path, src) + assert "vuln" in _names(out), f"block function dropped from inventory: {_names(out)}" + # and it must be a real call-graph node, not a backstop-filled empty entry + vuln_id = next(k for k in out["functions"] if k.endswith(":vuln")) + assert vuln_id in out["callGraph"], "vuln missing from callGraph" + edges = out["callGraph"][vuln_id] + # callGraph edges are dicts: {"name": "sink", "resolved": bool, ...} + edge_names = [e.get("name") if isinstance(e, dict) else e for e in edges] + assert "sink" in edge_names, ( + f"vuln's outgoing edge to sink lost (backstop masked it): {edges}" + ) + + +def test_incoming_call_to_block_function_resolves(tmp_path): + # A call TO a module-level block-scoped function must resolve in the RESOLVED + # call graph, so the block function is reachable (not pruned). Guards the + # sibling resolver site (_buildResolvedGraphs) — without it the function is a + # unit but has no resolved incoming edge. + src = "function caller(){ return helper(); }\nif (c) { function helper(){ return 1; } }\n" + out = _analyze(tmp_path, src) + caller_id = next(k for k in out["functions"] if k.endswith(":caller")) + helper_id = next(k for k in out["functions"] if k.endswith(":helper")) + assert helper_id in out["call_graph"].get(caller_id, []), ( + f"caller->helper not resolved: {out['call_graph'].get(caller_id)}" + ) + assert caller_id in out["reverse_call_graph"].get(helper_id, []), ( + f"block fn has no resolved incoming edge: {out['reverse_call_graph'].get(helper_id)}" + ) + + +def test_invariant_callgraph_keys_equal_functions_keys(tmp_path): + src = "if (c) { function a(){ b(); } }\nfunction b(){}\n" + out = _analyze(tmp_path, src) + assert set(out["functions"]) == set(out["callGraph"]), ( + "callGraph keys diverge from functions keys (lockstep broken)" + ) + + +# --- keep-both for sibling-block same-name (both runtime-reachable) ----------- + +def test_sibling_block_same_name_keeps_both(tmp_path): + src = ( + "if (c) { function dup(){ return ifBranch(); } }\n" + "else { function dup(){ return elseBranch(); } }\n" + ) + out = _analyze(tmp_path, src) + dups = [k for k in out["functions"] if k.split(":", 1)[1].startswith("dup")] + assert len(dups) == 2, f"both sibling-block dup defs must survive; got {dups}" + bodies = " ".join(out["functions"][k]["code"] for k in dups) + assert "ifBranch" in bodies and "elseBranch" in bodies, f"a branch was dropped: {bodies}" + + +# --- the omit cases (must NOT over-extract) ----------------------------------- + +def test_function_nested_in_function_stays_omitted(tmp_path): + out = _analyze(tmp_path, "function outer(){ function inner(){ return 1; } return inner(); }") + assert "inner" not in _names(out), f"nested-in-function should be omitted: {_names(out)}" + assert "outer" in _names(out) + + +def test_function_in_static_block_stays_omitted(tmp_path): + # class static-init block: the function is block-scoped to the initializer + # and callable nowhere -> must not become a (false-positive) unit. + out = _analyze(tmp_path, "class C { static { function s(){ return 1; } } }") + assert "s" not in _names(out), f"static-block function should be omitted: {_names(out)}" + + +# --- regression: unique top-level id unchanged -------------------------------- + +def test_unique_top_level_id_unchanged(tmp_path): + out = _analyze(tmp_path, "function solo(){ return 1; }") + ids = [k for k in out["functions"] if k.endswith("solo")] + assert len(ids) == 1 and ids[0].endswith(":solo") and "#L" not in ids[0], ( + f"unique-name id must stay byte-identical: {ids}" + )