From 45f36554622965d82f7132eddcc4699bcdb37b8a Mon Sep 17 00:00:00 2001 From: azizur100389 Date: Sat, 11 Apr 2026 23:10:23 +0100 Subject: [PATCH] feat(bash): add .ksh extension to bash parser (#235) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Register .ksh (Korn shell) with tree-sitter-bash alongside the existing .sh / .bash / .zsh entries added in #227. Korn shell is close enough to bash syntactically that tree-sitter-bash handles the structural features the graph captures (function definitions, commands, source/. includes) correctly. Context ------- In the close comment on PR #230, @tirth8205 explicitly flagged .ksh as a missing extension: "The .ksh extension in particular looks worth adding — I didn't include it in #227." This PR addresses exactly that gap. Issue #235 tracks the request. Why it matters -------------- Korn shell is still used in legacy AIX/Solaris operations, IBM internal tooling, and enterprise CI scripts. Repositories that ship .ksh scripts currently index to 0 nodes because the extension is unrecognized — the same failure mode that motivated #197. Implementation -------------- One line added to EXTENSION_TO_LANGUAGE in parser.py: ".ksh": "bash" All of the bash parsing machinery shipped in #227 (_FUNCTION_TYPES, _CALL_TYPES, _extract_bash_source_command, name/call resolution) already supports any file parsed through the "bash" language path, so no further changes are needed. Tests added (tests/test_multilang.py::TestBashParsing) ------------------------------------------------------ 1. test_detects_language — extended with a .ksh assertion to lock in the extension mapping (regression guard for #235). 2. 
test_ksh_extension_parses_as_bash — end-to-end regression test that copies the existing tests/fixtures/sample.sh to a temp .ksh file, parses it through the real CodeParser, and asserts: - every node's language field is "bash" - the set of extracted Function names is identical to the .sh run - the CONTAINS / CALLS / IMPORTS_FROM edge counts per kind match The second and third assertions prove the .ksh path is fully wired through to the same structural extraction as .sh, not a degenerate zero-result read. Test results ------------ Stage 1 (new targeted tests): 2/2 passed. Stage 2 (tests/test_multilang.py full): 152/152 passed — zero regressions across any language. Stage 3 (tests/test_parser.py adjacent): 67/67 passed. Stage 4 (full suite): 733 passed. 8 pre-existing Windows failures in test_incremental (3) + test_main async coroutine detection (1) + test_notebook Databricks (4) — verified identical on unchanged main. Stage 5 (ruff check on parser.py and test_multilang.py): clean. Stage 6 (end-to-end smoke): detect_language("legacy.ksh") -> "bash"; parsing a real .ksh file produces 6 Function nodes, 18 edges, all tagged language=bash. Zero regressions. Single-line extension mapping change plus a targeted regression guard against the specific issue the maintainer flagged. 
---
 code_review_graph/parser.py |  1 +
 tests/test_multilang.py     | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/code_review_graph/parser.py b/code_review_graph/parser.py
index 31af17f..69c7526 100644
--- a/code_review_graph/parser.py
+++ b/code_review_graph/parser.py
@@ -108,6 +108,7 @@ class EdgeInfo:
     ".sh": "bash",
     ".bash": "bash",
     ".zsh": "bash",
+    ".ksh": "bash",  # Korn shell — close enough to bash for tree-sitter-bash (#235)
     ".ex": "elixir",
     ".exs": "elixir",
     ".ipynb": "notebook",
diff --git a/tests/test_multilang.py b/tests/test_multilang.py
index 1264dc9..60e3595 100644
--- a/tests/test_multilang.py
+++ b/tests/test_multilang.py
@@ -1087,6 +1087,45 @@ def test_detects_language(self):
         assert self.parser.detect_language(Path("build.sh")) == "bash"
         assert self.parser.detect_language(Path("build.bash")) == "bash"
         assert self.parser.detect_language(Path("run.zsh")) == "bash"
+        # Regression for #235 — Korn shell (.ksh) should parse as bash.
+        assert self.parser.detect_language(Path("legacy.ksh")) == "bash"
+
+    def test_ksh_extension_parses_as_bash(self, tmp_path):
+        """Regression for #235: ``.ksh`` takes the same bash path as ``.sh``.
+
+        Copies the ``sample.sh`` fixture to a temporary ``.ksh`` file, parses
+        it, and compares the result against ``self.nodes`` / ``self.edges``,
+        which the class setup is expected to hold for the ``.sh`` fixture.
+        """
+        from collections import Counter
+
+        fixture_source = (FIXTURES / "sample.sh").read_text(encoding="utf-8")
+        ksh_copy = tmp_path / "legacy.ksh"
+        ksh_copy.write_text(fixture_source, encoding="utf-8")
+
+        ksh_nodes, ksh_edges = self.parser.parse_file(ksh_copy)
+
+        # Rule out a degenerate zero-result parse, then check language tags.
+        assert ksh_nodes, "parser produced zero nodes for .ksh file"
+        for n in ksh_nodes:
+            assert n.language == "bash", f"{n.name} tagged as {n.language!r}"
+
+        # Same function set as the .sh fixture.
+        ksh_funcs = {n.name for n in ksh_nodes if n.kind == "Function"}
+        sh_funcs = {n.name for n in self.nodes if n.kind == "Function"}
+        assert ksh_funcs == sh_funcs, (
+            f".ksh and .sh produced different function sets: "
+            f"sh-only={sh_funcs - ksh_funcs}, ksh-only={ksh_funcs - sh_funcs}"
+        )
+
+        # Same edge totals for every edge kind.
+        ksh_counts = Counter(e.kind for e in ksh_edges)
+        sh_counts = Counter(e.kind for e in self.edges)
+        assert ksh_counts == sh_counts, (
+            f".ksh and .sh produced different edge counts by kind: "
+            f"sh={dict(sh_counts)}, ksh={dict(ksh_counts)}"
+        )
 
     def test_nodes_have_bash_language(self):
         for n in self.nodes: