From 42cbb41a01edea6daae503cdee104ba550288cb1 Mon Sep 17 00:00:00 2001
From: azizur100389 <azizur100389@gmail.com>
Date: Sun, 12 Apr 2026 00:22:07 +0100
Subject: [PATCH] feat(parser): shebang-based language detection for
 extension-less scripts (#237)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a shebang fallback to `CodeParser.detect_language()` so that
extension-less Unix scripts (`bin/myapp`, `.git/hooks/pre-commit`,
`scripts/deploy`, `.husky/pre-push`, `installer`, ...) are routed to the
correct tree-sitter grammar based on their first line.

Root cause of #237
------------------
`detect_language()` was a single-line lookup against `EXTENSION_TO_LANGUAGE`
keyed on `path.suffix.lower()`.  Any file with no extension returned
`None`, which filters it out of both `incremental_update()` and
`full_build()` before parsing.  Real-world repos rely heavily on
extension-less scripts for entrypoints, git hooks, CI installers, and
shell tooling — all currently invisible to `callers_of`,
`get_impact_radius`, `detect_changes`, and architecture mapping.

Fix
---
1. New module-level `SHEBANG_INTERPRETER_TO_LANGUAGE` table mapping common
   interpreter basenames to languages that are *already* registered:
     - bash / sh / zsh / ksh / dash / ash -> "bash"
     - python / python2 / python3 / pypy / pypy3 -> "python"
     - node / nodejs -> "javascript"
     - ruby, perl, lua, Rscript, php
   This table strictly *routes* extension-less files to existing languages;
   it does NOT introduce new grammars.

2. New `_SHEBANG_PROBE_BYTES = 256` constant — maximum bytes read from the
   head of a file when probing.  Enough for any reasonable shebang line
   while keeping worst-case I/O tiny.

3. New `CodeParser._detect_language_from_shebang(path)` static method.
   Opens the file, reads up to 256 bytes, verifies `#!` prefix, splits on
   the first newline AND first NUL byte (defensive against binary), and
   decodes UTF-8 strictly so malformed content returns None instead of
   raising.  Handles:
     - direct form            #!/bin/bash
     - env indirection        #!/usr/bin/env bash
     - env -S flag (Linux)    #!/usr/bin/env -S node --experimental-vm-modules
     - trailing flags         #!/bin/bash -e
     - interpreter basename extraction from any absolute path
     - CRLF line endings (split on `\n`; the trailing `\r` is whitespace-stripped)

4. `detect_language(path)` now tries the extension lookup first, and if it
   returns None AND `path.suffix == ""`, falls back to the shebang probe.
   Files with a *known* extension are NEVER re-read — extension-based
   detection remains authoritative.

Non-regressions guaranteed by the design
----------------------------------------
- `.py` files still parse as Python even if the first line is a misleading
  `#!/bin/bash`  (`test_detect_shebang_does_not_override_extension`)
- Extension-less README / LICENSE files return None after a read of at most
  256 bytes that finds no shebang.
- Binary files whose first bytes are not `#!` return None without raising.
- Unknown interpreters (e.g. `#!/usr/bin/env ocaml`) return None — same
  semantics as an unmapped extension.

Tests added (tests/test_parser.py::TestCodeParser — 16 tests)
-------------------------------------------------------------
- test_detect_shebang_bin_bash
- test_detect_shebang_bin_sh_routed_to_bash
- test_detect_shebang_env_bash
- test_detect_shebang_env_python3
- test_detect_shebang_direct_python
- test_detect_shebang_node
- test_detect_shebang_env_dash_s_flag
- test_detect_shebang_ruby
- test_detect_shebang_perl
- test_detect_shebang_with_trailing_flags
- test_detect_shebang_missing_returns_none
- test_detect_shebang_empty_file_returns_none
- test_detect_shebang_binary_content_returns_none
- test_detect_shebang_unknown_interpreter_returns_none
- test_detect_shebang_does_not_override_extension
- test_parse_shebang_script_produces_function_nodes (end-to-end parse_file
  check: extension-less bash script is detected AND parsed into File +
  Function nodes, all tagged language="bash")

Test results
------------
Stage 1 (new targeted shebang tests):       16/16 passed.
Stage 2 (tests/test_parser.py full):        83/83 passed.
Stage 3 (tests/test_multilang.py adjacent): 151/151 passed.
Stage 4 (full suite):                       748 passed (up from 733),
  8 pre-existing Windows failures in test_incremental (3) + test_main
  async coroutine detection (1) + test_notebook Databricks (4) —
  verified identical on unchanged main.
Stage 5 (ruff check):
  - code_review_graph/parser.py: clean
  - tests/test_parser.py: 1 pre-existing F841 on line 1038
    (test_map_dispatch_qualified_reference, unrelated to this PR —
    reproducible on unchanged main at line 901).

Zero regressions. Purely additive fallback that only fires for files
with no extension.
---
 code_review_graph/parser.py | 118 ++++++++++++++++++++++++++++++-
 tests/test_parser.py        | 137 ++++++++++++++++++++++++++++++++++++
 2 files changed, 254 insertions(+), 1 deletion(-)

diff --git a/code_review_graph/parser.py b/code_review_graph/parser.py
index 31af17f7..3e055fea 100644
--- a/code_review_graph/parser.py
+++ b/code_review_graph/parser.py
@@ -119,6 +119,41 @@ class EdgeInfo:
     ".jl": "julia",
 }
 
+# Shebang interpreter → language mapping for extension-less Unix scripts.
+# Each key is the **basename** of the interpreter path as it appears after
+# ``#!`` (or after ``#!/usr/bin/env``); keys are matched case-sensitively
+# (note ``Rscript``).  Only languages already registered above are listed —
+# this table routes scripts to them, it adds no new language.  See issue #237.
+SHEBANG_INTERPRETER_TO_LANGUAGE: dict[str, str] = {
+    # POSIX / bash-compatible shells — all routed through tree-sitter-bash
+    "bash": "bash",
+    "sh": "bash",
+    "zsh": "bash",
+    "ksh": "bash",
+    "dash": "bash",
+    "ash": "bash",
+    # Python (every common variant)
+    "python": "python",
+    "python2": "python",
+    "python3": "python",
+    "pypy": "python",
+    "pypy3": "python",
+    # JavaScript via Node
+    "node": "javascript",
+    "nodejs": "javascript",
+    # Ruby / Perl / Lua / R / PHP
+    "ruby": "ruby",
+    "perl": "perl",
+    "lua": "lua",
+    "Rscript": "r",
+    "php": "php",
+}
+
+# Maximum bytes to read from the head of a file when probing for a shebang.
+# 256 is enough for any reasonable shebang line (``#!/usr/bin/env python3 -u\n``
+# is ~30 chars) while keeping the worst-case read tiny even on fat binaries.
+_SHEBANG_PROBE_BYTES = 256
+
 # Tree-sitter node type mappings per language
 # Maps (language) -> dict of semantic role -> list of TS node types
 _CLASS_TYPES: dict[str, list[str]] = {
@@ -383,7 +418,88 @@ def _get_parser(self, language: str):  # type: ignore[arg-type]
         return self._parsers[language]
 
     def detect_language(self, path: Path) -> Optional[str]:
-        return EXTENSION_TO_LANGUAGE.get(path.suffix.lower())
+        """Return the language name for ``path``, or ``None`` if unknown.
+
+        The lower-cased file extension is looked up first and always wins
+        when it is mapped.  A path with an unmapped, non-empty extension is
+        reported as unknown without touching the filesystem.  Only a path
+        with no extension at all (``bin/myapp``, ``.git/hooks/pre-commit``)
+        is opened so that its shebang line can be inspected.  See issue #237.
+        """
+        ext = path.suffix.lower()
+        by_extension = EXTENSION_TO_LANGUAGE.get(ext)
+        if by_extension is not None:
+            return by_extension
+        if ext:
+            # An extension we do not map: unknown language, no file read.
+            return None
+        # No extension: plain-text files such as "README" or "LICENSE" land
+        # here as well, but the probe is a small bounded read that yields
+        # ``None`` when the file does not start with a shebang.
+        return self._detect_language_from_shebang(path)
+
+    @staticmethod
+    def _detect_language_from_shebang(path: Path) -> Optional[str]:
+        r"""Inspect the first line of ``path`` for a shebang interpreter.
+
+        Returns the mapped language name or ``None`` if the file has no
+        shebang, is unreadable, or names an interpreter we don't map.
+
+        Accepted shapes::
+
+            #!/bin/bash
+            #!/usr/bin/env python3
+            #!/usr/bin/env -S node --experimental-vm-modules
+            #!/usr/bin/env -S PYTHONUNBUFFERED=1 python3
+            #!/usr/bin/python3.11
+            #!/usr/bin/bash -e
+
+        Only the basename of the interpreter is consulted; an exact table
+        match wins, otherwise a trailing version such as ``3.11`` is
+        dropped and the lookup retried.  Trailing flags are ignored.
+        Windows-style ``\r\n`` endings are handled because the line is
+        split on whitespace.  Binary files read as garbage bytes simply
+        fail the ``#!`` prefix check and return ``None``.
+        """
+        try:
+            with path.open("rb") as fh:
+                head = fh.read(_SHEBANG_PROBE_BYTES)
+        except OSError:  # also covers PermissionError, a subclass
+            return None
+        if not head.startswith(b"#!"):
+            return None
+
+        # Keep only the first line.  Cutting at NUL as well defends against
+        # binary content that happens to follow a ``#!`` prefix.
+        first_line = head.split(b"\n", 1)[0].split(b"\0", 1)[0]
+        try:
+            tokens = first_line[2:].decode("utf-8", errors="strict").split()
+        except UnicodeDecodeError:
+            return None
+        if not tokens:
+            return None
+
+        first = tokens[0]
+        if first.endswith("/env") or first == "env":
+            # ``env`` indirection: the interpreter is the first operand that
+            # is neither an option (``-S``, ``-i``) nor a ``NAME=value``
+            # assignment, which ``env`` consumes before the command.
+            # Options taking a separate argument (``-u NAME``) are not modelled.
+            interpreter_token = next(
+                (t for t in tokens[1:] if not t.startswith("-") and "=" not in t),
+                None,
+            )
+            if interpreter_token is None:
+                return None
+        else:
+            # Direct form: ``#!/bin/bash`` or ``#!/usr/local/bin/python3``.
+            interpreter_token = first
+        interpreter = interpreter_token.rsplit("/", 1)[-1]
+
+        # Exact name first, then with a trailing version removed so that
+        # ``python3.11`` maps like ``python`` and ``lua5.4`` like ``lua``.
+        lookup = SHEBANG_INTERPRETER_TO_LANGUAGE.get
+        return lookup(interpreter) or lookup(interpreter.rstrip("0123456789."))
 
     def parse_file(self, path: Path) -> tuple[list[NodeInfo], list[EdgeInfo]]:
         """Parse a single file and return extracted nodes and edges."""
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 1c629a5f..84ceecb8 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -21,6 +21,143 @@ def test_detect_language_typescript(self):
     def test_detect_language_unknown(self):
         assert self.parser.detect_language(Path("foo.txt")) is None
 
+    # --- Shebang detection for extension-less Unix scripts (#237) ---
+
+    def _write_shebang_file(self, tmp_path: Path, name: str, content: str) -> Path:
+        """Create ``tmp_path / name`` holding ``content`` (UTF-8) and return it."""
+        target = tmp_path.joinpath(name)
+        target.write_text(content, encoding="utf-8")
+        return target
+
+    def test_detect_shebang_bin_bash(self, tmp_path):
+        """A direct ``#!/bin/bash`` shebang selects the bash grammar."""
+        source = "#!/bin/bash\nfoo() { echo hi; }\n"
+        script = self._write_shebang_file(tmp_path, "deploy", source)
+        assert self.parser.detect_language(script) == "bash"
+
+    def test_detect_shebang_bin_sh_routed_to_bash(self, tmp_path):
+        """Scripts run by ``/bin/sh`` are handled by the bash grammar."""
+        source = "#!/bin/sh\necho hello\n"
+        hook = self._write_shebang_file(tmp_path, "install-hook", source)
+        detected = self.parser.detect_language(hook)
+        assert detected == "bash"
+
+    def test_detect_shebang_env_bash(self, tmp_path):
+        """``/usr/bin/env bash`` resolves through ``env`` to bash."""
+        source = "#!/usr/bin/env bash\nfoo() { echo hi; }\n"
+        script = self._write_shebang_file(tmp_path, "runner", source)
+        assert self.parser.detect_language(script) == "bash"
+
+    def test_detect_shebang_env_python3(self, tmp_path):
+        """``/usr/bin/env python3`` resolves through ``env`` to Python."""
+        source = "#!/usr/bin/env python3\ndef main():\n    pass\n"
+        script = self._write_shebang_file(tmp_path, "myapp", source)
+        detected = self.parser.detect_language(script)
+        assert detected == "python"
+
+    def test_detect_shebang_direct_python(self, tmp_path):
+        """An absolute ``python3`` interpreter path maps to Python."""
+        source = "#!/usr/bin/python3\nprint('hi')\n"
+        script = self._write_shebang_file(tmp_path, "tool", source)
+        assert self.parser.detect_language(script) == "python"
+
+    def test_detect_shebang_node(self, tmp_path):
+        """``/usr/bin/env node`` maps to JavaScript."""
+        source = "#!/usr/bin/env node\nconsole.log(1);\n"
+        script = self._write_shebang_file(tmp_path, "cli", source)
+        assert self.parser.detect_language(script) == "javascript"
+
+    def test_detect_shebang_env_dash_s_flag(self, tmp_path):
+        """The ``-S`` option of ``env`` is skipped to reach the interpreter."""
+        source = (
+            "#!/usr/bin/env -S node --experimental-vm-modules\n"
+            "console.log('esm');\n"
+        )
+        script = self._write_shebang_file(tmp_path, "esm-tool", source)
+        assert self.parser.detect_language(script) == "javascript"
+
+    def test_detect_shebang_ruby(self, tmp_path):
+        """``/usr/bin/env ruby`` maps to Ruby."""
+        source = "#!/usr/bin/env ruby\nputs 1\n"
+        script = self._write_shebang_file(tmp_path, "rake-task", source)
+        assert self.parser.detect_language(script) == "ruby"
+
+    def test_detect_shebang_perl(self, tmp_path):
+        """``/usr/bin/env perl`` maps to Perl."""
+        source = "#!/usr/bin/env perl\nprint 1;\n"
+        script = self._write_shebang_file(tmp_path, "cgi-script", source)
+        assert self.parser.detect_language(script) == "perl"
+
+    def test_detect_shebang_with_trailing_flags(self, tmp_path):
+        """Flags after the interpreter (``#!/bin/bash -e``) are ignored."""
+        source = "#!/bin/bash -e\nfoo() { echo hi; }\n"
+        script = self._write_shebang_file(tmp_path, "strict", source)
+        detected = self.parser.detect_language(script)
+        assert detected == "bash"
+
+    def test_detect_shebang_missing_returns_none(self, tmp_path):
+        """A suffix-less text file lacking a shebang is not guessed as bash."""
+        source = "# just a readme, no shebang\nsome content\n"
+        readme = self._write_shebang_file(tmp_path, "README", source)
+        detected = self.parser.detect_language(readme)
+        assert detected is None
+
+    def test_detect_shebang_empty_file_returns_none(self, tmp_path):
+        """A zero-byte file has no shebang, so no language is detected."""
+        empty = self._write_shebang_file(tmp_path, "EMPTY", "")
+        assert self.parser.detect_language(empty) is None
+
+    def test_detect_shebang_binary_content_returns_none(self, tmp_path):
+        """Binary bytes, with or without a ``#!`` prefix, yield None quietly."""
+        blob = tmp_path / "binary-blob"
+        for raw in (b"\x00\x01\x02\x03 garbage bytes not a shebang\n", b"#!\xff\n"):
+            blob.write_bytes(raw)
+            assert self.parser.detect_language(blob) is None
+
+    def test_detect_shebang_unknown_interpreter_returns_none(self, tmp_path):
+        """A well-formed shebang naming an unmapped interpreter gives None,
+        mirroring what happens for an unmapped file extension."""
+        source = "#!/usr/bin/env ocaml\nlet x = 1\n"
+        script = self._write_shebang_file(tmp_path, "ocaml-script", source)
+        detected = self.parser.detect_language(script)
+        assert detected is None
+
+    def test_detect_shebang_does_not_override_extension(self, tmp_path):
+        """A mapped extension decides the language even when the first line
+        is a shebang that names a different interpreter."""
+        source = "#!/bin/bash\nprint('hi')\n"
+        misleading = self._write_shebang_file(tmp_path, "script.py", source)
+        # The ``.py`` suffix must take precedence over the bash shebang.
+        detected = self.parser.detect_language(misleading)
+        assert detected == "python"
+
+    def test_parse_shebang_script_produces_function_nodes(self, tmp_path):
+        """End-to-end check: ``parse_file`` on an extension-less bash script
+        yields structural nodes, every one of them tagged as bash.
+        """
+        source = (
+            "#!/usr/bin/env bash\n"
+            "greet() {\n"
+            '    echo "hi $1"\n'
+            "}\n"
+            "main() {\n"
+            "    greet world\n"
+            "}\n"
+            "main\n"
+        )
+        script_path = self._write_shebang_file(tmp_path, "deploy", source)
+
+        nodes, _edges = self.parser.parse_file(script_path)
+
+        # Expect at least the File node plus the two functions.
+        assert len(nodes) >= 3
+        # Both shell functions must show up as Function nodes.
+        function_names = {node.name for node in nodes if node.kind == "Function"}
+        assert "greet" in function_names
+        assert "main" in function_names
+        # The shebang-derived language must be stamped on every node.
+        assert all(node.language == "bash" for node in nodes)
+
     def test_parse_python_file(self):
         nodes, edges = self.parser.parse_file(FIXTURES / "sample_python.py")