diff --git a/code_review_graph/parser.py b/code_review_graph/parser.py
index 31af17f..3e055fe 100644
--- a/code_review_graph/parser.py
+++ b/code_review_graph/parser.py
@@ -119,6 +119,41 @@ class EdgeInfo:
     ".jl": "julia",
 }
 
+# Shebang interpreter → language mapping for extension-less Unix scripts.
+# Each key is the **basename** of the interpreter path as it appears after
+# ``#!`` (or after ``#!/usr/bin/env``). Only languages already registered
+# above are listed — this table strictly routes extension-less scripts, it
+# does NOT introduce new languages on its own. See issue #237.
+SHEBANG_INTERPRETER_TO_LANGUAGE: dict[str, str] = {
+    # POSIX / bash-compatible shells — all routed through tree-sitter-bash
+    "bash": "bash",
+    "sh": "bash",
+    "zsh": "bash",
+    "ksh": "bash",
+    "dash": "bash",
+    "ash": "bash",
+    # Python (every common variant)
+    "python": "python",
+    "python2": "python",
+    "python3": "python",
+    "pypy": "python",
+    "pypy3": "python",
+    # JavaScript via Node
+    "node": "javascript",
+    "nodejs": "javascript",
+    # Ruby / Perl / Lua / R / PHP
+    "ruby": "ruby",
+    "perl": "perl",
+    "lua": "lua",
+    "Rscript": "r",
+    "php": "php",
+}
+
+# Maximum bytes to read from the head of a file when probing for a shebang.
+# 256 is enough for any reasonable shebang line (``#!/usr/bin/env python3 -u\n``
+# is ~30 chars) while keeping the worst-case read tiny even on fat binaries.
+_SHEBANG_PROBE_BYTES = 256
+
 # Tree-sitter node type mappings per language
 # Maps (language) -> dict of semantic role -> list of TS node types
 _CLASS_TYPES: dict[str, list[str]] = {
@@ -383,7 +418,91 @@ def _get_parser(self, language: str): # type: ignore[arg-type]
         return self._parsers[language]
 
     def detect_language(self, path: Path) -> Optional[str]:
-        return EXTENSION_TO_LANGUAGE.get(path.suffix.lower())
+        """Map a file path to its language name.
+
+        Extension-based lookup is tried first. For extension-less files
+        (typical for Unix scripts like ``bin/myapp`` or ``.git/hooks/pre-commit``)
+        we fall back to reading the first line for a shebang. Files that
+        already have a known extension are never re-read — shebang probing
+        only runs when the extension lookup returns ``None`` **and** the path
+        has no suffix at all. See issue #237.
+        """
+        suffix = path.suffix.lower()
+        lang = EXTENSION_TO_LANGUAGE.get(suffix)
+        if lang is not None:
+            return lang
+        # Only probe shebang for files without any extension — "README", "LICENSE",
+        # and other extension-less text files also fall here, but the probe is a
+        # cheap 256-byte read that returns None when no shebang is found.
+        if suffix == "":
+            return self._detect_language_from_shebang(path)
+        return None
+
+    @staticmethod
+    def _detect_language_from_shebang(path: Path) -> Optional[str]:
+        """Inspect the first line of ``path`` for a shebang interpreter.
+
+        Returns the mapped language name or ``None`` if the file has no
+        shebang, is unreadable, or names an interpreter we don't map.
+
+        Accepted shapes::
+
+            #!/bin/bash
+            #!/usr/bin/env python3
+            #!/usr/bin/env -S node --experimental-vm-modules
+            #!/usr/bin/env -S NODE_ENV=test node
+            #!/usr/bin/python3.11
+            #!/usr/bin/bash -e
+
+        Only the basename of the interpreter is consulted; a trailing
+        version suffix (``python3.11``, ``lua5.4``) is dropped when the exact
+        name is not mapped. Trailing flags after the interpreter are
+        ignored. Windows-style CRLF line endings are handled. Binary files
+        read as garbage bytes simply fail the ``#!`` prefix check and
+        return ``None``.
+        """
+        try:
+            with path.open("rb") as fh:
+                head = fh.read(_SHEBANG_PROBE_BYTES)
+        except OSError:
+            # Covers missing files, directories and permission errors alike
+            # (``PermissionError`` is an ``OSError`` subclass).
+            return None
+        if not head.startswith(b"#!"):
+            return None
+
+        # Work on the first line only. Cutting at the first NUL as well
+        # defends against accidental binary content following a ``#!``
+        # prefix.
+        first_line = head.split(b"\n", 1)[0].split(b"\0", 1)[0]
+        try:
+            tokens = first_line[2:].decode("utf-8").split()
+        except UnicodeDecodeError:
+            return None
+        if not tokens:
+            return None
+
+        first = tokens[0]
+        # `/usr/bin/env` indirection: the interpreter is the first later
+        # token that is neither an option (``-S``) nor a ``NAME=value``
+        # environment assignment.
+        if first.endswith("/env") or first == "env":
+            command = next(
+                (t for t in tokens[1:] if not t.startswith("-") and "=" not in t),
+                None,
+            )
+            if command is None:
+                return None
+        else:
+            # Direct form: ``#!/bin/bash`` or ``#!/usr/local/bin/python3``.
+            command = first
+
+        interpreter = command.rsplit("/", 1)[-1]
+        lang = SHEBANG_INTERPRETER_TO_LANGUAGE.get(interpreter)
+        if lang is None:
+            # Versioned binaries: ``python3.11`` → ``python``, ``lua5.4`` → ``lua``.
+            lang = SHEBANG_INTERPRETER_TO_LANGUAGE.get(interpreter.rstrip("0123456789."))
+        return lang
 
     def parse_file(self, path: Path) -> tuple[list[NodeInfo], list[EdgeInfo]]:
         """Parse a single file and return extracted nodes and edges."""
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 1c629a5..84ceecb 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -21,5 +21,157 @@ def test_detect_language_typescript(self):
     def test_detect_language_unknown(self):
         assert self.parser.detect_language(Path("foo.txt")) is None
 
+    # --- Shebang detection for extension-less Unix scripts (#237) ---
+
+    def _write_shebang_file(self, tmp_path: Path, name: str, content: str) -> Path:
+        """Helper: write an extension-less file with ``content`` and return its path."""
+        p = tmp_path / name
+        p.write_text(content, encoding="utf-8")
+        return p
+
+    def test_detect_shebang_bin_bash(self, tmp_path):
+        p = self._write_shebang_file(
+            tmp_path, "deploy", "#!/bin/bash\nfoo() { echo hi; }\n",
+        )
+        assert self.parser.detect_language(p) == "bash"
+
+    def test_detect_shebang_bin_sh_routed_to_bash(self, tmp_path):
+        """/bin/sh scripts are parsed through the bash grammar."""
+        p = self._write_shebang_file(
+            tmp_path, "install-hook", "#!/bin/sh\necho hello\n",
+        )
+        assert self.parser.detect_language(p) == "bash"
+
+    def test_detect_shebang_env_bash(self, tmp_path):
+        p = self._write_shebang_file(
+            tmp_path, "runner", "#!/usr/bin/env bash\nfoo() { echo hi; }\n",
+        )
+        assert self.parser.detect_language(p) == "bash"
+
+    def test_detect_shebang_env_python3(self, tmp_path):
+        p = self._write_shebang_file(
+            tmp_path, "myapp",
+            "#!/usr/bin/env python3\ndef main():\n pass\n",
+        )
+        assert self.parser.detect_language(p) == "python"
+
+    def test_detect_shebang_direct_python(self, tmp_path):
+        p = self._write_shebang_file(
+            tmp_path, "tool", "#!/usr/bin/python3\nprint('hi')\n",
+        )
+        assert self.parser.detect_language(p) == "python"
+
+    def test_detect_shebang_node(self, tmp_path):
+        p = self._write_shebang_file(
+            tmp_path, "cli", "#!/usr/bin/env node\nconsole.log(1);\n",
+        )
+        assert self.parser.detect_language(p) == "javascript"
+
+    def test_detect_shebang_env_dash_s_flag(self, tmp_path):
+        """``#!/usr/bin/env -S node --flag`` (Linux -S) resolves to the interpreter."""
+        p = self._write_shebang_file(
+            tmp_path, "esm-tool",
+            "#!/usr/bin/env -S node --experimental-vm-modules\n"
+            "console.log('esm');\n",
+        )
+        assert self.parser.detect_language(p) == "javascript"
+
+    def test_detect_shebang_ruby(self, tmp_path):
+        p = self._write_shebang_file(
+            tmp_path, "rake-task", "#!/usr/bin/env ruby\nputs 1\n",
+        )
+        assert self.parser.detect_language(p) == "ruby"
+
+    def test_detect_shebang_perl(self, tmp_path):
+        p = self._write_shebang_file(
+            tmp_path, "cgi-script", "#!/usr/bin/env perl\nprint 1;\n",
+        )
+        assert self.parser.detect_language(p) == "perl"
+
+    def test_detect_shebang_with_trailing_flags(self, tmp_path):
+        """``#!/bin/bash -e`` still maps to bash (flags ignored)."""
+        p = self._write_shebang_file(
+            tmp_path, "strict", "#!/bin/bash -e\nfoo() { echo hi; }\n",
+        )
+        assert self.parser.detect_language(p) == "bash"
+
+    def test_detect_shebang_versioned_interpreter(self, tmp_path):
+        """``python3.11`` style versioned binaries map like the bare name."""
+        p = self._write_shebang_file(
+            tmp_path, "pinned", "#!/usr/bin/python3.11\nprint('hi')\n",
+        )
+        assert self.parser.detect_language(p) == "python"
+
+    def test_detect_shebang_env_skips_assignments(self, tmp_path):
+        """``env -S NAME=value node`` resolves to the real interpreter."""
+        p = self._write_shebang_file(
+            tmp_path, "with-env",
+            "#!/usr/bin/env -S NODE_ENV=test node\nconsole.log(1);\n",
+        )
+        assert self.parser.detect_language(p) == "javascript"
+
+    def test_detect_shebang_missing_returns_none(self, tmp_path):
+        """Extension-less text files without a shebang return None, not bash."""
+        p = self._write_shebang_file(
+            tmp_path, "README", "# just a readme, no shebang\nsome content\n",
+        )
+        assert self.parser.detect_language(p) is None
+
+    def test_detect_shebang_empty_file_returns_none(self, tmp_path):
+        p = tmp_path / "EMPTY"
+        p.write_bytes(b"")
+        assert self.parser.detect_language(p) is None
+
+    def test_detect_shebang_binary_content_returns_none(self, tmp_path):
+        """A garbage-byte first line that happens not to start with ``#!``
+        must not raise and must return None."""
+        p = tmp_path / "binary-blob"
+        p.write_bytes(b"\x00\x01\x02\x03 garbage bytes not a shebang\n")
+        assert self.parser.detect_language(p) is None
+
+    def test_detect_shebang_unknown_interpreter_returns_none(self, tmp_path):
+        """A valid shebang to an interpreter we don't route is treated as
+        'unknown language' — same as an unmapped extension."""
+        p = self._write_shebang_file(
+            tmp_path, "ocaml-script", "#!/usr/bin/env ocaml\nlet x = 1\n",
+        )
+        assert self.parser.detect_language(p) is None
+
+    def test_detect_shebang_does_not_override_extension(self, tmp_path):
+        """A file with a known extension must still use extension-based
+        detection, even if its first line is a misleading shebang."""
+        p = tmp_path / "script.py"
+        p.write_text("#!/bin/bash\nprint('hi')\n", encoding="utf-8")
+        # .py wins over the bash shebang — non-intuitive-looking content
+        # in a .py file must not fool the detector.
+        assert self.parser.detect_language(p) == "python"
+
+    def test_parse_shebang_script_produces_function_nodes(self, tmp_path):
+        """End-to-end regression: an extension-less bash script is not only
+        detected but also fully parsed into structural nodes via parse_file.
+        """
+        script = (
+            "#!/usr/bin/env bash\n"
+            "greet() {\n"
+            ' echo "hi $1"\n'
+            "}\n"
+            "main() {\n"
+            " greet world\n"
+            "}\n"
+            "main\n"
+        )
+        p = self._write_shebang_file(tmp_path, "deploy", script)
+
+        nodes, edges = self.parser.parse_file(p)
+
+        # We at least got the File node plus both functions.
+        assert len(nodes) >= 3
+        funcs = [n for n in nodes if n.kind == "Function"]
+        func_names = {f.name for f in funcs}
+        assert "greet" in func_names
+        assert "main" in func_names
+        for n in nodes:
+            assert n.language == "bash"
+
     def test_parse_python_file(self):
         nodes, edges = self.parser.parse_file(FIXTURES / "sample_python.py")