Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 117 additions & 1 deletion code_review_graph/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,41 @@ class EdgeInfo:
".jl": "julia",
}

# Shebang interpreter → language mapping for extension-less Unix scripts.
# Each key is the **basename** of the interpreter path as it appears after
# ``#!`` (or after ``#!/usr/bin/env``). Only languages already registered
# above are listed — this file strictly routes extension-less scripts, it
# does NOT introduce new languages on its own. See issue #237.
SHEBANG_INTERPRETER_TO_LANGUAGE: dict[str, str] = {
# POSIX / bash-compatible shells — all routed through tree-sitter-bash
"bash": "bash",
"sh": "bash",
"zsh": "bash",
"ksh": "bash",
"dash": "bash",
"ash": "bash",
# Python (every common variant)
"python": "python",
"python2": "python",
"python3": "python",
"pypy": "python",
"pypy3": "python",
# JavaScript via Node
"node": "javascript",
"nodejs": "javascript",
# Ruby / Perl / Lua / R / PHP
"ruby": "ruby",
"perl": "perl",
"lua": "lua",
"Rscript": "r",
"php": "php",
}

# Maximum bytes to read from the head of a file when probing for a shebang.
# 256 is enough for any reasonable shebang line (``#!/usr/bin/env python3 -u\n``
# is ~30 chars) while keeping the worst-case read tiny even on fat binaries.
_SHEBANG_PROBE_BYTES = 256

# Tree-sitter node type mappings per language
# Maps (language) -> dict of semantic role -> list of TS node types
_CLASS_TYPES: dict[str, list[str]] = {
Expand Down Expand Up @@ -383,7 +418,88 @@ def _get_parser(self, language: str): # type: ignore[arg-type]
return self._parsers[language]

def detect_language(self, path: Path) -> Optional[str]:
    """Map a file path to its language name.

    The lower-cased suffix is looked up in ``EXTENSION_TO_LANGUAGE`` first.
    Only when that lookup misses **and** the path has no suffix at all
    (typical for Unix scripts such as ``bin/myapp`` or
    ``.git/hooks/pre-commit``) is the file opened and its first line
    inspected for a shebang. Files with any suffix are never read here.
    See issue #237.

    Args:
        path: Path of the file whose language should be determined.

    Returns:
        The language name, or ``None`` when neither the extension nor a
        shebang identifies a known language.
    """
    # NOTE: the previous one-line body (an unconditional
    # ``return EXTENSION_TO_LANGUAGE.get(...)``) must not precede this
    # logic, otherwise the shebang fallback below is unreachable.
    suffix = path.suffix.lower()
    lang = EXTENSION_TO_LANGUAGE.get(suffix)
    if lang is not None:
        return lang
    # Extension-less text files such as "README" or "LICENSE" also reach
    # this branch; the probe is a small bounded read that yields None when
    # the file does not start with a shebang.
    if suffix == "":
        return self._detect_language_from_shebang(path)
    return None

@staticmethod
def _detect_language_from_shebang(path: Path) -> Optional[str]:
    """Inspect the first line of ``path`` for a shebang interpreter.

    Accepted shapes::

        #!/bin/bash
        #!/usr/bin/env python3
        #!/usr/bin/env python3.11
        #!/usr/bin/env -S node --experimental-vm-modules
        #!/usr/bin/env -S PYTHONUNBUFFERED=1 python3
        #!/usr/bin/bash -e

    Only the basename of the interpreter is consulted and flags that
    follow it are ignored. An interpreter that is not in
    ``SHEBANG_INTERPRETER_TO_LANGUAGE`` under its exact name is retried
    with a trailing version suffix (digits and dots) removed, so
    ``python3.11`` and ``lua5.4`` resolve like ``python`` and ``lua``.
    CRLF line endings are tolerated. Binary content fails either the
    ``#!`` prefix check or UTF-8 decoding and yields ``None``.

    Args:
        path: File to probe. At most ``_SHEBANG_PROBE_BYTES`` bytes are read.

    Returns:
        The mapped language name, or ``None`` if the file cannot be read,
        has no shebang, or names an interpreter that is not mapped.
    """
    try:
        with path.open("rb") as fh:
            head = fh.read(_SHEBANG_PROBE_BYTES)
    except OSError:
        # OSError already covers PermissionError, FileNotFoundError and
        # IsADirectoryError; an unreadable path simply has no language.
        return None
    if not head.startswith(b"#!"):
        return None

    # Keep only the first line, and cut at the first NUL so that binary
    # data following a coincidental "#!" prefix is never tokenised.
    first_line = head.split(b"\n", 1)[0].split(b"\0", 1)[0]
    try:
        # str.split() with no separator also discards the trailing "\r" of
        # a CRLF line and any blanks between "#!" and the command.
        tokens = first_line[2:].decode("utf-8", errors="strict").split()
    except UnicodeDecodeError:
        return None
    if not tokens:
        return None

    command = tokens[0]
    if command == "env" or command.endswith("/env"):
        # ``env`` indirection: the interpreter is the first operand that
        # is neither an option nor a NAME=VALUE environment assignment.
        options_with_argument = ("-u", "-C", "-P")
        interpreter_token: Optional[str] = None
        skip_next = False
        for tok in tokens[1:]:
            if skip_next:
                # Argument of the preceding option, not the interpreter.
                skip_next = False
                continue
            if tok.startswith("-"):
                # ``-S`` and most other options stand alone; a few
                # consume the following token as their argument.
                skip_next = tok in options_with_argument
                continue
            if "=" in tok:
                # env treats any operand containing "=" as an assignment.
                continue
            interpreter_token = tok
            break
        if interpreter_token is None:
            return None
    else:
        # Direct form: ``#!/bin/bash`` or ``#!/usr/local/bin/python3``.
        interpreter_token = command

    interpreter = interpreter_token.rsplit("/", 1)[-1]
    language = SHEBANG_INTERPRETER_TO_LANGUAGE.get(interpreter)
    if language is None:
        # Fall back to the unversioned name: "python3.11" -> "python".
        unversioned = interpreter.rstrip("0123456789.")
        language = SHEBANG_INTERPRETER_TO_LANGUAGE.get(unversioned)
    return language

def parse_file(self, path: Path) -> tuple[list[NodeInfo], list[EdgeInfo]]:
"""Parse a single file and return extracted nodes and edges."""
Expand Down
137 changes: 137 additions & 0 deletions tests/test_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,143 @@ def test_detect_language_typescript(self):
def test_detect_language_unknown(self):
    """A path whose extension is not registered has no detected language."""
    detected = self.parser.detect_language(Path("foo.txt"))
    assert detected is None

# --- Shebang detection for extension-less Unix scripts (#237) ---

def _write_shebang_file(self, tmp_path: Path, name: str, content: str) -> Path:
"""Helper: write an extension-less file with ``content`` and return its path."""
p = tmp_path / name
p.write_text(content, encoding="utf-8")
return p

def test_detect_shebang_bin_bash(self, tmp_path):
    """A direct ``#!/bin/bash`` shebang is detected as bash."""
    script = self._write_shebang_file(
        tmp_path,
        "deploy",
        "#!/bin/bash\nfoo() { echo hi; }\n",
    )
    detected = self.parser.detect_language(script)
    assert detected == "bash"

def test_detect_shebang_bin_sh_routed_to_bash(self, tmp_path):
    """Scripts that name ``/bin/sh`` are routed to the bash grammar."""
    script = self._write_shebang_file(
        tmp_path,
        "install-hook",
        "#!/bin/sh\necho hello\n",
    )
    detected = self.parser.detect_language(script)
    assert detected == "bash"

def test_detect_shebang_env_bash(self, tmp_path):
    """``#!/usr/bin/env bash`` resolves through env to bash."""
    script = self._write_shebang_file(
        tmp_path,
        "runner",
        "#!/usr/bin/env bash\nfoo() { echo hi; }\n",
    )
    detected = self.parser.detect_language(script)
    assert detected == "bash"

def test_detect_shebang_env_python3(self, tmp_path):
    """``#!/usr/bin/env python3`` resolves through env to python."""
    body = "#!/usr/bin/env python3\ndef main():\n pass\n"
    script = self._write_shebang_file(tmp_path, "myapp", body)
    detected = self.parser.detect_language(script)
    assert detected == "python"

def test_detect_shebang_direct_python(self, tmp_path):
    """An absolute interpreter path ``/usr/bin/python3`` maps to python."""
    script = self._write_shebang_file(
        tmp_path,
        "tool",
        "#!/usr/bin/python3\nprint('hi')\n",
    )
    detected = self.parser.detect_language(script)
    assert detected == "python"

def test_detect_shebang_node(self, tmp_path):
    """``#!/usr/bin/env node`` maps to javascript."""
    script = self._write_shebang_file(
        tmp_path,
        "cli",
        "#!/usr/bin/env node\nconsole.log(1);\n",
    )
    detected = self.parser.detect_language(script)
    assert detected == "javascript"

def test_detect_shebang_env_dash_s_flag(self, tmp_path):
    """With ``env -S``, the option is skipped and the interpreter after it is used."""
    body = (
        "#!/usr/bin/env -S node --experimental-vm-modules\n"
        "console.log('esm');\n"
    )
    script = self._write_shebang_file(tmp_path, "esm-tool", body)
    detected = self.parser.detect_language(script)
    assert detected == "javascript"

def test_detect_shebang_ruby(self, tmp_path):
    """``#!/usr/bin/env ruby`` maps to ruby."""
    script = self._write_shebang_file(
        tmp_path,
        "rake-task",
        "#!/usr/bin/env ruby\nputs 1\n",
    )
    detected = self.parser.detect_language(script)
    assert detected == "ruby"

def test_detect_shebang_perl(self, tmp_path):
    """``#!/usr/bin/env perl`` maps to perl."""
    script = self._write_shebang_file(
        tmp_path,
        "cgi-script",
        "#!/usr/bin/env perl\nprint 1;\n",
    )
    detected = self.parser.detect_language(script)
    assert detected == "perl"

def test_detect_shebang_with_trailing_flags(self, tmp_path):
    """Flags after the interpreter (``#!/bin/bash -e``) do not affect detection."""
    script = self._write_shebang_file(
        tmp_path,
        "strict",
        "#!/bin/bash -e\nfoo() { echo hi; }\n",
    )
    detected = self.parser.detect_language(script)
    assert detected == "bash"

def test_detect_shebang_missing_returns_none(self, tmp_path):
    """An extension-less text file with no shebang yields None rather than bash."""
    readme = self._write_shebang_file(
        tmp_path,
        "README",
        "# just a readme, no shebang\nsome content\n",
    )
    detected = self.parser.detect_language(readme)
    assert detected is None

def test_detect_shebang_empty_file_returns_none(self, tmp_path):
    """A zero-byte extension-less file has no detectable language."""
    empty_file = tmp_path / "EMPTY"
    empty_file.write_bytes(b"")
    detected = self.parser.detect_language(empty_file)
    assert detected is None

def test_detect_shebang_binary_content_returns_none(self, tmp_path):
    """Binary content that does not begin with ``#!`` must yield None
    without raising."""
    blob = tmp_path / "binary-blob"
    blob.write_bytes(b"\x00\x01\x02\x03 garbage bytes not a shebang\n")
    detected = self.parser.detect_language(blob)
    assert detected is None

def test_detect_shebang_unknown_interpreter_returns_none(self, tmp_path):
    """A well-formed shebang naming an interpreter that is not routed is
    reported as an unknown language, just like an unmapped extension."""
    script = self._write_shebang_file(
        tmp_path,
        "ocaml-script",
        "#!/usr/bin/env ocaml\nlet x = 1\n",
    )
    detected = self.parser.detect_language(script)
    assert detected is None

def test_detect_shebang_does_not_override_extension(self, tmp_path):
    """A known extension takes precedence over a contradicting shebang
    on the first line of the file."""
    py_file = tmp_path / "script.py"
    py_file.write_text("#!/bin/bash\nprint('hi')\n", encoding="utf-8")
    # The ``.py`` suffix decides the language; the bash shebang inside
    # the file must not influence the result.
    detected = self.parser.detect_language(py_file)
    assert detected == "python"

def test_parse_shebang_script_produces_function_nodes(self, tmp_path):
    """End-to-end check: ``parse_file`` on an extension-less bash script
    yields Function nodes for its functions, all tagged as bash.
    """
    source_lines = (
        "#!/usr/bin/env bash\n"
        "greet() {\n"
        ' echo "hi $1"\n'
        "}\n"
        "main() {\n"
        " greet world\n"
        "}\n"
        "main\n"
    )
    script_path = self._write_shebang_file(tmp_path, "deploy", source_lines)

    nodes, _edges = self.parser.parse_file(script_path)

    # Expect at minimum the File node and the two function nodes.
    assert len(nodes) >= 3
    function_names = {node.name for node in nodes if node.kind == "Function"}
    assert "greet" in function_names
    assert "main" in function_names
    # Every node produced from this file must carry the bash language tag.
    assert all(node.language == "bash" for node in nodes)

def test_parse_python_file(self):
nodes, edges = self.parser.parse_file(FIXTURES / "sample_python.py")

Expand Down
Loading