diff --git a/tests/test_help_example_flags.py b/tests/test_help_example_flags.py
new file mode 100644
index 00000000..cf26eff0
--- /dev/null
+++ b/tests/test_help_example_flags.py
@@ -0,0 +1,147 @@
+"""Guard that every ``--help`` example still parses against the live CLI tree.
+
+``test_help_examples_coverage`` proves each leaf command *has* an ``Examples``
+epilog, but not that the example commands are real. The examples are the snippets
+users copy-paste, so a flag rename (the ``login --api-key`` → ``--with-api-key``
+deprecation is the canonical case) or a removed subcommand silently rots them.
+``docs_consistency_gate.py`` keeps REFERENCE.md/README in sync with the code, but
+nothing checked the in-``--help`` examples — this closes that gap.
+
+The check is deliberately scoped to *flags and subcommand paths* (the parts that
+break on a rename), not full argument validation: examples carry placeholders
+(e.g. ``TRANSCRIPT_ID``) that aren't real paths, so parsing them with
+Click would false-positive. Pipelines (``a | assembly … | assembly …``) are split
+into per-``assembly`` segments; a segment whose ``assembly`` token is glued to
+other shell syntax (``$(assembly …``) is skipped rather than mis-parsed —
+conservative by design, since those same commands appear unglued elsewhere.
+"""
+
+from __future__ import annotations
+
+import shlex
+
+import typer
+
+from aai_cli.main import app
+from tests._cli_tree import leaf_command_items
+
+# Shell tokens that end one command and start another; an example may chain several
+# `assembly` invocations through a pipe, so each segment is validated independently.
+_BOUNDARIES = frozenset({"|", ">", ">>", "<", ";", "&&", "||", "&"})
+
+
+def _option_names(command):
+    """Every flag spelling a Click command accepts (long, short, and --no- forms)."""
+    names = {"--help"}
+    for param in command.params:
+        if param.param_type_name == "option":
+            names.update(param.opts)
+            names.update(param.secondary_opts)
+    return names
+
+
+def _example_commands(command):
+    """The ``$ …`` command lines from a leaf command's rendered examples epilog."""
+    epilog = getattr(command, "epilog", None) or ""
+    return [line.strip()[2:] for line in epilog.splitlines() if line.strip().startswith("$ ")]
+
+
+def _assembly_segments(tokens):
+    """Split a token stream into the argv of each literal ``assembly`` invocation.
+
+    Each ``assembly`` token opens a fresh segment (appended up front, then grown in
+    place), and a shell boundary closes the current one — so tokens belonging to a
+    non-``assembly`` command (``ls``, ``jq``, ``$(assembly …``) are dropped.
+    """
+    segments: list[list[str]] = []
+    current: list[str] | None = None
+    for token in tokens:
+        if token == "assembly":
+            current = []
+            segments.append(current)
+        elif token in _BOUNDARIES:
+            current = None
+        elif current is not None:
+            current.append(token)
+    return segments
+
+
+def _unknown_flags(argv, root):
+    """Flags in one ``assembly`` argv that no command at their position accepts.
+
+    Walks the tree token by token: a token matching a subcommand descends, and a
+    flag is checked against whatever command is current (so a root flag like
+    ``--sandbox`` is validated against the root, a leaf flag against the leaf).
+    A bare ``--`` ends option parsing (Click/POSIX convention), so nothing after
+    it is treated as a flag.
+    """
+    command = root
+    known = None  # option names of ``command``; computed lazily, reset on descent
+    bad = []
+    for token in argv:
+        if token == "--":
+            break  # the rest is positional data, e.g. a wrapped command's own flags
+        sub = getattr(command, "commands", None)
+        if sub and token in sub:
+            command = sub[token]
+            known = None
+            continue
+        if token.startswith("-") and token != "-":
+            flag = token.split("=", 1)[0]  # --model=x → --model
+            if known is None:
+                known = _option_names(command)
+            if flag not in known:
+                bad.append(flag)
+    return bad
+
+
+def _stale_examples(items, root):
+    """Map each command path to the (example, unknown-flags) pairs it ships, if any."""
+    stale: dict[str, list[tuple[str, list[str]]]] = {}
+    for path, command in items:
+        for example in _example_commands(command):
+            for segment in _assembly_segments(shlex.split(example)):
+                bad = _unknown_flags(segment, root)
+                if bad:
+                    stale.setdefault(" ".join(path), []).append((example, bad))
+    return stale
+
+
+def test_help_examples_reference_only_real_flags():
+    root = typer.main.get_command(app)
+    stale = _stale_examples(leaf_command_items(), root)
+    assert stale == {}, f"--help examples reference flags the CLI no longer accepts: {stale}"
+
+
+class _FakeLeaf:
+    def __init__(self, epilog):
+        self.epilog = epilog
+
+
+def test_stale_examples_detects_renamed_and_removed_flags():
+    # Drives the detection path the real examples (correctly) never trigger: a stale
+    # flag is reported under its command, and a command with no epilog contributes
+    # nothing — proving the guard would actually fail on drift, not just pass vacuously.
+    root = typer.main.get_command(app)
+    items = [
+        (("renamed",), _FakeLeaf("[bold]Examples[/bold]\n\n$ assembly transcribe x --gone-flag")),
+        (("blank",), _FakeLeaf(None)),
+    ]
+    assert _stale_examples(items, root) == {
+        "renamed": [("assembly transcribe x --gone-flag", ["--gone-flag"])]
+    }
+
+
+def test_assembly_segments_splits_pipelines_and_drops_foreign_commands():
+    # The parser splits a chained pipeline into per-`assembly` argv and drops tokens
+    # owned by a non-`assembly` command (the leading `ls`).
+    tokens = shlex.split("ls *.wav | assembly stream --from-stdin | assembly llm -f")
+    assert _assembly_segments(tokens) == [["stream", "--from-stdin"], ["llm", "-f"]]
+
+
+def test_unknown_flags_stops_checking_after_double_dash():
+    # Everything after a bare `--` is a positional argument by Click/POSIX convention,
+    # so a dash-prefixed token there is data for the wrapped command, not a stale flag.
+    root = typer.main.get_command(app)
+    assert _unknown_flags(["x", "--", "--gone-flag"], root) == []
+    assert _unknown_flags(["x", "--gone-flag", "--", "--also-gone"], root) == ["--gone-flag"]
diff --git a/tests/test_json_stdout_purity.py b/tests/test_json_stdout_purity.py
new file mode 100644
index 00000000..a4d727a7
--- /dev/null
+++ b/tests/test_json_stdout_purity.py
@@ -0,0 +1,68 @@
+"""Stream-discipline sweep over every leaf command.
+
+The repo-wide invariant is *"Errors → stderr, data → stdout"* (root ``AGENTS.md``)
+— it's what keeps ``assembly … --json | next-tool`` pipeline-safe. Individual
+commands assert their own happy-path JSON shape, but nothing swept the contract
+across *all* of them, so a command that leaked a human-readable line onto stdout
+in ``--json`` mode (or printed an error payload to stdout) would pass the whole
+gate. This walks the live Typer tree and pins the contract for every leaf.
+
+The trigger is an **unknown flag**: it fails during Click's argument parsing,
+before the command body, before any credential resolution or network — so it is
+the one error path that is uniform across all commands *and* deterministic under
+the suite's ``--disable-socket`` (no command-specific required-arg knowledge, and
+no risk of an interactive command like ``login``/``stream`` blocking on a browser
+or a mic). Click 8.2+ keeps ``result.stdout`` and ``result.stderr`` as separate
+streams on the ``CliRunner`` ``Result``, so the split is observable directly.
+"""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+from typer.testing import CliRunner
+
+from aai_cli.main import app
+from tests._cli_tree import leaf_command_argvs
+
+runner = CliRunner()
+
+# A flag no command defines, so it always trips Click's "No such option" parse
+# error rather than reaching a command body.
+UNKNOWN_FLAG = "--__definitely-not-a-real-flag__"
+
+
+def _json_lines(stream: str) -> list[object]:
+    """Parse a stream as NDJSON, asserting every line is valid JSON.
+
+    A human-prose leak (a Rich-rendered error, a bare status line) is exactly a
+    line that fails ``json.loads`` — so this raises rather than silently skipping.
+    """
+    return [json.loads(line) for line in stream.splitlines()]
+
+
+@pytest.mark.parametrize("path", leaf_command_argvs(), ids=lambda p: " ".join(p))
+def test_json_mode_keeps_stdout_clean_and_error_on_stderr(path: list[str]) -> None:
+    result = runner.invoke(app, [*path, UNKNOWN_FLAG, "--json"])
+
+    # Usage/parse error: the stable exit code for a bad flag (REFERENCE.md table).
+    assert result.exit_code == 2
+    # The whole point of the pipeline contract: a parse error puts *nothing* on
+    # stdout, so a downstream consumer never sees a partial/garbage record.
+    assert result.stdout == ""
+    # The error rides stderr as the uniform JSON envelope — machine-readable, and
+    # every emitted line parses (a human-prose leak onto stderr fails here too).
+    objs = _json_lines(result.stderr)
+    assert {"error": {"type": "usage_error", "message": "No such option: " + UNKNOWN_FLAG}} in objs
+
+
+@pytest.mark.parametrize("path", leaf_command_argvs(), ids=lambda p: " ".join(p))
+def test_human_mode_routes_errors_to_stderr(path: list[str]) -> None:
+    # The same contract without --json: a human error still belongs on stderr, never
+    # stdout, so `assembly … -o text > out` keeps the error out of the data file.
+    result = runner.invoke(app, [*path, UNKNOWN_FLAG])
+
+    assert result.exit_code == 2
+    assert result.stdout == ""
+    assert "No such option" in result.stderr