diff --git a/tests/test_help_example_flags.py b/tests/test_help_example_flags.py
new file mode 100644
index 00000000..cf26eff0
--- /dev/null
+++ b/tests/test_help_example_flags.py
@@ -0,0 +1,131 @@
+"""Guard that every ``--help`` example still parses against the live CLI tree.
+
+``test_help_examples_coverage`` proves each leaf command *has* an ``Examples``
+epilog, but not that the example commands are real. The examples are the snippets
+users copy-paste, so a flag rename (the ``login --api-key`` → ``--with-api-key``
+deprecation is the canonical case) or a removed subcommand silently rots them.
+``docs_consistency_gate.py`` keeps REFERENCE.md/README in sync with the code, but
+nothing checked the in-``--help`` examples — this closes that gap.
+
+The check is deliberately scoped to *flags and subcommand paths* (the parts that
+break on a rename), not full argument validation: examples carry placeholders
+(``<file>``, ``TRANSCRIPT_ID``) that aren't real paths, so parsing them with
+Click would false-positive. Pipelines (``a | assembly … | assembly …``) are split
+into per-``assembly`` segments; an ``assembly`` token glued to other shell syntax
+(``$(assembly …``) does not open a segment, so it is never parsed on its own —
+conservative by design, since those same commands appear unglued elsewhere.
+"""
+
+from __future__ import annotations
+
+import shlex
+
+import typer
+
+from aai_cli.main import app
+from tests._cli_tree import leaf_command_items
+
+# Shell operator tokens (pipes, redirects, separators) that close the current argv; an
+# example may chain several `assembly` invocations, so each is validated independently.
+_BOUNDARIES = frozenset({"|", ">", ">>", "<", ";", "&&", "||", "&"})
+
+
+def _option_names(command):
+    """Return the set of flag spellings ``command`` accepts, always including ``--help``."""
+    names = {"--help"}
+    for param in command.params:
+        if param.param_type_name == "option":  # ignore non-option params
+            names.update(param.opts)
+            names.update(param.secondary_opts)  # e.g. the ``--no-`` half of a flag pair
+    return names
+
+
+def _example_commands(command):
+    """Return each epilog line starting with ``$ `` (after stripping), minus that prompt."""
+    epilog = getattr(command, "epilog", None) or ""  # missing or ``None`` epilog → no lines
+    return [line.strip()[2:] for line in epilog.splitlines() if line.strip().startswith("$ ")]
+
+
+def _assembly_segments(tokens):
+    """Split a token stream into the argv of each literal ``assembly`` invocation.
+
+    A token exactly equal to ``assembly`` opens a new segment (that token itself is
+    not stored); a token in ``_BOUNDARIES`` closes it. Tokens seen while no segment is
+    open (``ls``, ``jq``, a leading ``$(assembly …``) are dropped.
+    """
+    segments: list[list[str]] = []
+    current: list[str] | None = None
+    for token in tokens:
+        if token == "assembly":
+            current = []
+            segments.append(current)  # appended empty; later tokens grow it in place
+        elif token in _BOUNDARIES:
+            current = None  # ignore tokens until the next ``assembly``
+        elif current is not None:
+            current.append(token)
+    return segments
+
+
+def _unknown_flags(argv, root):
+    """List the flags in one ``assembly`` argv that their current command does not accept.
+
+    Walks token by token from ``root``: a token naming a subcommand of the current
+    command descends into it; any other token starting with ``-`` (except bare ``-`` and
+    ``--``) is cut at the first ``=`` and looked up on the current command.
+    """
+    command = root
+    bad = []
+    for token in argv:
+        sub = getattr(command, "commands", None)  # ``None`` when the attribute is absent
+        if sub and token in sub:
+            command = sub[token]
+            continue
+        if token.startswith("-") and token not in ("-", "--"):
+            flag = token.split("=", 1)[0]  # --model=x → --model
+            if flag not in _option_names(command):
+                bad.append(flag)
+    return bad
+
+
+def _stale_examples(items, root):
+    """Map each space-joined command path to its (example, unknown-flags) pairs, if any."""
+    stale: dict[str, list[tuple[str, list[str]]]] = {}
+    for path, command in items:
+        for example in _example_commands(command):
+            for segment in _assembly_segments(shlex.split(example)):  # shell-style tokens
+                bad = _unknown_flags(segment, root)
+                if bad:  # one entry per offending segment, so an example may repeat
+                    stale.setdefault(" ".join(path), []).append((example, bad))
+    return stale
+
+
+def test_help_examples_reference_only_real_flags():
+    root = typer.main.get_command(app)  # the Click command tree behind the Typer app
+    stale = _stale_examples(leaf_command_items(), root)
+    assert stale == {}, f"--help examples reference flags the CLI no longer accepts: {stale}"
+
+
+class _FakeLeaf:  # test double exposing only the ``epilog`` that ``_example_commands`` reads
+    def __init__(self, epilog):
+        self.epilog = epilog
+
+
+def test_stale_examples_detects_renamed_and_removed_flags():
+    # Exercises the failure path that the real examples never hit: an unknown flag
+    # (``--gone-flag``) is reported under its item's path, and an item whose epilog is
+    # ``None`` adds no entry — so the guard demonstrably fails on drift, not vacuously.
+    root = typer.main.get_command(app)
+    items = [
+        (("renamed",), _FakeLeaf("[bold]Examples[/bold]\n\n$ assembly transcribe x --gone-flag")),
+        (("blank",), _FakeLeaf(None)),
+    ]
+    assert _stale_examples(items, root) == {
+        "renamed": [("assembly transcribe x --gone-flag", ["--gone-flag"])]
+    }
+
+
+def test_assembly_segments_splits_pipelines_and_drops_foreign_commands():
+    # Two piped `assembly` invocations yield two argv lists (without the `assembly` token);
+    # the leading `ls *.wav`, seen before any segment opens, contributes nothing.
+    tokens = shlex.split("ls *.wav | assembly stream --from-stdin | assembly llm -f")
+    assert _assembly_segments(tokens) == [["stream", "--from-stdin"], ["llm", "-f"]]
diff --git a/tests/test_json_stdout_purity.py b/tests/test_json_stdout_purity.py
new file mode 100644
index 00000000..a4d727a7
--- /dev/null
+++ b/tests/test_json_stdout_purity.py
@@ -0,0 +1,68 @@
+"""Stream-discipline sweep over every leaf command.
+
+The repo-wide invariant is *"Errors → stderr, data → stdout"* (root ``AGENTS.md``)
+— it's what keeps ``assembly … --json | next-tool`` pipeline-safe. Individual
+commands assert their own happy-path JSON shape, but nothing swept the contract
+across *all* of them, so a command that leaked a human-readable line onto stdout
+in ``--json`` mode (or printed an error payload to stdout) would pass the whole
+gate. This walks the live Typer tree and pins the contract for every leaf.
+
+The trigger is an **unknown flag**: it fails during Click's argument parsing,
+before the command body, before any credential resolution or network — so it is
+the one error path that is uniform across all commands *and* deterministic under
+the suite's ``--disable-socket`` (no command-specific required-arg knowledge, and
+no risk of an interactive command like ``login``/``stream`` blocking on a browser
+or a mic). Click 8.2+ keeps ``result.stdout`` and ``result.stderr`` as separate
+streams on the ``CliRunner`` ``Result``, so the split is observable directly.
+"""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+from typer.testing import CliRunner
+
+from aai_cli.main import app
+from tests._cli_tree import leaf_command_argvs
+
+runner = CliRunner()  # one runner shared by every parametrized case in this module
+
+# A flag spelling that no command is expected to define, so parsing fails with
+# Click's "No such option" error before any command body can run.
+UNKNOWN_FLAG = "--__definitely-not-a-real-flag__"
+
+
+def _json_lines(stream: str) -> list[object]:
+    """Decode every line of ``stream`` with ``json.loads`` and return the values in order.
+
+    Nothing is skipped: a line that is not valid JSON — human prose, or a blank line —
+    raises instead of being dropped. An empty stream yields ``[]``.
+    """
+    return [json.loads(line) for line in stream.splitlines()]
+
+
+@pytest.mark.parametrize("path", leaf_command_argvs(), ids=lambda p: " ".join(p))
+def test_json_mode_keeps_stdout_clean_and_error_on_stderr(path: list[str]) -> None:
+    result = runner.invoke(app, [*path, UNKNOWN_FLAG, "--json"])
+
+    # Usage/parse error: the stable exit code for a bad flag (REFERENCE.md table).
+    assert result.exit_code == 2
+    # The pipeline contract: a parse error must leave stdout completely empty, so a
+    # downstream consumer never receives a partial or garbage record.
+    assert result.stdout == ""
+    # Every stderr line must decode as JSON (prose there fails in ``_json_lines``), and
+    # one of the decoded values must be exactly the usage-error envelope below.
+    objs = _json_lines(result.stderr)
+    assert {"error": {"type": "usage_error", "message": "No such option: " + UNKNOWN_FLAG}} in objs
+
+
+@pytest.mark.parametrize("path", leaf_command_argvs(), ids=lambda p: " ".join(p))
+def test_human_mode_routes_errors_to_stderr(path: list[str]) -> None:
+    # Same contract without --json: the human-readable error must land on stderr and
+    # leave stdout empty, so `assembly … -o text > out` keeps it out of the data file.
+    result = runner.invoke(app, [*path, UNKNOWN_FLAG])
+
+    assert result.exit_code == 2
+    assert result.stdout == ""
+    assert "No such option" in result.stderr