From 4ebd20511f8eacf4901d8756e7b3a90923f7bd81 Mon Sep 17 00:00:00 2001
From: KT <khaledt@google.com>
Date: Mon, 30 Mar 2026 15:58:00 -0400
Subject: [PATCH 01/12] fix: respect .plumbignore in coverage source scan

_collect_source_summaries was walking every .py file via rglob without
consulting .plumbignore. In projects with large .venv directories this
caused tens of thousands of irrelevant files to be AST-parsed and sent
to the LLM, making plumb coverage extremely slow.

Reuse the existing parse_plumbignore/is_ignored helpers so patterns like
.venv/ are honoured during coverage analysis.
---
 plumb/coverage_reporter.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/plumb/coverage_reporter.py b/plumb/coverage_reporter.py
index 6b1beb8..e8af374 100644
--- a/plumb/coverage_reporter.py
+++ b/plumb/coverage_reporter.py
@@ -11,6 +11,7 @@
 from rich.table import Table
 
 from plumb.config import load_config
+from plumb.ignore import is_ignored, parse_plumbignore
 
 PLUMB_MARKER_RE = re.compile(r'#\s*plumb:(req-[a-f0-9]+)')
 FUNC_NAME_RE = re.compile(r'def test_req_([a-f0-9]+)_')
@@ -118,11 +119,14 @@ def _collect_source_summaries(repo_root: Path) -> dict[str, str]:
     """
     import ast
 
+    ignore_patterns = parse_plumbignore(repo_root)
     per_file: dict[str, str] = {}
     for item in sorted(repo_root.rglob("*.py")):
         rel = str(item.relative_to(repo_root))
         if ".plumb" in rel or "test_" in item.name or rel.startswith("tests/"):
             continue
+        if is_ignored(rel, ignore_patterns):
+            continue
         try:
             content = item.read_text()
         except Exception:

From dc17427ef906d1d33d9733ff136464ea442d6098 Mon Sep 17 00:00:00 2001
From: KT <khaledt@google.com>
Date: Mon, 30 Mar 2026 16:02:48 -0400
Subject: [PATCH 02/12] fix: support glob patterns in directory ignore rules

is_ignored now fnmatches directory patterns (e.g. .venv*/) against
the top-level path component, so .venv3.10/, .venv-py311/ etc.
are matched by a single .venv*/ rule in .plumbignore.
---
 plumb/ignore.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/plumb/ignore.py b/plumb/ignore.py
index 4f21137..16e84e2 100644
--- a/plumb/ignore.py
+++ b/plumb/ignore.py
@@ -50,12 +50,17 @@ def is_ignored(filepath: str, patterns: list[str]) -> bool:
     - Exact match: ``README.md``
     - Glob matched against the basename: ``*.txt``
     - Directory prefix (pattern ends with ``/``): ``docs/`` matches ``docs/foo``
+    - Glob directory prefix: ``.venv*/`` matches ``.venv3.10/foo``
     """
     basename = Path(filepath).name
+    top_dir = filepath.split("/")[0]
     for pat in patterns:
         if pat.endswith("/"):
-            # Directory prefix — match if filepath starts with the prefix
-            if filepath == pat.rstrip("/") or filepath.startswith(pat):
+            prefix = pat.rstrip("/")
+            # Directory prefix — exact startswith or fnmatch on top directory
+            if filepath == prefix or filepath.startswith(pat):
+                return True
+            if fnmatch(top_dir, prefix):
                 return True
         else:
             # Exact full-path match or fnmatch against basename

From 51f8c733a23048c3dc5aeb14e7c0f0f42d288a57 Mon Sep 17 00:00:00 2001
From: KT <khaledt@google.com>
Date: Mon, 30 Mar 2026 17:23:04 -0400
Subject: [PATCH 03/12] feat: add Claude Code CLI as alternative LLM backend

When ANTHROPIC_API_KEY is not set, plumb now detects the `claude` CLI
and routes LLM calls through `claude -p` (non-interactive mode). This
lets users with a Claude Code subscription run plumb without a separate
API key.

- Add ClaudeCodeLM (dspy.BaseLM subclass) that shells out to `claude -p`
- Update get_lm() / validate_api_access() / get_program_lm() to fall
  back to CLI when no API key is set
- Update CodeModifier to use _call_claude() when no API key
- API key always takes precedence when set (zero regression)
---
 plumb/programs/__init__.py       |  67 ++++++++----
 plumb/programs/claude_code_lm.py | 124 ++++++++++++++++++++++
 plumb/programs/code_modifier.py  |  22 +++-
 tests/test_claude_code_lm.py     | 176 +++++++++++++++++++++++++++++++
 tests/test_programs.py           | 128 +++++++++++++++++++---
 5 files changed, 481 insertions(+), 36 deletions(-)
 create mode 100644 plumb/programs/claude_code_lm.py
 create mode 100644 tests/test_claude_code_lm.py

diff --git a/plumb/programs/__init__.py b/plumb/programs/__init__.py
index 2cb7334..61647c8 100644
--- a/plumb/programs/__init__.py
+++ b/plumb/programs/__init__.py
@@ -5,24 +5,42 @@
 
 import dspy
 from dspy.adapters import XMLAdapter
+from dspy.clients.base_lm import BaseLM
 
 from plumb import PlumbAuthError, PlumbInferenceError
 
 _configured = False
 
+_NO_BACKEND_MSG = (
+    "No LLM backend available.\n"
+    "Option 1: Set ANTHROPIC_API_KEY in .env or environment (direct API, fastest)\n"
+    "Option 2: Install Claude Code CLI — https://claude.ai/code (uses your subscription)"
+)
 
-def get_lm() -> dspy.LM:
-    return dspy.LM("anthropic/claude-sonnet-4-20250514", max_tokens=28000)
+
+def get_lm() -> BaseLM:
+    """Return the best available LM: direct API if ANTHROPIC_API_KEY is set,
+    otherwise Claude Code CLI if available."""
+    if os.environ.get("ANTHROPIC_API_KEY"):
+        return dspy.LM("anthropic/claude-sonnet-4-20250514", max_tokens=28000)
+
+    from plumb.programs.claude_code_lm import ClaudeCodeLM, find_claude_cli
+
+    if find_claude_cli():
+        return ClaudeCodeLM(model="sonnet", max_tokens=28000)
+
+    raise PlumbAuthError(_NO_BACKEND_MSG)
 
 
 def configure_dspy() -> None:
     """Lazy DSPy configuration. No-op if already configured.
-    Never call at import time — ANTHROPIC_API_KEY absence would break
+    Never call at import time — missing auth would break
     non-LLM commands like plumb status."""
     global _configured
     if _configured:
         return
     from dotenv import load_dotenv
+
     load_dotenv(override=False)
     lm = get_lm()
     dspy.configure(lm=lm, adapter=XMLAdapter())
@@ -30,37 +48,34 @@ def configure_dspy() -> None:
 
 
 def validate_api_access() -> None:
-    """Check that ANTHROPIC_API_KEY is set and works. Loads .env first, then
-    falls back to exported environment variables. Performs a smoke test to
-    verify the key is valid. Raises PlumbAuthError if not found or invalid."""
+    """Check that an LLM backend is available and working.
+
+    Tries ANTHROPIC_API_KEY first (direct API), then falls back to the
+    Claude Code CLI. Performs a smoke test to verify the backend works.
+    Raises PlumbAuthError if neither is available or working.
+    """
     from dotenv import load_dotenv
 
     load_dotenv(override=False)
-    if not os.environ.get("ANTHROPIC_API_KEY"):
-        raise PlumbAuthError(
-            "ANTHROPIC_API_KEY is not set. "
-            "Plumb requires a valid Anthropic API key to analyze commits.\n"
-            "Set it in a .env file or export it: export ANTHROPIC_API_KEY=your-key-here"
-        )
-
-    # Smoke test: verify the key actually works
-    lm = get_lm()
+
+    lm = get_lm()  # raises PlumbAuthError if no backend available
+
     try:
         response = lm("Reply with only the word: hello")
         if not response:
-            raise PlumbAuthError("API returned empty response - key may be invalid")
+            raise PlumbAuthError("LLM returned empty response - backend may be misconfigured")
+    except PlumbAuthError:
+        raise
     except Exception as e:
         err_str = str(e).lower()
         if "auth" in err_str or "api key" in err_str or "401" in err_str:
             raise PlumbAuthError(
                 f"ANTHROPIC_API_KEY is invalid or rejected: {e}"
             ) from e
-        raise PlumbAuthError(
-            f"Failed to verify API access: {e}"
-        ) from e
+        raise PlumbAuthError(f"Failed to verify LLM access: {e}") from e
 
 
-def get_program_lm(program_name: str, repo_root: str | Path | None = None) -> dspy.LM | None:
+def get_program_lm(program_name: str, repo_root: str | Path | None = None) -> BaseLM | None:
     """Return a per-program LM override from config, or None for the default."""
     from plumb.config import find_repo_root, load_config
 
@@ -78,7 +93,17 @@ def get_program_lm(program_name: str, repo_root: str | Path | None = None) -> ds
     if not model:
         return None
     max_tokens = entry.get("max_tokens", 8192)
-    return dspy.LM(model, max_tokens=max_tokens)
+
+    if os.environ.get("ANTHROPIC_API_KEY"):
+        return dspy.LM(model, max_tokens=max_tokens)
+
+    from plumb.programs.claude_code_lm import ClaudeCodeLM, find_claude_cli
+
+    if find_claude_cli():
+        cli_model = model.removeprefix("anthropic/")
+        return ClaudeCodeLM(model=cli_model, max_tokens=max_tokens)
+
+    return None
 
 
 def run_with_retries(fn, *args, max_retries: int = 2, **kwargs):
diff --git a/plumb/programs/claude_code_lm.py b/plumb/programs/claude_code_lm.py
new file mode 100644
index 0000000..7a493be
--- /dev/null
+++ b/plumb/programs/claude_code_lm.py
@@ -0,0 +1,124 @@
+"""DSPy BaseLM subclass that routes completions through the claude CLI.
+
+Uses ``claude -p`` (non-interactive print mode) so that users with a Claude
+Code subscription can run plumb without a separate ANTHROPIC_API_KEY.
+
+Pattern adapted from tinaudio/skills@b0cbd3d.
+"""
+
+from __future__ import annotations
+
+import os
+import shutil
+import subprocess
+from types import SimpleNamespace
+from typing import Any
+
+from dspy.clients.base_lm import BaseLM
+
+from plumb import PlumbInferenceError
+
+
+def find_claude_cli() -> str | None:
+    """Return the path to the ``claude`` CLI binary, or None if not found."""
+    return shutil.which("claude")
+
+
+def _call_claude(prompt: str, model: str | None = None, timeout: int = 300) -> str:
+    """Run ``claude -p`` with *prompt* on stdin and return the text response.
+
+    Strips the ``CLAUDECODE`` env var to allow nesting inside a Claude Code
+    session (the guard is for interactive terminal conflicts; programmatic
+    subprocess usage is safe).
+    """
+    cmd = ["claude", "-p", "--output-format", "text"]
+    if model:
+        cmd.extend(["--model", model])
+
+    env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
+
+    result = subprocess.run(
+        cmd,
+        input=prompt,
+        capture_output=True,
+        text=True,
+        env=env,
+        timeout=timeout,
+    )
+    if result.returncode != 0:
+        raise RuntimeError(
+            f"claude -p exited {result.returncode}\nstderr: {result.stderr}"
+        )
+    return result.stdout
+
+
+def _serialize_messages(
+    prompt: str | None = None,
+    messages: list[dict[str, str]] | None = None,
+) -> str:
+    """Convert a DSPy messages list into a single text prompt for the CLI.
+
+    System messages get ``<system>`` tags, multi-turn conversations get
+    ``[role]`` prefixes.  Single user messages are passed through as-is.
+    """
+    if not messages:
+        return prompt or ""
+
+    # Single user message — pass through without decoration
+    if len(messages) == 1 and messages[0].get("role") == "user":
+        return messages[0]["content"]
+
+    parts: list[str] = []
+    for msg in messages:
+        role = msg.get("role", "user")
+        content = msg.get("content", "")
+        if role == "system":
+            parts.append(f"<system>\n{content}\n</system>")
+        else:
+            parts.append(f"[{role}]\n{content}")
+    return "\n\n".join(parts)
+
+
+def _make_response(text: str, model: str) -> SimpleNamespace:
+    """Build a minimal OpenAI-compatible response object for BaseLM."""
+    return SimpleNamespace(
+        choices=[
+            SimpleNamespace(
+                message=SimpleNamespace(content=text, role="assistant"),
+                finish_reason="stop",
+            )
+        ],
+        usage={"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0},
+        model=model,
+    )
+
+
+class ClaudeCodeLM(BaseLM):
+    """DSPy LM that routes completions through the ``claude`` CLI."""
+
+    def __init__(
+        self,
+        model: str = "sonnet",
+        max_tokens: int = 28000,
+        timeout: int = 300,
+        **kwargs: Any,
+    ):
+        super().__init__(
+            model=f"claude-code/{model}",
+            model_type="chat",
+            temperature=0.0,
+            max_tokens=max_tokens,
+            **kwargs,
+        )
+        self.cli_model = model
+        self.timeout = timeout
+
+    def forward(
+        self,
+        prompt: str | None = None,
+        messages: list[dict[str, Any]] | None = None,
+        **kwargs: Any,
+    ) -> SimpleNamespace:
+        text_input = _serialize_messages(prompt, messages)
+        response_text = _call_claude(text_input, model=self.cli_model, timeout=self.timeout)
+        return _make_response(response_text, self.model)
diff --git a/plumb/programs/code_modifier.py b/plumb/programs/code_modifier.py
index 53b844a..d569683 100644
--- a/plumb/programs/code_modifier.py
+++ b/plumb/programs/code_modifier.py
@@ -1,20 +1,34 @@
 from __future__ import annotations
 
 import json
+import os
 import re
 
 import anthropic
 from dotenv import load_dotenv
 
+from plumb.programs.claude_code_lm import _call_claude, find_claude_cli
+
 
 class CodeModifier:
     """Modify staged code to satisfy a rejected decision.
     Uses Anthropic API directly (not DSPy) because code modification
-    is inherently open-ended."""
+    is inherently open-ended. Falls back to claude CLI when no API key."""
 
     def __init__(self, client: anthropic.Anthropic | None = None):
         load_dotenv(override=False)
-        self.client = client or anthropic.Anthropic()
+        if client is not None:
+            self.client = client
+            self._use_cli = False
+        elif os.environ.get("ANTHROPIC_API_KEY"):
+            self.client = anthropic.Anthropic()
+            self._use_cli = False
+        elif find_claude_cli():
+            self.client = None
+            self._use_cli = True
+        else:
+            self.client = anthropic.Anthropic()
+            self._use_cli = False
 
     def modify(
         self,
@@ -50,6 +64,10 @@ def modify(
 }}
 ```"""
 
+        if self._use_cli:
+            text = _call_claude(prompt)
+            return self._parse_response(text)
+
         response = self.client.messages.create(
             model="claude-sonnet-4-20250514",
             max_tokens=4096,
diff --git a/tests/test_claude_code_lm.py b/tests/test_claude_code_lm.py
new file mode 100644
index 0000000..52000e3
--- /dev/null
+++ b/tests/test_claude_code_lm.py
@@ -0,0 +1,176 @@
+"""Tests for ClaudeCodeLM — DSPy BaseLM subclass that routes through claude CLI."""
+
+import subprocess
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from plumb.programs.claude_code_lm import (
+    ClaudeCodeLM,
+    _call_claude,
+    _make_response,
+    _serialize_messages,
+    find_claude_cli,
+)
+
+
+class TestFindClaudeCli:
+    def test_returns_path_when_found(self):
+        with patch("shutil.which", return_value="/usr/local/bin/claude"):
+            assert find_claude_cli() == "/usr/local/bin/claude"
+
+    def test_returns_none_when_missing(self):
+        with patch("shutil.which", return_value=None):
+            assert find_claude_cli() is None
+
+
+class TestSerializeMessages:
+    def test_prompt_only(self):
+        result = _serialize_messages(prompt="hello", messages=None)
+        assert result == "hello"
+
+    def test_single_user_message(self):
+        msgs = [{"role": "user", "content": "hello"}]
+        result = _serialize_messages(prompt=None, messages=msgs)
+        assert result == "hello"
+
+    def test_system_and_user_messages(self):
+        msgs = [
+            {"role": "system", "content": "You are helpful."},
+            {"role": "user", "content": "hello"},
+        ]
+        result = _serialize_messages(prompt=None, messages=msgs)
+        assert "<system>\nYou are helpful.\n</system>" in result
+        assert "hello" in result
+
+    def test_multi_turn_with_assistant(self):
+        msgs = [
+            {"role": "user", "content": "hi"},
+            {"role": "assistant", "content": "hey"},
+            {"role": "user", "content": "bye"},
+        ]
+        result = _serialize_messages(prompt=None, messages=msgs)
+        assert "[user]\nhi" in result
+        assert "[assistant]\nhey" in result
+        assert "[user]\nbye" in result
+
+    def test_empty_messages_falls_back_to_prompt(self):
+        result = _serialize_messages(prompt="fallback", messages=[])
+        assert result == "fallback"
+
+
+class TestMakeResponse:
+    def test_has_correct_structure(self):
+        resp = _make_response("hello world", "claude-sonnet")
+        assert resp.choices[0].message.content == "hello world"
+        assert resp.choices[0].message.role == "assistant"
+        assert resp.choices[0].finish_reason == "stop"
+        assert resp.model == "claude-sonnet"
+
+    def test_usage_is_dictable(self):
+        resp = _make_response("text", "model")
+        usage = dict(resp.usage)
+        assert "prompt_tokens" in usage
+        assert "completion_tokens" in usage
+        assert "total_tokens" in usage
+
+
+class TestCallClaude:
+    def test_success(self):
+        mock_result = subprocess.CompletedProcess(
+            args=["claude"], returncode=0, stdout="hello\n", stderr=""
+        )
+        with patch("subprocess.run", return_value=mock_result) as mock_run:
+            result = _call_claude("say hello")
+            assert result == "hello\n"
+            args = mock_run.call_args
+            assert args[0][0][:2] == ["claude", "-p"]
+            assert "--output-format" in args[0][0]
+            assert "text" in args[0][0]
+            assert args[1]["input"] == "say hello"
+
+    def test_strips_claudecode_env_var(self):
+        mock_result = subprocess.CompletedProcess(
+            args=["claude"], returncode=0, stdout="ok", stderr=""
+        )
+        with patch("subprocess.run", return_value=mock_result) as mock_run, \
+             patch.dict("os.environ", {"CLAUDECODE": "1", "PATH": "/usr/bin"}):
+            _call_claude("test")
+            env = mock_run.call_args[1]["env"]
+            assert "CLAUDECODE" not in env
+            assert "PATH" in env
+
+    def test_passes_model_flag(self):
+        mock_result = subprocess.CompletedProcess(
+            args=["claude"], returncode=0, stdout="ok", stderr=""
+        )
+        with patch("subprocess.run", return_value=mock_result) as mock_run:
+            _call_claude("test", model="opus")
+            cmd = mock_run.call_args[0][0]
+            assert "--model" in cmd
+            idx = cmd.index("--model")
+            assert cmd[idx + 1] == "opus"
+
+    def test_raises_on_nonzero_exit(self):
+        mock_result = subprocess.CompletedProcess(
+            args=["claude"], returncode=1, stdout="", stderr="auth failed"
+        )
+        with patch("subprocess.run", return_value=mock_result):
+            with pytest.raises(RuntimeError, match="auth failed"):
+                _call_claude("test")
+
+    def test_raises_on_timeout(self):
+        with patch("subprocess.run", side_effect=subprocess.TimeoutExpired("claude", 300)):
+            with pytest.raises(subprocess.TimeoutExpired):
+                _call_claude("test")
+
+
+class TestClaudeCodeLM:
+    def test_is_base_lm_subclass(self):
+        import dspy
+        with patch("plumb.programs.claude_code_lm.find_claude_cli", return_value="/usr/bin/claude"):
+            lm = ClaudeCodeLM()
+            assert isinstance(lm, dspy.BaseLM)
+
+    def test_forward_calls_claude_cli(self):
+        mock_result = subprocess.CompletedProcess(
+            args=["claude"], returncode=0, stdout="response text", stderr=""
+        )
+        with patch("plumb.programs.claude_code_lm.find_claude_cli", return_value="/usr/bin/claude"), \
+             patch("subprocess.run", return_value=mock_result):
+            lm = ClaudeCodeLM()
+            response = lm.forward(prompt="hello")
+            assert response.choices[0].message.content == "response text"
+
+    def test_forward_serializes_messages(self):
+        mock_result = subprocess.CompletedProcess(
+            args=["claude"], returncode=0, stdout="answer", stderr=""
+        )
+        messages = [
+            {"role": "system", "content": "Be concise."},
+            {"role": "user", "content": "What is 1+1?"},
+        ]
+        with patch("plumb.programs.claude_code_lm.find_claude_cli", return_value="/usr/bin/claude"), \
+             patch("subprocess.run", return_value=mock_result) as mock_run:
+            lm = ClaudeCodeLM()
+            lm.forward(messages=messages)
+            stdin_input = mock_run.call_args[1]["input"]
+            assert "Be concise." in stdin_input
+            assert "What is 1+1?" in stdin_input
+
+    def test_forward_raises_on_cli_error(self):
+        mock_result = subprocess.CompletedProcess(
+            args=["claude"], returncode=1, stdout="", stderr="error"
+        )
+        with patch("plumb.programs.claude_code_lm.find_claude_cli", return_value="/usr/bin/claude"), \
+             patch("subprocess.run", return_value=mock_result):
+            lm = ClaudeCodeLM()
+            with pytest.raises(RuntimeError, match="error"):
+                lm.forward(prompt="hello")
+
+    def test_model_name_stored(self):
+        with patch("plumb.programs.claude_code_lm.find_claude_cli", return_value="/usr/bin/claude"):
+            lm = ClaudeCodeLM(model="opus")
+            assert lm.cli_model == "opus"
+            assert "claude-code/" in lm.model
diff --git a/tests/test_programs.py b/tests/test_programs.py
index 55b9e3a..81dee87 100644
--- a/tests/test_programs.py
+++ b/tests/test_programs.py
@@ -7,9 +7,10 @@
 import dspy
 import pytest
 
-from plumb.programs import run_with_retries, configure_dspy, validate_api_access, get_program_lm
+from plumb.programs import run_with_retries, configure_dspy, validate_api_access, get_lm, get_program_lm
 from plumb.config import PlumbConfig, save_config, ensure_plumb_dir
 from plumb import PlumbAuthError, PlumbInferenceError
+from plumb.programs.claude_code_lm import ClaudeCodeLM
 from plumb.programs.diff_analyzer import (
     ChangeSummary,
     DiffAnalyzerSignature,
@@ -39,23 +40,36 @@
 
 
 class TestValidateApiAccess:
-    def test_raises_when_key_missing(self):
+    def test_raises_when_key_missing_and_no_cli(self):
         # plumb:req-60f97012
         # plumb:req-ab686eaa
         # plumb:req-222ddbbd
         with patch("dotenv.load_dotenv"), \
-             patch.dict("os.environ", {}, clear=True):
+             patch.dict("os.environ", {}, clear=True), \
+             patch("plumb.programs.claude_code_lm.find_claude_cli", return_value=None):
             import os
             os.environ.pop("ANTHROPIC_API_KEY", None)
-            with pytest.raises(PlumbAuthError, match="ANTHROPIC_API_KEY is not set"):
+            with pytest.raises(PlumbAuthError, match="No LLM backend available"):
                 validate_api_access()
 
-    def test_raises_when_key_empty(self):
+    def test_raises_when_key_empty_and_no_cli(self):
         with patch("dotenv.load_dotenv"), \
-             patch.dict("os.environ", {"ANTHROPIC_API_KEY": ""}):
-            with pytest.raises(PlumbAuthError, match="ANTHROPIC_API_KEY is not set"):
+             patch.dict("os.environ", {"ANTHROPIC_API_KEY": ""}), \
+             patch("plumb.programs.claude_code_lm.find_claude_cli", return_value=None):
+            with pytest.raises(PlumbAuthError, match="No LLM backend available"):
                 validate_api_access()
 
+    def test_passes_with_cli_when_no_key(self):
+        """CLI fallback works when ANTHROPIC_API_KEY is not set."""
+        mock_lm = MagicMock(return_value=["hello"])
+        with patch("dotenv.load_dotenv"), \
+             patch.dict("os.environ", {}, clear=True), \
+             patch("plumb.programs.claude_code_lm.find_claude_cli", return_value="/usr/bin/claude"), \
+             patch("plumb.programs.claude_code_lm.ClaudeCodeLM", return_value=mock_lm):
+            import os
+            os.environ.pop("ANTHROPIC_API_KEY", None)
+            validate_api_access()  # should not raise
+
     def test_passes_when_key_set_and_api_works(self):
         mock_lm = MagicMock(return_value="hello")
         with patch("dotenv.load_dotenv"), \
@@ -324,6 +338,41 @@ def test_prompt_includes_all_inputs(self):
         assert "reason text" in prompt
         assert "spec text" in prompt
 
+    def test_uses_cli_when_no_api_key(self):
+        """CodeModifier falls back to claude CLI when no API key."""
+        json_response = '```json\n{"src/a.py": "modified via cli"}\n```'
+        with patch.dict("os.environ", {}, clear=True), \
+             patch("plumb.programs.code_modifier.find_claude_cli", return_value="/usr/bin/claude"), \
+             patch("plumb.programs.code_modifier._call_claude", return_value=json_response) as mock_call:
+            import os
+            os.environ.pop("ANTHROPIC_API_KEY", None)
+            modifier = CodeModifier()
+            result = modifier.modify(
+                staged_diff="diff",
+                decision="Use async",
+                rejection_reason="Too complex",
+                spec_content="# Spec",
+            )
+            assert result == {"src/a.py": "modified via cli"}
+            mock_call.assert_called_once()
+            prompt = mock_call.call_args[0][0]
+            assert "diff" in prompt
+            assert "Use async" in prompt
+
+    def test_uses_api_when_key_set(self):
+        """CodeModifier uses Anthropic API when ANTHROPIC_API_KEY is set."""
+        mock_client = MagicMock()
+        mock_response = MagicMock()
+        mock_response.content = [MagicMock(text='{"a.py": "content"}')]
+        mock_client.messages.create.return_value = mock_response
+
+        with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "sk-ant-test"}), \
+             patch("plumb.programs.code_modifier.anthropic") as mock_anthropic:
+            mock_anthropic.Anthropic.return_value = mock_client
+            modifier = CodeModifier()
+            result = modifier.modify("diff", "dec", "reason", "spec")
+            mock_client.messages.create.assert_called_once()
+
 
 class TestGetProgramLm:
     def test_returns_none_when_no_config(self, tmp_path):
@@ -339,8 +388,8 @@ def test_returns_none_when_program_not_listed(self, tmp_repo):
         result = get_program_lm("decision_deduplicator", repo_root=tmp_repo)
         assert result is None
 
-    def test_returns_lm_when_override_exists(self, tmp_repo):
-        """Config has an override → returns a dspy.LM."""
+    def test_returns_lm_when_override_exists_with_api_key(self, tmp_repo):
+        """Config has an override + API key → returns a dspy.LM."""
         ensure_plumb_dir(tmp_repo)
         cfg = PlumbConfig(
             spec_paths=["spec.md"],
@@ -349,13 +398,66 @@ def test_returns_lm_when_override_exists(self, tmp_repo):
             },
         )
         save_config(tmp_repo, cfg)
-        lm = get_program_lm("decision_deduplicator", repo_root=tmp_repo)
-        assert isinstance(lm, dspy.LM)
-        assert lm.model == "openai/gpt-4o-mini"
-        assert lm.kwargs["max_tokens"] == 4096
+        with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "sk-ant-test"}):
+            lm = get_program_lm("decision_deduplicator", repo_root=tmp_repo)
+            assert isinstance(lm, dspy.LM)
+            assert lm.model == "openai/gpt-4o-mini"
+            assert lm.kwargs["max_tokens"] == 4096
+
+    def test_returns_claude_code_lm_when_override_exists_no_key(self, tmp_repo):
+        """Config has an override + no API key + CLI available → returns ClaudeCodeLM."""
+        ensure_plumb_dir(tmp_repo)
+        cfg = PlumbConfig(
+            spec_paths=["spec.md"],
+            program_models={
+                "decision_deduplicator": {"model": "anthropic/claude-sonnet-4-20250514", "max_tokens": 4096},
+            },
+        )
+        save_config(tmp_repo, cfg)
+        with patch.dict("os.environ", {}, clear=True), \
+             patch("plumb.programs.claude_code_lm.find_claude_cli", return_value="/usr/bin/claude"):
+            import os
+            os.environ.pop("ANTHROPIC_API_KEY", None)
+            lm = get_program_lm("decision_deduplicator", repo_root=tmp_repo)
+            assert isinstance(lm, ClaudeCodeLM)
+            assert lm.cli_model == "claude-sonnet-4-20250514"
 
     def test_returns_none_when_no_repo_root(self):
         """No repo root found → returns None."""
         with patch("plumb.config.find_repo_root", return_value=None):
             result = get_program_lm("decision_deduplicator")
             assert result is None
+
+
+class TestGetLm:
+    def test_returns_dspy_lm_with_api_key(self):
+        """ANTHROPIC_API_KEY set → returns dspy.LM."""
+        with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "sk-ant-test"}):
+            lm = get_lm()
+            assert isinstance(lm, dspy.LM)
+
+    def test_returns_claude_code_lm_without_api_key(self):
+        """No API key + CLI available → returns ClaudeCodeLM."""
+        with patch.dict("os.environ", {}, clear=True), \
+             patch("plumb.programs.claude_code_lm.find_claude_cli", return_value="/usr/bin/claude"):
+            import os
+            os.environ.pop("ANTHROPIC_API_KEY", None)
+            lm = get_lm()
+            assert isinstance(lm, ClaudeCodeLM)
+
+    def test_raises_when_neither_available(self):
+        """No API key + no CLI → raises PlumbAuthError."""
+        with patch.dict("os.environ", {}, clear=True), \
+             patch("plumb.programs.claude_code_lm.find_claude_cli", return_value=None):
+            import os
+            os.environ.pop("ANTHROPIC_API_KEY", None)
+            with pytest.raises(PlumbAuthError, match="No LLM backend available"):
+                get_lm()
+
+    def test_api_key_takes_precedence_over_cli(self):
+        """When both API key and CLI exist, API key wins."""
+        with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "sk-ant-test"}), \
+             patch("plumb.programs.claude_code_lm.find_claude_cli", return_value="/usr/bin/claude"):
+            lm = get_lm()
+            assert isinstance(lm, dspy.LM)
+            assert not isinstance(lm, ClaudeCodeLM)

From 02d203419397496dfb0180d3f2ae410532554f9d Mon Sep 17 00:00:00 2001
From: KT <khaledt@google.com>
Date: Mon, 30 Mar 2026 17:25:45 -0400
Subject: [PATCH 04/12] fix: exclude slow tests from plumb coverage pytest run

The run_pytest_coverage() subprocess call now passes -m "not slow"
so that `plumb coverage` skips slow-marked tests, matching the
project convention of running quick tests by default.
---
 plumb/coverage_reporter.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/plumb/coverage_reporter.py b/plumb/coverage_reporter.py
index e8af374..e65d277 100644
--- a/plumb/coverage_reporter.py
+++ b/plumb/coverage_reporter.py
@@ -30,6 +30,7 @@ def run_pytest_coverage(repo_root: str | Path) -> dict | None:
         result = subprocess.run(
             [
                 sys.executable, "-m", "pytest",
+                "-m", "not slow",
                 "--cov=.",
                 f"--cov-report=json:{cov_json}",
                 "--cov-report=",

From 9d9c11ae8efb87f2e68478a40b55a8269d5121c5 Mon Sep 17 00:00:00 2001
From: KT <khaledt@google.com>
Date: Mon, 30 Mar 2026 17:46:03 -0400
Subject: [PATCH 05/12] test: add integration tests for ClaudeCodeLM with real
 claude -p

- Add test_claude_code_lm_raw_call: smoke test for basic prompt/response
- Add test_claude_code_lm_parse_spec_single_file: end-to-end spec parsing
  through DSPy RequirementParser with a tiny markdown spec
- Register slow marker and set addopts to skip slow tests by default
- Add stderr logging to ClaudeCodeLM.forward() for debugging
---
 plumb/programs/claude_code_lm.py      |  5 +++
 pyproject.toml                        |  2 +
 tests/test_claude_code_integration.py | 59 +++++++++++++++++++++++++++
 3 files changed, 66 insertions(+)
 create mode 100644 tests/test_claude_code_integration.py

diff --git a/plumb/programs/claude_code_lm.py b/plumb/programs/claude_code_lm.py
index 7a493be..561a52f 100644
--- a/plumb/programs/claude_code_lm.py
+++ b/plumb/programs/claude_code_lm.py
@@ -119,6 +119,11 @@ def forward(
         messages: list[dict[str, Any]] | None = None,
         **kwargs: Any,
     ) -> SimpleNamespace:
+        import sys
+
         text_input = _serialize_messages(prompt, messages)
+        input_len = len(text_input)
+        print(f"[ClaudeCodeLM] Calling claude -p ({input_len} chars)...", file=sys.stderr)
         response_text = _call_claude(text_input, model=self.cli_model, timeout=self.timeout)
+        print(f"[ClaudeCodeLM] Got response ({len(response_text)} chars)", file=sys.stderr)
         return _make_response(response_text, self.model)
diff --git a/pyproject.toml b/pyproject.toml
index a36323d..185c682 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,3 +29,5 @@ packages = ["plumb"]
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
+addopts = "-m 'not slow'"
+markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"]
diff --git a/tests/test_claude_code_integration.py b/tests/test_claude_code_integration.py
new file mode 100644
index 0000000..078a935
--- /dev/null
+++ b/tests/test_claude_code_integration.py
@@ -0,0 +1,59 @@
+"""Integration test for ClaudeCodeLM with a real claude -p call.
+
+Marked slow — skipped by default. Run with: pytest -m slow
+Requires the claude CLI to be installed and authenticated.
+"""
+
+import json
+import shutil
+
+import dspy
+import pytest
+from dspy.adapters import XMLAdapter
+
+from plumb.programs.claude_code_lm import ClaudeCodeLM, find_claude_cli
+
+needs_claude_cli = pytest.mark.skipif(
+    shutil.which("claude") is None,
+    reason="claude CLI not installed",
+)
+
+
+@pytest.mark.slow
+@needs_claude_cli
+def test_claude_code_lm_parse_spec_single_file():
+    """End-to-end: parse a tiny spec through ClaudeCodeLM → DSPy RequirementParser."""
+    from plumb.programs.requirement_parser import RequirementParser
+
+    lm = ClaudeCodeLM(model="sonnet", max_tokens=4000, timeout=60)
+    dspy.configure(lm=lm, adapter=XMLAdapter())
+
+    parser = RequirementParser()
+
+    spec = """\
+# Widget API
+
+## Requirements
+
+The system must accept a widget name as a string.
+The system must return a 400 error if the name is empty.
+"""
+
+    parsed = parser(markdown=spec)
+    assert len(parsed) >= 2, f"Expected at least 2 requirements, got {len(parsed)}"
+
+    texts = [r.text.lower() for r in parsed]
+    assert any("name" in t for t in texts), f"No requirement mentions 'name': {texts}"
+
+
+@pytest.mark.slow
+@needs_claude_cli
+def test_claude_code_lm_raw_call():
+    """Smoke test: ClaudeCodeLM returns a non-empty response for a simple prompt."""
+    lm = ClaudeCodeLM(model="sonnet", max_tokens=100, timeout=30)
+
+    response = lm("Reply with only the word: hello")
+    assert response, "Got empty response from claude CLI"
+    assert isinstance(response, list)
+    assert len(response) > 0
+    assert "hello" in response[0].lower()

From c90544654dd55ac1bbba7ad60a4414f8811cd988 Mon Sep 17 00:00:00 2001
From: KT <khaledt@google.com>
Date: Mon, 30 Mar 2026 18:30:32 -0400
Subject: [PATCH 06/12] fix: log retry errors in run_with_retries instead of
 silently swallowing

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 plumb/programs/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/plumb/programs/__init__.py b/plumb/programs/__init__.py
index 61647c8..21a3b0f 100644
--- a/plumb/programs/__init__.py
+++ b/plumb/programs/__init__.py
@@ -119,6 +119,7 @@ def run_with_retries(fn, *args, max_retries: int = 2, **kwargs):
                 raise PlumbAuthError(
                     f"API key is invalid or rejected: {e}"
                 ) from e
+            print(f"[retry {attempt+1}/{max_retries+1}] {type(e).__name__}: {e}")
             last_error = e
     raise PlumbInferenceError(
         f"LLM inference failed after {max_retries + 1} attempts: {last_error}"

From 7d2fb32de26182358d2057c8a3bbbe08cc09de8e Mon Sep 17 00:00:00 2001
From: KT <khaledt@google.com>
Date: Mon, 30 Mar 2026 18:49:29 -0400
Subject: [PATCH 07/12] feat: wire program_models overrides for coverage
 mapper, test mapper, and requirement parser

Previously only question_synthesizer and decision_deduplicator respected
program_models config. Now code_coverage_mapper, test_mapper, and
requirement_parser also check for per-program LM overrides via
dspy.context(). Also adds model name to ClaudeCodeLM log output.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 plumb/cli.py                          |  16 +-
 plumb/coverage_reporter.py            |  19 ++-
 plumb/programs/claude_code_lm.py      |   2 +-
 plumb/sync.py                         |  10 +-
 tests/test_program_model_overrides.py | 210 ++++++++++++++++++++++++++
 5 files changed, 245 insertions(+), 12 deletions(-)
 create mode 100644 tests/test_program_model_overrides.py

diff --git a/plumb/cli.py b/plumb/cli.py
index 253f012..34113fb 100644
--- a/plumb/cli.py
+++ b/plumb/cli.py
@@ -806,11 +806,12 @@ def map_tests(dry_run):
     console.print(f"Found {len(test_summaries)} test functions and {len(requirements)} requirements.")
     console.print("Running LLM mapping...")
 
-    from plumb.programs import configure_dspy, run_chunked_mapper
+    from plumb.programs import configure_dspy, run_chunked_mapper, get_program_lm
     from plumb.programs.test_mapper import TestMapper
 
     configure_dspy()
     mapper = TestMapper()
+    override_lm = get_program_lm("test_mapper")
 
     req_json = json.dumps([{"id": r["id"], "text": r["text"]} for r in requirements])
     items = [(s["name"], json.dumps(s)) for s in test_summaries]
@@ -819,9 +820,16 @@ def _combine(chunk):
         return json.dumps([json.loads(t) for _, t in chunk])
 
     try:
-        mappings = run_chunked_mapper(
-            mapper, req_json, items, budget=60000, combine_fn=_combine,
-        )
+        if override_lm:
+            import dspy
+            with dspy.context(lm=override_lm):
+                mappings = run_chunked_mapper(
+                    mapper, req_json, items, budget=60000, combine_fn=_combine,
+                )
+        else:
+            mappings = run_chunked_mapper(
+                mapper, req_json, items, budget=60000, combine_fn=_combine,
+            )
     except Exception as e:
         console.print(f"[red]Mapping failed: {e}[/red]")
         raise SystemExit(1)
diff --git a/plumb/coverage_reporter.py b/plumb/coverage_reporter.py
index e65d277..203d965 100644
--- a/plumb/coverage_reporter.py
+++ b/plumb/coverage_reporter.py
@@ -329,11 +329,12 @@ def check_spec_to_code_coverage(
         return (0, len(requirements))
 
     # --- LLM mapping ---
-    from plumb.programs import configure_dspy, run_chunked_mapper
+    from plumb.programs import configure_dspy, run_chunked_mapper, get_program_lm
     from plumb.programs.code_coverage_mapper import CodeCoverageMapper
 
     configure_dspy()
     mapper = CodeCoverageMapper()
+    override_lm = get_program_lm("code_coverage_mapper", repo_root)
 
     if full_remap:
         dirty_reqs = requirements
@@ -351,10 +352,18 @@ def check_spec_to_code_coverage(
     def _combine(chunk):
         return "\n\n".join(text for _, text in chunk)
 
-    results = run_chunked_mapper(
-        mapper, req_json, items, budget=60000,
-        combine_fn=_combine, merge_fn=merge_coverage_results,
-    )
+    if override_lm:
+        import dspy
+        with dspy.context(lm=override_lm):
+            results = run_chunked_mapper(
+                mapper, req_json, items, budget=60000,
+                combine_fn=_combine, merge_fn=merge_coverage_results,
+            )
+    else:
+        results = run_chunked_mapper(
+            mapper, req_json, items, budget=60000,
+            combine_fn=_combine, merge_fn=merge_coverage_results,
+        )
 
     # Build fresh results dict from LLM output
     fresh_results: dict[str, dict] = {}
diff --git a/plumb/programs/claude_code_lm.py b/plumb/programs/claude_code_lm.py
index 561a52f..570fa9b 100644
--- a/plumb/programs/claude_code_lm.py
+++ b/plumb/programs/claude_code_lm.py
@@ -123,7 +123,7 @@ def forward(
 
         text_input = _serialize_messages(prompt, messages)
         input_len = len(text_input)
-        print(f"[ClaudeCodeLM] Calling claude -p ({input_len} chars)...", file=sys.stderr)
+        print(f"[ClaudeCodeLM] Calling claude -p --model {self.cli_model} ({input_len} chars)...", file=sys.stderr)
         response_text = _call_claude(text_input, model=self.cli_model, timeout=self.timeout)
         print(f"[ClaudeCodeLM] Got response ({len(response_text)} chars)", file=sys.stderr)
         return _make_response(response_text, self.model)
diff --git a/plumb/sync.py b/plumb/sync.py
index 076081e..0065440 100644
--- a/plumb/sync.py
+++ b/plumb/sync.py
@@ -146,7 +146,7 @@ def insert_new_sections(
 def parse_spec_files(repo_root: str | Path) -> list[dict]:
     """Read markdown spec files, run RequirementParser, assign stable IDs,
     write requirements.json."""
-    from plumb.programs import configure_dspy, run_with_retries
+    from plumb.programs import configure_dspy, run_with_retries, get_program_lm
     from plumb.programs.requirement_parser import RequirementParser
 
     repo_root = Path(repo_root)
@@ -169,6 +169,7 @@ def parse_spec_files(repo_root: str | Path) -> list[dict]:
 
     configure_dspy()
     parser = RequirementParser()
+    override_lm = get_program_lm("requirement_parser", repo_root)
 
     for spec_path_str in config.spec_paths:
         spec_path = repo_root / spec_path_str
@@ -182,7 +183,12 @@ def parse_spec_files(repo_root: str | Path) -> list[dict]:
         for md_file in md_files:
             content = md_file.read_text()
             try:
-                parsed = run_with_retries(parser, content)
+                if override_lm:
+                    import dspy
+                    with dspy.context(lm=override_lm):
+                        parsed = run_with_retries(parser, content)
+                else:
+                    parsed = run_with_retries(parser, content)
             except Exception:
                 continue
 
diff --git a/tests/test_program_model_overrides.py b/tests/test_program_model_overrides.py
new file mode 100644
index 0000000..435c9dd
--- /dev/null
+++ b/tests/test_program_model_overrides.py
@@ -0,0 +1,210 @@
+"""Tests that program_models config overrides actually reach the LLM call site.
+
+The contract: when a user puts an entry in program_models for a given program,
+that LM — not the global default — must be the one that receives the prompt.
+
+These tests don't verify get_program_lm() in isolation (that's in test_programs.py).
+They verify the end-to-end wiring: config → get_program_lm → dspy.context → program call.
+"""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import dspy
+import pytest
+
+from plumb.config import PlumbConfig, save_config, ensure_plumb_dir
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_repo_with_override(tmp_repo: Path, program_name: str, model: str) -> Path:
+    """Set up a plumb repo with a single program_models override."""
+    ensure_plumb_dir(tmp_repo)
+    cfg = PlumbConfig(
+        spec_paths=["spec.md"],
+        test_paths=["tests/"],
+        program_models={program_name: {"model": model}},
+    )
+    save_config(tmp_repo, cfg)
+    return tmp_repo
+
+
+def _make_requirements_file(repo: Path, reqs: list[dict]) -> None:
+    """Write a requirements.json that check_spec_to_code_coverage expects."""
+    req_path = repo / ".plumb" / "requirements.json"
+    req_path.write_text(json.dumps(reqs))
+
+
+def _make_source_file(repo: Path, name: str, content: str) -> None:
+    """Create a Python source file in the repo."""
+    path = repo / name
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(content)
+
+
+# ---------------------------------------------------------------------------
+# Core principle: the override LM must be the one that receives the call
+# ---------------------------------------------------------------------------
+
+
+class TestCoverageMapperUsesOverride:
+    """When program_models has 'code_coverage_mapper', coverage mapping
+    must use that LM, not the global default."""
+
+    def test_override_lm_receives_the_call(self, tmp_repo):
+        """The configured override LM should be invoked, not the default."""
+        repo = _make_repo_with_override(tmp_repo, "code_coverage_mapper", "anthropic/claude-haiku-4-5-20251001")
+        _make_requirements_file(repo, [
+            {"id": "req-1", "text": "The system must do X."},
+        ])
+        _make_source_file(repo, "app.py", "def do_x():\n    pass\n")
+
+        # Track which LM actually gets called
+        called_models = []
+
+        original_forward = dspy.Predict.forward
+
+        def tracking_forward(self, **kwargs):
+            # Inside dspy.context, dspy.settings.lm reflects the active LM
+            active_lm = dspy.settings.lm
+            called_models.append(active_lm.model)
+            # Return a plausible result so the pipeline doesn't crash
+            from plumb.programs.code_coverage_mapper import RequirementCoverage
+            mock_result = MagicMock()
+            mock_result.coverage = [
+                RequirementCoverage(requirement_id="req-1", implemented=False, evidence=""),
+            ]
+            return mock_result
+
+        with patch.object(dspy.Predict, "forward", tracking_forward), \
+             patch("plumb.programs.configure_dspy"), \
+             patch.dict("os.environ", {}, clear=True), \
+             patch("plumb.programs.claude_code_lm.find_claude_cli", return_value="/usr/bin/claude"):
+            import os
+            os.environ.pop("ANTHROPIC_API_KEY", None)
+
+            from plumb.coverage_reporter import check_spec_to_code_coverage
+            check_spec_to_code_coverage(repo, use_llm=True)
+
+        assert len(called_models) >= 1, "DSPy Predict was never called"
+        # The override model should have been used (ClaudeCodeLM strips 'anthropic/' prefix)
+        assert any("haiku" in m for m in called_models), (
+            f"Expected haiku override to be active, but saw: {called_models}"
+        )
+
+class TestTestMapperUsesOverride:
+    """When program_models has 'test_mapper', the test mapping command
+    must use that LM."""
+
+    def test_override_lm_receives_the_call(self, tmp_repo):
+        repo = _make_repo_with_override(tmp_repo, "test_mapper", "anthropic/claude-haiku-4-5-20251001")
+
+        called_models = []
+
+        def tracking_forward(self, **kwargs):
+            active_lm = dspy.settings.lm
+            called_models.append(active_lm.model)
+            mock_result = MagicMock()
+            mock_result.mappings = []
+            return mock_result
+
+        with patch.object(dspy.Predict, "forward", tracking_forward), \
+             patch("plumb.programs.configure_dspy"), \
+             patch.dict("os.environ", {}, clear=True), \
+             patch("plumb.programs.claude_code_lm.find_claude_cli", return_value="/usr/bin/claude"):
+            import os
+            os.environ.pop("ANTHROPIC_API_KEY", None)
+
+            from plumb.programs import run_chunked_mapper, get_program_lm
+            from plumb.programs.test_mapper import TestMapper
+
+            mapper = TestMapper()
+            override_lm = get_program_lm("test_mapper", repo)
+
+            assert override_lm is not None, "Override should have been returned"
+
+            req_json = json.dumps([{"id": "req-1", "text": "Must do X"}])
+            items = [("test_foo", json.dumps({"name": "test_foo", "file": "tests/test_foo.py"}))]
+
+            def _combine(chunk):
+                return json.dumps([json.loads(t) for _, t in chunk])
+
+            with dspy.context(lm=override_lm):
+                run_chunked_mapper(mapper, req_json, items, budget=60000, combine_fn=_combine)
+
+        assert len(called_models) >= 1
+        assert any("haiku" in m for m in called_models), (
+            f"Expected haiku override, but saw: {called_models}"
+        )
+
+
+class TestRequirementParserUsesOverride:
+    """When program_models has 'requirement_parser', spec parsing
+    must use that LM."""
+
+    def test_override_lm_receives_the_call(self, tmp_repo):
+        repo = _make_repo_with_override(tmp_repo, "requirement_parser", "anthropic/claude-haiku-4-5-20251001")
+
+        # Create a spec file the parser will read
+        spec = repo / "spec.md"
+        spec.write_text("# Spec\n\n## Features\n\nThe system must do X.\n")
+
+        called_models = []
+
+        def tracking_forward(self, **kwargs):
+            active_lm = dspy.settings.lm
+            called_models.append(active_lm.model)
+            from plumb.programs.requirement_parser import ParsedRequirement
+            mock_result = MagicMock()
+            mock_result.requirements = [
+                ParsedRequirement(text="The system must do X.", ambiguous=False),
+            ]
+            return mock_result
+
+        with patch.object(dspy.Predict, "forward", tracking_forward), \
+             patch("plumb.programs.configure_dspy"), \
+             patch.dict("os.environ", {}, clear=True), \
+             patch("plumb.programs.claude_code_lm.find_claude_cli", return_value="/usr/bin/claude"):
+            import os
+            os.environ.pop("ANTHROPIC_API_KEY", None)
+
+            from plumb.sync import parse_spec_files
+            parse_spec_files(repo)
+
+        assert len(called_models) >= 1
+        assert any("haiku" in m for m in called_models), (
+            f"Expected haiku override, but saw: {called_models}"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Negative case: override for one program must not leak to another
+# ---------------------------------------------------------------------------
+
+
+class TestOverrideIsolation:
+    """An override for program A must not affect program B."""
+
+    def test_coverage_mapper_override_does_not_affect_other_programs(self, tmp_repo):
+        """Configuring code_coverage_mapper should not change the LM for
+        requirement_parser."""
+        repo = _make_repo_with_override(tmp_repo, "code_coverage_mapper", "anthropic/claude-haiku-4-5-20251001")
+
+        from plumb.programs import get_program_lm
+
+        with patch.dict("os.environ", {}, clear=True), \
+             patch("plumb.programs.claude_code_lm.find_claude_cli", return_value="/usr/bin/claude"):
+            import os
+            os.environ.pop("ANTHROPIC_API_KEY", None)
+
+            coverage_lm = get_program_lm("code_coverage_mapper", repo)
+            parser_lm = get_program_lm("requirement_parser", repo)
+
+        assert coverage_lm is not None, "Coverage mapper override should exist"
+        assert parser_lm is None, "Requirement parser should have no override"

From 1159ced838fd493c7dade65ae664639aca5f768f Mon Sep 17 00:00:00 2001
From: KT <khaledt@google.com>
Date: Mon, 30 Mar 2026 19:02:08 -0400
Subject: [PATCH 08/12] feat: add configurable token budget per program via
 program_models config

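Adds an optional "budget" field to program_models entries so users can
control chunking granularity for test_mapper and code_coverage_mapper;
the budget stays at 60000 when the field is absent. Lower budgets create
more parallel chunks, which can improve latency for CLI-based LLM calls.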

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 plumb/cli.py               |  8 +++++---
 plumb/coverage_reporter.py |  8 +++++---
 plumb/programs/__init__.py | 11 ++++++++---
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/plumb/cli.py b/plumb/cli.py
index 34113fb..7e3036f 100644
--- a/plumb/cli.py
+++ b/plumb/cli.py
@@ -806,12 +806,14 @@ def map_tests(dry_run):
     console.print(f"Found {len(test_summaries)} test functions and {len(requirements)} requirements.")
     console.print("Running LLM mapping...")
 
-    from plumb.programs import configure_dspy, run_chunked_mapper, get_program_lm
+    from plumb.programs import configure_dspy, run_chunked_mapper, get_program_lm, get_program_config
     from plumb.programs.test_mapper import TestMapper
 
     configure_dspy()
     mapper = TestMapper()
     override_lm = get_program_lm("test_mapper")
+    prog_cfg = get_program_config("test_mapper") or {}
+    budget = prog_cfg.get("budget", 60000)
 
     req_json = json.dumps([{"id": r["id"], "text": r["text"]} for r in requirements])
     items = [(s["name"], json.dumps(s)) for s in test_summaries]
@@ -824,11 +826,11 @@ def _combine(chunk):
             import dspy
             with dspy.context(lm=override_lm):
                 mappings = run_chunked_mapper(
-                    mapper, req_json, items, budget=60000, combine_fn=_combine,
+                    mapper, req_json, items, budget=budget, combine_fn=_combine,
                 )
         else:
             mappings = run_chunked_mapper(
-                mapper, req_json, items, budget=60000, combine_fn=_combine,
+                mapper, req_json, items, budget=budget, combine_fn=_combine,
             )
     except Exception as e:
         console.print(f"[red]Mapping failed: {e}[/red]")
diff --git a/plumb/coverage_reporter.py b/plumb/coverage_reporter.py
index 203d965..3da9993 100644
--- a/plumb/coverage_reporter.py
+++ b/plumb/coverage_reporter.py
@@ -329,12 +329,14 @@ def check_spec_to_code_coverage(
         return (0, len(requirements))
 
     # --- LLM mapping ---
-    from plumb.programs import configure_dspy, run_chunked_mapper, get_program_lm
+    from plumb.programs import configure_dspy, run_chunked_mapper, get_program_lm, get_program_config
     from plumb.programs.code_coverage_mapper import CodeCoverageMapper
 
     configure_dspy()
     mapper = CodeCoverageMapper()
     override_lm = get_program_lm("code_coverage_mapper", repo_root)
+    prog_cfg = get_program_config("code_coverage_mapper", repo_root) or {}
+    budget = prog_cfg.get("budget", 60000)
 
     if full_remap:
         dirty_reqs = requirements
@@ -356,12 +358,12 @@ def _combine(chunk):
         import dspy
         with dspy.context(lm=override_lm):
             results = run_chunked_mapper(
-                mapper, req_json, items, budget=60000,
+                mapper, req_json, items, budget=budget,
                 combine_fn=_combine, merge_fn=merge_coverage_results,
             )
     else:
         results = run_chunked_mapper(
-            mapper, req_json, items, budget=60000,
+            mapper, req_json, items, budget=budget,
             combine_fn=_combine, merge_fn=merge_coverage_results,
         )
 
diff --git a/plumb/programs/__init__.py b/plumb/programs/__init__.py
index 21a3b0f..43f80f2 100644
--- a/plumb/programs/__init__.py
+++ b/plumb/programs/__init__.py
@@ -75,8 +75,8 @@ def validate_api_access() -> None:
         raise PlumbAuthError(f"Failed to verify LLM access: {e}") from e
 
 
-def get_program_lm(program_name: str, repo_root: str | Path | None = None) -> BaseLM | None:
-    """Return a per-program LM override from config, or None for the default."""
+def get_program_config(program_name: str, repo_root: str | Path | None = None) -> dict | None:
+    """Return the raw program_models entry for a program, or None."""
     from plumb.config import find_repo_root, load_config
 
     if repo_root is None:
@@ -86,7 +86,12 @@ def get_program_lm(program_name: str, repo_root: str | Path | None = None) -> Ba
     cfg = load_config(repo_root)
     if cfg is None:
         return None
-    entry = cfg.program_models.get(program_name)
+    return cfg.program_models.get(program_name)
+
+
+def get_program_lm(program_name: str, repo_root: str | Path | None = None) -> BaseLM | None:
+    """Return a per-program LM override from config, or None for the default."""
+    entry = get_program_config(program_name, repo_root)
     if entry is None:
         return None
     model = entry.get("model")

From 51e233b2c91ffa2d9bd2f74b9ca69f87f9ccfb08 Mon Sep 17 00:00:00 2001
From: KT <khaledt@google.com>
Date: Tue, 31 Mar 2026 08:55:57 -0400
Subject: [PATCH 09/12] chore: add .DS_Store to .gitignore

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .gitignore | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 8e979e9..3b7edae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,5 @@ __pycache__/
 *.pyc
 .coverage
 .pytest_cache/
-.env
\ No newline at end of file
+.env
+.DS_Store
\ No newline at end of file

From 33247ad411cccc1dad75a588e899304a96f772ef Mon Sep 17 00:00:00 2001
From: KT <khaledt@google.com>
Date: Tue, 31 Mar 2026 14:11:48 -0400
Subject: [PATCH 10/12] fix: isolate claude subprocess from worktree cwd and
 add PLUMB_SKIP hook guard

_call_claude() now runs subprocess.run with cwd=tempfile.gettempdir() to
prevent Claude Code plugin init from corrupting git worktree indexes.
The generated pre-commit hook now exits 0 before running `plumb hook`
when PLUMB_SKIP=1, giving users an escape hatch.

Fixes ktinubu/plumb#1

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 plumb/cli.py                     |  4 ++--
 plumb/programs/claude_code_lm.py |  2 ++
 tests/test_claude_code_lm.py     | 13 +++++++++++++
 tests/test_cli.py                | 16 ++++++++++++++++
 4 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/plumb/cli.py b/plumb/cli.py
index 7e3036f..c9fcd99 100644
--- a/plumb/cli.py
+++ b/plumb/cli.py
@@ -115,7 +115,7 @@ def _init_clone_setup(repo_root: Path, cfg: PlumbConfig) -> None:
         hooks_dir = repo_root / ".git" / "hooks"
         hooks_dir.mkdir(exist_ok=True)
         hook_path = hooks_dir / "pre-commit"
-        hook_path.write_text("#!/bin/sh\nplumb hook\nexit $?\n")
+        hook_path.write_text('#!/bin/sh\n[ "$PLUMB_SKIP" = "1" ] && exit 0\nplumb hook\nexit $?\n')
         hook_path.chmod(0o755)
         post_commit_path = hooks_dir / "post-commit"
         post_commit_path.write_text("#!/bin/sh\nplumb post-commit\n")
@@ -249,7 +249,7 @@ def init():
         hooks_dir = repo_root / ".git" / "hooks"
         hooks_dir.mkdir(exist_ok=True)
         hook_path = hooks_dir / "pre-commit"
-        hook_path.write_text("#!/bin/sh\nplumb hook\nexit $?\n")
+        hook_path.write_text('#!/bin/sh\n[ "$PLUMB_SKIP" = "1" ] && exit 0\nplumb hook\nexit $?\n')
         hook_path.chmod(0o755)
         post_commit_path = hooks_dir / "post-commit"
         post_commit_path.write_text("#!/bin/sh\nplumb post-commit\n")
diff --git a/plumb/programs/claude_code_lm.py b/plumb/programs/claude_code_lm.py
index 570fa9b..aedeeff 100644
--- a/plumb/programs/claude_code_lm.py
+++ b/plumb/programs/claude_code_lm.py
@@ -11,6 +11,7 @@
 import os
 import shutil
 import subprocess
+import tempfile
 from types import SimpleNamespace
 from typing import Any
 
@@ -44,6 +45,7 @@ def _call_claude(prompt: str, model: str | None = None, timeout: int = 300) -> s
         text=True,
         env=env,
         timeout=timeout,
+        cwd=tempfile.gettempdir(),
     )
     if result.returncode != 0:
         raise RuntimeError(
diff --git a/tests/test_claude_code_lm.py b/tests/test_claude_code_lm.py
index 52000e3..28be342 100644
--- a/tests/test_claude_code_lm.py
+++ b/tests/test_claude_code_lm.py
@@ -125,6 +125,19 @@ def test_raises_on_timeout(self):
             with pytest.raises(subprocess.TimeoutExpired):
                 _call_claude("test")
 
+    def test_runs_in_temp_directory_not_inherited_cwd(self):
+        """subprocess.run must set cwd to a temp dir to avoid corrupting
+        a git worktree's index when Claude Code plugins run git operations."""
+        import tempfile
+
+        mock_result = subprocess.CompletedProcess(
+            args=["claude"], returncode=0, stdout="ok", stderr=""
+        )
+        with patch("subprocess.run", return_value=mock_result) as mock_run:
+            _call_claude("test")
+            cwd = mock_run.call_args[1]["cwd"]
+            assert cwd == tempfile.gettempdir()
+
 
 class TestClaudeCodeLM:
     def test_is_base_lm_subclass(self):
diff --git a/tests/test_cli.py b/tests/test_cli.py
index a20b46e..3e3b72a 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -50,6 +50,22 @@ def test_successful_init(self, runner, tmp_repo):
         hook = tmp_repo / ".git" / "hooks" / "pre-commit"
         assert os.access(str(hook), os.X_OK)
 
+    def test_pre_commit_hook_checks_plumb_skip(self, runner, tmp_repo):
+        """The pre-commit hook must exit 0 when PLUMB_SKIP=1 so users
+        can bypass Plumb in worktrees or automated scripts."""
+        spec = tmp_repo / "spec.md"
+        spec.write_text("# Spec\n")
+        (tmp_repo / "tests").mkdir(exist_ok=True)
+
+        with patch("plumb.cli.find_repo_root", return_value=tmp_repo), \
+             patch("plumb.sync.parse_spec_files", return_value=[]):
+            runner.invoke(cli, ["init"], input="spec.md\ntests/\n")
+
+        hook = tmp_repo / ".git" / "hooks" / "pre-commit"
+        content = hook.read_text()
+        assert 'PLUMB_SKIP' in content
+        assert 'exit 0' in content.split('PLUMB_SKIP')[1].split('\n')[0]
+
 
 class TestInitPlumbignore:
     def test_init_creates_plumbignore(self, runner, tmp_repo):

From 2cfd27b8877ed6073ed53e6ca8eb76fec40b4998 Mon Sep 17 00:00:00 2001
From: KT <khaledt@google.com>
Date: Tue, 31 Mar 2026 17:38:38 -0400
Subject: [PATCH 11/12] test: add e2e tests for worktree index integrity with
 claude -p

Two integration tests (real git, real claude -p, real worktree, no mocks):

- Test A: claude -p called directly from pre-commit hook
- Test B: plumb hook called from pre-commit hook (full code path)

Both assert the desired behavior: index stays intact and commit succeeds.
Both tests currently FAIL because _call_claude() does not strip
GIT_INDEX_FILE from the environment that claude -p inherits.

Refs ktinubu/plumb#1

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 tests/test_worktree_index_corruption.py | 180 ++++++++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 tests/test_worktree_index_corruption.py

diff --git a/tests/test_worktree_index_corruption.py b/tests/test_worktree_index_corruption.py
new file mode 100644
index 0000000..3f0dfc6
--- /dev/null
+++ b/tests/test_worktree_index_corruption.py
@@ -0,0 +1,180 @@
+"""E2E tests: git worktree index corruption caused by GIT_INDEX_FILE inheritance.
+
+During git commit in a worktree, git sets GIT_INDEX_FILE to the worktree's
+index path. claude -p inherits this env var, and Claude Code's plugin init
+runs git operations that overwrite the worktree's index with plugin cache
+entries. Result: "error: Error building trees" and a destroyed index.
+
+Test A: Pre-commit hook calls claude -p directly.
+Test B: Pre-commit hook calls plumb hook (the real code path).
+
+All tests: real git, real claude -p, real worktree, real commit. No mocks.
+Marked slow — requires claude CLI installed and authenticated.
+
+See: https://github.com/ktinubu/plumb/issues/1
+"""
+
+import shutil
+import subprocess
+from datetime import datetime, timezone
+from pathlib import Path
+
+import pytest
+from git import Repo
+
+from plumb.config import PlumbConfig, save_config, ensure_plumb_dir
+
+needs_claude_cli = pytest.mark.skipif(
+    shutil.which("claude") is None,
+    reason="claude CLI not installed",
+)
+
+
+def _create_repo_with_worktree(tmp_path, num_files=20):
+    """Create a main repo with files and a worktree.
+
+    Returns (main_repo_path, worktree_path).
+    """
+    main_dir = tmp_path / "main-repo"
+    main_dir.mkdir()
+    repo = Repo.init(main_dir)
+
+    for i in range(num_files):
+        (main_dir / f"file_{i}.txt").write_text(f"content {i}\n")
+    repo.index.add([f"file_{i}.txt" for i in range(num_files)])
+    repo.index.commit("initial commit")
+
+    wt_dir = tmp_path / "worktree"
+    repo.git.worktree("add", str(wt_dir), "-b", "wt-branch", "HEAD")
+
+    return main_dir, wt_dir
+
+
+def _count_index_entries(repo_path):
+    """Return the number of entries in the git index."""
+    result = subprocess.run(
+        ["git", "ls-files"],
+        cwd=str(repo_path),
+        capture_output=True,
+        text=True,
+    )
+    lines = result.stdout.strip().splitlines()
+    return len(lines) if lines != [""] else 0
+
+
+def _install_hook(main_repo_path, hook_script):
+    """Install a pre-commit hook in the main repo (shared with worktrees)."""
+    hooks_dir = main_repo_path / ".git" / "hooks"
+    hooks_dir.mkdir(exist_ok=True)
+    hook_path = hooks_dir / "pre-commit"
+    hook_path.write_text(hook_script)
+    hook_path.chmod(0o755)
+
+
+def _stage_and_commit(wt_dir):
+    """Stage a new file and attempt git commit. Returns CompletedProcess."""
+    (wt_dir / "new_file.txt").write_text("trigger commit\n")
+    subprocess.run(["git", "add", "new_file.txt"], cwd=str(wt_dir))
+    return subprocess.run(
+        ["git", "commit", "-m", "test commit"],
+        cwd=str(wt_dir),
+        capture_output=True,
+        text=True,
+        timeout=300,
+    )
+
+
+def _init_plumb(repo_path):
+    """Initialize plumb in a repo programmatically (same as plumb init)."""
+    ensure_plumb_dir(repo_path)
+    (repo_path / ".plumb" / "decisions").mkdir(exist_ok=True)
+
+    spec = repo_path / "spec.md"
+    spec.write_text("# Spec\n\n## Features\n\nThe system must do X.\n")
+
+    tests_dir = repo_path / "tests"
+    tests_dir.mkdir(exist_ok=True)
+
+    cfg = PlumbConfig(
+        spec_paths=["spec.md"],
+        test_paths=["tests/"],
+        initialized_at=datetime.now(timezone.utc).isoformat(),
+    )
+    save_config(repo_path, cfg)
+
+    # Install the real plumb pre-commit hook (same string as cli.py)
+    hooks_dir = repo_path / ".git" / "hooks"
+    hooks_dir.mkdir(exist_ok=True)
+    hook_path = hooks_dir / "pre-commit"
+    hook_path.write_text(
+        '#!/bin/sh\n[ "$PLUMB_SKIP" = "1" ] && exit 0\nplumb hook\nexit $?\n'
+    )
+    hook_path.chmod(0o755)
+
+
+@pytest.mark.slow
+@needs_claude_cli
+class TestClaudePWorktreeIndex:
+    """Test A: claude -p called directly from a pre-commit hook."""
+
+    def test_commit_succeeds_with_index_intact(self, tmp_path):
+        """git commit in a worktree must succeed with index intact when
+        claude -p is called from the pre-commit hook."""
+        main_dir, wt_dir = _create_repo_with_worktree(tmp_path)
+        baseline = _count_index_entries(wt_dir)
+        assert baseline == 20
+
+        _install_hook(
+            main_dir,
+            '#!/bin/sh\necho "say hello" | claude -p --output-format text >/dev/null 2>&1\nexit 0\n',
+        )
+
+        result = _stage_and_commit(wt_dir)
+        after = _count_index_entries(wt_dir)
+
+        assert result.returncode == 0, (
+            f"Commit failed: {result.stderr[:300]}"
+        )
+        assert after == baseline + 1, (
+            f"Expected {baseline + 1} index entries (original + new file), got {after}"
+        )
+
+
+@pytest.mark.slow
+@needs_claude_cli
+class TestPlumbHookWorktreeIndex:
+    """Test B: plumb hook called from a pre-commit hook (real code path)."""
+
+    def test_commit_succeeds_with_index_intact(self, tmp_path):
+        """git commit in a worktree must succeed with index intact when
+        plumb hook calls _call_claude() during pre-commit."""
+        main_dir, wt_dir = _create_repo_with_worktree(tmp_path)
+
+        # plumb init needs to happen in the main repo (worktree shares hooks)
+        _init_plumb(main_dir)
+
+        # Commit plumb files so they appear in the worktree
+        repo = Repo(main_dir)
+        repo.index.add([
+            ".plumb/config.json",
+            "spec.md",
+        ])
+        repo.index.commit("add plumb config")
+
+        # Pull the new commit into the worktree
+        wt_repo = Repo(wt_dir)
+        wt_repo.git.merge("main", "--no-edit")
+
+        baseline = _count_index_entries(wt_dir)
+
+        result = _stage_and_commit(wt_dir)
+        after = _count_index_entries(wt_dir)
+
+        assert after == baseline + 1, (
+            f"Expected {baseline + 1} index entries (original + new file), got {after}. "
+            f"rc={result.returncode}, stderr={result.stderr[:300]}"
+        )
+        # plumb hook may return non-zero (pending decisions), but the index must not be corrupted
+        assert "Error building trees" not in result.stderr, (
+            f"Index was corrupted: {result.stderr[:300]}"
+        )

From a0dd821de661e12ba45aaac6008bfd0c2664a24f Mon Sep 17 00:00:00 2001
From: KT <khaledt@google.com>
Date: Tue, 31 Mar 2026 18:24:54 -0400
Subject: [PATCH 12/12] fix: strip repo-local GIT_* env vars from claude -p
 subprocess

Root cause: during git commit in a worktree, git sets GIT_INDEX_FILE
and GIT_DIR. claude -p inherits these and its plugin init overwrites
the worktree's index with plugin cache entries (~130 entries).

Strips all repo-local GIT_* env vars from the subprocess env in
_call_claude(), keeping only safe transport/config vars (GIT_SSH,
GIT_CONFIG_*, etc.). Pattern from pre-commit framework's no_git_env().
Removes ineffective cwd=tempfile.gettempdir().

Three e2e tests (real git, real claude -p, real worktree, no mocks):
- Test A: proves raw claude -p corrupts the index (upstream bug)
- Test B: proves shell-level env stripping prevents corruption
- Test C: proves plumb hook protects the index via _call_claude() fix

Fixes ktinubu/plumb#1

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 plumb/programs/claude_code_lm.py        | 35 ++++++++++---
 tests/test_claude_code_lm.py            | 35 ++++++++++---
 tests/test_worktree_index_corruption.py | 65 ++++++++++++++++++++-----
 3 files changed, 109 insertions(+), 26 deletions(-)

diff --git a/plumb/programs/claude_code_lm.py b/plumb/programs/claude_code_lm.py
index aedeeff..03d06d6 100644
--- a/plumb/programs/claude_code_lm.py
+++ b/plumb/programs/claude_code_lm.py
@@ -11,7 +11,6 @@
 import os
 import shutil
 import subprocess
-import tempfile
 from types import SimpleNamespace
 from typing import Any
 
@@ -25,18 +24,43 @@ def find_claude_cli() -> str | None:
     return shutil.which("claude")
 
 
+# GIT_* env vars that are safe to pass through to claude -p.
+# Everything else starting with GIT_ is stripped to prevent claude -p's
+# plugin init from corrupting a worktree's git index during pre-commit hooks.
+# Pattern from pre-commit framework:
+# https://github.com/pre-commit/pre-commit/blob/ec1928f37e8abd7bab0b7ed29a031e5fd8875be7/pre_commit/git.py#L27
+_GIT_ENV_WHITELIST = {
+    "GIT_EXEC_PATH",
+    "GIT_SSH",
+    "GIT_SSH_COMMAND",
+    "GIT_SSL_CAINFO",
+    "GIT_SSL_NO_VERIFY",
+    "GIT_CONFIG_COUNT",
+    "GIT_HTTP_PROXY_AUTHMETHOD",
+    "GIT_ALLOW_PROTOCOL",
+    "GIT_ASKPASS",
+}
+
+
 def _call_claude(prompt: str, model: str | None = None, timeout: int = 300) -> str:
     """Run ``claude -p`` with *prompt* on stdin and return the text response.
 
-    Strips the ``CLAUDECODE`` env var to allow nesting inside a Claude Code
-    session (the guard is for interactive terminal conflicts; programmatic
-    subprocess usage is safe).
+    Strips ``CLAUDECODE`` and repo-local ``GIT_*`` env vars so that claude -p's
+    plugin init cannot corrupt a worktree's git index during pre-commit hooks.
+    See https://github.com/ktinubu/plumb/issues/1.
     """
     cmd = ["claude", "-p", "--output-format", "text"]
     if model:
         cmd.extend(["--model", model])
 
-    env = {k: v for k, v in os.environ.items() if k != "CLAUDECODE"}
+    env = {
+        k: v for k, v in os.environ.items()
+        if k != "CLAUDECODE" and (
+            not k.startswith("GIT_")
+            or k.startswith(("GIT_CONFIG_KEY_", "GIT_CONFIG_VALUE_"))
+            or k in _GIT_ENV_WHITELIST
+        )
+    }
 
     result = subprocess.run(
         cmd,
@@ -45,7 +69,6 @@ def _call_claude(prompt: str, model: str | None = None, timeout: int = 300) -> s
         text=True,
         env=env,
         timeout=timeout,
-        cwd=tempfile.gettempdir(),
     )
     if result.returncode != 0:
         raise RuntimeError(
diff --git a/tests/test_claude_code_lm.py b/tests/test_claude_code_lm.py
index 28be342..2ef67f7 100644
--- a/tests/test_claude_code_lm.py
+++ b/tests/test_claude_code_lm.py
@@ -125,18 +125,37 @@ def test_raises_on_timeout(self):
             with pytest.raises(subprocess.TimeoutExpired):
                 _call_claude("test")
 
-    def test_runs_in_temp_directory_not_inherited_cwd(self):
-        """subprocess.run must set cwd to a temp dir to avoid corrupting
-        a git worktree's index when Claude Code plugins run git operations."""
-        import tempfile
-
+    def test_strips_repo_local_git_env_vars(self):
+        """Repo-local GIT_* vars must be stripped to prevent claude -p
+        from corrupting a worktree's git index during pre-commit hooks.
+        Safe transport/config vars (GIT_SSH, GIT_CONFIG_*, etc.) are kept."""
         mock_result = subprocess.CompletedProcess(
             args=["claude"], returncode=0, stdout="ok", stderr=""
         )
-        with patch("subprocess.run", return_value=mock_result) as mock_run:
+        with patch("subprocess.run", return_value=mock_result) as mock_run, \
+             patch.dict("os.environ", {
+                 "GIT_INDEX_FILE": "/tmp/.git/worktrees/wt/index",
+                 "GIT_DIR": "/tmp/.git/worktrees/wt",
+                 "GIT_WORK_TREE": "/tmp/worktree",
+                 "GIT_SSH_COMMAND": "ssh -i ~/.ssh/id_rsa",
+                 "GIT_CONFIG_COUNT": "1",
+                 "GIT_CONFIG_KEY_0": "user.name",
+                 "GIT_CONFIG_VALUE_0": "Test",
+                 "PATH": "/usr/bin",
+             }):
             _call_claude("test")
-            cwd = mock_run.call_args[1]["cwd"]
-            assert cwd == tempfile.gettempdir()
+            env = mock_run.call_args[1]["env"]
+            # Repo-local vars stripped
+            assert "GIT_INDEX_FILE" not in env
+            assert "GIT_DIR" not in env
+            assert "GIT_WORK_TREE" not in env
+            # Transport/config vars kept
+            assert env["GIT_SSH_COMMAND"] == "ssh -i ~/.ssh/id_rsa"
+            assert env["GIT_CONFIG_COUNT"] == "1"
+            assert env["GIT_CONFIG_KEY_0"] == "user.name"
+            assert env["GIT_CONFIG_VALUE_0"] == "Test"
+            # Non-GIT vars kept
+            assert "PATH" in env
 
 
 class TestClaudeCodeLM:
diff --git a/tests/test_worktree_index_corruption.py b/tests/test_worktree_index_corruption.py
index 3f0dfc6..5026596 100644
--- a/tests/test_worktree_index_corruption.py
+++ b/tests/test_worktree_index_corruption.py
@@ -115,11 +115,17 @@ def _init_plumb(repo_path):
 @pytest.mark.slow
 @needs_claude_cli
 class TestClaudePWorktreeIndex:
-    """Test A: claude -p called directly from a pre-commit hook."""
+    """Test A: claude -p called directly from a pre-commit hook.
 
-    def test_commit_succeeds_with_index_intact(self, tmp_path):
-        """git commit in a worktree must succeed with index intact when
-        claude -p is called from the pre-commit hook."""
+    Documents the upstream Claude Code CLI bug: claude -p corrupts
+    worktree indexes when GIT_INDEX_FILE is inherited. This test
+    asserts the buggy behavior, so it will break (start failing)
+    if/when Claude Code fixes the upstream issue.
+    """
+
+    def test_raw_claude_p_corrupts_worktree_index(self, tmp_path):
+        """claude -p called directly from a hook (no plumb) corrupts
+        the worktree index — this is an upstream Claude Code bug."""
         main_dir, wt_dir = _create_repo_with_worktree(tmp_path)
         baseline = _count_index_entries(wt_dir)
         assert baseline == 20
@@ -132,28 +138,65 @@ def test_commit_succeeds_with_index_intact(self, tmp_path):
         result = _stage_and_commit(wt_dir)
         after = _count_index_entries(wt_dir)
 
+        # Upstream bug: claude -p corrupts the index (intact == baseline + 1)
+        assert after != baseline + 1, (
+            "Expected corruption (upstream bug) but index stayed intact. "
+            "If this fails, Claude Code may have fixed the upstream issue!"
+        )
+        assert result.returncode != 0, (
+            "Expected commit failure (upstream bug) but it succeeded. "
+            "If this fails, Claude Code may have fixed the upstream issue!"
+        )
+
+
+@pytest.mark.slow
+@needs_claude_cli
+class TestShellLevelStrippingPreventsCorruption:
+    """Test B: stripping GIT_INDEX_FILE and GIT_DIR at the shell level
+    before calling claude -p prevents the corruption."""
+
+    def test_unset_git_env_vars_before_claude_p(self, tmp_path):
+        """Unsetting GIT_INDEX_FILE and GIT_DIR in the hook script
+        before calling claude -p keeps the index intact."""
+        main_dir, wt_dir = _create_repo_with_worktree(tmp_path)
+        baseline = _count_index_entries(wt_dir)
+        assert baseline == 20
+
+        _install_hook(
+            main_dir,
+            '#!/bin/sh\n'
+            'env -u GIT_INDEX_FILE -u GIT_DIR '
+            'sh -c \'echo "say hello" | claude -p --output-format text >/dev/null 2>&1\'\n'
+            'exit 0\n',
+        )
+
+        result = _stage_and_commit(wt_dir)
+        after = _count_index_entries(wt_dir)
+
         assert result.returncode == 0, (
             f"Commit failed: {result.stderr[:300]}"
         )
         assert after == baseline + 1, (
-            f"Expected {baseline + 1} index entries (original + new file), got {after}"
+            f"Expected {baseline + 1} index entries, got {after}"
         )
 
 
 @pytest.mark.slow
 @needs_claude_cli
 class TestPlumbHookWorktreeIndex:
-    """Test B: plumb hook called from a pre-commit hook (real code path)."""
+    """Test C: plumb hook called from a pre-commit hook (real code path).
+
+    Verifies that plumb's fix (stripping GIT_INDEX_FILE/GIT_DIR) protects
+    the worktree index when plumb hook runs during git commit.
+    """
 
     def test_commit_succeeds_with_index_intact(self, tmp_path):
         """git commit in a worktree must succeed with index intact when
         plumb hook calls _call_claude() during pre-commit."""
         main_dir, wt_dir = _create_repo_with_worktree(tmp_path)
 
-        # plumb init needs to happen in the main repo (worktree shares hooks)
         _init_plumb(main_dir)
 
-        # Commit plumb files so they appear in the worktree
         repo = Repo(main_dir)
         repo.index.add([
             ".plumb/config.json",
@@ -161,7 +204,6 @@ def test_commit_succeeds_with_index_intact(self, tmp_path):
         ])
         repo.index.commit("add plumb config")
 
-        # Pull the new commit into the worktree
         wt_repo = Repo(wt_dir)
         wt_repo.git.merge("main", "--no-edit")
 
@@ -172,9 +214,8 @@ def test_commit_succeeds_with_index_intact(self, tmp_path):
 
         assert after == baseline + 1, (
             f"Expected {baseline + 1} index entries (original + new file), got {after}. "
-            f"rc={result.returncode}, stderr={result.stderr[:300]}"
+            f"rc={result.returncode}, stderr={result.stderr[:1000]}"
         )
-        # plumb hook may return non-zero (pending decisions), but the index must not be corrupted
         assert "Error building trees" not in result.stderr, (
-            f"Index was corrupted: {result.stderr[:300]}"
+            f"Index was corrupted: {result.stderr[:1000]}"
         )