diff --git a/backend/app/llm/agent.py b/backend/app/llm/agent.py index 1d7ddcb..f601e47 100644 --- a/backend/app/llm/agent.py +++ b/backend/app/llm/agent.py @@ -11,7 +11,7 @@ import os from pathlib import Path -from typing import Any, Awaitable, Callable, Dict, List +from typing import Any, Awaitable, Callable, Dict, List, Optional from app import tools as tools_module from app.llm.context import build as build_context @@ -22,6 +22,7 @@ ) from app.llm.providers import build_provider from app.tools import set_session_context +from app.tools.policy import is_blocked, needs_approval EmitFn = Callable[[Dict[str, Any]], Awaitable[None]] @@ -38,6 +39,19 @@ def _max_rounds() -> int: MAX_TOOL_ROUNDS = _max_rounds() + +# Tools blocked while a session is in plan mode. The model can still +# explore (Read/Grep/Glob/WebSearch) but can't modify state or reach +# out to the network until the user accepts / rejects the plan. +_PLAN_MODE_BLOCKED_TOOLS = frozenset({ + "Bash", + "Write", + "Edit", + "MultiEdit", + "NotebookEdit", + "WebFetch", +}) + DEFAULT_MODELS = { "claude": "claude-sonnet-4-6", "openai": "gpt-4o", @@ -61,6 +75,10 @@ async def run_turn( is_first_turn: bool, attachments: List[Dict[str, Any]] = None, # type: ignore[assignment] ask_user: Any = None, + approve_tool: Any = None, + is_tool_auto_approved: Any = None, + enter_plan_mode: Any = None, + is_in_plan_mode: Any = None, ) -> None: """Execute one user → agent turn. Mutates ``history`` in place.""" set_session_context({ @@ -69,6 +87,7 @@ async def run_turn( "api_key": api_key, "emit": emit, "ask_user": ask_user, + "enter_plan_mode": enter_plan_mode, }) ctx = build_context(folder, agent_kind, model) @@ -163,7 +182,13 @@ async def run_turn( if not result["tool_uses"]: return # turn complete - # Execute tools and feed back as tool_result blocks. + # Execute tools and feed back as tool_result blocks. 
Before + # each call, consult the tool policy: "allow" runs free, "ask" + # pauses for user approval via the runtime, "deny" is a + # hard-coded no-op (reserved for future policy-file blocks). + # A denied tool gets a synthetic error tool_result so the + # model sees the refusal and can course-correct rather than + # the turn crashing. tool_result_blocks: List[Dict[str, Any]] = [] for tu in result["tool_uses"]: await emit({ @@ -172,14 +197,56 @@ async def run_turn( "tool": tu["name"], "input": tu["input"], }) - try: - output = await tools_module.execute( - tu["name"], tu["input"], folder + + approved = True + denied_reason: Optional[str] = None + if is_blocked(tu["name"]): + approved = False + denied_reason = ( + f"Tool '{tu['name']}' is blocked by policy." + ) + elif ( + is_in_plan_mode is not None + and is_in_plan_mode() + and tu["name"] in _PLAN_MODE_BLOCKED_TOOLS + ): + approved = False + denied_reason = ( + f"Session is in PLAN MODE; '{tu['name']}' is disabled " + "until the user accepts or rejects the plan. Use " + "read-only tools (Read, Grep, Glob, WebSearch) to " + "refine the plan, or wait for the user's decision." ) - is_error = False - except Exception as exc: - output = f"Tool execution error: {exc}" + elif needs_approval(tu["name"]): + already_remembered = bool( + is_tool_auto_approved + and is_tool_auto_approved(tu["name"]) + ) + if not already_remembered and approve_tool is not None: + decision = await approve_tool( + tu["id"], tu["name"], tu["input"] + ) + approved = bool(decision.get("approved")) + if not approved: + denied_reason = ( + "User denied this tool call. Do not retry " + "the same call; ask the user what they " + "want, or try a different approach." + ) + + if approved: + try: + output = await tools_module.execute( + tu["name"], tu["input"], folder + ) + is_error = False + except Exception as exc: + output = f"Tool execution error: {exc}" + is_error = True + else: + output = denied_reason or "Tool call not approved." 
is_error = True + await emit({ "type": "tool.call.result", "call_id": tu["id"], diff --git a/backend/app/llm/prompts.py b/backend/app/llm/prompts.py index 97d5515..dfa07d9 100644 --- a/backend/app/llm/prompts.py +++ b/backend/app/llm/prompts.py @@ -67,6 +67,80 @@ Avoid open-ended clarifying questions like "what do you want me to do?" — give the user a short menu when you can. +# Execute, don't just describe + +When the user asks for data or state the tools can fetch (list rows, read a file, run a command, query an API), EXECUTE the fetch and report the actual result. Do not write a documentation-style answer ("here's the endpoint, here's the curl, here's the SQL you would run") as a substitute for running the thing — that's strictly worse than just running it. + +Examples: + +- user: "list all the users in the db" → read the connection info (e.g. from `.env`), run the query, summarize the result. Don't reply with a curl example. +- user: "what's in this folder?" → run `ls` or Glob. Don't describe how to list files. +- user: "is the server running?" → run the check (`curl`, `ps`, `lsof`). Don't tell them how to check. + +Only fall back to a documentation answer if (a) the user explicitly asked for docs / a recipe, or (b) you genuinely lack a tool to run it (no shell, no network, no credentials reachable). + +# Summarize large result sets + +When a fetch returns more than ~50 rows (or lines, or entries), do NOT paste them all into the reply. Roll them up: counts by category, min/max/recent, a small table of headline numbers, or first-N plus a total. Then offer 2-4 concrete follow-up "cuts" the user is likely to want next (filter by X, restrict to time window Y, drill into id Z). + +# Tabular data → markdown tables, not raw terminal output + +The UI renders markdown tables as real HTML `<table>` elements with proper column alignment, a header row, and horizontal scroll when wide. Use GitHub-flavored pipe syntax for any tabular result.
Do NOT paste psql / `column -t` / space-padded terminal output into a code block — column widths drift when any cell is longer than its peers, and the result is unreadable. + + +✓ Good — markdown table: +| id | name | email | role | created_at | +|----|---------|-------------------------|------|----------------------| +| 1 | Admin | admin@verifywise.com | 5 | 2025-07-02T00:00:00Z | +| 4 | Patel | harsh@example.com | 3 | 2025-07-02T00:00:00Z | + +✗ Bad — raw psql / fixed-width text in a code fence: +1 Admin admin@verifywise.com 5 2025-07-02T00:00:00Z +4 Patel harsh@example.com 3 2025-07-02T00:00:00Z + + +For ≥ 6 columns, consider whether all columns are actually needed — drop the ones the user didn't ask for before rendering. Many narrow columns are easier to read than every column compressed. + +# Pick the right format for the content + +Markdown renders differently depending on how you wrap content. Picking the wrong wrapper gives ugly or illegible output even when the underlying answer is correct. + +- **Code, commands, or anything the user might copy** → fenced code block with a language hint (` ```python `, ` ```bash `, ` ```json `, ` ```sql `, etc.). The hint drives syntax highlighting; an untagged fence is plain monospace and loses the color signal. +- **Diffs** → ` ```diff ` fence. Lines starting with `+` go green, `-` go red, headers like `@@` go blue. Do NOT hand-color diffs by prefixing plain-text lines with `+` or `-` — they look like bullet lists to the renderer. +- **Logs, stack traces, command output, or any fixed-width content** → always a fenced code block. A stack trace in a paragraph word-wraps into noise. +- **Enumerations, status updates, step-by-step results, checklists** → bullet list (or numbered list if order matters), not a run-on paragraph. One item per line. +- **Tabular data** → pipe-syntax markdown table (see the *Tabular data* rule above). + + +✗ Bad: diff in a plain paragraph. 
+ +Add a null check before the return: - return user.name + return user?.name ?? "unknown" + +✓ Good: fenced as `diff`. + +```diff +- return user.name ++ return user?.name ?? "unknown" +``` + + + +✗ Bad: enumeration as prose. + +Tests: auth.test.ts passed, users.test.ts passed, orgs.test.ts failed with a timeout on line 47, billing.test.ts passed. + +✓ Good: bullet list. + +- `auth.test.ts` — passed +- `users.test.ts` — passed +- `orgs.test.ts` — FAILED (timeout on line 47) +- `billing.test.ts` — passed + + +# Flag sensitive data + +If a fetched result contains PII (names, emails, addresses, phone numbers, government ids), secrets (api keys, tokens, passwords), or anything else the user probably doesn't want pasted into chat history, flag it in one line before showing output and prefer summaries / aggregates over raw rows. If the user explicitly asked for the raw rows, comply — but still call out once that the data is sensitive so they can't say they weren't warned. + # Tone and style - Only use emojis if the user explicitly requests it. Avoid using emojis in all communication unless asked. diff --git a/backend/app/runtime.py b/backend/app/runtime.py index 98f7b39..34cb49a 100644 --- a/backend/app/runtime.py +++ b/backend/app/runtime.py @@ -32,6 +32,20 @@ def __init__(self, session_id: str) -> None: self._history: List[Dict[str, Any]] = [] self._is_first_turn = True self._pending_questions: Dict[str, asyncio.Future] = {} + # Per-tool approval state. ``_pending_approvals`` keys are tool + # call_ids (assigned by the provider, already unique per turn). + # ``_auto_approved_tools`` is populated when the user hits + # "Approve & remember for this session" on an approval card, so + # subsequent calls to the same tool skip the modal. + self._pending_approvals: Dict[str, asyncio.Future] = {} + self._auto_approved_tools: Set[str] = set() + # Plan mode: while True, the agent loop blocks mutating tools + # and surfaces a plan decision card to the UI. 
# Approval and plan-mode methods of the session runtime class in
# backend/app/runtime.py. The class header lies outside this hunk; the
# methods below touch only the attributes initialised in ``__init__``
# (``_pending_approvals``, ``_auto_approved_tools``, ``_plan_mode``,
# ``_current_plan``) plus ``self.emit``.


async def approve(
    self, call_id: str, tool: str, tool_input: Dict[str, Any]
) -> Dict[str, Any]:
    """Pause until the user approves or denies a tool call.

    Called by the agent loop before executing a tool whose policy is
    ``ask`` and that is not already remembered for this session. Emits
    ``tool.approve.request`` and then blocks on a future that
    ``resolve_approval(call_id, ...)`` completes when the WS layer
    forwards the user's click.

    Args:
        call_id: Provider-assigned id of the tool call; used as the key
            of the pending future.
        tool: Tool name, remembered when the user picks
            "Approve & remember".
        tool_input: Arguments of the call, forwarded to the UI as-is.

    Returns:
        The decision dict: ``approved`` (bool) and ``remember``
        (``"session"`` or ``None``).
    """
    # We are inside a coroutine, so a running loop is guaranteed;
    # get_running_loop() is the non-deprecated way to reach it.
    future: asyncio.Future = asyncio.get_running_loop().create_future()
    self._pending_approvals[call_id] = future
    try:
        # The emit sits inside the try so that a failing emit (e.g. the
        # socket went away) cannot leave a stale future registered.
        await self.emit({
            "type": "tool.approve.request",
            "call_id": call_id,
            "tool": tool,
            "input": tool_input,
        })
        result = await future
    finally:
        self._pending_approvals.pop(call_id, None)
    if result.get("approved") and result.get("remember") == "session":
        # Session-scope remember: later calls to this tool skip the card.
        self._auto_approved_tools.add(tool)
    return result


def is_tool_auto_approved(self, tool: str) -> bool:
    """Return True if the user chose "remember" for ``tool`` this session."""
    return tool in self._auto_approved_tools


def is_in_plan_mode(self) -> bool:
    """Return True while a proposed plan is awaiting the user's decision."""
    return self._plan_mode


async def enter_plan_mode(self, plan: str) -> None:
    """Flip the session into plan mode and emit ``plan.proposal``.

    Called by the EnterPlanMode tool's executor with the plan text.
    """
    self._plan_mode = True
    self._current_plan = plan
    await self.emit({
        "type": "plan.proposal",
        "plan": plan,
    })


def resolve_plan(
    self, approved: bool, feedback: Optional[str] = None
) -> bool:
    """Apply the user's plan decision (WS entry point).

    Either decision clears plan mode. Accept emits ``plan.accepted``;
    reject emits ``plan.rejected`` carrying ``feedback`` (empty string
    when none was given).

    Returns:
        False if the session was not in plan mode (nothing to resolve),
        True otherwise.
    """
    if not self._plan_mode:
        return False
    self._plan_mode = False
    plan = self._current_plan
    self._current_plan = None
    if approved:
        event: Dict[str, Any] = {"type": "plan.accepted", "plan": plan}
    else:
        event = {
            "type": "plan.rejected",
            "plan": plan,
            "feedback": feedback or "",
        }
    # Fire-and-forget emit so the WS receive loop is not held up. The
    # event loop keeps only weak references to tasks, so hold a strong
    # one until the task finishes; otherwise it may be garbage-collected
    # before the event is delivered. The set is created lazily because
    # the constructor is outside this block.
    background = getattr(self, "_background_tasks", None)
    if background is None:
        background = self._background_tasks = set()
    task = asyncio.create_task(self.emit(event))
    background.add(task)
    task.add_done_callback(background.discard)
    return True


def resolve_approval(
    self, call_id: str, approved: bool, remember: Optional[str] = None
) -> bool:
    """Complete the pending approval for ``call_id``.

    Returns:
        False if no approval is pending under that id or it was already
        resolved, True once the decision has been handed to ``approve``.
    """
    future = self._pending_approvals.get(call_id)
    if future is None or future.done():
        return False
    future.set_result({"approved": bool(approved), "remember": remember})
    return True
E402,F401 from app.tools import edit # noqa: E402,F401 +from app.tools import multi_edit # noqa: E402,F401 from app.tools import write # noqa: E402,F401 from app.tools import notebook_edit # noqa: E402,F401 from app.tools import web # noqa: E402,F401 from app.tools import todo # noqa: E402,F401 from app.tools import subagent # noqa: E402,F401 from app.tools import ask_user # noqa: E402,F401 +from app.tools import plan_mode # noqa: E402,F401 diff --git a/backend/app/tools/multi_edit.py b/backend/app/tools/multi_edit.py new file mode 100644 index 0000000..19da5c2 --- /dev/null +++ b/backend/app/tools/multi_edit.py @@ -0,0 +1,145 @@ +"""MultiEdit tool — apply a batch of string replacements atomically. + +Reads the file once, applies every edit in order against the evolving +in-memory buffer, and writes once at the end. If any edit's match rule +fails (not found, or non-unique without ``replace_all``), the whole +batch aborts and the on-disk file is untouched. That's the Claude Code +contract — callers can rely on all-or-nothing semantics. +""" + +import asyncio +from pathlib import Path +from typing import Any, Dict, List + +from app.tools import Tool, register +from app.tools._common import resolve + + +def _apply_edits(buffer: str, edits: List[Dict[str, Any]]) -> tuple[str, List[str]]: + """Run the edits sequentially against ``buffer``; return the new + buffer plus a per-edit summary. 
Raises ``ValueError`` on any match + failure so the caller can bail without writing.""" + summaries: List[str] = [] + for i, edit in enumerate(edits, 1): + if not isinstance(edit, dict): + raise ValueError(f"edit #{i}: must be an object") + old = edit.get("old_string") + new = edit.get("new_string") + replace_all = bool(edit.get("replace_all")) + if not isinstance(old, str) or not isinstance(new, str): + raise ValueError(f"edit #{i}: old_string and new_string are required") + if old == new: + raise ValueError(f"edit #{i}: old_string and new_string must differ") + count = buffer.count(old) + if count == 0: + raise ValueError(f"edit #{i}: old_string not found") + if count > 1 and not replace_all: + raise ValueError( + f"edit #{i}: old_string appears {count} times; " + "add surrounding context to make it unique, or pass replace_all=true" + ) + if replace_all: + buffer = buffer.replace(old, new) + summaries.append(f"edit #{i}: replaced {count} occurrence(s)") + else: + buffer = buffer.replace(old, new, 1) + summaries.append(f"edit #{i}: replaced 1 occurrence") + return buffer, summaries + + +def _multi_edit_sync(args: Dict[str, Any], folder: Path) -> str: + path = resolve(folder, args["file_path"]) + edits = args.get("edits") + if not isinstance(edits, list) or not edits: + return "Error: 'edits' must be a non-empty array" + if not path.exists(): + return f"Error: file not found: {path}" + if not path.is_file(): + return f"Error: not a file: {path}" + + try: + text = path.read_text(errors="replace") + except (OSError, PermissionError) as exc: + return f"Error: {exc}" + + try: + new_text, summaries = _apply_edits(text, edits) + except ValueError as exc: + # File is untouched — that's the atomicity guarantee. + return f"Error: {exc}. No changes written." 
+ + if new_text == text: + return f"No changes to {path} (every edit was a no-op)" + + try: + path.write_text(new_text) + except (OSError, PermissionError) as exc: + return f"Error writing {path}: {exc}" + + return f"Applied {len(edits)} edit(s) to {path}:\n" + "\n".join( + f" {s}" for s in summaries + ) + + +async def _multi_edit(args: Dict[str, Any], folder: Path) -> str: + return await asyncio.to_thread(_multi_edit_sync, args, folder) + + +register(Tool( + name="MultiEdit", + description=( + "Apply multiple exact string replacements to a single file in one " + "atomic operation. Edits are applied in order against the evolving " + "buffer, so later edits see the output of earlier ones. If any " + "edit's match rule fails (not found, or non-unique without " + "`replace_all`), the ENTIRE batch is aborted and the file is left " + "untouched.\n\n" + "Prefer MultiEdit over several back-to-back Edit calls when changing " + "the same file: it's atomic, cheaper in tokens, and surfaces " + "conflicts (e.g. 
a later edit's `old_string` no longer matching " + "because an earlier edit rewrote that region) before anything " + "lands on disk.\n\n" + "Usage:\n" + "- Read the file first to confirm `old_string` values are unique " + " (strongly recommended)\n" + "- Preserve exact indentation / whitespace in `old_string`\n" + "- Each edit's `replace_all` defaults to false — set it to true " + " to change every occurrence of that specific `old_string`" + ), + input_schema={ + "type": "object", + "properties": { + "file_path": { + "type": "string", + "description": "The absolute path to the file to modify", + }, + "edits": { + "type": "array", + "minItems": 1, + "description": "Ordered list of string replacements", + "items": { + "type": "object", + "properties": { + "old_string": { + "type": "string", + "description": "The text to replace", + }, + "new_string": { + "type": "string", + "description": "The replacement text", + }, + "replace_all": { + "type": "boolean", + "description": "Replace every occurrence (default false)", + "default": False, + }, + }, + "required": ["old_string", "new_string"], + }, + }, + }, + "required": ["file_path", "edits"], + }, + executor=_multi_edit, + scopes={"main"}, +)) diff --git a/backend/app/tools/plan_mode.py b/backend/app/tools/plan_mode.py new file mode 100644 index 0000000..44e21c4 --- /dev/null +++ b/backend/app/tools/plan_mode.py @@ -0,0 +1,84 @@ +"""Plan mode — let the model propose a plan before executing anything. + +When the model invokes ``EnterPlanMode(plan=...)``, the session flips +into plan mode: + +- The plan text is emitted as a ``plan.proposal`` event so the UI can + render it as a decision card (Accept / Reject with optional feedback). +- Any subsequent mutating tool call during the same turn or the next + user turn is denied by the agent loop until the user resolves the + plan — the model can still explore with Read/Grep/Glob/WebSearch. 
async def _enter_plan_mode(args: Dict[str, Any], folder: Path) -> str:
    """Executor for the EnterPlanMode tool.

    Validates the ``plan`` argument, then hands the trimmed text to the
    ``enter_plan_mode`` callback found in the session context. Every
    outcome is reported to the model as a string; nothing is raised.
    ``folder`` is accepted for the executor signature and not used.
    """
    raw_plan = args.get("plan")
    plan_text = raw_plan.strip() if isinstance(raw_plan, str) else ""
    if not plan_text:
        return "Error: 'plan' is required and must be non-empty markdown."

    callback = get_session_context().get("enter_plan_mode")
    if callback is None:
        # No runtime callback in the context (bare contexts built by
        # unit tests): report it instead of failing the turn.
        return (
            "Plan recorded (but runtime integration is missing — the "
            "plan card won't render). "
        )

    try:
        await callback(plan_text)
    except Exception as exc:
        return f"Error entering plan mode: {exc}"

    return (
        "Plan recorded and shown to the user. STOP making tool calls and "
        "wait for their decision. If they reject with feedback, revise "
        "the plan and call EnterPlanMode again."
    )
Use this whenever the requested task is " + "non-trivial (multi-file changes, refactors, new features, " + "schema migrations) and you want a green-light before you start " + "editing / running things.\n\n" + "While the plan is pending, the session is in PLAN MODE:\n" + "- Mutating tools (Bash, Write, Edit, MultiEdit, NotebookEdit, " + " WebFetch) will be blocked until the user Accepts.\n" + "- You can still explore freely with Read, Grep, Glob, WebSearch.\n\n" + "Arguments:\n" + "- plan: markdown text describing what you'll do, in step order. " + " Include the critical files you'll touch, risks, and the " + " verification step. Keep it scannable — the user is reading it." + ), + input_schema={ + "type": "object", + "properties": { + "plan": { + "type": "string", + "description": ( + "Markdown plan. Lead with a one-line summary, then " + "step-by-step actions, then a verification section." + ), + }, + }, + "required": ["plan"], + }, + executor=_enter_plan_mode, + scopes={"main"}, +)) diff --git a/backend/app/tools/policy.py b/backend/app/tools/policy.py new file mode 100644 index 0000000..066e2ae --- /dev/null +++ b/backend/app/tools/policy.py @@ -0,0 +1,64 @@ +"""Per-tool approval policy. + +Mirrors Claude Code's "run bold, confirm risky" posture: safe read-only +tools auto-run, anything that writes / executes / reaches out to the +network pauses for a user approval card. + +For v1 this is a hard-coded default map. Per-user overrides + a Settings +UI come later (see plan for #1); the shape of ``needs_approval`` is the +stable public surface — callers won't change when storage grows a DB +backing. +""" + +from __future__ import annotations + +from typing import Dict, Literal + +Policy = Literal["allow", "ask", "deny"] + +# Default trust matrix. Keep it tight: any tool that mutates the repo or +# reaches the network asks first. Read-only inspection runs free. 
# Policy values, bound once so each table entry below is typed as ``Policy``.
_ALLOW: Policy = "allow"
_ASK: Policy = "ask"

# Default trust table, grouped by verdict. Entry order matches the
# original literal.
DEFAULT_POLICY: Dict[str, Policy] = {
    **{
        tool: _ALLOW
        for tool in (
            # Read-only inspection
            "Read",
            "Glob",
            "Grep",
            # Todo system: cheap and reversible
            "TaskCreate",
            "TaskGet",
            "TaskUpdate",
            "TaskList",
            # Subagent lifecycle (bounded, read-only subagents)
            "Task",
            "TaskStop",
            "TaskOutput",
            # Asking the user a question is always safe
            "AskUserQuestion",
            # Plan mode is gated by its own plan card in the UI
            "EnterPlanMode",
            # Memory: local file ops in the user's own folder
            "Memory",
        )
    },
    **{
        tool: _ASK
        for tool in (
            # Mutating or side-effecting
            "Bash",
            "Write",
            "Edit",
            "MultiEdit",
            "NotebookEdit",
            # Network (exfiltration + unpredictable content)
            "WebFetch",
            "WebSearch",
        )
    },
}


def policy_for(tool_name: str) -> Policy:
    """Look up the policy for ``tool_name``.

    Tools missing from ``DEFAULT_POLICY`` get ``"ask"``, so a newly added
    tool prompts the user until it is listed explicitly.
    """
    try:
        return DEFAULT_POLICY[tool_name]
    except KeyError:
        return _ASK


def needs_approval(tool_name: str) -> bool:
    """Return True when the tool's policy is ``"ask"``."""
    verdict = policy_for(tool_name)
    return verdict == "ask"


def is_blocked(tool_name: str) -> bool:
    """Return True when the tool's policy is ``"deny"``."""
    verdict = policy_for(tool_name)
    return verdict == "deny"
feedback if isinstance(feedback, str) else None, + ) else: log.debug("[ws %s] unknown client event kind=%r", short_id, kind) close_reason = "loop exit" diff --git a/frontend/src/api/ws.ts b/frontend/src/api/ws.ts index d015edb..ee7072d 100644 --- a/frontend/src/api/ws.ts +++ b/frontend/src/api/ws.ts @@ -15,10 +15,16 @@ export interface WireAttachment { export type ClientEvent = | { type: 'prompt.submit'; text: string; attachments?: WireAttachment[] } - | { type: 'tool.approve'; call_id: string; decision: 'allow' | 'deny' } + | { + type: 'tool.approve.response' + call_id: string + approved: boolean + remember?: 'session' + } | { type: 'interrupt' } | { type: 'compact' } | { type: 'ask.answer'; id: string; answers: Record } + | { type: 'plan.decision'; approved: boolean; feedback?: string } export type WsStatus = | 'connecting' diff --git a/frontend/src/components/Markdown.tsx b/frontend/src/components/Markdown.tsx index 2027928..c4b647d 100644 --- a/frontend/src/components/Markdown.tsx +++ b/frontend/src/components/Markdown.tsx @@ -1,4 +1,10 @@ -import { memo, type CSSProperties, type ReactNode } from 'react' +import { + memo, + useRef, + useState, + type CSSProperties, + type ReactNode, +} from 'react' import ReactMarkdown, { type Components } from 'react-markdown' import remarkBreaks from 'remark-breaks' import remarkGfm from 'remark-gfm' @@ -30,7 +36,13 @@ function MarkdownImpl({
{children} @@ -243,7 +255,9 @@ const components: Components = { // below in the ``code`` override. pre: (props) => { const { children, ...rest } = props as NodeProps - return
{children}
+ return ( + {children} + ) }, code: (props) => { const { children, className, ...rest } = props as NodeProps @@ -273,3 +287,77 @@ function passthrough(props: NodeProps): Record { const { node: _node, ...rest } = props return rest } + +/** + * Wraps a fenced code block with a small "Copy" button in the top-right + * corner. Reads the rendered code via a ref at click time so we always + * copy the exact text the user sees (post-highlighting), not a stale + * snapshot of the children prop. + */ +function CodeBlockWithCopy({ + children, + ...rest +}: { + children?: ReactNode +} & Record) { + const preRef = useRef(null) + const [copied, setCopied] = useState(false) + const onCopy = () => { + const text = preRef.current?.innerText ?? '' + if (!text) return + const finish = () => { + setCopied(true) + setTimeout(() => setCopied(false), 1400) + } + if (navigator.clipboard?.writeText) { + navigator.clipboard.writeText(text).then(finish, finish) + } else { + // Fallback for non-HTTPS / older browsers: select + execCommand. + const sel = window.getSelection() + const range = document.createRange() + if (preRef.current) { + range.selectNodeContents(preRef.current) + sel?.removeAllRanges() + sel?.addRange(range) + try { + document.execCommand('copy') + } catch { + /* ignore */ + } + sel?.removeAllRanges() + finish() + } + } + } + return ( +
+
+        {children}
+      
+ +
+ ) +} diff --git a/frontend/src/components/MessageList.tsx b/frontend/src/components/MessageList.tsx index b7bd73a..2a9bde7 100644 --- a/frontend/src/components/MessageList.tsx +++ b/frontend/src/components/MessageList.tsx @@ -1,4 +1,4 @@ -import { useEffect, useMemo, useRef } from 'react' +import { useEffect, useMemo, useRef, useState } from 'react' import { formatTokens, type AgentKind } from '../api/rest' import type { ServerEvent } from '../api/ws' import Markdown from './Markdown' @@ -27,10 +27,25 @@ type RenderItem = input: unknown output: string | null is_error: boolean + /** 'pending' once a tool.approve.request arrives; set to 'approved' or + * 'denied' locally on click (optimistic) or when the backend + * emits a tool.call.result. 'unrequested' is the default — most + * tools don't need approval at all. */ + approval: 'unrequested' | 'pending' | 'approved' | 'denied' } | { kind: 'notice'; seq: number; level: string; text: string } | { kind: 'usage'; seq: number; input_tokens: number; output_tokens: number } | { kind: 'error'; seq: number; message: string } + | { + kind: 'plan' + seq: number + plan: string + /** 'pending' while awaiting user decision; 'accepted' / 'rejected' + * reflect what the user chose. Computed from subsequent + * plan.accepted / plan.rejected events in the stream. */ + status: 'pending' | 'accepted' | 'rejected' + feedback?: string + } function reduce(events: ServerEvent[]): RenderItem[] { const items: RenderItem[] = [] @@ -91,17 +106,38 @@ function reduce(events: ServerEvent[]): RenderItem[] { input: e.input, output: null, is_error: false, + approval: 'unrequested', }) toolIdx.set(cid, items.length - 1) + } else if (t === 'tool.approve.request') { + const cid = String(e.call_id ?? '') + const idx = toolIdx.get(cid) + if (idx !== undefined) { + const prev = items[idx] as Extract + items[idx] = { ...prev, approval: 'pending' } + } } else if (t === 'tool.call.result') { const cid = String(e.call_id ?? 
'') const idx = toolIdx.get(cid) if (idx !== undefined) { const prev = items[idx] as Extract + // If the tool ran (no error tagged as "denied…"), the pending + // gate must have been approved. If it errored because the user + // denied it, the agent feeds back an is_error=true tool_result + // that includes "User denied" in the text — surface that state + // so the UI can style the card accordingly. + const outputStr = String(e.output ?? '') + const isError = Boolean(e.is_error) + const wasDenied = isError && outputStr.startsWith('User denied') + const nextApproval: typeof prev.approval = + prev.approval === 'pending' + ? wasDenied ? 'denied' : 'approved' + : prev.approval items[idx] = { ...prev, - output: String(e.output ?? ''), - is_error: Boolean(e.is_error), + output: outputStr, + is_error: isError, + approval: nextApproval, } } } else if (t === 'system.notice') { @@ -120,6 +156,27 @@ function reduce(events: ServerEvent[]): RenderItem[] { }) } else if (t === 'error') { items.push({ kind: 'error', seq, message: String(e.message ?? '') }) + } else if (t === 'plan.proposal') { + items.push({ + kind: 'plan', + seq, + plan: String(e.plan ?? ''), + status: 'pending', + }) + } else if (t === 'plan.accepted' || t === 'plan.rejected') { + // Walk backward to the most recent pending plan and resolve it. + for (let j = items.length - 1; j >= 0; j--) { + const it = items[j] + if (it.kind === 'plan' && it.status === 'pending') { + items[j] = { + ...it, + status: t === 'plan.accepted' ? 'accepted' : 'rejected', + feedback: + t === 'plan.rejected' ? String(e.feedback ?? 
'') : undefined, + } + break + } + } } } return items @@ -129,10 +186,18 @@ export default function MessageList({ events, agentKind, contextLimits, + onToolApproval, + onPlanDecision, }: { events: ServerEvent[] agentKind: AgentKind contextLimits?: ContextLimits | null + onToolApproval?: ( + callId: string, + approved: boolean, + remember: boolean, + ) => void + onPlanDecision?: (approved: boolean, feedback?: string) => void }) { const items = useMemo(() => reduce(events), [events]) const bottomRef = useRef(null) @@ -190,7 +255,15 @@ export default function MessageList({ return (
{items.map((it, i) => ( -
{renderItem(it, accent, contextLimits ?? null)}
+
+ {renderItem( + it, + accent, + contextLimits ?? null, + onToolApproval, + onPlanDecision, + )} +
))}
@@ -201,6 +274,12 @@ function renderItem( it: RenderItem, accent: string, limits: ContextLimits | null, + onToolApproval?: ( + callId: string, + approved: boolean, + remember: boolean, + ) => void, + onPlanDecision?: (approved: boolean, feedback?: string) => void, ) { switch (it.kind) { case 'user': @@ -305,6 +384,17 @@ function renderItem( input={it.input} output={it.output} isError={it.is_error} + approval={it.approval} + onApprove={ + onToolApproval + ? (remember) => onToolApproval(it.call_id, true, remember) + : undefined + } + onDeny={ + onToolApproval + ? () => onToolApproval(it.call_id, false, false) + : undefined + } /> ) case 'notice': @@ -418,5 +508,228 @@ function renderItem( Error: {it.message}
) + case 'plan': + return ( + + ) } } + +function PlanCard({ + plan, + status, + feedback, + onDecision, + accent, +}: { + plan: string + status: 'pending' | 'accepted' | 'rejected' + feedback?: string + onDecision?: (approved: boolean, feedback?: string) => void + accent: string +}) { + const [showRevise, setShowRevise] = useState(false) + const [feedbackText, setFeedbackText] = useState('') + const pending = status === 'pending' + const badge = + status === 'accepted' + ? { text: 'Accepted', color: COLORS.green } + : status === 'rejected' + ? { text: 'Rejected', color: COLORS.red } + : { text: 'Awaiting decision', color: COLORS.amber } + return ( +
+ {/* Left rail in the provider accent, like assistant messages. */} + +
+ + Proposed plan + + + {badge.text} + +
+
+ {plan} +
+ {status === 'rejected' && feedback && ( +
+ + Your feedback: + {' '} + {feedback} +
+ )} + {pending && onDecision && ( +
+ {showRevise ? ( + <> +