Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 75 additions & 8 deletions backend/app/llm/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import os
from pathlib import Path
from typing import Any, Awaitable, Callable, Dict, List
from typing import Any, Awaitable, Callable, Dict, List, Optional

from app import tools as tools_module
from app.llm.context import build as build_context
Expand All @@ -22,6 +22,7 @@
)
from app.llm.providers import build_provider
from app.tools import set_session_context
from app.tools.policy import is_blocked, needs_approval

EmitFn = Callable[[Dict[str, Any]], Awaitable[None]]

Expand All @@ -38,6 +39,19 @@ def _max_rounds() -> int:


MAX_TOOL_ROUNDS = _max_rounds()

# Tools blocked while a session is in plan mode. The model can still
# explore (Read/Grep/Glob/WebSearch) but can't modify state or reach
# out to the network until the user accepts / rejects the plan.
_PLAN_MODE_BLOCKED_TOOLS = frozenset(
    ("Bash", "Write", "Edit", "MultiEdit", "NotebookEdit", "WebFetch")
)

DEFAULT_MODELS = {
"claude": "claude-sonnet-4-6",
"openai": "gpt-4o",
Expand All @@ -61,6 +75,10 @@ async def run_turn(
is_first_turn: bool,
attachments: List[Dict[str, Any]] = None, # type: ignore[assignment]
ask_user: Any = None,
approve_tool: Any = None,
is_tool_auto_approved: Any = None,
enter_plan_mode: Any = None,
is_in_plan_mode: Any = None,
) -> None:
"""Execute one user → agent turn. Mutates ``history`` in place."""
set_session_context({
Expand All @@ -69,6 +87,7 @@ async def run_turn(
"api_key": api_key,
"emit": emit,
"ask_user": ask_user,
"enter_plan_mode": enter_plan_mode,
})

ctx = build_context(folder, agent_kind, model)
Expand Down Expand Up @@ -163,7 +182,13 @@ async def run_turn(
if not result["tool_uses"]:
return # turn complete

# Execute tools and feed back as tool_result blocks.
# Execute tools and feed back as tool_result blocks. Before
# each call, consult the tool policy: "allow" runs free, "ask"
# pauses for user approval via the runtime, "deny" is a
# hard-coded no-op (reserved for future policy-file blocks).
# A denied tool gets a synthetic error tool_result so the
# model sees the refusal and can course-correct rather than
# the turn crashing.
tool_result_blocks: List[Dict[str, Any]] = []
for tu in result["tool_uses"]:
await emit({
Expand All @@ -172,14 +197,56 @@ async def run_turn(
"tool": tu["name"],
"input": tu["input"],
})
try:
output = await tools_module.execute(
tu["name"], tu["input"], folder

approved = True
denied_reason: Optional[str] = None
if is_blocked(tu["name"]):
approved = False
denied_reason = (
f"Tool '{tu['name']}' is blocked by policy."
)
elif (
is_in_plan_mode is not None
and is_in_plan_mode()
and tu["name"] in _PLAN_MODE_BLOCKED_TOOLS
):
approved = False
denied_reason = (
f"Session is in PLAN MODE; '{tu['name']}' is disabled "
"until the user accepts or rejects the plan. Use "
"read-only tools (Read, Grep, Glob, WebSearch) to "
"refine the plan, or wait for the user's decision."
)
is_error = False
except Exception as exc:
output = f"Tool execution error: {exc}"
elif needs_approval(tu["name"]):
already_remembered = bool(
is_tool_auto_approved
and is_tool_auto_approved(tu["name"])
)
if not already_remembered and approve_tool is not None:
decision = await approve_tool(
tu["id"], tu["name"], tu["input"]
)
approved = bool(decision.get("approved"))
if not approved:
denied_reason = (
"User denied this tool call. Do not retry "
"the same call; ask the user what they "
"want, or try a different approach."
)

if approved:
try:
output = await tools_module.execute(
tu["name"], tu["input"], folder
)
is_error = False
except Exception as exc:
output = f"Tool execution error: {exc}"
is_error = True
else:
output = denied_reason or "Tool call not approved."
is_error = True

await emit({
"type": "tool.call.result",
"call_id": tu["id"],
Expand Down
74 changes: 74 additions & 0 deletions backend/app/llm/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,80 @@

Avoid open-ended clarifying questions like "what do you want me to do?" — give the user a short menu when you can.

# Execute, don't just describe

When the user asks for data or state the tools can fetch (list rows, read a file, run a command, query an API), EXECUTE the fetch and report the actual result. Do not write a documentation-style answer ("here's the endpoint, here's the curl, here's the SQL you would run") as a substitute for running the thing — that's strictly worse than just running it.

Examples:

- user: "list all the users in the db" → read the connection info (e.g. from `.env`), run the query, summarize the result. Don't reply with a curl example.
- user: "what's in this folder?" → run `ls` or Glob. Don't describe how to list files.
- user: "is the server running?" → run the check (`curl`, `ps`, `lsof`). Don't tell them how to check.

Only fall back to a documentation answer if (a) the user explicitly asked for docs / a recipe, or (b) you genuinely lack a tool to run it (no shell, no network, no credentials reachable).

# Summarize large result sets

When a fetch returns more than ~50 rows (or lines, or entries), do NOT paste them all into the reply. Roll them up: counts by category, min/max/recent, a small table of headline numbers, or first-N plus a total. Then offer 2-4 concrete follow-up "cuts" the user is likely to want next (filter by X, restrict to time window Y, drill into id Z).

# Tabular data → markdown tables, not raw terminal output

The UI renders markdown tables as real HTML `<table>` elements with proper column alignment, a header row, and horizontal scroll when wide. Use GitHub-flavored pipe syntax for any tabular result. Do NOT paste psql / `column -t` / space-padded terminal output into a code block — column widths drift when any cell is longer than its peers, and the result is unreadable.

<example>
✓ Good — markdown table:
| id | name | email | role | created_at |
|----|---------|-------------------------|------|----------------------|
| 1 | Admin | admin@verifywise.com | 5 | 2025-07-02T00:00:00Z |
| 4 | Patel | harsh@example.com | 3 | 2025-07-02T00:00:00Z |

✗ Bad — raw psql / fixed-width text in a code fence:
1 Admin admin@verifywise.com 5 2025-07-02T00:00:00Z
4 Patel harsh@example.com 3 2025-07-02T00:00:00Z
</example>

For ≥ 6 columns, consider whether all columns are actually needed — drop the ones the user didn't ask for before rendering. Many narrow columns are easier to read than every column compressed.

# Pick the right format for the content

Markdown renders differently depending on how you wrap content. Picking the wrong wrapper gives ugly or illegible output even when the underlying answer is correct.

- **Code, commands, or anything the user might copy** → fenced code block with a language hint (` ```python `, ` ```bash `, ` ```json `, ` ```sql `, etc.). The hint drives syntax highlighting; an untagged fence is plain monospace and loses the color signal.
- **Diffs** → ` ```diff ` fence. Lines starting with `+` go green, `-` go red, headers like `@@` go blue. Do NOT hand-color diffs by prefixing plain-text lines with `+` or `-` — they look like bullet lists to the renderer.
- **Logs, stack traces, command output, or any fixed-width content** → always a fenced code block. A stack trace in a paragraph word-wraps into noise.
- **Enumerations, status updates, step-by-step results, checklists** → bullet list (or numbered list if order matters), not a run-on paragraph. One item per line.
- **Tabular data** → pipe-syntax markdown table (see the *Tabular data* rule above).

<example>
✗ Bad: diff in a plain paragraph.

Add a null check before the return: - return user.name + return user?.name ?? "unknown"

✓ Good: fenced as `diff`.

```diff
- return user.name
+ return user?.name ?? "unknown"
```
</example>

<example>
✗ Bad: enumeration as prose.

Tests: auth.test.ts passed, users.test.ts passed, orgs.test.ts failed with a timeout on line 47, billing.test.ts passed.

✓ Good: bullet list.

- `auth.test.ts` — passed
- `users.test.ts` — passed
- `orgs.test.ts` — FAILED (timeout on line 47)
- `billing.test.ts` — passed
</example>

# Flag sensitive data

If a fetched result contains PII (names, emails, addresses, phone numbers, government ids), secrets (api keys, tokens, passwords), or anything else the user probably doesn't want pasted into chat history, flag it in one line before showing output and prefer summaries / aggregates over raw rows. If the user explicitly asked for the raw rows, comply — but still call out once that the data is sensitive so they can't say they weren't warned.

# Tone and style

- Only use emojis if the user explicitly requests it. Avoid using emojis in all communication unless asked.
Expand Down
104 changes: 104 additions & 0 deletions backend/app/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,20 @@ def __init__(self, session_id: str) -> None:
self._history: List[Dict[str, Any]] = []
self._is_first_turn = True
self._pending_questions: Dict[str, asyncio.Future] = {}
# Per-tool approval state. ``_pending_approvals`` keys are tool
# call_ids (assigned by the provider, already unique per turn).
# ``_auto_approved_tools`` is populated when the user hits
# "Approve & remember for this session" on an approval card, so
# subsequent calls to the same tool skip the modal.
self._pending_approvals: Dict[str, asyncio.Future] = {}
self._auto_approved_tools: Set[str] = set()
# Plan mode: while True, the agent loop blocks mutating tools
# and surfaces a plan decision card to the UI. Entered by the
# model via the EnterPlanMode tool; exited by the user through
# ``resolve_plan``.
self._plan_mode: bool = False
self._current_plan: Optional[str] = None
self._pending_plan: Optional[asyncio.Future] = None
# Set while a compact is in flight so we don't kick off a
# second one concurrently and so run_turn / submit_prompt can
# check before proceeding.
Expand Down Expand Up @@ -113,6 +127,92 @@ def resolve_question(self, qid: str, answers: Any) -> bool:
future.set_result(answers)
return True

async def approve(
    self, call_id: str, tool: str, tool_input: Dict[str, Any]
) -> Dict[str, Any]:
    """Pause until the user approves or denies a tool call.

    Called by the agent loop before executing any tool whose policy is
    ``ask`` (and not already remembered for this session).

    Emits ``tool.approve.request`` and blocks on a Future resolved by
    ``resolve_approval(call_id, ...)`` when the WS layer forwards the
    user's click.

    Args:
        call_id: Provider-assigned tool call id (already unique per turn).
        tool: Name of the tool the model wants to invoke.
        tool_input: The tool's input payload, echoed to the UI so the
            user can see exactly what they are approving.

    Returns:
        Dict with ``approved`` (bool) and optionally ``remember``
        (``"session"``).

    Session-scope remember is handled here: if the user ticks
    "Approve & remember," we stash the tool name so the next call to
    the same tool bypasses this method.
    """
    # This coroutine only ever runs inside the event loop, so use
    # get_running_loop() — get_event_loop() is deprecated for this
    # usage since Python 3.10 and can bind the wrong loop.
    future: asyncio.Future = asyncio.get_running_loop().create_future()
    self._pending_approvals[call_id] = future
    await self.emit({
        "type": "tool.approve.request",
        "call_id": call_id,
        "tool": tool,
        "input": tool_input,
    })
    try:
        result = await future
    finally:
        # Always drop the pending entry, even if the wait is cancelled
        # (e.g. the session shuts down mid-approval).
        self._pending_approvals.pop(call_id, None)
    if result.get("approved") and result.get("remember") == "session":
        self._auto_approved_tools.add(tool)
    return result

def is_tool_auto_approved(self, tool: str) -> bool:
    """True once the user has chosen "Approve & remember" for *tool*."""
    remembered = self._auto_approved_tools
    return tool in remembered

def is_in_plan_mode(self) -> bool:
    """True while a proposed plan is awaiting the user's decision."""
    return bool(self._plan_mode)

async def enter_plan_mode(self, plan: str) -> None:
    """Flip the session into plan mode and surface the plan to the UI.

    Called by the EnterPlanMode tool's executor: records the proposed
    plan text, raises the plan-mode flag, and emits a
    ``plan.proposal`` event so the frontend can render a decision card.
    """
    self._plan_mode = True
    self._current_plan = plan
    proposal = {"type": "plan.proposal", "plan": plan}
    await self.emit(proposal)

def resolve_plan(
    self, approved: bool, feedback: Optional[str] = None
) -> bool:
    """WS entry point for the user's plan decision.

    Accept: clears plan mode and emits ``plan.accepted``; the next
    turn executes normally. Reject: clears plan mode and emits
    ``plan.rejected`` with the user's feedback — the UI then typically
    sends that feedback as the next user prompt so the model can
    revise.

    Returns False (no-op) when the session is not in plan mode.
    """
    if not self._plan_mode:
        return False
    self._plan_mode = False
    plan, self._current_plan = self._current_plan, None
    if approved:
        payload = {"type": "plan.accepted", "plan": plan}
    else:
        payload = {
            "type": "plan.rejected",
            "plan": plan,
            "feedback": feedback or "",
        }
    # Fire-and-forget emit; we don't hold the WS loop.
    asyncio.create_task(self.emit(payload))
    return True

def resolve_approval(
    self, call_id: str, approved: bool, remember: Optional[str] = None
) -> bool:
    """WS entry point: deliver the user's approval decision.

    Looks up the pending Future that ``approve`` registered under
    *call_id* and resolves it with the decision dict. Returns False
    when nothing is waiting (unknown id, or already resolved).
    """
    pending = self._pending_approvals.get(call_id)
    if pending is None or pending.done():
        return False
    decision = {"approved": bool(approved), "remember": remember}
    pending.set_result(decision)
    return True

async def submit_prompt(
self,
prompt: str,
Expand Down Expand Up @@ -366,6 +466,10 @@ async def _run_turn(
emit=self.emit,
is_first_turn=self._is_first_turn,
ask_user=self.ask_user,
approve_tool=self.approve,
is_tool_auto_approved=self.is_tool_auto_approved,
enter_plan_mode=self.enter_plan_mode,
is_in_plan_mode=self.is_in_plan_mode,
)
self._is_first_turn = False
sess_store.update_status(self.session_id, "idle")
Expand Down
2 changes: 2 additions & 0 deletions backend/app/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,11 @@ def names(scope: str | None = None) -> List[str]:
from app.tools import grep # noqa: E402,F401
from app.tools import read # noqa: E402,F401
from app.tools import edit # noqa: E402,F401
from app.tools import multi_edit # noqa: E402,F401
from app.tools import write # noqa: E402,F401
from app.tools import notebook_edit # noqa: E402,F401
from app.tools import web # noqa: E402,F401
from app.tools import todo # noqa: E402,F401
from app.tools import subagent # noqa: E402,F401
from app.tools import ask_user # noqa: E402,F401
from app.tools import plan_mode # noqa: E402,F401
Loading