Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 75 additions & 8 deletions backend/app/llm/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

import os
from pathlib import Path
from typing import Any, Awaitable, Callable, Dict, List
from typing import Any, Awaitable, Callable, Dict, List, Optional

from app import tools as tools_module
from app.llm.context import build as build_context
Expand All @@ -22,6 +22,7 @@
)
from app.llm.providers import build_provider
from app.tools import set_session_context
from app.tools.policy import is_blocked, needs_approval

EmitFn = Callable[[Dict[str, Any]], Awaitable[None]]

Expand All @@ -38,6 +39,19 @@ def _max_rounds() -> int:


MAX_TOOL_ROUNDS = _max_rounds()

# Tools blocked while a session is in plan mode. The model can still
# explore (Read/Grep/Glob/WebSearch) but can't modify state or reach
# out to the network until the user accepts / rejects the plan.
_PLAN_MODE_BLOCKED_TOOLS = frozenset(
    ("Bash", "Write", "Edit", "MultiEdit", "NotebookEdit", "WebFetch")
)

DEFAULT_MODELS = {
"claude": "claude-sonnet-4-6",
"openai": "gpt-4o",
Expand All @@ -61,6 +75,10 @@ async def run_turn(
is_first_turn: bool,
attachments: List[Dict[str, Any]] = None, # type: ignore[assignment]
ask_user: Any = None,
approve_tool: Any = None,
is_tool_auto_approved: Any = None,
enter_plan_mode: Any = None,
is_in_plan_mode: Any = None,
) -> None:
"""Execute one user → agent turn. Mutates ``history`` in place."""
set_session_context({
Expand All @@ -69,6 +87,7 @@ async def run_turn(
"api_key": api_key,
"emit": emit,
"ask_user": ask_user,
"enter_plan_mode": enter_plan_mode,
})

ctx = build_context(folder, agent_kind, model)
Expand Down Expand Up @@ -163,7 +182,13 @@ async def run_turn(
if not result["tool_uses"]:
return # turn complete

# Execute tools and feed back as tool_result blocks.
# Execute tools and feed back as tool_result blocks. Before
# each call, consult the tool policy: "allow" runs free, "ask"
# pauses for user approval via the runtime, "deny" is a
# hard-coded no-op (reserved for future policy-file blocks).
# A denied tool gets a synthetic error tool_result so the
# model sees the refusal and can course-correct rather than
# the turn crashing.
tool_result_blocks: List[Dict[str, Any]] = []
for tu in result["tool_uses"]:
await emit({
Expand All @@ -172,14 +197,56 @@ async def run_turn(
"tool": tu["name"],
"input": tu["input"],
})
try:
output = await tools_module.execute(
tu["name"], tu["input"], folder

approved = True
denied_reason: Optional[str] = None
if is_blocked(tu["name"]):
approved = False
denied_reason = (
f"Tool '{tu['name']}' is blocked by policy."
)
elif (
is_in_plan_mode is not None
and is_in_plan_mode()
and tu["name"] in _PLAN_MODE_BLOCKED_TOOLS
):
approved = False
denied_reason = (
f"Session is in PLAN MODE; '{tu['name']}' is disabled "
"until the user accepts or rejects the plan. Use "
"read-only tools (Read, Grep, Glob, WebSearch) to "
"refine the plan, or wait for the user's decision."
)
is_error = False
except Exception as exc:
output = f"Tool execution error: {exc}"
elif needs_approval(tu["name"]):
already_remembered = bool(
is_tool_auto_approved
and is_tool_auto_approved(tu["name"])
)
if not already_remembered and approve_tool is not None:
decision = await approve_tool(
tu["id"], tu["name"], tu["input"]
)
approved = bool(decision.get("approved"))
if not approved:
denied_reason = (
"User denied this tool call. Do not retry "
"the same call; ask the user what they "
"want, or try a different approach."
)

if approved:
try:
output = await tools_module.execute(
tu["name"], tu["input"], folder
)
is_error = False
except Exception as exc:
output = f"Tool execution error: {exc}"
is_error = True
else:
output = denied_reason or "Tool call not approved."
is_error = True

await emit({
"type": "tool.call.result",
"call_id": tu["id"],
Expand Down
74 changes: 74 additions & 0 deletions backend/app/llm/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,80 @@

Avoid open-ended clarifying questions like "what do you want me to do?" — give the user a short menu when you can.

# Execute, don't just describe

When the user asks for data or state the tools can fetch (list rows, read a file, run a command, query an API), EXECUTE the fetch and report the actual result. Do not write a documentation-style answer ("here's the endpoint, here's the curl, here's the SQL you would run") as a substitute for running the thing — that's strictly worse than just running it.

Examples:

- user: "list all the users in the db" → read the connection info (e.g. from `.env`), run the query, summarize the result. Don't reply with a curl example.
- user: "what's in this folder?" → run `ls` or Glob. Don't describe how to list files.
- user: "is the server running?" → run the check (`curl`, `ps`, `lsof`). Don't tell them how to check.

Only fall back to a documentation answer if (a) the user explicitly asked for docs / a recipe, or (b) you genuinely lack a tool to run it (no shell, no network, no credentials reachable).

# Summarize large result sets

When a fetch returns more than ~50 rows (or lines, or entries), do NOT paste them all into the reply. Roll them up: counts by category, min/max/recent, a small table of headline numbers, or first-N plus a total. Then offer 2-4 concrete follow-up "cuts" the user is likely to want next (filter by X, restrict to time window Y, drill into id Z).

# Tabular data → markdown tables, not raw terminal output

The UI renders markdown tables as real HTML `<table>` elements with proper column alignment, a header row, and horizontal scroll when wide. Use GitHub-flavored pipe syntax for any tabular result. Do NOT paste psql / `column -t` / space-padded terminal output into a code block — column widths drift when any cell is longer than its peers, and the result is unreadable.

<example>
✓ Good — markdown table:
| id | name | email | role | created_at |
|----|---------|-------------------------|------|----------------------|
| 1 | Admin | admin@verifywise.com | 5 | 2025-07-02T00:00:00Z |
| 4 | Patel | harsh@example.com | 3 | 2025-07-02T00:00:00Z |

✗ Bad — raw psql / fixed-width text in a code fence:
1 Admin admin@verifywise.com 5 2025-07-02T00:00:00Z
4 Patel harsh@example.com 3 2025-07-02T00:00:00Z
</example>

For ≥ 6 columns, consider whether all columns are actually needed — drop the ones the user didn't ask for before rendering. Many narrow columns are easier to read than every column compressed.

# Pick the right format for the content

Markdown renders differently depending on how you wrap content. Picking the wrong wrapper gives ugly or illegible output even when the underlying answer is correct.

- **Code, commands, or anything the user might copy** → fenced code block with a language hint (` ```python `, ` ```bash `, ` ```json `, ` ```sql `, etc.). The hint drives syntax highlighting; an untagged fence is plain monospace and loses the color signal.
- **Diffs** → ` ```diff ` fence. Lines starting with `+` go green, `-` go red, headers like `@@` go blue. Do NOT hand-color diffs by prefixing plain-text lines with `+` or `-` — they look like bullet lists to the renderer.
- **Logs, stack traces, command output, or any fixed-width content** → always a fenced code block. A stack trace in a paragraph word-wraps into noise.
- **Enumerations, status updates, step-by-step results, checklists** → bullet list (or numbered list if order matters), not a run-on paragraph. One item per line.
- **Tabular data** → pipe-syntax markdown table (see the *Tabular data* rule above).

<example>
✗ Bad: diff in a plain paragraph.

Add a null check before the return: - return user.name + return user?.name ?? "unknown"

✓ Good: fenced as `diff`.

```diff
- return user.name
+ return user?.name ?? "unknown"
```
</example>

<example>
✗ Bad: enumeration as prose.

Tests: auth.test.ts passed, users.test.ts passed, orgs.test.ts failed with a timeout on line 47, billing.test.ts passed.

✓ Good: bullet list.

- `auth.test.ts` — passed
- `users.test.ts` — passed
- `orgs.test.ts` — FAILED (timeout on line 47)
- `billing.test.ts` — passed
</example>

# Flag sensitive data

If a fetched result contains PII (names, emails, addresses, phone numbers, government ids), secrets (api keys, tokens, passwords), or anything else the user probably doesn't want pasted into chat history, flag it in one line before showing output and prefer summaries / aggregates over raw rows. If the user explicitly asked for the raw rows, comply — but still call out once that the data is sensitive so they can't say they weren't warned.

# Tone and style

- Only use emojis if the user explicitly requests it. Avoid using emojis in all communication unless asked.
Expand Down
104 changes: 104 additions & 0 deletions backend/app/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,20 @@ def __init__(self, session_id: str) -> None:
self._history: List[Dict[str, Any]] = []
self._is_first_turn = True
self._pending_questions: Dict[str, asyncio.Future] = {}
# Per-tool approval state. ``_pending_approvals`` keys are tool
# call_ids (assigned by the provider, already unique per turn).
# ``_auto_approved_tools`` is populated when the user hits
# "Approve & remember for this session" on an approval card, so
# subsequent calls to the same tool skip the modal.
self._pending_approvals: Dict[str, asyncio.Future] = {}
self._auto_approved_tools: Set[str] = set()
# Plan mode: while True, the agent loop blocks mutating tools
# and surfaces a plan decision card to the UI. Entered by the
# model via the EnterPlanMode tool; exited by the user through
# ``resolve_plan``.
self._plan_mode: bool = False
self._current_plan: Optional[str] = None
self._pending_plan: Optional[asyncio.Future] = None
# Set while a compact is in flight so we don't kick off a
# second one concurrently and so run_turn / submit_prompt can
# check before proceeding.
Expand Down Expand Up @@ -113,6 +127,92 @@ def resolve_question(self, qid: str, answers: Any) -> bool:
future.set_result(answers)
return True

async def approve(
    self, call_id: str, tool: str, tool_input: Dict[str, Any]
) -> Dict[str, Any]:
    """Pause until the user approves or denies a tool call.

    Called by the agent loop before executing any tool whose policy is
    ``ask`` (and not already remembered for this session).

    Emits ``tool.approve.request`` and blocks on a Future resolved by
    ``resolve_approval(call_id, ...)`` when the WS layer forwards the
    user's click.

    Args:
        call_id: Provider-assigned tool call id (already unique per turn).
        tool: Name of the tool the model wants to invoke.
        tool_input: The tool's input payload, echoed to the UI so the
            user can see exactly what they are approving.

    Returns:
        Dict with ``approved`` (bool) and optionally ``remember``
        (``"session"``).

    Session-scope remember is handled here: if the user ticks
    "Approve & remember," we stash the tool name so the next call to
    the same tool bypasses this method.
    """
    # This coroutine only ever runs inside the event loop, so use
    # get_running_loop() — get_event_loop() is deprecated for this
    # usage since Python 3.10 and can bind the wrong loop.
    future: asyncio.Future = asyncio.get_running_loop().create_future()
    self._pending_approvals[call_id] = future
    await self.emit({
        "type": "tool.approve.request",
        "call_id": call_id,
        "tool": tool,
        "input": tool_input,
    })
    try:
        result = await future
    finally:
        # Always drop the pending entry, even if the wait is cancelled
        # (e.g. the session shuts down mid-approval).
        self._pending_approvals.pop(call_id, None)
    if result.get("approved") and result.get("remember") == "session":
        self._auto_approved_tools.add(tool)
    return result

def is_tool_auto_approved(self, tool: str) -> bool:
    """True once the user has chosen "Approve & remember" for *tool*."""
    remembered = self._auto_approved_tools
    return tool in remembered

def is_in_plan_mode(self) -> bool:
    """True while a proposed plan is awaiting the user's decision."""
    return bool(self._plan_mode)

async def enter_plan_mode(self, plan: str) -> None:
    """Flip the session into plan mode and surface the plan to the UI.

    Called by the EnterPlanMode tool's executor: records the proposed
    plan text, raises the plan-mode flag, and emits a
    ``plan.proposal`` event so the frontend can render a decision card.
    """
    self._plan_mode = True
    self._current_plan = plan
    proposal = {"type": "plan.proposal", "plan": plan}
    await self.emit(proposal)

def resolve_plan(
    self, approved: bool, feedback: Optional[str] = None
) -> bool:
    """WS entry point for the user's plan decision.

    Accept: clears plan mode and emits ``plan.accepted``; the next
    turn executes normally. Reject: clears plan mode and emits
    ``plan.rejected`` with the user's feedback — the UI then typically
    sends that feedback as the next user prompt so the model can
    revise.

    Returns False (no-op) when the session is not in plan mode.
    """
    if not self._plan_mode:
        return False
    self._plan_mode = False
    plan, self._current_plan = self._current_plan, None
    if approved:
        payload = {"type": "plan.accepted", "plan": plan}
    else:
        payload = {
            "type": "plan.rejected",
            "plan": plan,
            "feedback": feedback or "",
        }
    # Fire-and-forget emit; we don't hold the WS loop.
    asyncio.create_task(self.emit(payload))
    return True

def resolve_approval(
    self, call_id: str, approved: bool, remember: Optional[str] = None
) -> bool:
    """WS entry point: deliver the user's approval decision.

    Looks up the pending Future that ``approve`` registered under
    *call_id* and resolves it with the decision dict. Returns False
    when nothing is waiting (unknown id, or already resolved).
    """
    pending = self._pending_approvals.get(call_id)
    if pending is None or pending.done():
        return False
    decision = {"approved": bool(approved), "remember": remember}
    pending.set_result(decision)
    return True

async def submit_prompt(
self,
prompt: str,
Expand Down Expand Up @@ -366,6 +466,10 @@ async def _run_turn(
emit=self.emit,
is_first_turn=self._is_first_turn,
ask_user=self.ask_user,
approve_tool=self.approve,
is_tool_auto_approved=self.is_tool_auto_approved,
enter_plan_mode=self.enter_plan_mode,
is_in_plan_mode=self.is_in_plan_mode,
)
self._is_first_turn = False
sess_store.update_status(self.session_id, "idle")
Expand Down
2 changes: 2 additions & 0 deletions backend/app/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,11 @@ def names(scope: str | None = None) -> List[str]:
from app.tools import grep # noqa: E402,F401
from app.tools import read # noqa: E402,F401
from app.tools import edit # noqa: E402,F401
from app.tools import multi_edit # noqa: E402,F401
from app.tools import write # noqa: E402,F401
from app.tools import notebook_edit # noqa: E402,F401
from app.tools import web # noqa: E402,F401
from app.tools import todo # noqa: E402,F401
from app.tools import subagent # noqa: E402,F401
from app.tools import ask_user # noqa: E402,F401
from app.tools import plan_mode # noqa: E402,F401
Loading