diff --git a/.tbd/config.yml b/.tbd/config.yml
index 027671ba..2f67fcc2 100644
--- a/.tbd/config.yml
+++ b/.tbd/config.yml
@@ -3,7 +3,7 @@ display:
 # Documentation cache configuration.
 # files: Maps destination paths (relative to .tbd/docs/) to source locations.
 #   Sources can be:
-#   - internal: prefix for bundled docs (e.g., "internal:shortcuts/standard/code-review-and-commit.md")
+#   - internal: prefix for bundled docs (e.g., "internal:shortcuts/standard/commit-code.md")
 #   - Full URL for external docs (e.g., "https://raw.githubusercontent.com/org/repo/main/file.md")
 # lookup_path: Search paths for doc lookup (like shell $PATH). Earlier paths take precedence.
 #
@@ -15,12 +15,10 @@ display:
 docs_cache:
   files:
     guidelines/backward-compatibility-rules.md: internal:guidelines/backward-compatibility-rules.md
-    guidelines/bun-monorepo-patterns.md: internal:guidelines/bun-monorepo-patterns.md
     guidelines/cli-agent-skill-patterns.md: internal:guidelines/cli-agent-skill-patterns.md
     guidelines/commit-conventions.md: internal:guidelines/commit-conventions.md
     guidelines/convex-limits-best-practices.md: internal:guidelines/convex-limits-best-practices.md
     guidelines/convex-rules.md: internal:guidelines/convex-rules.md
-    guidelines/electron-app-development-patterns.md: internal:guidelines/electron-app-development-patterns.md
     guidelines/error-handling-rules.md: internal:guidelines/error-handling-rules.md
     guidelines/general-coding-rules.md: internal:guidelines/general-coding-rules.md
     guidelines/general-comment-rules.md: internal:guidelines/general-comment-rules.md
@@ -29,25 +27,19 @@ docs_cache:
     guidelines/general-tdd-guidelines.md: internal:guidelines/general-tdd-guidelines.md
     guidelines/general-testing-rules.md: internal:guidelines/general-testing-rules.md
     guidelines/golden-testing-guidelines.md: internal:guidelines/golden-testing-guidelines.md
-    guidelines/pnpm-monorepo-patterns.md: internal:guidelines/pnpm-monorepo-patterns.md
     guidelines/python-cli-patterns.md: internal:guidelines/python-cli-patterns.md
     guidelines/python-modern-guidelines.md: internal:guidelines/python-modern-guidelines.md
     guidelines/python-rules.md: internal:guidelines/python-rules.md
-    guidelines/release-notes-guidelines.md: internal:guidelines/release-notes-guidelines.md
-    guidelines/tbd-sync-troubleshooting.md: internal:guidelines/tbd-sync-troubleshooting.md
+    guidelines/sync-troubleshooting.md: internal:guidelines/sync-troubleshooting.md
     guidelines/typescript-cli-tool-rules.md: internal:guidelines/typescript-cli-tool-rules.md
     guidelines/typescript-code-coverage.md: internal:guidelines/typescript-code-coverage.md
+    guidelines/typescript-monorepo-patterns.md: internal:guidelines/typescript-monorepo-patterns.md
     guidelines/typescript-rules.md: internal:guidelines/typescript-rules.md
-    guidelines/typescript-sorting-patterns.md: internal:guidelines/typescript-sorting-patterns.md
-    guidelines/typescript-yaml-handling-rules.md: internal:guidelines/typescript-yaml-handling-rules.md
-    guidelines/writing-style-guidelines.md: internal:guidelines/writing-style-guidelines.md
     shortcuts/standard/agent-handoff.md: internal:shortcuts/standard/agent-handoff.md
-    shortcuts/standard/checkout-third-party-repo.md: internal:shortcuts/standard/checkout-third-party-repo.md
-    shortcuts/standard/code-cleanup-all.md: internal:shortcuts/standard/code-cleanup-all.md
-    shortcuts/standard/code-cleanup-docstrings.md: internal:shortcuts/standard/code-cleanup-docstrings.md
-    shortcuts/standard/code-cleanup-tests.md: internal:shortcuts/standard/code-cleanup-tests.md
-    shortcuts/standard/code-review-and-commit.md: internal:shortcuts/standard/code-review-and-commit.md
-    shortcuts/standard/coding-spike.md: internal:shortcuts/standard/coding-spike.md
+    shortcuts/standard/cleanup-all.md: internal:shortcuts/standard/cleanup-all.md
+    shortcuts/standard/cleanup-remove-trivial-tests.md: internal:shortcuts/standard/cleanup-remove-trivial-tests.md
+    shortcuts/standard/cleanup-update-docstrings.md: internal:shortcuts/standard/cleanup-update-docstrings.md
+    shortcuts/standard/commit-code.md: internal:shortcuts/standard/commit-code.md
     shortcuts/standard/create-or-update-pr-simple.md: internal:shortcuts/standard/create-or-update-pr-simple.md
     shortcuts/standard/create-or-update-pr-with-validation-plan.md: internal:shortcuts/standard/create-or-update-pr-with-validation-plan.md
     shortcuts/standard/implement-beads.md: internal:shortcuts/standard/implement-beads.md
@@ -55,7 +47,6 @@ docs_cache:
     shortcuts/standard/new-architecture-doc.md: internal:shortcuts/standard/new-architecture-doc.md
     shortcuts/standard/new-guideline.md: internal:shortcuts/standard/new-guideline.md
     shortcuts/standard/new-plan-spec.md: internal:shortcuts/standard/new-plan-spec.md
-    shortcuts/standard/new-qa-playbook.md: internal:shortcuts/standard/new-qa-playbook.md
     shortcuts/standard/new-research-brief.md: internal:shortcuts/standard/new-research-brief.md
     shortcuts/standard/new-shortcut.md: internal:shortcuts/standard/new-shortcut.md
     shortcuts/standard/new-validation-plan.md: internal:shortcuts/standard/new-validation-plan.md
@@ -72,12 +63,10 @@ docs_cache:
     shortcuts/standard/update-specs-status.md: internal:shortcuts/standard/update-specs-status.md
     shortcuts/standard/welcome-user.md: internal:shortcuts/standard/welcome-user.md
     shortcuts/system/shortcut-explanation.md: internal:shortcuts/system/shortcut-explanation.md
-    shortcuts/system/skill-baseline.md: internal:shortcuts/system/skill-baseline.md
     shortcuts/system/skill-brief.md: internal:shortcuts/system/skill-brief.md
-    shortcuts/system/skill-minimal.md: internal:shortcuts/system/skill-minimal.md
+    shortcuts/system/skill.md: internal:shortcuts/system/skill.md
     templates/architecture-doc.md: internal:templates/architecture-doc.md
     templates/plan-spec.md: internal:templates/plan-spec.md
-    templates/qa-playbook.md: internal:templates/qa-playbook.md
     templates/research-brief.md: internal:templates/research-brief.md
   lookup_path:
     - .tbd/docs/shortcuts/system
diff --git a/docs/project/research/research-2026-02-02-tool-choice-parameter.md b/docs/project/research/research-2026-02-02-tool-choice-parameter.md
new file mode 100644
index 00000000..eedbc94a
--- /dev/null
+++ b/docs/project/research/research-2026-02-02-tool-choice-parameter.md
@@ -0,0 +1,565 @@
+# Research: Tool Choice Parameter in AI SDK and Major LLM Providers
+
+**Date:** 2026-02-02 (last updated 2026-02-17)
+
+**Author:** AI Research
+
+**Status:** Complete
+
+## Overview
+
+This research document provides a comprehensive technical overview of the `toolChoice` parameter
+implementation in the Vercel AI SDK and across major LLM providers. The focus is on understanding
+how to ensure agents reliably use tools (especially for form-filling use cases where web search
+or other research tools should be invoked before populating form fields).
+
+## Questions to Answer
+
+1. How does the AI SDK implement `toolChoice` and translate it to each provider's native format?
+2. What are the exact behaviors and options for each major provider (OpenAI, Anthropic, Google,
+   DeepSeek, xAI/Grok)?
+3. What are the best practices for ensuring agents use tools (especially web search) before
+   filling in forms or generating structured output?
+4. What are the common issues and pitfalls when using `toolChoice`?
+5. What patterns exist for combining tool calling with structured output?
+
+## Scope
+
+- **Included**: AI SDK implementation details (source code analysis), provider-specific behaviors,
+  community best practices, form-filling patterns, troubleshooting guidance
+- **Excluded**: Implementation of specific form-filling applications, UI/UX considerations
+
+---
+
+## Findings
+
+### 1. AI SDK Core Implementation
+
+#### 1.1 Type Definition
+
+The AI SDK defines `ToolChoice` in `packages/ai/src/types/language-model.ts:100-104`:
+
+```typescript
+export type ToolChoice<TOOLS extends Record<string, unknown>> =
+  | 'auto'
+  | 'none'
+  | 'required'
+  | { type: 'tool'; toolName: Extract<keyof TOOLS, string> };
+```
+
+**Options:**
+- `'auto'` (default): The model can choose whether and which tools to call
+- `'none'`: The model must not call tools
+- `'required'`: The model must call a tool (can choose which one)
+- `{ type: 'tool', toolName: string }`: The model must call the specified tool
+
+#### 1.2 Core Translation Logic
+
+In `packages/ai/src/prompt/prepare-tools-and-tool-choice.ts:79-85`, the SDK translates the
+user-facing `toolChoice` to the internal provider format:
+
+```typescript
+toolChoice:
+  toolChoice == null
+    ? { type: 'auto' }
+    : typeof toolChoice === 'string'
+      ? { type: toolChoice }
+      : { type: 'tool' as const, toolName: toolChoice.toolName as string },
+```
+
+**Key insight**: When `toolChoice` is `undefined`/`null`, it defaults to `{ type: 'auto' }`.
+
+#### 1.3 Provider-Level Type
+
+The internal type used by providers is `LanguageModelV3ToolChoice` in
+`packages/provider/src/language-model/v3/language-model-v3-tool-choice.ts`:
+
+```typescript
+export type LanguageModelV3ToolChoice =
+  | { type: 'auto' }    // tool selection is automatic (can be no tool)
+  | { type: 'none' }    // no tool must be selected
+  | { type: 'required' } // one of the available tools must be selected
+  | { type: 'tool'; toolName: string }; // a specific tool must be selected
+```
+
+---
+
+### 2. Provider-Specific Implementations
+
+#### 2.1 OpenAI
+
+**Source**: `packages/openai/src/chat/openai-chat-prepare-tools.ts:59-76`
+
+**Translation:**
+
+| AI SDK Value | OpenAI Native Value |
+|--------------|---------------------|
+| `auto` | `'auto'` |
+| `none` | `'none'` |
+| `required` | `'required'` |
+| `{ type: 'tool', toolName }` | `{ type: 'function', function: { name: toolName } }` |
+
+**Native API Documentation:**
+- `tool_choice: "auto"` - Model decides whether to call functions (default)
+- `tool_choice: "none"` - Model will not call any tool, generates message only
+- `tool_choice: "required"` - Model must call one or more tools
+- `tool_choice: { type: "function", function: { name: "..." } }` - Force specific function
+
+**Notable:** OpenAI supports parallel function calling by default.
+
+**Sources:**
+- [OpenAI Function Calling Guide](https://platform.openai.com/docs/guides/function-calling)
+- [OpenAI Tools Guide](https://platform.openai.com/docs/guides/tools)
+
+#### 2.2 Anthropic (Claude)
+
+**Source**: `packages/anthropic/src/anthropic-prepare-tools.ts:310-353`
+
+**Translation (with important differences):**
+
+| AI SDK Value | Anthropic Native Value |
+|--------------|------------------------|
+| `auto` | `{ type: 'auto' }` |
+| `none` | *removes tools entirely* (the SDK source treats 'none' as unsupported by Anthropic) |
+| `required` | `{ type: 'any' }` (**Note: 'any', not 'required'**) |
+| `{ type: 'tool', toolName }` | `{ type: 'tool', name: toolName }` |
+
+**Critical Insight from Source Code (lines 333-335):**
+```typescript
+case 'none':
+  // Anthropic does not support 'none' tool choice, so we remove the tools:
+  return { tools: undefined, toolChoice: undefined, toolWarnings, betas };
+```
+
+**Anthropic-Specific Features:**
+- `disable_parallel_tool_use: boolean` - Can be combined with any toolChoice type
+- Setting `disable_parallel_tool_use=true` with `type: 'any'` or `type: 'tool'` ensures
+  exactly one tool is called
+
+**Extended Thinking Limitation:**
+When using extended thinking, only `tool_choice: {"type": "auto"}` and
+`tool_choice: {"type": "none"}` are compatible. Using `any` or `tool` types will error.
+
+**Sources:**
+- [Anthropic Tool Use Documentation](https://platform.claude.com/docs/en/agents-and-tools/tool-use/implement-tool-use)
+- [Anthropic Advanced Tool Use](https://www.anthropic.com/engineering/advanced-tool-use)
+
+#### 2.3 Google (Gemini)
+
+**Source**: `packages/google/src/google-prepare-tools.ts:225-256`
+
+**Translation:**
+
+| AI SDK Value | Gemini Native Value |
+|--------------|---------------------|
+| `auto` | `{ functionCallingConfig: { mode: 'AUTO' } }` |
+| `none` | `{ functionCallingConfig: { mode: 'NONE' } }` |
+| `required` | `{ functionCallingConfig: { mode: 'ANY' } }` |
+| `{ type: 'tool', toolName }` | `{ functionCallingConfig: { mode: 'ANY', allowedFunctionNames: [toolName] } }` |
+
+**Native API Options:**
+- `AUTO` (default): Model decides whether to call functions
+- `NONE`: Model cannot make function calls
+- `ANY`: Forces model to predict a function call
+- `VALIDATED` (Preview): Like ANY but allows text responses too
+
+**Best Practices from Google:**
+- Keep active tools to **10-20 maximum** to reduce selection errors
+- Use **low temperature** (e.g., 0) for deterministic function calls
+- Apply **strong typing** (enums for fixed value sets)
+
+**Sources:**
+- [Google AI Function Calling](https://ai.google.dev/gemini-api/docs/function-calling)
+- [Vertex AI Function Calling](https://docs.cloud.google.com/vertex-ai/generative-ai/docs/model-reference/function-calling)
+
+#### 2.4 DeepSeek
+
+**Source**: `packages/deepseek/src/chat/deepseek-prepare-tools.ts:54-68`
+
+**Translation (follows OpenAI format):**
+
+| AI SDK Value | DeepSeek Native Value |
+|--------------|----------------------|
+| `auto` | `'auto'` |
+| `none` | `'none'` |
+| `required` | `'required'` |
+| `{ type: 'tool', toolName }` | `{ type: 'function', function: { name: toolName } }` |
+
+**Notable Limitations:**
+- DeepSeek's official documentation does not explicitly document the `tool_choice` parameter
+- Uses OpenAI-compatible API format
+- The model may hallucinate parameters not in your schema - validate arguments before calling
+- Weak at multi-turn function calling; performs best when a single user message triggers the calls
+
+**Sources:**
+- [DeepSeek Function Calling](https://api-docs.deepseek.com/guides/function_calling)
+- [DeepSeek Tool Calls](https://api-docs.deepseek.com/guides/tool_calls)
+
+#### 2.5 xAI (Grok)
+
+**Source**: `packages/xai/src/xai-prepare-tools.ts:71-86` and
+`packages/xai/src/responses/xai-responses-prepare-tools.ts:156-186`
+
+**Translation:**
+
+| AI SDK Value | xAI Native Value |
+|--------------|------------------|
+| `auto` | `'auto'` |
+| `none` | `'none'` |
+| `required` | `'required'` |
+| `{ type: 'tool', toolName }` | `{ type: 'function', name: toolName }` |
+
+**Notable from Source Code (`xai-responses-prepare-tools.ts`, lines 173-180):**
+```typescript
+if (selectedTool.type === 'provider') {
+  // xAI API does not support forcing specific server-side tools via toolChoice
+  // Only function tools can be forced
+  toolWarnings.push({
+    type: 'unsupported',
+    feature: `toolChoice for server-side tool "${selectedTool.name}"`,
+  });
+```
+
+**Recommended Model:** xAI recommends `grok-4-1-fast` for agentic tool calling.
+
+**Sources:**
+- [xAI Function Calling](https://docs.x.ai/docs/guides/function-calling)
+- [xAI Tools Overview](https://docs.x.ai/docs/guides/tools/overview)
+
+---
+
+### 3. Summary: Provider Translation Table
+
+| AI SDK `toolChoice` | OpenAI | Anthropic | Google | DeepSeek | xAI |
+|---------------------|--------|-----------|--------|----------|-----|
+| `'auto'` | `'auto'` | `{ type: 'auto' }` | `mode: 'AUTO'` | `'auto'` | `'auto'` |
+| `'none'` | `'none'` | *removes tools* | `mode: 'NONE'` | `'none'` | `'none'` |
+| `'required'` | `'required'` | `{ type: 'any' }` | `mode: 'ANY'` | `'required'` | `'required'` |
+| `{ type: 'tool', toolName: 'x' }` | `{ type: 'function', function: { name: 'x' } }` | `{ type: 'tool', name: 'x' }` | `mode: 'ANY', allowedFunctionNames: ['x']` | `{ type: 'function', function: { name: 'x' } }` | `{ type: 'function', name: 'x' }` |
+
+---
+
+### 4. Form-Filling Use Cases and Patterns
+
+#### 4.1 The Challenge
+
+When building form-filling agents, a common issue is that the model may:
+1. **Hallucinate data** instead of using tools to research
+2. **Skip tool calls** and go directly to filling the form
+3. **Analyze/plan** what it would do instead of actually calling tools
+4. **Call tools unreliably** after ~5 messages in a conversation
+
+#### 4.2 Pattern: Answer Tool with `toolChoice: 'required'`
+
+**Recommended Approach (AI SDK 6+):**
+
+Use an "answer" tool without an `execute` function and `toolChoice: 'required'` to force
+structured output:
+
+```typescript
+import { generateText, hasToolCall, tool } from 'ai';
+import { z } from 'zod';
+
+const result = await generateText({
+  model: yourModel,
+  tools: {
+    webSearch: tool({
+      description: 'Search the web for information',
+      parameters: z.object({ query: z.string() }),
+      execute: async ({ query }) => { /* search implementation */ }
+    }),
+    submitForm: tool({
+      description: 'Submit the completed form with researched data',
+      parameters: z.object({
+        field1: z.string().describe('Value for field1 (must be researched)'),
+        field2: z.string().describe('Value for field2 (must be researched)'),
+      }),
+      // No execute function - acts as termination signal
+    }),
+  },
+  toolChoice: 'required', // Must use a tool at every step
+  stopWhen: hasToolCall('submitForm'), // Stop when form is submitted
+  system: `You are a research assistant. Before filling ANY form field:
+1. Use webSearch to find accurate, current information
+2. NEVER guess or hallucinate data
+3. Only call submitForm when you have researched ALL fields`,
+  prompt: userQuery,
+});
+
+// Get the form data from staticToolCalls (tools without execute)
+const formData = result.staticToolCalls.find(
+  call => call.toolName === 'submitForm'
+)?.args;
+```
+
+#### 4.3 Pattern: AI SDK 6 Unified Output
+
+**New in AI SDK 6:** Combine tool calling with structured output in one flow:
+
+```typescript
+import { generateText, Output } from 'ai';
+
+const result = await generateText({
+  model: yourModel,
+  tools: { webSearch, fetchUrl },
+  output: Output.object({
+    schema: z.object({
+      companyName: z.string(),
+      foundedYear: z.number(),
+      headquarters: z.string(),
+    }),
+  }),
+  system: `Research the company thoroughly using web search before
+           providing structured output. Do not hallucinate.`,
+  prompt: 'Get information about Anthropic',
+});
+
+// result.object contains the structured data
+```
+
+**Important:** Structured output generation counts as an additional step. Adjust `stopWhen`
+accordingly.
+
+#### 4.4 Pattern: Explicit Tool Guidance in Prompts
+
+**System Prompt Best Practices:**
+
+```
+CRITICAL INSTRUCTIONS FOR TOOL USE:
+1. For ANY information that could be time-sensitive, ALWAYS use webSearch first
+2. For ANY factual claims (dates, numbers, names), ALWAYS verify with webSearch
+3. NEVER fill in form fields with guessed or assumed data
+4. If webSearch returns no results, explicitly state "Unknown" rather than guessing
+5. Call tools BEFORE reasoning about the answer, not after
+```
+
+#### 4.5 Pattern: Multi-Step Verification Loop
+
+For critical data accuracy, use a verification pattern:
+
+```typescript
+const agent = createAgent({
+  tools: {
+    webSearch,
+    verifyFact: tool({
+      description: 'Double-check a fact by searching again',
+      parameters: z.object({
+        fact: z.string(),
+        originalSource: z.string(),
+      }),
+      execute: async ({ fact }) => { /* second search */ },
+    }),
+    submitVerifiedForm: tool({
+      description: 'Submit only after all facts are verified',
+      parameters: formSchema,
+    }),
+  },
+  stopWhen: hasToolCall('submitVerifiedForm'),
+  prepareStep: ({ lastToolResults }) => {
+    // Force verification if not all fields verified
+    if (needsVerification(lastToolResults)) {
+      return { toolChoice: { type: 'tool', toolName: 'verifyFact' } };
+    }
+    return {};
+  },
+});
+```
+
+---
+
+### 5. Common Issues and Troubleshooting
+
+#### 5.1 Tool Execution Becomes Unreliable After ~5 Messages
+
+**Issue:** Models increasingly fail to execute tools after approximately 5 messages, instead
+analyzing or describing what they would do.
+
+**Solutions:**
+- Add explicit tool-use reminders in subsequent messages: "Remember to USE the webSearch
+  tool, not describe using it"
+- Reset context periodically with `context.compact()` in AI SDK 6
+- Use `toolChoice: 'required'` to force tool usage
+
+#### 5.2 Endless Loop with `toolChoice: 'required'`
+
+**Issue:** Setting `toolChoice: 'required'` can cause infinite loops when using `streamText`.
+
+**Solutions:**
+- Use `stopWhen: hasToolCall('finalTool')` with a termination tool
+- Use `stopWhen: stepCountIs(n)` as a safety limit
+- Use `prepareStep` to dynamically change `toolChoice` on final step:
+
+```typescript
+prepareStep: ({ stepNumber }) => {
+  if (stepNumber >= 5) {
+    return { toolChoice: 'auto' }; // Allow text response
+  }
+  return { toolChoice: 'required' };
+},
+```
+
+#### 5.3 Model Hallucinating Tool Calls
+
+**Issue:** Model says it's calling a tool but is actually hallucinating results.
+
+**Solutions:**
+- Check for actual tool_use blocks in response, not just text mentioning tools
+- Use `toolChoice: 'required'` to force structured tool calls
+- Implement validation on tool results before accepting
+
+#### 5.4 Anthropic: 'none' Doesn't Work as Expected
+
+**Issue:** `toolChoice: 'none'` with Anthropic doesn't just prevent tool use, it removes
+all tool definitions.
+
+**Solution:** This is intentional per the source code. If you need tools available but not
+used in a specific call, use prompting instead: "Do not use any tools for this response."
+
+#### 5.5 Parallel Tool Calls Not Working
+
+**Causes:**
+1. Incorrect tool result formatting (separate messages instead of combined)
+2. Weak prompting
+3. Model-specific limitations (Sonnet 3.7 is less likely than Claude 4 to issue parallel tool calls)
+
+**Solution:**
+```typescript
+// Wrong: Separate messages
+[
+  { role: 'assistant', content: [tool_use_1, tool_use_2] },
+  { role: 'user', content: [tool_result_1] },
+  { role: 'user', content: [tool_result_2] },  // Separate
+]
+
+// Correct: Single message with all results
+[
+  { role: 'assistant', content: [tool_use_1, tool_use_2] },
+  { role: 'user', content: [tool_result_1, tool_result_2] },  // Combined
+]
+```
+
+---
+
+### 6. Best Practices Summary
+
+#### 6.1 For Reliable Tool Use
+
+1. **Use `toolChoice: 'required'`** when tools MUST be used
+2. **Provide detailed tool descriptions** - this is the most important factor
+3. **Limit tools to 5-7** for optimal selection accuracy (10-20 max)
+4. **Use explicit prompting** about when to use which tool
+5. **Implement answer/termination tools** for structured output flows
+
+#### 6.2 For Form-Filling Specifically
+
+1. **Always research before filling** - use `toolChoice: 'required'` initially
+2. **Use an answer tool without execute** - terminates loop with structured data
+3. **Validate all tool results** - don't trust raw model outputs
+4. **Use the AI SDK 6 `output` option** for cleaner structured output flows
+5. **Add verification steps** for critical data
+
+#### 6.3 Provider-Specific Recommendations
+
+| Provider | Recommendation |
+|----------|----------------|
+| OpenAI | Use `'required'` directly; supports parallel calls |
+| Anthropic | Remember `required` → `any` translation; use `disable_parallel_tool_use` for single calls |
+| Google | Use `ANY` mode with `allowedFunctionNames` for specific tools |
+| DeepSeek | Validate tool arguments; avoid multi-turn tool flows |
+| xAI | Use `grok-4-1-fast` for best tool calling; can't force provider tools |
+
+---
+
+## Recommendations for Markform
+
+Based on this research and confirmed by reviewing the Markform harness architecture
+(stateless turns with multi-step tool loops within each turn):
+
+### Recommended: `toolChoice: 'required'` as Default
+
+Use `toolChoice: 'required'` (mapped to AI SDK `require_tools` policy) as the default
+for all form filling. This:
+- Prevents "analysis paralysis" where models describe what they'd do without acting
+- Works across OpenAI, Anthropic (→ `any`), Google (→ `ANY`), and xAI
+- Already set as default in `liveAgent.ts:91`
+- Uses `stopWhen: stepCountIs(maxStepsPerTurn)` to prevent infinite loops
+
+### For Guaranteed Web Search: `prepareStep` with Forced First Step
+
+When forms require factual research, use AI SDK's `prepareStep` callback to force
+`web_search` on step 0, then `'required'` for subsequent steps. This works because
+within a single `generateText()` call, the model accumulates context across steps, so
+search results from step 0 are visible when calling `fill_form` in step 1.
+
+```typescript
+prepareStep: ({ steps }) => {
+  const hasSearched = steps.some(step =>
+    step.toolCalls.some(tc => isWebSearchTool(tc.toolName))
+  );
+  if (!hasSearched) {
+    return { toolChoice: { type: 'tool', toolName: 'web_search' } };
+  }
+  return { toolChoice: 'required' };
+},
+```
+
+**Critical architecture note:** Markform turns are stateless—web search results do NOT
+persist across turns. All tool policies must operate at the **step** level (within a
+single `generateText()` call), not at the turn level.
+
+### Future: Harness-Level Research Injection
+
+For maximum reliability (not dependent on model behavior), the harness could run web
+searches itself before calling the LLM, injecting results into the context prompt.
+This is the most provider-agnostic approach and decouples research quality from
+model tool-calling behavior. See the plan spec for details.
+
+### Provider-Specific Notes
+
+| Provider | `'required'` | Forced specific tool | Caveat |
+|----------|-------------|---------------------|--------|
+| OpenAI | Works directly | Works directly | — |
+| Anthropic | → `any` | → `{ type: 'tool' }` | Not compatible with extended thinking |
+| Google | → `ANY` | → `allowedFunctionNames` | Limit to 10-20 tools |
+| DeepSeek | Needs testing | Needs testing | Unreliable multi-turn calling |
+| xAI | Works | Can't force server-side tools | Use grok-4-1-fast |
+
+---
+
+## References
+
+### AI SDK Documentation
+- [AI SDK Tool Calling](https://ai-sdk.dev/docs/ai-sdk-core/tools-and-tool-calling)
+- [AI SDK Agents: Loop Control](https://ai-sdk.dev/docs/agents/loop-control)
+- [AI SDK Generating Structured Data](https://ai-sdk.dev/docs/ai-sdk-core/generating-structured-data)
+- [AI SDK Troubleshooting: Tool Calling with Structured Outputs](https://ai-sdk.dev/docs/troubleshooting/tool-calling-with-structured-outputs)
+- [AI SDK 6 Announcement](https://vercel.com/blog/ai-sdk-6)
+
+### Provider Documentation
+- [OpenAI Function Calling](https://platform.openai.com/docs/guides/function-calling)
+- [Anthropic Tool Use](https://platform.claude.com/docs/en/agents-and-tools/tool-use/overview)
+- [Anthropic Implement Tool Use](https://platform.claude.com/docs/en/agents-and-tools/tool-use/implement-tool-use)
+- [Google Gemini Function Calling](https://ai.google.dev/gemini-api/docs/function-calling)
+- [DeepSeek Function Calling](https://api-docs.deepseek.com/guides/function_calling)
+- [xAI Function Calling](https://docs.x.ai/docs/guides/function-calling)
+
+### AI SDK Source Code References
+- `packages/ai/src/types/language-model.ts:100-104` - ToolChoice type definition
+- `packages/ai/src/prompt/prepare-tools-and-tool-choice.ts:79-85` - Core translation logic
+- `packages/provider/src/language-model/v3/language-model-v3-tool-choice.ts` - Provider-level type
+- `packages/openai/src/chat/openai-chat-prepare-tools.ts:59-76` - OpenAI translation
+- `packages/anthropic/src/anthropic-prepare-tools.ts:310-353` - Anthropic translation
+- `packages/google/src/google-prepare-tools.ts:225-256` - Google translation
+- `packages/deepseek/src/chat/deepseek-prepare-tools.ts:54-68` - DeepSeek translation
+- `packages/xai/src/xai-prepare-tools.ts:71-86` and `packages/xai/src/responses/xai-responses-prepare-tools.ts:156-186` - xAI translation
+
+### Community Resources
+- [GitHub Issue: Tool Execution Unreliable After ~5 Messages](https://github.com/vercel/ai/issues/10269)
+- [GitHub Issue: toolChoice 'required' Endless Loop](https://github.com/vercel/ai/issues/3944)
+- [Vercel Blog: We Removed 80% of Our Agent's Tools](https://vercel.com/blog/we-removed-80-percent-of-our-agents-tools)
+- [GitHub Discussion: Tool Calling Loop Understanding](https://github.com/vercel/ai/discussions/8514)
+
+### Hallucination Prevention
+- [Zep: Reducing LLM Hallucinations](https://www.getzep.com/ai-agents/reducing-llm-hallucinations/)
+- [Cleanlab: Prevent Hallucinated Responses](https://cleanlab.ai/blog/prevent-hallucinated-responses/)
+- [AWS: Reducing Hallucinations with Verified Semantic Cache](https://aws.amazon.com/blogs/machine-learning/reducing-hallucinations-in-llm-agents-with-a-verified-semantic-cache-using-amazon-bedrock-knowledge-bases/)
diff --git a/docs/project/specs/active/plan-2026-02-02-tool-choice-policies.md b/docs/project/specs/active/plan-2026-02-02-tool-choice-policies.md
new file mode 100644
index 00000000..58cf8de6
--- /dev/null
+++ b/docs/project/specs/active/plan-2026-02-02-tool-choice-policies.md
@@ -0,0 +1,469 @@
+# Plan Spec: Tool Choice Policies for Reliable Form Filling
+
+**Date:** 2026-02-02 (last updated 2026-02-15)
+
+**Author:** AI Research
+
+**Status:** Draft (revised after senior engineering review)
+
+## Overview
+
+This spec defines a **tool choice policy system** for Markform that gives form authors and
+consumers fine-grained control over how agents use tools (especially web search) during form
+filling. The goal is to ensure agents reliably research information before filling fields,
+reducing hallucination and improving data accuracy.
+
+**Related Docs:**
+- `docs/project/research/research-2026-02-02-tool-choice-parameter.md` - Research on AI SDK
+  toolChoice and provider behavior
+- `docs/project/specs/active/plan-2026-01-27-parallel-form-filling.md` - Parallel execution spec
+
+## Goals
+
+1. **Reduce hallucination**: Ensure agents use web search and other research tools before
+   filling fields that require external data
+2. **Configurable policies**: Provide multiple tool use policies that balance research
+   thoroughness against latency/cost
+3. **Cross-model compatibility**: Work reliably across providers (OpenAI, Anthropic, Google,
+   DeepSeek, xAI)
+4. **Form-level control**: Allow policies at form level via frontmatter and CLI
+
+## Non-Goals
+
+- Custom tool definitions per form (tools are provided by the harness)
+- Model-specific prompt tuning (policies should work across models)
+- UI for policy configuration (CLI/API only for now)
+- Automatic policy selection based on field content
+- Per-field tool policies (may be considered in v2)
+
+## Background
+
+### The Problem
+
+Models often skip web search and fill fields with training data (which may be outdated
+or hallucinated). The current `toolChoice: 'required'` default (set at `liveAgent.ts:91`)
+forces the model to call *some* tool, but doesn't guarantee it uses web search over
+fill_form.
+
+### Architecture: Steps vs Turns (Critical Context)
+
+The Markform harness has a two-level iteration model that any tool policy must respect:
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ Harness Turn (one fillFormTool() call)                      │
+│                                                             │
+│  ┌──────────────────────────────────────────────────────┐   │
+│  │ Single generateText() invocation                     │   │
+│  │                                                      │   │
+│  │  Step 0: web_search("query")  →  results ✓           │   │
+│  │  Step 1: fill_form(patches)   ← sees search results  │   │
+│  │  Step 2: web_search("query2") →  results ✓           │   │
+│  │  Step 3: fill_form(patches)   ← sees all prior       │   │
+│  │  ...up to maxStepsPerTurn (default: 20)               │   │
+│  │                                                      │   │
+│  │  Context PRESERVED within these steps                 │   │
+│  └──────────────────────────────────────────────────────┘   │
+│                                                             │
+│  Form markdown updated with patches → next turn             │
+│                                                             │
+└─────────────────────────────────────────────────────────────┘
+                          │
+                          ▼ Context RESET
+┌─────────────────────────────────────────────────────────────┐
+│ Next Harness Turn (fresh fillFormTool() call)                │
+│                                                             │
+│  Only sees: updated form markdown + remaining issues         │
+│  Previous web search results are LOST                       │
+│  Previous conversation/reasoning is LOST                    │
+└─────────────────────────────────────────────────────────────┘
+```
+
+**Key facts from code review:**
+
+1. **Within a turn** (`liveAgent.ts:201-209`): One `generateText()` call allows up to
+   `maxStepsPerTurn` (default 20) AI SDK steps. The model accumulates context across
+   steps—web search results from step 0 ARE visible when calling fill_form in step 1.
+   AI SDK's `prepareStep` callback fires between steps and can change `toolChoice` and
+   `activeTools` per step (`generate-text.ts:518-546`).
+
+2. **Across turns** (`liveAgent.ts:123-125`): Each call is stateless. The full form
+   context is provided fresh. Only three things persist across turns:
+   - The form markdown (updated with filled values)
+   - The remaining issues list
+   - Previous patch rejections
+
+3. **Web search results do NOT persist across turns.** Any policy that separates
+   "research" and "fill" into different harness turns will lose the research results.
+
+### Implications for Policy Design
+
+- **Policies must operate at the step level** (within a single `generateText()` call),
+  NOT at the turn level (across separate `fillFormTool()` invocations).
+- **"Two-phase" as separate invocations is broken** — research results from Phase 1 are
+  lost when Phase 2 starts fresh.
+- **`prepareStep`** is the correct mechanism for step-level control. It can dynamically
+  change `toolChoice` and `activeTools` between steps.
+
+## Design
+
+### Tool Choice Policy Enum
+
+```typescript
+type ToolPolicy =
+  | 'none'                  // No tools provided to agent
+  | 'auto'                  // Model chooses freely whether to use tools
+  | 'require_tools'         // toolChoice: 'required' on every step (DEFAULT)
+  | 'require_web_search'    // Step 0 must be web_search, then require_tools
+```
+
+**Default:** `require_tools`
+
+### Policy Behaviors
+
+#### `none`
+
+```
+No tools provided to agent.
+Agent can only generate text responses.
+toolChoice: N/A
+```
+
+**When to use:** Testing, debugging, or when tools are intentionally disabled.
+
+#### `auto`
+
+```
+toolChoice: 'auto' on every step.
+Model decides when to search and when to fill.
+No enforcement of tool usage.
+```
+
+**When to use:** Legacy behavior, simple forms that don't need research, or when you
+want maximum model flexibility.
+
+#### `require_tools` (Default)
+
+```
+toolChoice: 'required' on every step.
+Model must call SOME tool on every step (fill_form or web_search).
+Prevents "analysis paralysis" where model talks without acting.
+Model can interleave web_search and fill_form freely within a turn.
+```
+
+**When to use:** General production use. Ensures progress on every step.
+Already set as the default at `liveAgent.ts:91`.
+
+**Note:** This does NOT guarantee web search is used. The model may go straight
+to fill_form. For guaranteed research, use `require_web_search`.
+
+#### `require_web_search`
+
+```
+Step 0: toolChoice: { type: 'tool', toolName: 'web_search' }
+Step 1+: toolChoice: 'required'
+```
+
+**When to use:** Forms with factual fields that need current data. Guarantees at
+least one web search before any form filling within each turn.
+
+**Implementation** (sketch; `'web_search'` stands in for the provider-resolved search tool name):
+```typescript
+// In liveAgent.ts, pass prepareStep to generateText():
+prepareStep: ({ stepNumber, steps }) => {
+  const hasSearched = steps.some(step =>
+    step.toolCalls.some(tc => isWebSearchTool(tc.toolName))
+  );
+  if (!hasSearched) {
+    return { toolChoice: { type: 'tool', toolName: 'web_search' } };
+  }
+  return { toolChoice: 'required' };
+},
+```
+
+**Behavior within a single turn:**
+1. Step 0: Model is forced to call web_search
+2. Step 1: Model sees search results, chooses any tool (usually fill_form)
+3. Step 2+: Model continues with required tools (may search again or fill more)
+
+### Deferred: More Aggressive Research Policies
+
+The original spec proposed `web_search_always` and `two_phase` policies. These are
+**deferred** pending architectural work:
+
+#### Why `two_phase` doesn't work as designed
+
+The original design called for "Phase 1: research only, Phase 2: fill only" as separate
+agent invocations. This is broken because web search results from Phase 1 are completely
+lost when Phase 2 starts (turns are stateless).
+
+**Possible future approaches:**
+1. **Harness-level research injection** (recommended): The harness itself runs web
+   searches before calling the LLM, based on field labels/descriptions. Inject results
+   into the context prompt. The model never has to decide whether to search.
+2. **Research accumulator**: Store web search results in a sidecar that persists across
+   turns. Inject into subsequent prompts. Adds complexity.
+3. **Single-turn two-phase via `activeTools`**: Within one `generateText()` call, use
+   `prepareStep` to expose only web_search for steps 0-N, then only fill_form from
+   step N+1 onward. Works within a turn but may exceed step limits for complex forms.
+
+#### Why `web_search_always` is questionable
+
+Forcing web search on every step within a turn doesn't make sense — after step 0
+returns search results, the model already has the information. Forcing redundant
+searches wastes API calls. The `require_web_search` policy (search on step 0) achieves
+the same goal more efficiently.
+
+If the concern is that the model needs to search for different fields at different
+points, `require_tools` already allows this — the model can interleave searches and
+fills freely.
+
+### Alternative Approaches Worth Considering
+
+#### Harness-Level Research Injection (Future)
+
+Instead of asking the model to decide when to search, the harness itself runs web
+searches based on field metadata:
+
+```typescript
+// Pseudocode for future harness-level research
+async function researchFields(form: ParsedForm, issues: InspectIssue[]): Promise<string> {
+  const queries = generateSearchQueries(form, issues);
+  const results = await Promise.all(queries.map(q => webSearch(q)));
+  return formatResearchContext(results);
+}
+
+// Inject into context prompt
+const contextPrompt = buildContextPrompt(issues, form, maxPatches, previousRejections);
+const researchContext = await researchFields(form, issues);
+const fullPrompt = contextPrompt + '\n\n# Research Results\n' + researchContext;
+```
+
+**Advantages:**
+- Most reliable — doesn't depend on model behavior
+- Works identically across all providers
+- Research quality can be tuned independently of the LLM
+- Can be cached/reused across turns
+
+**Disadvantages:**
+- Harness must know what to search for (field labels may not be sufficient)
+- Upfront latency for search before LLM call
+- May search for things the model doesn't need
+
+This is the recommended direction for forms that truly need guaranteed research.
+
+#### Post-Turn Validation (Alternative)
+
+Let the model use `require_tools` freely, but validate after each turn:
+
+```typescript
+// After fillFormTool() returns, check tool usage
+const toolCalls = response.stats.toolCalls;
+const usedWebSearch = toolCalls.some(tc => isWebSearchTool(tc.name));
+
+if (!usedWebSearch && policyRequiresSearch) {
+  // Inject reminder into next turn's context
+  previousRejections.push({
+    message: 'You did not use web search. Research field values before filling.',
+    // ... triggers re-try
+  });
+}
+```
+
+- **Advantages:** Simple, works across providers, no `prepareStep` complexity.
+- **Disadvantages:** Wastes a turn when model doesn't search. Slower.
+
+### API Changes
+
+#### FillOptions Extension
+
+```typescript
+interface FillOptions {
+  // ... existing options
+
+  /**
+   * Tool choice policy for agent tool selection.
+   * Controls how strictly the harness enforces tool usage.
+   *
+   * @default 'require_tools'
+   */
+  toolPolicy?: ToolPolicy;
+}
+```
+
+#### Frontmatter Configuration
+
+```yaml
+---
+markform:
+  spec: MF/0.1
+  harness_config:
+    tool_policy: require_web_search    # New option
+---
+```
+
+#### CLI Extension
+
+```bash
+# New --tool-policy flag
+markform fill form.md --tool-policy=require_web_search
+
+# Override policy at CLI level (CLI overrides frontmatter)
+markform fill form.md --tool-policy=auto
+```
+
+### Provider-Specific Considerations
+
+| Provider | `require_tools` | `require_web_search` | Notes |
+|----------|----------------|---------------------|-------|
+| OpenAI | Works directly | Works directly | Parallel tool calls supported |
+| Anthropic | `required` → `any` | `{ type: 'tool' }` works | Not compatible with extended thinking |
+| Google | `required` → `ANY` | `ANY` + `allowedFunctionNames` | Limit to 10-20 tools |
+| DeepSeek | **Needs testing** | **Needs testing** | Unreliable multi-turn; may need fallback |
+| xAI | Works | Can't force provider-defined tools | Use grok-4-1-fast |
+
+### Areas of Uncertainty (Requiring Testing)
+
+1. **DeepSeek `require_tools` behavior**: Research indicates unreliable tool calling.
+   - Does `toolChoice: 'required'` work reliably?
+   - Should we auto-downgrade to `auto` for DeepSeek?
+   - Test with both single-step and multi-step turns.
+
+2. **Anthropic extended thinking + `require_tools`/`require_web_search`**: Anthropic docs say
+   only `tool_choice` `auto` or `none` works with extended thinking (`any`/`tool` are rejected).
+   - Does this affect our default? Should we detect extended thinking?
+   - Workaround: Use `auto` + strong prompting when extended thinking is enabled.
+
+3. **`require_web_search` with providers using different search tool names**:
+   - OpenAI: `web_search`, Anthropic: `web_search`, Google: `google_search`
+   - Need to resolve correct tool name dynamically.
+
+4. **Step limits**: With `require_web_search`, do we burn a step on search?
+   - Current default: 20 steps per turn (plenty of room).
+   - Monitor if complex forms hit the step limit.
+
+5. **Provider-specific `toolChoice: { type: 'tool' }` support**:
+   - xAI can't force server-side tools — does this affect web search?
+   - Need to test forcing specific tool names across all providers.
+
+## Implementation Plan
+
+### Phase 1: Core Policies
+
+**Goal:** Implement `toolPolicy` with `none`, `auto`, `require_tools`, `require_web_search`.
+
+- [ ] Add `ToolPolicy` type to `harnessTypes.ts`
+- [ ] Add `toolPolicy` to `FillOptions` and `LiveAgentConfig`
+- [ ] Add `tool_policy` to `HarnessConfigYaml` and mapping in `settings.ts`
+- [ ] Implement `prepareStep` callback in `liveAgent.ts` for `require_web_search`
+- [ ] Resolve web search tool name dynamically (provider-aware)
+- [ ] Add `--tool-policy` flag to `markform fill` command
+- [ ] Update `fillRecord` to track policy in metadata
+- [ ] Write unit tests for policy → toolChoice translation
+- [ ] Write integration tests with mock agents
+
+### Phase 2: Provider Validation
+
+**Goal:** Validate all policies across providers.
+
+- [ ] Create test matrix: policy × provider
+- [ ] Test DeepSeek specifically: `require_tools` and `require_web_search`
+- [ ] Test Anthropic with extended thinking
+- [ ] Test xAI with forced tool names
+- [ ] Document provider-specific recommendations
+- [ ] Add fallback behavior for unsupported provider/policy combos
+
+### Phase 3: Harness-Level Research (Future)
+
+**Goal:** Enable guaranteed research without relying on model behavior.
+
+- [ ] Design search query generation from field metadata
+- [ ] Implement harness-level web search execution
+- [ ] Inject research results into context prompt
+- [ ] Add `research_mode: auto | manual` config option
+- [ ] Cache research results across turns for efficiency
+- [ ] Write integration tests
+
+## Testing Strategy
+
+### Unit Tests
+
+- Policy → toolChoice translation for each policy type
+- Policy parsing from frontmatter and CLI
+- Web search tool name resolution per provider
+
+### Integration Tests (Mock Agents)
+
+- `none`: Agent receives no tools
+- `auto`: Agent receives `toolChoice: 'auto'`
+- `require_tools`: Agent receives `toolChoice: 'required'`
+- `require_web_search`: Step 0 forced to web_search, step 1+ required
+
+### End-to-End Tests (Real LLM Calls)
+
+- Test each policy with a factual research form
+- Verify web search is actually called (check fill record / wire format)
+- Compare accuracy: auto vs require_tools vs require_web_search
+- Measure latency impact
+
+### Provider Matrix
+
+```
+┌─────────────────────┬────────┬──────────┬─────────┬────────┬─────┐
+│ Policy              │ OpenAI │ Anthropic│ DeepSeek│ Google │ xAI │
+├─────────────────────┼────────┼──────────┼─────────┼────────┼─────┤
+│ none                │   ✓    │    ✓     │    ✓    │   ✓    │  ✓  │
+│ auto                │   ✓    │    ✓     │    ✓    │   ✓    │  ✓  │
+│ require_tools       │   ✓    │    ✓     │    ?    │   ✓    │  ✓  │
+│ require_web_search  │   ✓    │    ?     │    ?    │   ?    │  ?  │
+└─────────────────────┴────────┴──────────┴─────────┴────────┴─────┘
+```
+
+## Rollout Plan
+
+1. **Phase 1**: Ship `none`, `auto`, `require_tools`, `require_web_search`
+   - Default is `require_tools` (already the current behavior)
+   - `require_web_search` documented as beta until provider testing complete
+
+2. **Phase 2**: Validate across providers, promote `require_web_search` to stable
+
+3. **Phase 3**: Harness-level research injection as the robust solution for
+   forms that truly need guaranteed research
+
+## Open Questions
+
+1. **Web search tool naming**: Should `require_web_search` resolve tool names
+   dynamically (e.g., `google_search` for Google provider), or should we normalize
+   all search tools to `web_search`?
+   - Current code: Google uses `google_search`, others use `web_search`
+   - Recommendation: Resolve dynamically using existing `isWebSearchTool()` helper
+
+2. **DeepSeek fallback**: If `require_tools` doesn't work on DeepSeek, should we
+   auto-detect and fall back, or let it fail visibly?
+   - Recommendation: Fail visibly with a warning, let user set `auto` explicitly
+
+3. **Extended thinking**: Should we auto-detect extended thinking on Anthropic and
+   downgrade to `auto`?
+   - Recommendation: Yes, with a logged warning
+
+4. **Harness-level research scope**: When implemented, should it search for all
+   fields or only unfilled fields with `research: required` annotation?
+   - Deferred to Phase 3 design
+
+## References
+
+- [Research: Tool Choice Parameter](../../research/research-2026-02-02-tool-choice-parameter.md)
+- [AI SDK Tool Calling](https://ai-sdk.dev/docs/ai-sdk-core/tools-and-tool-calling)
+- [AI SDK Agents: Loop Control](https://ai-sdk.dev/docs/agents/loop-control)
+- [Parallel Form Filling Spec](plan-2026-01-27-parallel-form-filling.md)
+- [GitHub Issue: Tool Execution Unreliable](https://github.com/vercel/ai/issues/10269)
+- [GitHub Issue: toolChoice Endless Loop](https://github.com/vercel/ai/issues/3944)
+
+### Source Code References
+
+- `liveAgent.ts:91` — current `toolChoice` default (`'required'`)
+- `liveAgent.ts:123-125` — stateless turn documentation
+- `liveAgent.ts:201-209` — `generateText()` invocation with `stepCountIs`
+- `liveAgent.ts:231` — tool call counting (for post-turn validation)
+- `liveAgent.ts:746-748` — `isWebSearchTool()` helper
+- `generate-text.ts:518-546` (AI SDK) — `prepareStep` callback invocation