From a638997155afb8df7da789fa39efc87339d27fa3 Mon Sep 17 00:00:00 2001
From: "Jonathan D. Rhyne" <jonathan@pspdfkit.com>
Date: Sun, 7 Jun 2026 11:27:36 -0700
Subject: [PATCH 01/12] docs(plan): data_extractor + query_extraction workflow
 plan
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

🔮 View transcript: https://nutrient-agentlogs.dev/s/duk4x9tr3rmlnxta7c4bwk6o
---
 ...-feat-dws-data-extraction-workflow-plan.md | 254 ++++++++++++++++++
 1 file changed, 254 insertions(+)
 create mode 100644 docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md

diff --git a/docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md b/docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md
new file mode 100644
index 0000000..0aad9aa
--- /dev/null
+++ b/docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md
@@ -0,0 +1,254 @@
+---
+title: "feat: data_extractor + query_extraction tools and a dynamic-workflow example"
+status: active
+date: 2026-06-07
+type: feat
+target_repo: nutrient-dws-mcp-server
+base_branch: main
+supersedes: 2026-06-07-001-feat-dws-extraction-accessibility-plan.md
+---
+
+# feat: `data_extractor` + `query_extraction` + a dynamic-workflow example
+
+## Summary
+
+Ship the **data-extraction workflow primitive** for the Nutrient DWS MCP server — the single highest-leverage move toward the "AI agents using documents" keystone — plus a runnable example that demonstrates it end to end.
+
+- **`data_extractor`** — typed JSON / Markdown extraction (text, key-value pairs, tables, and structured/positional text with **coordinates + confidence**). Because structured output is large, it is written to a file; the inline response is a *decision-grade summary* (per-page element/table/KVP counts, low-confidence flags, bbox ranges) — never raw document content.
+- **`query_extraction`** — reads that extraction file back and returns **filtered slices inline** (by page, region, or minimum confidence), so the agent can pull *actionable* coordinates into context on demand instead of being handed a file it cannot read. This is what makes the primitive genuinely agent-native rather than a context-problem relocation.
+- **A worked dynamic-workflow example** (`examples/` + README walkthrough): extract → branch on low confidence → act with the existing `ai_redactor` / `document_signer` tools. This is the GTM-legible artifact a keystone needs.
+
+Both tools are thin wrappers over the existing `/build` path, reusing the current auth, sandbox, and response patterns ("copy what exists").
+
+**Deferred to their own PRs:** `accessibility_tagger` (PDF/UA auto-tagging — breadth, a one-shot transform, weak fit for the workflow narrative; ready to lift from this plan's history when wanted), Viewer, and accessibility validation.
+
+> **Scope change from the original plan** (`…-extraction-accessibility-plan.md`, superseded): accessibility was dropped in favor of the `query_extraction` affordance + example, on the product judgment that a demonstrable workflow beats tool-count growth.
+
+---
+
+## Problem Frame
+
+The DWS Processor API already returns *"typed JSON or Markdown … with tables, key-value pairs, coordinates, and confidence scores"* where *"each document element … includes bounding-box coordinates, reading order index, element type, and confidence scores."* Today an agent can only reach this by hand-constructing a full Build `instructions` object with the right `output` block on the generic `document_processor` tool — poor ergonomics for the one operation a dynamic workflow leans on most.
+
+Three in-repo realities shape the design — each surfaced by the plan review and verified against code:
+
+1. **Coordinates overflow context.** `src/schemas.ts` disables `structuredText` with: *"Structure text uses many chars, and often overflows the context length of an LLM. We will not support this for now."* Writing it to a file solves overflow but, on its own, just **moves** the problem — an agent cannot branch on coordinates it cannot read. Hence `query_extraction`: the agent retrieves only the slices it needs.
+2. **`performBuildCall` cannot serve the inline case.** `performBuildCall(instructions, outputFilePath)` requires `outputFilePath` and calls `resolveWriteFilePath` *before* the API call (`src/dws/build.ts:24`). The reusable core is the currently-**private** `processInstructions` + `makeApiBuildCall`. These must be exported, not wrapped.
+3. **One endpoint.** Everything goes through `callNutrientApi('build', …)` → `https://api.nutrient.io/build`. No new endpoint plumbing.
+
+---
+
+## Requirements
+
+- **R1.** `data_extractor` exposes text / key-value-pair / table / structured (coordinate+confidence) extraction, output as JSON or Markdown.
+- **R2.** Structured/positional results are written to `outputPath`; the inline response is a decision-grade summary containing **no extracted document content**. `structuredText: true` requires `outputPath`.
+- **R3.** `query_extraction` reads an extraction JSON file and returns filtered element slices inline, filterable by page, region (bbox), and minimum confidence.
+- **R4.** Both tools reuse existing patterns: `getApiKey()` auth, sandbox path resolution (`resolveReadFilePath`/`resolveWriteFilePath`), the shared build core, and the `createErrorResponse`/`handleFileResponse` helpers.
+- **R5.** Both tools respect the sandbox vs. non-sandbox registration model in `addToolsToServer`.
+- **R6.** `outputPath`/file paths, when supplied, are **always** validated through the sandbox resolver before any routing branch or API call.
+- **R7.** Ship one runnable dynamic-workflow example demonstrating extract → branch → act.
+- **R8.** Update README (Available Tools + Features) and amend `document_processor`'s description to point extraction users at `data_extractor` (remove the duplicate affordance).
+- **R9.** Tests cover instruction construction, inline-vs-file routing, query filtering, and error/PII paths.
+
+---
+
+## Key Technical Decisions
+
+- **KTD1 — Compose the shared build core, don't wrap `performBuildCall`.** Export `processInstructions` and `makeApiBuildCall` (package-internal) from `src/dws/build.ts`; `data_extractor` calls them directly so the inline path needs no `outputPath`. *(Review: Feasibility + Scope-guardian, 0.9 — `performBuildCall` resolves a write path before the call and cannot serve inline extraction.)*
+- **KTD2 — Structured output to file; summary inline; query for slices.** `structuredText` (or an explicit `outputPath`) ⇒ write JSON to the resolved path, return a decision-grade summary. **No size-threshold branch** in v1 — route purely on the boolean. The agent uses `query_extraction` to pull actionable elements back. *(Review: 4 reviewers — the size threshold had no defined value, was unmeasurable pre-stream, and created a "needs a file but has no path" hole.)*
+- **KTD3 — Decision-grade summary, not content.** The inline summary is restricted to: per-page element-type counts (`{tables, keyValuePairs, textBlocks}`), low-confidence element counts/flags, bbox ranges, page count, output path, byte size. It must **never** include extracted text/values (PII boundary). If expected fields aren't derivable from the live response, degrade gracefully to page-count + path + bytes and say so inline. *(Review: Security 0.75 + Feasibility 0.7.)*
+- **KTD4 — Markdown is a plain string, routed separately.** `format: markdown` returns inline text (it's a single small blob, not `{pages}`); the count-summary logic applies only to `json-content`. *(Review: Feasibility 0.85 — existing code routes non-`json-content` to a file; markdown has no `{pages}` to summarize.)*
+- **KTD5 — Verify the API contract in a spike BEFORE committing the schema (U0).** Public docs confirm confidence+coordinates exist but don't pin: (a) whether `structuredText` is a `json-content` sub-option vs. a separate output type/endpoint, (b) the confidence/coordinate field names, (c) whether `plainText`/`keyValuePairs`/`tables`/`structuredText` combine or are mutually exclusive (the existing schema says *"use one at a time"* yet defaults two on). One live `/build` call resolves all three and produces a recorded fixture that `query_extraction` and tests build on. If `structuredText` is *not* a `json-content` sub-option, U1/U3 schema shape changes — which is exactly why this is blocking. *(Review: Adversarial 0.75, Feasibility 0.7.)*
+- **KTD6 — Inline data is transcript-visible; treat as sensitive.** Both the `data_extractor` inline path (when no `outputPath`) and every `query_extraction` result place extracted content into the agent transcript, which the MCP host / LLM provider may log. Tool descriptions must state this plainly; recommend `outputPath` + scoped queries for sensitive documents. *(Review: Security 0.88.)*
+
+---
+
+## High-Level Technical Design
+
+```mermaid
+flowchart TD
+    A[Agent: data_extractor\nfile + toggles + format] --> V[validate outputPath via\nresolveWriteFilePath FIRST]
+    V --> B[build Instructions\noutput=json-content or markdown]
+    B --> C[shared build core:\nprocessInstructions → makeApiBuildCall → /build]
+    C --> D{structuredText set?}
+    D -- no, json --> E[handleJsonContentResponse\n→ JSON inline]
+    D -- no, markdown --> M[markdown string inline]
+    D -- yes --> F[write JSON to outputPath\n→ decision-grade summary inline\n(counts, low-conf flags, bbox ranges; NO content)]
+    F -.-> G[Agent: query_extraction\nfile + page/region/minConfidence]
+    G --> H[resolveReadFilePath → parse → filter\n→ matching elements inline]
+    H -.-> I[Agent branches → ai_redactor / document_signer]
+```
+
+*Directional — routing gates and the extract→query→act loop are the design intent; field-level shapes are settled by the U0 spike and in code.*
+
+---
+
+## Implementation Units
+
+### U0. Spike: verify the `/build` extraction contract + capture a fixture
+
+**Goal:** Resolve the unconfirmed API shape before any schema is committed.
+**Requirements:** KTD5 (enables R1–R3).
+**Dependencies:** none. **Blocking** for U1, U3, U4.
+**Files:** `tests/fixtures/extraction-sample.json` (new, recorded response); a throwaway script (not committed) or a documented `curl`/node snippet.
+**Approach:** With a real `NUTRIENT_DWS_API_KEY`, call `/build` against `tests/assets/example.pdf` requesting `json-content` with `plainText`, `keyValuePairs`, `tables`, and structured/positional text. Record: (a) is `structuredText` a `json-content` sub-option or separate? (b) exact field names for elements, bbox/coordinates, confidence, page, reading order; (c) whether multiple toggles can combine. Save the response as a fixture for U4/U7. Document findings in the U0 commit message / a short note in `docs/`.
+**Test scenarios:** none — investigation. Output is the fixture + recorded findings.
+**Verification:** Fixture exists; the three KTD5 questions are answered in writing. If `structuredText` is not a `json-content` sub-option, update U1/U3 before proceeding.
+
+### U1. Arg schemas for both tools
+
+**Goal:** Define `DataExtractorArgsSchema` and `QueryExtractionArgsSchema`.
+**Requirements:** R1, R2, R3, R6.
+**Dependencies:** U0.
+**Files:** `src/schemas.ts`
+**Approach:** `DataExtractorArgsSchema`: `filePath`, optional `password`/`pages`, extraction toggles, `language` (string|string[]), `format` (`json`|`markdown`, default `json`), optional `outputPath` — required when `structuredText` is true (`.superRefine`, mirroring `AiRedactArgsSchema`'s stage/apply precedent). If U0 finds the toggles are mutually exclusive, model them as an enum + refine and drop the misleading "use one at a time" wording; if combinable, keep booleans and fix the inherited descriptions. `QueryExtractionArgsSchema`: `filePath` (the extraction JSON, sandbox-resolved read), optional `pages`, optional `region` (bbox: x/y/width/height), optional `minConfidence` (0–1), optional `elementTypes` (filter to tables/kv/text). Reuse `PageRangeSchema`.
+**Patterns to follow:** `BuildAPIArgsSchema`, `JSONContentOutputSchema`, `AiRedactArgsSchema`.
+**Test scenarios:**
+- `data_extractor`: valid `plainText`-only parses, `format` defaults `json`.
+- `structuredText: true` without `outputPath` → rejected with a clear message.
+- `language` accepts string and array.
+- (if U0 ⇒ exclusive) two toggles set → rejected.
+- `query_extraction`: `minConfidence` out of 0–1 → rejected; `region` requires all four bbox fields.
+**Verification:** `pnpm pretest` passes; schema unit tests green.
+
+### U2. Export the shared build core
+
+**Goal:** Make the inline extraction path possible without a write path.
+**Requirements:** R4, KTD1.
+**Dependencies:** none (independent enabling refactor).
+**Files:** `src/dws/build.ts`
+**Approach:** Export `processInstructions` and `makeApiBuildCall` as package-internal symbols (no public/tool surface change). Leave `performBuildCall`'s behavior unchanged: it continues to call the same two functions, which are now exported rather than module-private. No signature change to `performBuildCall`.
+**Patterns to follow:** existing `build.ts` structure.
+**Test scenarios:**
+- Existing `tests/build-api-examples.test.ts` still green (regression — the refactor is behavior-preserving).
+- Exported `processInstructions` returns the same `{instructions, fileReferences}` shape for a sample input.
+**Execution note:** Characterization-first — confirm the existing build tests pass both before and after the export refactor, since this touches a shipped path.
+**Verification:** `pnpm test` green; no diff in `document_processor` behavior.
+
+### U3. `data_extractor` handler
+
+**Goal:** Build instructions, call the core, route inline vs. file, summarize safely.
+**Requirements:** R1, R2, R4, R6, KTD2, KTD3, KTD4, KTD6.
+**Dependencies:** U0, U1, U2.
+**Files:** `src/dws/extract.ts` (new; includes a module-private `summarizeExtraction` helper — not exported to `utils.ts`)
+**Approach:** `performExtractCall(args)`. Validate `outputPath` via `resolveWriteFilePath` **first** (R6), before building instructions or calling the API — fail early on sandbox escape regardless of branch. Construct `Instructions` with `output: json-content` (toggles + language) or `markdown`. Call the exported core. Routing: `structuredText` set ⇒ write JSON to resolved path + return `summarizeExtraction` output (KTD3 fields only, no content); `format: markdown` ⇒ inline string; else ⇒ `handleJsonContentResponse` inline. `summarizeExtraction` parses the JSON using field names confirmed in U0; on missing fields, degrade to page-count + path + bytes. Audit the error path: ensure `handleApiError` never serializes the `Authorization` header (strip `e.config` if needed).
+**Patterns to follow:** `src/dws/build.ts`, `handleJsonContentResponse`/`handleFileResponse`/`pipeToBuffer`, `createErrorResponse`.
+**Test scenarios:**
+- `plainText` only → JSON inline (mocked stream).
+- `structuredText: true` + `outputPath` → file written; response is a summary string with counts/flags/path and **no document text** (assert a known PII token from the fixture is absent inline).
+- `format: markdown` → `output.type: 'markdown'`, inline string, no count-summary.
+- `outputPath` outside sandbox → rejected before the API call (assert no network call).
+- API error → `createErrorResponse`; assert the returned text contains no `Bearer`/key.
+**Verification:** `pnpm test` green.
+
+### U4. `query_extraction` handler
+
+**Goal:** Return actionable filtered slices inline from an extraction file.
+**Requirements:** R3, R6, KTD6.
+**Dependencies:** U0 (fixture/field shape), U1.
+**Files:** `src/dws/extract.ts` (or `src/dws/query.ts`)
+**Approach:** `performQueryCall(args)`. Resolve `filePath` via `resolveReadFilePath` (sandbox), read + `JSON.parse`. Filter elements by `pages`, `region` (bbox intersection), `minConfidence`, `elementTypes`. Return matched elements inline (bounded count; if a query still matches a very large set, return the first N + a note to narrow). Field access uses the U0-confirmed names with a defensive fallback. Tool description states results enter the transcript (KTD6).
+**Patterns to follow:** `resolveReadFilePath`, `createSuccessResponse`/`createErrorResponse`.
+**Test scenarios:**
+- `minConfidence: 0.9` → only high-confidence elements returned (against the U0 fixture).
+- `region` bbox → only intersecting elements.
+- `pages: [0]` → only page-0 elements.
+- Missing/malformed file → `createErrorResponse`.
+- Oversized match set → truncated with a "narrow your query" note.
+- File outside sandbox → rejected.
+**Verification:** `pnpm test` green.
+
+### U5. Register both tools
+
+**Goal:** Wire `data_extractor` + `query_extraction` into the server.
+**Requirements:** R1, R3, R5, R8.
+**Dependencies:** U3, U4.
+**Files:** `src/index.ts`
+**Approach:** Two `server.tool(...)` registrations mirroring `document_processor`, with descriptions that (a) note structured output goes to a file and is queried via `query_extraction`, and (b) carry the KTD6 transcript warning. Confirm both work in sandbox and non-sandbox modes. Also amend `document_processor`'s description to drop the standalone "JSON extraction" affordance and point to `data_extractor` (R8).
+**Patterns to follow:** existing `server.tool` blocks.
+**Test scenarios:**
+- Test expectation: none beyond handler tests — registration is wiring; behavior covered by U3/U4.
+**Verification:** `pnpm build`; launch server; both tools register and the `document_processor` description no longer double-advertises extraction.
+
+### U6. Dynamic-workflow example
+
+**Goal:** One runnable artifact demonstrating extract → branch → act.
+**Requirements:** R7.
+**Dependencies:** U5.
+**Files:** `examples/invoice-extraction-workflow/` (a documented script + sample doc reference), `README.md` (a "Dynamic workflows" walkthrough section)
+**Approach:** Show an agent calling `data_extractor` (structured → file) → `query_extraction` with `minConfidence` to find low-confidence fields → branching (e.g., flag for human review) and acting via the existing `ai_redactor` / `document_signer`. Keep it copy-pasteable; reference `tests/assets/example.pdf`.
+**Test scenarios:**
+- Test expectation: none — documentation/example. If a smoke script is included, gate it behind a real key and exclude from `pnpm test`.
+**Verification:** Walkthrough steps run end-to-end manually against a live key once.
+
+### U7. Tests
+
+**Goal:** Cover both handlers per the repo convention.
+**Requirements:** R9.
+**Dependencies:** U3, U4.
+**Files:** `tests/extract.test.ts` (new), `tests/query.test.ts` (new), reuse `tests/fixtures/extraction-sample.json` (from U0) and `tests/assets/example.pdf`. Inline example objects (the feature has few cases — no separate `*-api-examples.ts` data file).
+**Approach:** Follow `tests/unit.test.ts`/`tests/build-api-examples.test.ts` conventions; mock API streams; assert routing, summary-without-content, query filtering, sandbox rejection, and key-redaction in errors.
+**Execution note:** Start from a failing test asserting the structured→file summary contains no document content (the riskiest + security-critical path).
+**Test scenarios:** the scenarios enumerated in U1/U3/U4 live here.
+**Verification:** `pnpm test`, `pnpm lint`, `pnpm format` clean.
+
+### U8. Docs
+
+**Goal:** Document the new tools and workflow.
+**Requirements:** R8.
+**Dependencies:** U5, U6.
+**Files:** `README.md`
+**Approach:** Add `data_extractor` + `query_extraction` rows to "Available Tools"; update the "Data Extraction" feature row (coordinates/confidence, file output + query). Ensure the `document_processor` row no longer implies it's the extraction path. Note the transcript-visibility caveat for extracted content.
+**Test scenarios:** Test expectation: none — documentation only.
+**Verification:** Tool names/descriptions match the registered tools exactly (grep parity).
+
+---
+
+## Scope Boundaries
+
+**In scope:** `data_extractor`, `query_extraction`, one dynamic-workflow example, tests, README + `document_processor` description fix.
+
+### Deferred to Follow-Up Work
+- **`accessibility_tagger` (PDF/UA auto-tagging)** — own PR; design is ready in this plan's git history (maps to `output.type: 'pdfua'` + `metadata`). Dropped here to keep the workflow narrative sharp.
+- **Accessibility validation / compliance reporting** — not a confirmed DWS capability.
+- **Viewer tool** — low value for headless workflows.
+- **Re-enabling `structuredText` on `document_processor`** — kept off there; only `data_extractor` exposes it (behind file output).
+- **Extension-allowlist / hardened non-sandbox output paths** — see System-Wide Impact; revisit if needed.
+- **npm publish / version bump** — separate release step.
+
+---
+
+## System-Wide Impact
+
+- **No auth/transport change** — reuses `NUTRIENT_DWS_API_KEY` and stdio. No new env vars.
+- **Additive** — no breaking change to existing tools; `document_processor` keeps full capability (only its description changes to reduce extraction overlap).
+- **Sandbox** covers all new reads/writes via `resolveReadFilePath`/`resolveWriteFilePath`. **Known limitation (pre-existing):** in non-sandbox mode, any absolute `outputPath` is writable — call this out in the tool descriptions; an extension allowlist is deferred.
+- **Transcript exposure:** `data_extractor` inline results and all `query_extraction` results place extracted content in the agent transcript (KTD6) — documented, not silently introduced.
+- **Credits:** extraction is a billable Build op; existing `check_credits` applies.
+
+---
+
+## Risks & Dependencies
+
+- **R-A (high → mitigated): API contract unconfirmed.** *Mitigation:* U0 spike is blocking and produces a fixture before schemas are committed (KTD5).
+- **R-B (medium): structured field names drive both the summary and the query.** If U0 reveals an unexpected shape, `summarizeExtraction` + `query_extraction` field access change. *Mitigation:* single source of truth = U0 fixture; defensive fallbacks; both consume the same confirmed names.
+- **R-C (medium): PII in transcript.** *Mitigation:* KTD3 (no content in summaries) + KTD6 (documented warning) + a test asserting no document content leaks inline on the structured path.
+- **R-D (low): shared-core refactor regresses `document_processor`.** *Mitigation:* U2 is characterization-first; existing build tests gate it.
+- **R-E (low): query returns too much.** *Mitigation:* bounded result count + "narrow your query" guidance.
+
+---
+
+## Verification Strategy
+
+No GitHub Actions in this repo — verification is local:
+- `pnpm pretest` (tsc), `pnpm test` (vitest), `pnpm lint`, `pnpm format`.
+- `pnpm build`, launch the server, run the U6 walkthrough once against a live key (extract → query → act).
+- Per project AGENTS rules: branch off `main` → Conventional Commits → PR into `main`; never push to `main`; report the exact command + exit 0 before claiming done.
+
+---
+
+## Sources & Research
+
+- Existing code (authoritative): `src/index.ts`, `src/schemas.ts` (`JSONContentOutputSchema`, disabled `structuredText` at the bottom of that schema, `PDFUAOutputSchema`), `src/dws/build.ts` (`performBuildCall` write-path-before-call; private `processInstructions`/`makeApiBuildCall`), `src/dws/utils.ts` (`handleJsonContentResponse`/`handleFileResponse`/`handleApiError`), `src/dws/api.ts`, `src/fs/sandbox.ts`.
+- DWS Processor API — *"typed JSON or Markdown … tables, key-value pairs, coordinates, and confidence scores"*; *"each document element … bounding-box coordinates, reading order index, element type, and confidence scores."* ([nutrient.io/api](https://www.nutrient.io/api/), [processor-api](https://www.nutrient.io/api/processor-api/))
+- Plan review (2026-06-07): 6 personas; this revision applies the high-confidence Feasibility/Scope/Security/Adversarial findings (shared-core composition, drop size-threshold, markdown routing, U0 spike, PII-safe summaries, sandbox-validate-first, de-advertise overlap) and the product decision (re-focus on extraction + add the query affordance + example).

From 7348e6b1c53bbf4cb6272ef36f69b91c13b582eb Mon Sep 17 00:00:00 2001
From: "Jonathan D. Rhyne" <jonathan@pspdfkit.com>
Date: Sun, 7 Jun 2026 11:28:22 -0700
Subject: [PATCH 02/12] refactor(build): export processInstructions and
 makeApiBuildCall
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Expose the build core (package-internal) so focused tools like
data_extractor can compose instructions -> API call -> response routing
without performBuildCall's required outputFilePath, enabling inline
responses. Behavior-preserving; performBuildCall still consumes both.

🔮 View transcript: https://nutrient-agentlogs.dev/s/duk4x9tr3rmlnxta7c4bwk6o
---
 src/dws/build.ts | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/dws/build.ts b/src/dws/build.ts
index 636aac5..6dbb347 100644
--- a/src/dws/build.ts
+++ b/src/dws/build.ts
@@ -39,9 +39,13 @@ export async function performBuildCall(
 }
 
 /**
- * Process file references in instructions
+ * Process file references in instructions.
+ *
+ * Exported (package-internal) so focused tools such as `data_extractor` can
+ * compose the build core directly without going through `performBuildCall`
+ * (which requires an output file path and so cannot serve inline responses).
  */
-async function processInstructions(instructions: Instructions): Promise<{
+export async function processInstructions(instructions: Instructions): Promise<{
   instructions: Instructions
   fileReferences: Map<string, FileReference>
 }> {
@@ -133,9 +137,12 @@ async function processFileReference(reference: string): Promise<FileReference> {
 }
 
 /**
- * Make the API call to the build endpoint
+ * Make the API call to the build endpoint.
+ *
+ * Exported (package-internal) so focused tools can reuse the multipart/URL
+ * negotiation and streaming-response behavior of the Processor `build` endpoint.
  */
-async function makeApiBuildCall(
+export async function makeApiBuildCall(
   instructions: Instructions,
   fileReferences: Map<string, FileReference>,
   apiClient: DwsApiClient,

From 8581e16aef52793da6ccba921e92f87c64a2345f Mon Sep 17 00:00:00 2001
From: "Jonathan D. Rhyne" <jonathan@pspdfkit.com>
Date: Sun, 7 Jun 2026 12:48:56 -0700
Subject: [PATCH 03/12] revert: drop shared build-core export (U2)

Data Extraction is a separate DWS API (POST /extraction/parse with its own
pdf_live_ key), not a json-content output of the Processor /build endpoint.
data_extractor will not reuse the Build instruction machinery, so the
processInstructions/makeApiBuildCall exports are unnecessary.
---
 src/dws/build.ts | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/dws/build.ts b/src/dws/build.ts
index 6dbb347..636aac5 100644
--- a/src/dws/build.ts
+++ b/src/dws/build.ts
@@ -39,13 +39,9 @@ export async function performBuildCall(
 }
 
 /**
- * Process file references in instructions.
- *
- * Exported (package-internal) so focused tools such as `data_extractor` can
- * compose the build core directly without going through `performBuildCall`
- * (which requires an output file path and so cannot serve inline responses).
+ * Process file references in instructions
  */
-export async function processInstructions(instructions: Instructions): Promise<{
+async function processInstructions(instructions: Instructions): Promise<{
   instructions: Instructions
   fileReferences: Map<string, FileReference>
 }> {
@@ -137,12 +133,9 @@ async function processFileReference(reference: string): Promise<FileReference> {
 }
 
 /**
- * Make the API call to the build endpoint.
- *
- * Exported (package-internal) so focused tools can reuse the multipart/URL
- * negotiation and streaming-response behavior of the Processor `build` endpoint.
+ * Make the API call to the build endpoint
  */
-export async function makeApiBuildCall(
+async function makeApiBuildCall(
   instructions: Instructions,
   fileReferences: Map<string, FileReference>,
   apiClient: DwsApiClient,

From f3bfb6529177462f5513f2bbefb36c25b1c5ac22 Mon Sep 17 00:00:00 2001
From: "Jonathan D. Rhyne" <jonathan@pspdfkit.com>
Date: Sun, 7 Jun 2026 12:52:08 -0700
Subject: [PATCH 04/12] docs(plan): revise for standalone Data Extraction API
 (/extraction/parse)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

DWS Data Extraction is a separate API with its own pdf_live_ key, not a
json-content output of Processor /build. Rework KTD1 (second DwsApiClient),
add modes/formats + cost transparency, separate NUTRIENT_EXTRACTION_API_KEY,
and ground all wiring in main's client.ts/index.ts.

🔮 View transcript: https://nutrient-agentlogs.dev/s/duk4x9tr3rmlnxta7c4bwk6o
---
 ...-feat-dws-data-extraction-workflow-plan.md | 285 +++++++++---------
 1 file changed, 144 insertions(+), 141 deletions(-)

diff --git a/docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md b/docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md
index 0aad9aa..85f3cf8 100644
--- a/docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md
+++ b/docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md
@@ -1,65 +1,70 @@
 ---
-title: "feat: data_extractor + query_extraction tools and a dynamic-workflow example"
+title: "feat: data_extractor + query_extraction tools (DWS Data Extraction API) and a workflow example"
 status: active
 date: 2026-06-07
 type: feat
 target_repo: nutrient-dws-mcp-server
 base_branch: main
-supersedes: 2026-06-07-001-feat-dws-extraction-accessibility-plan.md
 ---
 
 # feat: `data_extractor` + `query_extraction` + a dynamic-workflow example
 
 ## Summary
 
-Ship the **data-extraction workflow primitive** for the Nutrient DWS MCP server — the single highest-leverage move toward the "AI agents using documents" keystone — plus a runnable example that demonstrates it end to end.
+Add the **Data Extraction workflow primitive** to the Nutrient DWS MCP server, targeting the **new standalone DWS Data Extraction API** (`POST https://api.nutrient.io/extraction/parse`) — a separate product with its own key, **not** a `json-content` output of the Processor `/build` endpoint.
 
-- **`data_extractor`** — typed JSON / Markdown extraction (text, key-value pairs, tables, and structured/positional text with **coordinates + confidence**). Because structured output is large, it is written to a file; the inline response is a *decision-grade summary* (per-page element/table/KVP counts, low-confidence flags, bbox ranges) — never raw document content.
-- **`query_extraction`** — reads that extraction file back and returns **filtered slices inline** (by page, region, or minimum confidence), so the agent can pull *actionable* coordinates into context on demand instead of being handed a file it cannot read. This is what makes the primitive genuinely agent-native rather than a context-problem relocation.
-- **A worked dynamic-workflow example** (`examples/` + README walkthrough): extract → branch on low confidence → act with the existing `ai_redactor` / `document_signer` tools. This is the GTM-legible artifact a keystone needs.
+- **`data_extractor`** — calls `/extraction/parse` with a `mode` (`text`/`structure`/`understand`/`agentic`) and output `format` (`spatial` elements or `markdown`). Spatial output (typed elements with `bounds`, `confidence`, `readingOrder`, `page`) can be large, so it is written to a file with a decision-grade summary returned inline; markdown is returned inline.
+- **`query_extraction`** — reads a saved spatial-extraction file and returns **filtered element slices inline** (by page, region/bbox, minimum confidence, element type), so an agent can pull actionable coordinates into context on demand.
+- **A dynamic-workflow example** — extract → query low-confidence elements → act with the existing `ai_redactor` / `document_signer`.
 
-Both tools are thin wrappers over the existing `/build` path, reusing the current auth, sandbox, and response patterns ("copy what exists").
+Architecture fit: main already has a `DwsApiClient` abstraction (`baseUrl` + `tokenResolver`, `.post(endpoint, data)`). `data_extractor` uses a **second client instance** authenticated with the Data Extraction key (`pdf_live_…`) — no new HTTP plumbing.
 
-**Deferred to their own PRs:** `accessibility_tagger` (PDF/UA auto-tagging — breadth, a one-shot transform, weak fit for the workflow narrative; ready to lift from this plan's history when wanted), Viewer, and accessibility validation.
-
-> **Scope change from the original plan** (`…-extraction-accessibility-plan.md`, superseded): accessibility was dropped in favor of the `query_extraction` affordance + example, on the product judgment that a demonstrable workflow beats tool-count growth.
+**Deferred to their own PRs:** `accessibility_tagger` (the DWS **Accessibility API** is also now standalone and includes auto-tag *and* validation), Viewer.
 
 ---
 
 ## Problem Frame
 
-The DWS Processor API already returns *"typed JSON or Markdown … with tables, key-value pairs, coordinates, and confidence scores"* where *"each document element … includes bounding-box coordinates, reading order index, element type, and confidence scores."* Today an agent can only reach this by hand-constructing a full Build `instructions` object with the right `output` block on the generic `document_processor` tool — poor ergonomics for the one operation a dynamic workflow leans on most.
+DWS is now four separate APIs, each with its own key: **Processor** (`/build`, `NUTRIENT_DWS_API_KEY`), **Data Extraction** (`/extraction/parse`, `pdf_live_…`), **Accessibility**, **Viewer**. The MCP server today only speaks Processor `/build`. Extraction was *previously* reachable as a `json-content` Build output; the dedicated Data Extraction API supersedes that with richer typed elements, confidence, coordinates, and four cost/quality modes.
+
+Authoritative spec (verified on disk at `~/projects/nutrient-website/src/content/guides/dws-data-extraction/`):
 
-Three in-repo realities shape the design — each surfaced by the plan review and verified against code:
+- **Endpoint:** `POST https://api.nutrient.io/extraction/parse`. Auth: `Authorization: Bearer pdf_live_…` (separate dashboard key; `pdf_test_…` for testing).
+- **Request:** multipart `file` + `instructions={"mode":…,"output":{"format":…,"includeWords":…}}` (also supports JSON-body-with-URL and raw-binary).
+- **Modes:** `text` (1 cr/pg, markdown only, no OCR), `structure` (1.5 cr/pg, OCR spatial), `understand` (default, 9 cr/pg, AI-augmented), `agentic` (18 cr/pg, VLM).
+- **Output:** `spatial` → `output.elements[]`; `markdown` → `output.markdown`. `text` mode defaults to markdown; others default to spatial.
+- **Spatial element:** `{id, type, role, text, confidence, readingOrder, bounds:{x,y,width,height}, page:{pageIndex,pageNumber,width,height}}`. Types: `paragraph`, `table` (rows/cols/cells w/ per-cell bounds), `formula` (LaTeX), `picture` (alt text), `keyValueRegion`, `handwriting`. Optional `includeWords` adds word-level bounds.
+- **Coordinates:** top-left origin, render-space pixels, `0 ≤ x+width ≤ page.width`.
+- **Response envelope:** `{status, requestId, output:{elements|markdown}, metrics:{processingTimeMs,pagesProcessed}, configuration:{mode,outputFormat}}` — returned as JSON (the client streams it; the handler parses).
 
-1. **Coordinates overflow context.** `src/schemas.ts` disables `structuredText` with: *"Structure text uses many chars, and often overflows the context length of an LLM. We will not support this for now."* Writing it to a file solves overflow but, on its own, just **moves** the problem — an agent cannot branch on coordinates it cannot read. Hence `query_extraction`: the agent retrieves only the slices it needs.
-2. **`performBuildCall` cannot serve the inline case.** `performBuildCall(instructions, outputFilePath)` requires `outputFilePath` and calls `resolveWriteFilePath` *before* the API call (`src/dws/build.ts:24`). The reusable core is the currently-**private** `processInstructions` + `makeApiBuildCall`. These must be exported, not wrapped.
-3. **One endpoint.** Everything goes through `callNutrientApi('build', …)` → `https://api.nutrient.io/build`. No new endpoint plumbing.
+Because the schema is fully documented, the build proceeds against it directly; one live call (U0) is **confirmation**, not discovery.
 
 ---
 
 ## Requirements
 
-- **R1.** `data_extractor` exposes text / key-value-pair / table / structured (coordinate+confidence) extraction, output as JSON or Markdown.
-- **R2.** Structured/positional results are written to `outputPath`; the inline response is a decision-grade summary containing **no extracted document content**. `structuredText: true` requires `outputPath`.
-- **R3.** `query_extraction` reads an extraction JSON file and returns filtered element slices inline, filterable by page, region (bbox), and minimum confidence.
-- **R4.** Both tools reuse existing patterns: `getApiKey()` auth, sandbox path resolution (`resolveReadFilePath`/`resolveWriteFilePath`), the shared build core, and the `createErrorResponse`/`handleFileResponse` helpers.
-- **R5.** Both tools respect the sandbox vs. non-sandbox registration model in `addToolsToServer`.
-- **R6.** `outputPath`/file paths, when supplied, are **always** validated through the sandbox resolver before any routing branch or API call.
-- **R7.** Ship one runnable dynamic-workflow example demonstrating extract → branch → act.
-- **R8.** Update README (Available Tools + Features) and amend `document_processor`'s description to point extraction users at `data_extractor` (remove the duplicate affordance).
-- **R9.** Tests cover instruction construction, inline-vs-file routing, query filtering, and error/PII paths.
+- **R1.** `data_extractor` calls `/extraction/parse` exposing `mode`, `output.format`, `includeWords`, `language`, and page selection.
+- **R2.** Spatial results are written to `outputPath`; the inline response is a decision-grade summary with **no extracted document content**. Markdown results return inline. `format: spatial` requires `outputPath`.
+- **R3.** `query_extraction` reads a saved spatial-extraction file and returns filtered element slices inline (page, region/bbox, minConfidence, elementTypes).
+- **R4.** Reuse main's `DwsApiClient`; add a Data Extraction client authenticated by `NUTRIENT_EXTRACTION_API_KEY`. Reuse sandbox path resolution and response/error helpers.
+- **R5.** Respect the sandbox vs. non-sandbox registration model in `addToolsToServer`.
+- **R6.** Any `outputPath`/`filePath` is validated through the sandbox resolver before the API call or file read.
+- **R7.** Surface per-mode **credit cost** in the `data_extractor` description (`understand` = 9 cr/pg) so agents/users don't run up cost unknowingly.
+- **R8.** Ship one runnable dynamic-workflow example (extract → query → act).
+- **R9.** Update README (Available Tools + Features + the new env var) and amend `document_processor`'s description so it no longer advertises standalone extraction.
+- **R10.** Tests cover request construction, spatial→file vs markdown→inline routing, query filtering, sandbox rejection, and key/PII safety — mocked against the documented response shape.
 
 ---
 
 ## Key Technical Decisions
 
-- **KTD1 — Compose the shared build core, don't wrap `performBuildCall`.** Export `processInstructions` and `makeApiBuildCall` (package-internal) from `src/dws/build.ts`; `data_extractor` calls them directly so the inline path needs no `outputPath`. *(Review: Feasibility + Scope-guardian, 0.9 — `performBuildCall` resolves a write path before the call and cannot serve inline extraction.)*
-- **KTD2 — Structured output to file; summary inline; query for slices.** `structuredText` (or an explicit `outputPath`) ⇒ write JSON to the resolved path, return a decision-grade summary. **No size-threshold branch** in v1 — route purely on the boolean. The agent uses `query_extraction` to pull actionable elements back. *(Review: 4 reviewers — the size threshold had no defined value, was unmeasurable pre-stream, and created a "needs a file but has no path" hole.)*
-- **KTD3 — Decision-grade summary, not content.** The inline summary is restricted to: per-page element-type counts (`{tables, keyValuePairs, textBlocks}`), low-confidence element counts/flags, bbox ranges, page count, output path, byte size. It must **never** include extracted text/values (PII boundary). If expected fields aren't derivable from the live response, degrade gracefully to page-count + path + bytes and say so inline. *(Review: Security 0.75 + Feasibility 0.7.)*
-- **KTD4 — Markdown is a plain string, routed separately.** `format: markdown` returns inline text (it's a single small blob, not `{pages}`); the count-summary logic applies only to `json-content`. *(Review: Feasibility 0.85 — existing code routes non-`json-content` to a file; markdown has no `{pages}` to summarize.)*
-- **KTD5 — Verify the API contract in a spike BEFORE committing the schema (U0).** Public docs confirm confidence+coordinates exist but don't pin: (a) whether `structuredText` is a `json-content` sub-option vs. a separate output type/endpoint, (b) the confidence/coordinate field names, (c) whether `plainText`/`keyValuePairs`/`tables`/`structuredText` combine or are mutually exclusive (the existing schema says *"use one at a time"* yet defaults two on). One live `/build` call resolves all three and produces a recorded fixture that `query_extraction` and tests build on. If `structuredText` is *not* a `json-content` sub-option, U1/U3 schema shape changes — which is exactly why this is blocking. *(Review: Adversarial 0.75, Feasibility 0.7.)*
-- **KTD6 — Inline data is transcript-visible; treat as sensitive.** Both the `data_extractor` inline path (when no `outputPath`) and every `query_extraction` result place extracted content into the agent transcript, which the MCP host / LLM provider may log. Tool descriptions must state this plainly; recommend `outputPath` + scoped queries for sensitive documents. *(Review: Security 0.88.)*
+- **KTD1 — Target the Data Extraction API via a second `DwsApiClient`.** `data_extractor` builds a multipart form (`file` + `instructions`) and calls `extractionClient.post('extraction/parse', form)`, where `extractionClient = createApiClientFromApiKey(getExtractionApiKey())`. Same `baseUrl` (`https://api.nutrient.io`), different token. *(Supersedes the original "wrap /build" decision — Data Extraction is a separate API.)*
+- **KTD2 — Spatial → file + summary; markdown → inline; query for slices.** Spatial `elements[]` can be large; write the parsed JSON to `outputPath`, return a decision-grade summary, and let the agent retrieve slices via `query_extraction`. Markdown is a single blob → inline.
+- **KTD3 — Decision-grade summary, never content.** Inline summary = per-page counts by element `type`/`role`, low-confidence element count (e.g. `confidence < 0.6`), bbox coverage, page count, output path, byte size. No `text` values. (PII boundary; the field names are now known, so counts are reliable.)
+- **KTD4 — Mode + format surface with cost transparency.** Expose all four modes and both formats; default `mode: understand`, `format: spatial`. Validate `text`-mode ⇒ markdown-only. Put credit costs in the tool description (R7).
+- **KTD5 — Separate key + env var.** `getExtractionApiKey()` reads `NUTRIENT_EXTRACTION_API_KEY` (distinct from Processor `NUTRIENT_DWS_API_KEY`); fail with a clear message if unset. Document both keys.
+- **KTD6 — Inline data is transcript-visible.** `data_extractor` markdown/inline output and all `query_extraction` results enter the agent transcript (host/provider may log). Tool descriptions say so; recommend `outputPath` + scoped queries for sensitive docs.
+- **KTD7 — Response is streamed then parsed.** `DwsApiClient.post` uses `responseType: 'stream'`; the extraction handler pipes to a string (`pipeToString`) and `JSON.parse`s, since extraction returns JSON (unlike Build's file streams).
 
 ---
 
@@ -67,188 +72,186 @@ Three in-repo realities shape the design — each surfaced by the plan review an
 
 ```mermaid
 flowchart TD
-    A[Agent: data_extractor\nfile + toggles + format] --> V[validate outputPath via\nresolveWriteFilePath FIRST]
-    V --> B[build Instructions\noutput=json-content or markdown]
-    B --> C[shared build core:\nprocessInstructions → makeApiBuildCall → /build]
-    C --> D{structuredText set?}
-    D -- no, json --> E[handleJsonContentResponse\n→ JSON inline]
-    D -- no, markdown --> M[markdown string inline]
-    D -- yes --> F[write JSON to outputPath\n→ decision-grade summary inline\n(counts, low-conf flags, bbox ranges; NO content)]
-    F -.-> G[Agent: query_extraction\nfile + page/region/minConfidence]
-    G --> H[resolveReadFilePath → parse → filter\n→ matching elements inline]
-    H -.-> I[Agent branches → ai_redactor / document_signer]
+    A[Agent: data_extractor\nfile + mode + format] --> V[validate outputPath via\nresolveWriteFilePath FIRST]
+    V --> B[multipart form: file + instructions\n mode/output.format/includeWords]
+    B --> C[extractionClient.post\n'extraction/parse']
+    C --> P[pipe stream -> string -> JSON.parse]
+    P --> D{format}
+    D -- markdown --> E[output.markdown inline]
+    D -- spatial --> F["write output.elements to outputPath\n-> decision-grade summary inline\n(per-page type counts, low-conf, page dims; NO text)"]
+    F -.-> G[Agent: query_extraction\nfile + page/region/minConfidence/type]
+    G --> H[resolveReadFilePath -> parse -> filter\n-> matching elements inline]
+    H -.-> I[Agent branches -> ai_redactor / document_signer]
 ```
 
-*Directional — routing gates and the extract→query→act loop are the design intent; field-level shapes are settled by the U0 spike and in code.*
+*Directional — routing gates and the extract→query→act loop are the design intent; field shapes follow the documented schema and are confirmed in U0.*
 
 ---
 
 ## Implementation Units
 
-### U0. Spike: verify the `/build` extraction contract + capture a fixture
+### U0. Verify `/extraction/parse` against the documented schema + capture fixture
 
-**Goal:** Resolve the unconfirmed API shape before any schema is committed.
-**Requirements:** KTD5 (enables R1–R3).
-**Dependencies:** none. **Blocking** for U1, U3, U4.
-**Files:** `tests/fixtures/extraction-sample.json` (new, recorded response); a throwaway script (not committed) or a documented `curl`/node snippet.
-**Approach:** With a real `NUTRIENT_DWS_API_KEY`, call `/build` against `tests/assets/example.pdf` requesting `json-content` with `plainText`, `keyValuePairs`, `tables`, and structured/positional text. Record: (a) is `structuredText` a `json-content` sub-option or separate? (b) exact field names for elements, bbox/coordinates, confidence, page, reading order; (c) whether multiple toggles can combine. Save the response as a fixture for U4/U7. Document findings in the U0 commit message / a short note in `docs/`.
-**Test scenarios:** none — investigation. Output is the fixture + recorded findings.
-**Verification:** Fixture exists; the three KTD5 questions are answered in writing. If `structuredText` is not a `json-content` sub-option, update U1/U3 before proceeding.
+**Goal:** Confirm the documented response shape with one live call and record a fixture for tests.
+**Requirements:** KTD7 (de-risks U3/U4).
+**Dependencies:** none for building; the live call needs `NUTRIENT_EXTRACTION_API_KEY`. **Deferred until the user confirms a key** — building proceeds against the documented schema meanwhile.
+**Files:** `tests/fixtures/extraction-spatial-sample.json`, `tests/fixtures/extraction-markdown-sample.json`
+**Approach:** Call `/extraction/parse` against `tests/assets/example.pdf` twice: once in `text` mode (1 cr/pg) for the markdown fixture and once in `structure` mode (1.5 cr/pg) for a small spatial fixture. Save both responses verbatim. Confirm field names match `bounds/confidence/page/readingOrder/type/role`.
+**Test scenarios:** none — produces fixtures.
+**Verification:** Fixtures saved; field names match the docs (if any drift, adjust U1/U3/U4).
 
-### U1. Arg schemas for both tools
+### U1. Arg schemas
 
-**Goal:** Define `DataExtractorArgsSchema` and `QueryExtractionArgsSchema`.
+**Goal:** `DataExtractorArgsSchema` + `QueryExtractionArgsSchema`.
 **Requirements:** R1, R2, R3, R6.
-**Dependencies:** U0.
+**Dependencies:** none (documented schema).
 **Files:** `src/schemas.ts`
-**Approach:** `DataExtractorArgsSchema`: `filePath`, optional `password`/`pages`, extraction toggles, `language` (string|string[]), `format` (`json`|`markdown`, default `json`), optional `outputPath` — required when `structuredText` is true (`.superRefine`, mirroring `AiRedactArgsSchema`'s stage/apply precedent). If U0 finds the toggles are mutually exclusive, model them as an enum + refine and drop the misleading "use one at a time" wording; if combinable, keep booleans and fix the inherited descriptions. `QueryExtractionArgsSchema`: `filePath` (the extraction JSON, sandbox-resolved read), optional `pages`, optional `region` (bbox: x/y/width/height), optional `minConfidence` (0–1), optional `elementTypes` (filter to tables/kv/text). Reuse `PageRangeSchema`.
-**Patterns to follow:** `BuildAPIArgsSchema`, `JSONContentOutputSchema`, `AiRedactArgsSchema`.
+**Approach:** `DataExtractorArgsSchema`: `filePath` (sandbox read), `mode` enum (default `understand`), `format` enum `spatial|markdown` (default by mode), `includeWords` bool, `language` (string|string[]), `pages` (`PageRangeSchema`), `outputPath` — required when `format: spatial` (`.superRefine`); also refine `text` mode ⇒ `format` must be `markdown`. `QueryExtractionArgsSchema`: `filePath` (the saved spatial JSON), optional `pages`, `region` (`{x,y,width,height}` all required together), `minConfidence` (0–1), `elementTypes` (enum array), `limit` (default cap).
+**Patterns to follow:** `BuildAPIArgsSchema`, `AiRedactArgsSchema` (`.superRefine`), `PageRangeSchema`.
 **Test scenarios:**
-- `data_extractor`: valid `plainText`-only parses, `format` defaults `json`.
-- `structuredText: true` without `outputPath` → rejected with a clear message.
+- spatial without `outputPath` → rejected.
+- `text` mode with `format: spatial` → rejected.
 - `language` accepts string and array.
-- (if U0 ⇒ exclusive) two toggles set → rejected.
-- `query_extraction`: `minConfidence` out of 0–1 → rejected; `region` requires all four bbox fields.
-**Verification:** `pnpm pretest` passes; schema unit tests green.
-
-### U2. Export the shared build core
-
-**Goal:** Make the inline extraction path possible without a write path.
-**Requirements:** R4, KTD1.
-**Dependencies:** none (independent enabling refactor).
-**Files:** `src/dws/build.ts`
-**Approach:** Export `processInstructions` and `makeApiBuildCall` as package-internal symbols (no public/tool surface change). Leave `performBuildCall` intact and refactor it to consume the now-exported core so behavior is unchanged. No signature change to `performBuildCall`.
-**Patterns to follow:** existing `build.ts` structure.
+- query: `minConfidence` outside 0–1 → rejected; partial `region` → rejected.
+**Verification:** `pnpm pretest`; schema unit tests green.
+
+### U2. Data Extraction API client wiring
+
+**Goal:** Provide a `DwsApiClient` authenticated with the Data Extraction key.
+**Requirements:** R4, KTD1, KTD5.
+**Dependencies:** none.
+**Files:** `src/dws/utils.ts` or `src/utils/environment.ts` (add `getExtractionApiKey()`), `src/index.ts` (build the extraction client and thread it into `addToolsToServer` options alongside `apiClient`)
+**Approach:** `getExtractionApiKey()` reads `NUTRIENT_EXTRACTION_API_KEY` and throws a clear error if unset. Because it throws, the server bootstrap must not call it unconditionally: build `extractionApiClient = createApiClientFromApiKey(getExtractionApiKey())` only when the key is present (or lazily on first use), so the Processor-only path still boots. Extend the `addToolsToServer`/`createMcpServer` options type with an optional `extractionApiClient?: DwsApiClient`; when it is absent, the extraction tools still register and return a clear "set NUTRIENT_EXTRACTION_API_KEY" error when called.
+**Patterns to follow:** `createStdioApiClient`, `createApiClientFromApiKey`, the existing `apiClient` threading in `src/index.ts`.
 **Test scenarios:**
-- Existing `tests/build-api-examples.test.ts` still green (regression — the refactor is behavior-preserving).
-- Exported `processInstructions` returns the same `{instructions, fileReferences}` shape for a sample input.
-**Execution note:** Characterization-first — confirm the existing build tests pass before and after the extract, since this refactors a shipped path.
-**Verification:** `pnpm test` green; no diff in `document_processor` behavior.
+- `getExtractionApiKey()` throws when env unset; returns the key when set.
+**Verification:** `pnpm pretest`; server boots with and without the extraction key (tools register; calling without key errors clearly).
 
 ### U3. `data_extractor` handler
 
-**Goal:** Build instructions, call the core, route inline vs. file, summarize safely.
-**Requirements:** R1, R2, R4, R6, KTD2, KTD3, KTD4, KTD6.
-**Dependencies:** U0, U1, U2.
-**Files:** `src/dws/extract.ts` (new; includes a module-private `summarizeExtraction` helper — not exported to `utils.ts`)
-**Approach:** `performExtractCall(args)`. Validate `outputPath` via `resolveWriteFilePath` **first** (R6), before building instructions or calling the API — fail early on sandbox escape regardless of branch. Construct `Instructions` with `output: json-content` (toggles + language) or `markdown`. Call the exported core. Routing: `structuredText` set ⇒ write JSON to resolved path + return `summarizeExtraction` output (KTD3 fields only, no content); `format: markdown` ⇒ inline string; else ⇒ `handleJsonContentResponse` inline. `summarizeExtraction` parses the JSON using field names confirmed in U0; on missing fields, degrade to page-count + path + bytes. Audit the error path: ensure `handleApiError` never serializes the `Authorization` header (strip `e.config` if needed).
-**Patterns to follow:** `src/dws/build.ts`, `handleJsonContentResponse`/`handleFileResponse`/`pipeToBuffer`, `createErrorResponse`.
+**Goal:** Call `/extraction/parse`, route spatial→file / markdown→inline, summarize safely.
+**Requirements:** R1, R2, R6, R7, KTD2, KTD3, KTD7.
+**Dependencies:** U1, U2 (and U0 fixture for tests).
+**Files:** `src/dws/extract.ts` (new; module-private `summarizeSpatial` helper), reuse `pipeToString` from `src/dws/utils.ts`
+**Approach:** `performExtractCall(args, extractionApiClient)`. If `format: spatial`, validate `outputPath` via `resolveWriteFilePath` **first**. Resolve `filePath` via `resolveReadFilePath`, read buffer, build `FormData` (`file` + `instructions` JSON). `await extractionApiClient.post('extraction/parse', form)`; `pipeToString` → `JSON.parse`. Markdown → return `output.markdown` inline. Spatial → write the parsed JSON to the resolved path in a shape that keeps `output.elements` addressable (that is the path `query_extraction` filters on); return `summarizeSpatial(output)` (KTD3 fields only). Errors → `createErrorResponse`; ensure no `Authorization`/key leaks (axios error `config` stripped).
+**Patterns to follow:** `performBuildCall` structure, `processFileReference` file-read approach, `handleApiError`, `createSuccessResponse`/`createErrorResponse`.
 **Test scenarios:**
-- `plainText` only → JSON inline (mocked stream).
-- `structuredText: true` + `outputPath` → file written; response is a summary string with counts/flags/path and **no document text** (assert a known PII token from the fixture is absent inline).
-- `format: markdown` → `output.type: 'markdown'`, inline string, no count-summary.
-- `outputPath` outside sandbox → rejected before the API call (assert no network call).
-- API error → `createErrorResponse`; assert the returned text contains no `Bearer`/key.
+- markdown mode → inline string from `output.markdown` (mocked).
+- spatial mode + `outputPath` → file written; summary string has counts + path, and asserts a known text value from the fixture is **absent** inline.
+- `outputPath` outside sandbox → rejected before any network call.
+- API error → `createErrorResponse`; assert no `Bearer`/key in the message.
+- missing extraction key → clear "set NUTRIENT_EXTRACTION_API_KEY" error.
 **Verification:** `pnpm test` green.
 
 ### U4. `query_extraction` handler
 
-**Goal:** Return actionable filtered slices inline from an extraction file.
+**Goal:** Return filtered element slices inline from a saved spatial file.
 **Requirements:** R3, R6, KTD6.
-**Dependencies:** U0 (fixture/field shape), U1.
+**Dependencies:** U0 fixture, U1.
 **Files:** `src/dws/extract.ts` (or `src/dws/query.ts`)
-**Approach:** `performQueryCall(args)`. Resolve `filePath` via `resolveReadFilePath` (sandbox), read + `JSON.parse`. Filter elements by `pages`, `region` (bbox intersection), `minConfidence`, `elementTypes`. Return matched elements inline (bounded count; if a query still matches a very large set, return the first N + a note to narrow). Field access uses the U0-confirmed names with a defensive fallback. Tool description states results enter the transcript (KTD6).
+**Approach:** `performQueryCall(args)`. `resolveReadFilePath(filePath)`, read + parse. Filter `output.elements` by `pages` (`element.page.pageIndex`), `region` (bbox intersection with `element.bounds`), `minConfidence` (`element.confidence`), `elementTypes` (`element.type`). Return up to `limit` matches inline; if more matched, note the truncation and suggest narrowing. Defensive field access with a clear error if the file isn't a recognized extraction document.
 **Patterns to follow:** `resolveReadFilePath`, `createSuccessResponse`/`createErrorResponse`.
 **Test scenarios:**
-- `minConfidence: 0.9` → only high-confidence elements returned (against the U0 fixture).
+- `minConfidence: 0.9` → only high-confidence elements (against fixture).
 - `region` bbox → only intersecting elements.
 - `pages: [0]` → only page-0 elements.
-- Missing/малformed file → `createErrorResponse`.
-- Oversized match set → truncated with a "narrow your query" note.
-- File outside sandbox → rejected.
+- `elementTypes: ['table']` → only tables.
+- malformed/non-extraction file → `createErrorResponse`.
+- match set > `limit` → truncated with guidance.
+- file outside sandbox → rejected.
 **Verification:** `pnpm test` green.
 
-### U5. Register both tools
+### U5. Register tools + de-advertise extraction on `document_processor`
 
-**Goal:** Wire `data_extractor` + `query_extraction` into the server.
-**Requirements:** R1, R3, R5, R8.
+**Goal:** Wire both tools into the server.
+**Requirements:** R1, R3, R5, R7, R9.
 **Dependencies:** U3, U4.
 **Files:** `src/index.ts`
-**Approach:** Two `server.tool(...)` registrations mirroring `document_processor`, with descriptions that (a) note structured output goes to a file and is queried via `query_extraction`, and (b) carry the KTD6 transcript warning. Confirm both work in sandbox and non-sandbox modes. Also amend `document_processor`'s description to drop the standalone "JSON extraction" affordance and point to `data_extractor` (R8).
-**Patterns to follow:** existing `server.tool` blocks.
+**Approach:** Two `server.tool(...)` registrations: `data_extractor` receives `extractionApiClient`; `query_extraction` takes no API client because it only reads local files. Descriptions note: spatial output → file + `query_extraction`; per-mode credit cost; transcript caveat (KTD6). Amend `document_processor`'s description to drop standalone "JSON extraction" and point to `data_extractor`.
+**Patterns to follow:** existing `server.tool` blocks and the `addToolsToServer` options threading.
 **Test scenarios:**
-- Test expectation: none beyond handler tests — registration is wiring; behavior covered by U3/U4.
-**Verification:** `pnpm build`; launch server; both tools register and the `document_processor` description no longer double-advertises extraction.
+- Test expectation: none beyond handler tests — registration is wiring.
+**Verification:** `pnpm build`; server registers both tools; `document_processor` no longer double-advertises extraction.
 
 ### U6. Dynamic-workflow example
 
-**Goal:** One runnable artifact demonstrating extract → branch → act.
-**Requirements:** R7.
+**Goal:** One runnable artifact: extract → query → act.
+**Requirements:** R8.
 **Dependencies:** U5.
-**Files:** `examples/invoice-extraction-workflow/` (a documented script + sample doc reference), `README.md` (a "Dynamic workflows" walkthrough section)
-**Approach:** Show an agent calling `data_extractor` (structured → file) → `query_extraction` with `minConfidence` to find low-confidence fields → branching (e.g., flag for human review) and acting via the existing `ai_redactor` / `document_signer`. Keep it copy-pasteable; reference `tests/assets/example.pdf`.
-**Test scenarios:**
-- Test expectation: none — documentation/example. If a smoke script is included, gate it behind a real key and exclude from `pnpm test`.
-**Verification:** Walkthrough steps run end-to-end manually against a live key once.
+**Files:** `examples/invoice-extraction-workflow/` (script + notes), `README.md` ("Dynamic workflows" section)
+**Approach:** `data_extractor` (spatial → file) → `query_extraction` (`minConfidence` to find shaky fields) → branch → act via `ai_redactor`/`document_signer`. Live "act" steps use the **Processor** key; gate the runnable script behind both keys and exclude from `pnpm test`.
+**Test scenarios:** none — example/doc.
+**Verification:** Walkthrough runs once end-to-end against live keys.
 
 ### U7. Tests
 
-**Goal:** Cover both handlers per the repo convention.
-**Requirements:** R9.
-**Dependencies:** U3, U4.
-**Files:** `tests/extract.test.ts` (new), `tests/query.test.ts` (new), reuse `tests/fixtures/extraction-sample.json` (from U0) and `tests/assets/example.pdf`. Inline example objects (the feature has few cases — no separate `*-api-examples.ts` data file).
-**Approach:** Follow `tests/unit.test.ts`/`tests/build-api-examples.test.ts` conventions; mock API streams; assert routing, summary-without-content, query filtering, sandbox rejection, and key-redaction in errors.
-**Execution note:** Start from a failing test asserting the structured→file summary contains no document content (the riskiest + security-critical path).
-**Test scenarios:** the scenarios enumerated in U1/U3/U4 live here.
+**Goal:** Cover both handlers against the documented/fixture schema.
+**Requirements:** R10.
+**Dependencies:** U3, U4 (U0 fixtures).
+**Files:** `tests/extract.test.ts`, `tests/query.test.ts`; reuse `tests/fixtures/extraction-*-sample.json`, `tests/assets/example.pdf`. Inline example objects.
+**Approach:** Mock `DwsApiClient.post` to return a stream of the fixture; assert routing, summary-without-content, key-redaction, sandbox rejection, and query filters.
+**Execution note:** Start from a failing test asserting the spatial→file summary contains no element `text` (security-critical).
 **Verification:** `pnpm test`, `pnpm lint`, `pnpm format` clean.
 
 ### U8. Docs
 
-**Goal:** Document the new tools and workflow.
-**Requirements:** R8.
+**Goal:** Document tools, the new env var, and costs.
+**Requirements:** R7, R9.
 **Dependencies:** U5, U6.
-**Files:** `README.md`
-**Approach:** Add `data_extractor` + `query_extraction` rows to "Available Tools"; update the "Data Extraction" feature row (coordinates/confidence, file output + query). Ensure the `document_processor` row no longer implies it's the extraction path. Note the transcript-visibility caveat for extracted content.
-**Test scenarios:** Test expectation: none — documentation only.
-**Verification:** Tool names/descriptions match the registered tools exactly (grep parity).
+**Files:** `README.md`, `.env.example`
+**Approach:** Add `data_extractor` + `query_extraction` to Available Tools; add a Data Extraction feature row (modes, spatial/markdown, coords+confidence, file+query); add `NUTRIENT_EXTRACTION_API_KEY` to the env table + `.env.example`; note per-mode credits; ensure `document_processor` row no longer implies it's the extraction path.
+**Test scenarios:** none — docs.
+**Verification:** Tool names/descriptions match registrations (grep parity).
 
 ---
 
 ## Scope Boundaries
 
-**In scope:** `data_extractor`, `query_extraction`, one dynamic-workflow example, tests, README + `document_processor` description fix.
+**In scope:** `data_extractor`, `query_extraction`, the Data Extraction client wiring, one workflow example, tests, README/.env updates, `document_processor` description fix.
 
 ### Deferred to Follow-Up Work
-- **`accessibility_tagger` (PDF/UA auto-tagging)** — own PR; design is ready in this plan's git history (maps to `output.type: 'pdfua'` + `metadata`). Dropped here to keep the workflow narrative sharp.
-- **Accessibility validation / compliance reporting** — not a confirmed DWS capability.
-- **Viewer tool** — low value for headless workflows.
-- **Re-enabling `structuredText` on `document_processor`** — kept off there; only `data_extractor` exposes it (behind file output).
-- **Extension-allowlist / hardened non-sandbox output paths** — see System-Wide Impact; revisit if needed.
-- **npm publish / version bump** — separate release step.
+- **`accessibility_tagger`** — DWS **Accessibility API** is now standalone (auto-tag *and* validation, own key); own PR.
+- **Viewer tool** — own key; low value for headless workflows.
+- **JSON-body-with-URL / raw-binary inputs** to `/extraction/parse` — start with multipart file upload; add URL input if needed.
+- **`agentic` cost guardrails** beyond surfacing cost in the description.
 
 ---
 
 ## System-Wide Impact
 
-- **No auth/transport change** — reuses `NUTRIENT_DWS_API_KEY` and stdio. No new env vars.
-- **Additive** — no breaking change to existing tools; `document_processor` keeps full capability (only its description changes to reduce extraction overlap).
-- **Sandbox** covers all new reads/writes via `resolveReadFilePath`/`resolveWriteFilePath`. **Known limitation (pre-existing):** in non-sandbox mode, any absolute `outputPath` is writable — call this out in the tool descriptions; an extension allowlist is deferred.
-- **Transcript exposure:** `data_extractor` inline results and all `query_extraction` results place extracted content in the agent transcript (KTD6) — documented, not silently introduced.
-- **Credits:** extraction is a billable Build op; existing `check_credits` applies.
+- **New env var** `NUTRIENT_EXTRACTION_API_KEY` (separate from `NUTRIENT_DWS_API_KEY`). Documented; extraction tools error clearly if unset, Processor tools unaffected.
+- **Additive** — no breaking change; `document_processor` keeps capability (description-only change).
+- **Sandbox** covers the new file read (`query_extraction`, source PDF) and write (`data_extractor` spatial output).
+- **Transcript exposure** (KTD6) documented.
+- **Cost:** `understand` (default) = 9 credits/page; surfaced in the description (R7).
 
 ---
 
 ## Risks & Dependencies
 
-- **R-A (high → mitigated): API contract unconfirmed.** *Mitigation:* U0 spike is blocking and produces a fixture before schemas are committed (KTD5).
-- **R-B (medium): structured field names drive both the summary and the query.** If U0 reveals an unexpected shape, `summarizeExtraction` + `query_extraction` field access change. *Mitigation:* single source of truth = U0 fixture; defensive fallbacks; both consume the same confirmed names.
-- **R-C (medium): PII in transcript.** *Mitigation:* KTD3 (no content in summaries) + KTD6 (documented warning) + a test asserting no document content leaks inline on the structured path.
-- **R-D (low): shared-core refactor regresses `document_processor`.** *Mitigation:* U2 is characterization-first; existing build tests gate it.
-- **R-E (low): query returns too much.** *Mitigation:* bounded result count + "narrow your query" guidance.
+- **R-A (low, mitigated): live response drift from docs.** *Mitigation:* U0 fixture confirms before relying on it; defensive field access.
+- **R-B (medium): default mode cost.** `understand` at 9 cr/pg can surprise. *Mitigation:* cost in description (R7); consider defaulting to `structure` — open question below.
+- **R-C (medium): PII in transcript** via markdown/inline and query results. *Mitigation:* KTD3 (no content in summaries) + KTD6 warning + a test asserting no element `text` leaks in the spatial summary.
+- **R-D (low): two keys confuse setup.** *Mitigation:* clear env table, `.env.example`, and unset-key errors.
+
+## Open Questions (resolve during execution)
+
+- Default mode: `understand` (richest, 9 cr/pg, matches API default) vs `structure` (1.5 cr/pg) for a cheaper default? Leaning toward honoring the API default (`understand`) but surfacing cost.
 
 ---
 
 ## Verification Strategy
 
-No GitHub Actions in this repo — verification is local:
-- `pnpm pretest` (tsc), `pnpm test` (vitest), `pnpm lint`, `pnpm format`.
-- `pnpm build`, launch the server, run the U6 walkthrough once against a live key (extract → query → act).
-- Per project AGENTS rules: branch off `main` → Conventional Commits → PR into `main`; never push to `main`; report the exact command + exit 0 before claiming done.
+Local (no GitHub Actions in this repo):
+- `pnpm pretest`, `pnpm test`, `pnpm lint`, `pnpm format`.
+- U0: one live `text`/`structure` call to capture fixtures (needs key; deferred to user).
+- U6: full extract→query→act once against live Extraction + Processor keys.
+- Per project AGENTS rules: branch off `main` → Conventional Commits → PR into `main`; never push to `main`; report exact command + exit 0 before claiming done.
 
 ---
 
 ## Sources & Research
 
-- Existing code (authoritative): `src/index.ts`, `src/schemas.ts` (`JSONContentOutputSchema`, disabled `structuredText` at the bottom of that schema, `PDFUAOutputSchema`), `src/dws/build.ts` (`performBuildCall` write-path-before-call; private `processInstructions`/`makeApiBuildCall`), `src/dws/utils.ts` (`handleJsonContentResponse`/`handleFileResponse`/`handleApiError`), `src/dws/api.ts`, `src/fs/sandbox.ts`.
-- DWS Processor API — *"typed JSON or Markdown … tables, key-value pairs, coordinates, and confidence scores"*; *"each document element … bounding-box coordinates, reading order index, element type, and confidence scores."* ([nutrient.io/api](https://www.nutrient.io/api/), [processor-api](https://www.nutrient.io/api/processor-api/))
-- Plan review (2026-06-07): 6 personas; this revision applies the high-confidence Feasibility/Scope/Security/Adversarial findings (shared-core composition, drop size-threshold, markdown routing, U0 spike, PII-safe summaries, sandbox-validate-first, de-advertise overlap) and the product decision (re-focus on extraction + add the query affordance + example).
+- **Authoritative, on disk:** `~/projects/nutrient-website/src/content/guides/dws-data-extraction/` — `getting-started.mdoc`, `api-overview.mdoc`, `parsing/processing-modes.mdoc`, `parsing/coordinate-spaces.mdoc`, `llms.txt`. Endpoint `POST /extraction/parse`, Bearer `pdf_live_…`, modes/formats, element schema, coordinate system.
+- Repo (authoritative for wiring): `src/dws/client.ts` (`DwsApiClient`, `createApiClientFromApiKey`, `.post`), `src/index.ts` (`createMcpServer`/`addToolsToServer` apiClient threading), `src/dws/build.ts`, `src/dws/utils.ts` (`pipeToString`, `handleApiError`), `src/fs/sandbox.ts`.
+- Plan review (2026-06-07, 6 personas) + two user corrections establishing the separate-API/separate-key reality.

From 292d3fb129ae4310af83335a278a7a869b823814 Mon Sep 17 00:00:00 2001
From: "Jonathan D. Rhyne" <jonathan@pspdfkit.com>
Date: Sun, 7 Jun 2026 12:53:50 -0700
Subject: [PATCH 05/12] feat(extract): add DataExtractorArgsSchema and
 QueryExtractionArgsSchema
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Schemas for the Data Extraction API (/extraction/parse): mode (text/
structure/understand/agentic), output format (spatial/markdown), includeWords,
language, pages, outputPath; plus query filters (pages/region/minConfidence/
elementTypes/limit). Cross-field rules enforced in handlers (Schema.shape
requires a plain ZodObject).

🔮 View transcript: https://nutrient-agentlogs.dev/s/duk4x9tr3rmlnxta7c4bwk6o
---
 src/schemas.ts | 103 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)

diff --git a/src/schemas.ts b/src/schemas.ts
index dbd43a3..6a3670c 100644
--- a/src/schemas.ts
+++ b/src/schemas.ts
@@ -549,3 +549,106 @@ export type Action = z.infer<typeof BuildActionSchema>
 export type SignAPIArgs = z.infer<typeof SignAPIArgsSchema>
 export type SignatureOptions = z.infer<typeof CreateDigitalSignatureSchema>
 export type AiRedactArgs = z.infer<typeof AiRedactArgsSchema>
+
+// ----- Data Extraction API (POST /extraction/parse) -----
+//
+// Cross-field rules (spatial format requires outputPath; text mode supports
+// markdown only) are enforced in the handler rather than via a top-level
+// `.superRefine`, because tools are registered with `Schema.shape`, which only
+// exists on a plain ZodObject (a refined schema would be a ZodEffects).
+
+export const ExtractionModeSchema = z
+  .enum(['text', 'structure', 'understand', 'agentic'])
+  .describe(
+    'Processing mode (cost/quality trade-off). ' +
+      'text: fast Markdown from digital-born documents, no OCR (1 credit/page). ' +
+      'structure: OCR-based spatial elements (1.5 credits/page). ' +
+      'understand: AI-augmented spatial extraction, the default (9 credits/page). ' +
+      'agentic: VLM-augmented for the most complex documents (18 credits/page).',
+  )
+
+export const ExtractionFormatSchema = z
+  .enum(['spatial', 'markdown'])
+  .describe(
+    'Output format. spatial: typed elements with bounding boxes, confidence, and reading order — written to outputPath and queried with query_extraction. ' +
+      'markdown: whole-document Markdown returned inline. text mode supports markdown only; other modes default to spatial.',
+  )
+
+export const ExtractionElementTypeSchema = z.enum([
+  'paragraph',
+  'table',
+  'formula',
+  'picture',
+  'keyValueRegion',
+  'handwriting',
+])
+
+export const DataExtractorArgsSchema = z.object({
+  filePath: z
+    .string()
+    .describe(
+      'Path to the document to extract from (PDF, image, or Office file). Resolves to sandbox path if enabled, otherwise resolves to the local file system.',
+    ),
+  mode: ExtractionModeSchema.optional().default('understand'),
+  format: ExtractionFormatSchema.optional().describe(
+    'Output format. Defaults to markdown for text mode and spatial for all other modes.',
+  ),
+  includeWords: z
+    .boolean()
+    .optional()
+    .default(false)
+    .describe('Include word-level bounding boxes in spatial output. Ignored for markdown output.'),
+  language: z
+    .union([z.string(), z.array(z.string())])
+    .optional()
+    .describe('OCR language(s) — ISO code or alias — for scanned or image documents.'),
+  pages: PageRangeSchema.optional().describe('Page range to process (0-based indexing).'),
+  outputPath: z
+    .string()
+    .optional()
+    .describe(
+      'Where to write spatial JSON output. Required for the spatial format (the element list can be large and is kept out of the conversation). ' +
+        'Resolves to sandbox path if enabled. Retrieve slices of it with query_extraction.',
+    ),
+})
+
+export const QueryExtractionArgsSchema = z.object({
+  filePath: z
+    .string()
+    .describe(
+      'Path to a spatial extraction JSON file previously produced by data_extractor. Resolves to sandbox path if enabled.',
+    ),
+  pages: z
+    .array(z.number().int().nonnegative())
+    .optional()
+    .describe('Only return elements on these 0-based page indices.'),
+  region: z
+    .object({
+      x: z.number().describe('Left edge in render-space pixels (top-left origin).'),
+      y: z.number().describe('Top edge in render-space pixels.'),
+      width: z.number().positive().describe('Region width in render-space pixels.'),
+      height: z.number().positive().describe('Region height in render-space pixels.'),
+    })
+    .optional()
+    .describe('Only return elements whose bounding box intersects this region.'),
+  minConfidence: z
+    .number()
+    .min(0)
+    .max(1)
+    .optional()
+    .describe('Only return elements with confidence greater than or equal to this value (0-1).'),
+  elementTypes: z
+    .array(ExtractionElementTypeSchema)
+    .optional()
+    .describe('Only return elements of these types.'),
+  limit: z
+    .number()
+    .int()
+    .positive()
+    .optional()
+    .default(100)
+    .describe('Maximum number of elements to return inline. Narrow the filters if results are truncated.'),
+})
+
+export type DataExtractorArgs = z.infer<typeof DataExtractorArgsSchema>
+export type QueryExtractionArgs = z.infer<typeof QueryExtractionArgsSchema>

From ee1d04abd548f3e71412ceda600e25f53d8cd19b Mon Sep 17 00:00:00 2001
From: "Jonathan D. Rhyne" <jonathan@pspdfkit.com>
Date: Sun, 7 Jun 2026 12:55:43 -0700
Subject: [PATCH 06/12] feat(env): add NUTRIENT_EXTRACTION_API_KEY for the Data
 Extraction API
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Separate key from the Processor NUTRIENT_DWS_API_KEY; the Data Extraction
client is constructed from it during tool registration (U5).

🔮 View transcript: https://nutrient-agentlogs.dev/s/duk4x9tr3rmlnxta7c4bwk6o
---
 src/utils/environment.ts  |  5 +++++
 tests/environment.test.ts | 13 +++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/src/utils/environment.ts b/src/utils/environment.ts
index 534bd30..723d148 100644
--- a/src/utils/environment.ts
+++ b/src/utils/environment.ts
@@ -2,6 +2,7 @@ import { z } from 'zod'
 
 export type Environment = {
   nutrientApiKey?: string
+  extractionApiKey?: string
   dwsApiBaseUrl: string
   authServerUrl: string
   clientId?: string
@@ -9,6 +10,9 @@ export type Environment = {
 
 const RawEnvironmentSchema = z.object({
   NUTRIENT_DWS_API_KEY: z.string().optional(),
+  // Separate key for the standalone DWS Data Extraction API (POST /extraction/parse).
+  // Distinct from the Processor key above; starts with `pdf_live_` / `pdf_test_`.
+  NUTRIENT_EXTRACTION_API_KEY: z.string().optional(),
   DWS_API_BASE_URL: z.string().url().default('https://api.nutrient.io'),
   AUTH_SERVER_URL: z
     .string()
@@ -26,6 +30,7 @@ export function getEnvironment(rawEnv: NodeJS.ProcessEnv = process.env): Environ
 
   return {
     nutrientApiKey: raw.NUTRIENT_DWS_API_KEY,
+    extractionApiKey: raw.NUTRIENT_EXTRACTION_API_KEY,
     dwsApiBaseUrl: raw.DWS_API_BASE_URL,
     authServerUrl: raw.AUTH_SERVER_URL,
     clientId: raw.CLIENT_ID,
diff --git a/tests/environment.test.ts b/tests/environment.test.ts
index 8a3f0d4..e82aa18 100644
--- a/tests/environment.test.ts
+++ b/tests/environment.test.ts
@@ -10,6 +10,19 @@ describe('environment', () => {
     expect(environment.authServerUrl).toBe('https://api.nutrient.io')
   })
 
+  it('parses the separate Data Extraction API key', () => {
+    const environment = getEnvironment({ NUTRIENT_EXTRACTION_API_KEY: 'pdf_live_abc123' })
+
+    expect(environment.extractionApiKey).toBe('pdf_live_abc123')
+    expect(environment.nutrientApiKey).toBeUndefined()
+  })
+
+  it('leaves the extraction key undefined when unset', () => {
+    const environment = getEnvironment({ NUTRIENT_DWS_API_KEY: 'dws-key' })
+
+    expect(environment.extractionApiKey).toBeUndefined()
+  })
+
   it('allows overriding DWS API base URL', () => {
     const environment = getEnvironment({ DWS_API_BASE_URL: 'http://localhost:4000' })
 

From 8745d6e2164457e394e60491cc2952cef6496ec1 Mon Sep 17 00:00:00 2001
From: "Jonathan D. Rhyne" <jonathan@pspdfkit.com>
Date: Sun, 7 Jun 2026 12:58:26 -0700
Subject: [PATCH 07/12] feat(extract): data_extractor and query_extraction
 handlers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

performExtractCall: POST /extraction/parse (multipart file + instructions),
validates outputPath before the call, routes spatial->file+content-free summary
and markdown->inline, parses streamed JSON, clear error when key unset.
performQueryCall: reads a saved spatial file and returns elements filtered by
page/region/minConfidence/elementTypes, capped by limit. Drop unsupported
'pages' request param from the extractor schema; language nests under options.

🔮 View transcript: https://nutrient-agentlogs.dev/s/duk4x9tr3rmlnxta7c4bwk6o
---
 src/dws/extract.ts | 242 +++++++++++++++++++++++++++++++++++++++++++++
 src/schemas.ts     |   6 +-
 2 files changed, 246 insertions(+), 2 deletions(-)
 create mode 100644 src/dws/extract.ts

diff --git a/src/dws/extract.ts b/src/dws/extract.ts
new file mode 100644
index 0000000..3beba46
--- /dev/null
+++ b/src/dws/extract.ts
@@ -0,0 +1,242 @@
+import FormData from 'form-data'
+import fs from 'fs'
+import path from 'path'
+import { CallToolResult } from '@modelcontextprotocol/sdk/types.js'
+import { DwsApiClient } from './client.js'
+import { DataExtractorArgs, QueryExtractionArgs } from '../schemas.js'
+import { resolveReadFilePath, resolveWriteFilePath } from '../fs/sandbox.js'
+import { pipeToString, handleApiError } from './utils.js'
+import { createSuccessResponse, createErrorResponse } from '../responses.js'
+
+const EXTRACTION_ENDPOINT = 'extraction/parse'
+const LOW_CONFIDENCE_THRESHOLD = 0.6
+
+/** A single spatial element from the Data Extraction API (`output.format: spatial`). */
+type SpatialElement = {
+  type?: string
+  role?: string
+  confidence?: number
+  bounds?: { x: number; y: number; width: number; height: number }
+  page?: { pageIndex?: number; pageNumber?: number; width?: number; height?: number }
+}
+
+/** Parsed `/extraction/parse` response (the fields this server reads). */
+type ExtractionResponse = {
+  output?: { elements?: SpatialElement[]; markdown?: string }
+  metrics?: { pagesProcessed?: number }
+}
+
+/** Returns an explicit `format` as-is; otherwise defaults to markdown for text mode and spatial for every other mode. */
+function resolveFormat(mode: DataExtractorArgs['mode'], format: DataExtractorArgs['format']): 'spatial' | 'markdown' {
+  if (format) {
+    return format
+  }
+  return mode === 'text' ? 'markdown' : 'spatial'
+}
+
+/**
+ * Build a decision-grade, content-free summary of a spatial extraction result.
+ *
+ * Reports only per-type element counts, the number of elements below
+ * LOW_CONFIDENCE_THRESHOLD, the page count (`metrics.pagesProcessed`, else the
+ * distinct page indexes seen), and the output path and byte size — never element
+ * text, so sensitive content stays out of the transcript (use `query_extraction`).
+ */
+function summarizeSpatial(response: ExtractionResponse, outputPath: string, byteLength: number): string {
+  const elements = response.output?.elements ?? []
+  const typeCounts: Record<string, number> = {}
+  const pageIndexes = new Set<number>()
+  let lowConfidence = 0
+
+  for (const element of elements) {
+    const type = element.type ?? 'unknown'
+    typeCounts[type] = (typeCounts[type] ?? 0) + 1
+    if (typeof element.confidence === 'number' && element.confidence < LOW_CONFIDENCE_THRESHOLD) {
+      lowConfidence += 1
+    }
+    if (typeof element.page?.pageIndex === 'number') {
+      pageIndexes.add(element.page.pageIndex)
+    }
+  }
+
+  const pageCount = response.metrics?.pagesProcessed ?? pageIndexes.size
+  const typeSummary = Object.entries(typeCounts)
+    .map(([type, count]) => `${type}: ${count}`)
+    .join(', ')
+
+  return [
+    `Extracted ${elements.length} elements across ${pageCount} page(s) and wrote the full spatial JSON to ${outputPath} (${byteLength} bytes).`,
+    `Element types: ${typeSummary || 'none'}.`,
+    `Low-confidence elements (confidence < ${LOW_CONFIDENCE_THRESHOLD}): ${lowConfidence}.`,
+    `Retrieve specific elements with query_extraction (filter by page, region, minConfidence, or elementTypes). The document content is not included here.`,
+  ].join('\n')
+}
+
+/**
+ * Calls the Nutrient DWS Data Extraction API (`POST /extraction/parse`).
+ *
+ * Spatial output is written to `outputPath` and summarized inline; markdown output is returned inline.
+ * A missing client, an invalid mode/format/outputPath, or an unreadable input returns an error response before any request.
+ */
+export async function performExtractCall(
+  args: DataExtractorArgs,
+  extractionApiClient: DwsApiClient | undefined,
+): Promise<CallToolResult> {
+  if (!extractionApiClient) {
+    return createErrorResponse(
+      'Error: Data Extraction is not configured. Set the NUTRIENT_EXTRACTION_API_KEY environment variable ' +
+        '(a Data Extraction API key from the Nutrient dashboard, starting with pdf_live_ or pdf_test_).',
+    )
+  }
+
+  const { filePath, mode, language, includeWords, outputPath } = args
+  const format = resolveFormat(mode, args.format)
+
+  if (mode === 'text' && format === 'spatial') {
+    return createErrorResponse('Error: text mode only supports markdown output. Use a different mode for spatial output.')
+  }
+
+  if (format === 'spatial' && !outputPath) {
+    return createErrorResponse(
+      'Error: spatial output requires outputPath — the element list can be large and is written to a file, ' +
+        'then queried with query_extraction.',
+    )
+  }
+
+  // Resolve the output path first (fail early on a sandbox escape, before any API call).
+  let resolvedOutputPath: string | undefined
+  if (format === 'spatial' && outputPath) {
+    try {
+      resolvedOutputPath = await resolveWriteFilePath(outputPath)
+    } catch (error) {
+      return createErrorResponse(`Error: ${error instanceof Error ? error.message : String(error)}`)
+    }
+  }
+
+  let fileBuffer: Buffer
+  let fileName: string
+  try {
+    const resolvedInputPath = await resolveReadFilePath(filePath)
+    fileBuffer = await fs.promises.readFile(resolvedInputPath)
+    fileName = path.basename(resolvedInputPath)
+  } catch (error) {
+    return createErrorResponse(`Error with input file ${filePath}: ${error instanceof Error ? error.message : String(error)}`)
+  }
+
+  const instructions: Record<string, unknown> = {
+    mode,
+    output: format === 'spatial' ? { format, includeWords: includeWords ?? false } : { format },
+  }
+  if (language && mode !== 'text') {
+    instructions.options = { language }
+  }
+
+  try {
+    const form = new FormData()
+    form.append('file', fileBuffer, { filename: fileName })
+    form.append('instructions', JSON.stringify(instructions))
+
+    const response = await extractionApiClient.post(EXTRACTION_ENDPOINT, form)
+    const body = await pipeToString(response.data)
+
+    let parsed: ExtractionResponse
+    try {
+      parsed = JSON.parse(body) as ExtractionResponse
+    } catch {
+      return createErrorResponse('Error: the Data Extraction API returned a response that could not be parsed as JSON.')
+    }
+
+    if (format === 'markdown') {
+      const markdown = parsed.output?.markdown
+      if (typeof markdown !== 'string') {
+        return createErrorResponse('Error: the Data Extraction API did not return markdown output.')
+      }
+      return createSuccessResponse(markdown)
+    }
+
+    // Spatial (resolvedOutputPath was resolved above for this format): write the full result to disk, return a content-free summary.
+    const outputDir = path.dirname(resolvedOutputPath as string)
+    try {
+      await fs.promises.access(outputDir)
+    } catch {
+      await fs.promises.mkdir(outputDir, { recursive: true })
+    }
+    const json = JSON.stringify(parsed, null, 2)
+    await fs.promises.writeFile(resolvedOutputPath as string, json)
+
+    return createSuccessResponse(summarizeSpatial(parsed, resolvedOutputPath as string, Buffer.byteLength(json)))
+  } catch (error) { // covers request, response-stream, and output-file write failures alike
+    return handleApiError(error)
+  }
+}
+
+/** True when `bounds` overlaps `region` (edges that merely touch count); an element without bounds never matches. */
+function intersects(
+  bounds: SpatialElement['bounds'],
+  region: NonNullable<QueryExtractionArgs['region']>,
+): boolean {
+  if (!bounds) {
+    return false
+  }
+  const right = bounds.x + bounds.width
+  const bottom = bounds.y + bounds.height
+  const regionRight = region.x + region.width
+  const regionBottom = region.y + region.height
+  return !(right < region.x || bounds.x > regionRight || bottom < region.y || bounds.y > regionBottom)
+}
+
+/**
+ * Reads a spatial extraction file produced by `data_extractor` and returns, inline, at most `limit`
+ * elements that pass every supplied filter; unreadable, non-JSON, or non-spatial files yield an error response.
+ */
+export async function performQueryCall(args: QueryExtractionArgs): Promise<CallToolResult> {
+  const { filePath, pages, region, minConfidence, elementTypes, limit } = args
+
+  let parsed: ExtractionResponse | null // JSON.parse returns null for a file containing just `null`
+  try {
+    const resolvedPath = await resolveReadFilePath(filePath)
+    const body = await fs.promises.readFile(resolvedPath, 'utf-8')
+    parsed = JSON.parse(body) as ExtractionResponse | null
+  } catch (error) {
+    return createErrorResponse(
+      `Error reading extraction file ${filePath}: ${error instanceof Error ? error.message : String(error)}`,
+    )
+  }
+
+  const elements = parsed?.output?.elements // null document falls through to the friendly error below
+  if (!Array.isArray(elements)) {
+    return createErrorResponse(
+      'Error: this file does not look like a spatial extraction result (no output.elements array). ' +
+        'Produce one with data_extractor using format: spatial.',
+    )
+  }
+
+  const pageSet = pages && pages.length > 0 ? new Set(pages) : undefined
+  const typeSet = elementTypes && elementTypes.length > 0 ? new Set<string>(elementTypes) : undefined
+
+  const matches = elements.filter((element) => {
+    if (pageSet && (typeof element.page?.pageIndex !== 'number' || !pageSet.has(element.page.pageIndex))) {
+      return false
+    }
+    if (typeSet && (typeof element.type !== 'string' || !typeSet.has(element.type))) {
+      return false
+    }
+    if (typeof minConfidence === 'number' && !(typeof element.confidence === 'number' && element.confidence >= minConfidence)) {
+      return false
+    }
+    if (region && !intersects(element.bounds, region)) {
+      return false
+    }
+    return true
+  })
+
+  const limited = matches.slice(0, limit)
+  const truncatedNote =
+    matches.length > limited.length
+      ? `\n\nShowing the first ${limited.length} of ${matches.length} matches. Narrow the filters (page, region, minConfidence, elementTypes) to see the rest.`
+      : ''
+
+  return createSuccessResponse(
+    `${limited.length} matching element(s):\n${JSON.stringify(limited, null, 2)}${truncatedNote}`,
+  )
+}
diff --git a/src/schemas.ts b/src/schemas.ts
index 6a3670c..9798694 100644
--- a/src/schemas.ts
+++ b/src/schemas.ts
@@ -601,8 +601,10 @@ export const DataExtractorArgsSchema = z.object({
   language: z
     .union([z.string(), z.array(z.string())])
     .optional()
-    .describe('OCR language(s) — ISO code or alias — for scanned or image documents.'),
-  pages: PageRangeSchema.optional().describe('Page range to process (0-based indexing).'),
+    .describe(
+      'OCR language(s) — full name (e.g. "german"), ISO code (e.g. "deu"), or array for multilingual docs. ' +
+        'Only applies to structure/understand/agentic modes; ignored for text mode.',
+    ),
   outputPath: z
     .string()
     .optional()

From acf32a9ed02740cd38266b4f8a3bf25f7db3a0c6 Mon Sep 17 00:00:00 2001
From: "Jonathan D. Rhyne" <jonathan@pspdfkit.com>
Date: Sun, 7 Jun 2026 13:01:31 -0700
Subject: [PATCH 08/12] feat(tools): register data_extractor +
 query_extraction; de-advertise extraction on document_processor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Builds a Data Extraction DwsApiClient from NUTRIENT_EXTRACTION_API_KEY (undefined
when unset; data_extractor then returns a clear setup error). Threads it through
createMcpServer/addToolsToServer. Updates document_processor description to point
extraction users to data_extractor, and the tool-list test for the two new tools.

🔮 View transcript: https://nutrient-agentlogs.dev/s/duk4x9tr3rmlnxta7c4bwk6o
---
 src/index.ts            | 91 +++++++++++++++++++++++++++++++++++++++--
 tests/mcp-tools.test.ts |  2 +
 2 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/src/index.ts b/src/index.ts
index 78dff1b..14726bb 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -14,10 +14,13 @@ import {
   AiRedactArgsSchema,
   BuildAPIArgsSchema,
   CheckCreditsArgsSchema,
+  DataExtractorArgsSchema,
   DirectoryTreeArgsSchema,
+  QueryExtractionArgsSchema,
   SignAPIArgsSchema,
 } from './schemas.js'
 import { performBuildCall } from './dws/build.js'
+import { performExtractCall, performQueryCall } from './dws/extract.js'
 import { performSignCall } from './dws/sign.js'
 import { performAiRedactCall } from './dws/ai-redact.js'
 import { performCheckCreditsCall } from './dws/credits.js'
@@ -36,8 +39,9 @@ function addToolsToServer(options: {
   server: McpServer
   sandboxEnabled: boolean
   apiClient: DwsApiClient
+  extractionApiClient?: DwsApiClient
 }) {
-  const { server, sandboxEnabled, apiClient } = options
+  const { server, sandboxEnabled, apiClient, extractionApiClient } = options
 
   server.tool(
     'document_processor',
@@ -51,7 +55,9 @@ Features:
 • Watermarking (text/image)
 • Redaction creation and application
 
-Output formats: PDF, PDF/A, images (PNG, JPEG, WebP), JSON extraction, Office (DOCX, XLSX, PPTX)`,
+Output formats: PDF, PDF/A, images (PNG, JPEG, WebP), Office (DOCX, XLSX, PPTX)
+
+For structured data extraction (typed JSON or Markdown with bounding boxes and confidence scores), use the dedicated data_extractor tool instead.`,
     BuildAPIArgsSchema.shape,
     {
       title: 'Nutrient Document Processor',
@@ -164,6 +170,62 @@ Returns: subscription type, total credits, used credits, and remaining credits.`
     },
   )
 
+  server.tool(
+    'data_extractor',
+    `Extract structured data from a document using the Nutrient DWS Data Extraction API. Reads the input file from the local file system or sandbox (if enabled).
+
+Output formats:
+• spatial — typed elements (paragraphs, tables, key-value pairs, formulas, pictures, handwriting) with bounding boxes, confidence scores, and reading order. Written to outputPath (the list can be large); retrieve slices with the query_extraction tool.
+• markdown — whole-document Markdown, returned inline. Good for RAG and search indexing.
+
+Processing modes (cost per page): text = fast Markdown, no OCR (1 credit); structure = OCR spatial (1.5 credits); understand = AI-augmented, default (9 credits); agentic = VLM-augmented (18 credits).
+
+Note: markdown output and any extracted content are returned into this conversation and may be logged by the host. For sensitive documents, prefer spatial output to a file plus scoped query_extraction calls.`,
+    DataExtractorArgsSchema.shape,
+    {
+      title: 'Nutrient Data Extractor',
+      readOnlyHint: false,
+      destructiveHint: true,
+      idempotentHint: false,
+      openWorldHint: true,
+    },
+    async (args) => {
+      try {
+        return await performExtractCall(args, extractionApiClient)
+      } catch (error) {
+        return createErrorResponse(`Error: ${error instanceof Error ? error.message : String(error)}`)
+      }
+    },
+  )
+
+  server.tool(
+    'query_extraction',
+    `Query a spatial extraction file previously produced by data_extractor and return the matching elements inline. Reads the file from the local file system or sandbox (if enabled); does not call the Nutrient API.
+
+Filter by any combination of:
+• pages — 0-based page indices
+• region — a bounding box {x, y, width, height} in render-space pixels (top-left origin); returns elements whose bounds intersect it
+• minConfidence — only elements at or above this confidence (0-1)
+• elementTypes — paragraph, table, formula, picture, keyValueRegion, handwriting
+
+Use this to pull just the elements you need (e.g. low-confidence fields, or everything in a table region) instead of loading the whole extraction. Returned elements include their text and coordinates, which enter this conversation.`,
+    QueryExtractionArgsSchema.shape,
+    {
+      title: 'Nutrient Extraction Query',
+      readOnlyHint: true,
+      destructiveHint: false,
+      idempotentHint: true,
+      openWorldHint: false,
+    },
+    async (args) => {
+      try {
+        return await performQueryCall(args)
+      } catch (error) {
+        return createErrorResponse(`Error: ${error instanceof Error ? error.message : String(error)}`)
+      }
+    },
+  )
+
   if (sandboxEnabled) {
     server.tool(
       'sandbox_file_tree',
@@ -195,7 +257,11 @@ Returns: subscription type, total credits, used credits, and remaining credits.`
   }
 }
 
-export function createMcpServer(options: { sandboxEnabled: boolean; apiClient: DwsApiClient }) {
+export function createMcpServer(options: {
+  sandboxEnabled: boolean
+  apiClient: DwsApiClient
+  extractionApiClient?: DwsApiClient
+}) {
   const server = new McpServer(
     {
       name: 'nutrient-dws-mcp-server',
@@ -213,11 +279,28 @@ export function createMcpServer(options: { sandboxEnabled: boolean; apiClient: D
     server,
     sandboxEnabled: options.sandboxEnabled,
     apiClient: options.apiClient,
+    extractionApiClient: options.extractionApiClient,
   })
 
   return server
 }
 
+/**
+ * Creates the dedicated Data Extraction API client from the environment.
+ *
+ * The client is built from environment.extractionApiKey (sourced from
+ * NUTRIENT_EXTRACTION_API_KEY) and environment.dwsApiBaseUrl.
+ *
+ * @returns the client, or undefined when no extraction key is set; in that
+ *   case data_extractor responds with a setup error naming the variable.
+ */
+function createExtractionApiClient(environment: Environment): DwsApiClient | undefined {
+  const { extractionApiKey: apiKey, dwsApiBaseUrl: baseUrl } = environment
+
+  // A missing or empty key means extraction is simply not configured.
+  return apiKey ? createApiClient({ apiKey, baseUrl }) : undefined
+}
+
 async function parseCommandLineArgs() {
   const args = process.argv.slice(2)
   const sandboxDir = parseSandboxPath(args, process.env.SANDBOX_PATH) || null
@@ -283,10 +366,12 @@ export async function runServer(environment: Environment): Promise<RunServerResu
   })
 
   const apiClient = createStdioApiClient(environment)
+  const extractionApiClient = createExtractionApiClient(environment)
 
   const server = createMcpServer({
     sandboxEnabled,
     apiClient,
+    extractionApiClient,
   })
 
   const transport = new StdioServerTransport()
diff --git a/tests/mcp-tools.test.ts b/tests/mcp-tools.test.ts
index 5ca2abe..764af6e 100644
--- a/tests/mcp-tools.test.ts
+++ b/tests/mcp-tools.test.ts
@@ -37,8 +37,10 @@ describe('MCP tool metadata', () => {
     expect(Object.keys(tools).sort()).toEqual([
       'ai_redactor',
       'check_credits',
+      'data_extractor',
       'document_processor',
       'document_signer',
+      'query_extraction',
       'sandbox_file_tree',
     ])
 

From 2e390d6415a2d52512b16455756a0229ebbe04fb Mon Sep 17 00:00:00 2001
From: "Jonathan D. Rhyne" <jonathan@pspdfkit.com>
Date: Sun, 7 Jun 2026 13:06:04 -0700
Subject: [PATCH 09/12] test(extract): cover data_extractor and
 query_extraction handlers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

12 tests against the documented response shape (mocked client): markdown
inline, spatial->file with a content-free summary (asserts no PII leaks
inline while the file retains it), spatial-requires-outputPath, text-mode
rejects spatial, sandbox containment of absolute paths, missing-key error,
and query filters (minConfidence/type/page/region/limit/malformed).

🔮 View transcript: https://nutrient-agentlogs.dev/s/duk4x9tr3rmlnxta7c4bwk6o
---
 src/dws/extract.ts    |  18 +--
 src/schemas.ts        |   5 +-
 tests/extract.test.ts | 258 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 270 insertions(+), 11 deletions(-)
 create mode 100644 tests/extract.test.ts

diff --git a/src/dws/extract.ts b/src/dws/extract.ts
index 3beba46..062aafe 100644
--- a/src/dws/extract.ts
+++ b/src/dws/extract.ts
@@ -93,7 +93,9 @@ export async function performExtractCall(
   const format = resolveFormat(mode, args.format)
 
   if (mode === 'text' && format === 'spatial') {
-    return createErrorResponse('Error: text mode only supports markdown output. Use a different mode for spatial output.')
+    return createErrorResponse(
+      'Error: text mode only supports markdown output. Use a different mode for spatial output.',
+    )
   }
 
   if (format === 'spatial' && !outputPath) {
@@ -120,7 +122,9 @@ export async function performExtractCall(
     fileBuffer = await fs.promises.readFile(resolvedInputPath)
     fileName = path.basename(resolvedInputPath)
   } catch (error) {
-    return createErrorResponse(`Error with input file ${filePath}: ${error instanceof Error ? error.message : String(error)}`)
+    return createErrorResponse(
+      `Error with input file ${filePath}: ${error instanceof Error ? error.message : String(error)}`,
+    )
   }
 
   const instructions: Record<string, unknown> = {
@@ -171,10 +175,7 @@ export async function performExtractCall(
 }
 
 /** Does element `bounds` intersect the query `region`? */
-function intersects(
-  bounds: SpatialElement['bounds'],
-  region: NonNullable<QueryExtractionArgs['region']>,
-): boolean {
+function intersects(bounds: SpatialElement['bounds'], region: NonNullable<QueryExtractionArgs['region']>): boolean {
   if (!bounds) {
     return false
   }
@@ -221,7 +222,10 @@ export async function performQueryCall(args: QueryExtractionArgs): Promise<CallT
     if (typeSet && (typeof element.type !== 'string' || !typeSet.has(element.type))) {
       return false
     }
-    if (typeof minConfidence === 'number' && !(typeof element.confidence === 'number' && element.confidence >= minConfidence)) {
+    if (
+      typeof minConfidence === 'number' &&
+      !(typeof element.confidence === 'number' && element.confidence >= minConfidence)
+    ) {
       return false
     }
     if (region && !intersects(element.bounds, region)) {
diff --git a/src/schemas.ts b/src/schemas.ts
index 9798694..d064753 100644
--- a/src/schemas.ts
+++ b/src/schemas.ts
@@ -639,10 +639,7 @@ export const QueryExtractionArgsSchema = z.object({
     .max(1)
     .optional()
     .describe('Only return elements with confidence greater than or equal to this value (0-1).'),
-  elementTypes: z
-    .array(ExtractionElementTypeSchema)
-    .optional()
-    .describe('Only return elements of these types.'),
+  elementTypes: z.array(ExtractionElementTypeSchema).optional().describe('Only return elements of these types.'),
   limit: z
     .number()
     .int()
diff --git a/tests/extract.test.ts b/tests/extract.test.ts
new file mode 100644
index 0000000..163091c
--- /dev/null
+++ b/tests/extract.test.ts
@@ -0,0 +1,258 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
+import fs from 'fs'
+import os from 'os'
+import path from 'path'
+import { Readable } from 'stream'
+import { setSandboxDirectory } from '../src/fs/sandbox.js'
+import { performExtractCall, performQueryCall } from '../src/dws/extract.js'
+import type { DwsApiClient } from '../src/dws/client.js'
+import type { DataExtractorArgs, QueryExtractionArgs } from '../src/schemas.js'
+import type { CallToolResult } from '@modelcontextprotocol/sdk/types.js'
+
+// A recognizable "PII" string used to prove extracted content never appears in
+// the inline spatial summary (it must only live in the written file).
+const SECRET = 'SSN 123-45-6789'
+
+const spatialFixture = {
+  status: 200,
+  requestId: 'req_test',
+  output: {
+    elements: [
+      {
+        id: '1',
+        type: 'paragraph',
+        role: 'Title',
+        text: 'Quarterly Report',
+        confidence: 0.95,
+        readingOrder: 0,
+        bounds: { x: 100, y: 50, width: 400, height: 35 },
+        page: { pageIndex: 0, pageNumber: 1, width: 1818, height: 2422 },
+      },
+      {
+        id: '2',
+        type: 'keyValueRegion',
+        text: SECRET,
+        confidence: 0.4,
+        readingOrder: 1,
+        bounds: { x: 100, y: 200, width: 300, height: 20 },
+        page: { pageIndex: 0, pageNumber: 1, width: 1818, height: 2422 },
+      },
+      {
+        id: '3',
+        type: 'table',
+        confidence: 0.8,
+        readingOrder: 2,
+        bounds: { x: 100, y: 400, width: 600, height: 300 },
+        page: { pageIndex: 1, pageNumber: 2, width: 1818, height: 2422 },
+      },
+    ],
+  },
+  metrics: { processingTimeMs: 100, pagesProcessed: 2 },
+}
+
+function mockClient(payload: unknown): { client: DwsApiClient; post: ReturnType<typeof vi.fn> } {
+  const post = vi.fn().mockResolvedValue({ data: Readable.from([JSON.stringify(payload)]) })
+  return { client: { post } as unknown as DwsApiClient, post }
+}
+
+function text(result: CallToolResult): string {
+  return result.content.map((c) => (c.type === 'text' ? c.text : '')).join('\n')
+}
+
+let sandboxDir: string
+let counter = 0
+
+beforeEach(async () => {
+  counter += 1
+  sandboxDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'extract-test-'))
+  await setSandboxDirectory(sandboxDir)
+})
+
+afterEach(async () => {
+  await fs.promises.rm(sandboxDir, { recursive: true, force: true })
+})
+
+async function writeInput(): Promise<string> {
+  const name = `input-${counter}.pdf`
+  await fs.promises.writeFile(path.join(sandboxDir, name), 'dummy pdf bytes')
+  return name
+}
+
+function extractArgs(overrides: Partial<DataExtractorArgs>): DataExtractorArgs {
+  return {
+    filePath: overrides.filePath ?? `input-${counter}.pdf`,
+    mode: overrides.mode ?? 'understand',
+    format: overrides.format,
+    includeWords: overrides.includeWords ?? false,
+    language: overrides.language,
+    outputPath: overrides.outputPath,
+  }
+}
+
+describe('performExtractCall', () => {
+  it('returns markdown output inline', async () => {
+    const input = await writeInput()
+    const { client, post } = mockClient({ output: { markdown: '# Hello World' } })
+
+    const result = await performExtractCall(extractArgs({ filePath: input, mode: 'text', format: 'markdown' }), client)
+
+    expect(result.isError).toBeFalsy()
+    expect(text(result)).toBe('# Hello World')
+    expect(post).toHaveBeenCalledOnce()
+  })
+
+  it('writes spatial output to a file and returns a content-free summary', async () => {
+    const input = await writeInput()
+    const outName = `out-${counter}.json`
+    const { client } = mockClient(spatialFixture)
+
+    const result = await performExtractCall(
+      extractArgs({ filePath: input, mode: 'structure', format: 'spatial', outputPath: outName }),
+      client,
+    )
+
+    expect(result.isError).toBeFalsy()
+    const summary = text(result)
+    // Summary reports structure, not content.
+    expect(summary).toContain('Extracted 3 elements')
+    expect(summary).toContain('keyValueRegion: 1')
+    expect(summary).toContain('Low-confidence elements')
+    // The PII must NOT leak into the inline summary...
+    expect(summary).not.toContain(SECRET)
+    // ...but the full data IS persisted to the file.
+    const written = await fs.promises.readFile(path.join(sandboxDir, outName), 'utf-8')
+    expect(written).toContain(SECRET)
+  })
+
+  it('rejects spatial output without an outputPath, before any API call', async () => {
+    const input = await writeInput()
+    const { client, post } = mockClient(spatialFixture)
+
+    const result = await performExtractCall(
+      extractArgs({ filePath: input, mode: 'structure', format: 'spatial' }),
+      client,
+    )
+
+    expect(result.isError).toBe(true)
+    expect(text(result)).toContain('outputPath')
+    expect(post).not.toHaveBeenCalled()
+  })
+
+  it('rejects text mode with spatial output', async () => {
+    const input = await writeInput()
+    const { client, post } = mockClient(spatialFixture)
+
+    const result = await performExtractCall(
+      extractArgs({ filePath: input, mode: 'text', format: 'spatial', outputPath: `out-${counter}.json` }),
+      client,
+    )
+
+    expect(result.isError).toBe(true)
+    expect(text(result)).toContain('text mode')
+    expect(post).not.toHaveBeenCalled()
+  })
+
+  it('contains an outside-sandbox absolute outputPath within the sandbox', async () => {
+    const input = await writeInput()
+    const escape = path.join(os.tmpdir(), `escape-${counter}.json`)
+    const { client } = mockClient(spatialFixture)
+
+    const result = await performExtractCall(
+      extractArgs({ filePath: input, mode: 'structure', format: 'spatial', outputPath: escape }),
+      client,
+    )
+
+    // The sandbox re-roots the absolute path inside the sandbox rather than
+    // writing to the literal location, so nothing escapes.
+    expect(result.isError).toBeFalsy()
+    await expect(fs.promises.access(escape)).rejects.toThrow()
+  })
+
+  it('returns a clear setup error when the extraction client is not configured', async () => {
+    const input = await writeInput()
+
+    const result = await performExtractCall(
+      extractArgs({ filePath: input, mode: 'text', format: 'markdown' }),
+      undefined,
+    )
+
+    expect(result.isError).toBe(true)
+    expect(text(result)).toContain('NUTRIENT_EXTRACTION_API_KEY')
+  })
+})
+
+describe('performQueryCall', () => {
+  async function writeFixture(): Promise<string> {
+    const name = `extraction-${counter}.json`
+    await fs.promises.writeFile(path.join(sandboxDir, name), JSON.stringify(spatialFixture))
+    return name
+  }
+
+  function queryArgs(overrides: Partial<QueryExtractionArgs>): QueryExtractionArgs {
+    return {
+      filePath: overrides.filePath ?? `extraction-${counter}.json`,
+      pages: overrides.pages,
+      region: overrides.region,
+      minConfidence: overrides.minConfidence,
+      elementTypes: overrides.elementTypes,
+      limit: overrides.limit ?? 100,
+    }
+  }
+
+  it('filters by minConfidence', async () => {
+    const file = await writeFixture()
+    const result = await performQueryCall(queryArgs({ filePath: file, minConfidence: 0.9 }))
+
+    expect(result.isError).toBeFalsy()
+    const out = text(result)
+    expect(out).toContain('1 matching element')
+    expect(out).toContain('Quarterly Report')
+    expect(out).not.toContain(SECRET)
+  })
+
+  it('filters by element type', async () => {
+    const file = await writeFixture()
+    const result = await performQueryCall(queryArgs({ filePath: file, elementTypes: ['table'] }))
+
+    expect(text(result)).toContain('1 matching element')
+    expect(text(result)).toContain('"type": "table"')
+  })
+
+  it('filters by page index', async () => {
+    const file = await writeFixture()
+    const result = await performQueryCall(queryArgs({ filePath: file, pages: [1] }))
+
+    const out = text(result)
+    expect(out).toContain('1 matching element')
+    expect(out).toContain('"pageIndex": 1')
+  })
+
+  it('filters by region intersection', async () => {
+    const file = await writeFixture()
+    // Region overlaps only the Title element at (100,50,400,35); the key-value and table elements sit lower.
+    const region = { x: 90, y: 40, width: 50, height: 50 }
+    const out = text(await performQueryCall(queryArgs({ filePath: file, region })))
+    // Assert exclusion as well as inclusion, so a filter that ignores `region` cannot pass.
+    expect(out).toContain('1 matching element')
+    expect(out).toContain('Quarterly Report')
+    expect(out).not.toContain(SECRET)
+  })
+
+  it('truncates to limit with guidance', async () => {
+    const file = await writeFixture()
+    const result = await performQueryCall(queryArgs({ filePath: file, limit: 1 }))
+
+    const out = text(result)
+    expect(out).toContain('Showing the first 1 of 3 matches')
+    expect(out).toContain('Narrow the filters')
+  })
+
+  it('errors on a file that is not a spatial extraction result', async () => {
+    const name = `bad-${counter}.json`
+    await fs.promises.writeFile(path.join(sandboxDir, name), JSON.stringify({ not: 'an extraction' }))
+
+    const result = await performQueryCall(queryArgs({ filePath: name }))
+
+    expect(result.isError).toBe(true)
+    expect(text(result)).toContain('output.elements')
+  })
+})

From 4a11a99c38803a116d16a81a556e2c319f17c013 Mon Sep 17 00:00:00 2001
From: "Jonathan D. Rhyne" <jonathan@pspdfkit.com>
Date: Sun, 7 Jun 2026 13:08:02 -0700
Subject: [PATCH 10/12] docs: document data_extractor + query_extraction and
 the Data Extraction key
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add both tools to the Available Tools table, a Data Extraction section (modes,
cost per page, spatial-vs-markdown, file+query workflow, transcript caveat),
NUTRIENT_EXTRACTION_API_KEY to the env table and .env.example, and point the
document_processor capability row at the dedicated tool.

🔮 View transcript: https://nutrient-agentlogs.dev/s/duk4x9tr3rmlnxta7c4bwk6o
---
 .env.example |   4 ++
 README.md    | 111 +++++++++++++++++++++++++++++++--------------------
 2 files changed, 71 insertions(+), 44 deletions(-)

diff --git a/.env.example b/.env.example
index bc70500..849cb00 100644
--- a/.env.example
+++ b/.env.example
@@ -1,2 +1,6 @@
 # Used in testing
+# Processor API key (build/sign/redact/credits tools)
 NUTRIENT_DWS_API_KEY=your-nutrient-dws-api-key
+
+# Separate Data Extraction API key (data_extractor tool). Starts with pdf_live_ / pdf_test_.
+NUTRIENT_EXTRACTION_API_KEY=your-nutrient-data-extraction-api-key
diff --git a/README.md b/README.md
index a749746..aef0564 100644
--- a/README.md
+++ b/README.md
@@ -74,9 +74,9 @@ Open Settings → Developer → Edit Config, then add:
         // "C:\\your\\sandbox\\directory" for Windows
         // Optional for CI or headless usage:
         // "NUTRIENT_DWS_API_KEY": "YOUR_API_KEY_HERE"
-      }
-    }
-  }
+      },
+    },
+  },
 }
 ```
 
@@ -98,9 +98,9 @@ Create `.cursor/mcp.json` in your project root:
         // "C:\\your\\project\\documents" for Windows
         // Optional for CI or headless usage:
         // "NUTRIENT_DWS_API_KEY": "YOUR_API_KEY_HERE"
-      }
-    }
-  }
+      },
+    },
+  },
 }
 ```
 
@@ -122,9 +122,9 @@ Add to `~/.codeium/windsurf/mcp_config.json`:
         // "C:\\your\\sandbox\\directory" for Windows
         // Optional for CI or headless usage:
         // "NUTRIENT_DWS_API_KEY": "YOUR_API_KEY_HERE"
-      }
-    }
-  }
+      },
+    },
+  },
 }
 ```
 
@@ -146,9 +146,9 @@ Create `.vscode/mcp.json` in your project, or add the same server definition to
         "SANDBOX_PATH": "${workspaceFolder}",
         // Optional for CI or headless usage:
         // "NUTRIENT_DWS_API_KEY": "YOUR_API_KEY_HERE"
-      }
-    }
-  }
+      },
+    },
+  },
 }
 ```
 
@@ -178,28 +178,50 @@ Place documents in your sandbox directory and use explicit file names or paths i
 
 ## Available Tools
 
-| Tool | Description |
-| ---- | ----------- |
-| `document_processor` | Document processing for conversions, OCR, extraction, watermarking, rotation, annotation flattening, and redaction workflows |
-| `document_signer` | PDF signing with CMS / PKCS#7 and CAdES signatures plus visible or invisible appearance options |
-| `ai_redactor` | AI redaction for detecting and permanently removing sensitive content such as names, addresses, SSNs, emails, and custom criteria |
-| `check_credits` | Read-only account lookup for current DWS credits and usage. No document content is uploaded |
-| `sandbox_file_tree` | Read-only view of files inside the configured sandbox directory |
-| `directory_tree` | Read-only view of local files when sandbox mode is disabled. Sandbox mode is strongly recommended |
+| Tool                 | Description                                                                                                                                  |
+| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- |
+| `document_processor` | Document processing for conversions, OCR, watermarking, rotation, annotation flattening, and redaction workflows                             |
+| `data_extractor`     | Structured data extraction (DWS Data Extraction API): typed JSON elements with bounding boxes and confidence, or whole-document Markdown     |
+| `query_extraction`   | Read-only query over a saved extraction file — filter elements by page, region, confidence, or type without re-extracting or calling the API |
+| `document_signer`    | PDF signing with CMS / PKCS#7 and CAdES signatures plus visible or invisible appearance options                                              |
+| `ai_redactor`        | AI redaction for detecting and permanently removing sensitive content such as names, addresses, SSNs, emails, and custom criteria            |
+| `check_credits`      | Read-only account lookup for current DWS credits and usage. No document content is uploaded                                                  |
+| `sandbox_file_tree`  | Read-only view of files inside the configured sandbox directory                                                                              |
+| `directory_tree`     | Read-only view of local files when sandbox mode is disabled. Sandbox mode is strongly recommended                                            |
 
 ### Document Processor Capabilities
 
-| Feature           | Description                                                                                       |
-| ----------------- | ------------------------------------------------------------------------------------------------- |
-| Document Creation | Merge PDFs, Office docs (DOCX, XLSX, PPTX), and images into a single document                     |
-| Format Conversion | PDF ↔ DOCX, images (PNG, JPEG, WebP), PDF/A, PDF/UA, HTML, Markdown                               |
-| Editing           | Watermark (text/image), rotate pages, flatten annotations                                         |
-| Security          | Redact sensitive data (SSNs, credit cards, emails, etc.), password protection, permission control |
-| Data Extraction   | Extract text, tables, or key-value pairs as structured JSON                                       |
-| OCR               | Multi-language optical character recognition for scanned documents                                |
-| Optimization      | Compress and linearize PDFs without quality loss                                                  |
-| Annotations       | Import XFDF annotations, flatten annotations                                                      |
-| Digital Signing   | PAdES-compliant CMS and CAdES digital signatures (via document_signer tool)                       |
+| Feature           | Description                                                                                                                               |
+| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
+| Document Creation | Merge PDFs, Office docs (DOCX, XLSX, PPTX), and images into a single document                                                             |
+| Format Conversion | PDF ↔ DOCX, images (PNG, JPEG, WebP), PDF/A, PDF/UA, HTML, Markdown                                                                       |
+| Editing           | Watermark (text/image), rotate pages, flatten annotations                                                                                 |
+| Security          | Redact sensitive data (SSNs, credit cards, emails, etc.), password protection, permission control                                         |
+| Data Extraction   | Now a dedicated tool — see [Data Extraction](#data-extraction) (`data_extractor`) for typed JSON/Markdown with coordinates and confidence |
+| OCR               | Multi-language optical character recognition for scanned documents                                                                        |
+| Optimization      | Compress and linearize PDFs without quality loss                                                                                          |
+| Annotations       | Import XFDF annotations, flatten annotations                                                                                              |
+| Digital Signing   | PAdES-compliant CMS and CAdES digital signatures (via document_signer tool)                                                               |
+
+### Data Extraction
+
+The `data_extractor` tool wraps the standalone [DWS Data Extraction API](https://www.nutrient.io/guides/dws-data-extraction/). It authenticates with a **separate** `NUTRIENT_EXTRACTION_API_KEY` (it starts with `pdf_live_` or `pdf_test_`), independent of the Processor `NUTRIENT_DWS_API_KEY`. The companion `query_extraction` tool only reads a saved extraction file locally; it does not call the API and needs no key.
+
+`data_extractor` runs one of four processing modes:
+
+| Mode                   | Output              | OCR                | Cost per page |
+| ---------------------- | ------------------- | ------------------ | ------------- |
+| `text`                 | Markdown only       | No                 | 1 credit      |
+| `structure`            | Spatial or Markdown | Yes                | 1.5 credits   |
+| `understand` (default) | Spatial or Markdown | Yes (AI-augmented) | 9 credits     |
+| `agentic`              | Spatial or Markdown | Yes (VLM)          | 18 credits    |
+
+- **Spatial** output returns typed elements (paragraphs, tables, key-value regions, formulas, pictures, handwriting) with bounding boxes, confidence scores, and reading order. Because the element list can be large, it is written to `outputPath` and the tool returns a content-free summary (element counts, low-confidence flags, page geometry).
+- **Markdown** output returns whole-document Markdown inline — useful for RAG and search indexing.
+
+Use `query_extraction` to pull just the elements you need from a saved spatial file — filter by `pages`, `region` (bounding box), `minConfidence`, or `elementTypes` — so coordinates and values enter the conversation only when you ask for them.
+
+> **Note:** Extracted content returned inline (Markdown output, or `query_extraction` results) enters the conversation and may be logged by the host. For sensitive documents, prefer spatial output to a file plus scoped `query_extraction` calls.
 
 ## Usage Examples
 
@@ -277,24 +299,25 @@ Processed files are saved to a location determined by the AI. To guide output pl
 
 The server authenticates to the Nutrient DWS API (`https://api.nutrient.io`) using one of:
 
-| Method | When | Config |
-|--------|------|--------|
-| **API key** | `NUTRIENT_DWS_API_KEY` is set | Static key passed as Bearer token to DWS API |
-| **OAuth browser flow** | No API key set | Opens browser for Nutrient OAuth consent on the first request that uses the Nutrient API, caches token locally |
+| Method                 | When                          | Config                                                                                                         |
+| ---------------------- | ----------------------------- | -------------------------------------------------------------------------------------------------------------- |
+| **API key**            | `NUTRIENT_DWS_API_KEY` is set | Static key passed as Bearer token to DWS API                                                                   |
+| **OAuth browser flow** | No API key set                | Opens browser for Nutrient OAuth consent on the first request that uses the Nutrient API, caches token locally |
 
 When no API key is configured, the server stays connected and opens a browser-based OAuth flow on the first request that uses the Nutrient API (similar to `gh auth login`). Tokens are cached at `$XDG_CONFIG_HOME/nutrient/credentials.json` or `~/.config/nutrient/credentials.json` and refreshed automatically.
 
 ### Environment Variables
 
-| Variable               | Required    | Description                                                                                  |
-| ---------------------- | ----------- | -------------------------------------------------------------------------------------------- |
-| `NUTRIENT_DWS_API_KEY` | No*         | Nutrient DWS API key ([get one free](https://dashboard.nutrient.io/sign_up/))               |
-| `SANDBOX_PATH`         | Recommended | Directory to restrict file operations to                                                    |
-| `AUTH_SERVER_URL`      | No          | OAuth server base URL (default: `https://api.nutrient.io`)                                 |
-| `CLIENT_ID`            | No          | OAuth client ID. Skips DCR and enables refresh token reuse when set                         |
-| `DWS_API_BASE_URL`     | No          | DWS API base URL (default: `https://api.nutrient.io`)                                      |
-| `LOG_LEVEL`            | No          | Winston logger level (`info` default). Logs are written to `MCP_LOG_FILE` in stdio mode     |
-| `MCP_LOG_FILE`         | No          | Override log file path (default: system temp directory)                                     |
+| Variable                      | Required    | Description                                                                                                                   |
+| ----------------------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| `NUTRIENT_DWS_API_KEY`        | No\*        | Nutrient DWS **Processor** API key ([get one free](https://dashboard.nutrient.io/sign_up/))                                   |
+| `NUTRIENT_EXTRACTION_API_KEY` | No          | Nutrient DWS **Data Extraction** API key (separate key, starts with `pdf_live_`). Required only for the `data_extractor` tool |
+| `SANDBOX_PATH`                | Recommended | Directory to restrict file operations to                                                                                      |
+| `AUTH_SERVER_URL`             | No          | OAuth server base URL (default: `https://api.nutrient.io`)                                                                    |
+| `CLIENT_ID`                   | No          | OAuth client ID. Skips DCR and enables refresh token reuse when set                                                           |
+| `DWS_API_BASE_URL`            | No          | DWS API base URL (default: `https://api.nutrient.io`)                                                                         |
+| `LOG_LEVEL`                   | No          | Winston logger level (`info` default). Logs are written to `MCP_LOG_FILE` in stdio mode                                       |
+| `MCP_LOG_FILE`                | No          | Override log file path (default: system temp directory)                                                                       |
 
 \* If omitted, the server uses an OAuth browser flow to authenticate with the Nutrient API.
 

From 08c217aec8a78ab99cb0e6d311e59a6669133a90 Mon Sep 17 00:00:00 2001
From: "Jonathan D. Rhyne" <jonathan@pspdfkit.com>
Date: Sun, 7 Jun 2026 13:08:57 -0700
Subject: [PATCH 11/12] docs(example): add extract -> query -> act
 dynamic-workflow walkthrough
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

examples/invoice-extraction-workflow.md demonstrates data_extractor (spatial
to file) -> query_extraction (low-confidence + region slices) -> act via
ai_redactor/document_signer, keeping the full payload out of context.

🔮 View transcript: https://nutrient-agentlogs.dev/s/duk4x9tr3rmlnxta7c4bwk6o
---
 README.md                               |  2 +
 examples/invoice-extraction-workflow.md | 79 +++++++++++++++++++++++++
 2 files changed, 81 insertions(+)
 create mode 100644 examples/invoice-extraction-workflow.md

diff --git a/README.md b/README.md
index aef0564..dd03763 100644
--- a/README.md
+++ b/README.md
@@ -223,6 +223,8 @@ Use `query_extraction` to pull just the elements you need from a saved spatial f
 
 > **Note:** Extracted content returned inline (Markdown output, or `query_extraction` results) enters the conversation and may be logged by the host. For sensitive documents, prefer spatial output to a file plus scoped `query_extraction` calls.
 
+For a worked extract → query → act walkthrough, see [examples/invoice-extraction-workflow.md](examples/invoice-extraction-workflow.md).
+
 ## Usage Examples
 
 These examples assume your files live inside the configured sandbox and that you use explicit paths.
diff --git a/examples/invoice-extraction-workflow.md b/examples/invoice-extraction-workflow.md
new file mode 100644
index 0000000..9458253
--- /dev/null
+++ b/examples/invoice-extraction-workflow.md
@@ -0,0 +1,79 @@
+# Dynamic workflow: extract → query → act
+
+This example shows how an AI agent chains the Data Extraction tools with the
+existing document tools to process an invoice **without ever loading the full
+extraction into context**. It is the pattern dynamic workflows are built on:
+extract structured data, branch on it, then act.
+
+**Prerequisites**
+
+- `NUTRIENT_EXTRACTION_API_KEY` (Data Extraction API key, starts with `pdf_live_`) for `data_extractor`.
+- `NUTRIENT_DWS_API_KEY` (or OAuth) for the `ai_redactor` / `document_signer` "act" steps.
+- `SANDBOX_PATH` set to a directory containing `invoice.pdf`.
+
+## Step 1 — Extract structured elements to a file
+
+The agent calls `data_extractor` in `understand` mode with spatial output. The
+element list (with coordinates and confidence) is written to a file; only a
+compact summary comes back.
+
+```jsonc
+// tool: data_extractor
+{ "filePath": "invoice.pdf", "mode": "understand", "format": "spatial", "outputPath": "invoice.elements.json" }
+```
+
+```
+Extracted 142 elements across 2 page(s) and wrote the full spatial JSON to invoice.elements.json (38217 bytes).
+Element types: paragraph: 96, table: 2, keyValueRegion: 18, picture: 1.
+Low-confidence elements (confidence < 0.6): 7.
+Retrieve specific elements with query_extraction ...
+```
+
+The agent now knows the shape of the document — and that **7 elements are
+low-confidence** — without 142 elements entering the conversation.
+
+## Step 2 — Branch on the result with `query_extraction`
+
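+The summary flagged low-confidence elements, so the agent pulls the key-value regions with their confidence scores to decide whether the document needs human review
+(`minConfidence` is a lower bound, so `0` keeps the low-confidence elements in the result):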
+
+```jsonc
+// tool: query_extraction
+{ "filePath": "invoice.elements.json", "minConfidence": 0, "elementTypes": ["keyValueRegion"], "limit": 50 }
+```
+
+It can also grab a specific region — e.g. the totals box in the bottom-right of
+page 2 — to read the amount due:
+
+```jsonc
+// tool: query_extraction
+{ "filePath": "invoice.elements.json", "pages": [1], "region": { "x": 1200, "y": 2000, "width": 600, "height": 400 } }
+```
+
+Only the handful of elements the agent actually needs — with their text and
+coordinates — enter context.
+
+## Step 3 — Act with the existing tools
+
+Branching on what it found, the agent acts:
+
+- **Low-confidence or sensitive fields →** redact before sharing:
+
+  ```jsonc
+  // tool: ai_redactor
+  { "filePath": "invoice.pdf", "criteria": "Bank account and routing numbers", "outputPath": "invoice-redacted.pdf" }
+  ```
+
+- **Clean and approved →** sign it:
+
+  ```jsonc
+  // tool: document_signer
+  { "filePath": "invoice.pdf", "outputPath": "invoice-signed.pdf", "signatureOptions": { "signatureType": "cms" } }
+  ```
+
+## Why this is the workflow primitive
+
+The agent reasons over **structure and coordinates** (counts, confidence,
+regions) rather than a wall of text, retrieves only the slices it needs, and
+hands off to deterministic document operations. The large, sensitive payload
+stays on disk; the conversation stays small and auditable.

From 14e77a8c137f0aa8b247c056d8a06ca2a4f72027 Mon Sep 17 00:00:00 2001
From: "Jonathan D. Rhyne" <jonathan@pspdfkit.com>
Date: Sun, 7 Jun 2026 13:26:36 -0700
Subject: [PATCH 12/12] =?UTF-8?q?fix(extract):=20address=20code=20review?=
 =?UTF-8?q?=20=E2=80=94=20markdown-to-file,=20spatial=20response=20guard,?=
 =?UTF-8?q?=20leaner=20write?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Honor outputPath for markdown output (large docs would overflow context inline)
- Reject 2xx responses lacking output.elements before writing, so a non-extraction
  body can't overwrite the target file
- Write the raw response body for spatial output instead of re-stringifying
  (drops a copy of large payloads, preserves all API fields)
- Extract writeToResolvedPath helper (de-dupes mkdir-p), drop redundant
  includeWords coalesce and the resolvedOutputPath casts
- Add tests for markdown-to-file and the non-spatial-body guard

🔮 View transcript: https://nutrient-agentlogs.dev/s/duk4x9tr3rmlnxta7c4bwk6o
---
 README.md             |  2 +-
 src/dws/extract.ts    | 48 +++++++++++++++++++++++++++++++------------
 src/index.ts          |  2 +-
 tests/extract.test.ts | 34 ++++++++++++++++++++++++++++++
 4 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index dd03763..37b9865 100644
--- a/README.md
+++ b/README.md
@@ -217,7 +217,7 @@ The `data_extractor` and `query_extraction` tools wrap the standalone [DWS Data
 | `agentic`              | Spatial or Markdown | Yes (VLM)          | 18 credits    |
 
 - **Spatial** output returns typed elements (paragraphs, tables, key-value regions, formulas, pictures, handwriting) with bounding boxes, confidence scores, and reading order. Because the element list can be large, it is written to `outputPath` and the tool returns a content-free summary (element counts, low-confidence flags, page geometry).
-- **Markdown** output returns whole-document Markdown inline — useful for RAG and search indexing.
+- **Markdown** output returns whole-document Markdown inline, or writes it to `outputPath` when provided (recommended for large documents) — useful for RAG and search indexing.
 
 Use `query_extraction` to pull just the elements you need from a saved spatial file — filter by `pages`, `region` (bounding box), `minConfidence`, or `elementTypes` — so coordinates and values enter the conversation only when you ask for them.
 
diff --git a/src/dws/extract.ts b/src/dws/extract.ts
index 062aafe..c1c8850 100644
--- a/src/dws/extract.ts
+++ b/src/dws/extract.ts
@@ -72,6 +72,17 @@ function summarizeSpatial(response: ExtractionResponse, outputPath: string, byte
   ].join('\n')
 }
 
+/** Writes `data` to `resolvedPath`; if the parent directory is not accessible, it is first created recursively. */
+async function writeToResolvedPath(resolvedPath: string, data: string): Promise<void> {
+  const outputDir = path.dirname(resolvedPath)
+  try {
+    await fs.promises.access(outputDir)
+  } catch { // access() rejected, so the parent directory is created (with any missing ancestors)
+    await fs.promises.mkdir(outputDir, { recursive: true })
+  }
+  await fs.promises.writeFile(resolvedPath, data)
+}
+
 /**
  * Calls the Nutrient DWS Data Extraction API (`POST /extraction/parse`).
  *
@@ -105,9 +116,10 @@ export async function performExtractCall(
     )
   }
 
-  // Resolve the output path first (fail early on a sandbox escape, before any API call).
+  // Resolve any provided output path first (fail early on a sandbox escape,
+  // before the API call). Required for spatial, optional for markdown.
   let resolvedOutputPath: string | undefined
-  if (format === 'spatial' && outputPath) {
+  if (outputPath) {
     try {
       resolvedOutputPath = await resolveWriteFilePath(outputPath)
     } catch (error) {
@@ -129,7 +141,7 @@ export async function performExtractCall(
 
   const instructions: Record<string, unknown> = {
     mode,
-    output: format === 'spatial' ? { format, includeWords: includeWords ?? false } : { format },
+    output: format === 'spatial' ? { format, includeWords } : { format },
   }
   if (language && mode !== 'text') {
     instructions.options = { language }
@@ -155,20 +167,30 @@ export async function performExtractCall(
       if (typeof markdown !== 'string') {
         return createErrorResponse('Error: the Data Extraction API did not return markdown output.')
       }
+      // Honor outputPath for markdown too — a large document returned inline
+      // would overflow the conversation. Only return inline when no path given.
+      if (resolvedOutputPath) {
+        await writeToResolvedPath(resolvedOutputPath, markdown)
+        return createSuccessResponse(`Wrote ${Buffer.byteLength(markdown)} bytes of Markdown to ${resolvedOutputPath}.`)
+      }
       return createSuccessResponse(markdown)
     }
 
-    // Spatial: write the full result to disk, return a content-free summary.
-    const outputDir = path.dirname(resolvedOutputPath as string)
-    try {
-      await fs.promises.access(outputDir)
-    } catch {
-      await fs.promises.mkdir(outputDir, { recursive: true })
+    // Spatial. The early guard guarantees outputPath was provided.
+    if (!resolvedOutputPath) {
+      return createErrorResponse('Error: spatial output requires outputPath.')
     }
-    const json = JSON.stringify(parsed, null, 2)
-    await fs.promises.writeFile(resolvedOutputPath as string, json)
-
-    return createSuccessResponse(summarizeSpatial(parsed, resolvedOutputPath as string, Buffer.byteLength(json)))
+    // Guard against a 2xx response that is not a spatial result, so we never
+    // overwrite the target file with a non-extraction body.
+    if (!Array.isArray(parsed.output?.elements)) {
+      return createErrorResponse(
+        'Error: the Data Extraction API response did not contain a spatial element list (output.elements). Nothing was written.',
+      )
+    }
+    // Write the raw response body: avoids re-serializing a potentially large
+    // payload and preserves every field the API returned.
+    await writeToResolvedPath(resolvedOutputPath, body)
+    return createSuccessResponse(summarizeSpatial(parsed, resolvedOutputPath, Buffer.byteLength(body)))
   } catch (error) {
     return handleApiError(error)
   }
diff --git a/src/index.ts b/src/index.ts
index 14726bb..5ac6df8 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -176,7 +176,7 @@ Returns: subscription type, total credits, used credits, and remaining credits.`
 
 Output formats:
 • spatial — typed elements (paragraphs, tables, key-value pairs, formulas, pictures, handwriting) with bounding boxes, confidence scores, and reading order. Written to outputPath (the list can be large); retrieve slices with the query_extraction tool.
-• markdown — whole-document Markdown, returned inline. Good for RAG and search indexing.
+• markdown — whole-document Markdown. Returned inline, or written to outputPath when provided (recommended for large documents). Good for RAG and search indexing.
 
 Processing modes (cost per page): text = fast Markdown, no OCR (1 credit); structure = OCR spatial (1.5 credits); understand = AI-augmented, default (9 credits); agentic = VLM-augmented (18 credits).
 
diff --git a/tests/extract.test.ts b/tests/extract.test.ts
index 163091c..d25128f 100644
--- a/tests/extract.test.ts
+++ b/tests/extract.test.ts
@@ -101,6 +101,40 @@ describe('performExtractCall', () => {
     expect(post).toHaveBeenCalledOnce()
   })
 
+  it('writes markdown to a file when outputPath is given, returning a summary not the content', async () => {
+    const input = await writeInput()
+    const outName = `out-${counter}.md`
+    const { client } = mockClient({ output: { markdown: '# Big Document\n\nlots of text' } })
+
+    const result = await performExtractCall(
+      extractArgs({ filePath: input, mode: 'text', format: 'markdown', outputPath: outName }),
+      client,
+    )
+
+    expect(result.isError).toBeFalsy()
+    const summary = text(result)
+    expect(summary).toContain('Wrote')
+    expect(summary).toContain(outName)
+    expect(summary).not.toContain('lots of text')
+    const written = await fs.promises.readFile(path.join(sandboxDir, outName), 'utf-8')
+    expect(written).toBe('# Big Document\n\nlots of text')
+  })
+
+  it('rejects a 2xx response with no spatial element list without writing the file', async () => {
+    const input = await writeInput()
+    const outName = `out-${counter}.json`
+    const { client } = mockClient({ status: 200, output: { markdown: 'oops wrong shape' } })
+
+    const result = await performExtractCall(
+      extractArgs({ filePath: input, mode: 'structure', format: 'spatial', outputPath: outName }),
+      client,
+    )
+
+    expect(result.isError).toBe(true)
+    expect(text(result)).toContain('output.elements')
+    await expect(fs.promises.access(path.join(sandboxDir, outName))).rejects.toThrow()
+  })
+
   it('writes spatial output to a file and returns a content-free summary', async () => {
     const input = await writeInput()
     const outName = `out-${counter}.json`