diff --git a/.env.example b/.env.example
index bc70500..849cb00 100644
--- a/.env.example
+++ b/.env.example
@@ -1,2 +1,6 @@
 # Used in testing
+# Processor API key (build/sign/redact/credits tools)
 NUTRIENT_DWS_API_KEY=your-nutrient-dws-api-key
+
+# Separate Data Extraction API key (data_extractor tool). Starts with pdf_live_ / pdf_test_.
+NUTRIENT_EXTRACTION_API_KEY=your-nutrient-data-extraction-api-key
diff --git a/README.md b/README.md
index a749746..37b9865 100644
--- a/README.md
+++ b/README.md
@@ -74,9 +74,9 @@ Open Settings → Developer → Edit Config, then add:
         // "C:\\your\\sandbox\\directory" for Windows
         // Optional for CI or headless usage:
         // "NUTRIENT_DWS_API_KEY": "YOUR_API_KEY_HERE"
-      }
-    }
-  }
+      },
+    },
+  },
 }
 ```
 
@@ -98,9 +98,9 @@ Create `.cursor/mcp.json` in your project root:
         // "C:\\your\\project\\documents" for Windows
         // Optional for CI or headless usage:
         // "NUTRIENT_DWS_API_KEY": "YOUR_API_KEY_HERE"
-      }
-    }
-  }
+      },
+    },
+  },
 }
 ```
 
@@ -122,9 +122,9 @@ Add to `~/.codeium/windsurf/mcp_config.json`:
         // "C:\\your\\sandbox\\directory" for Windows
         // Optional for CI or headless usage:
         // "NUTRIENT_DWS_API_KEY": "YOUR_API_KEY_HERE"
-      }
-    }
-  }
+      },
+    },
+  },
 }
 ```
 
@@ -146,9 +146,9 @@ Create `.vscode/mcp.json` in your project, or add the same server definition to
         "SANDBOX_PATH": "${workspaceFolder}",
         // Optional for CI or headless usage:
         // "NUTRIENT_DWS_API_KEY": "YOUR_API_KEY_HERE"
-      }
-    }
-  }
+      },
+    },
+  },
 }
 ```
 
@@ -178,28 +178,52 @@ Place documents in your sandbox directory and use explicit file names or paths i
 
 ## Available Tools
 
-| Tool | Description |
-| ---- | ----------- |
-| `document_processor` | Document processing for conversions, OCR, extraction, watermarking, rotation, annotation flattening, and redaction workflows |
-| `document_signer` | PDF signing with CMS / PKCS#7 and CAdES signatures plus visible or invisible appearance options |
-| `ai_redactor` | AI redaction for detecting and permanently removing sensitive content such as names, addresses, SSNs, emails, and custom criteria |
-| `check_credits` | Read-only account lookup for current DWS credits and usage. No document content is uploaded |
-| `sandbox_file_tree` | Read-only view of files inside the configured sandbox directory |
-| `directory_tree` | Read-only view of local files when sandbox mode is disabled. Sandbox mode is strongly recommended |
+| Tool                 | Description                                                                                                                                  |
+| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- |
+| `document_processor` | Document processing for conversions, OCR, watermarking, rotation, annotation flattening, and redaction workflows                             |
+| `data_extractor`     | Structured data extraction (DWS Data Extraction API): typed JSON elements with bounding boxes and confidence, or whole-document Markdown     |
+| `query_extraction`   | Read-only query over a saved extraction file — filter elements by page, region, confidence, or type without re-extracting or calling the API |
+| `document_signer`    | PDF signing with CMS / PKCS#7 and CAdES signatures plus visible or invisible appearance options                                              |
+| `ai_redactor`        | AI redaction for detecting and permanently removing sensitive content such as names, addresses, SSNs, emails, and custom criteria            |
+| `check_credits`      | Read-only account lookup for current DWS credits and usage. No document content is uploaded                                                  |
+| `sandbox_file_tree`  | Read-only view of files inside the configured sandbox directory                                                                              |
+| `directory_tree`     | Read-only view of local files when sandbox mode is disabled. Sandbox mode is strongly recommended                                            |
 
 ### Document Processor Capabilities
 
-| Feature           | Description                                                                                       |
-| ----------------- | ------------------------------------------------------------------------------------------------- |
-| Document Creation | Merge PDFs, Office docs (DOCX, XLSX, PPTX), and images into a single document                     |
-| Format Conversion | PDF ↔ DOCX, images (PNG, JPEG, WebP), PDF/A, PDF/UA, HTML, Markdown                               |
-| Editing           | Watermark (text/image), rotate pages, flatten annotations                                         |
-| Security          | Redact sensitive data (SSNs, credit cards, emails, etc.), password protection, permission control |
-| Data Extraction   | Extract text, tables, or key-value pairs as structured JSON                                       |
-| OCR               | Multi-language optical character recognition for scanned documents                                |
-| Optimization      | Compress and linearize PDFs without quality loss                                                  |
-| Annotations       | Import XFDF annotations, flatten annotations                                                      |
-| Digital Signing   | PAdES-compliant CMS and CAdES digital signatures (via document_signer tool)                       |
+| Feature           | Description                                                                                                                               |
+| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
+| Document Creation | Merge PDFs, Office docs (DOCX, XLSX, PPTX), and images into a single document                                                             |
+| Format Conversion | PDF ↔ DOCX, images (PNG, JPEG, WebP), PDF/A, PDF/UA, HTML, Markdown                                                                       |
+| Editing           | Watermark (text/image), rotate pages, flatten annotations                                                                                 |
+| Security          | Redact sensitive data (SSNs, credit cards, emails, etc.), password protection, permission control                                         |
+| Data Extraction   | Now a dedicated tool — see [Data Extraction](#data-extraction) (`data_extractor`) for typed JSON/Markdown with coordinates and confidence |
+| OCR               | Multi-language optical character recognition for scanned documents                                                                        |
+| Optimization      | Compress and linearize PDFs without quality loss                                                                                          |
+| Annotations       | Import XFDF annotations, flatten annotations                                                                                              |
+| Digital Signing   | PAdES-compliant CMS and CAdES digital signatures (via document_signer tool)                                                               |
+
+### Data Extraction
+
+The `data_extractor` tool wraps the standalone [DWS Data Extraction API](https://www.nutrient.io/guides/dws-data-extraction/); `query_extraction` only reads the files it saves and never calls the API. `data_extractor` authenticates with a **separate** `NUTRIENT_EXTRACTION_API_KEY` (it starts with `pdf_live_`, or `pdf_test_` for test keys), independent of the Processor `NUTRIENT_DWS_API_KEY`.
+
+`data_extractor` runs one of four processing modes:
+
+| Mode                   | Output              | OCR                | Cost per page |
+| ---------------------- | ------------------- | ------------------ | ------------- |
+| `text`                 | Markdown only       | No                 | 1 credit      |
+| `structure`            | Spatial or Markdown | Yes                | 1.5 credits   |
+| `understand` (default) | Spatial or Markdown | Yes (AI-augmented) | 9 credits     |
+| `agentic`              | Spatial or Markdown | Yes (VLM)          | 18 credits    |
+
+- **Spatial** output returns typed elements (paragraphs, tables, key-value regions, formulas, pictures, handwriting) with bounding boxes, confidence scores, and reading order. Because the element list can be large, it is written to `outputPath` and the tool returns a content-free summary (element counts, low-confidence flags, page geometry).
+- **Markdown** output returns whole-document Markdown inline, or writes it to `outputPath` when provided (recommended for large documents) — useful for RAG and search indexing.
+
+Use `query_extraction` to pull just the elements you need from a saved spatial file — filter by `pages`, `region` (bounding box), `minConfidence`, or `elementTypes` — so coordinates and values enter the conversation only when you ask for them.
+
+> **Note:** Extracted content returned inline (Markdown output, or `query_extraction` results) enters the conversation and may be logged by the host. For sensitive documents, prefer spatial output to a file plus scoped `query_extraction` calls.
+
+For a worked extract → query → act walkthrough, see [examples/invoice-extraction-workflow.md](examples/invoice-extraction-workflow.md).
 
 ## Usage Examples
 
@@ -277,24 +301,25 @@ Processed files are saved to a location determined by the AI. To guide output pl
 
 The server authenticates to the Nutrient DWS API (`https://api.nutrient.io`) using one of:
 
-| Method | When | Config |
-|--------|------|--------|
-| **API key** | `NUTRIENT_DWS_API_KEY` is set | Static key passed as Bearer token to DWS API |
-| **OAuth browser flow** | No API key set | Opens browser for Nutrient OAuth consent on the first request that uses the Nutrient API, caches token locally |
+| Method                 | When                          | Config                                                                                                         |
+| ---------------------- | ----------------------------- | -------------------------------------------------------------------------------------------------------------- |
+| **API key**            | `NUTRIENT_DWS_API_KEY` is set | Static key passed as Bearer token to DWS API                                                                   |
+| **OAuth browser flow** | No API key set                | Opens browser for Nutrient OAuth consent on the first request that uses the Nutrient API, caches token locally |
 
 When no API key is configured, the server stays connected and opens a browser-based OAuth flow on the first request that uses the Nutrient API (similar to `gh auth login`). Tokens are cached at `$XDG_CONFIG_HOME/nutrient/credentials.json` or `~/.config/nutrient/credentials.json` and refreshed automatically.
 
 ### Environment Variables
 
-| Variable               | Required    | Description                                                                                  |
-| ---------------------- | ----------- | -------------------------------------------------------------------------------------------- |
-| `NUTRIENT_DWS_API_KEY` | No*         | Nutrient DWS API key ([get one free](https://dashboard.nutrient.io/sign_up/))               |
-| `SANDBOX_PATH`         | Recommended | Directory to restrict file operations to                                                    |
-| `AUTH_SERVER_URL`      | No          | OAuth server base URL (default: `https://api.nutrient.io`)                                 |
-| `CLIENT_ID`            | No          | OAuth client ID. Skips DCR and enables refresh token reuse when set                         |
-| `DWS_API_BASE_URL`     | No          | DWS API base URL (default: `https://api.nutrient.io`)                                      |
-| `LOG_LEVEL`            | No          | Winston logger level (`info` default). Logs are written to `MCP_LOG_FILE` in stdio mode     |
-| `MCP_LOG_FILE`         | No          | Override log file path (default: system temp directory)                                     |
+| Variable                      | Required    | Description                                                                                                                   |
+| ----------------------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| `NUTRIENT_DWS_API_KEY`        | No\*        | Nutrient DWS **Processor** API key ([get one free](https://dashboard.nutrient.io/sign_up/))                                   |
+| `NUTRIENT_EXTRACTION_API_KEY` | No          | Nutrient DWS **Data Extraction** API key (separate key, starts with `pdf_live_`). Required only for the `data_extractor` tool |
+| `SANDBOX_PATH`                | Recommended | Directory to restrict file operations to                                                                                      |
+| `AUTH_SERVER_URL`             | No          | OAuth server base URL (default: `https://api.nutrient.io`)                                                                    |
+| `CLIENT_ID`                   | No          | OAuth client ID. Skips DCR and enables refresh token reuse when set                                                           |
+| `DWS_API_BASE_URL`            | No          | DWS API base URL (default: `https://api.nutrient.io`)                                                                         |
+| `LOG_LEVEL`                   | No          | Winston logger level (`info` default). Logs are written to `MCP_LOG_FILE` in stdio mode                                       |
+| `MCP_LOG_FILE`                | No          | Override log file path (default: system temp directory)                                                                       |
 
 \* If omitted, the server uses an OAuth browser flow to authenticate with the Nutrient API.
 
diff --git a/docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md b/docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md
new file mode 100644
index 0000000..85f3cf8
--- /dev/null
+++ b/docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md
@@ -0,0 +1,257 @@
+---
+title: "feat: data_extractor + query_extraction tools (DWS Data Extraction API) and a workflow example"
+status: active
+date: 2026-06-07
+type: feat
+target_repo: nutrient-dws-mcp-server
+base_branch: main
+---
+
+# feat: `data_extractor` + `query_extraction` + a dynamic-workflow example
+
+## Summary
+
+Add the **Data Extraction workflow primitive** to the Nutrient DWS MCP server, targeting the **new standalone DWS Data Extraction API** (`POST https://api.nutrient.io/extraction/parse`) — a separate product with its own key, **not** a `json-content` output of the Processor `/build` endpoint.
+
+- **`data_extractor`** — calls `/extraction/parse` with a `mode` (`text`/`structure`/`understand`/`agentic`) and output `format` (`spatial` elements or `markdown`). Spatial output (typed elements with `bounds`, `confidence`, `readingOrder`, `page`) can be large, so it is written to a file with a decision-grade summary returned inline; markdown is returned inline.
+- **`query_extraction`** — reads a saved spatial-extraction file and returns **filtered element slices inline** (by page, region/bbox, minimum confidence, element type), so an agent can pull actionable coordinates into context on demand.
+- **A dynamic-workflow example** — extract → query low-confidence elements → act with the existing `ai_redactor` / `document_signer`.
+
+Architecture fit: main already has a `DwsApiClient` abstraction (`baseUrl` + `tokenResolver`, `.post(endpoint, data)`). `data_extractor` uses a **second client instance** authenticated with the Data Extraction key (`pdf_live_…`) — no new HTTP plumbing.
+
+**Deferred to their own PRs:** `accessibility_tagger` (the DWS **Accessibility API** is also now standalone and includes auto-tag *and* validation), Viewer.
+
+---
+
+## Problem Frame
+
+DWS is now four separate APIs, each with its own key: **Processor** (`/build`, `NUTRIENT_DWS_API_KEY`), **Data Extraction** (`/extraction/parse`, `pdf_live_…`), **Accessibility**, **Viewer**. The MCP server today only speaks Processor `/build`. Extraction was *previously* reachable as a `json-content` Build output; the dedicated Data Extraction API supersedes that with richer typed elements, confidence, coordinates, and four cost/quality modes.
+
+Authoritative spec (verified on disk at `~/projects/nutrient-website/src/content/guides/dws-data-extraction/`):
+
+- **Endpoint:** `POST https://api.nutrient.io/extraction/parse`. Auth: `Authorization: Bearer pdf_live_…` (separate dashboard key; `pdf_test_…` for testing).
+- **Request:** multipart `file` + `instructions={"mode":…,"output":{"format":…,"includeWords":…}}` (also supports JSON-body-with-URL and raw-binary).
+- **Modes:** `text` (1 cr/pg, markdown only, no OCR), `structure` (1.5 cr/pg, OCR spatial), `understand` (default, 9 cr/pg, AI-augmented), `agentic` (18 cr/pg, VLM).
+- **Output:** `spatial` → `output.elements[]`; `markdown` → `output.markdown`. `text` mode defaults to markdown; others default to spatial.
+- **Spatial element:** `{id, type, role, text, confidence, readingOrder, bounds:{x,y,width,height}, page:{pageIndex,pageNumber,width,height}}`. Types: `paragraph`, `table` (rows/cols/cells w/ per-cell bounds), `formula` (LaTeX), `picture` (alt text), `keyValueRegion`, `handwriting`. Optional `includeWords` adds word-level bounds.
+- **Coordinates:** top-left origin, render-space pixels, `0 ≤ x+width ≤ page.width`.
+- **Response envelope:** `{status, requestId, output:{elements|markdown}, metrics:{processingTimeMs,pagesProcessed}, configuration:{mode,outputFormat}}` — returned as JSON (the client streams it; the handler parses).
+
+Because the schema is fully documented, the build proceeds against it directly; one live call (U0) is **confirmation**, not discovery.
+
+---
+
+## Requirements
+
+- **R1.** `data_extractor` calls `/extraction/parse` exposing `mode`, `output.format`, `includeWords`, `language`, and page selection.
+- **R2.** Spatial results are written to `outputPath`; the inline response is a decision-grade summary with **no extracted document content**. Markdown results return inline. `format: spatial` requires `outputPath`.
+- **R3.** `query_extraction` reads a saved spatial-extraction file and returns filtered element slices inline (page, region/bbox, minConfidence, elementTypes).
+- **R4.** Reuse main's `DwsApiClient`; add a Data Extraction client authenticated by `NUTRIENT_EXTRACTION_API_KEY`. Reuse sandbox path resolution and response/error helpers.
+- **R5.** Respect the sandbox vs. non-sandbox registration model in `addToolsToServer`.
+- **R6.** Any `outputPath`/`filePath` is validated through the sandbox resolver before the API call or file read.
+- **R7.** Surface per-mode **credit cost** in the `data_extractor` description (`understand` = 9 cr/pg) so agents/users don't run up cost unknowingly.
+- **R8.** Ship one runnable dynamic-workflow example (extract → query → act).
+- **R9.** Update README (Available Tools + Features + the new env var) and amend `document_processor`'s description so it no longer advertises standalone extraction.
+- **R10.** Tests cover request construction, spatial→file vs markdown→inline routing, query filtering, sandbox rejection, and key/PII safety — mocked against the documented response shape.
+
+---
+
+## Key Technical Decisions
+
+- **KTD1 — Target the Data Extraction API via a second `DwsApiClient`.** `data_extractor` builds a multipart form (`file` + `instructions`) and calls `extractionClient.post('extraction/parse', form)`, where `extractionClient = createApiClientFromApiKey(getExtractionApiKey())`. Same `baseUrl` (`https://api.nutrient.io`), different token. *(Supersedes the original "wrap /build" decision — Data Extraction is a separate API.)*
+- **KTD2 — Spatial → file + summary; markdown → inline; query for slices.** Spatial `elements[]` can be large; write the parsed JSON to `outputPath`, return a decision-grade summary, and let the agent retrieve slices via `query_extraction`. Markdown is a single blob → inline.
+- **KTD3 — Decision-grade summary, never content.** Inline summary = per-page counts by element `type`/`role`, low-confidence element count (e.g. `confidence < 0.6`), bbox coverage, page count, output path, byte size. No `text` values. (PII boundary; the field names are now known, so counts are reliable.)
+- **KTD4 — Mode + format surface with cost transparency.** Expose all four modes and both formats; default `mode: understand`, `format: spatial`. Validate `text`-mode ⇒ markdown-only. Put credit costs in the tool description (R7).
+- **KTD5 — Separate key + env var.** `getExtractionApiKey()` reads `NUTRIENT_EXTRACTION_API_KEY` (distinct from Processor `NUTRIENT_DWS_API_KEY`); fail with a clear message if unset. Document both keys.
+- **KTD6 — Inline data is transcript-visible.** `data_extractor` markdown/inline output and all `query_extraction` results enter the agent transcript (host/provider may log). Tool descriptions say so; recommend `outputPath` + scoped queries for sensitive docs.
+- **KTD7 — Response is streamed then parsed.** `DwsApiClient.post` uses `responseType: 'stream'`; the extraction handler pipes to a string (`pipeToString`) and `JSON.parse`s, since extraction returns JSON (unlike Build's file streams).
+
+---
+
+## High-Level Technical Design
+
+```mermaid
+flowchart TD
+    A[Agent: data_extractor\nfile + mode + format] --> V[validate outputPath via\nresolveWriteFilePath FIRST]
+    V --> B[multipart form: file + instructions\n mode/output.format/includeWords]
+    B --> C[extractionClient.post\n'extraction/parse']
+    C --> P[pipe stream -> string -> JSON.parse]
+    P --> D{format}
+    D -- markdown --> E[output.markdown inline]
+    D -- spatial --> F[write output.elements to outputPath\n-> decision-grade summary inline\n(per-page type counts, low-conf, page dims; NO text)]
+    F -.-> G[Agent: query_extraction\nfile + page/region/minConfidence/type]
+    G --> H[resolveReadFilePath -> parse -> filter\n-> matching elements inline]
+    H -.-> I[Agent branches -> ai_redactor / document_signer]
+```
+
+*Directional — routing gates and the extract→query→act loop are the design intent; field shapes follow the documented schema and are confirmed in U0.*
+
+---
+
+## Implementation Units
+
+### U0. Verify `/extraction/parse` against the documented schema + capture fixture
+
+**Goal:** Confirm the documented response shape with one live call and record a fixture for tests.
+**Requirements:** KTD7 (de-risks U3/U4).
+**Dependencies:** none for building; the live call needs `NUTRIENT_EXTRACTION_API_KEY`. **Deferred until the user confirms a key** — building proceeds against the documented schema meanwhile.
+**Files:** `tests/fixtures/extraction-spatial-sample.json`, `tests/fixtures/extraction-markdown-sample.json`
+**Approach:** `text` mode (1 credit) for the markdown fixture and `structure` mode (1.5 cr) for a small spatial fixture against `tests/assets/example.pdf`. Save responses verbatim. Confirm field names match `bounds/confidence/page/readingOrder/type/role`.
+**Test scenarios:** none — produces fixtures.
+**Verification:** Fixtures saved; field names match the docs (if any drift, adjust U1/U3/U4).
+
+### U1. Arg schemas
+
+**Goal:** `DataExtractorArgsSchema` + `QueryExtractionArgsSchema`.
+**Requirements:** R1, R2, R3, R6.
+**Dependencies:** none (documented schema).
+**Files:** `src/schemas.ts`
+**Approach:** `DataExtractorArgsSchema`: `filePath` (sandbox read), `mode` enum (default `understand`), `format` enum `spatial|markdown` (default by mode), `includeWords` bool, `language` (string|string[]), `pages` (`PageRangeSchema`), `outputPath` — required when `format: spatial` (`.superRefine`); also refine `text` mode ⇒ `format` must be `markdown`. `QueryExtractionArgsSchema`: `filePath` (the saved spatial JSON), optional `pages`, `region` (`{x,y,width,height}` all required together), `minConfidence` (0–1), `elementTypes` (enum array), `limit` (default cap).
+**Patterns to follow:** `BuildAPIArgsSchema`, `AiRedactArgsSchema` (`.superRefine`), `PageRangeSchema`.
+**Test scenarios:**
+- spatial without `outputPath` → rejected.
+- `text` mode with `format: spatial` → rejected.
+- `language` accepts string and array.
+- query: `minConfidence` outside 0–1 → rejected; partial `region` → rejected.
+**Verification:** `pnpm pretest`; schema unit tests green.
+
+### U2. Data Extraction API client wiring
+
+**Goal:** Provide a `DwsApiClient` authenticated with the Data Extraction key.
+**Requirements:** R4, KTD1, KTD5.
+**Dependencies:** none.
+**Files:** `src/dws/utils.ts` or `src/utils/environment.ts` (add `getExtractionApiKey()`), `src/index.ts` (build the extraction client and thread it into `addToolsToServer` options alongside `apiClient`)
+**Approach:** `getExtractionApiKey()` reads `NUTRIENT_EXTRACTION_API_KEY` and throws a clear error if unset. The server bootstrap builds the client with `createApiClientFromApiKey(getExtractionApiKey())`, but only lazily or when the key exists — calling it unconditionally at startup would throw and break the Processor-only path. Extend the `addToolsToServer`/`createMcpServer` options type with `extractionApiClient: DwsApiClient`; when the key is missing, the extraction tools surface a clear "set NUTRIENT_EXTRACTION_API_KEY" error.
+**Patterns to follow:** `createStdioApiClient`, `createApiClientFromApiKey`, the existing `apiClient` threading in `src/index.ts`.
+**Test scenarios:**
+- `getExtractionApiKey()` throws when env unset; returns the key when set.
+**Verification:** `pnpm pretest`; server boots with and without the extraction key (tools register; calling without key errors clearly).
+
+### U3. `data_extractor` handler
+
+**Goal:** Call `/extraction/parse`, route spatial→file / markdown→inline, summarize safely.
+**Requirements:** R1, R2, R6, R7, KTD2, KTD3, KTD7.
+**Dependencies:** U1, U2 (and U0 fixture for tests).
+**Files:** `src/dws/extract.ts` (new; module-private `summarizeSpatial` helper), reuse `pipeToString` from `src/dws/utils.ts`
+**Approach:** `performExtractCall(args, extractionApiClient)`. If `format: spatial`, validate `outputPath` via `resolveWriteFilePath` **first**. Resolve `filePath` via `resolveReadFilePath`, read buffer, build `FormData` (`file` + `instructions` JSON). `await extractionApiClient.post('extraction/parse', form)`; `pipeToString` → `JSON.parse`. Markdown → return `output.markdown` inline. Spatial → write `output` (or `output.elements`) to the resolved path; return `summarizeSpatial(output)` (KTD3 fields only). Errors → `createErrorResponse`; ensure no `Authorization`/key leaks (axios error `config` stripped).
+**Patterns to follow:** `performBuildCall` structure, `processFileReference` file-read approach, `handleApiError`, `createSuccessResponse`/`createErrorResponse`.
+**Test scenarios:**
+- markdown mode → inline string from `output.markdown` (mocked).
+- spatial mode + `outputPath` → file written; summary string has counts + path, and asserts a known text value from the fixture is **absent** inline.
+- `outputPath` outside sandbox → rejected before any network call.
+- API error → `createErrorResponse`; assert no `Bearer`/key in the message.
+- missing extraction key → clear "set NUTRIENT_EXTRACTION_API_KEY" error.
+**Verification:** `pnpm test` green.
+
+### U4. `query_extraction` handler
+
+**Goal:** Return filtered element slices inline from a saved spatial file.
+**Requirements:** R3, R6, KTD6.
+**Dependencies:** U0 fixture, U1.
+**Files:** `src/dws/extract.ts` (or `src/dws/query.ts`)
+**Approach:** `performQueryCall(args)`. `resolveReadFilePath(filePath)`, read + parse. Filter `output.elements` by `pages` (`element.page.pageIndex`), `region` (bbox intersection with `element.bounds`), `minConfidence` (`element.confidence`), `elementTypes` (`element.type`). Return up to `limit` matches inline; if more matched, note the truncation and suggest narrowing. Defensive field access with a clear error if the file isn't a recognized extraction document.
+**Patterns to follow:** `resolveReadFilePath`, `createSuccessResponse`/`createErrorResponse`.
+**Test scenarios:**
+- `minConfidence: 0.9` → only high-confidence elements (against fixture).
+- `region` bbox → only intersecting elements.
+- `pages: [0]` → only page-0 elements.
+- `elementTypes: ['table']` → only tables.
+- malformed/non-extraction file → `createErrorResponse`.
+- match set > `limit` → truncated with guidance.
+- file outside sandbox → rejected.
+**Verification:** `pnpm test` green.
+
+### U5. Register tools + de-advertise extraction on `document_processor`
+
+**Goal:** Wire both tools into the server.
+**Requirements:** R1, R3, R5, R7, R9.
+**Dependencies:** U3, U4.
+**Files:** `src/index.ts`
+**Approach:** Two `server.tool(...)` registrations: `data_extractor` receives `extractionApiClient`, while `query_extraction` needs no API client because it only reads local files. Descriptions note: spatial output → file + `query_extraction`; per-mode credit cost; transcript caveat (KTD6). Amend `document_processor`'s description to drop standalone "JSON extraction" and point to `data_extractor`.
+**Patterns to follow:** existing `server.tool` blocks and the `addToolsToServer` options threading.
+**Test scenarios:**
+- Test expectation: none beyond handler tests — registration is wiring.
+**Verification:** `pnpm build`; server registers both tools; `document_processor` no longer double-advertises extraction.
+
+### U6. Dynamic-workflow example
+
+**Goal:** One runnable artifact: extract → query → act.
+**Requirements:** R8.
+**Dependencies:** U5.
+**Files:** `examples/invoice-extraction-workflow/` (script + notes), `README.md` ("Dynamic workflows" section)
+**Approach:** `data_extractor` (spatial → file) → `query_extraction` (`minConfidence` to find shaky fields) → branch → act via `ai_redactor`/`document_signer`. Live "act" steps use the **Processor** key; gate the runnable script behind both keys and exclude from `pnpm test`.
+**Test scenarios:** none — example/doc.
+**Verification:** Walkthrough runs once end-to-end against live keys.
+
+### U7. Tests
+
+**Goal:** Cover both handlers against the documented/fixture schema.
+**Requirements:** R10.
+**Dependencies:** U3, U4 (U0 fixtures).
+**Files:** `tests/extract.test.ts`, `tests/query.test.ts`; reuse `tests/fixtures/extraction-*-sample.json`, `tests/assets/example.pdf`. Inline example objects.
+**Approach:** Mock `DwsApiClient.post` to return a stream of the fixture; assert routing, summary-without-content, key-redaction, sandbox rejection, and query filters.
+**Execution note:** Start from a failing test asserting the spatial→file summary contains no element `text` (security-critical).
+**Verification:** `pnpm test`, `pnpm lint`, `pnpm format` clean.
+
+### U8. Docs
+
+**Goal:** Document tools, the new env var, and costs.
+**Requirements:** R7, R9.
+**Dependencies:** U5, U6.
+**Files:** `README.md`, `.env.example`
+**Approach:** Add `data_extractor` + `query_extraction` to Available Tools; add a Data Extraction feature row (modes, spatial/markdown, coords+confidence, file+query); add `NUTRIENT_EXTRACTION_API_KEY` to the env table + `.env.example`; note per-mode credits; ensure `document_processor` row no longer implies it's the extraction path.
+**Test scenarios:** none — docs.
+**Verification:** Tool names/descriptions match registrations (grep parity).
+
+---
+
+## Scope Boundaries
+
+**In scope:** `data_extractor`, `query_extraction`, the Data Extraction client wiring, one workflow example, tests, README/.env updates, `document_processor` description fix.
+
+### Deferred to Follow-Up Work
+- **`accessibility_tagger`** — DWS **Accessibility API** is now standalone (auto-tag *and* validation, own key); own PR.
+- **Viewer tool** — own key; low value for headless workflows.
+- **JSON-body-with-URL / raw-binary inputs** to `/extraction/parse` — start with multipart file upload; add URL input if needed.
+- **`agentic` cost guardrails** beyond surfacing cost in the description.
+
+---
+
+## System-Wide Impact
+
+- **New env var** `NUTRIENT_EXTRACTION_API_KEY` (separate from `NUTRIENT_DWS_API_KEY`). Documented; extraction tools error clearly if unset, Processor tools unaffected.
+- **Additive** — no breaking change; `document_processor` keeps capability (description-only change).
+- **Sandbox** covers the new file read (`query_extraction`, source PDF) and write (`data_extractor` spatial output).
+- **Transcript exposure** (KTD6) documented.
+- **Cost:** `understand` (default) = 9 credits/page; surfaced in the description (R7).
+
+---
+
+## Risks & Dependencies
+
+- **R-A (low, mitigated): live response drift from docs.** *Mitigation:* U0 fixture confirms before relying on it; defensive field access.
+- **R-B (medium): default mode cost.** `understand` at 9 cr/pg can surprise. *Mitigation:* cost in description (R7); consider defaulting to `structure` — open question below.
+- **R-C (medium): PII in transcript** via markdown/inline and query results. *Mitigation:* KTD3 (no content in summaries) + KTD6 warning + a test asserting no element `text` leaks in the spatial summary.
+- **R-D (low): two keys confuse setup.** *Mitigation:* clear env table, `.env.example`, and unset-key errors.
+
+## Open Questions (resolve during execution)
+
+- Default mode: `understand` (richest, 9 cr/pg, matches API default) vs `structure` (1.5 cr/pg) for a cheaper default? Leaning toward honoring the API default (`understand`) but surfacing cost.
+
+---
+
+## Verification Strategy
+
+Local (no GitHub Actions in this repo):
+- `pnpm pretest`, `pnpm test`, `pnpm lint`, `pnpm format`.
+- U0: one live `text`/`structure` call to capture fixtures (needs key; deferred to user).
+- U6: full extract→query→act once against live Extraction + Processor keys.
+- Per project AGENTS rules: branch off `main` → Conventional Commits → PR into `main`; never push to `main`; report exact command + exit 0 before claiming done.
+
+---
+
+## Sources & Research
+
+- **Authoritative, on disk:** `~/projects/nutrient-website/src/content/guides/dws-data-extraction/` — `getting-started.mdoc`, `api-overview.mdoc`, `parsing/processing-modes.mdoc`, `parsing/coordinate-spaces.mdoc`, `llms.txt`. Endpoint `POST /extraction/parse`, Bearer `pdf_live_…`, modes/formats, element schema, coordinate system.
+- Repo (authoritative for wiring): `src/dws/client.ts` (`DwsApiClient`, `createApiClientFromApiKey`, `.post`), `src/index.ts` (`createMcpServer`/`addToolsToServer` apiClient threading), `src/dws/build.ts`, `src/dws/utils.ts` (`pipeToString`, `handleApiError`), `src/fs/sandbox.ts`.
+- Plan review (2026-06-07, 6 personas) + two user corrections establishing the separate-API/separate-key reality.
diff --git a/examples/invoice-extraction-workflow.md b/examples/invoice-extraction-workflow.md
new file mode 100644
index 0000000..9458253
--- /dev/null
+++ b/examples/invoice-extraction-workflow.md
@@ -0,0 +1,79 @@
+# Dynamic workflow: extract → query → act
+
+This example shows how an AI agent chains the Data Extraction tools with the
+existing document tools to process an invoice **without ever loading the full
+extraction into context**. It is the pattern dynamic workflows are built on:
+extract structured data, branch on it, then act.
+
+**Prerequisites**
+
+- `NUTRIENT_EXTRACTION_API_KEY` (Data Extraction API key, starts with `pdf_live_` or `pdf_test_`) for `data_extractor`.
+- `NUTRIENT_DWS_API_KEY` (or OAuth) for the `ai_redactor` / `document_signer` "act" steps.
+- `SANDBOX_PATH` set to a directory containing `invoice.pdf`.
+
+## Step 1 — Extract structured elements to a file
+
+The agent calls `data_extractor` in `understand` mode with spatial output. The
+element list (with coordinates and confidence) is written to a file; only a
+compact summary comes back.
+
+```jsonc
+// tool: data_extractor
+{ "filePath": "invoice.pdf", "mode": "understand", "format": "spatial", "outputPath": "invoice.elements.json" }
+```
+
+```
+Extracted 142 elements across 2 page(s) and wrote the full spatial JSON to invoice.elements.json (38217 bytes).
+Element types: paragraph: 121, table: 2, keyValueRegion: 18, picture: 1.
+Low-confidence elements (confidence < 0.6): 7.
+Retrieve specific elements with query_extraction ...
+```
+
+The agent now knows the shape of the document — and that **7 fields are
+low-confidence** — without 142 elements entering the conversation.
+
+## Step 2 — Branch on the result with `query_extraction`
+
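+The summary flagged low-confidence elements, so the agent pulls every scored key-value region (`minConfidence` is a lower bound, so `0` keeps the low-confidence ones) and
+reads each element's `confidence` to decide whether the document needs human review: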
+
+```jsonc
+// tool: query_extraction
+{ "filePath": "invoice.elements.json", "minConfidence": 0, "elementTypes": ["keyValueRegion"], "limit": 50 }
+```
+
+It can also grab a specific region — e.g. the totals box in the bottom-right of
+page 2 (`pages` is 0-based, so page 2 is index `1`) — to read the amount due:
+
+```jsonc
+// tool: query_extraction
+{ "filePath": "invoice.elements.json", "pages": [1], "region": { "x": 1200, "y": 2000, "width": 600, "height": 400 } }
+```
+
+Only the handful of elements the agent actually needs — with their text and
+coordinates — enter context.
+
+## Step 3 — Act with the existing tools
+
+Branching on what it found, the agent acts:
+
+- **Low-confidence or sensitive fields →** redact before sharing:
+
+  ```jsonc
+  // tool: ai_redactor
+  { "filePath": "invoice.pdf", "criteria": "Bank account and routing numbers", "outputPath": "invoice-redacted.pdf" }
+  ```
+
+- **Clean and approved →** sign it:
+
+  ```jsonc
+  // tool: document_signer
+  { "filePath": "invoice.pdf", "outputPath": "invoice-signed.pdf", "signatureOptions": { "signatureType": "cms" } }
+  ```
+
+## Why this is the workflow primitive
+
+The agent reasons over **structure and coordinates** (counts, confidence,
+regions) rather than a wall of text, retrieves only the slices it needs, and
+hands off to deterministic document operations. The large, sensitive payload
+stays on disk; the conversation stays small and auditable.
diff --git a/src/dws/extract.ts b/src/dws/extract.ts
new file mode 100644
index 0000000..c1c8850
--- /dev/null
+++ b/src/dws/extract.ts
@@ -0,0 +1,268 @@
+import FormData from 'form-data'
+import fs from 'fs'
+import path from 'path'
+import { CallToolResult } from '@modelcontextprotocol/sdk/types.js'
+import { DwsApiClient } from './client.js'
+import { DataExtractorArgs, QueryExtractionArgs } from '../schemas.js'
+import { resolveReadFilePath, resolveWriteFilePath } from '../fs/sandbox.js'
+import { pipeToString, handleApiError } from './utils.js'
+import { createSuccessResponse, createErrorResponse } from '../responses.js'
+
+const EXTRACTION_ENDPOINT = 'extraction/parse'
+const LOW_CONFIDENCE_THRESHOLD = 0.6
+
+/** A single spatial element from the Data Extraction API (`output.format: spatial`). */
+type SpatialElement = {
+  type?: string
+  role?: string
+  confidence?: number
+  bounds?: { x: number; y: number; width: number; height: number }
+  page?: { pageIndex?: number; pageNumber?: number; width?: number; height?: number }
+}
+
+/** Parsed `/extraction/parse` response (the fields this server reads). */
+type ExtractionResponse = {
+  output?: { elements?: SpatialElement[]; markdown?: string }
+  metrics?: { pagesProcessed?: number }
+}
+
+/** An explicit `format` wins; otherwise text mode maps to markdown and every other mode to spatial. */
+function resolveFormat(mode: DataExtractorArgs['mode'], format: DataExtractorArgs['format']): 'spatial' | 'markdown' {
+  const fallback = mode === 'text' ? 'markdown' : 'spatial'
+  // Any falsy `format` (in practice: undefined, i.e. not supplied) falls
+  // through to the mode-based default.
+  return format || fallback
+}
+
+/**
+ * Render the inline, content-free summary for a spatial extraction result.
+ *
+ * Reports aggregates only — per-type element counts, how many elements score
+ * below the low-confidence threshold, the page count, and the file the full
+ * result was written to. Extracted document text is intentionally left out so
+ * it never reaches the agent transcript; `query_extraction` retrieves it.
+ */
+function summarizeSpatial(response: ExtractionResponse, outputPath: string, byteLength: number): string {
+  const items = response.output?.elements ?? []
+  const countsByType: Record<string, number> = {}
+  const seenPages = new Set<number>()
+
+  for (const { type, page } of items) {
+    const key = type ?? 'unknown'
+    countsByType[key] = (countsByType[key] ?? 0) + 1
+    if (typeof page?.pageIndex === 'number') {
+      seenPages.add(page.pageIndex)
+    }
+  }
+  const belowThreshold = items.filter(
+    ({ confidence }) => typeof confidence === 'number' && confidence < LOW_CONFIDENCE_THRESHOLD,
+  ).length
+
+  const pageCount = response.metrics?.pagesProcessed ?? seenPages.size
+  const typeSummary = Object.entries(countsByType)
+    .map(([name, total]) => `${name}: ${total}`)
+    .join(', ')
+
+  const lines = [
+    `Extracted ${items.length} elements across ${pageCount} page(s) and wrote the full spatial JSON to ${outputPath} (${byteLength} bytes).`,
+    `Element types: ${typeSummary || 'none'}.`,
+    `Low-confidence elements (confidence < ${LOW_CONFIDENCE_THRESHOLD}): ${belowThreshold}.`,
+    `Retrieve specific elements with query_extraction (filter by page, region, minConfidence, or elementTypes). The document content is not included here.`,
+  ]
+  return lines.join('\n')
+}
+
+/**
+ * Writes `data` to `resolvedPath`, creating missing parent directories first.
+ *
+ * `mkdir` with `recursive: true` resolves without error when the directory
+ * already exists, so no separate existence probe is needed.
+ */
+async function writeToResolvedPath(resolvedPath: string, data: string): Promise<void> {
+  await fs.promises.mkdir(path.dirname(resolvedPath), { recursive: true })
+  await fs.promises.writeFile(resolvedPath, data)
+}
+
+/**
+ * Calls the Data Extraction API (`POST /extraction/parse`) for `args.filePath`. Validation and input-file
+ * failures are returned as error responses; errors from the API call or the write go through `handleApiError`.
+ * Spatial: the raw JSON body is written to `outputPath` and a content-free summary is returned.
+ * Markdown: written to `outputPath` when given, otherwise returned inline.
+ */
+export async function performExtractCall(
+  args: DataExtractorArgs,
+  extractionApiClient: DwsApiClient | undefined,
+): Promise<CallToolResult> {
+  if (!extractionApiClient) {
+    return createErrorResponse(
+      'Error: Data Extraction is not configured. Set the NUTRIENT_EXTRACTION_API_KEY environment variable ' +
+        '(a Data Extraction API key from the Nutrient dashboard, starting with pdf_live_ or pdf_test_).',
+    )
+  }
+
+  const { filePath, mode, language, includeWords, outputPath } = args
+  const format = resolveFormat(mode, args.format)
+
+  if (mode === 'text' && format === 'spatial') {
+    return createErrorResponse(
+      'Error: text mode only supports markdown output. Use a different mode for spatial output.',
+    )
+  }
+
+  if (format === 'spatial' && !outputPath) {
+    return createErrorResponse(
+      'Error: spatial output requires outputPath — the element list can be large and is written to a file, ' +
+        'then queried with query_extraction.',
+    )
+  }
+
+  // Resolve any provided output path first (fail early on a sandbox escape,
+  // before the API call). Required for spatial, optional for markdown.
+  let resolvedOutputPath: string | undefined
+  if (outputPath) {
+    try {
+      resolvedOutputPath = await resolveWriteFilePath(outputPath)
+    } catch (error) {
+      return createErrorResponse(`Error: ${error instanceof Error ? error.message : String(error)}`)
+    }
+  }
+
+  let fileBuffer: Buffer
+  let fileName: string
+  try {
+    const resolvedInputPath = await resolveReadFilePath(filePath)
+    fileBuffer = await fs.promises.readFile(resolvedInputPath)
+    fileName = path.basename(resolvedInputPath)
+  } catch (error) {
+    return createErrorResponse(
+      `Error with input file ${filePath}: ${error instanceof Error ? error.message : String(error)}`,
+    )
+  }
+
+  const instructions: Record<string, unknown> = {
+    mode,
+    output: format === 'spatial' ? { format, includeWords } : { format },
+  }
+  if (language && mode !== 'text') {
+    instructions.options = { language }
+  }
+
+  try {
+    const form = new FormData()
+    form.append('file', fileBuffer, { filename: fileName })
+    form.append('instructions', JSON.stringify(instructions))
+
+    const response = await extractionApiClient.post(EXTRACTION_ENDPOINT, form)
+    const body = await pipeToString(response.data)
+
+    let parsed: ExtractionResponse
+    try {
+      // A bare JSON `null` parses without throwing; coerce it to {} so the shape checks below reject it cleanly.
+      parsed = (JSON.parse(body) ?? {}) as ExtractionResponse
+    } catch {
+      return createErrorResponse('Error: the Data Extraction API returned a response that could not be parsed as JSON.')
+    }
+
+    if (format === 'markdown') {
+      const markdown = parsed.output?.markdown
+      if (typeof markdown !== 'string') {
+        return createErrorResponse('Error: the Data Extraction API did not return markdown output.')
+      }
+      // Honor outputPath for markdown too, so a large document need not be returned inline.
+      if (resolvedOutputPath) {
+        await writeToResolvedPath(resolvedOutputPath, markdown)
+        return createSuccessResponse(`Wrote ${Buffer.byteLength(markdown)} bytes of Markdown to ${resolvedOutputPath}.`)
+      }
+      return createSuccessResponse(markdown)
+    }
+
+    // Spatial. The early guard guarantees outputPath was provided.
+    if (!resolvedOutputPath) {
+      return createErrorResponse('Error: spatial output requires outputPath.')
+    }
+    // Guard against a 2xx response that is not a spatial result, so we never
+    // overwrite the target file with a non-extraction body.
+    if (!Array.isArray(parsed.output?.elements)) {
+      return createErrorResponse(
+        'Error: the Data Extraction API response did not contain a spatial element list (output.elements). Nothing was written.',
+      )
+    }
+    // Write the raw response body: avoids re-serializing a potentially large
+    // payload and preserves every field the API returned.
+    await writeToResolvedPath(resolvedOutputPath, body)
+    return createSuccessResponse(summarizeSpatial(parsed, resolvedOutputPath, Buffer.byteLength(body)))
+  } catch (error) {
+    return handleApiError(error)
+  }
+}
+
+/** True when element `bounds` overlap or touch the query `region`; an element without bounds never matches. */
+function intersects(bounds: SpatialElement['bounds'], region: NonNullable<QueryExtractionArgs['region']>): boolean {
+  if (!bounds) {
+    return false
+  }
+  const elementRight = bounds.x + bounds.width
+  const elementBottom = bounds.y + bounds.height
+  const apartHorizontally = elementRight < region.x || bounds.x > region.x + region.width
+  const apartVertically = elementBottom < region.y || bounds.y > region.y + region.height
+  return !(apartHorizontally || apartVertically)
+}
+
+/**
+ * Returns, inline, the elements of a `data_extractor` spatial file that pass every supplied filter, capped at `limit`.
+ * Unreadable, non-JSON, or non-spatial files (including a bare JSON `null`) produce an error response.
+ */
+export async function performQueryCall(args: QueryExtractionArgs): Promise<CallToolResult> {
+  const { filePath, pages, region, minConfidence, elementTypes, limit } = args
+
+  let parsed: ExtractionResponse
+  try {
+    const resolvedPath = await resolveReadFilePath(filePath)
+    const body = await fs.promises.readFile(resolvedPath, 'utf-8')
+    parsed = (JSON.parse(body) ?? {}) as ExtractionResponse
+  } catch (error) {
+    return createErrorResponse(
+      `Error reading extraction file ${filePath}: ${error instanceof Error ? error.message : String(error)}`,
+    )
+  }
+
+  const elements = parsed.output?.elements
+  if (!Array.isArray(elements)) {
+    return createErrorResponse(
+      'Error: this file does not look like a spatial extraction result (no output.elements array). ' +
+        'Produce one with data_extractor using format: spatial.',
+    )
+  }
+
+  const pageSet = pages && pages.length > 0 ? new Set(pages) : undefined
+  const typeSet = elementTypes && elementTypes.length > 0 ? new Set<string>(elementTypes) : undefined
+
+  const matches = elements.filter((element) => {
+    if (pageSet && (typeof element.page?.pageIndex !== 'number' || !pageSet.has(element.page.pageIndex))) {
+      return false
+    }
+    if (typeSet && (typeof element.type !== 'string' || !typeSet.has(element.type))) {
+      return false
+    }
+    if (
+      typeof minConfidence === 'number' &&
+      !(typeof element.confidence === 'number' && element.confidence >= minConfidence)
+    ) {
+      return false
+    }
+    if (region && !intersects(element.bounds, region)) {
+      return false
+    }
+    return true
+  })
+
+  const limited = matches.slice(0, limit)
+  const truncatedNote =
+    matches.length > limited.length
+      ? `\n\nShowing the first ${limited.length} of ${matches.length} matches. Narrow the filters (page, region, minConfidence, elementTypes) to see the rest.`
+      : ''
+
+  return createSuccessResponse(
+    `${limited.length} matching element(s):\n${JSON.stringify(limited, null, 2)}${truncatedNote}`,
+  )
+}
diff --git a/src/index.ts b/src/index.ts
index 78dff1b..5ac6df8 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -14,10 +14,13 @@ import {
   AiRedactArgsSchema,
   BuildAPIArgsSchema,
   CheckCreditsArgsSchema,
+  DataExtractorArgsSchema,
   DirectoryTreeArgsSchema,
+  QueryExtractionArgsSchema,
   SignAPIArgsSchema,
 } from './schemas.js'
 import { performBuildCall } from './dws/build.js'
+import { performExtractCall, performQueryCall } from './dws/extract.js'
 import { performSignCall } from './dws/sign.js'
 import { performAiRedactCall } from './dws/ai-redact.js'
 import { performCheckCreditsCall } from './dws/credits.js'
@@ -36,8 +39,9 @@ function addToolsToServer(options: {
   server: McpServer
   sandboxEnabled: boolean
   apiClient: DwsApiClient
+  extractionApiClient?: DwsApiClient
 }) {
-  const { server, sandboxEnabled, apiClient } = options
+  const { server, sandboxEnabled, apiClient, extractionApiClient } = options
 
   server.tool(
     'document_processor',
@@ -51,7 +55,9 @@ Features:
 • Watermarking (text/image)
 • Redaction creation and application
 
-Output formats: PDF, PDF/A, images (PNG, JPEG, WebP), JSON extraction, Office (DOCX, XLSX, PPTX)`,
+Output formats: PDF, PDF/A, images (PNG, JPEG, WebP), Office (DOCX, XLSX, PPTX)
+
+For structured data extraction (typed JSON or Markdown with bounding boxes and confidence scores), use the dedicated data_extractor tool instead.`,
     BuildAPIArgsSchema.shape,
     {
       title: 'Nutrient Document Processor',
@@ -164,6 +170,62 @@ Returns: subscription type, total credits, used credits, and remaining credits.`
     },
   )
 
+  server.tool(
+    'data_extractor',
+    `Extract structured data from a document using the Nutrient DWS Data Extraction API. Reads the input file from the local file system or sandbox (if enabled).
+
+Output formats:
+• spatial — typed elements (paragraphs, tables, key-value pairs, formulas, pictures, handwriting) with bounding boxes, confidence scores, and reading order. Written to outputPath (the list can be large); retrieve slices with the query_extraction tool.
+• markdown — whole-document Markdown. Returned inline, or written to outputPath when provided (recommended for large documents). Good for RAG and search indexing.
+
+Processing modes (cost per page): text = fast Markdown, no OCR (1 credit); structure = OCR spatial (1.5 credits); understand = AI-augmented, default (9 credits); agentic = VLM-augmented (18 credits).
+
+Note: markdown output and any extracted content are returned into this conversation and may be logged by the host. For sensitive documents, prefer spatial output to a file plus scoped query_extraction calls.`,
+    DataExtractorArgsSchema.shape,
+    {
+      title: 'Nutrient Data Extractor',
+      readOnlyHint: false,
+      destructiveHint: true,
+      idempotentHint: false,
+      openWorldHint: true,
+    },
+    async (args) => {
+      try {
+        return await performExtractCall(args, extractionApiClient)
+      } catch (error) {
+        return createErrorResponse(`Error: ${error instanceof Error ? error.message : String(error)}`)
+      }
+    },
+  )
+
+  server.tool(
+    'query_extraction',
+    `Query a spatial extraction file previously produced by data_extractor and return the matching elements inline. Reads the file from the local file system or sandbox (if enabled); does not call the Nutrient API.
+
+Filter by any combination of:
+• pages — 0-based page indices
+• region — a bounding box {x, y, width, height} in render-space pixels (top-left origin); returns elements whose bounds intersect it
+• minConfidence — only elements at or above this confidence (0-1)
+• elementTypes — paragraph, table, formula, picture, keyValueRegion, handwriting
+
+Use this to pull just the elements you need (e.g. low-confidence fields, or everything in a table region) instead of loading the whole extraction. Returned elements include their text and coordinates, which enter this conversation.`,
+    QueryExtractionArgsSchema.shape,
+    {
+      title: 'Nutrient Extraction Query',
+      readOnlyHint: true,
+      destructiveHint: false,
+      idempotentHint: true,
+      openWorldHint: false,
+    },
+    async (args) => {
+      try {
+        return await performQueryCall(args)
+      } catch (error) {
+        return createErrorResponse(`Error: ${error instanceof Error ? error.message : String(error)}`)
+      }
+    },
+  )
+
   if (sandboxEnabled) {
     server.tool(
       'sandbox_file_tree',
@@ -195,7 +257,11 @@ Returns: subscription type, total credits, used credits, and remaining credits.`
   }
 }
 
-export function createMcpServer(options: { sandboxEnabled: boolean; apiClient: DwsApiClient }) {
+export function createMcpServer(options: {
+  sandboxEnabled: boolean
+  apiClient: DwsApiClient
+  extractionApiClient?: DwsApiClient
+}) {
   const server = new McpServer(
     {
       name: 'nutrient-dws-mcp-server',
@@ -213,11 +279,28 @@ export function createMcpServer(options: { sandboxEnabled: boolean; apiClient: D
     server,
     sandboxEnabled: options.sandboxEnabled,
     apiClient: options.apiClient,
+    extractionApiClient: options.extractionApiClient,
   })
 
   return server
 }
 
+/**
+ * Creates the client for the Data Extraction API from the environment's
+ * extraction key and DWS base URL. With no extraction key configured it yields
+ * undefined, and data_extractor then reports its "not configured" error.
+ */
+function createExtractionApiClient(environment: Environment): DwsApiClient | undefined {
+  const { extractionApiKey: apiKey, dwsApiBaseUrl: baseUrl } = environment
+  if (!apiKey) {
+    return undefined
+  }
+  return createApiClient({
+    apiKey,
+    baseUrl,
+  })
+}
+
 async function parseCommandLineArgs() {
   const args = process.argv.slice(2)
   const sandboxDir = parseSandboxPath(args, process.env.SANDBOX_PATH) || null
@@ -283,10 +366,12 @@ export async function runServer(environment: Environment): Promise<RunServerResu
   })
 
   const apiClient = createStdioApiClient(environment)
+  const extractionApiClient = createExtractionApiClient(environment)
 
   const server = createMcpServer({
     sandboxEnabled,
     apiClient,
+    extractionApiClient,
   })
 
   const transport = new StdioServerTransport()
diff --git a/src/schemas.ts b/src/schemas.ts
index dbd43a3..d064753 100644
--- a/src/schemas.ts
+++ b/src/schemas.ts
@@ -549,3 +549,105 @@ export type Action = z.infer<typeof BuildActionSchema>
 export type SignAPIArgs = z.infer<typeof SignAPIArgsSchema>
 export type SignatureOptions = z.infer<typeof CreateDigitalSignatureSchema>
 export type AiRedactArgs = z.infer<typeof AiRedactArgsSchema>
+
+// ----- Data Extraction API (POST /extraction/parse) -----
+//
+// Cross-field rules (spatial format requires outputPath; text mode supports
+// markdown only) are enforced in the handler rather than via a top-level
+// `.superRefine`, because tools are registered with `Schema.shape`, which only
+// exists on a plain ZodObject (a refined schema would be a ZodEffects).
+
+export const ExtractionModeSchema = z
+  .enum(['text', 'structure', 'understand', 'agentic'])
+  .describe(
+    'Processing mode (cost/quality trade-off). ' +
+      'text: fast Markdown from digital-born documents, no OCR (1 credit/page). ' +
+      'structure: OCR-based spatial elements (1.5 credits/page). ' +
+      'understand: AI-augmented spatial extraction, the default (9 credits/page). ' +
+      'agentic: VLM-augmented for the most complex documents (18 credits/page).',
+  )
+
+export const ExtractionFormatSchema = z
+  .enum(['spatial', 'markdown'])
+  .describe(
+    'Output format. spatial: typed elements with bounding boxes, confidence, and reading order — written to outputPath and queried with query_extraction. ' +
+      'markdown: whole-document Markdown returned inline. text mode supports markdown only; other modes default to spatial.',
+  )
+
+export const ExtractionElementTypeSchema = z.enum([
+  'paragraph',
+  'table',
+  'formula',
+  'picture',
+  'keyValueRegion',
+  'handwriting',
+])
+
+export const DataExtractorArgsSchema = z.object({
+  filePath: z
+    .string()
+    .describe(
+      'Path to the document to extract from (PDF, image, or Office file). Resolves to sandbox path if enabled, otherwise resolves to the local file system.',
+    ),
+  mode: ExtractionModeSchema.optional().default('understand'),
+  format: ExtractionFormatSchema.optional().describe(
+    'Output format. Defaults to markdown for text mode and spatial for all other modes.',
+  ),
+  includeWords: z
+    .boolean()
+    .optional()
+    .default(false)
+    .describe('Include word-level bounding boxes in spatial output. Ignored for markdown output.'),
+  language: z
+    .union([z.string(), z.array(z.string())])
+    .optional()
+    .describe(
+      'OCR language(s) — full name (e.g. "german"), ISO code (e.g. "deu"), or array for multilingual docs. ' +
+        'Only applies to structure/understand/agentic modes; ignored for text mode.',
+    ),
+  outputPath: z
+    .string()
+    .optional()
+    .describe(
+      'Where to write the output. Required for the spatial format (the JSON element list can be large and is kept out of the conversation; retrieve slices of it with query_extraction). ' +
+        'Optional for markdown: when provided, the Markdown is written there instead of being returned inline. Resolves to sandbox path if enabled.',
+    ),
+})
+
+export const QueryExtractionArgsSchema = z.object({
+  filePath: z
+    .string()
+    .describe(
+      'Path to a spatial extraction JSON file previously produced by data_extractor. Resolves to sandbox path if enabled.',
+    ),
+  pages: z
+    .array(z.number().int().nonnegative())
+    .optional()
+    .describe('Only return elements on these 0-based page indices.'),
+  region: z
+    .object({
+      x: z.number().describe('Left edge in render-space pixels (top-left origin).'),
+      y: z.number().describe('Top edge in render-space pixels.'),
+      width: z.number().positive().describe('Region width in render-space pixels.'),
+      height: z.number().positive().describe('Region height in render-space pixels.'),
+    })
+    .optional()
+    .describe('Only return elements whose bounding box intersects this region.'),
+  minConfidence: z
+    .number()
+    .min(0)
+    .max(1)
+    .optional()
+    .describe('Only return elements with confidence greater than or equal to this value (0-1).'),
+  elementTypes: z.array(ExtractionElementTypeSchema).optional().describe('Only return elements of these types.'),
+  limit: z
+    .number()
+    .int()
+    .positive()
+    .optional()
+    .default(100)
+    .describe('Maximum number of elements to return inline. Narrow the filters if results are truncated.'),
+})
+
+export type DataExtractorArgs = z.infer<typeof DataExtractorArgsSchema>
+export type QueryExtractionArgs = z.infer<typeof QueryExtractionArgsSchema>
diff --git a/src/utils/environment.ts b/src/utils/environment.ts
index 534bd30..723d148 100644
--- a/src/utils/environment.ts
+++ b/src/utils/environment.ts
@@ -2,6 +2,7 @@ import { z } from 'zod'
 
 export type Environment = {
   nutrientApiKey?: string
+  extractionApiKey?: string
   dwsApiBaseUrl: string
   authServerUrl: string
   clientId?: string
@@ -9,6 +10,9 @@ export type Environment = {
 
 const RawEnvironmentSchema = z.object({
   NUTRIENT_DWS_API_KEY: z.string().optional(),
+  // Separate key for the standalone DWS Data Extraction API (POST /extraction/parse).
+  // Distinct from the Processor key above; starts with `pdf_live_` / `pdf_test_`.
+  NUTRIENT_EXTRACTION_API_KEY: z.string().optional(),
   DWS_API_BASE_URL: z.string().url().default('https://api.nutrient.io'),
   AUTH_SERVER_URL: z
     .string()
@@ -26,6 +30,7 @@ export function getEnvironment(rawEnv: NodeJS.ProcessEnv = process.env): Environ
 
   return {
     nutrientApiKey: raw.NUTRIENT_DWS_API_KEY,
+    extractionApiKey: raw.NUTRIENT_EXTRACTION_API_KEY,
     dwsApiBaseUrl: raw.DWS_API_BASE_URL,
     authServerUrl: raw.AUTH_SERVER_URL,
     clientId: raw.CLIENT_ID,
diff --git a/tests/environment.test.ts b/tests/environment.test.ts
index 8a3f0d4..e82aa18 100644
--- a/tests/environment.test.ts
+++ b/tests/environment.test.ts
@@ -10,6 +10,19 @@ describe('environment', () => {
     expect(environment.authServerUrl).toBe('https://api.nutrient.io')
   })
 
+  it('parses the separate Data Extraction API key', () => {
+    const environment = getEnvironment({ NUTRIENT_EXTRACTION_API_KEY: 'pdf_live_abc123' })
+
+    expect(environment.extractionApiKey).toBe('pdf_live_abc123')
+    expect(environment.nutrientApiKey).toBeUndefined()
+  })
+
+  it('leaves the extraction key undefined when unset', () => {
+    const environment = getEnvironment({ NUTRIENT_DWS_API_KEY: 'dws-key' })
+
+    expect(environment.extractionApiKey).toBeUndefined()
+  })
+
   it('allows overriding DWS API base URL', () => {
     const environment = getEnvironment({ DWS_API_BASE_URL: 'http://localhost:4000' })
 
diff --git a/tests/extract.test.ts b/tests/extract.test.ts
new file mode 100644
index 0000000..d25128f
--- /dev/null
+++ b/tests/extract.test.ts
@@ -0,0 +1,292 @@
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
+import fs from 'fs'
+import os from 'os'
+import path from 'path'
+import { Readable } from 'stream'
+import { setSandboxDirectory } from '../src/fs/sandbox.js'
+import { performExtractCall, performQueryCall } from '../src/dws/extract.js'
+import type { DwsApiClient } from '../src/dws/client.js'
+import type { DataExtractorArgs, QueryExtractionArgs } from '../src/schemas.js'
+import type { CallToolResult } from '@modelcontextprotocol/sdk/types.js'
+
+// A recognizable "PII" string used to prove extracted content never appears in
+// the inline spatial summary (it must only live in the written file).
+const SECRET = 'SSN 123-45-6789'
+
+const spatialFixture = {
+  status: 200,
+  requestId: 'req_test',
+  output: {
+    elements: [
+      {
+        id: '1',
+        type: 'paragraph',
+        role: 'Title',
+        text: 'Quarterly Report',
+        confidence: 0.95,
+        readingOrder: 0,
+        bounds: { x: 100, y: 50, width: 400, height: 35 },
+        page: { pageIndex: 0, pageNumber: 1, width: 1818, height: 2422 },
+      },
+      {
+        id: '2',
+        type: 'keyValueRegion',
+        text: SECRET,
+        confidence: 0.4,
+        readingOrder: 1,
+        bounds: { x: 100, y: 200, width: 300, height: 20 },
+        page: { pageIndex: 0, pageNumber: 1, width: 1818, height: 2422 },
+      },
+      {
+        id: '3',
+        type: 'table',
+        confidence: 0.8,
+        readingOrder: 2,
+        bounds: { x: 100, y: 400, width: 600, height: 300 },
+        page: { pageIndex: 1, pageNumber: 2, width: 1818, height: 2422 },
+      },
+    ],
+  },
+  metrics: { processingTimeMs: 100, pagesProcessed: 2 },
+}
+
+function mockClient(payload: unknown): { client: DwsApiClient; post: ReturnType<typeof vi.fn> } {
+  const post = vi.fn().mockResolvedValue({ data: Readable.from([JSON.stringify(payload)]) })
+  return { client: { post } as unknown as DwsApiClient, post }
+}
+
+function text(result: CallToolResult): string {
+  return result.content.map((c) => (c.type === 'text' ? c.text : '')).join('\n')
+}
+
+let sandboxDir: string
+let counter = 0
+
+beforeEach(async () => {
+  counter += 1
+  sandboxDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'extract-test-'))
+  await setSandboxDirectory(sandboxDir)
+})
+
+afterEach(async () => {
+  await fs.promises.rm(sandboxDir, { recursive: true, force: true })
+})
+
+async function writeInput(): Promise<string> {
+  const name = `input-${counter}.pdf`
+  await fs.promises.writeFile(path.join(sandboxDir, name), 'dummy pdf bytes')
+  return name
+}
+
+function extractArgs(overrides: Partial<DataExtractorArgs>): DataExtractorArgs {
+  return {
+    filePath: overrides.filePath ?? `input-${counter}.pdf`,
+    mode: overrides.mode ?? 'understand',
+    format: overrides.format,
+    includeWords: overrides.includeWords ?? false,
+    language: overrides.language,
+    outputPath: overrides.outputPath,
+  }
+}
+
+describe('performExtractCall', () => {
+  it('returns markdown output inline', async () => {
+    const input = await writeInput()
+    const { client, post } = mockClient({ output: { markdown: '# Hello World' } })
+
+    const result = await performExtractCall(extractArgs({ filePath: input, mode: 'text', format: 'markdown' }), client)
+
+    expect(result.isError).toBeFalsy()
+    expect(text(result)).toBe('# Hello World')
+    expect(post).toHaveBeenCalledOnce()
+  })
+
+  it('writes markdown to a file when outputPath is given, returning a summary not the content', async () => {
+    const input = await writeInput()
+    const outName = `out-${counter}.md`
+    const { client } = mockClient({ output: { markdown: '# Big Document\n\nlots of text' } })
+
+    const result = await performExtractCall(
+      extractArgs({ filePath: input, mode: 'text', format: 'markdown', outputPath: outName }),
+      client,
+    )
+
+    expect(result.isError).toBeFalsy()
+    const summary = text(result)
+    expect(summary).toContain('Wrote')
+    expect(summary).toContain(outName)
+    expect(summary).not.toContain('lots of text')
+    const written = await fs.promises.readFile(path.join(sandboxDir, outName), 'utf-8')
+    expect(written).toBe('# Big Document\n\nlots of text')
+  })
+
+  it('rejects a 2xx response with no spatial element list without writing the file', async () => {
+    const input = await writeInput()
+    const outName = `out-${counter}.json`
+    const { client } = mockClient({ status: 200, output: { markdown: 'oops wrong shape' } })
+
+    const result = await performExtractCall(
+      extractArgs({ filePath: input, mode: 'structure', format: 'spatial', outputPath: outName }),
+      client,
+    )
+
+    expect(result.isError).toBe(true)
+    expect(text(result)).toContain('output.elements')
+    await expect(fs.promises.access(path.join(sandboxDir, outName))).rejects.toThrow()
+  })
+
+  it('writes spatial output to a file and returns a content-free summary', async () => {
+    const sourceFile = await writeInput()
+    const targetName = `out-${counter}.json`
+    const { client } = mockClient(spatialFixture)
+
+    const request = extractArgs({ filePath: sourceFile, mode: 'structure', format: 'spatial', outputPath: targetName })
+    const outcome = await performExtractCall(request, client)
+
+    expect(outcome.isError).toBeFalsy()
+
+    const reply = text(outcome)
+    // The inline reply describes the structure of the extraction (element
+    // count, a type count, a low-confidence section), not its content.
+    expect(reply).toContain('Extracted 3 elements')
+    expect(reply).toContain('keyValueRegion: 1')
+    expect(reply).toContain('Low-confidence elements')
+    // SECRET stands in for PII and must stay out of the inline reply...
+    expect(reply).not.toContain(SECRET)
+    // ...while the file written under the sandbox keeps the full data.
+    const onDisk = await fs.promises.readFile(path.join(sandboxDir, targetName), 'utf-8')
+    expect(onDisk).toContain(SECRET)
+  })
+
+  it('rejects spatial output without an outputPath, before any API call', async () => {
+    const sourceFile = await writeInput()
+    const { client, post: postSpy } = mockClient(spatialFixture)
+
+    // Deliberately leave outputPath out of the spatial request.
+    const request = extractArgs({ filePath: sourceFile, mode: 'structure', format: 'spatial' })
+    const outcome = await performExtractCall(request, client)
+
+    expect(outcome.isError).toBe(true)
+    expect(text(outcome)).toContain('outputPath')
+    // An untouched spy shows the request was refused before reaching the client.
+    expect(postSpy).not.toHaveBeenCalled()
+  })
+
+  it('rejects text mode with spatial output', async () => {
+    const sourceFile = await writeInput()
+    const { client, post: postSpy } = mockClient(spatialFixture)
+    // outputPath is supplied, so this is not the missing-outputPath case.
+    const targetName = `out-${counter}.json`
+    const request = extractArgs({ filePath: sourceFile, mode: 'text', format: 'spatial', outputPath: targetName })
+    const outcome = await performExtractCall(request, client)
+
+    expect(outcome.isError).toBe(true)
+    expect(text(outcome)).toContain('text mode')
+    // The mocked client must not have been called.
+    expect(postSpy).not.toHaveBeenCalled()
+  })
+
+  it('contains an outside-sandbox absolute outputPath within the sandbox', async () => {
+    const sourceFile = await writeInput()
+    // An absolute path under the OS temp directory, used as the would-be escape target.
+    const strayPath = path.join(os.tmpdir(), `escape-${counter}.json`)
+    const { client } = mockClient(spatialFixture)
+
+    const request = extractArgs({ filePath: sourceFile, mode: 'structure', format: 'spatial', outputPath: strayPath })
+    const outcome = await performExtractCall(request, client)
+
+    // The call still succeeds: the sandbox re-roots the absolute path inside
+    // itself instead of writing to the literal location, so nothing may exist
+    // at the path that was asked for.
+    expect(outcome.isError).toBeFalsy()
+    await expect(fs.promises.access(strayPath)).rejects.toThrow()
+  })
+
+  it('returns a clear setup error when the extraction client is not configured', async () => {
+    const sourceFile = await writeInput()
+    const request = extractArgs({ filePath: sourceFile, mode: 'text', format: 'markdown' })
+
+    // Passing undefined as the client stands for an unconfigured extraction client.
+    const outcome = await performExtractCall(request, undefined)
+
+    expect(outcome.isError).toBe(true)
+    // The message should name the NUTRIENT_EXTRACTION_API_KEY setting.
+    expect(text(outcome)).toContain('NUTRIENT_EXTRACTION_API_KEY')
+  })
+})
+
+describe('performQueryCall', () => {
+  // Persists spatialFixture as JSON in the sandbox and returns the sandbox-relative file name.
+  async function writeFixture(): Promise<string> {
+    const name = `extraction-${counter}.json`
+    await fs.promises.writeFile(path.join(sandboxDir, name), JSON.stringify(spatialFixture))
+    return name
+  }
+
+  // Builds QueryExtractionArgs from overrides: filePath falls back to the
+  // counter-based fixture name, limit to 100; the filters pass through as given.
+  function queryArgs(overrides: Partial<QueryExtractionArgs>): QueryExtractionArgs {
+    const { pages, region, minConfidence, elementTypes } = overrides
+    const filePath = overrides.filePath ?? `extraction-${counter}.json`
+    return { filePath, pages, region, minConfidence, elementTypes, limit: overrides.limit ?? 100 }
+  }
+
+  it('filters by minConfidence', async () => {
+    const file = await writeFixture()
+    const result = await performQueryCall(queryArgs({ filePath: file, minConfidence: 0.9 }))
+    // Exactly one fixture element should clear 0.9, and it is not the one holding SECRET.
+    expect(result.isError).toBeFalsy()
+    const out = text(result)
+    expect(out).toContain('1 matching element')
+    expect(out).toContain('Quarterly Report')
+    expect(out).not.toContain(SECRET)
+  })
+
+  it('filters by element type', async () => {
+    const file = await writeFixture()
+    const result = await performQueryCall(queryArgs({ filePath: file, elementTypes: ['table'] }))
+
+    expect(text(result)).toContain('1 matching element')
+    expect(text(result)).toContain('"type": "table"')
+  })
+
+  it('filters by page index', async () => {
+    const file = await writeFixture()
+    const result = await performQueryCall(queryArgs({ filePath: file, pages: [1] }))
+
+    const out = text(result)
+    expect(out).toContain('1 matching element')
+    expect(out).toContain('"pageIndex": 1')
+  })
+
+  it('filters by region intersection', async () => {
+    const file = await writeFixture()
+    // Region overlapping only the Title element at (100,50,400,35).
+    const region = { x: 90, y: 40, width: 50, height: 50 }
+    const out = text(await performQueryCall(queryArgs({ filePath: file, region })))
+
+    // Also pin the exclusions: the title alone would still match if the region were ignored.
+    expect(out).toContain('1 matching element')
+    expect(out).toContain('Quarterly Report')
+    expect(out).not.toContain(SECRET)
+  })
+
+  it('truncates to limit with guidance', async () => {
+    const file = await writeFixture()
+    const result = await performQueryCall(queryArgs({ filePath: file, limit: 1 }))
+
+    const out = text(result)
+    expect(out).toContain('Showing the first 1 of 3 matches')
+    expect(out).toContain('Narrow the filters')
+  })
+
+  it('errors on a file that is not a spatial extraction result', async () => {
+    const name = `bad-${counter}.json`
+    await fs.promises.writeFile(path.join(sandboxDir, name), JSON.stringify({ not: 'an extraction' }))
+
+    const result = await performQueryCall(queryArgs({ filePath: name }))
+
+    expect(result.isError).toBe(true)
+    expect(text(result)).toContain('output.elements')
+  })
+})
diff --git a/tests/mcp-tools.test.ts b/tests/mcp-tools.test.ts
index 5ca2abe..764af6e 100644
--- a/tests/mcp-tools.test.ts
+++ b/tests/mcp-tools.test.ts
@@ -37,8 +37,10 @@ describe('MCP tool metadata', () => {
     expect(Object.keys(tools).sort()).toEqual([
       'ai_redactor',
       'check_credits',
+      'data_extractor',
       'document_processor',
       'document_signer',
+      'query_extraction',
       'sandbox_file_tree',
     ])