diff --git a/.env.example b/.env.example index bc70500..849cb00 100644 --- a/.env.example +++ b/.env.example @@ -1,2 +1,6 @@ # Used in testing +# Processor API key (build/sign/redact/credits tools) NUTRIENT_DWS_API_KEY=your-nutrient-dws-api-key + +# Separate Data Extraction API key (data_extractor tool). Starts with pdf_live_ / pdf_test_. +NUTRIENT_EXTRACTION_API_KEY=your-nutrient-data-extraction-api-key diff --git a/README.md b/README.md index a749746..37b9865 100644 --- a/README.md +++ b/README.md @@ -74,9 +74,9 @@ Open Settings → Developer → Edit Config, then add: // "C:\\your\\sandbox\\directory" for Windows // Optional for CI or headless usage: // "NUTRIENT_DWS_API_KEY": "YOUR_API_KEY_HERE" - } - } - } + }, + }, + }, } ``` @@ -98,9 +98,9 @@ Create `.cursor/mcp.json` in your project root: // "C:\\your\\project\\documents" for Windows // Optional for CI or headless usage: // "NUTRIENT_DWS_API_KEY": "YOUR_API_KEY_HERE" - } - } - } + }, + }, + }, } ``` @@ -122,9 +122,9 @@ Add to `~/.codeium/windsurf/mcp_config.json`: // "C:\\your\\sandbox\\directory" for Windows // Optional for CI or headless usage: // "NUTRIENT_DWS_API_KEY": "YOUR_API_KEY_HERE" - } - } - } + }, + }, + }, } ``` @@ -146,9 +146,9 @@ Create `.vscode/mcp.json` in your project, or add the same server definition to "SANDBOX_PATH": "${workspaceFolder}", // Optional for CI or headless usage: // "NUTRIENT_DWS_API_KEY": "YOUR_API_KEY_HERE" - } - } - } + }, + }, + }, } ``` @@ -178,28 +178,52 @@ Place documents in your sandbox directory and use explicit file names or paths i ## Available Tools -| Tool | Description | -| ---- | ----------- | -| `document_processor` | Document processing for conversions, OCR, extraction, watermarking, rotation, annotation flattening, and redaction workflows | -| `document_signer` | PDF signing with CMS / PKCS#7 and CAdES signatures plus visible or invisible appearance options | -| `ai_redactor` | AI redaction for detecting and permanently removing sensitive content such as 
names, addresses, SSNs, emails, and custom criteria | -| `check_credits` | Read-only account lookup for current DWS credits and usage. No document content is uploaded | -| `sandbox_file_tree` | Read-only view of files inside the configured sandbox directory | -| `directory_tree` | Read-only view of local files when sandbox mode is disabled. Sandbox mode is strongly recommended | +| Tool | Description | +| -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | +| `document_processor` | Document processing for conversions, OCR, watermarking, rotation, annotation flattening, and redaction workflows | +| `data_extractor` | Structured data extraction (DWS Data Extraction API): typed JSON elements with bounding boxes and confidence, or whole-document Markdown | +| `query_extraction` | Read-only query over a saved extraction file — filter elements by page, region, confidence, or type without re-extracting or calling the API | +| `document_signer` | PDF signing with CMS / PKCS#7 and CAdES signatures plus visible or invisible appearance options | +| `ai_redactor` | AI redaction for detecting and permanently removing sensitive content such as names, addresses, SSNs, emails, and custom criteria | +| `check_credits` | Read-only account lookup for current DWS credits and usage. No document content is uploaded | +| `sandbox_file_tree` | Read-only view of files inside the configured sandbox directory | +| `directory_tree` | Read-only view of local files when sandbox mode is disabled. 
Sandbox mode is strongly recommended | ### Document Processor Capabilities -| Feature | Description | -| ----------------- | ------------------------------------------------------------------------------------------------- | -| Document Creation | Merge PDFs, Office docs (DOCX, XLSX, PPTX), and images into a single document | -| Format Conversion | PDF ↔ DOCX, images (PNG, JPEG, WebP), PDF/A, PDF/UA, HTML, Markdown | -| Editing | Watermark (text/image), rotate pages, flatten annotations | -| Security | Redact sensitive data (SSNs, credit cards, emails, etc.), password protection, permission control | -| Data Extraction | Extract text, tables, or key-value pairs as structured JSON | -| OCR | Multi-language optical character recognition for scanned documents | -| Optimization | Compress and linearize PDFs without quality loss | -| Annotations | Import XFDF annotations, flatten annotations | -| Digital Signing | PAdES-compliant CMS and CAdES digital signatures (via document_signer tool) | +| Feature | Description | +| ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | +| Document Creation | Merge PDFs, Office docs (DOCX, XLSX, PPTX), and images into a single document | +| Format Conversion | PDF ↔ DOCX, images (PNG, JPEG, WebP), PDF/A, PDF/UA, HTML, Markdown | +| Editing | Watermark (text/image), rotate pages, flatten annotations | +| Security | Redact sensitive data (SSNs, credit cards, emails, etc.), password protection, permission control | +| Data Extraction | Now a dedicated tool — see [Data Extraction](#data-extraction) (`data_extractor`) for typed JSON/Markdown with coordinates and confidence | +| OCR | Multi-language optical character recognition for scanned documents | +| Optimization | Compress and linearize PDFs without quality loss | +| Annotations | Import XFDF annotations, flatten annotations | +| Digital Signing | PAdES-compliant CMS and CAdES 
digital signatures (via document_signer tool) | + +### Data Extraction + +The `data_extractor` and `query_extraction` tools wrap the standalone [DWS Data Extraction API](https://www.nutrient.io/guides/dws-data-extraction/). They authenticate with a **separate** `NUTRIENT_EXTRACTION_API_KEY` (it starts with `pdf_live_`, or `pdf_test_` for test keys), independent of the Processor `NUTRIENT_DWS_API_KEY`. + +`data_extractor` runs one of four processing modes: + +| Mode | Output | OCR | Cost per page | +| ---------------------- | ------------------- | ------------------ | ------------- | +| `text` | Markdown only | No | 1 credit | +| `structure` | Spatial or Markdown | Yes | 1.5 credits | +| `understand` (default) | Spatial or Markdown | Yes (AI-augmented) | 9 credits | +| `agentic` | Spatial or Markdown | Yes (VLM) | 18 credits | + +- **Spatial** output returns typed elements (paragraphs, tables, key-value regions, formulas, pictures, handwriting) with bounding boxes, confidence scores, and reading order. Because the element list can be large, it is written to `outputPath` and the tool returns a content-free summary (element counts, low-confidence flags, page geometry). +- **Markdown** output returns whole-document Markdown inline, or writes it to `outputPath` when provided (recommended for large documents) — useful for RAG and search indexing. + +Use `query_extraction` to pull just the elements you need from a saved spatial file — filter by `pages`, `region` (bounding box), `minConfidence`, or `elementTypes` — so coordinates and values enter the conversation only when you ask for them. + +> **Note:** Extracted content returned inline (Markdown output, or `query_extraction` results) enters the conversation and may be logged by the host. For sensitive documents, prefer spatial output to a file plus scoped `query_extraction` calls. + +For a worked extract → query → act walkthrough, see [examples/invoice-extraction-workflow.md](examples/invoice-extraction-workflow.md).
## Usage Examples @@ -277,24 +301,25 @@ Processed files are saved to a location determined by the AI. To guide output pl The server authenticates to the Nutrient DWS API (`https://api.nutrient.io`) using one of: -| Method | When | Config | -|--------|------|--------| -| **API key** | `NUTRIENT_DWS_API_KEY` is set | Static key passed as Bearer token to DWS API | -| **OAuth browser flow** | No API key set | Opens browser for Nutrient OAuth consent on the first request that uses the Nutrient API, caches token locally | +| Method | When | Config | +| ---------------------- | ----------------------------- | -------------------------------------------------------------------------------------------------------------- | +| **API key** | `NUTRIENT_DWS_API_KEY` is set | Static key passed as Bearer token to DWS API | +| **OAuth browser flow** | No API key set | Opens browser for Nutrient OAuth consent on the first request that uses the Nutrient API, caches token locally | When no API key is configured, the server stays connected and opens a browser-based OAuth flow on the first request that uses the Nutrient API (similar to `gh auth login`). Tokens are cached at `$XDG_CONFIG_HOME/nutrient/credentials.json` or `~/.config/nutrient/credentials.json` and refreshed automatically. ### Environment Variables -| Variable | Required | Description | -| ---------------------- | ----------- | -------------------------------------------------------------------------------------------- | -| `NUTRIENT_DWS_API_KEY` | No* | Nutrient DWS API key ([get one free](https://dashboard.nutrient.io/sign_up/)) | -| `SANDBOX_PATH` | Recommended | Directory to restrict file operations to | -| `AUTH_SERVER_URL` | No | OAuth server base URL (default: `https://api.nutrient.io`) | -| `CLIENT_ID` | No | OAuth client ID. 
Skips DCR and enables refresh token reuse when set | -| `DWS_API_BASE_URL` | No | DWS API base URL (default: `https://api.nutrient.io`) | -| `LOG_LEVEL` | No | Winston logger level (`info` default). Logs are written to `MCP_LOG_FILE` in stdio mode | -| `MCP_LOG_FILE` | No | Override log file path (default: system temp directory) | +| Variable | Required | Description | +| ----------------------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- | +| `NUTRIENT_DWS_API_KEY` | No\* | Nutrient DWS **Processor** API key ([get one free](https://dashboard.nutrient.io/sign_up/)) | +| `NUTRIENT_EXTRACTION_API_KEY` | No | Nutrient DWS **Data Extraction** API key (separate key, starts with `pdf_live_`). Required only for the `data_extractor` tool | +| `SANDBOX_PATH` | Recommended | Directory to restrict file operations to | +| `AUTH_SERVER_URL` | No | OAuth server base URL (default: `https://api.nutrient.io`) | +| `CLIENT_ID` | No | OAuth client ID. Skips DCR and enables refresh token reuse when set | +| `DWS_API_BASE_URL` | No | DWS API base URL (default: `https://api.nutrient.io`) | +| `LOG_LEVEL` | No | Winston logger level (`info` default). Logs are written to `MCP_LOG_FILE` in stdio mode | +| `MCP_LOG_FILE` | No | Override log file path (default: system temp directory) | \* If omitted, the server uses an OAuth browser flow to authenticate with the Nutrient API. 
diff --git a/docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md b/docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md new file mode 100644 index 0000000..85f3cf8 --- /dev/null +++ b/docs/plans/2026-06-07-001-feat-dws-data-extraction-workflow-plan.md @@ -0,0 +1,257 @@ +--- +title: "feat: data_extractor + query_extraction tools (DWS Data Extraction API) and a workflow example" +status: active +date: 2026-06-07 +type: feat +target_repo: nutrient-dws-mcp-server +base_branch: main +--- + +# feat: `data_extractor` + `query_extraction` + a dynamic-workflow example + +## Summary + +Add the **Data Extraction workflow primitive** to the Nutrient DWS MCP server, targeting the **new standalone DWS Data Extraction API** (`POST https://api.nutrient.io/extraction/parse`) — a separate product with its own key, **not** a `json-content` output of the Processor `/build` endpoint. + +- **`data_extractor`** — calls `/extraction/parse` with a `mode` (`text`/`structure`/`understand`/`agentic`) and output `format` (`spatial` elements or `markdown`). Spatial output (typed elements with `bounds`, `confidence`, `readingOrder`, `page`) can be large, so it is written to a file with a decision-grade summary returned inline; markdown is returned inline. +- **`query_extraction`** — reads a saved spatial-extraction file and returns **filtered element slices inline** (by page, region/bbox, minimum confidence, element type), so an agent can pull actionable coordinates into context on demand. +- **A dynamic-workflow example** — extract → query low-confidence elements → act with the existing `ai_redactor` / `document_signer`. + +Architecture fit: main already has a `DwsApiClient` abstraction (`baseUrl` + `tokenResolver`, `.post(endpoint, data)`). `data_extractor` uses a **second client instance** authenticated with the Data Extraction key (`pdf_live_…`) — no new HTTP plumbing. 
+ +**Deferred to their own PRs:** `accessibility_tagger` (the DWS **Accessibility API** is also now standalone and includes auto-tag *and* validation), Viewer. + +--- + +## Problem Frame + +DWS is now four separate APIs, each with its own key: **Processor** (`/build`, `NUTRIENT_DWS_API_KEY`), **Data Extraction** (`/extraction/parse`, `pdf_live_…`), **Accessibility**, **Viewer**. The MCP server today only speaks Processor `/build`. Extraction was *previously* reachable as a `json-content` Build output; the dedicated Data Extraction API supersedes that with richer typed elements, confidence, coordinates, and four cost/quality modes. + +Authoritative spec (verified on disk at `~/projects/nutrient-website/src/content/guides/dws-data-extraction/`): + +- **Endpoint:** `POST https://api.nutrient.io/extraction/parse`. Auth: `Authorization: Bearer pdf_live_…` (separate dashboard key; `pdf_test_…` for testing). +- **Request:** multipart `file` + `instructions={"mode":…,"output":{"format":…,"includeWords":…}}` (also supports JSON-body-with-URL and raw-binary). +- **Modes:** `text` (1 cr/pg, markdown only, no OCR), `structure` (1.5 cr/pg, OCR spatial), `understand` (default, 9 cr/pg, AI-augmented), `agentic` (18 cr/pg, VLM). +- **Output:** `spatial` → `output.elements[]`; `markdown` → `output.markdown`. `text` mode defaults to markdown; others default to spatial. +- **Spatial element:** `{id, type, role, text, confidence, readingOrder, bounds:{x,y,width,height}, page:{pageIndex,pageNumber,width,height}}`. Types: `paragraph`, `table` (rows/cols/cells w/ per-cell bounds), `formula` (LaTeX), `picture` (alt text), `keyValueRegion`, `handwriting`. Optional `includeWords` adds word-level bounds. +- **Coordinates:** top-left origin, render-space pixels, `0 ≤ x+width ≤ page.width`. 
+- **Response envelope:** `{status, requestId, output:{elements|markdown}, metrics:{processingTimeMs,pagesProcessed}, configuration:{mode,outputFormat}}` — returned as JSON (the client streams it; the handler parses). + +Because the schema is fully documented, the build proceeds against it directly; one live call (U0) is **confirmation**, not discovery. + +--- + +## Requirements + +- **R1.** `data_extractor` calls `/extraction/parse` exposing `mode`, `output.format`, `includeWords`, `language`, and page selection. +- **R2.** Spatial results are written to `outputPath`; the inline response is a decision-grade summary with **no extracted document content**. Markdown results return inline. `format: spatial` requires `outputPath`. +- **R3.** `query_extraction` reads a saved spatial-extraction file and returns filtered element slices inline (page, region/bbox, minConfidence, elementTypes). +- **R4.** Reuse main's `DwsApiClient`; add a Data Extraction client authenticated by `NUTRIENT_EXTRACTION_API_KEY`. Reuse sandbox path resolution and response/error helpers. +- **R5.** Respect the sandbox vs. non-sandbox registration model in `addToolsToServer`. +- **R6.** Any `outputPath`/`filePath` is validated through the sandbox resolver before the API call or file read. +- **R7.** Surface per-mode **credit cost** in the `data_extractor` description (`understand` = 9 cr/pg) so agents/users don't run up cost unknowingly. +- **R8.** Ship one runnable dynamic-workflow example (extract → query → act). +- **R9.** Update README (Available Tools + Features + the new env var) and amend `document_processor`'s description so it no longer advertises standalone extraction. +- **R10.** Tests cover request construction, spatial→file vs markdown→inline routing, query filtering, sandbox rejection, and key/PII safety — mocked against the documented response shape. 
+ +--- + +## Key Technical Decisions + +- **KTD1 — Target the Data Extraction API via a second `DwsApiClient`.** `data_extractor` builds a multipart form (`file` + `instructions`) and calls `extractionClient.post('extraction/parse', form)`, where `extractionClient = createApiClientFromApiKey(getExtractionApiKey())`. Same `baseUrl` (`https://api.nutrient.io`), different token. *(Supersedes the original "wrap /build" decision — Data Extraction is a separate API.)* +- **KTD2 — Spatial → file + summary; markdown → inline; query for slices.** Spatial `elements[]` can be large; write the parsed JSON to `outputPath`, return a decision-grade summary, and let the agent retrieve slices via `query_extraction`. Markdown is a single blob → inline. +- **KTD3 — Decision-grade summary, never content.** Inline summary = per-page counts by element `type`/`role`, low-confidence element count (e.g. `confidence < 0.6`), bbox coverage, page count, output path, byte size. No `text` values. (PII boundary; the field names are now known, so counts are reliable.) +- **KTD4 — Mode + format surface with cost transparency.** Expose all four modes and both formats; default `mode: understand`, `format: spatial`. Validate `text`-mode ⇒ markdown-only. Put credit costs in the tool description (R7). +- **KTD5 — Separate key + env var.** `getExtractionApiKey()` reads `NUTRIENT_EXTRACTION_API_KEY` (distinct from Processor `NUTRIENT_DWS_API_KEY`); fail with a clear message if unset. Document both keys. +- **KTD6 — Inline data is transcript-visible.** `data_extractor` markdown/inline output and all `query_extraction` results enter the agent transcript (host/provider may log). Tool descriptions say so; recommend `outputPath` + scoped queries for sensitive docs. +- **KTD7 — Response is streamed then parsed.** `DwsApiClient.post` uses `responseType: 'stream'`; the extraction handler pipes to a string (`pipeToString`) and `JSON.parse`s, since extraction returns JSON (unlike Build's file streams). 
+ +--- + +## High-Level Technical Design + +```mermaid +flowchart TD + A[Agent: data_extractor\nfile + mode + format] --> V[validate outputPath via\nresolveWriteFilePath FIRST] + V --> B[multipart form: file + instructions\n mode/output.format/includeWords] + B --> C[extractionClient.post\n'extraction/parse'] + C --> P[pipe stream -> string -> JSON.parse] + P --> D{format} + D -- markdown --> E[output.markdown inline] + D -- spatial --> F[write output.elements to outputPath\n-> decision-grade summary inline\n(per-page type counts, low-conf, page dims; NO text)] + F -.-> G[Agent: query_extraction\nfile + page/region/minConfidence/type] + G --> H[resolveReadFilePath -> parse -> filter\n-> matching elements inline] + H -.-> I[Agent branches -> ai_redactor / document_signer] +``` + +*Directional — routing gates and the extract→query→act loop are the design intent; field shapes follow the documented schema and are confirmed in U0.* + +--- + +## Implementation Units + +### U0. Verify `/extraction/parse` against the documented schema + capture fixture + +**Goal:** Confirm the documented response shape with one live call and record a fixture for tests. +**Requirements:** KTD7 (de-risks U3/U4). +**Dependencies:** none for building; the live call needs `NUTRIENT_EXTRACTION_API_KEY`. **Deferred until the user confirms a key** — building proceeds against the documented schema meanwhile. +**Files:** `tests/fixtures/extraction-spatial-sample.json`, `tests/fixtures/extraction-markdown-sample.json` +**Approach:** `text` mode (1 credit) for the markdown fixture and `structure` mode (1.5 cr) for a small spatial fixture against `tests/assets/example.pdf`. Save responses verbatim. Confirm field names match `bounds/confidence/page/readingOrder/type/role`. +**Test scenarios:** none — produces fixtures. +**Verification:** Fixtures saved; field names match the docs (if any drift, adjust U1/U3/U4). + +### U1. Arg schemas + +**Goal:** `DataExtractorArgsSchema` + `QueryExtractionArgsSchema`. 
+**Requirements:** R1, R2, R3, R6. +**Dependencies:** none (documented schema). +**Files:** `src/schemas.ts` +**Approach:** `DataExtractorArgsSchema`: `filePath` (sandbox read), `mode` enum (default `understand`), `format` enum `spatial|markdown` (default by mode), `includeWords` bool, `language` (string|string[]), `pages` (`PageRangeSchema`), `outputPath` — required when `format: spatial` (`.superRefine`); also refine `text` mode ⇒ `format` must be `markdown`. `QueryExtractionArgsSchema`: `filePath` (the saved spatial JSON), optional `pages`, `region` (`{x,y,width,height}` all required together), `minConfidence` (0–1), `elementTypes` (enum array), `limit` (default cap). +**Patterns to follow:** `BuildAPIArgsSchema`, `AiRedactArgsSchema` (`.superRefine`), `PageRangeSchema`. +**Test scenarios:** +- spatial without `outputPath` → rejected. +- `text` mode with `format: spatial` → rejected. +- `language` accepts string and array. +- query: `minConfidence` outside 0–1 → rejected; partial `region` → rejected. +**Verification:** `pnpm pretest`; schema unit tests green. + +### U2. Data Extraction API client wiring + +**Goal:** Provide a `DwsApiClient` authenticated with the Data Extraction key. +**Requirements:** R4, KTD1, KTD5. +**Dependencies:** none. +**Files:** `src/dws/utils.ts` or `src/utils/environment.ts` (add `getExtractionApiKey()`), `src/index.ts` (build the extraction client and thread it into `addToolsToServer` options alongside `apiClient`) +**Approach:** `getExtractionApiKey()` reads `NUTRIENT_EXTRACTION_API_KEY`, throws a clear error if unset. In the server bootstrap, `const extractionApiClient = createApiClientFromApiKey(getExtractionApiKey())`. Extend the `addToolsToServer`/`createMcpServer` options type with `extractionApiClient: DwsApiClient`. Only construct it lazily/when the key exists so the Processor-only path still boots (extraction tools can surface a clear "set NUTRIENT_EXTRACTION_API_KEY" error if missing). 
+**Patterns to follow:** `createStdioApiClient`, `createApiClientFromApiKey`, the existing `apiClient` threading in `src/index.ts`. +**Test scenarios:** +- `getExtractionApiKey()` throws when env unset; returns the key when set. +**Verification:** `pnpm pretest`; server boots with and without the extraction key (tools register; calling without key errors clearly). + +### U3. `data_extractor` handler + +**Goal:** Call `/extraction/parse`, route spatial→file / markdown→inline, summarize safely. +**Requirements:** R1, R2, R6, R7, KTD2, KTD3, KTD7. +**Dependencies:** U1, U2 (and U0 fixture for tests). +**Files:** `src/dws/extract.ts` (new; module-private `summarizeSpatial` helper), reuse `pipeToString` from `src/dws/utils.ts` +**Approach:** `performExtractCall(args, extractionApiClient)`. If `format: spatial`, validate `outputPath` via `resolveWriteFilePath` **first**. Resolve `filePath` via `resolveReadFilePath`, read buffer, build `FormData` (`file` + `instructions` JSON). `await extractionApiClient.post('extraction/parse', form)`; `pipeToString` → `JSON.parse`. Markdown → return `output.markdown` inline. Spatial → write `output` (or `output.elements`) to the resolved path; return `summarizeSpatial(output)` (KTD3 fields only). Errors → `createErrorResponse`; ensure no `Authorization`/key leaks (axios error `config` stripped). +**Patterns to follow:** `performBuildCall` structure, `processFileReference` file-read approach, `handleApiError`, `createSuccessResponse`/`createErrorResponse`. +**Test scenarios:** +- markdown mode → inline string from `output.markdown` (mocked). +- spatial mode + `outputPath` → file written; summary string has counts + path, and asserts a known text value from the fixture is **absent** inline. +- `outputPath` outside sandbox → rejected before any network call. +- API error → `createErrorResponse`; assert no `Bearer`/key in the message. +- missing extraction key → clear "set NUTRIENT_EXTRACTION_API_KEY" error. 
+**Verification:** `pnpm test` green. + +### U4. `query_extraction` handler + +**Goal:** Return filtered element slices inline from a saved spatial file. +**Requirements:** R3, R6, KTD6. +**Dependencies:** U0 fixture, U1. +**Files:** `src/dws/extract.ts` (or `src/dws/query.ts`) +**Approach:** `performQueryCall(args)`. `resolveReadFilePath(filePath)`, read + parse. Filter `output.elements` by `pages` (`element.page.pageIndex`), `region` (bbox intersection with `element.bounds`), `minConfidence` (`element.confidence`), `elementTypes` (`element.type`). Return up to `limit` matches inline; if more matched, note the truncation and suggest narrowing. Defensive field access with a clear error if the file isn't a recognized extraction document. +**Patterns to follow:** `resolveReadFilePath`, `createSuccessResponse`/`createErrorResponse`. +**Test scenarios:** +- `minConfidence: 0.9` → only high-confidence elements (against fixture). +- `region` bbox → only intersecting elements. +- `pages: [0]` → only page-0 elements. +- `elementTypes: ['table']` → only tables. +- malformed/non-extraction file → `createErrorResponse`. +- match set > `limit` → truncated with guidance. +- file outside sandbox → rejected. +**Verification:** `pnpm test` green. + +### U5. Register tools + de-advertise extraction on `document_processor` + +**Goal:** Wire both tools into the server. +**Requirements:** R1, R3, R5, R7, R9. +**Dependencies:** U3, U4. +**Files:** `src/index.ts` +**Approach:** Two `server.tool(...)` registrations passing `extractionApiClient` (data_extractor) and none (query_extraction reads files). Descriptions note: spatial output → file + `query_extraction`; per-mode credit cost; transcript caveat (KTD6). Amend `document_processor`'s description to drop standalone "JSON extraction" and point to `data_extractor`. +**Patterns to follow:** existing `server.tool` blocks and the `addToolsToServer` options threading. 
+**Test scenarios:** +- Test expectation: none beyond handler tests — registration is wiring. +**Verification:** `pnpm build`; server registers both tools; `document_processor` no longer double-advertises extraction. + +### U6. Dynamic-workflow example + +**Goal:** One runnable artifact: extract → query → act. +**Requirements:** R8. +**Dependencies:** U5. +**Files:** `examples/invoice-extraction-workflow/` (script + notes), `README.md` ("Dynamic workflows" section) +**Approach:** `data_extractor` (spatial → file) → `query_extraction` (`minConfidence` to find shaky fields) → branch → act via `ai_redactor`/`document_signer`. Live "act" steps use the **Processor** key; gate the runnable script behind both keys and exclude from `pnpm test`. +**Test scenarios:** none — example/doc. +**Verification:** Walkthrough runs once end-to-end against live keys. + +### U7. Tests + +**Goal:** Cover both handlers against the documented/fixture schema. +**Requirements:** R10. +**Dependencies:** U3, U4 (U0 fixtures). +**Files:** `tests/extract.test.ts`, `tests/query.test.ts`; reuse `tests/fixtures/extraction-*-sample.json`, `tests/assets/example.pdf`. Inline example objects. +**Approach:** Mock `DwsApiClient.post` to return a stream of the fixture; assert routing, summary-without-content, key-redaction, sandbox rejection, and query filters. +**Execution note:** Start from a failing test asserting the spatial→file summary contains no element `text` (security-critical). +**Verification:** `pnpm test`, `pnpm lint`, `pnpm format` clean. + +### U8. Docs + +**Goal:** Document tools, the new env var, and costs. +**Requirements:** R7, R9. +**Dependencies:** U5, U6. 
+**Files:** `README.md`, `.env.example` +**Approach:** Add `data_extractor` + `query_extraction` to Available Tools; add a Data Extraction feature row (modes, spatial/markdown, coords+confidence, file+query); add `NUTRIENT_EXTRACTION_API_KEY` to the env table + `.env.example`; note per-mode credits; ensure `document_processor` row no longer implies it's the extraction path. +**Test scenarios:** none — docs. +**Verification:** Tool names/descriptions match registrations (grep parity). + +--- + +## Scope Boundaries + +**In scope:** `data_extractor`, `query_extraction`, the Data Extraction client wiring, one workflow example, tests, README/.env updates, `document_processor` description fix. + +### Deferred to Follow-Up Work +- **`accessibility_tagger`** — DWS **Accessibility API** is now standalone (auto-tag *and* validation, own key); own PR. +- **Viewer tool** — own key; low value for headless workflows. +- **JSON-body-with-URL / raw-binary inputs** to `/extraction/parse` — start with multipart file upload; add URL input if needed. +- **`agentic` cost guardrails** beyond surfacing cost in the description. + +--- + +## System-Wide Impact + +- **New env var** `NUTRIENT_EXTRACTION_API_KEY` (separate from `NUTRIENT_DWS_API_KEY`). Documented; extraction tools error clearly if unset, Processor tools unaffected. +- **Additive** — no breaking change; `document_processor` keeps capability (description-only change). +- **Sandbox** covers the new file read (`query_extraction`, source PDF) and write (`data_extractor` spatial output). +- **Transcript exposure** (KTD6) documented. +- **Cost:** `understand` (default) = 9 credits/page; surfaced in the description (R7). + +--- + +## Risks & Dependencies + +- **R-A (low, mitigated): live response drift from docs.** *Mitigation:* U0 fixture confirms before relying on it; defensive field access. +- **R-B (medium): default mode cost.** `understand` at 9 cr/pg can surprise. 
*Mitigation:* cost in description (R7); consider defaulting to `structure` — open question below. +- **R-C (medium): PII in transcript** via markdown/inline and query results. *Mitigation:* KTD3 (no content in summaries) + KTD6 warning + a test asserting no element `text` leaks in the spatial summary. +- **R-D (low): two keys confuse setup.** *Mitigation:* clear env table, `.env.example`, and unset-key errors. + +## Open Questions (resolve during execution) + +- Default mode: `understand` (richest, 9 cr/pg, matches API default) vs `structure` (1.5 cr/pg) for a cheaper default? Leaning toward honoring the API default (`understand`) but surfacing cost. + +--- + +## Verification Strategy + +Local (no GitHub Actions in this repo): +- `pnpm pretest`, `pnpm test`, `pnpm lint`, `pnpm format`. +- U0: one live `text`/`structure` call to capture fixtures (needs key; deferred to user). +- U6: full extract→query→act once against live Extraction + Processor keys. +- Per project AGENTS rules: branch off `main` → Conventional Commits → PR into `main`; never push to `main`; report exact command + exit 0 before claiming done. + +--- + +## Sources & Research + +- **Authoritative, on disk:** `~/projects/nutrient-website/src/content/guides/dws-data-extraction/` — `getting-started.mdoc`, `api-overview.mdoc`, `parsing/processing-modes.mdoc`, `parsing/coordinate-spaces.mdoc`, `llms.txt`. Endpoint `POST /extraction/parse`, Bearer `pdf_live_…`, modes/formats, element schema, coordinate system. +- Repo (authoritative for wiring): `src/dws/client.ts` (`DwsApiClient`, `createApiClientFromApiKey`, `.post`), `src/index.ts` (`createMcpServer`/`addToolsToServer` apiClient threading), `src/dws/build.ts`, `src/dws/utils.ts` (`pipeToString`, `handleApiError`), `src/fs/sandbox.ts`. +- Plan review (2026-06-07, 6 personas) + two user corrections establishing the separate-API/separate-key reality. 
diff --git a/examples/invoice-extraction-workflow.md b/examples/invoice-extraction-workflow.md new file mode 100644 index 0000000..9458253 --- /dev/null +++ b/examples/invoice-extraction-workflow.md @@ -0,0 +1,79 @@ +# Dynamic workflow: extract → query → act + +This example shows how an AI agent chains the Data Extraction tools with the +existing document tools to process an invoice **without ever loading the full +extraction into context**. It is the pattern dynamic workflows are built on: +extract structured data, branch on it, then act. + +**Prerequisites** + +- `NUTRIENT_EXTRACTION_API_KEY` (Data Extraction API key, starts with `pdf_live_`) for `data_extractor`. +- `NUTRIENT_DWS_API_KEY` (or OAuth) for the `ai_redactor` / `document_signer` "act" steps. +- `SANDBOX_PATH` set to a directory containing `invoice.pdf`. + +## Step 1 — Extract structured elements to a file + +The agent calls `data_extractor` in `understand` mode with spatial output. The +element list (with coordinates and confidence) is written to a file; only a +compact summary comes back. + +```jsonc +// tool: data_extractor +{ "filePath": "invoice.pdf", "mode": "understand", "format": "spatial", "outputPath": "invoice.elements.json" } +``` + +``` +Extracted 142 elements across 2 page(s) and wrote the full spatial JSON to invoice.elements.json (38217 bytes). +Element types: paragraph: 121, table: 2, keyValueRegion: 18, picture: 1. +Low-confidence elements (confidence < 0.6): 7. +Retrieve specific elements with query_extraction ... +``` + +The agent now knows the shape of the document — and that **7 elements are +low-confidence** — without 142 elements entering the conversation.
+ +## Step 2 — Branch on the result with `query_extraction` + +The summary flagged low-confidence elements, so the agent pulls the key-value +fields, each with its `confidence`, to decide whether human review is needed: + +```jsonc +// tool: query_extraction (minConfidence is a lower bound, so 0 keeps the low-scoring fields) +{ "filePath": "invoice.elements.json", "minConfidence": 0, "elementTypes": ["keyValueRegion"], "limit": 50 } +``` + +It can also grab a specific region — e.g. the totals box in the bottom-right of +page 2 (`pages` is 0-based, so index 1) — to read the amount due: + +```jsonc +// tool: query_extraction +{ "filePath": "invoice.elements.json", "pages": [1], "region": { "x": 1200, "y": 2000, "width": 600, "height": 400 } } +``` + +Only the handful of elements the agent actually needs — with their text and +coordinates — enter context. + +## Step 3 — Act with the existing tools + +Branching on what it found, the agent acts: + +- **Low-confidence or sensitive fields →** redact before sharing: + + ```jsonc + // tool: ai_redactor + { "filePath": "invoice.pdf", "criteria": "Bank account and routing numbers", "outputPath": "invoice-redacted.pdf" } + ``` + +- **Clean and approved →** sign it: + + ```jsonc + // tool: document_signer + { "filePath": "invoice.pdf", "outputPath": "invoice-signed.pdf", "signatureOptions": { "signatureType": "cms" } } + ``` + +## Why this is the workflow primitive + +The agent reasons over **structure and coordinates** (counts, confidence, +regions) rather than a wall of text, retrieves only the slices it needs, and +hands off to deterministic document operations. The large, sensitive payload +stays on disk; the conversation stays small and auditable. 
diff --git a/src/dws/extract.ts b/src/dws/extract.ts new file mode 100644 index 0000000..c1c8850 --- /dev/null +++ b/src/dws/extract.ts @@ -0,0 +1,268 @@ +import FormData from 'form-data' +import fs from 'fs' +import path from 'path' +import { CallToolResult } from '@modelcontextprotocol/sdk/types.js' +import { DwsApiClient } from './client.js' +import { DataExtractorArgs, QueryExtractionArgs } from '../schemas.js' +import { resolveReadFilePath, resolveWriteFilePath } from '../fs/sandbox.js' +import { pipeToString, handleApiError } from './utils.js' +import { createSuccessResponse, createErrorResponse } from '../responses.js' + +const EXTRACTION_ENDPOINT = 'extraction/parse' +const LOW_CONFIDENCE_THRESHOLD = 0.6 + +/** A single spatial element from the Data Extraction API (`output.format: spatial`). */ +type SpatialElement = { + type?: string + role?: string + confidence?: number + bounds?: { x: number; y: number; width: number; height: number } + page?: { pageIndex?: number; pageNumber?: number; width?: number; height?: number } +} + +/** Parsed `/extraction/parse` response (the fields this server reads). */ +type ExtractionResponse = { + output?: { elements?: SpatialElement[]; markdown?: string } + metrics?: { pagesProcessed?: number } +} + +/** text mode only supports markdown; every other mode defaults to spatial. */ +function resolveFormat(mode: DataExtractorArgs['mode'], format: DataExtractorArgs['format']): 'spatial' | 'markdown' { + if (format) { + return format + } + return mode === 'text' ? 'markdown' : 'spatial' +} + +/** + * Build a decision-grade summary of a spatial extraction result. + * + * Deliberately excludes extracted document text — it reports only counts, + * confidence signal, page geometry, and where the full result was written, so + * sensitive content never lands in the agent transcript (query it back with + * `query_extraction` instead). 
+ */ +function summarizeSpatial(response: ExtractionResponse, outputPath: string, byteLength: number): string { + const elements = response.output?.elements ?? [] + const typeCounts: Record = {} + const pageIndexes = new Set() + let lowConfidence = 0 + + for (const element of elements) { + const type = element.type ?? 'unknown' + typeCounts[type] = (typeCounts[type] ?? 0) + 1 + if (typeof element.confidence === 'number' && element.confidence < LOW_CONFIDENCE_THRESHOLD) { + lowConfidence += 1 + } + if (typeof element.page?.pageIndex === 'number') { + pageIndexes.add(element.page.pageIndex) + } + } + + const pageCount = response.metrics?.pagesProcessed ?? pageIndexes.size + const typeSummary = Object.entries(typeCounts) + .map(([type, count]) => `${type}: ${count}`) + .join(', ') + + return [ + `Extracted ${elements.length} elements across ${pageCount} page(s) and wrote the full spatial JSON to ${outputPath} (${byteLength} bytes).`, + `Element types: ${typeSummary || 'none'}.`, + `Low-confidence elements (confidence < ${LOW_CONFIDENCE_THRESHOLD}): ${lowConfidence}.`, + `Retrieve specific elements with query_extraction (filter by page, region, minConfidence, or elementTypes). The document content is not included here.`, + ].join('\n') +} + +/** Writes `data` to `resolvedPath`, creating parent directories as needed. */ +async function writeToResolvedPath(resolvedPath: string, data: string): Promise { + const outputDir = path.dirname(resolvedPath) + try { + await fs.promises.access(outputDir) + } catch { + await fs.promises.mkdir(outputDir, { recursive: true }) + } + await fs.promises.writeFile(resolvedPath, data) +} + +/** + * Calls the Nutrient DWS Data Extraction API (`POST /extraction/parse`). + * + * Spatial output is written to `outputPath` and summarized inline; markdown + * output is returned inline. 
+ */ +export async function performExtractCall( + args: DataExtractorArgs, + extractionApiClient: DwsApiClient | undefined, +): Promise { + if (!extractionApiClient) { + return createErrorResponse( + 'Error: Data Extraction is not configured. Set the NUTRIENT_EXTRACTION_API_KEY environment variable ' + + '(a Data Extraction API key from the Nutrient dashboard, starting with pdf_live_ or pdf_test_).', + ) + } + + const { filePath, mode, language, includeWords, outputPath } = args + const format = resolveFormat(mode, args.format) + + if (mode === 'text' && format === 'spatial') { + return createErrorResponse( + 'Error: text mode only supports markdown output. Use a different mode for spatial output.', + ) + } + + if (format === 'spatial' && !outputPath) { + return createErrorResponse( + 'Error: spatial output requires outputPath — the element list can be large and is written to a file, ' + + 'then queried with query_extraction.', + ) + } + + // Resolve any provided output path first (fail early on a sandbox escape, + // before the API call). Required for spatial, optional for markdown. + let resolvedOutputPath: string | undefined + if (outputPath) { + try { + resolvedOutputPath = await resolveWriteFilePath(outputPath) + } catch (error) { + return createErrorResponse(`Error: ${error instanceof Error ? error.message : String(error)}`) + } + } + + let fileBuffer: Buffer + let fileName: string + try { + const resolvedInputPath = await resolveReadFilePath(filePath) + fileBuffer = await fs.promises.readFile(resolvedInputPath) + fileName = path.basename(resolvedInputPath) + } catch (error) { + return createErrorResponse( + `Error with input file ${filePath}: ${error instanceof Error ? error.message : String(error)}`, + ) + } + + const instructions: Record = { + mode, + output: format === 'spatial' ? 
{ format, includeWords } : { format }, + } + if (language && mode !== 'text') { + instructions.options = { language } + } + + try { + const form = new FormData() + form.append('file', fileBuffer, { filename: fileName }) + form.append('instructions', JSON.stringify(instructions)) + + const response = await extractionApiClient.post(EXTRACTION_ENDPOINT, form) + const body = await pipeToString(response.data) + + let parsed: ExtractionResponse + try { + parsed = JSON.parse(body) as ExtractionResponse + } catch { + return createErrorResponse('Error: the Data Extraction API returned a response that could not be parsed as JSON.') + } + + if (format === 'markdown') { + const markdown = parsed.output?.markdown + if (typeof markdown !== 'string') { + return createErrorResponse('Error: the Data Extraction API did not return markdown output.') + } + // Honor outputPath for markdown too — a large document returned inline + // would overflow the conversation. Only return inline when no path given. + if (resolvedOutputPath) { + await writeToResolvedPath(resolvedOutputPath, markdown) + return createSuccessResponse(`Wrote ${Buffer.byteLength(markdown)} bytes of Markdown to ${resolvedOutputPath}.`) + } + return createSuccessResponse(markdown) + } + + // Spatial. The early guard guarantees outputPath was provided. + if (!resolvedOutputPath) { + return createErrorResponse('Error: spatial output requires outputPath.') + } + // Guard against a 2xx response that is not a spatial result, so we never + // overwrite the target file with a non-extraction body. + if (!Array.isArray(parsed.output?.elements)) { + return createErrorResponse( + 'Error: the Data Extraction API response did not contain a spatial element list (output.elements). Nothing was written.', + ) + } + // Write the raw response body: avoids re-serializing a potentially large + // payload and preserves every field the API returned. 
+ await writeToResolvedPath(resolvedOutputPath, body) + return createSuccessResponse(summarizeSpatial(parsed, resolvedOutputPath, Buffer.byteLength(body))) + } catch (error) { + return handleApiError(error) + } +} + +/** Does element `bounds` intersect the query `region`? */ +function intersects(bounds: SpatialElement['bounds'], region: NonNullable): boolean { + if (!bounds) { + return false + } + const right = bounds.x + bounds.width + const bottom = bounds.y + bounds.height + const regionRight = region.x + region.width + const regionBottom = region.y + region.height + return !(right < region.x || bounds.x > regionRight || bottom < region.y || bounds.y > regionBottom) +} + +/** + * Reads a spatial extraction file produced by `data_extractor` and returns the + * subset of elements matching the given filters, inline. + */ +export async function performQueryCall(args: QueryExtractionArgs): Promise { + const { filePath, pages, region, minConfidence, elementTypes, limit } = args + + let parsed: ExtractionResponse + try { + const resolvedPath = await resolveReadFilePath(filePath) + const body = await fs.promises.readFile(resolvedPath, 'utf-8') + parsed = JSON.parse(body) as ExtractionResponse + } catch (error) { + return createErrorResponse( + `Error reading extraction file ${filePath}: ${error instanceof Error ? error.message : String(error)}`, + ) + } + + const elements = parsed.output?.elements + if (!Array.isArray(elements)) { + return createErrorResponse( + 'Error: this file does not look like a spatial extraction result (no output.elements array). ' + + 'Produce one with data_extractor using format: spatial.', + ) + } + + const pageSet = pages && pages.length > 0 ? new Set(pages) : undefined + const typeSet = elementTypes && elementTypes.length > 0 ? 
new Set(elementTypes) : undefined + + const matches = elements.filter((element) => { + if (pageSet && (typeof element.page?.pageIndex !== 'number' || !pageSet.has(element.page.pageIndex))) { + return false + } + if (typeSet && (typeof element.type !== 'string' || !typeSet.has(element.type))) { + return false + } + if ( + typeof minConfidence === 'number' && + !(typeof element.confidence === 'number' && element.confidence >= minConfidence) + ) { + return false + } + if (region && !intersects(element.bounds, region)) { + return false + } + return true + }) + + const limited = matches.slice(0, limit) + const truncatedNote = + matches.length > limited.length + ? `\n\nShowing the first ${limited.length} of ${matches.length} matches. Narrow the filters (page, region, minConfidence, elementTypes) to see the rest.` + : '' + + return createSuccessResponse( + `${limited.length} matching element(s):\n${JSON.stringify(limited, null, 2)}${truncatedNote}`, + ) +} diff --git a/src/index.ts b/src/index.ts index 78dff1b..5ac6df8 100644 --- a/src/index.ts +++ b/src/index.ts @@ -14,10 +14,13 @@ import { AiRedactArgsSchema, BuildAPIArgsSchema, CheckCreditsArgsSchema, + DataExtractorArgsSchema, DirectoryTreeArgsSchema, + QueryExtractionArgsSchema, SignAPIArgsSchema, } from './schemas.js' import { performBuildCall } from './dws/build.js' +import { performExtractCall, performQueryCall } from './dws/extract.js' import { performSignCall } from './dws/sign.js' import { performAiRedactCall } from './dws/ai-redact.js' import { performCheckCreditsCall } from './dws/credits.js' @@ -36,8 +39,9 @@ function addToolsToServer(options: { server: McpServer sandboxEnabled: boolean apiClient: DwsApiClient + extractionApiClient?: DwsApiClient }) { - const { server, sandboxEnabled, apiClient } = options + const { server, sandboxEnabled, apiClient, extractionApiClient } = options server.tool( 'document_processor', @@ -51,7 +55,9 @@ Features: • Watermarking (text/image) • Redaction creation and application 
-Output formats: PDF, PDF/A, images (PNG, JPEG, WebP), JSON extraction, Office (DOCX, XLSX, PPTX)`, +Output formats: PDF, PDF/A, images (PNG, JPEG, WebP), Office (DOCX, XLSX, PPTX) + +For structured data extraction (typed JSON or Markdown with bounding boxes and confidence scores), use the dedicated data_extractor tool instead.`, BuildAPIArgsSchema.shape, { title: 'Nutrient Document Processor', @@ -164,6 +170,62 @@ Returns: subscription type, total credits, used credits, and remaining credits.` }, ) + server.tool( + 'data_extractor', + `Extract structured data from a document using the Nutrient DWS Data Extraction API. Reads the input file from the local file system or sandbox (if enabled). + +Output formats: +• spatial — typed elements (paragraphs, tables, key-value pairs, formulas, pictures, handwriting) with bounding boxes, confidence scores, and reading order. Written to outputPath (the list can be large); retrieve slices with the query_extraction tool. +• markdown — whole-document Markdown. Returned inline, or written to outputPath when provided (recommended for large documents). Good for RAG and search indexing. + +Processing modes (cost per page): text = fast Markdown, no OCR (1 credit); structure = OCR spatial (1.5 credits); understand = AI-augmented, default (9 credits); agentic = VLM-augmented (18 credits). + +Note: markdown output and any extracted content are returned into this conversation and may be logged by the host. For sensitive documents, prefer spatial output to a file plus scoped query_extraction calls.`, + DataExtractorArgsSchema.shape, + { + title: 'Nutrient Data Extractor', + readOnlyHint: false, + destructiveHint: true, + idempotentHint: false, + openWorldHint: true, + }, + async (args) => { + try { + return await performExtractCall(args, extractionApiClient) + } catch (error) { + return createErrorResponse(`Error: ${error instanceof Error ? 
error.message : String(error)}`) + } + }, + ) + + server.tool( + 'query_extraction', + `Query a spatial extraction file previously produced by data_extractor and return the matching elements inline. Reads the file from the local file system or sandbox (if enabled); does not call the Nutrient API. + +Filter by any combination of: +• pages — 0-based page indices +• region — a bounding box {x, y, width, height} in render-space pixels (top-left origin); returns elements whose bounds intersect it +• minConfidence — only elements at or above this confidence (0-1) +• elementTypes — paragraph, table, formula, picture, keyValueRegion, handwriting + +Use this to pull just the elements you need (e.g. low-confidence fields, or everything in a table region) instead of loading the whole extraction. Returned elements include their text and coordinates, which enter this conversation.`, + QueryExtractionArgsSchema.shape, + { + title: 'Nutrient Extraction Query', + readOnlyHint: true, + destructiveHint: false, + idempotentHint: true, + openWorldHint: false, + }, + async (args) => { + try { + return await performQueryCall(args) + } catch (error) { + return createErrorResponse(`Error: ${error instanceof Error ? 
error.message : String(error)}`) + } + }, + ) + if (sandboxEnabled) { server.tool( 'sandbox_file_tree', @@ -195,7 +257,11 @@ Returns: subscription type, total credits, used credits, and remaining credits.` } } -export function createMcpServer(options: { sandboxEnabled: boolean; apiClient: DwsApiClient }) { +export function createMcpServer(options: { + sandboxEnabled: boolean + apiClient: DwsApiClient + extractionApiClient?: DwsApiClient +}) { const server = new McpServer( { name: 'nutrient-dws-mcp-server', @@ -213,11 +279,28 @@ export function createMcpServer(options: { sandboxEnabled: boolean; apiClient: D server, sandboxEnabled: options.sandboxEnabled, apiClient: options.apiClient, + extractionApiClient: options.extractionApiClient, }) return server } +/** + * Builds the Data Extraction API client when NUTRIENT_EXTRACTION_API_KEY is set. + * Returns undefined otherwise, in which case data_extractor reports a clear + * "set NUTRIENT_EXTRACTION_API_KEY" error when invoked. + */ +function createExtractionApiClient(environment: Environment): DwsApiClient | undefined { + if (!environment.extractionApiKey) { + return undefined + } + + return createApiClient({ + apiKey: environment.extractionApiKey, + baseUrl: environment.dwsApiBaseUrl, + }) +} + async function parseCommandLineArgs() { const args = process.argv.slice(2) const sandboxDir = parseSandboxPath(args, process.env.SANDBOX_PATH) || null @@ -283,10 +366,12 @@ export async function runServer(environment: Environment): Promise export type SignAPIArgs = z.infer export type SignatureOptions = z.infer export type AiRedactArgs = z.infer + +// ----- Data Extraction API (POST /extraction/parse) ----- +// +// Cross-field rules (spatial format requires outputPath; text mode supports +// markdown only) are enforced in the handler rather than via a top-level +// `.superRefine`, because tools are registered with `Schema.shape`, which only +// exists on a plain ZodObject (a refined schema would be a ZodEffects). 
+ +export const ExtractionModeSchema = z + .enum(['text', 'structure', 'understand', 'agentic']) + .describe( + 'Processing mode (cost/quality trade-off). ' + + 'text: fast Markdown from digital-born documents, no OCR (1 credit/page). ' + + 'structure: OCR-based spatial elements (1.5 credits/page). ' + + 'understand: AI-augmented spatial extraction, the default (9 credits/page). ' + + 'agentic: VLM-augmented for the most complex documents (18 credits/page).', + ) + +export const ExtractionFormatSchema = z + .enum(['spatial', 'markdown']) + .describe( + 'Output format. spatial: typed elements with bounding boxes, confidence, and reading order — written to outputPath and queried with query_extraction. ' + + 'markdown: whole-document Markdown returned inline. text mode supports markdown only; other modes default to spatial.', + ) + +export const ExtractionElementTypeSchema = z.enum([ + 'paragraph', + 'table', + 'formula', + 'picture', + 'keyValueRegion', + 'handwriting', +]) + +export const DataExtractorArgsSchema = z.object({ + filePath: z + .string() + .describe( + 'Path to the document to extract from (PDF, image, or Office file). Resolves to sandbox path if enabled, otherwise resolves to the local file system.', + ), + mode: ExtractionModeSchema.optional().default('understand'), + format: ExtractionFormatSchema.optional().describe( + 'Output format. Defaults to markdown for text mode and spatial for all other modes.', + ), + includeWords: z + .boolean() + .optional() + .default(false) + .describe('Include word-level bounding boxes in spatial output. Ignored for markdown output.'), + language: z + .union([z.string(), z.array(z.string())]) + .optional() + .describe( + 'OCR language(s) — full name (e.g. "german"), ISO code (e.g. "deu"), or array for multilingual docs. ' + + 'Only applies to structure/understand/agentic modes; ignored for text mode.', + ), + outputPath: z + .string() + .optional() + .describe( + 'Where to write spatial JSON output. 
Required for the spatial format (the element list can be large and is kept out of the conversation). ' + + 'Resolves to sandbox path if enabled. Retrieve slices of it with query_extraction.', + ), +}) + +export const QueryExtractionArgsSchema = z.object({ + filePath: z + .string() + .describe( + 'Path to a spatial extraction JSON file previously produced by data_extractor. Resolves to sandbox path if enabled.', + ), + pages: z + .array(z.number().int().nonnegative()) + .optional() + .describe('Only return elements on these 0-based page indices.'), + region: z + .object({ + x: z.number().describe('Left edge in render-space pixels (top-left origin).'), + y: z.number().describe('Top edge in render-space pixels.'), + width: z.number().positive().describe('Region width in render-space pixels.'), + height: z.number().positive().describe('Region height in render-space pixels.'), + }) + .optional() + .describe('Only return elements whose bounding box intersects this region.'), + minConfidence: z + .number() + .min(0) + .max(1) + .optional() + .describe('Only return elements with confidence greater than or equal to this value (0-1).'), + elementTypes: z.array(ExtractionElementTypeSchema).optional().describe('Only return elements of these types.'), + limit: z + .number() + .int() + .positive() + .optional() + .default(100) + .describe('Maximum number of elements to return inline. 
Narrow the filters if results are truncated.'), +}) + +export type DataExtractorArgs = z.infer +export type QueryExtractionArgs = z.infer diff --git a/src/utils/environment.ts b/src/utils/environment.ts index 534bd30..723d148 100644 --- a/src/utils/environment.ts +++ b/src/utils/environment.ts @@ -2,6 +2,7 @@ import { z } from 'zod' export type Environment = { nutrientApiKey?: string + extractionApiKey?: string dwsApiBaseUrl: string authServerUrl: string clientId?: string @@ -9,6 +10,9 @@ export type Environment = { const RawEnvironmentSchema = z.object({ NUTRIENT_DWS_API_KEY: z.string().optional(), + // Separate key for the standalone DWS Data Extraction API (POST /extraction/parse). + // Distinct from the Processor key above; starts with `pdf_live_` / `pdf_test_`. + NUTRIENT_EXTRACTION_API_KEY: z.string().optional(), DWS_API_BASE_URL: z.string().url().default('https://api.nutrient.io'), AUTH_SERVER_URL: z .string() @@ -26,6 +30,7 @@ export function getEnvironment(rawEnv: NodeJS.ProcessEnv = process.env): Environ return { nutrientApiKey: raw.NUTRIENT_DWS_API_KEY, + extractionApiKey: raw.NUTRIENT_EXTRACTION_API_KEY, dwsApiBaseUrl: raw.DWS_API_BASE_URL, authServerUrl: raw.AUTH_SERVER_URL, clientId: raw.CLIENT_ID, diff --git a/tests/environment.test.ts b/tests/environment.test.ts index 8a3f0d4..e82aa18 100644 --- a/tests/environment.test.ts +++ b/tests/environment.test.ts @@ -10,6 +10,19 @@ describe('environment', () => { expect(environment.authServerUrl).toBe('https://api.nutrient.io') }) + it('parses the separate Data Extraction API key', () => { + const environment = getEnvironment({ NUTRIENT_EXTRACTION_API_KEY: 'pdf_live_abc123' }) + + expect(environment.extractionApiKey).toBe('pdf_live_abc123') + expect(environment.nutrientApiKey).toBeUndefined() + }) + + it('leaves the extraction key undefined when unset', () => { + const environment = getEnvironment({ NUTRIENT_DWS_API_KEY: 'dws-key' }) + + expect(environment.extractionApiKey).toBeUndefined() + }) + it('allows 
overriding DWS API base URL', () => { const environment = getEnvironment({ DWS_API_BASE_URL: 'http://localhost:4000' }) diff --git a/tests/extract.test.ts b/tests/extract.test.ts new file mode 100644 index 0000000..d25128f --- /dev/null +++ b/tests/extract.test.ts @@ -0,0 +1,292 @@ +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' +import fs from 'fs' +import os from 'os' +import path from 'path' +import { Readable } from 'stream' +import { setSandboxDirectory } from '../src/fs/sandbox.js' +import { performExtractCall, performQueryCall } from '../src/dws/extract.js' +import type { DwsApiClient } from '../src/dws/client.js' +import type { DataExtractorArgs, QueryExtractionArgs } from '../src/schemas.js' +import type { CallToolResult } from '@modelcontextprotocol/sdk/types.js' + +// A recognizable "PII" string used to prove extracted content never appears in +// the inline spatial summary (it must only live in the written file). +const SECRET = 'SSN 123-45-6789' + +const spatialFixture = { + status: 200, + requestId: 'req_test', + output: { + elements: [ + { + id: '1', + type: 'paragraph', + role: 'Title', + text: 'Quarterly Report', + confidence: 0.95, + readingOrder: 0, + bounds: { x: 100, y: 50, width: 400, height: 35 }, + page: { pageIndex: 0, pageNumber: 1, width: 1818, height: 2422 }, + }, + { + id: '2', + type: 'keyValueRegion', + text: SECRET, + confidence: 0.4, + readingOrder: 1, + bounds: { x: 100, y: 200, width: 300, height: 20 }, + page: { pageIndex: 0, pageNumber: 1, width: 1818, height: 2422 }, + }, + { + id: '3', + type: 'table', + confidence: 0.8, + readingOrder: 2, + bounds: { x: 100, y: 400, width: 600, height: 300 }, + page: { pageIndex: 1, pageNumber: 2, width: 1818, height: 2422 }, + }, + ], + }, + metrics: { processingTimeMs: 100, pagesProcessed: 2 }, +} + +function mockClient(payload: unknown): { client: DwsApiClient; post: ReturnType } { + const post = vi.fn().mockResolvedValue({ data: 
Readable.from([JSON.stringify(payload)]) }) + return { client: { post } as unknown as DwsApiClient, post } +} + +function text(result: CallToolResult): string { + return result.content.map((c) => (c.type === 'text' ? c.text : '')).join('\n') +} + +let sandboxDir: string +let counter = 0 + +beforeEach(async () => { + counter += 1 + sandboxDir = await fs.promises.mkdtemp(path.join(os.tmpdir(), 'extract-test-')) + await setSandboxDirectory(sandboxDir) +}) + +afterEach(async () => { + await fs.promises.rm(sandboxDir, { recursive: true, force: true }) +}) + +async function writeInput(): Promise { + const name = `input-${counter}.pdf` + await fs.promises.writeFile(path.join(sandboxDir, name), 'dummy pdf bytes') + return name +} + +function extractArgs(overrides: Partial): DataExtractorArgs { + return { + filePath: overrides.filePath ?? `input-${counter}.pdf`, + mode: overrides.mode ?? 'understand', + format: overrides.format, + includeWords: overrides.includeWords ?? false, + language: overrides.language, + outputPath: overrides.outputPath, + } +} + +describe('performExtractCall', () => { + it('returns markdown output inline', async () => { + const input = await writeInput() + const { client, post } = mockClient({ output: { markdown: '# Hello World' } }) + + const result = await performExtractCall(extractArgs({ filePath: input, mode: 'text', format: 'markdown' }), client) + + expect(result.isError).toBeFalsy() + expect(text(result)).toBe('# Hello World') + expect(post).toHaveBeenCalledOnce() + }) + + it('writes markdown to a file when outputPath is given, returning a summary not the content', async () => { + const input = await writeInput() + const outName = `out-${counter}.md` + const { client } = mockClient({ output: { markdown: '# Big Document\n\nlots of text' } }) + + const result = await performExtractCall( + extractArgs({ filePath: input, mode: 'text', format: 'markdown', outputPath: outName }), + client, + ) + + expect(result.isError).toBeFalsy() + const summary = 
text(result) + expect(summary).toContain('Wrote') + expect(summary).toContain(outName) + expect(summary).not.toContain('lots of text') + const written = await fs.promises.readFile(path.join(sandboxDir, outName), 'utf-8') + expect(written).toBe('# Big Document\n\nlots of text') + }) + + it('rejects a 2xx response with no spatial element list without writing the file', async () => { + const input = await writeInput() + const outName = `out-${counter}.json` + const { client } = mockClient({ status: 200, output: { markdown: 'oops wrong shape' } }) + + const result = await performExtractCall( + extractArgs({ filePath: input, mode: 'structure', format: 'spatial', outputPath: outName }), + client, + ) + + expect(result.isError).toBe(true) + expect(text(result)).toContain('output.elements') + await expect(fs.promises.access(path.join(sandboxDir, outName))).rejects.toThrow() + }) + + it('writes spatial output to a file and returns a content-free summary', async () => { + const input = await writeInput() + const outName = `out-${counter}.json` + const { client } = mockClient(spatialFixture) + + const result = await performExtractCall( + extractArgs({ filePath: input, mode: 'structure', format: 'spatial', outputPath: outName }), + client, + ) + + expect(result.isError).toBeFalsy() + const summary = text(result) + // Summary reports structure, not content. + expect(summary).toContain('Extracted 3 elements') + expect(summary).toContain('keyValueRegion: 1') + expect(summary).toContain('Low-confidence elements') + // The PII must NOT leak into the inline summary... + expect(summary).not.toContain(SECRET) + // ...but the full data IS persisted to the file. 
+ const written = await fs.promises.readFile(path.join(sandboxDir, outName), 'utf-8') + expect(written).toContain(SECRET) + }) + + it('rejects spatial output without an outputPath, before any API call', async () => { + const input = await writeInput() + const { client, post } = mockClient(spatialFixture) + + const result = await performExtractCall( + extractArgs({ filePath: input, mode: 'structure', format: 'spatial' }), + client, + ) + + expect(result.isError).toBe(true) + expect(text(result)).toContain('outputPath') + expect(post).not.toHaveBeenCalled() + }) + + it('rejects text mode with spatial output', async () => { + const input = await writeInput() + const { client, post } = mockClient(spatialFixture) + + const result = await performExtractCall( + extractArgs({ filePath: input, mode: 'text', format: 'spatial', outputPath: `out-${counter}.json` }), + client, + ) + + expect(result.isError).toBe(true) + expect(text(result)).toContain('text mode') + expect(post).not.toHaveBeenCalled() + }) + + it('contains an outside-sandbox absolute outputPath within the sandbox', async () => { + const input = await writeInput() + const escape = path.join(os.tmpdir(), `escape-${counter}.json`) + const { client } = mockClient(spatialFixture) + + const result = await performExtractCall( + extractArgs({ filePath: input, mode: 'structure', format: 'spatial', outputPath: escape }), + client, + ) + + // The sandbox re-roots the absolute path inside the sandbox rather than + // writing to the literal location, so nothing escapes. 
+ expect(result.isError).toBeFalsy() + await expect(fs.promises.access(escape)).rejects.toThrow() + }) + + it('returns a clear setup error when the extraction client is not configured', async () => { + const input = await writeInput() + + const result = await performExtractCall( + extractArgs({ filePath: input, mode: 'text', format: 'markdown' }), + undefined, + ) + + expect(result.isError).toBe(true) + expect(text(result)).toContain('NUTRIENT_EXTRACTION_API_KEY') + }) +}) + +describe('performQueryCall', () => { + async function writeFixture(): Promise { + const name = `extraction-${counter}.json` + await fs.promises.writeFile(path.join(sandboxDir, name), JSON.stringify(spatialFixture)) + return name + } + + function queryArgs(overrides: Partial): QueryExtractionArgs { + return { + filePath: overrides.filePath ?? `extraction-${counter}.json`, + pages: overrides.pages, + region: overrides.region, + minConfidence: overrides.minConfidence, + elementTypes: overrides.elementTypes, + limit: overrides.limit ?? 
100, + } + } + + it('filters by minConfidence', async () => { + const file = await writeFixture() + const result = await performQueryCall(queryArgs({ filePath: file, minConfidence: 0.9 })) + + expect(result.isError).toBeFalsy() + const out = text(result) + expect(out).toContain('1 matching element') + expect(out).toContain('Quarterly Report') + expect(out).not.toContain(SECRET) + }) + + it('filters by element type', async () => { + const file = await writeFixture() + const result = await performQueryCall(queryArgs({ filePath: file, elementTypes: ['table'] })) + + expect(text(result)).toContain('1 matching element') + expect(text(result)).toContain('"type": "table"') + }) + + it('filters by page index', async () => { + const file = await writeFixture() + const result = await performQueryCall(queryArgs({ filePath: file, pages: [1] })) + + const out = text(result) + expect(out).toContain('1 matching element') + expect(out).toContain('"pageIndex": 1') + }) + + it('filters by region intersection', async () => { + const file = await writeFixture() + // Region overlapping only the Title element at (100,50,400,35). 
+ const result = await performQueryCall( + queryArgs({ filePath: file, region: { x: 90, y: 40, width: 50, height: 50 } }), + ) + + expect(text(result)).toContain('Quarterly Report') + }) + + it('truncates to limit with guidance', async () => { + const file = await writeFixture() + const result = await performQueryCall(queryArgs({ filePath: file, limit: 1 })) + + const out = text(result) + expect(out).toContain('Showing the first 1 of 3 matches') + expect(out).toContain('Narrow the filters') + }) + + it('errors on a file that is not a spatial extraction result', async () => { + const name = `bad-${counter}.json` + await fs.promises.writeFile(path.join(sandboxDir, name), JSON.stringify({ not: 'an extraction' })) + + const result = await performQueryCall(queryArgs({ filePath: name })) + + expect(result.isError).toBe(true) + expect(text(result)).toContain('output.elements') + }) +}) diff --git a/tests/mcp-tools.test.ts b/tests/mcp-tools.test.ts index 5ca2abe..764af6e 100644 --- a/tests/mcp-tools.test.ts +++ b/tests/mcp-tools.test.ts @@ -37,8 +37,10 @@ describe('MCP tool metadata', () => { expect(Object.keys(tools).sort()).toEqual([ 'ai_redactor', 'check_credits', + 'data_extractor', 'document_processor', 'document_signer', + 'query_extraction', 'sandbox_file_tree', ])