diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..d439009 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,83 @@ +name: Bug report +description: Report a reproducible DocTruth bug +title: "fix: " +labels: + - bug +body: + - type: markdown + attributes: + value: | + Thanks for reporting a bug. Please use synthetic documents and remove secrets, API keys, and customer data. + - type: textarea + id: summary + attributes: + label: Summary + description: What went wrong? + placeholder: DocTruth returned a weak citation for an exact source quote. + validations: + required: true + - type: textarea + id: reproduce + attributes: + label: Reproduction + description: Minimal steps, code, document shape, or fixture needed to reproduce. + placeholder: | + 1. Parse sample.pdf with PdfDocumentParser + 2. Extract Contract.class with withProvenance() + 3. Observe citation matchScore... + validations: + required: true + - type: textarea + id: expected + attributes: + label: Expected behavior + description: What should have happened? + validations: + required: true + - type: textarea + id: actual + attributes: + label: Actual behavior + description: What happened instead? 
+ validations: + required: true + - type: dropdown + id: area + attributes: + label: Area + options: + - PDF parser + - DOCX parser + - XLSX parser + - CSV parser + - Citation matching + - Bounding boxes + - JSON Schema + - Java schema reflection + - Provider integration + - Audit JSON + - CLI + - Documentation + - Other + validations: + required: true + - type: input + id: version + attributes: + label: Version + placeholder: 0.2.0-alpha or commit SHA + validations: + required: true + - type: input + id: java + attributes: + label: Java version + placeholder: java -version + validations: + required: true + - type: textarea + id: logs + attributes: + label: Logs or audit output + description: Paste minimal logs or audit JSON. Redact secrets and source documents. + render: text diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..cd5a325 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,8 @@ +blank_issues_enabled: true +contact_links: + - name: Security vulnerability + url: mailto:security@doctruth.ai + about: Please report security issues privately instead of opening a public issue. + - name: Sensitive document or data issue + url: mailto:security@doctruth.ai + about: Do not attach real customer documents, secrets, or regulated data to public issues. diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 0000000..39f12b3 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,54 @@ +name: Feature request +description: Propose a narrowly-scoped DocTruth capability +title: "feat: " +labels: + - enhancement +body: + - type: markdown + attributes: + value: | + DocTruth accepts features that strengthen source evidence, provenance, confidence, schema validation, or audit export. General agent, RAG, UI, or workflow features are usually out of scope. 
+ - type: textarea + id: problem + attributes: + label: Problem + description: What evidence, extraction, validation, or audit problem does this solve? + validations: + required: true + - type: textarea + id: proposal + attributes: + label: Proposed solution + description: Describe the smallest useful API or behavior change. + validations: + required: true + - type: dropdown + id: area + attributes: + label: Area + options: + - Parser evidence + - Citation matching + - Bounding boxes + - Schema validation + - Provider behavior + - Audit JSON + - CLI + - Documentation + - Other + validations: + required: true + - type: textarea + id: alternatives + attributes: + label: Alternatives considered + description: What did you try instead? + - type: checkboxes + id: scope + attributes: + label: Scope check + options: + - label: This feature needs source evidence, provenance, confidence, schema validation, or audit export semantics. + required: true + - label: This feature is not a general-purpose agent, vector store, chatbot, or UI workflow request. 
+ required: true diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..b5ebfda --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,24 @@ +version: 2 +updates: + - package-ecosystem: maven + directory: / + schedule: + interval: weekly + day: monday + time: "09:00" + timezone: Australia/Sydney + open-pull-requests-limit: 5 + groups: + maven-runtime: + dependency-type: production + maven-test: + dependency-type: development + + - package-ecosystem: github-actions + directory: / + schedule: + interval: weekly + day: monday + time: "09:30" + timezone: Australia/Sydney + open-pull-requests-limit: 5 diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..60bad30 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,27 @@ +## Summary + + + +## Scope + + + +## Verification + +- [ ] `mvn test` +- [ ] `mvn verify` +- [ ] `mvn spotless:check` +- [ ] `mvn checkstyle:check` +- [ ] Documentation updated, if behavior changed + +## Contract Checklist + +- [ ] Public API changes are intentional and called out +- [ ] Source evidence, provenance, confidence, or audit semantics are preserved +- [ ] No new direct dependency without an ADR +- [ ] No real customer documents, secrets, API keys, or personal data included +- [ ] New behavior has a focused test at the closest contract boundary + +## Notes For Reviewers + + diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 37539d4..bd4b7f0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -38,6 +38,21 @@ jobs: - name: Verify (unit + integration + recorded LLM + coverage) run: mvn -B -ntp verify -P recorded + - name: Resolve project version + run: echo "PROJECT_VERSION=$(mvn -q -DforceStdout help:evaluate -Dexpression=project.version)" >> "$GITHUB_ENV" + + - name: Compile quickstart against packaged SDK + run: scripts/compile-quickstart.sh + + - name: Package CLI release artifacts + run: 
scripts/package-cli-release.sh --version "${PROJECT_VERSION}" + + - name: Smoke CLI release tarball + run: scripts/smoke-cli-release.sh --version "${PROJECT_VERSION}" + + - name: Generate SBOM + run: mvn -B -ntp -DskipTests cyclonedx:makeAggregateBom + - name: Upload surefire reports on failure if: failure() uses: actions/upload-artifact@v4 diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml new file mode 100644 index 0000000..1d1003a --- /dev/null +++ b/.github/workflows/dependency-review.yml @@ -0,0 +1,20 @@ +name: Dependency Review + +on: + pull_request: + branches: [main] + +permissions: + contents: read + pull-requests: write + +jobs: + review: + name: dependency vulnerability scan + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/dependency-review-action@v4 + with: + fail-on-severity: high + comment-summary-in-pr: always diff --git a/.github/workflows/javadocs.yml b/.github/workflows/javadocs.yml new file mode 100644 index 0000000..0f16585 --- /dev/null +++ b/.github/workflows/javadocs.yml @@ -0,0 +1,46 @@ +name: Javadocs + +on: + workflow_dispatch: + push: + tags: + - 'v*' + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: pages + cancel-in-progress: false + +jobs: + publish: + name: publish javadocs + runs-on: ubuntu-latest + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + + steps: + - uses: actions/checkout@v4 + + - name: Set up JDK 25 + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: '25' + cache: maven + + - name: Generate Javadocs + run: mvn -B -ntp javadoc:javadoc + + - name: Upload Pages artifact + uses: actions/upload-pages-artifact@v3 + with: + path: target/site/apidocs + + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v4 diff --git a/.github/workflows/nightly-live.yml b/.github/workflows/nightly-live.yml new file mode 100644 index 
0000000..d77f475 --- /dev/null +++ b/.github/workflows/nightly-live.yml @@ -0,0 +1,46 @@ +name: Nightly Live LLM + +on: + workflow_dispatch: + schedule: + - cron: '17 17 * * *' + +permissions: + contents: read + +concurrency: + group: nightly-live-llm + cancel-in-progress: false + +jobs: + live-smoke: + name: live provider contract smoke + runs-on: ubuntu-latest + timeout-minutes: 20 + + steps: + - uses: actions/checkout@v4 + + - name: Set up JDK 25 + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: '25' + cache: maven + + - name: Run bounded live LLM smoke + run: mvn -B -ntp -P live -Dit.test=ExternalLlmSmokeIT failsafe:integration-test failsafe:verify + env: + ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} + GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }} + DEEPSEEK_API_KEY: ${{ secrets.DEEPSEEK_API_KEY }} + + - name: Upload live smoke reports on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: nightly-live-failsafe + path: '**/target/failsafe-reports' + retention-days: 14 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9c00e91..6031ad0 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -6,7 +6,7 @@ on: - 'v*' permissions: - contents: read + contents: write concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -16,6 +16,8 @@ jobs: release: name: release (Sonatype Central Portal) runs-on: ubuntu-latest + env: + HOMEBREW_TAP_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }} steps: - uses: actions/checkout@v4 @@ -35,6 +37,17 @@ jobs: - name: Verify release commit run: mvn -B -ntp spotless:check checkstyle:check verify -P recorded + - name: Package CLI release artifacts + run: scripts/package-cli-release.sh --version "${GITHUB_REF_NAME#v}" + + - name: Smoke CLI release tarball + run: scripts/smoke-cli-release.sh --version "${GITHUB_REF_NAME#v}" + + - 
name: Generate CycloneDX SBOM + run: | + mvn -B -ntp -DskipTests cyclonedx:makeAggregateBom + cp target/bom.json dist/doctruth-${GITHUB_REF_NAME#v}-sbom.cdx.json + - name: Deploy to Sonatype Central Portal run: mvn -B -ntp -P release deploy -DskipTests env: @@ -48,11 +61,40 @@ jobs: with: name: release-artefacts path: | + dist/* + dist/homebrew/* target/*.jar target/*.jar.asc target/*.pom retention-days: 30 + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + files: | + dist/doctruth-*.tar.gz + dist/doctruth-java-*-all.jar + dist/doctruth-*-sbom.cdx.json + dist/checksums.txt + dist/homebrew/doctruth.rb + generate_release_notes: true + + - name: Update Homebrew tap + if: ${{ env.HOMEBREW_TAP_TOKEN != '' }} + env: + RELEASE_VERSION: ${{ github.ref_name }} + run: | + version="${RELEASE_VERSION#v}" + git config --global user.name "doctruth-release-bot" + git config --global user.email "release-bot@doctruth.ai" + git clone "https://x-access-token:${HOMEBREW_TAP_TOKEN}@github.com/doctruthhq/homebrew-tap.git" tap + mkdir -p tap/Formula + cp dist/homebrew/doctruth.rb tap/Formula/doctruth.rb + cd tap + git add Formula/doctruth.rb + git diff --cached --quiet || git commit -m "doctruth ${version}" + git push origin HEAD:main + - name: Upload surefire reports on failure if: failure() uses: actions/upload-artifact@v4 diff --git a/.gitignore b/.gitignore index 94ce627..f4c9fa6 100644 --- a/.gitignore +++ b/.gitignore @@ -60,6 +60,8 @@ fixtures/ # Generated example output (re-runnable via `mvn package -DskipTests` + java) examples/evidence-overlay/output/ +.doctruth/ +dist/ # Local Claude skill state (per-developer) .claude/ diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..0ef47a4 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,52 @@ +# Code of Conduct + +DocTruth follows the Contributor Covenant Code of Conduct, version 2.1. 
+ +## Our Pledge + +We pledge to make participation in this project a harassment-free experience +for everyone, regardless of age, body size, visible or invisible disability, +ethnicity, sex characteristics, gender identity and expression, level of +experience, education, socio-economic status, nationality, personal appearance, +race, caste, color, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to a positive environment: + +- using welcoming and inclusive language +- being respectful of differing viewpoints and experiences +- accepting constructive feedback +- focusing on what is best for the community and project +- showing empathy toward other community members + +Examples of unacceptable behavior: + +- sexualized language or imagery +- trolling, insulting or derogatory comments, and personal attacks +- public or private harassment +- publishing others' private information without explicit permission +- other conduct that could reasonably be considered inappropriate in a + professional setting + +## Enforcement + +Project maintainers may remove, edit, or reject comments, commits, code, wiki +edits, issues, and other contributions that are not aligned with this Code of +Conduct. Maintainers may temporarily or permanently ban contributors for +behavior they deem inappropriate, threatening, offensive, or harmful. + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by emailing: + +```text +conduct@doctruth.ai +``` + +All complaints will be reviewed and investigated promptly and fairly. 
+ +## Attribution + +This Code of Conduct is adapted from the Contributor Covenant, version 2.1: + +https://www.contributor-covenant.org/version/2/1/code_of_conduct/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 10be09f..fc688c7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -33,6 +33,14 @@ Public API compatibility: - `ai.doctruth.internal.*` types are not public API and may change in any release. - Removing a public type, changing a public method signature, or changing record components requires a major version bump. - Internal dependency types must not leak into public method signatures. +- `PublicApiSnapshotTest` snapshots the public SDK surface. If you intentionally change + public API, regenerate it with: + +```bash +mvn -Dtest=ai.doctruth.PublicApiSnapshotTest -Ddoctruth.updatePublicApiSnapshot=true test +``` + +Review `src/test/resources/ai/doctruth/public-api-snapshot.txt` before committing. Scope boundaries: @@ -73,7 +81,9 @@ mvn verify # unit + integration + JaCoCo coverage gate (~10s) mvn spotless:apply # auto-format ``` -`mvn verify` runs the JaCoCo coverage check (line ≥ 85% bundle-wide, excluding `ai.doctruth.internal.providers.*` wire records). Lower the gate only by ADR. +`mvn verify` runs the JaCoCo coverage check (line ≥ 90% and branch ≥ 79% +bundle-wide, excluding `ai.doctruth.internal.providers.*` wire records). Lower +the gate only by ADR. ## How to add a new LLM provider @@ -110,9 +120,10 @@ Before opening a PR, confirm: - [ ] No file exceeds 300 LOC; no method body exceeds 30 LOC - [ ] No new entries in `` without an ADR in the same PR - [ ] Public-API changes flagged in the PR title (e.g. 
`feat!:` or `BREAKING CHANGE:` footer) +- [ ] Public-API snapshot updated for intentional `ai.doctruth.*` / `ai.doctruth.spi.*` changes - [ ] Commit message follows Conventional Commits — `feat:`, `fix:`, `docs:`, `refactor:`, `test:`, `chore:` (one concept per commit, one concept per PR) - [ ] If your change touches `ai.doctruth.*` or `ai.doctruth.spi.*`, the corresponding `*Test` class is updated -- [ ] If your change touches the AU compliance posture, `docs/compliance/au-audit-readiness.md` and `AustralianAuditContractTest` are updated together +- [ ] If your change touches evidence/audit semantics, update `docs/evidence-schema.md`, `docs/error-handling.md`, and the nearest contract test together ## Code review guarantee diff --git a/README.es.md b/README.es.md index 605039d..7901b0d 100644 --- a/README.es.md +++ b/README.es.md @@ -1,4 +1,4 @@ -# DocTruth +# DocTruth - Extracción LLM Auditable para Java

DocTruth source-cited extraction: every extracted field cites a source page and line @@ -16,7 +16,7 @@ [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE) [![Java](https://img.shields.io/badge/Java-25+-007396?logo=openjdk)](https://openjdk.org) -**Extracción LLM auditable para Java.** Analiza documentos, extrae datos estructurados y adjunta citas por campo, confianza y procedencia. +**Extracción LLM auditable para Java.** DocTruth convierte PDF, DOCX, XLSX y CSV en structured output basado en schema, con citas de fuente por campo, bounding boxes opcionales para PDF, confianza, provenance y PROV-O audit JSON. DocTruth existe para responder una pregunta: @@ -24,6 +24,8 @@ DocTruth existe para responder una pregunta: No es un framework de agentes, un framework de chains, un wrapper de bases vectoriales ni una UI. Es una biblioteca Java pequeña para el límite de extracción: documento de entrada, salida estructurada validada y evidencia verificable. +Es framework-agnostic y encaja en plain Java, Spring Boot, LangChain4j, Spring AI, Quarkus, Micronaut o cualquier servicio Java que ya use OpenAI, Anthropic, Gemini, DeepSeek o un endpoint compatible con OpenAI. + ## Instalación Requiere Java 25+. 
Verifica Maven Central: @@ -110,9 +112,9 @@ DocTruth soporta exportaciones comunes de Pydantic v2 JSON Schema, incluyendo `$ Herramienta de migración en build-time: ```bash -java -jar target/doctruth-java-0.2.0-alpha.jar \ +java -jar target/doctruth-java-0.2.0-alpha-all.jar \ migrate pydantic myapp.schemas:ResumeExtraction \ - --out schemas/resume.schema.json \ + -o schemas/resume.schema.json \ --check ``` @@ -134,8 +136,10 @@ Los clientes usan `java.net.http.HttpClient` del JDK; no hay SDKs de proveedores ## CLI ```bash -java -jar target/doctruth-java-0.2.0-alpha.jar parse contract.pdf -java -jar target/doctruth-java-0.2.0-alpha.jar migrate pydantic myapp.schemas:Model --out schema.json --check +mvn package -DskipTests +java -jar target/doctruth-java-0.2.0-alpha-all.jar parse contract.pdf +java -jar target/doctruth-java-0.2.0-alpha-all.jar schema contract.schema.json +java -jar target/doctruth-java-0.2.0-alpha-all.jar extract contract.pdf -s contract.schema.json ``` ## Documentación diff --git a/README.md b/README.md index 3123186..81c6d1f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# DocTruth +# DocTruth - Auditable LLM Extraction for Java

DocTruth source-cited extraction: every extracted field cites a source page and line @@ -16,23 +16,27 @@ [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE) [![Java](https://img.shields.io/badge/Java-25+-007396?logo=openjdk)](https://openjdk.org) -**Auditable LLM extraction for Java.** Parse documents, extract structured data, and attach field-level citations, confidence, and provenance. +**Auditable LLM extraction for Java.** DocTruth turns PDFs, DOCX, XLSX, and CSV files into schema-bound structured output with field-level source citations, optional PDF bounding boxes, confidence scores, provenance, and PROV-O audit JSON. DocTruth is for teams that need to answer one question reliably: > Where did this extracted value come from? -It is not an agent framework, chain framework, vector database wrapper, or UI. It is a small Java library for the extraction boundary: source document in, validated structured output plus evidence trail out. +The core boundary is simple: source document in, validated structured output plus evidence trail out. -## Installation - -Requires Java 25+. Verify Maven Central availability: +It is framework-agnostic and fits into plain Java, Spring Boot, LangChain4j, Spring AI, Quarkus, Micronaut, or any Java service that already calls OpenAI, Anthropic, Gemini, DeepSeek, or an OpenAI-compatible model endpoint. -```bash -mvn dependency:get -Dartifact=ai.doctruth:doctruth-java:0.2.0-alpha +```text +contract.pdf +→ Contract record +→ result.requireCitation("totalValue") +→ source quote + page/line + optional bbox + match score +→ audit JSON ``` -Use in a Maven project: +## Installation + +Requires Java 25+. 
Use in a Maven project: ```xml @@ -50,44 +54,82 @@ Upgrade to the latest release: mvn versions:use-latest-releases -Dincludes=ai.doctruth:doctruth-java -DgenerateBackupPoms=false ``` +If `java` on your shell still points to the macOS Java stub or an older runtime, +set `JAVA_HOME` to a Java 25 installation before running the CLI or examples: + +```bash +export JAVA_HOME=/path/to/jdk-25 +export PATH="$JAVA_HOME/bin:$PATH" +java -version +``` + ## Quick Start ```java import ai.doctruth.DocTruth; -import ai.doctruth.OpenAiProvider; -import ai.doctruth.PdfDocumentParser; import java.math.BigDecimal; import java.nio.file.Path; import java.time.LocalDate; record Contract(String partyA, String partyB, LocalDate effectiveDate, BigDecimal totalValue) {} -var doc = PdfDocumentParser.parse(Path.of("contract.pdf")); - -var result = DocTruth.from(new OpenAiProvider(System.getenv("OPENAI_API_KEY"))) +var result = DocTruth.withOpenAi(System.getenv("OPENAI_API_KEY")) + .fromPdf(Path.of("contract.pdf")) .extract("Extract the contract terms", Contract.class) - .withProvenance() - .withConfidence() - .withBitemporal() - .run(doc); + .withEvidence() + .run(); Contract contract = result.value(); -var partyACitation = result.citations().get("partyA"); +var partyACitation = result.requireCitation("partyA"); + +System.out.println(partyACitation.exactQuote()); +System.out.println(partyACitation.location()); +partyACitation.boundingBox().ifPresent(System.out::println); +result.writeAudit(Path.of("audit.json")); ``` See [`examples/quickstart`](examples/quickstart/) for a runnable example. +`withEvidence()` is the opinionated default for auditable extraction. It enables +field citations, confidence scores, bitemporal provenance, and audit metadata in +one call. Use `result.requireCitation("field")` for required evidence and +`result.findCitation("field")` when missing evidence should be handled manually. 
+ +## CLI For Try / Debug / Inspect + +The CLI is for first-run inspection, parser debugging, schema checks, and CI +smoke tests. Parser and schema inspection do not require an LLM key. + +```bash +mvn package -DskipTests +java -jar target/doctruth-java-0.2.0-alpha-all.jar parse contract.pdf --bboxes +java -jar target/doctruth-java-0.2.0-alpha-all.jar parse contract.pdf --json -o parsed.json +java -jar target/doctruth-java-0.2.0-alpha-all.jar schema contract.schema.json +``` + +See [Install DocTruth CLI](docs/install.md) and [CLI](docs/cli.md). + +Tagged releases publish `doctruth-<version>.tar.gz`, +`doctruth-java-<version>-all.jar`, checksums, and a generated Homebrew formula. +Homebrew install is the intended default once the tap is updated: + +```bash +brew tap doctruthhq/tap +brew install doctruth +doctruth version +``` + ## What It Does

DocTruth capabilities: parse, assemble context, extract with LLM providers, validate schema, attach evidence, and export audit JSON

-- Parses PDF, DOCX, XLSX, and CSV into sections with source locations. +- Parses PDF, DOCX, XLSX, and CSV into sections with source locations; PDF text sections include page-normalized bounding boxes when layout data is available. - Extracts Java records or JSON Schema-bound objects through LLM providers. - Validates structured output locally and retries repairable failures. - Matches extracted fields back to exact source quotes. -- Returns per-field `Citation`, `Confidence`, and `Provenance`. +- Returns per-field `Citation`, including source location and optional PDF bounding box, plus `Confidence` and `Provenance`. - Exports W3C PROV-O JSON-LD audit files with `toAuditJson(...)`. ## Java Schema and JSON Schema Interop @@ -104,32 +146,25 @@ it is omitted from `required`, while the wrapped value type is still reflected i the generated schema. Raw `Object` and unbounded shapes fail fast instead of becoming unauditable catch-all objects. -JSON Schema remains the interoperability path for external schema producers such -as Pydantic. +JSON Schema remains the interoperability path for external schema producers and +template packs. ```java var schema = JsonSchema.from(Path.of("contract.schema.json")); -var result = DocTruth.from(provider) +var result = DocTruth.withProvider(provider) + .fromPdf(Path.of("contract.pdf")) .extractJson("Extract contract terms", schema) .requireCitation("partyA") .requireCitation("totalValue") + .withEvidence() .withMaxRetries(2) - .runJson(doc); + .runJson(); ``` -DocTruth supports common Pydantic v2 JSON Schema exports, including local `$defs` / `$ref`, nullable unions, nested objects, arrays, enums, required fields, scalar constraints, and `additionalProperties=false`. - -Build-time helper: - -```bash -java -jar target/doctruth-java-0.2.0-alpha.jar \ - migrate pydantic myapp.schemas:ResumeExtraction \ - --out schemas/resume.schema.json \ - --check -``` - -Production Java extraction only needs the exported schema file and the DocTruth jar. 
+If a team already owns Pydantic v2 models, export them to JSON Schema at build +time and treat the output as a normal schema file. DocTruth does not import +Python in Java production. ## Providers @@ -144,20 +179,52 @@ OpenAI-compatible chat completions are the primary path because many hosted, gat Provider clients use JDK `java.net.http.HttpClient`; no vendor SDKs are on the classpath. +Common provider setup: + +```java +var client = DocTruth.withProvider(LlmProviders.openAi(System.getenv("OPENAI_API_KEY"))); +var anthropic = DocTruth.withProvider(LlmProviders.anthropic("sk-ant-...")); +var local = DocTruth.withProvider(LlmProviders.openAiCompatible( + "local-key", + URI.create("http://localhost:11434/v1/chat/completions"), + "qwen2.5")); +``` + ## CLI ```bash -java -jar target/doctruth-java-0.2.0-alpha.jar parse contract.pdf -java -jar target/doctruth-java-0.2.0-alpha.jar migrate pydantic myapp.schemas:Model --out schema.json --check +doctruth init +doctruth parse contract.pdf --bboxes +doctruth schema contract.schema.json +doctruth doctor +doctruth extract contract.pdf -s contract.schema.json +doctruth audit .doctruth/runs/<run-id>/audit.json ``` ## Documentation -- [Quickstart example](examples/quickstart/) -- [Pydantic interop example](examples/pydantic-interop/) -- [Architecture](docs/architecture/auditable-structured-extraction-engine.md) -- [Error handling](docs/error-handling.md) -- [Release process](docs/release.md) +- Start here: + - [Quickstart example](examples/quickstart/) + - [No-LLM parse example](examples/no-llm-parse/) + - [Install DocTruth CLI](docs/install.md) + - [CLI](docs/cli.md) + - [Evidence schema](docs/evidence-schema.md) +- Integrate: + - [Java integration guide](docs/java-integration.md) + - [Spring Boot](docs/integrations/spring-boot.md) + - [LangChain4j](docs/integrations/langchain4j.md) + - [JSON Schema](docs/integrations/json-schema.md) + - [Pydantic interop example](examples/pydantic-interop/) for existing Python schema owners +- 
Understand: + - [Parser capability matrix](docs/parser-capability-matrix.md) + - [Architecture](docs/architecture/auditable-structured-extraction-engine.md) + - [Error handling](docs/error-handling.md) + - [OSS PMF gap](docs/oss-pmf-gap.md) + - [Release process](docs/release.md) +- Use cases: + - [Auditable LLM extraction for Java](docs/use-cases/auditable-llm-extraction-java.md) + - [Source citations for LLM output](docs/use-cases/source-citations-for-llm-output.md) + - [PDF extraction with bounding boxes](docs/use-cases/pdf-extraction-with-bounding-boxes.md) - [Contributing](CONTRIBUTING.md) - [Changelog](CHANGELOG.md) @@ -165,7 +232,9 @@ java -jar target/doctruth-java-0.2.0-alpha.jar migrate pydantic myapp.schemas:Mo `0.2.0-alpha` is an early public alpha. The API is usable, tested, and published for feedback, but may still change before `1.0`. -Current verification baseline: 645 unit tests and 16 integration tests passing, with 2 external smoke tests skipped, coverage gates at 90% line / 80% branch, single jar about 205 KB. +Current verification baseline: `mvn verify` passes with 703 unit tests and the +tracked integration suite; optional local corpus tests run when `fixtures/` is +present. Coverage gates are 90% line / 79% branch. ## License diff --git a/README.zh-CN.md b/README.zh-CN.md index 6a8eeca..ab4a200 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -1,4 +1,4 @@ -# DocTruth +# DocTruth - 面向 Java 的可审计 LLM 抽取

DocTruth source-cited extraction: every extracted field cites a source page and line @@ -16,7 +16,7 @@ [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE) [![Java](https://img.shields.io/badge/Java-25+-007396?logo=openjdk)](https://openjdk.org) -**面向 Java 的可审计 LLM 抽取库。** 解析文档、抽取结构化数据,并为每个字段附上来源引用、置信度和 provenance。 +**面向 Java 的可审计 LLM 抽取库。** DocTruth 将 PDF、DOCX、XLSX 和 CSV 转成 schema-bound structured output,并为每个字段附上来源引用、可选 PDF bounding box、置信度、provenance 和 PROV-O audit JSON。 DocTruth 主要回答一个问题: @@ -24,6 +24,8 @@ DocTruth 主要回答一个问题: 它不是 agent 框架、chain 框架、向量数据库封装,也不是 UI。它只专注于抽取边界:输入源文档,输出经过验证的结构化结果和证据链。 +它不绑定框架,可以放进 plain Java、Spring Boot、LangChain4j、Spring AI、Quarkus、Micronaut,或者任何已经在调用 OpenAI、Anthropic、Gemini、DeepSeek 或 OpenAI-compatible endpoint 的 Java 服务。 + ## 安装 需要 Java 25+。验证 Maven Central 可用: @@ -110,9 +112,9 @@ DocTruth 支持常见 Pydantic v2 JSON Schema 输出,包括本地 `$defs` / `$ 构建期迁移工具: ```bash -java -jar target/doctruth-java-0.2.0-alpha.jar \ +java -jar target/doctruth-java-0.2.0-alpha-all.jar \ migrate pydantic myapp.schemas:ResumeExtraction \ - --out schemas/resume.schema.json \ + -o schemas/resume.schema.json \ --check ``` @@ -134,8 +136,10 @@ Provider client 使用 JDK `java.net.http.HttpClient`,不引入 vendor SDK。 ## CLI ```bash -java -jar target/doctruth-java-0.2.0-alpha.jar parse contract.pdf -java -jar target/doctruth-java-0.2.0-alpha.jar migrate pydantic myapp.schemas:Model --out schema.json --check +mvn package -DskipTests +java -jar target/doctruth-java-0.2.0-alpha-all.jar parse contract.pdf +java -jar target/doctruth-java-0.2.0-alpha-all.jar schema contract.schema.json +java -jar target/doctruth-java-0.2.0-alpha-all.jar extract contract.pdf -s contract.schema.json ``` ## 文档 diff --git a/README.zh-TW.md b/README.zh-TW.md index 0439b7c..d493c30 100644 --- a/README.zh-TW.md +++ b/README.zh-TW.md @@ -1,4 +1,4 @@ -# DocTruth +# DocTruth - 面向 Java 的可稽核 LLM 擷取

DocTruth source-cited extraction: every extracted field cites a source page and line @@ -16,7 +16,7 @@ [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE) [![Java](https://img.shields.io/badge/Java-25+-007396?logo=openjdk)](https://openjdk.org) -**面向 Java 的可稽核 LLM 擷取函式庫。** 解析文件、擷取結構化資料,並為每個欄位附上來源引用、信心分數和 provenance。 +**面向 Java 的可稽核 LLM 擷取函式庫。** DocTruth 將 PDF、DOCX、XLSX 和 CSV 轉成 schema-bound structured output,並為每個欄位附上來源引用、可選 PDF bounding box、信心分數、provenance 和 PROV-O audit JSON。 DocTruth 主要回答一個問題: @@ -24,6 +24,8 @@ DocTruth 主要回答一個問題: 它不是 agent 框架、chain 框架、向量資料庫封裝,也不是 UI。它只專注於擷取邊界:輸入來源文件,輸出已驗證的結構化結果和證據鏈。 +它不綁定框架,可以放進 plain Java、Spring Boot、LangChain4j、Spring AI、Quarkus、Micronaut,或任何已經在呼叫 OpenAI、Anthropic、Gemini、DeepSeek 或 OpenAI-compatible endpoint 的 Java 服務。 + ## 安裝 需要 Java 25+。驗證 Maven Central 可用: @@ -110,9 +112,9 @@ DocTruth 支援常見 Pydantic v2 JSON Schema 輸出,包括本地 `$defs` / `$ 建置期遷移工具: ```bash -java -jar target/doctruth-java-0.2.0-alpha.jar \ +java -jar target/doctruth-java-0.2.0-alpha-all.jar \ migrate pydantic myapp.schemas:ResumeExtraction \ - --out schemas/resume.schema.json \ + -o schemas/resume.schema.json \ --check ``` @@ -134,8 +136,10 @@ Provider client 使用 JDK `java.net.http.HttpClient`,不引入 vendor SDK。 ## CLI ```bash -java -jar target/doctruth-java-0.2.0-alpha.jar parse contract.pdf -java -jar target/doctruth-java-0.2.0-alpha.jar migrate pydantic myapp.schemas:Model --out schema.json --check +mvn package -DskipTests +java -jar target/doctruth-java-0.2.0-alpha-all.jar parse contract.pdf +java -jar target/doctruth-java-0.2.0-alpha-all.jar schema contract.schema.json +java -jar target/doctruth-java-0.2.0-alpha-all.jar extract contract.pdf -s contract.schema.json ``` ## 文件 diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..ac3b545 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,55 @@ +# Security Policy + +DocTruth is an OSS Java library for auditable document extraction. 
Security +reports are welcome and should be handled privately before public disclosure. + +## Supported Versions + +| Version | Supported | +| --- | --- | +| `0.2.x-alpha` | Security fixes accepted on `main` | +| Older alpha releases | Best effort | + +The project is pre-`1.0`, so public APIs may still change. Security fixes take +priority over compatibility when needed. + +## Reporting A Vulnerability + +Please do not open a public GitHub issue for suspected vulnerabilities. + +Report security concerns by emailing: + +```text +security@doctruth.ai +``` + +Include: + +- affected version or commit +- environment details +- reproduction steps +- expected impact +- whether the issue involves secrets, source documents, audit JSON, provider + calls, or generated artifacts + +We aim to acknowledge valid reports within 5 business days. + +## Sensitive Data + +Do not attach real customer documents, secrets, API keys, credentials, or +regulated data to public issues, discussions, or pull requests. Use minimal +synthetic fixtures whenever possible. + +## Scope + +Security-sensitive areas include: + +- parser behavior for untrusted PDF, DOCX, XLSX, and CSV files +- provider request and response handling +- prompt, source text, and audit JSON logging +- citation and provenance integrity +- dependency vulnerabilities +- CLI handling of local files and output paths + +General bugs that do not affect confidentiality, integrity, availability, or +evidence correctness can be filed as normal GitHub issues. 
diff --git a/docs/architecture/auditable-structured-extraction-engine.md b/docs/architecture/auditable-structured-extraction-engine.md index aafad9d..0798d13 100644 --- a/docs/architecture/auditable-structured-extraction-engine.md +++ b/docs/architecture/auditable-structured-extraction-engine.md @@ -158,12 +158,24 @@ The result should expose: ## Target public API shape -The API should stay fluent and Java-native: +The primary API should stay fluent, Java-native, and document-first: ```java record Contract(String partyA, String partyB, BigDecimal totalValue) {} -var result = DocTruth.from(provider) +var result = DocTruth.withProvider(provider) + .fromPdf(Path.of("contract.pdf")) + .extract("Extract contract terms", Contract.class) + .withEvidence() + .run(); +``` + +Advanced validation keeps the same mental model, with extra constraints added +before `run()`: + +```java +var result = DocTruth.withProvider(provider) + .fromPdf(Path.of("contract.pdf")) .extract("Extract contract terms", Contract.class) .withFieldConstraint( "totalValue", @@ -173,22 +185,24 @@ var result = DocTruth.from(provider) .withObjectConstraint( contract -> !contract.partyA().equals(contract.partyB()), "partyA and partyB must differ") - .withProvenance() - .withConfidence() + .withEvidence() .withMaxRetries(2) - .run(doc); + .run(); ``` -JSON Schema entry points preserve the same semantics while avoiding Java -overload ambiguity with `extract("prompt", null)`: +JSON Schema is the advanced interoperability path for teams that own schemas +outside Java. 
It uses the same document-first flow while preserving validation, +repair, evidence, provenance, and audit semantics: ```java -var result = DocTruth.from(provider) +var result = DocTruth.withProvider(provider) + .fromPdf(Path.of("contract.pdf")) .extractJson("Extract contract terms", JsonSchema.from(schemaPath)) .requireCitation("partyA") .requireCitation("totalValue") + .withEvidence() .withMaxRetries(2) - .runJson(doc); + .runJson(); ``` The schema source changes; validation, repair, evidence gating, provenance, and @@ -226,7 +240,7 @@ runtime validators, serializers, computed fields, or Python plugin ecosystem. A future migration CLI can make the build-time export path easier, for example: ```bash -doctruth migrate pydantic myapp.schemas:Resume --out resume.schema.json --check +doctruth migrate pydantic myapp.schemas:Resume -o resume.schema.json --check ``` That tool should remain outside the runtime core: it may invoke Python during diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 0000000..01e232f --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,193 @@ +# CLI + +DocTruth CLI is the try/debug/inspect entry point. The primary integration path +is the Java SDK (`DocTruth.withOpenAi(...).fromPdf(...).extract(...).run()`), +while the CLI is optimized for first-run evidence inspection: parse without an +LLM key, check schemas directly, and write extraction outputs into a run +directory. + +Build the standalone CLI jar: + +```bash +mvn package -DskipTests +``` + +Run it: + +```bash +java -jar target/doctruth-java-0.2.0-alpha-all.jar --help +``` + +Install a local launcher: + +```bash +scripts/install-cli.sh --prefix "$HOME/.local" +export PATH="$HOME/.local/bin:$PATH" +doctruth version +``` + +See [Install DocTruth CLI](install.md) for the install path. 
+ +## Commands + +### Initialize + +```bash +doctruth init +``` + +Creates: + +```text +doctruth.yml +schemas/ +.doctruth/runs/ +``` + +`doctruth.yml` stores defaults for provider, model, and output directory. + +### Parse + +No provider key required: + +```bash +doctruth parse contract.pdf +``` + +Prints a summary: + +```text +contract.pdf +pages: 3 +sections: 42 +text: 38 +tables: 2 +figures: 0 +bbox coverage: 31/38 +``` + +Write parsed sections as JSON: + +```bash +doctruth parse contract.pdf --json -o parsed.json +``` + +Show that bbox recovery is enabled in the summary: + +```bash +doctruth parse contract.pdf --bboxes +``` + +### Schema + +Check a JSON Schema: + +```bash +doctruth schema contract.schema.json +``` + +Machine-readable summary: + +```bash +doctruth schema contract.schema.json --json +``` + +### Extract + +Default extraction: + +```bash +doctruth extract contract.pdf -s contract.schema.json +``` + +By default, DocTruth: + +- reads provider/model/output defaults from `doctruth.yml` when present +- uses `openai` as the default provider +- requires citations for top-level schema fields +- writes `result.json` and `audit.json` to `.doctruth/runs/<run-id>/` + +Common overrides: + +```bash +doctruth extract contract.pdf -s contract.schema.json -o out/ +doctruth extract contract.pdf -s contract.schema.json --provider anthropic +doctruth extract contract.pdf -s contract.schema.json --model gpt-4o-mini +doctruth extract contract.pdf -s contract.schema.json --base-url http://localhost:11434/v1 +doctruth extract contract.pdf -s contract.schema.json --allow-uncited +doctruth extract contract.pdf -s contract.schema.json --require partyA,totalValue +``` + +Provider keys: + +| Provider | Env var | +| --- | --- | +| `openai` | `OPENAI_API_KEY` | +| `anthropic` | `ANTHROPIC_API_KEY` | +| `gemini` | `GOOGLE_API_KEY` | +| `deepseek` | `DEEPSEEK_API_KEY` | + +### Audit + +Read an audit JSON file: + +```bash +doctruth audit .doctruth/runs/run_abc/audit.json +``` +
+Machine-readable summary: + +```bash +doctruth audit .doctruth/runs/run_abc/audit.json --json +``` + +### Doctor + +Check the local runtime, project config, output directory, and provider-key +readiness: + +```bash +doctruth doctor +doctruth doctor --json +``` + +`doctor` does not call an LLM. It is safe to run before configuring extraction. + +### Completion + +Generate shell completion: + +```bash +doctruth completion bash > ~/.local/share/bash-completion/completions/doctruth +doctruth completion zsh > "${fpath[1]}/_doctruth" +doctruth completion fish > ~/.config/fish/completions/doctruth.fish +``` + +### Version + +```bash +doctruth version +doctruth --version +``` + +## Advanced: Pydantic Schema Migration + +This is not the primary path. Use it only when a team already owns Pydantic v2 +models and wants to export JSON Schema at build time. + +Export a Pydantic v2 model to JSON Schema: + +```bash +doctruth migrate pydantic myapp.schemas:Resume -o resume.schema.json --check +``` + +This command may invoke Python during migration. Runtime Java extraction only +uses the exported schema file. + +## Exit Codes + +| Code | Meaning | +| --- | --- | +| `0` | Command succeeded | +| `1` | Runtime failure, parse failure, provider failure, or schema compatibility failure | +| `2` | Invalid CLI usage | diff --git a/docs/evidence-schema.md b/docs/evidence-schema.md new file mode 100644 index 0000000..d22ed93 --- /dev/null +++ b/docs/evidence-schema.md @@ -0,0 +1,269 @@ +# Evidence Schema + +DocTruth's core output is not just a typed value. It is a typed value plus the +evidence needed to defend every extracted field. + +The evidence model has four layers: + +```text +ParsedDocument +→ SourceLocation / BoundingBox +→ Citation +→ ExtractionResult / audit JSON +``` + +## Parsed Document + +`ParsedDocument` is the normalized document shape used by extraction and +citation matching. 
+ +```java +public record ParsedDocument( + String docId, + List<ParsedSection> sections, + DocumentMetadata metadata) {} +``` + +It is intentionally small. It does not try to be a PDF object model, a DOM, or a +layout engine. It gives DocTruth enough stable source text to assemble context +and enough location data to prove where an extracted field came from. + +## Parsed Sections + +`ParsedSection` is a sealed family: + +```java +public sealed interface ParsedSection permits TextSection, TableSection, FigureSection {} +``` + +Current section types: + +| Type | Purpose | +| --- | --- | +| `TextSection` | Text block with source location, block kind, and optional PDF bbox | +| `TableSection` | Logical table rows with source location | +| `FigureSection` | Figure caption with source location | + +## Source Location + +`SourceLocation` is the text anchor: + +```java +public record SourceLocation( + int pageStart, + int pageEnd, + int lineStart, + int lineEnd, + int charOffset) {} +``` + +Semantics: + +| Field | Meaning | +| --- | --- | +| `pageStart` / `pageEnd` | 1-indexed source page range | +| `lineStart` / `lineEnd` | 1-indexed logical line range inside the parsed document | +| `charOffset` | Character offset inside the rendered page text | + +For non-paginated formats such as CSV, DocTruth maps logical sheets/rows into +page and line anchors so the downstream contract stays consistent. + +## Bounding Box + +`BoundingBox` is the visual anchor: + +```java +public record BoundingBox(double x0, double y0, double x1, double y1) {} +``` + +Semantics: + +| Field | Meaning | +| --- | --- | +| `x0` | Left edge | +| `y0` | Top edge | +| `x1` | Right edge | +| `y1` | Bottom edge | + +Rules: + +- Coordinates are page-normalized to `0..1000`. +- Origin is top-left. +- `x1 > x0` and `y1 > y0`. +- Values are independent of PDF page size and render DPI. +- Bounding boxes are optional because not every source format has reliable page geometry.
+ +Example: + +```java +var bbox = new BoundingBox(72.4, 118.0, 380.7, 142.5); +``` + +This means the evidence region starts about 7.2% from the left edge and 11.8% +from the top edge of the page. + +## Text Section With Bbox + +PDF-originated text sections can carry a bounding box: + +```java +public record TextSection( + String text, + SourceLocation location, + BlockKind kind, + Optional<BoundingBox> boundingBox) implements ParsedSection {} +``` + +`BlockKind` is a coarse layout hint: + +```java +public enum BlockKind { + HEADING, + BODY, + LIST, + OTHER +} +``` + +Use `TextSection.boundingBox()` when building source overlays or reviewer +highlight UIs. Use `TextSection.location()` when storing durable text anchors. + +## Citation + +`Citation` is the field-level evidence anchor: + +```java +public record Citation( + SourceLocation location, + String exactQuote, + double matchScore, + Optional<BoundingBox> boundingBox) {} +``` + +Semantics: + +| Field | Meaning | +| --- | --- | +| `location` | Source text location for the matched field | +| `exactQuote` | Source quote used as evidence | +| `matchScore` | Similarity score in `[0.0, 1.0]` | +| `boundingBox` | Optional visual region for PDF-originated evidence | + +`matchScore == 1.0` means exact substring match. Lower scores come from fuzzy +matching and should be treated as warnings by downstream systems. The default +strong-citation threshold is `0.85`.
+ +## Extraction Result + +`ExtractionResult` is the main return object: + +```java +public record ExtractionResult( + T value, + Map citations, + Map confidence, + Provenance provenance) {} +``` + +Field paths use Java/JSON-style names: + +```text +partyA +totalValue +lineItems[0].amount +members[1].address.city +``` + +Example: + +```java +var result = DocTruth.withProvider(provider) + .fromPdf(Path.of("contract.pdf")) + .extract("Extract contract terms", Contract.class) + .withEvidence() + .run(); + +var citation = result.requireCitation("totalValue"); + +System.out.println(citation.exactQuote()); +System.out.println(citation.location().pageStart()); +citation.boundingBox().ifPresent(System.out::println); +``` + +## Audit JSON + +`ExtractionResult.toAuditJson()` exports a W3C PROV-O compatible JSON-LD +document with DocTruth-specific evidence fields. + +Compact shape: + +```json +{ + "@context": "https://www.w3.org/ns/prov", + "@type": "prov:Entity", + "doctruth:value": { + "partyA": "Acme Industrial Materials Pty Ltd", + "totalValue": 2450000 + }, + "doctruth:retries": 0, + "prov:wasGeneratedBy": { + "@type": "prov:Activity", + "prov:startedAtTime": "2026-05-07T05:30:14.218Z", + "prov:wasAssociatedWith": { + "@type": "prov:SoftwareAgent", + "rdfs:label": "openai", + "prov:version": "gpt-4o" + } + }, + "prov:wasDerivedFrom": [ + { + "@type": "prov:Entity", + "doctruth:fieldPath": "partyA", + "prov:value": "Acme Industrial Materials Pty Ltd", + "doctruth:matchScore": 1.0, + "doctruth:sourceLocation": { + "pageStart": 1, + "pageEnd": 1, + "lineStart": 2, + "lineEnd": 2, + "charOffset": 31 + }, + "doctruth:boundingBox": { + "x0": 72.4, + "y0": 118.0, + "x1": 380.7, + "y1": 142.5 + } + } + ], + "doctruth:confidence": { + "partyA": { + "score": 1.0, + "rationale": "exact substring match" + } + } +} +``` + +`doctruth:boundingBox` is emitted only when a citation has visual geometry. 
+ +## Design Rules + +DocTruth evidence should stay: + +- **Source-grounded**: every citation points at source text, not model rationale. +- **Portable**: audit JSON can be stored or handed to another system. +- **Optional where honest**: bbox is absent when source geometry is unavailable. +- **Warning-friendly**: weak citation matches are surfaced, not silently dropped. +- **Java-native**: records and maps expose the contract without framework types. + +## Common Edge Cases + +| Case | Expected Behavior | +| --- | --- | +| Exact quote found | `matchScore == 1.0`, citation carries the section location and bbox if present | +| Fuzzy quote found | Citation is returned with lower `matchScore`; callers can warn, retry, or review | +| No source text match | Citation is returned with `matchScore == 0.0` so the failure is explicit | +| Source has no bbox | `boundingBox()` is `Optional.empty()` | +| Multi-page field | Current citation points to the best matching section; richer span sets are future work | +| Table values | Citation uses table source location; cell-level bbox is future work | diff --git a/docs/homebrew.md b/docs/homebrew.md new file mode 100644 index 0000000..c059e3b --- /dev/null +++ b/docs/homebrew.md @@ -0,0 +1,62 @@ +# Homebrew Distribution + +Homebrew is the preferred CLI distribution path for macOS and many Java +developers. The DocTruth release workflow generates a ready-to-copy formula at: + +```text +dist/homebrew/doctruth.rb +``` + +The formula downloads the release tarball: + +```text +https://github.com/doctruthhq/DocTruth/releases/download/v<version>/doctruth-<version>.tar.gz +``` + +## Maintainer Flow + +1. Cut a GitHub release tag, for example `v0.2.0-alpha`. +2. Wait for the `Release` workflow to finish. +3. If `HOMEBREW_TAP_TOKEN` is configured, the workflow pushes the generated + formula to `doctruthhq/homebrew-tap` automatically. +4.
Otherwise, download `doctruth.rb` from the GitHub Release assets or workflow + artifact and copy it into the tap repository: + +```text +doctruthhq/homebrew-tap/Formula/doctruth.rb +``` + +5. Commit and push the tap change. + +Users can then install: + +```bash +brew tap doctruthhq/tap +brew install doctruth +doctruth version +doctruth doctor +``` + +## Local Formula Generation + +Build and package locally: + +```bash +mvn package -DskipTests +scripts/package-cli-release.sh +``` + +Smoke the generated tarball: + +```bash +mkdir -p /tmp/doctruth-release-smoke +tar -xzf dist/doctruth-0.2.0-alpha.tar.gz -C /tmp/doctruth-release-smoke +JAVA=/path/to/java /tmp/doctruth-release-smoke/doctruth-0.2.0-alpha/bin/doctruth version +``` + +## Why The Formula Is Not Committed As A Live Formula Here + +The working formula belongs in a separate Homebrew tap repository. Keeping the +generated formula in release artifacts avoids a stale checksum in this source +repository. Every release tarball has a different SHA-256, so the formula must be +generated from the final artifact. diff --git a/docs/install.md b/docs/install.md new file mode 100644 index 0000000..a2def5f --- /dev/null +++ b/docs/install.md @@ -0,0 +1,148 @@ +# Install DocTruth CLI + +The Java SDK is the primary production integration path. The CLI is the +try/debug/inspect path: it lets a Java team verify the core promise before +writing integration code: + +```text +document -> parsed sections with source locations -> schema check -> audit output +``` + +## SDK Install + +Use the SDK when adding DocTruth to an application: + +```xml +<dependency> + <groupId>ai.doctruth</groupId> + <artifactId>doctruth-java</artifactId> + <version>0.2.0-alpha</version> +</dependency> +``` + +Minimal application flow: + +```java +var result = DocTruth.withOpenAi(System.getenv("OPENAI_API_KEY")) + .fromPdf(Path.of("contract.pdf")) + .extract("Extract contract terms", Contract.class) + .withEvidence() + .run(); +``` + +## CLI From Source + +Requires Java 25+ and Maven.
+ +Build the standalone jar: + +```bash +mvn package -DskipTests +``` + +Run it directly: + +```bash +java -jar target/doctruth-java-0.2.0-alpha-all.jar --help +``` + +Install a `doctruth` launcher: + +```bash +scripts/install-cli.sh --prefix "$HOME/.local" +``` + +Make sure the install prefix is on your path: + +```bash +export PATH="$HOME/.local/bin:$PATH" +``` + +Check the install: + +```bash +doctruth version +doctruth doctor +doctruth parse fixtures/pdf/ResumeAFIQDANISH.pdf --bboxes +``` + +If `java` is not on `PATH`, point the launcher at your Java 25 runtime: + +```bash +JAVA=/path/to/java doctruth version +``` + +On macOS, `/usr/bin/java` may be a stub even when Maven can find a Homebrew JDK. +In that case set `JAVA_HOME` and prepend it to `PATH`: + +```bash +export JAVA_HOME=/opt/homebrew/opt/openjdk/libexec/openjdk.jdk/Contents/Home +export PATH="$JAVA_HOME/bin:$PATH" +java -version +``` + +## No-LLM First Run + +No provider key is required for parser and schema inspection: + +```bash +doctruth parse contract.pdf --bboxes +doctruth parse contract.pdf --json -o parsed.json +doctruth schema contract.schema.json +``` + +This is the recommended first-run path. It proves the document evidence surface +before a user spends time configuring model keys. + +## Extraction Run + +Extraction requires a provider key: + +```bash +export OPENAI_API_KEY=... +doctruth extract contract.pdf -s contract.schema.json +doctruth audit .doctruth/runs/<run-id>/audit.json +``` + +Use `--provider`, `--model`, and `--base-url` only when the defaults are not +enough. + +The CLI and SDK use the same parser, citation, provenance, and audit primitives.
+ +## GitHub Release Artifacts + +Tagged releases attach CLI artifacts: + +```text +doctruth-<version>.tar.gz +doctruth-java-<version>-all.jar +checksums.txt +doctruth.rb +``` + +Use the tarball when you want a `bin/doctruth` launcher plus the bundled jar: + +```bash +tar -xzf doctruth-0.2.0-alpha.tar.gz +JAVA=/path/to/java ./doctruth-0.2.0-alpha/bin/doctruth version +``` + +Use the all-jar when you want the simplest direct invocation: + +```bash +java -jar doctruth-java-0.2.0-alpha-all.jar version +``` + +## Homebrew + +The release workflow generates a formula for the `doctruthhq/homebrew-tap` +repository. Once the tap is updated: + +```bash +brew tap doctruthhq/tap +brew install doctruth +doctruth version +doctruth doctor +``` + +See [Homebrew Distribution](homebrew.md) for maintainer details. diff --git a/docs/integrations/json-schema.md b/docs/integrations/json-schema.md new file mode 100644 index 0000000..f53c9e7 --- /dev/null +++ b/docs/integrations/json-schema.md @@ -0,0 +1,72 @@ +# JSON Schema Integration + +DocTruth supports Java records and classes as the native schema path. It also +accepts caller-supplied JSON Schema for teams that define extraction contracts +outside Java. + +For normal Java application code, prefer the SDK-first record path: +`DocTruth.withOpenAi(...).fromPdf(...).extract(...).withEvidence().run()`. +Use JSON Schema when the schema is owned outside Java or must stay +language-neutral. + +## Load A Schema + +```java +var schema = JsonSchema.from(Path.of("contract.schema.json")); + +var result = DocTruth.withProvider(provider) + .fromPdf(Path.of("contract.pdf")) + .extractJson("Extract contract terms", schema) + .requireCitation("partyA") + .requireCitation("totalValue") + .withEvidence() + .runJson(); +``` + +The returned value is a Jackson `JsonNode`, while citations, confidence, and +provenance stay in the normal `ExtractionResult` contract.
+ +## Common Schema Sources + +JSON Schema is useful when: + +- another service already owns the schema +- a Pydantic model exports the contract at build time +- a document automation team wants language-neutral templates +- the extraction target changes more often than Java code + +DocTruth supports common Pydantic v2 JSON Schema exports, including local +`$defs` / `$ref`, nullable unions, nested objects, arrays, enums, required +fields, scalar constraints, and `additionalProperties=false`. + +## Citation Requirements + +Use `requireCitation(...)` for fields that must not enter the result without +source evidence: + +```java +var result = DocTruth.withProvider(provider) + .fromPdf(Path.of("invoice.pdf")) + .extractJson("Extract invoice fields", schema) + .requireCitation("invoiceNumber") + .requireCitation("totalAmount") + .withMaxRetries(2) + .runJson(); +``` + +This keeps schema validation and source evidence in the same extraction run. + +## Build-Time Pydantic Export + +If schemas start in Python, export JSON Schema at build time and feed the schema +file to the Java runtime: + +```bash +java -jar target/doctruth-java-0.2.0-alpha-all.jar \ + migrate pydantic myapp.schemas:ResumeExtraction \ + -o schemas/resume.schema.json \ + --check +``` + +DocTruth does not import Python in Java production. Pydantic compatibility means +JSON Schema interoperability. diff --git a/docs/integrations/langchain4j.md b/docs/integrations/langchain4j.md new file mode 100644 index 0000000..c3eaa23 --- /dev/null +++ b/docs/integrations/langchain4j.md @@ -0,0 +1,60 @@ +# LangChain4j Integration + +DocTruth is not a LangChain4j replacement. Use LangChain4j for orchestration, +tools, retrieval, memory, and agent-style workflows. Use DocTruth when a +document field must become auditable structured data. 
+ +## Recommended Split + +| Responsibility | Owner | +| --- | --- | +| Workflow orchestration | LangChain4j | +| Tools and retrieval | LangChain4j or application code | +| Document parsing to source-located sections | DocTruth | +| Schema-bound extraction | DocTruth | +| Field citation matching | DocTruth | +| PROV-O audit JSON | DocTruth | + +## Flow + +```text +LangChain4j workflow chooses document and schema +→ application calls DocTruth +→ DocTruth returns typed value + citations + confidence + provenance +→ workflow uses verified result downstream +``` + +The key is to call DocTruth where the field is created. Avoid generating +unaudited model fields first and asking DocTruth to reconstruct evidence later. + +## Example Boundary + +```java +public final class VerifiedExtractionTool { + + private final LlmProvider provider; + + public VerifiedExtractionTool(LlmProvider provider) { + this.provider = provider; + } + + public ExtractionResult extractContract(Path pdf) + throws ParseException, ExtractionException { + return DocTruth.withProvider(provider) + .fromPdf(pdf) + .extract("Extract contract terms", Contract.class) + .withEvidence() + .run(); + } +} +``` + +LangChain4j can call this boundary as a normal application tool, but the +evidence contract stays owned by DocTruth. + +## Why This Matters + +General agent frameworks are good at deciding what to do next. They usually do +not enforce that every structured field can cite a source page, source quote, +match score, model version, and extraction timestamp. DocTruth provides that +evidence-gated extraction boundary for Java applications. diff --git a/docs/integrations/spring-boot.md b/docs/integrations/spring-boot.md new file mode 100644 index 0000000..a5df665 --- /dev/null +++ b/docs/integrations/spring-boot.md @@ -0,0 +1,82 @@ +# Spring Boot Integration + +DocTruth is framework-agnostic, but it fits naturally into a Spring Boot +service. 
Keep DocTruth inside an application service boundary: controllers +should handle transport, services should call DocTruth, and repositories should +store the typed result plus audit JSON. + +## Service Shape + +```java +import ai.doctruth.DocTruth; +import ai.doctruth.ExtractionException; +import ai.doctruth.ExtractionResult; +import ai.doctruth.LlmProvider; +import ai.doctruth.ParseException; +import java.nio.file.Path; +import org.springframework.stereotype.Service; + +@Service +public final class ContractExtractionService { + + private final LlmProvider provider; + + public ContractExtractionService(LlmProvider provider) { + this.provider = provider; + } + + public ExtractionResult extract(Path pdf) + throws ParseException, ExtractionException { + return DocTruth.withProvider(provider) + .fromPdf(pdf) + .extract("Extract contract terms", Contract.class) + .withEvidence() + .run(); + } +} +``` + +## Provider Bean + +```java +import ai.doctruth.LlmProvider; +import ai.doctruth.LlmProviders; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +@Configuration +class DocTruthConfig { + + @Bean + LlmProvider llmProvider() { + return LlmProviders.openAi(System.getenv("OPENAI_API_KEY")); + } +} +``` + +## Storage Pattern + +Store the structured value and audit artifact together: + +| Artifact | Suggested Storage | +| --- | --- | +| Source file | Object storage | +| Extracted Java value | Application database | +| `result.toAuditJson()` | Audit table, object storage, or immutable log | +| Field review state | Application database | + +The application should assign a durable run id that links the source document, +the extracted value, and the audit JSON. 
+ +## Error Handling + +Recommended mapping: + +| Exception | Application Meaning | +| --- | --- | +| `ParseException` | Document intake failed | +| `ProviderException` | Model provider or network failure | +| `ExtractionException` | Validation, retry, or citation requirement failed | + +Do not log full prompts or source documents unless your deployment has explicit +approval to store that data. diff --git a/docs/java-integration.md b/docs/java-integration.md new file mode 100644 index 0000000..2a11d86 --- /dev/null +++ b/docs/java-integration.md @@ -0,0 +1,259 @@ +# Java Integration Guide + +DocTruth is designed as a Java backend primitive. It should fit into an existing +service without forcing a framework, agent runtime, or vendor SDK onto the +application. + +## Integration Model + +The normal integration path is: + +```text +source document +→ DocTruth parser +→ ParsedDocument +→ DocTruth extraction +→ typed value + citations + confidence + provenance +→ caller's business system +``` + +DocTruth owns the extraction evidence boundary. The application still owns +authentication, storage, queues, review workflow, and business-specific policy. + +## Plain Java + +Use plain Java when you want the smallest possible integration surface. + +```java +import ai.doctruth.DocTruth; +import ai.doctruth.ExtractionResult; +import java.math.BigDecimal; +import java.nio.file.Path; +import java.time.LocalDate; + +record Contract(String partyA, String partyB, LocalDate effectiveDate, BigDecimal totalValue) {} + +ExtractionResult result = DocTruth.withOpenAi(System.getenv("OPENAI_API_KEY")) + .fromPdf(Path.of("contract.pdf")) + .extract("Extract contract terms", Contract.class) + .withEvidence() + .run(); + +var value = result.value(); +var citation = result.requireCitation("totalValue"); +result.writeAudit(Path.of("audit.json")); +``` + +The returned value is ordinary Java. The evidence map is ordinary Java. No +framework type is required. 
+ +## Spring Boot + +DocTruth does not depend on Spring, but it works naturally inside Spring +services. + +Example service shape: + +```java +import ai.doctruth.DocTruth; +import ai.doctruth.ExtractionException; +import ai.doctruth.ExtractionResult; +import ai.doctruth.LlmProvider; +import ai.doctruth.ParseException; +import java.nio.file.Path; +import org.springframework.stereotype.Service; + +@Service +public final class ContractExtractionService { + + private final LlmProvider provider; + + public ContractExtractionService(LlmProvider provider) { + this.provider = provider; + } + + public ExtractionResult extract(Path pdf) + throws ParseException, ExtractionException { + return DocTruth.withProvider(provider) + .fromPdf(pdf) + .extract("Extract contract terms", Contract.class) + .withEvidence() + .run(); + } +} +``` + +Provider configuration can stay in the application: + +```java +import ai.doctruth.LlmProvider; +import ai.doctruth.LlmProviders; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +@Configuration +class DocTruthConfig { + + @Bean + LlmProvider llmProvider() { + return LlmProviders.openAi(System.getenv("OPENAI_API_KEY")); + } +} +``` + +Recommended Spring boundary: + +- Controller receives or locates the document. +- Service calls DocTruth. +- Repository stores `result.value()` and `result.toAuditJson()`. +- Review UI reads `result.citations()` or the stored audit JSON. + +## Quarkus / Micronaut + +The same pattern applies in Quarkus, Micronaut, Helidon, or plain Jakarta +services: + +- create one provider bean / singleton +- call `DocTruth.withProvider(provider).fromPdf(...)` inside an application service +- store typed value and audit JSON separately + +Keep DocTruth inside a service boundary rather than scattering extraction calls +across controllers and jobs. + +## LangChain4j Interop + +DocTruth is not a LangChain4j replacement. 
Use LangChain4j for broader +orchestration if you already use it, and use DocTruth at the evidence-gated +extraction boundary. + +Recommended split: + +| Responsibility | Owner | +| --- | --- | +| Agent orchestration | LangChain4j | +| Retrieval / tools / memory | LangChain4j or application | +| Document parsing to source-located sections | DocTruth | +| Schema-bound extraction | DocTruth | +| Field citation matching | DocTruth | +| Audit JSON | DocTruth | + +Typical flow: + +```text +LangChain4j workflow chooses document and schema +→ application calls DocTruth extraction +→ DocTruth returns auditable structured result +→ LangChain4j workflow uses the verified result downstream +``` + +Avoid passing unaudited model-generated fields into a system of record and then +asking DocTruth to reconstruct evidence later. Call DocTruth at the point where +the field is created. + +## Spring AI Interop + +DocTruth does not depend on Spring AI. This keeps the core library usable in +non-Spring and regulated environments. + +Use Spring AI for application-level model interactions when useful. Use +DocTruth when the output must become auditable structured data. + +Recommended split: + +| Responsibility | Owner | +| --- | --- | +| General chat / assistants | Spring AI | +| Embeddings / vector store wiring | Spring AI or application | +| Evidence-backed document extraction | DocTruth | +| Citation and provenance output | DocTruth | + +If an application already uses Spring AI model clients, keep that wiring at the +application boundary. DocTruth's built-in providers remain the evidence +extraction path until a dedicated provider adapter is added. + +## Batch Jobs + +For batch extraction, keep each document run independently auditable: + +```text +for each document: + parse + extract + write typed result + write audit JSON + record run id / source id / schema version +``` + +Do not merge audit JSON across documents unless a higher-level workflow creates +an explicit bundle. 
Single-run evidence should remain inspectable on its own. + +## Storing Results + +A practical storage layout: + +| Artifact | Suggested Storage | +| --- | --- | +| Source file | Object storage or document store | +| Typed value | Application database | +| `ExtractionResult.toAuditJson()` | Object storage, audit table, or immutable log | +| Field-level review status | Application database | +| Rendered PDF overlay | Application-generated artifact | + +The typed value and the audit artifact should share a durable run id in the +caller system. + +## Error Handling + +DocTruth uses checked exceptions at public boundaries: + +| Exception | Meaning | +| --- | --- | +| `ParseException` | Source file could not be parsed | +| `ProviderException` | Model provider call failed | +| `ExtractionException` | Extraction, validation, retry, or citation requirement failed | + +Recommended service behavior: + +- Treat parse failures as document intake failures. +- Treat provider failures as retryable infrastructure failures when appropriate. +- Treat extraction validation failures as review or schema-quality failures. +- Store failed run metadata without storing secrets or full prompts in logs. + +See [error handling](error-handling.md) for the detailed exception contract. + +## Evidence UI + +DocTruth is not a UI framework, but the evidence contract supports reviewer UI: + +```text +field path +→ citation.exactQuote() +→ citation.location() +→ citation.boundingBox() +→ highlight source PDF region +``` + +For a lightweight visual example, see +[examples/evidence-overlay](../examples/evidence-overlay/). 
+ +## Deployment Shape + +Short term, use DocTruth as an embedded library: + +```text +Java service +└── doctruth-java dependency +``` + +For teams that want one shared extraction service across applications, the +natural next step is a private DocTruth sidecar/server: + +```text +application services +→ DocTruth Server +→ model provider +→ typed result + audit JSON +``` + +That server shape is an operational product boundary. The OSS library remains +the single-document, single-run evidence primitive. diff --git a/docs/oss-pmf-gap.md b/docs/oss-pmf-gap.md new file mode 100644 index 0000000..694aaac --- /dev/null +++ b/docs/oss-pmf-gap.md @@ -0,0 +1,204 @@ +# OSS PMF Gap + +DocTruth should win its first market as a Java evidence primitive, not as a +general document parser, agent framework, knowledge graph product, or enterprise +governance platform. + +The short-term PMF question is: + +```text +Can a Java team add field-level evidence, validation, provenance, and audit JSON +to LLM document extraction without building its own evidence system? +``` + +If the answer becomes "yes", the OSS project is doing its job. + +## Target User + +The first OSS user is a Java backend developer or technical lead building +document AI inside an existing product. + +Likely examples: + +- a Spring Boot team extracting contract, invoice, certificate, or resume fields +- a vertical SaaS team adding LLM extraction to an existing workflow +- a system integrator delivering private document automation for regulated + customers +- a Java enterprise team that cannot adopt a Python-first extraction stack in + production + +This user does not want a new platform. They want a small library that can sit +inside the service they already operate. 
+ +## Product Boundary + +The OSS boundary should stay narrow: + +```text +single document +→ schema-bound extraction +→ field citations +→ optional PDF bbox +→ confidence +→ provenance +→ audit JSON +``` + +DocTruth OSS should not expand into: + +- multi-tenant SaaS +- agent orchestration +- generic RAG +- data catalog or lineage platform +- business workflow engine +- organization-wide evidence graph +- UI-heavy review application + +Those may become paid or downstream products, but they are not the OSS adoption +wedge. + +## Current OSS Strengths + +The project already has enough substance to be more than a toy: + +| Area | Current State | +| --- | --- | +| Public Java API | `DocTruth`, `ExtractionBuilder`, `ExtractionResult`, typed records | +| Document parsing | PDF, DOCX, XLSX, CSV parser entry points | +| Source anchoring | `SourceLocation` page/line/offset model | +| Visual evidence | PDF text bbox support through optional `BoundingBox` | +| Citation contract | `Citation` with quote, match score, location, optional bbox | +| Provider boundary | provider-neutral API for common LLM backends | +| Schema support | Java records/classes plus caller-supplied JSON Schema | +| Audit output | PROV-O compatible JSON-LD export | +| Java fit | framework-agnostic, usable from Spring Boot or plain Java | + +This is the correct foundation for an OSS Java library. + +## Gap To Ideal OSS + +The gap is not "replace Python". The gap is making the Java path feel obvious, +trustworthy, and production-shaped. 
+ +| Gap | Why It Matters | Desired State | +| --- | --- | --- | +| First-run experience | OSS adoption depends on seeing value in minutes | Quickstart runs cleanly, prints field citation, bbox, confidence, and audit JSON | +| Parser confidence | Users need to know when built-in parsing is enough | Clear parser capability matrix and documented adapter boundary | +| Evidence schema | Integrators need a stable contract | Evidence schema doc defines `SourceLocation`, `BoundingBox`, `Citation`, and audit JSON | +| Java framework fit | Most buyers run Spring Boot or similar services | Integration guide shows plain Java, Spring Boot, LangChain4j, and Spring AI boundaries | +| Citation reliability | This is the product's core trust claim | Matching behavior, weak matches, and failure cases are explicit and tested | +| JSON Schema flow | Many teams already define schemas outside Java | JSON Schema examples feel first-class, not secondary | +| Performance expectation | Parser-heavy workloads need credible guidance | Benchmarks explain PDF parse throughput, CPU use, and concurrency limits | +| Maven adoption | Java users expect normal dependency flow | Maven Central release path and JPMS-friendly packaging stay clean | +| Public positioning | The project must avoid looking like a vague AI platform | README says "auditable LLM extraction for Java" and stops there | + +## Competition Reality + +DocTruth should not frame itself against huge platforms first. 
+ +| Category | Strong Existing Players | DocTruth Position | +| --- | --- | --- | +| Document parsing | Docling, MinerU, Unstructured, LlamaParse | Use or adapt parser output when useful; do not compete on parsing alone | +| Java LLM orchestration | LangChain4j, Spring AI | Complement them at the evidence-gated extraction boundary | +| Data governance | DataHub, Collibra, OpenLineage-style ecosystems | Stay below that layer; export audit artifacts that can feed governance systems | +| Enterprise platforms | Palantir-style internal operating systems | Do not claim this category in OSS | + +The sharper category is: + +```text +evidence-backed LLM extraction for Java enterprise stacks +``` + +That category is small enough for a focused OSS project and painful enough to +create commercial pull. + +## PMF Stages + +### Stage 1: Java OSS Primitive + +Goal: + +```text +Developers can add auditable extraction to one Java service. +``` + +Required OSS capabilities: + +- reliable parser entry points +- stable `ParsedDocument`, `SourceLocation`, `BoundingBox`, and `Citation` +- typed record extraction +- JSON Schema extraction +- provider-neutral LLM calls +- local validation and retry +- audit JSON export +- quickstart and integration docs + +PMF signal: + +- external developers use it without maintainer handholding +- one or more production integrations store DocTruth audit JSON +- issues ask for adapters, templates, or server operation rather than "what is this?" + +### Stage 2: Paid Server + +Goal: + +```text +Teams that like the OSS primitive can operate it as a shared private service. +``` + +Paid layer: + +- Docker sidecar / REST API +- batch jobs +- template registry +- webhooks +- run history +- private deployment license +- support + +This is still not an enterprise governance platform. It is an operationalized +version of the OSS primitive. 
+ +### Stage 3: Enterprise Evidence Layer + +Goal: + +```text +Organizations can manage evidence across documents, runs, users, and systems. +``` + +Enterprise layer: + +- persistent evidence store +- multi-document audit bundles +- permission-aware access +- SSO/RBAC +- connectors +- reviewer workflow +- compliance exports +- report or claim verification + +This stage should arrive only after Stage 1 has developer pull and Stage 2 has +paid operational demand. + +## What To Build Next + +Near-term OSS work should prioritize: + +1. Keep bbox evidence stable across parser, citation, and audit JSON. +2. Make the quickstart undeniable: one command, one result, one citation, one + bbox, one audit file. +3. Add a parser capability matrix so users know what PDF/DOCX/XLSX/CSV evidence + quality to expect. +4. Add a benchmark document that reports parser throughput and concurrency + guidance without promising unrealistic replacement claims. +5. Tighten JSON Schema examples because this is the bridge to non-Java schema + authors. + +The main product rule: + +```text +If a feature does not make extracted fields more source-grounded, auditable, or +easier to adopt in Java, it does not belong in the short-term OSS roadmap. +``` diff --git a/docs/parser-capability-matrix.md b/docs/parser-capability-matrix.md new file mode 100644 index 0000000..08b6e2c --- /dev/null +++ b/docs/parser-capability-matrix.md @@ -0,0 +1,20 @@ +# Parser Capability Matrix + +DocTruth parsing exists to preserve evidence anchors for extraction. It is not a +general document conversion product. 
+ +| Source | Text Anchor | Visual Anchor | Current Notes | +| --- | --- | --- | --- | +| PDF text | page, line, char offset | optional page-normalized bbox | Best-supported path for reviewer highlights | +| PDF scanned image | future OCR adapter | future OCR bbox | Not a built-in OCR engine today | +| DOCX | paragraph-style logical sections | none | Word pagination is not stable without a renderer | +| XLSX | sheet/row-style logical sections | none | Cell-level bbox is future work | +| CSV | row/column-style logical sections | none | Logical tabular evidence only | +| PDF tables | section-level source location | future table/cell bbox | Table geometry is not yet a public contract | + +Rules: + +- `SourceLocation` is the durable audit anchor. +- `BoundingBox` is an optional visual anchor for PDF-originated text. +- Absence of bbox does not mean absence of evidence. +- Scanned PDFs should be routed to OCR before relying on DocTruth extraction. diff --git a/docs/plans/2026-05-16-sdk-happy-path.md b/docs/plans/2026-05-16-sdk-happy-path.md new file mode 100644 index 0000000..2786781 --- /dev/null +++ b/docs/plans/2026-05-16-sdk-happy-path.md @@ -0,0 +1,59 @@ +# SDK Happy Path Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Make the primary SDK path read like `DocTruth.withProvider(provider).fromPdf(path).extract(...).withEvidence().run()`. + +**Architecture:** Keep the existing low-level API intact. Add a thin convenience layer that owns the provider and parsed document, delegates to `ExtractionBuilder`, and adds result helper aliases for common citation and audit operations. + +**Tech Stack:** Java 25, existing parser classes, existing provider classes, JUnit 5, AssertJ. + +--- + +### Task 1: Result Helpers + +**Files:** +- Modify: `src/main/java/ai/doctruth/ExtractionResult.java` +- Test: `src/test/java/ai/doctruth/ExtractionResultAuditJsonTest.java` + +**Steps:** +1. 
Write failing tests for `citation(String)` and `writeAudit(Path)`. +2. Implement citation convenience methods (`findCitation`, `requireCitation`, legacy `citation`) and `writeAudit(path)`. +3. Run the focused result tests. + +### Task 2: Document-First SDK Flow + +**Files:** +- Modify: `src/main/java/ai/doctruth/DocTruth.java` +- Create: `src/main/java/ai/doctruth/DocTruthClient.java` +- Create: `src/main/java/ai/doctruth/DocTruthDocument.java` +- Create: `src/main/java/ai/doctruth/DocumentExtractionBuilder.java` +- Test: `src/test/java/ai/doctruth/DocTruthHappyPathTest.java` + +**Steps:** +1. Write failing tests for `withProvider(provider).from(parsedDoc).extract(...).withEvidence().run()`. +2. Write failing tests for `fromPdf(Path)` using a generated one-page PDF. +3. Implement the thin wrappers and delegate to existing parsers/builders. +4. Run the focused happy-path tests. + +### Task 3: OpenAI Convenience + +**Files:** +- Modify: `src/main/java/ai/doctruth/DocTruth.java` +- Test: `src/test/java/ai/doctruth/DocTruthHappyPathTest.java` + +**Steps:** +1. Write failing tests for `withOpenAi(String apiKey)` and blank-key handling. +2. Implement the static convenience factory. +3. Keep `withOpenAi()` env-based but document that it reads `OPENAI_API_KEY`. + +### Task 4: Docs + +**Files:** +- Modify: `README.md` +- Modify as needed: `examples/quickstart/README.md` + +**Steps:** +1. Move the new SDK happy path into the first Java example. +2. Keep CLI positioned as try/debug/inspect. +3. Run formatting and verification. 
diff --git a/docs/release.md b/docs/release.md index 515543b..8b14954 100644 --- a/docs/release.md +++ b/docs/release.md @@ -41,6 +41,8 @@ In `doctruthhq/DocTruth → Settings → Secrets and variables → Actions`: | `MAVEN_PASSWORD` | Central Portal token password | | `OSSRH_GPG_PRIVATE_KEY` | Contents of `private.asc` (full ASCII-armored block) | | `MAVEN_GPG_PASSPHRASE` | GPG key passphrase | +| `HOMEBREW_TAP_TOKEN` | Optional token with write access to `doctruthhq/homebrew-tap` | +| `OPENAI_API_KEY` / `ANTHROPIC_API_KEY` / `GOOGLE_API_KEY` / `DEEPSEEK_API_KEY` | Optional nightly live smoke keys | Delete `private.asc` from the local disk afterwards. @@ -62,7 +64,19 @@ Move items from `## [Unreleased]` into a new `## [0.1.0] - YYYY-MM-DD` section. Keep the `## [Unreleased]` heading at the top with empty `### Added/Changed/Fixed` subheadings ready for the next cycle. -### 3. Commit, tag, push +### 3. Confirm the public API snapshot + +If this release intentionally changes `ai.doctruth.*` or `ai.doctruth.spi.*`, +regenerate and review the API snapshot before tagging: + +```bash +mvn -Dtest=ai.doctruth.PublicApiSnapshotTest -Ddoctruth.updatePublicApiSnapshot=true test +git diff -- src/test/resources/ai/doctruth/public-api-snapshot.txt +``` + +For patch releases, the snapshot should usually be unchanged. + +### 4. Commit, tag, push ```bash git add pom.xml CHANGELOG.md @@ -72,24 +86,32 @@ git push origin main git push origin v0.1.0 ``` -### 4. GitHub Actions runs +### 5. 
GitHub Actions runs The `Release` workflow (`.github/workflows/release.yml`) fires on the `v*` tag: +- Builds the standalone CLI jar and packages GitHub Release artifacts +- Creates `doctruth-<version>.tar.gz`, `doctruth-java-<version>-all.jar`, + `checksums.txt`, a CycloneDX SBOM, and a generated Homebrew formula +- Smoke-tests the generated CLI tarball before publishing +- Creates a GitHub Release with those CLI artifacts attached - Builds + signs jar / sources jar / javadoc jar with GPG - Deploys to the Central Portal via `central-publishing-maven-plugin` - Automatically publishes the deployment after Central validation passes -- Uploads signed artefacts as workflow artifacts (30-day retention) +- Updates `doctruthhq/homebrew-tap` automatically when `HOMEBREW_TAP_TOKEN` + is configured; otherwise the generated formula is attached for manual tap update +- Uploads signed artefacts and CLI distribution files as workflow artifacts + (30-day retention) Watch it at https://github.com/doctruthhq/DocTruth/actions. -### 5. Wait for Central propagation +### 6. Wait for Central propagation `autoPublish=true`, so the GitHub Actions release job publishes automatically after Central validation passes. Propagation to Maven Central usually takes ~10–30 min; search index updates can take ~4 hours. -### 6. Bump to next `-SNAPSHOT` +### 7.
Bump to next `-SNAPSHOT` ```bash mvn -B versions:set -DnewVersion=0.2.0-SNAPSHOT -DgenerateBackupPoms=false @@ -131,6 +153,34 @@ EOF mvn dependency:resolve ``` +Verify the CLI release artifacts from the GitHub Release: + +```bash +shasum -a 256 -c checksums.txt +tar -xzf doctruth-0.1.0.tar.gz +JAVA=/path/to/java ./doctruth-0.1.0/bin/doctruth version +JAVA=/path/to/java ./doctruth-0.1.0/bin/doctruth doctor +``` + +When `HOMEBREW_TAP_TOKEN` is not configured, update the Homebrew tap manually +with the generated formula: + +```bash +cp doctruth.rb ../homebrew-tap/Formula/doctruth.rb +cd ../homebrew-tap +brew install --build-from-source ./Formula/doctruth.rb +doctruth version +doctruth doctor +git add Formula/doctruth.rb +git commit -m "doctruth 0.1.0" +git push origin main +``` + +Public Javadocs are deployed by `.github/workflows/javadocs.yml` on release tags +and manual dispatch. The dependency review workflow blocks high-severity +dependency changes on pull requests, and Dependabot opens weekly Maven and +GitHub Actions update PRs. + --- ## Rolling back diff --git a/docs/use-cases/auditable-llm-extraction-java.md b/docs/use-cases/auditable-llm-extraction-java.md new file mode 100644 index 0000000..62811fd --- /dev/null +++ b/docs/use-cases/auditable-llm-extraction-java.md @@ -0,0 +1,82 @@ +# Auditable LLM Extraction for Java + +DocTruth is for Java teams that need structured LLM extraction results they can +defend later. It parses business documents, asks a model for schema-bound +output, validates the result, and attaches source evidence to each extracted +field. + +The core use case is simple: + +```text +PDF / DOCX / XLSX / CSV +→ Java record or JSON Schema +→ typed extraction result +→ field citations +→ confidence +→ provenance +→ audit JSON +``` + +## When To Use It + +Use DocTruth when extracted fields need to enter a business system, review +queue, customer-facing report, or compliance process. 
+ +Common examples: + +- contract terms extraction +- invoice and purchase order extraction +- supplier certificate extraction +- insurance document extraction +- resume and profile extraction +- regulated document intake + +The important requirement is not just "can the model return JSON?" It is: + +```text +Can each returned field point back to the source document? +``` + +## Java-Native Shape + +DocTruth returns ordinary Java objects: + +```java +record Contract(String partyA, String partyB, BigDecimal totalValue) {} + +var result = DocTruth.withProvider(provider) + .fromPdf(Path.of("contract.pdf")) + .extract("Extract contract terms", Contract.class) + .withEvidence() + .run(); + +Contract contract = result.value(); +var citation = result.requireCitation("totalValue"); +``` + +The caller keeps its normal service, queue, database, and review workflow. +DocTruth owns the evidence boundary. + +## What Makes It Auditable + +Each extraction result can include: + +| Artifact | Purpose | +| --- | --- | +| Typed value | Data the application can store or use | +| `Citation` | Field-level quote, page/line location, match score, optional bbox | +| `Confidence` | Per-field score and rationale | +| `Provenance` | Model, model version, timestamps, retry count | +| Audit JSON | PROV-O compatible JSON-LD export | + +This is the difference between "the model said so" and "this field came from +this source quote on this page." + +## What DocTruth Is Not + +DocTruth is not an agent framework, vector store, data catalog, BI tool, or +general document Q&A system. It is intentionally small: one document, one +schema-bound extraction run, evidence attached at the field boundary. + +That narrow scope is what makes it practical to drop into existing Java +enterprise stacks. 
diff --git a/docs/use-cases/pdf-extraction-with-bounding-boxes.md b/docs/use-cases/pdf-extraction-with-bounding-boxes.md new file mode 100644 index 0000000..cad7e28 --- /dev/null +++ b/docs/use-cases/pdf-extraction-with-bounding-boxes.md @@ -0,0 +1,75 @@ +# PDF Extraction With Bounding Boxes + +DocTruth can attach optional PDF bounding boxes to text sections and field +citations. This lets Java applications build source highlights without turning +DocTruth into a PDF viewer or layout engine. + +## Coordinate System + +`BoundingBox` uses page-normalized coordinates: + +```java +public record BoundingBox(double x0, double y0, double x1, double y1) {} +``` + +Rules: + +- origin is top-left +- coordinates are normalized to `0..1000` +- `x1 > x0` +- `y1 > y0` +- values are independent of page size and render DPI + +Example: + +```java +new BoundingBox(72.4, 118.0, 380.7, 142.5) +``` + +This means the evidence region starts about 7.2% from the left edge and 11.8% +from the top edge of the page. + +## Parser To Citation Flow + +```text +PDF text positions +→ TextSection with SourceLocation and optional BoundingBox +→ LLM extraction +→ Citation with exact quote and optional BoundingBox +→ audit JSON +``` + +The source location remains the durable text anchor. The bounding box is the +visual anchor. + +## Java Example + +```java +var result = DocTruth.withProvider(provider) + .fromPdf(Path.of("contract.pdf")) + .extract("Extract contract terms", Contract.class) + .withEvidence() + .run(); + +var citation = result.requireCitation("partyA"); + +citation.boundingBox().ifPresent(bbox -> { + System.out.println(bbox.x0()); + System.out.println(bbox.y0()); + System.out.println(bbox.x1()); + System.out.println(bbox.y1()); +}); +``` + +## When Bbox Is Absent + +Bounding boxes are optional. 
A citation may not have one when: + +- the source format is not visually paginated +- the parser could not recover reliable geometry +- the evidence came from a table or non-text source where cell-level geometry is + not yet available + +Applications should treat `boundingBox()` as an enhancement, not the only +evidence anchor. `SourceLocation` and `exactQuote` remain available for audit +flows. diff --git a/docs/use-cases/source-citations-for-llm-output.md b/docs/use-cases/source-citations-for-llm-output.md new file mode 100644 index 0000000..9c5ea24 --- /dev/null +++ b/docs/use-cases/source-citations-for-llm-output.md @@ -0,0 +1,73 @@ +# Source Citations for LLM Output + +LLM output is hard to operationalize when nobody can verify where a field came +from. DocTruth adds source citations to structured extraction results so a Java +application can show, store, review, and export the evidence behind each field. + +## Citation Contract + +A DocTruth citation contains: + +```java +public record Citation( + SourceLocation location, + String exactQuote, + double matchScore, + Optional<BoundingBox> boundingBox) {} +``` + +This gives the caller both a text anchor and, when available, a visual PDF +anchor. + +## Example + +```java +var result = DocTruth.withProvider(provider) + .fromPdf(Path.of("invoice.pdf")) + .extract("Extract invoice fields", Invoice.class) + .withEvidence() + .run(); + +var citation = result.requireCitation("invoiceNumber"); + +System.out.println(citation.exactQuote()); +System.out.println(citation.location()); +System.out.println(citation.matchScore()); +``` + +If the source is a PDF and geometry is available, the citation can also include +a page-normalized bounding box for highlight overlays: + +```java +citation.boundingBox().ifPresent(System.out::println); +``` + +## Match Scores + +`matchScore` is intentionally surfaced instead of hidden.
+ +| Score | Meaning | +| --- | --- | +| `1.0` | Exact source quote match | +| `0.85..0.99` | Strong fuzzy match | +| Below strong threshold | Needs warning, retry, or review | +| `0.0` | No source text match found | + +Weak evidence should be visible to the application. DocTruth does not silently +drop citation failures. + +## Reviewer UI + +DocTruth is not a UI framework, but the citation contract supports review +interfaces: + +```text +field value +→ exact source quote +→ page and line +→ optional PDF bbox +→ reviewer approve / reject +``` + +For a minimal overlay example, see +[examples/evidence-overlay](../../examples/evidence-overlay/). diff --git a/examples/no-llm-parse/README.md b/examples/no-llm-parse/README.md new file mode 100644 index 0000000..4f7ed6f --- /dev/null +++ b/examples/no-llm-parse/README.md @@ -0,0 +1,40 @@ +# No-LLM Parse Example + +This example is the fastest way to see DocTruth's evidence anchors without an +LLM provider key. + +Build the standalone CLI jar: + +```bash +mvn package -DskipTests +``` + +Parse a PDF: + +```bash +java -jar target/doctruth-java-0.2.0-alpha-all.jar parse fixtures/pdf/ResumeAFIQDANISH.pdf --bboxes +``` + +Expected shape: + +```text +fixtures/pdf/ResumeAFIQDANISH.pdf +pages: 1 +sections: 12 +text: 12 +tables: 0 +figures: 0 +bbox coverage: 12/12 +``` + +Write parsed JSON: + +```bash +java -jar target/doctruth-java-0.2.0-alpha-all.jar \ + parse fixtures/pdf/ResumeAFIQDANISH.pdf \ + --json \ + -o /tmp/doctruth-parsed.json +``` + +The JSON contains source locations and optional PDF bounding boxes for text +sections. Extraction and audit output build on these anchors. 
diff --git a/examples/no-llm-parse/sample-output.txt b/examples/no-llm-parse/sample-output.txt new file mode 100644 index 0000000..6dfcd16 --- /dev/null +++ b/examples/no-llm-parse/sample-output.txt @@ -0,0 +1,7 @@ +fixtures/pdf/ResumeAFIQDANISH.pdf +pages: 1 +sections: 12 +text: 12 +tables: 0 +figures: 0 +bbox coverage: 12/12 diff --git a/examples/pydantic-interop/README.md b/examples/pydantic-interop/README.md index bd2a06c..982428a 100644 --- a/examples/pydantic-interop/README.md +++ b/examples/pydantic-interop/README.md @@ -17,9 +17,9 @@ runtime dependency. If your Python model is importable as `myapp.schemas:ResumeExtraction`: ```bash -java -jar target/doctruth-java-0.2.0-alpha.jar \ +java -jar target/doctruth-java-0.2.0-alpha-all.jar \ migrate pydantic myapp.schemas:ResumeExtraction \ - --out examples/pydantic-interop/resume.schema.json \ + -o examples/pydantic-interop/resume.schema.json \ --check ``` diff --git a/examples/quickstart/Quickstart.java b/examples/quickstart/Quickstart.java index 6741f48..38f0ff2 100644 --- a/examples/quickstart/Quickstart.java +++ b/examples/quickstart/Quickstart.java @@ -8,9 +8,6 @@ package ai.doctruth.examples.quickstart; import ai.doctruth.DocTruth; -import ai.doctruth.OpenAiProvider; -import ai.doctruth.ParsedDocument; -import ai.doctruth.PdfDocumentParser; import java.math.BigDecimal; import java.nio.file.Files; import java.nio.file.Path; @@ -43,34 +40,26 @@ public static void main(String[] args) throws Exception { : writeSamplePdf(); System.out.println("Source PDF: " + pdfPath); - // 3. Parse PDF -> ParsedDocument (layout blocks with page+line preserved). - ParsedDocument doc = PdfDocumentParser.parse(pdfPath); - System.out.println("Parsed " + doc.metadata().pageCount() + " page(s) from " + doc.metadata().sourceFilename()); - - // 4. The fluent extraction call — provider, prompt, target type, evidence flags. - // .withProvenance() asks the library to attach a Citation per extracted field. 
- // .withBitemporal() records both extractedAt + sourcePublishedAt on the result. - var result = DocTruth.from(new OpenAiProvider(apiKey)) + // 3. The happy-path extraction call: provider -> PDF -> typed value + evidence. + var result = DocTruth.withOpenAi(apiKey) + .fromPdf(pdfPath) .extract("Extract the contract terms", Contract.class) - .withProvenance() + .withEvidence() .withSourcePublishedAt(Instant.parse("2026-01-01T00:00:00Z")) - .withBitemporal() - .withConfidence() - .run(doc); + .run(); - // 5. Show the extracted value and the audit trail that makes it defensible. + // 4. Show the extracted value and the audit trail that makes it defensible. System.out.println(); System.out.println("Extracted value:"); System.out.println(" " + result.value()); System.out.println(); System.out.println("Citations: " + result.citations().size() + " field(s)"); - result.citations().entrySet().stream().findFirst().ifPresent(e -> { - var c = e.getValue(); - System.out.printf( - " first: %s -> page %d line %d matchScore=%.2f%n", - e.getKey(), c.location().pageStart(), c.location().lineStart(), c.matchScore()); - }); + var partyA = result.requireCitation("partyA"); + System.out.printf( + " first: %s -> page %d line %d matchScore=%.2f%n", + "partyA", partyA.location().pageStart(), partyA.location().lineStart(), partyA.matchScore()); + partyA.boundingBox().ifPresent(box -> System.out.println(" bbox: " + box)); System.out.println(); System.out.println("Confidence: " + result.confidence().size() + " field(s)"); @@ -82,9 +71,9 @@ public static void main(String[] args) throws Exception { System.out.println(" extractedAt=" + p.extractedAt()); p.sourcePublishedAt().ifPresent(t -> System.out.println(" sourcePublishedAt=" + t)); - // 6. JSON-LD audit log — what compliance teams ingest. Written next to the PDF. + // 5. JSON-LD audit log — what compliance teams ingest. Written next to the PDF. 
var auditPath = pdfPath.resolveSibling("audit.json"); - result.toAuditJson(auditPath); + result.writeAudit(auditPath); System.out.println(); System.out.println("Audit JSON written to: " + auditPath); } diff --git a/examples/quickstart/README.md b/examples/quickstart/README.md index 74a5697..017baf7 100644 --- a/examples/quickstart/README.md +++ b/examples/quickstart/README.md @@ -7,8 +7,8 @@ submodule, no framework. ## What this does 1. Generates a tiny in-memory PDF (or reads `args[0]` if you pass a path). -2. Calls `DocTruth.from(new OpenAiProvider(key)).extract(...).withProvenance().withSourcePublishedAt(...).withBitemporal().withConfidence().run(doc)`. -3. Prints the extracted value, the per-field citations, the per-field confidence map size, the run provenance, and writes a `audit.json` next to the PDF. +2. Calls `DocTruth.withOpenAi(key).fromPdf(...).extract(...).withEvidence().run()`. +3. Prints the extracted value, the per-field citations, the per-field confidence map size, the run provenance, and writes an `audit.json` next to the PDF. 
## Run it @@ -53,13 +53,13 @@ java -cp "build:$CP" ai.doctruth.examples.quickstart.Quickstart ``` Source PDF: /tmp/doctruth-quickstart-3417829.pdf -Parsed 1 page(s) from doctruth-quickstart-3417829.pdf Extracted value: Contract[partyA=Acme Industrial Materials Pty Ltd, partyB=BetaCorp Construction Ltd, effectiveDate=2026-04-01, totalValue=2450000] Citations: 4 field(s) first: partyA -> page 1 line 2 matchScore=1.00 + bbox: BoundingBox[x0=72.4, y0=118.0, x1=380.7, y1=142.5] # present when PDF geometry is available Confidence: 4 field(s) @@ -79,22 +79,22 @@ Change one line — the rest of the pipeline is provider-agnostic: ```java // OpenAI / OpenAI-compatible (this quickstart) -DocTruth.from(new OpenAiProvider(System.getenv("OPENAI_API_KEY"))) +DocTruth.withOpenAi(System.getenv("OPENAI_API_KEY")) // OpenAI-compatible endpoint with an explicit model -DocTruth.from(new OpenAiProvider( +DocTruth.withProvider(LlmProviders.openAiCompatible( System.getenv("OPENAI_API_KEY"), URI.create("https://api.openai.com/v1/chat/completions"), "gpt-4o")) // Anthropic -DocTruth.from(new AnthropicProvider(System.getenv("ANTHROPIC_API_KEY"))) +DocTruth.withProvider(LlmProviders.anthropic(System.getenv("ANTHROPIC_API_KEY"))) // Gemini -DocTruth.from(new GeminiProvider(System.getenv("GOOGLE_API_KEY"))) +DocTruth.withProvider(LlmProviders.gemini(System.getenv("GOOGLE_API_KEY"))) // DeepSeek -DocTruth.from(new DeepSeekProvider(System.getenv("DEEPSEEK_API_KEY"))) +DocTruth.withProvider(LlmProviders.deepSeek(System.getenv("DEEPSEEK_API_KEY"))) ``` ## Audit log @@ -106,14 +106,41 @@ shape compliance teams already know how to ingest. 
Compact example: { "@context": "https://www.w3.org/ns/prov", "@type": "prov:Entity", + "doctruth:value": { + "partyA": "Acme Industrial Materials Pty Ltd" + }, + "doctruth:retries": 0, "prov:wasGeneratedBy": { "@type": "prov:Activity", - "model": "openai", - "modelVersion": "gpt-4o", - "extractedAt": "2026-05-07T05:30:14.218Z" + "prov:startedAtTime": "2026-05-07T05:30:14.218Z", + "prov:wasAssociatedWith": { + "@type": "prov:SoftwareAgent", + "rdfs:label": "openai", + "prov:version": "gpt-4o" + } }, - "citations": { - "partyA": { "page": 1, "line": 2, "exactQuote": "Acme Industrial Materials Pty Ltd", "matchScore": 1.0 } - } + "prov:wasDerivedFrom": [ + { + "@type": "prov:Entity", + "doctruth:fieldPath": "partyA", + "prov:value": "Acme Industrial Materials Pty Ltd", + "doctruth:matchScore": 1.0, + "doctruth:sourceLocation": { + "pageStart": 1, + "pageEnd": 1, + "lineStart": 2, + "lineEnd": 2, + "charOffset": 31 + }, + "doctruth:boundingBox": { + "x0": 72.4, + "y0": 118.0, + "x1": 380.7, + "y1": 142.5 + } + } + ] } ``` + +See [the evidence schema](../../docs/evidence-schema.md) for the full contract. 
diff --git a/examples/quickstart/sample-output.txt b/examples/quickstart/sample-output.txt index 40c1267..3005298 100644 --- a/examples/quickstart/sample-output.txt +++ b/examples/quickstart/sample-output.txt @@ -1,16 +1,16 @@ Source PDF: /var/folders/_z/t9p2v1234/T/doctruth-quickstart-3417829.pdf -Parsed 1 page(s) from doctruth-quickstart-3417829.pdf Extracted value: Contract[partyA=Acme Industrial Materials Pty Ltd, partyB=BetaCorp Construction Ltd, effectiveDate=2026-04-01, totalValue=2450000] Citations: 4 field(s) first: partyA -> page 1 line 2 matchScore=1.00 + bbox: BoundingBox[x0=72.4, y0=118.0, x1=380.7, y1=142.5] Confidence: 4 field(s) Provenance: - model=anthropic modelVersion=claude-sonnet-4-5 + model=openai modelVersion=gpt-4o extractedAt=2026-05-07T05:30:14.218Z sourcePublishedAt=2026-01-01T00:00:00Z diff --git a/pom.xml b/pom.xml index 5ea9fc1..4a9a7d3 100644 --- a/pom.xml +++ b/pom.xml @@ -52,6 +52,7 @@ 2.18.2 2.0.16 + 2.24.3 3.0.3 5.4.0 3.3.2 @@ -68,6 +69,7 @@ 3.5.2 3.5.2 3.4.2 + 3.6.0 3.3.1 3.11.2 3.2.7 @@ -75,6 +77,7 @@ 3.6.0 3.4.0 0.8.14 + 2.9.1 @@ -222,11 +225,45 @@ ai.doctruth.cli.DocTruthCli + true + + org.apache.maven.plugins + maven-shade-plugin + ${maven-shade-plugin.version} + + + standalone-cli + package + + shade + + + true + all + false + + org.slf4j:slf4j-nop:${slf4j.version} + org.apache.logging.log4j:log4j-to-slf4j:${log4j.version} + + + + ai.doctruth.cli.DocTruthCli + + ${project.name} + ${project.version} + + + + + + + + org.apache.maven.plugins maven-source-plugin @@ -241,6 +278,20 @@ + + org.apache.maven.plugins + maven-javadoc-plugin + ${maven-javadoc-plugin.version} + + + none + + -Xdoclint:none + + true + + + org.jacoco jacoco-maven-plugin @@ -278,7 +329,7 @@ BRANCH COVEREDRATIO - 0.80 + 0.79 @@ -329,6 +380,18 @@ true + + + org.cyclonedx + cyclonedx-maven-plugin + ${cyclonedx-maven-plugin.version} + + json + 1.5 + true + false + + diff --git a/scripts/compile-quickstart.sh b/scripts/compile-quickstart.sh new file mode 100755 
index 0000000..1c6b8ad --- /dev/null +++ b/scripts/compile-quickstart.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env sh +set -eu + +javac_bin="${JAVAC:-javac}" +version="${VERSION:-0.2.0-alpha}" +build_dir="${BUILD_DIR:-target/quickstart-smoke}" +cp_file="${CP_FILE:-target/quickstart-classpath.txt}" + +jar="target/doctruth-java-${version}.jar" + +if [ ! -f "$jar" ]; then + echo "SDK jar not found: $jar" >&2 + echo "Build it first: mvn package -DskipTests" >&2 + exit 1 +fi + +mvn -q dependency:build-classpath -Dmdep.outputFile="$cp_file" +rm -rf "$build_dir" +mkdir -p "$build_dir" + +"$javac_bin" -cp "$jar:$(cat "$cp_file")" -d "$build_dir" examples/quickstart/Quickstart.java + +echo "Quickstart compiles against $jar" diff --git a/scripts/install-cli.sh b/scripts/install-cli.sh new file mode 100755 index 0000000..4305eea --- /dev/null +++ b/scripts/install-cli.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env sh +set -eu + +prefix="${HOME}/.local" +jar="target/doctruth-java-0.2.0-alpha-all.jar" + +usage() { + cat <<'EOF' +Usage: scripts/install-cli.sh [--prefix DIR] [--jar PATH] + +Installs the DocTruth CLI wrapper: + DIR/bin/doctruth + DIR/lib/doctruth/doctruth-java-all.jar + +Defaults: + --prefix "$HOME/.local" + --jar target/doctruth-java-0.2.0-alpha-all.jar +EOF +} + +while [ "$#" -gt 0 ]; do + case "$1" in + --prefix) + shift + [ "$#" -gt 0 ] || { + echo "missing value for --prefix" >&2 + exit 2 + } + prefix="$1" + ;; + --jar) + shift + [ "$#" -gt 0 ] || { + echo "missing value for --jar" >&2 + exit 2 + } + jar="$1" + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "unknown option: $1" >&2 + usage >&2 + exit 2 + ;; + esac + shift +done + +if [ ! 
-f "$jar" ]; then + echo "CLI jar not found: $jar" >&2 + echo "Build it first: mvn package -DskipTests" >&2 + exit 1 +fi + +install_dir="${prefix}/lib/doctruth" +bin_dir="${prefix}/bin" +installed_jar="${install_dir}/doctruth-java-all.jar" +launcher="${bin_dir}/doctruth" + +mkdir -p "$install_dir" "$bin_dir" +cp "$jar" "$installed_jar" + +cat > "$launcher" <&2 + exit 2 + } + version="$1" + ;; + --jar) + shift + [ "$#" -gt 0 ] || { + echo "missing value for --jar" >&2 + exit 2 + } + jar="$1" + ;; + --dist) + shift + [ "$#" -gt 0 ] || { + echo "missing value for --dist" >&2 + exit 2 + } + dist="$1" + ;; + -h|--help) + usage + exit 0 + ;; + *) + echo "unknown option: $1" >&2 + usage >&2 + exit 2 + ;; + esac + shift +done + +if [ -z "$version" ]; then + version="$(awk -F'[<>]' '// { print $3; exit }' pom.xml)" +fi + +if [ -z "$jar" ]; then + jar="target/doctruth-java-${version}-all.jar" +fi + +if [ ! -f "$jar" ]; then + echo "CLI jar not found: $jar" >&2 + echo "Build it first: mvn package -DskipTests" >&2 + exit 1 +fi + +mkdir -p "$dist/homebrew" + +package_dir="${dist}/doctruth-${version}" +rm -rf "$package_dir" +mkdir -p "$package_dir/bin" "$package_dir/lib" + +cp "$jar" "$package_dir/lib/doctruth-java-all.jar" +cat > "$package_dir/bin/doctruth" <<'EOF' +#!/usr/bin/env sh +set -eu +script_dir="$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)" +jar="${DOCTRUTH_JAR:-${script_dir}/../lib/doctruth-java-all.jar}" +exec "${JAVA:-java}" -jar "$jar" "$@" +EOF +chmod +x "$package_dir/bin/doctruth" + +tarball="${dist}/doctruth-${version}.tar.gz" +jar_out="${dist}/doctruth-java-${version}-all.jar" +tar -C "$dist" -czf "$tarball" "doctruth-${version}" +cp "$jar" "$jar_out" + +if command -v shasum >/dev/null 2>&1; then + sha_cmd="shasum -a 256" +elif command -v sha256sum >/dev/null 2>&1; then + sha_cmd="sha256sum" +else + echo "missing shasum or sha256sum" >&2 + exit 1 +fi + +$sha_cmd "$tarball" "$jar_out" > "$dist/checksums.txt" +tar_sha="$($sha_cmd "$tarball" | awk '{print $1}')" 
+ +cat > "$dist/homebrew/doctruth.rb" <&2 + exit 2 + } + version="$1" + ;; + --dist) + shift + [ "$#" -gt 0 ] || { + echo "missing value for --dist" >&2 + exit 2 + } + dist="$1" + ;; + *) + echo "unknown option: $1" >&2 + exit 2 + ;; + esac + shift +done + +tarball="${dist}/doctruth-${version}.tar.gz" + +if [ ! -f "$tarball" ]; then + echo "CLI tarball not found: $tarball" >&2 + echo "Package it first: scripts/package-cli-release.sh --version $version" >&2 + exit 1 +fi + +rm -rf "$work" +mkdir -p "$work" +tar -xzf "$tarball" -C "$work" + +output="$("$java_bin" -jar "$work/doctruth-${version}/lib/doctruth-java-all.jar" version)" +case "$output" in + "DocTruth ${version}") ;; + *) + echo "unexpected jar version output: $output" >&2 + exit 1 + ;; +esac + +launcher_output="$(JAVA="$java_bin" "$work/doctruth-${version}/bin/doctruth" version)" +case "$launcher_output" in + "DocTruth ${version}") ;; + *) + echo "unexpected launcher version output: $launcher_output" >&2 + exit 1 + ;; +esac + +doctor_output="$(JAVA="$java_bin" "$work/doctruth-${version}/bin/doctruth" doctor)" +contains "$doctor_output" "DocTruth doctor" || { + echo "unexpected doctor output:" >&2 + echo "$doctor_output" >&2 + exit 1 +} +contains "$doctor_output" "java:" || { + echo "doctor output did not include java readiness:" >&2 + echo "$doctor_output" >&2 + exit 1 +} +contains "$doctor_output" "ready:" || { + echo "doctor output did not include final readiness:" >&2 + echo "$doctor_output" >&2 + exit 1 +} + +completion_output="$(JAVA="$java_bin" "$work/doctruth-${version}/bin/doctruth" completion bash)" +contains "$completion_output" "_doctruth()" || { + echo "unexpected completion output:" >&2 + echo "$completion_output" >&2 + exit 1 +} +contains "$completion_output" "doctor" || { + echo "completion output did not include doctor command:" >&2 + echo "$completion_output" >&2 + exit 1 +} +contains "$completion_output" "completion" || { + echo "completion output did not include completion command:" >&2 + 
echo "$completion_output" >&2 + exit 1 +} + +echo "CLI release smoke passed for $version" diff --git a/src/main/java/ai/doctruth/BoundingBox.java b/src/main/java/ai/doctruth/BoundingBox.java new file mode 100644 index 0000000..592ce52 --- /dev/null +++ b/src/main/java/ai/doctruth/BoundingBox.java @@ -0,0 +1,37 @@ +package ai.doctruth; + +/** + * Page-normalized rectangular evidence region using a top-left origin and a 1000-unit page + * scale. A value of {@code x1 == 1000} means the right edge of the rendered page, regardless + * of the source PDF page size. + * + * @param x0 left edge, inclusive. + * @param y0 top edge, inclusive. + * @param x1 right edge, exclusive. + * @param y1 bottom edge, exclusive. + * @since 0.2.0 + */ +public record BoundingBox(double x0, double y0, double x1, double y1) { + + private static final double PAGE_MIN = 0.0; + private static final double PAGE_MAX = 1000.0; + + public BoundingBox { + requireFinite("x0", x0); + requireFinite("y0", y0); + requireFinite("x1", x1); + requireFinite("y1", y1); + if (x0 < PAGE_MIN || y0 < PAGE_MIN || x1 > PAGE_MAX || y1 > PAGE_MAX) { + throw new IllegalArgumentException("bounding box must be page-normalized to 0..1000"); + } + if (x1 <= x0 || y1 <= y0) { + throw new IllegalArgumentException("bounding box must have positive width and height"); + } + } + + private static void requireFinite(String name, double value) { + if (!Double.isFinite(value)) { + throw new IllegalArgumentException(name + " must be finite"); + } + } +} diff --git a/src/main/java/ai/doctruth/Citation.java b/src/main/java/ai/doctruth/Citation.java index 0df3bcb..efab12c 100644 --- a/src/main/java/ai/doctruth/Citation.java +++ b/src/main/java/ai/doctruth/Citation.java @@ -1,11 +1,13 @@ package ai.doctruth; import java.util.Objects; +import java.util.Optional; /** * The verifiable evidence anchor for a single extracted field. 
A {@code Citation} ties an * extracted value back to a specific span of the source document plus the exact quote - * recovered from that span and a confidence score for the page-attribution match itself. + * recovered from that span, an optional visual bounding box, and a confidence score for the + * page-attribution match itself. * *

Invariants (enforced by the compact constructor): * @@ -15,19 +17,24 @@ *

  • {@code matchScore} is a real number in {@code [0.0, 1.0]} — {@code NaN} and infinities * are rejected. {@code 1.0} means an exact substring match; lower values come from * fuzzy / Levenshtein-style matchers. + *
  • {@code boundingBox} is non-null; use {@link Optional#empty()} when the source format + * has no reliable page geometry. * * * @param location the source-document span this citation points at. * @param exactQuote the literal text recovered from the source that justified the value. * @param matchScore similarity of {@code exactQuote} to the substring at {@code location}; * downstream matchers should treat {@code matchScore < 0.85} as a warning. + * @param boundingBox optional page-normalized visual region for PDF-originated text. * @since 0.1.0 */ -public record Citation(SourceLocation location, String exactQuote, double matchScore) { +public record Citation( + SourceLocation location, String exactQuote, double matchScore, Optional boundingBox) { public Citation { Objects.requireNonNull(location, "location"); Objects.requireNonNull(exactQuote, "exactQuote"); + Objects.requireNonNull(boundingBox, "boundingBox"); if (exactQuote.isBlank()) { throw new IllegalArgumentException("exactQuote must not be blank"); } @@ -35,4 +42,11 @@ public record Citation(SourceLocation location, String exactQuote, double matchS throw new IllegalArgumentException("matchScore must be a real number in [0.0, 1.0], got " + matchScore); } } + + /** + * Backward-compat constructor — leaves the visual bounding box absent. + */ + public Citation(SourceLocation location, String exactQuote, double matchScore) { + this(location, exactQuote, matchScore, Optional.empty()); + } } diff --git a/src/main/java/ai/doctruth/DocTruth.java b/src/main/java/ai/doctruth/DocTruth.java index 3f5a962..c6f7257 100644 --- a/src/main/java/ai/doctruth/DocTruth.java +++ b/src/main/java/ai/doctruth/DocTruth.java @@ -32,6 +32,48 @@ public static DocTruth from(LlmProvider provider) { return new DocTruth(provider); } + /** + * Begin the document-first happy path with an explicit provider. + * + *
    {@code
    +     * var result = DocTruth.withProvider(provider)
    +     *     .fromPdf(Path.of("resume.pdf"))
    +     *     .extract("Extract resume fields", Resume.class)
    +     *     .withEvidence()
    +     *     .run();
    +     * }
    + * + * @throws NullPointerException if {@code provider} is null. + */ + public static DocTruthClient withProvider(LlmProvider provider) { + Objects.requireNonNull(provider, "provider"); + return new DocTruthClient(provider); + } + + /** + * Begin the document-first happy path with the OpenAI provider using + * {@code OPENAI_API_KEY} from the process environment. + * + * @throws IllegalStateException if {@code OPENAI_API_KEY} is absent or blank. + */ + public static DocTruthClient withOpenAi() { + String apiKey = System.getenv("OPENAI_API_KEY"); + if (apiKey == null || apiKey.isBlank()) { + throw new IllegalStateException("OPENAI_API_KEY is not set"); + } + return withOpenAi(apiKey); + } + + /** + * Begin the document-first happy path with the OpenAI provider and an explicit key. + * + * @throws NullPointerException if {@code apiKey} is null. + * @throws IllegalArgumentException if {@code apiKey} is blank. + */ + public static DocTruthClient withOpenAi(String apiKey) { + return withProvider(new OpenAiProvider(apiKey)); + } + /** * Stage an extraction call: pair a free-text prompt with the target type. * diff --git a/src/main/java/ai/doctruth/DocTruthClient.java b/src/main/java/ai/doctruth/DocTruthClient.java new file mode 100644 index 0000000..7c09716 --- /dev/null +++ b/src/main/java/ai/doctruth/DocTruthClient.java @@ -0,0 +1,45 @@ +package ai.doctruth; + +import java.nio.file.Path; +import java.util.Objects; + +/** + * Document-first SDK entry point. Use this layer when the caller wants a short + * "document to value plus evidence" flow; use {@link DocTruth#from(LlmProvider)} for + * lower-level orchestration. 
+ * + * @since 0.2.0 + */ +public final class DocTruthClient { + + private final LlmProvider provider; + + DocTruthClient(LlmProvider provider) { + this.provider = Objects.requireNonNull(provider, "provider"); + } + + public DocTruthDocument from(ParsedDocument document) { + return new DocTruthDocument(provider, document); + } + + public DocTruthDocument fromPdf(Path path) throws ParseException { + return from(PdfDocumentParser.parse(path)); + } + + public DocTruthDocument fromPdf(String path) throws ParseException { + Objects.requireNonNull(path, "path"); + return fromPdf(Path.of(path)); + } + + public DocTruthDocument fromDocx(Path path) throws ParseException { + return from(DocxDocumentParser.parse(path)); + } + + public DocTruthDocument fromCsv(Path path) throws ParseException { + return from(CsvDocumentParser.parse(path)); + } + + public DocTruthDocument fromXlsx(Path path) throws ParseException { + return from(XlsxDocumentParser.parse(path)); + } +} diff --git a/src/main/java/ai/doctruth/DocTruthDocument.java b/src/main/java/ai/doctruth/DocTruthDocument.java new file mode 100644 index 0000000..b2a5260 --- /dev/null +++ b/src/main/java/ai/doctruth/DocTruthDocument.java @@ -0,0 +1,28 @@ +package ai.doctruth; + +import java.util.Objects; + +/** + * A parsed document bound to one provider. This is the short SDK path for users who + * think in terms of "extract this schema from this document". 
+ * + * @since 0.2.0 + */ +public final class DocTruthDocument { + + private final LlmProvider provider; + private final ParsedDocument document; + + DocTruthDocument(LlmProvider provider, ParsedDocument document) { + this.provider = Objects.requireNonNull(provider, "provider"); + this.document = Objects.requireNonNull(document, "document"); + } + + public DocumentExtractionBuilder extract(String prompt, Class type) { + return new DocumentExtractionBuilder<>(DocTruth.from(provider).extract(prompt, type), document); + } + + public DocumentJsonExtractionBuilder extractJson(String prompt, JsonSchema schema) { + return new DocumentJsonExtractionBuilder(DocTruth.from(provider).extractJson(prompt, schema), document); + } +} diff --git a/src/main/java/ai/doctruth/DocumentExtractionBuilder.java b/src/main/java/ai/doctruth/DocumentExtractionBuilder.java new file mode 100644 index 0000000..ddd8cd1 --- /dev/null +++ b/src/main/java/ai/doctruth/DocumentExtractionBuilder.java @@ -0,0 +1,43 @@ +package ai.doctruth; + +import java.time.Instant; +import java.util.Objects; + +/** + * Extraction builder bound to a document, so the happy path can end with + * {@link #run()} instead of passing the parsed document at the end. + * + * @param extracted value type. 
+ * @since 0.2.0 + */ +public final class DocumentExtractionBuilder { + + private final ExtractionBuilder delegate; + private final ParsedDocument document; + + DocumentExtractionBuilder(ExtractionBuilder delegate, ParsedDocument document) { + this.delegate = Objects.requireNonNull(delegate, "delegate"); + this.document = Objects.requireNonNull(document, "document"); + } + + public DocumentExtractionBuilder withEvidence() { + return new DocumentExtractionBuilder<>( + delegate.withProvenance().withConfidence().withBitemporal(), document); + } + + public DocumentExtractionBuilder withMaxRetries(int n) { + return new DocumentExtractionBuilder<>(delegate.withMaxRetries(n), document); + } + + public DocumentExtractionBuilder withContextStrategy(ContextStrategy strategy) { + return new DocumentExtractionBuilder<>(delegate.withContextStrategy(strategy), document); + } + + public DocumentExtractionBuilder withSourcePublishedAt(Instant sourcePublishedAt) { + return new DocumentExtractionBuilder<>(delegate.withSourcePublishedAt(sourcePublishedAt), document); + } + + public ExtractionResult run() throws ExtractionException { + return delegate.run(document); + } +} diff --git a/src/main/java/ai/doctruth/DocumentJsonExtractionBuilder.java b/src/main/java/ai/doctruth/DocumentJsonExtractionBuilder.java new file mode 100644 index 0000000..2b36248 --- /dev/null +++ b/src/main/java/ai/doctruth/DocumentJsonExtractionBuilder.java @@ -0,0 +1,48 @@ +package ai.doctruth; + +import java.time.Instant; +import java.util.Objects; + +import com.fasterxml.jackson.databind.JsonNode; + +/** + * JSON Schema extraction builder bound to a document. This keeps the advanced + * JSON-Schema path aligned with the SDK-first document flow. 
+ * + * @since 0.2.0 + */ +public final class DocumentJsonExtractionBuilder { + + private final JsonExtractionBuilder delegate; + private final ParsedDocument document; + + DocumentJsonExtractionBuilder(JsonExtractionBuilder delegate, ParsedDocument document) { + this.delegate = Objects.requireNonNull(delegate, "delegate"); + this.document = Objects.requireNonNull(document, "document"); + } + + public DocumentJsonExtractionBuilder withEvidence() { + return new DocumentJsonExtractionBuilder( + delegate.withProvenance().withConfidence().withBitemporal(), document); + } + + public DocumentJsonExtractionBuilder requireCitation(String fieldPath) { + return new DocumentJsonExtractionBuilder(delegate.requireCitation(fieldPath), document); + } + + public DocumentJsonExtractionBuilder withMaxRetries(int n) { + return new DocumentJsonExtractionBuilder(delegate.withMaxRetries(n), document); + } + + public DocumentJsonExtractionBuilder withContextStrategy(ContextStrategy strategy) { + return new DocumentJsonExtractionBuilder(delegate.withContextStrategy(strategy), document); + } + + public DocumentJsonExtractionBuilder withSourcePublishedAt(Instant sourcePublishedAt) { + return new DocumentJsonExtractionBuilder(delegate.withSourcePublishedAt(sourcePublishedAt), document); + } + + public ExtractionResult runJson() throws ExtractionException { + return delegate.runJson(document); + } +} diff --git a/src/main/java/ai/doctruth/ExtractionResult.java b/src/main/java/ai/doctruth/ExtractionResult.java index 45230b6..2b493ae 100644 --- a/src/main/java/ai/doctruth/ExtractionResult.java +++ b/src/main/java/ai/doctruth/ExtractionResult.java @@ -5,6 +5,7 @@ import java.nio.file.Path; import java.util.Map; import java.util.Objects; +import java.util.Optional; import ai.doctruth.internal.audit.ProvOExporter; import ai.doctruth.spi.SignatureProvider; @@ -55,6 +56,42 @@ public String toAuditJson() { return ProvOExporter.toJson(this); } + /** + * Convenience accessor for the citation of one 
field path. + * + * @return the citation for {@code fieldPath}, or {@code null} when this result has no citation for that path. + * @throws NullPointerException if {@code fieldPath} is null. + */ + public Citation citation(String fieldPath) { + Objects.requireNonNull(fieldPath, "fieldPath"); + return citations.get(fieldPath); + } + + /** + * Java-native citation lookup for callers that want to handle missing evidence + * explicitly. + * + * @throws NullPointerException if {@code fieldPath} is null. + */ + public Optional findCitation(String fieldPath) { + Objects.requireNonNull(fieldPath, "fieldPath"); + return Optional.ofNullable(citations.get(fieldPath)); + } + + /** + * Return a citation or fail with an actionable message. Use this when a field is not + * allowed to continue downstream without source evidence. + * + * @throws NullPointerException if {@code fieldPath} is null. + * @throws IllegalArgumentException if no citation exists for {@code fieldPath}. + */ + public Citation requireCitation(String fieldPath) { + return findCitation(fieldPath) + .orElseThrow(() -> new IllegalArgumentException("No citation for field path '" + fieldPath + "'. " + + "Call withEvidence()/withProvenance(), check the field name, " + + "or inspect result.citations().keySet().")); + } + /** * Write {@link #toAuditJson()} to {@code path}, creating parent directories if needed. * @@ -70,6 +107,27 @@ public void toAuditJson(Path path) throws IOException { Files.writeString(path, toAuditJson()); } + /** + * Alias for {@link #toAuditJson(Path)} using product language from the happy-path SDK. + * + * @throws NullPointerException if {@code path} is null. + * @throws IOException if the file or its parents cannot be written. + */ + public void writeAudit(Path path) throws IOException { + toAuditJson(path); + } + + /** + * Alias for {@link #toAuditJson(Path)} accepting a string path. + * + * @throws NullPointerException if {@code path} is null. 
+ * @throws IOException if the file or its parents cannot be written. + */ + public void writeAudit(String path) throws IOException { + Objects.requireNonNull(path, "path"); + toAuditJson(Path.of(path)); + } + /** * Render this result as audit JSON and pipe through {@code signer} for tamper-evident * persistence. The default {@link SignatureProvider#IDENTITY} preserves the existing diff --git a/src/main/java/ai/doctruth/LlmProviders.java b/src/main/java/ai/doctruth/LlmProviders.java new file mode 100644 index 0000000..4265908 --- /dev/null +++ b/src/main/java/ai/doctruth/LlmProviders.java @@ -0,0 +1,38 @@ +package ai.doctruth; + +import java.net.URI; +import java.util.Objects; + +/** + * Small provider factory for the common SDK path. Use these helpers with + * {@link DocTruth#withProvider(LlmProvider)} when application code should not directly + * construct provider classes. The methods intentionally accept explicit keys so production + * applications can source secrets from their own configuration layer. 
+ * + * @since 0.2.0 + */ +public final class LlmProviders { + + private LlmProviders() {} + + public static OpenAiProvider openAi(String apiKey) { + return new OpenAiProvider(apiKey); + } + + public static OpenAiProvider openAiCompatible(String apiKey, URI endpoint, String model) { + Objects.requireNonNull(endpoint, "endpoint"); + return new OpenAiProvider(apiKey, endpoint, model); + } + + public static AnthropicProvider anthropic(String apiKey) { + return new AnthropicProvider(apiKey); + } + + public static GeminiProvider gemini(String apiKey) { + return new GeminiProvider(apiKey); + } + + public static DeepSeekProvider deepSeek(String apiKey) { + return new DeepSeekProvider(apiKey); + } +} diff --git a/src/main/java/ai/doctruth/PdfDocumentParser.java b/src/main/java/ai/doctruth/PdfDocumentParser.java index 2310b52..59545f7 100644 --- a/src/main/java/ai/doctruth/PdfDocumentParser.java +++ b/src/main/java/ai/doctruth/PdfDocumentParser.java @@ -107,7 +107,7 @@ private static void appendPageSections(PDDocument pdf, int page, List(BlockKind.class); for (var block : blocks) { - sections.add(new TextSection(block.text(), block.location(), block.kind())); + sections.add(new TextSection(block.text(), block.location(), block.kind(), block.boundingBox())); counts.merge(block.kind(), 1, Integer::sum); } LOG.debug("page={} blocks={} kinds={}", page, blocks.size(), counts); diff --git a/src/main/java/ai/doctruth/PdfPageBlockExtractor.java b/src/main/java/ai/doctruth/PdfPageBlockExtractor.java index af8ba94..dc57b46 100644 --- a/src/main/java/ai/doctruth/PdfPageBlockExtractor.java +++ b/src/main/java/ai/doctruth/PdfPageBlockExtractor.java @@ -34,7 +34,8 @@ static List detectBlocksOnPage(PDDocument pdf, int pageNumber) thr } double medianHeight = medianHeight(positions); var groups = groupByYGap(positions, estimateLineSpacing(positions, medianHeight)); - return renderBlocks(pageNumber, positions, groups, medianHeight); + var mediaBox = pdf.getPage(pageNumber - 1).getMediaBox(); 
+ return renderBlocks(pageNumber, positions, groups, medianHeight, mediaBox.getWidth(), mediaBox.getHeight()); } private static List capturePageTextPositions(PDDocument pdf, int pageNumber) throws IOException { @@ -53,7 +54,12 @@ protected void processTextPosition(TextPosition text) { } private static List renderBlocks( - int pageNumber, List positions, List> groups, double medianHeight) { + int pageNumber, + List positions, + List> groups, + double medianHeight, + double pageWidth, + double pageHeight) { if (groups.isEmpty()) { return List.of(); } @@ -66,7 +72,11 @@ private static List renderBlocks( int lineCount = Math.max(1, (int) text.lines().count()); int charOffset = clampOffset(pageText, text, charCursor); var loc = new SourceLocation(pageNumber, pageNumber, lineCursor, lineCursor + lineCount - 1, charOffset); - out.add(new PdfTextBlock(text, classify(text, avgHeight(group), medianHeight), loc)); + out.add(new PdfTextBlock( + text, + classify(text, avgHeight(group), medianHeight), + loc, + PdfTextPositionBoxes.layoutBox(group, pageWidth, pageHeight))); charCursor = charOffset + text.length(); lineCursor += lineCount; } diff --git a/src/main/java/ai/doctruth/PdfTextBlock.java b/src/main/java/ai/doctruth/PdfTextBlock.java index ae252bd..9e23529 100644 --- a/src/main/java/ai/doctruth/PdfTextBlock.java +++ b/src/main/java/ai/doctruth/PdfTextBlock.java @@ -1,4 +1,6 @@ package ai.doctruth; +import java.util.Optional; + /** Internal carrier for one layout block detected on a PDF page. 
*/ -record PdfTextBlock(String text, BlockKind kind, SourceLocation location) {} +record PdfTextBlock(String text, BlockKind kind, SourceLocation location, Optional boundingBox) {} diff --git a/src/main/java/ai/doctruth/PdfTextPositionBoxes.java b/src/main/java/ai/doctruth/PdfTextPositionBoxes.java new file mode 100644 index 0000000..114b6ca --- /dev/null +++ b/src/main/java/ai/doctruth/PdfTextPositionBoxes.java @@ -0,0 +1,95 @@ +package ai.doctruth; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.Optional; + +import org.apache.pdfbox.text.TextPosition; + +final class PdfTextPositionBoxes { + + private static final double PAGE_SCALE = 1000.0; + private static final double BASELINE_EPSILON = 2.0; + private static final double LINE_ASCENT_FACTOR = 1.67; + private static final double LINE_DESCENT_FACTOR = 0.31; + + private PdfTextPositionBoxes() { + throw new AssertionError("no instances"); + } + + static Optional layoutBox(List positions, double pageWidth, double pageHeight) { + var lines = groupByBaseline(nonBlank(positions)); + if (lines.isEmpty() || pageWidth <= 0.0 || pageHeight <= 0.0) { + return Optional.empty(); + } + return scale(combine(lines.stream().map(PdfTextPositionBoxes::lineBox).toList()), pageWidth, pageHeight); + } + + private static List nonBlank(List positions) { + return positions.stream() + .filter(position -> + position.getUnicode() != null && !position.getUnicode().isBlank()) + .sorted(Comparator.comparingDouble(TextPosition::getYDirAdj) + .thenComparingDouble(TextPosition::getXDirAdj)) + .toList(); + } + + private static List> groupByBaseline(List positions) { /* Splits the y-sorted glyphs into lines: a glyph joins the current line when its baseline is within BASELINE_EPSILON of that line's first glyph, otherwise it starts a new line. */ + var lines = new ArrayList>(); + var current = new ArrayList(); + double currentY = Double.NaN; + for (TextPosition position : positions) { + double y = position.getYDirAdj(); + if (current.isEmpty() || Math.abs(y - currentY) <= BASELINE_EPSILON) { + current.add(position); + } else { + lines.add(current); + current = new
ArrayList<>(List.of(position)); + } + currentY = current.size() == 1 ? y : currentY; /* a line that was just started has exactly one glyph: re-anchor the tolerance band on it, otherwise every glyph after the first line is compared against line 1's baseline and split into its own line */ + } + if (!current.isEmpty()) { + lines.add(current); + } + return lines; + } + + private static RawBox lineBox(List line) { + double baseline = + line.stream().mapToDouble(TextPosition::getYDirAdj).max().orElseThrow(); + double height = + line.stream().mapToDouble(TextPosition::getHeightDir).max().orElseThrow(); + double x0 = line.stream().mapToDouble(TextPosition::getXDirAdj).min().orElseThrow(); + double x1 = line.stream() + .mapToDouble(position -> position.getXDirAdj() + position.getWidthDirAdj()) + .max() + .orElseThrow(); + return new RawBox(x0, baseline - LINE_ASCENT_FACTOR * height, x1, baseline + LINE_DESCENT_FACTOR * height); + } + + private static RawBox combine(List boxes) { + return new RawBox( + boxes.stream().mapToDouble(RawBox::x0).min().orElseThrow(), + boxes.stream().mapToDouble(RawBox::y0).min().orElseThrow(), + boxes.stream().mapToDouble(RawBox::x1).max().orElseThrow(), + boxes.stream().mapToDouble(RawBox::y1).max().orElseThrow()); + } + + private static Optional scale(RawBox box, double pageWidth, double pageHeight) { + double x0 = clamp(box.x0() * PAGE_SCALE / pageWidth); + double y0 = clamp(box.y0() * PAGE_SCALE / pageHeight); + double x1 = clamp(box.x1() * PAGE_SCALE / pageWidth); + double y1 = clamp(box.y1() * PAGE_SCALE / pageHeight); + if (x1 <= x0 || y1 <= y0) { + return Optional.empty(); + } + return Optional.of(new BoundingBox(x0, y0, x1, y1)); + } + + private static double clamp(double value) { + return Math.max(0.0, Math.min(PAGE_SCALE, value)); + } + + private record RawBox(double x0, double y0, double x1, double y1) {} +} diff --git a/src/main/java/ai/doctruth/TextSection.java b/src/main/java/ai/doctruth/TextSection.java index e43f52b..9e90a48 100644 --- a/src/main/java/ai/doctruth/TextSection.java +++ b/src/main/java/ai/doctruth/TextSection.java @@ -1,6 +1,7 @@ package ai.doctruth; import java.util.Objects; +import java.util.Optional; /** * A
run of plain text recovered from the source document, anchored to a {@link SourceLocation} @@ -17,15 +18,25 @@ * * @param text the recovered text run. * @param location the source-document span this text was recovered from. - * @param kind the geometric / typographic classification of the block. + * @param kind the geometric / typographic classification of the block. + * @param boundingBox optional page-normalized visual region for PDF-originated text. * @since 0.1.0 */ -public record TextSection(String text, SourceLocation location, BlockKind kind) implements ParsedSection { +public record TextSection(String text, SourceLocation location, BlockKind kind, Optional boundingBox) + implements ParsedSection { public TextSection { Objects.requireNonNull(text, "text"); Objects.requireNonNull(location, "location"); Objects.requireNonNull(kind, "kind"); + Objects.requireNonNull(boundingBox, "boundingBox"); + } + + /** + * Backward-compat 3-arg constructor — leaves the visual bounding box absent. 
+ */ + public TextSection(String text, SourceLocation location, BlockKind kind) { + this(text, location, kind, Optional.empty()); } /** diff --git a/src/main/java/ai/doctruth/cli/ArgCursor.java b/src/main/java/ai/doctruth/cli/ArgCursor.java new file mode 100644 index 0000000..a30f8a6 --- /dev/null +++ b/src/main/java/ai/doctruth/cli/ArgCursor.java @@ -0,0 +1,32 @@ +package ai.doctruth.cli; + +import java.nio.file.Path; + +final class ArgCursor { + + private final String[] args; + private int index; + + ArgCursor(String[] args, int start) { + this.args = args.clone(); + this.index = start; + } + + boolean hasNext() { + return index < args.length; + } + + String next() { + if (!hasNext()) { + throw new UsageException("missing argument"); + } + return args[index++]; + } + + Path nextPath(String option) { + if (!hasNext()) { + throw new UsageException(option + " requires a path"); + } + return Path.of(next()); + } +} diff --git a/src/main/java/ai/doctruth/cli/AuditCommand.java b/src/main/java/ai/doctruth/cli/AuditCommand.java new file mode 100644 index 0000000..f4da167 --- /dev/null +++ b/src/main/java/ai/doctruth/cli/AuditCommand.java @@ -0,0 +1,90 @@ +package ai.doctruth.cli; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; + +final class AuditCommand { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + private final CliContext context; + + AuditCommand(CliContext context) { + this.context = context; + } + + void run(String[] args) throws CliException { + var options = AuditOptions.parse(args); + JsonNode audit = read(options.path()); + if (options.json()) { + context.out().println(compactSummary(audit).toPrettyString()); + return; + } + print(audit); + } + + private void print(JsonNode audit) { + JsonNode derived = audit.path("prov:wasDerivedFrom"); + long weak = 
java.util.stream.StreamSupport.stream(derived.spliterator(), false) + .filter(entry -> entry.path("doctruth:matchScore").asDouble(0.0) < 0.85) + .count(); + context.out().println("fields: " + derived.size()); + context.out().println("cited: " + derived.size()); + context.out().println("weak matches: " + weak); + context.out().println(); + for (JsonNode entry : derived) { + printEntry(entry); + } + } + + private void printEntry(JsonNode entry) { + var location = entry.path("doctruth:sourceLocation"); + context.out().println(entry.path("doctruth:fieldPath").asText()); + context.out().println(" quote: " + entry.path("prov:value").asText()); + context.out() + .println(" page: " + location.path("pageStart").asInt() + " line: " + + location.path("lineStart").asInt()); + context.out() + .printf(" match: %.2f%n", entry.path("doctruth:matchScore").asDouble()); + } + + private static JsonNode compactSummary(JsonNode audit) { + var node = MAPPER.createObjectNode(); + JsonNode derived = audit.path("prov:wasDerivedFrom"); + node.put("fields", derived.size()); + node.put("cited", derived.size()); + return node; + } + + private static JsonNode read(Path path) throws CliException { + try { + return MAPPER.readTree(Files.readString(path)); + } catch (IOException e) { + throw new CliException("failed to read audit JSON " + path + ": " + e.getMessage(), e); + } + } + + private record AuditOptions(Path path, boolean json) { + static AuditOptions parse(String[] args) { + if (args.length < 2) { + throw new UsageException("usage: doctruth audit [--json]"); + } + Path path = Path.of(args[1]); + boolean json = false; + var cursor = new ArgCursor(args, 2); + while (cursor.hasNext()) { + String arg = cursor.next(); + if ("--json".equals(arg)) { + json = true; + } else { + throw new UsageException("unknown audit option: " + arg); + } + } + return new AuditOptions(path, json); + } + } +} diff --git a/src/main/java/ai/doctruth/cli/CliConfig.java b/src/main/java/ai/doctruth/cli/CliConfig.java 
new file mode 100644 index 0000000..1bc895a --- /dev/null +++ b/src/main/java/ai/doctruth/cli/CliConfig.java @@ -0,0 +1,40 @@ +package ai.doctruth.cli; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; +import java.util.Optional; + +record CliConfig(String provider, Optional model, Path output, Map env) { + + static CliConfig load(Map env) { + return load(Path.of("doctruth.yml"), env); + } + + static CliConfig load(Path path, Map env) { + String provider = "openai"; + Optional model = Optional.empty(); + Path output = Path.of(".doctruth/runs"); + if (Files.exists(path)) { + var values = readValues(path); + provider = values.getOrDefault("provider", provider); + model = Optional.ofNullable(values.get("model")).filter(s -> !s.isBlank()); + output = Path.of(values.getOrDefault("output", output.toString())); + } + return new CliConfig(provider, model, output, env); + } + + private static Map readValues(Path path) { + try { + return Files.readAllLines(path).stream() + .map(String::trim) + .filter(line -> !line.isBlank() && !line.startsWith("#") && line.contains(":")) + .map(line -> line.split(":", 2)) + .collect(java.util.stream.Collectors.toUnmodifiableMap( + parts -> parts[0].trim(), parts -> parts[1].trim(), (a, b) -> b)); + } catch (IOException e) { + throw new UsageException("failed to read " + path + ": " + e.getMessage()); + } + } +} diff --git a/src/main/java/ai/doctruth/cli/CliContext.java b/src/main/java/ai/doctruth/cli/CliContext.java new file mode 100644 index 0000000..0e71ba6 --- /dev/null +++ b/src/main/java/ai/doctruth/cli/CliContext.java @@ -0,0 +1,21 @@ +package ai.doctruth.cli; + +import java.io.PrintStream; +import java.util.Map; +import java.util.Objects; + +record CliContext( + Map env, + PrintStream out, + PrintStream err, + DocTruthCli.PydanticExporter exporter, + DocTruthCli.ProviderFactory providers) { + + CliContext { + env = Map.copyOf(Objects.requireNonNull(env, "env")); + 
Objects.requireNonNull(out, "out"); + Objects.requireNonNull(err, "err"); + Objects.requireNonNull(exporter, "exporter"); + Objects.requireNonNull(providers, "providers"); + } +} diff --git a/src/main/java/ai/doctruth/cli/CliException.java b/src/main/java/ai/doctruth/cli/CliException.java new file mode 100644 index 0000000..e1ab66c --- /dev/null +++ b/src/main/java/ai/doctruth/cli/CliException.java @@ -0,0 +1,13 @@ +package ai.doctruth.cli; + +class CliException extends Exception { + private static final long serialVersionUID = 1L; + + CliException(String message) { + super(message); + } + + CliException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/src/main/java/ai/doctruth/cli/CompletionCommand.java b/src/main/java/ai/doctruth/cli/CompletionCommand.java new file mode 100644 index 0000000..5aef72f --- /dev/null +++ b/src/main/java/ai/doctruth/cli/CompletionCommand.java @@ -0,0 +1,54 @@ +package ai.doctruth.cli; + +final class CompletionCommand { + + private static final String COMMANDS = "init parse schema extract audit migrate doctor completion version"; + + private final CliContext context; + + CompletionCommand(CliContext context) { + this.context = context; + } + + void run(String[] args) { + if (args.length != 2) { + throw new UsageException("usage: doctruth completion "); + } + context.out().println(script(args[1])); + } + + private static String script(String shell) { + return switch (shell) { + case "bash" -> bash(); + case "zsh" -> zsh(); + case "fish" -> fish(); + default -> throw new UsageException("unsupported shell: " + shell + "; supported shells: bash, zsh, fish"); + }; + } + + private static String bash() { + return """ + _doctruth() { + local cur="${COMP_WORDS[COMP_CWORD]}" + COMPREPLY=( $(compgen -W "%s" -- "$cur") ) + } + complete -F _doctruth doctruth + """.formatted(COMMANDS); + } + + private static String zsh() { + return """ + #compdef doctruth + _doctruth() { + compadd %s + } + _doctruth "$@" + 
""".formatted(COMMANDS); + } + + private static String fish() { + return """ + complete -c doctruth -f -a "%s" + """.formatted(COMMANDS); + } +} diff --git a/src/main/java/ai/doctruth/cli/DocTruthCli.java b/src/main/java/ai/doctruth/cli/DocTruthCli.java index 06c1745..7d44ebc 100644 --- a/src/main/java/ai/doctruth/cli/DocTruthCli.java +++ b/src/main/java/ai/doctruth/cli/DocTruthCli.java @@ -3,34 +3,30 @@ import java.io.IOException; import java.io.PrintStream; import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; import java.util.Map; import java.util.Objects; -import ai.doctruth.JsonSchema; -import ai.doctruth.internal.schema.JsonSchemaCompatibility; +import ai.doctruth.LlmProvider; /** - * Minimal command-line entry point for build-time migration helpers. Runtime library - * users do not need this class, and production extraction does not depend on Python. + * Command-line entry point. The CLI is a developer onboarding surface; the library API + * remains the production integration surface. 
*/ public final class DocTruthCli { - private final PrintStream out; - private final PrintStream err; - private final PydanticExporter exporter; + private final CliContext context; public DocTruthCli() { - this(System.getenv(), System.out, System.err, new PythonPydanticExporter(System.getenv())); + this(System.getenv(), System.out, System.err, new PythonPydanticExporter(System.getenv()), Providers::create); } - DocTruthCli(Map env, PrintStream out, PrintStream err, PydanticExporter exporter) { - Objects.requireNonNull(env, "env"); - this.out = Objects.requireNonNull(out, "out"); - this.err = Objects.requireNonNull(err, "err"); - this.exporter = Objects.requireNonNull(exporter, "exporter"); + DocTruthCli( + Map env, + PrintStream out, + PrintStream err, + PydanticExporter exporter, + ProviderFactory providers) { + this.context = new CliContext(env, out, err, exporter, providers); } public static void main(String[] args) { @@ -41,80 +37,41 @@ int run(String[] args) { try { return runChecked(args); } catch (UsageException e) { - err.println(e.getMessage()); + context.err().println(e.getMessage()); + context.err().println("Try: doctruth --help"); return 2; - } catch (MigrationException e) { - err.println(e.getMessage()); + } catch (CliException e) { + context.err().println(e.getMessage()); return 1; } } - private int runChecked(String[] args) throws MigrationException { - if (args.length < 1 || "--help".equals(args[0])) { - throw new UsageException(usage()); - } - if (args.length >= 3 && "migrate".equals(args[0]) && "pydantic".equals(args[1])) { - migratePydantic(args); + private int runChecked(String[] args) throws CliException { + if (args.length == 0 || "--help".equals(args[0]) || "-h".equals(args[0])) { + context.out().println(Usage.main()); return 0; } - throw new UsageException(usage()); - } - - private void migratePydantic(String[] args) throws MigrationException { - String spec = args[2]; - if (!spec.contains(":")) { - throw new UsageException("expected : for 
pydantic migration"); - } - var options = MigrationOptions.parse(Arrays.copyOfRange(args, 3, args.length)); - String schemaJson = exportSchema(spec); - JsonSchema schema = readSchema(schemaJson); - if (options.check()) { - checkCompatible(schema); - } - writeSchema(options.out(), schemaJson); - out.println(options.check() ? "schema compatible: " + options.out() : "schema exported: " + options.out()); - } - - private String exportSchema(String spec) throws MigrationException { - try { - return exporter.export(spec); - } catch (IOException e) { - throw new MigrationException("failed to export Pydantic schema: " + e.getMessage(), e); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - throw new MigrationException("Pydantic schema export interrupted", e); - } - } - - private static JsonSchema readSchema(String schemaJson) throws MigrationException { - try { - return JsonSchema.from(schemaJson); - } catch (IllegalArgumentException e) { - throw new MigrationException("exported Pydantic schema is not valid JSON: " + e.getMessage(), e); - } - } - - private static void checkCompatible(JsonSchema schema) throws MigrationException { - var errors = JsonSchemaCompatibility.check(schema.node()); - if (!errors.isEmpty()) { - throw new MigrationException("schema compatibility check failed: " + String.join("; ", errors)); + if ("--version".equals(args[0]) || "version".equals(args[0])) { + context.out().println("DocTruth " + version()); + return 0; } - } - - private static void writeSchema(Path out, String schemaJson) throws MigrationException { - try { - Path parent = out.toAbsolutePath().getParent(); - if (parent != null) { - Files.createDirectories(parent); - } - Files.writeString(out, schemaJson, StandardCharsets.UTF_8); - } catch (IOException e) { - throw new MigrationException("failed to write schema to " + out + ": " + e.getMessage(), e); + switch (args[0]) { + case "init" -> new InitCommand(context).run(args); + case "parse" -> new 
ParseCommand(context).run(args); + case "schema" -> new SchemaCommand(context).run(args); + case "extract" -> new ExtractCommand(context).run(args); + case "audit" -> new AuditCommand(context).run(args); + case "migrate" -> new MigrateCommand(context).run(args); + case "doctor" -> new DoctorCommand(context).run(args); + case "completion" -> new CompletionCommand(context).run(args); + default -> throw new UsageException("unknown command: " + args[0]); } + return 0; } - private static String usage() { - return "usage: doctruth migrate pydantic : --out [--check]"; + private static String version() { + String value = DocTruthCli.class.getPackage().getImplementationVersion(); + return value == null || value.isBlank() ? "0.2.0-alpha" : value; } @FunctionalInterface @@ -122,24 +79,9 @@ interface PydanticExporter { String export(String spec) throws IOException, InterruptedException; } - private record MigrationOptions(Path out, boolean check) { - static MigrationOptions parse(String[] args) { - Path out = null; - boolean check = false; - for (int i = 0; i < args.length; i++) { - if ("--check".equals(args[i])) { - check = true; - } else if ("--out".equals(args[i]) && i + 1 < args.length) { - out = Path.of(args[++i]); - } else { - throw new UsageException("unknown or incomplete option: " + args[i]); - } - } - if (out == null) { - throw new UsageException("--out is required"); - } - return new MigrationOptions(out, check); - } + @FunctionalInterface + interface ProviderFactory { + LlmProvider create(ProviderConfig options) throws CliException; } static final class PythonPydanticExporter implements PydanticExporter { @@ -154,7 +96,7 @@ static final class PythonPydanticExporter implements PydanticExporter { private final Map env; PythonPydanticExporter(Map env) { - this.env = Map.copyOf(env); + this.env = Map.copyOf(Objects.requireNonNull(env, "env")); } @Override @@ -170,24 +112,4 @@ public String export(String spec) throws IOException, InterruptedException { return stdout; } 
} - - private static class UsageException extends RuntimeException { - private static final long serialVersionUID = 1L; - - private UsageException(String message) { - super(message); - } - } - - private static class MigrationException extends Exception { - private static final long serialVersionUID = 1L; - - private MigrationException(String message) { - super(message); - } - - private MigrationException(String message, Throwable cause) { - super(message, cause); - } - } } diff --git a/src/main/java/ai/doctruth/cli/DoctorCommand.java b/src/main/java/ai/doctruth/cli/DoctorCommand.java new file mode 100644 index 0000000..2360bf0 --- /dev/null +++ b/src/main/java/ai/doctruth/cli/DoctorCommand.java @@ -0,0 +1,114 @@ +package ai.doctruth.cli; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +final class DoctorCommand { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + private static final Map PROVIDER_KEYS = Map.of( + "OPENAI_API_KEY", + "OpenAI / OpenAI-compatible", + "ANTHROPIC_API_KEY", + "Anthropic", + "GOOGLE_API_KEY", + "Gemini", + "DEEPSEEK_API_KEY", + "DeepSeek"); + + private final CliContext context; + + DoctorCommand(CliContext context) { + this.context = context; + } + + void run(String[] args) throws CliException { + boolean json = false; + var cursor = new ArgCursor(args, 1); + while (cursor.hasNext()) { + String arg = cursor.next(); + if ("--json".equals(arg)) { + json = true; + } else { + throw new UsageException("unknown doctor option: " + arg); + } + } + + var report = DoctorReport.create(context.env()); + if (json) { + context.out().println(report.toJson()); + } else { + context.out().print(report.toText()); + } + } + + private record DoctorReport( + String javaVersion, + int javaFeature, + boolean javaSupported, + boolean projectConfig, + boolean outputDir, + Map env, + boolean ready)
{ + + static DoctorReport create(Map env) { /* Builds the report from the running JVM's version, whether doctruth.yml and .doctruth/runs exist relative to the working directory, and which PROVIDER_KEYS variables are non-blank in env; ready requires Java 25+ and at least one provider key. */ + var keys = PROVIDER_KEYS.keySet().stream() + .sorted() + .collect(java.util.stream.Collectors.toMap(k -> k, k -> isSet(env.get(k)), (a, b) -> a, java.util.LinkedHashMap::new)); /* LinkedHashMap preserves the sorted encounter order; an unmodifiable map has unspecified iteration order, which discarded the sort and let toText() list keys in a different order per run */ + int feature = Runtime.version().feature(); + boolean javaOk = feature >= 25; + boolean config = Files.exists(Path.of("doctruth.yml")); + boolean output = Files.exists(Path.of(".doctruth/runs")); + boolean hasProvider = keys.values().stream().anyMatch(Boolean::booleanValue); + return new DoctorReport( + System.getProperty("java.version"), feature, javaOk, config, output, java.util.Collections.unmodifiableMap(keys), javaOk && hasProvider); + } + + String toText() { + var text = new StringBuilder() + .append("DocTruth doctor\n") + .append("java: ") + .append(javaVersion) + .append(javaSupported ? " ok" : " needs Java 25+") + .append('\n') + .append("project: ") + .append(projectConfig ? "doctruth.yml found" : "run `doctruth init` to create doctruth.yml") + .append('\n') + .append("runs: ") + .append(outputDir ? ".doctruth/runs found" : "created by `doctruth init` or first extraction") + .append('\n'); + env.forEach((key, set) -> text.append(key) + .append(": ") + .append(set ? "set" : "missing") + .append('\n')); + return text.append("ready: ") + .append(ready ?
"yes" : "no") + .append('\n') + .toString(); + } + + String toJson() throws CliException { + try { + return MAPPER.writeValueAsString(Map.of( + "java", + Map.of("version", javaVersion, "feature", javaFeature, "supported", javaSupported), + "project", + Map.of("config", projectConfig, "runsDirectory", outputDir), + "env", + env, + "ready", + ready)); + } catch (JsonProcessingException e) { + throw new CliException("failed to render doctor JSON: " + e.getMessage(), e); + } + } + + private static boolean isSet(String value) { + return value != null && !value.isBlank(); + } + } +} diff --git a/src/main/java/ai/doctruth/cli/DocumentParsers.java b/src/main/java/ai/doctruth/cli/DocumentParsers.java new file mode 100644 index 0000000..c824c0f --- /dev/null +++ b/src/main/java/ai/doctruth/cli/DocumentParsers.java @@ -0,0 +1,41 @@ +package ai.doctruth.cli; + +import java.nio.file.Path; +import java.util.Locale; + +import ai.doctruth.CsvDocumentParser; +import ai.doctruth.DocxDocumentParser; +import ai.doctruth.ParseException; +import ai.doctruth.ParsedDocument; +import ai.doctruth.PdfDocumentParser; +import ai.doctruth.XlsxDocumentParser; + +final class DocumentParsers { + + private DocumentParsers() { + throw new AssertionError("no instances"); + } + + static ParsedDocument parse(Path path) throws CliException { + try { + return switch (extension(path)) { + case "pdf" -> PdfDocumentParser.parse(path); + case "docx" -> DocxDocumentParser.parse(path); + case "xlsx" -> XlsxDocumentParser.parse(path); + case "csv" -> CsvDocumentParser.parse(path); + default -> throw new CliException("unsupported document format: " + path); + }; + } catch (ParseException e) { + throw new CliException("failed to parse " + path + ": " + e.getMessage(), e); + } + } + + private static String extension(Path path) { + String name = path.getFileName().toString(); + int dot = name.lastIndexOf('.'); + if (dot < 0 || dot == name.length() - 1) { + return ""; + } + return name.substring(dot +
1).toLowerCase(Locale.ROOT); + } +} diff --git a/src/main/java/ai/doctruth/cli/ExtractCommand.java b/src/main/java/ai/doctruth/cli/ExtractCommand.java new file mode 100644 index 0000000..063a082 --- /dev/null +++ b/src/main/java/ai/doctruth/cli/ExtractCommand.java @@ -0,0 +1,177 @@ +package ai.doctruth.cli; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.util.LinkedHashSet; +import java.util.Optional; +import java.util.Set; + +import ai.doctruth.DocTruth; +import ai.doctruth.ExtractionException; +import ai.doctruth.ExtractionResult; +import ai.doctruth.JsonSchema; + +import com.fasterxml.jackson.databind.JsonNode; + +final class ExtractCommand { + + private static final String DEFAULT_PROMPT = "Extract the document fields according to the supplied schema."; + + private final CliContext context; + + ExtractCommand(CliContext context) { + this.context = context; + } + + void run(String[] args) throws CliException { + var options = ExtractOptions.parse(args); + var config = CliConfig.load(context.env()); + var doc = DocumentParsers.parse(options.document()); + var schema = SchemaCommand.readSchema(options.schema()); + var provider = context.providers().create(options.providerConfig(config)); + var result = runExtraction(doc, schema, provider, options); + Path dir = options.out().orElseGet(() -> defaultRunDir(config)); + writeOutputs(dir, result); + printSummary(dir, result); + } + + private static ExtractionResult runExtraction( + ai.doctruth.ParsedDocument doc, JsonSchema schema, ai.doctruth.LlmProvider provider, ExtractOptions options) + throws CliException { + try { + var builder = DocTruth.from(provider) + .extractJson(options.prompt(), schema) + .withProvenance() + .withConfidence() + .withBitemporal() + .withMaxRetries(2); + if (!options.allowUncited()) { + for (String field : options.requiredFields(schema)) { + builder = builder.requireCitation(field); + } + }
+ return builder.runJson(doc); + } catch (ExtractionException e) { + throw new CliException("extraction failed: " + e.getMessage(), e); + } + } + + private static void writeOutputs(Path dir, ExtractionResult result) throws CliException { + try { + Files.createDirectories(dir); + Files.writeString(dir.resolve("result.json"), result.value().toPrettyString()); + result.toAuditJson(dir.resolve("audit.json")); + } catch (IOException e) { + throw new CliException("failed to write extraction outputs: " + e.getMessage(), e); + } + } + + private void printSummary(Path dir, ExtractionResult result) { + int fields = result.value().isObject() ? result.value().size() : 1; + long weak = result.citations().values().stream() + .filter(c -> c.matchScore() < 0.85) + .count(); + context.out().println("extracted"); + context.out().println("fields: " + fields); + context.out().println("cited: " + result.citations().size()); + context.out().println("weak matches: " + weak); + context.out().println("result: " + dir.resolve("result.json")); + context.out().println("audit: " + dir.resolve("audit.json")); + } + + private static Path defaultRunDir(CliConfig config) { + String id = "run_" + Instant.now().toString().replaceAll("[^0-9A-Za-z]", ""); + return config.output().resolve(id); + } + + private record ExtractOptions( + Path document, + Path schema, + Optional out, + String provider, + Optional model, + Optional baseUrl, + boolean allowUncited, + Set require, + String prompt) { + + static ExtractOptions parse(String[] args) { + if (args.length < 2) { + throw new UsageException("usage: doctruth extract -s [-o out/]"); + } + Path document = Path.of(args[1]); + Path schema = null; + Path out = null; + String provider = null; + Optional model = Optional.empty(); + Optional baseUrl = Optional.empty(); + boolean allowUncited = false; + Set require = new LinkedHashSet<>(); + String prompt = DEFAULT_PROMPT; + var cursor = new ArgCursor(args, 2); + while (cursor.hasNext()) { + String arg =
cursor.next(); + switch (arg) { + case "-s", "--schema" -> schema = cursor.nextPath(arg); + case "-o", "--out" -> out = cursor.nextPath(arg); + case "--provider" -> provider = cursor.next(); + case "--model" -> model = Optional.of(cursor.next()); + case "--base-url" -> baseUrl = Optional.of(parseBaseUrl(cursor.next())); + case "--allow-uncited" -> allowUncited = true; + case "--require" -> addRequired(require, cursor.next()); + case "--prompt" -> prompt = cursor.next(); + default -> throw new UsageException("unknown extract option: " + arg); + } + } + if (schema == null) { + throw new UsageException("-s is required"); + } + return new ExtractOptions( + document, + schema, + Optional.ofNullable(out), + provider, + model, + baseUrl, + allowUncited, + Set.copyOf(require), + prompt); + } + + ProviderConfig providerConfig(CliConfig config) { + return new ProviderConfig(provider, model, baseUrl, config); + } + + Set requiredFields(JsonSchema schema) { + if (!require.isEmpty()) { + return require; + } + var properties = schema.node().path("properties"); + var fields = new LinkedHashSet(); + if (properties.isObject()) { + properties.fieldNames().forEachRemaining(fields::add); + } + return Set.copyOf(fields); + } + + private static void addRequired(Set require, String csv) { + for (String field : csv.split(",")) { + if (!field.isBlank()) { + require.add(field.trim()); + } + } + } + + /** Parses the --base-url value; a malformed URI is reported as a UsageException instead of escaping as an uncaught IllegalArgumentException from URI.create. */ + private static URI parseBaseUrl(String value) { + try { + return URI.create(value); + } catch (IllegalArgumentException e) { + throw new UsageException("--base-url is not a valid URI: " + value); + } + } + } +} diff --git a/src/main/java/ai/doctruth/cli/InitCommand.java b/src/main/java/ai/doctruth/cli/InitCommand.java new file mode 100644 index 0000000..48d6043 --- /dev/null +++ b/src/main/java/ai/doctruth/cli/InitCommand.java @@ -0,0 +1,53 @@ +package ai.doctruth.cli; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +final class InitCommand { + + private static final String CONFIG = """ + provider: openai + model: gpt-4o + output: .doctruth/runs + citation: + require: true + minMatchScore: 0.85 + """; + + private final CliContext context; + +
InitCommand(CliContext context) { + this.context = context; + } + + void run(String[] args) throws CliException { + Path dir = parseDir(args); + try { + Files.createDirectories(dir.resolve("schemas")); + Files.createDirectories(dir.resolve(".doctruth/runs")); + Path config = dir.resolve("doctruth.yml"); + if (!Files.exists(config)) { + Files.writeString(config, CONFIG); + } + context.out().println("initialized: " + dir); + context.out().println("config: " + config); + } catch (IOException e) { + throw new CliException("failed to initialize DocTruth project: " + e.getMessage(), e); + } + } + + private static Path parseDir(String[] args) { + Path dir = Path.of("."); + var cursor = new ArgCursor(args, 1); + while (cursor.hasNext()) { + String arg = cursor.next(); + if ("--dir".equals(arg)) { + dir = cursor.nextPath("--dir"); + } else { + throw new UsageException("unknown init option: " + arg); + } + } + return dir; + } +} diff --git a/src/main/java/ai/doctruth/cli/MigrateCommand.java b/src/main/java/ai/doctruth/cli/MigrateCommand.java new file mode 100644 index 0000000..bc330e4 --- /dev/null +++ b/src/main/java/ai/doctruth/cli/MigrateCommand.java @@ -0,0 +1,100 @@ +package ai.doctruth.cli; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; + +import ai.doctruth.JsonSchema; +import ai.doctruth.internal.schema.JsonSchemaCompatibility; + +final class MigrateCommand { + + private final CliContext context; + + MigrateCommand(CliContext context) { + this.context = context; + } + + void run(String[] args) throws CliException { + if (args.length < 3 || !"pydantic".equals(args[1])) { + throw new UsageException("usage: doctruth migrate pydantic : -o [--check]"); + } + migratePydantic(args); + } + + private void migratePydantic(String[] args) throws CliException { + String spec = args[2]; + if (!spec.contains(":")) { + throw new UsageException("expected : for pydantic 
migration"); + } + var options = MigrationOptions.parse(Arrays.copyOfRange(args, 3, args.length)); + String schemaJson = exportSchema(spec); + JsonSchema schema = readSchema(schemaJson); + if (options.check()) { + checkCompatible(schema); + } + writeSchema(options.out(), schemaJson); + context.out() + .println(options.check() ? "schema compatible: " + options.out() : "schema exported: " + options.out()); + } + + private String exportSchema(String spec) throws CliException { + try { + return context.exporter().export(spec); + } catch (IOException e) { + throw new CliException("failed to export Pydantic schema: " + e.getMessage(), e); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + throw new CliException("Pydantic schema export interrupted", e); + } + } + + private static JsonSchema readSchema(String schemaJson) throws CliException { + try { + return JsonSchema.from(schemaJson); + } catch (IllegalArgumentException e) { + throw new CliException("exported Pydantic schema is not valid JSON: " + e.getMessage(), e); + } + } + + private static void checkCompatible(JsonSchema schema) throws CliException { + var errors = JsonSchemaCompatibility.check(schema.node()); + if (!errors.isEmpty()) { + throw new CliException("schema compatibility check failed: " + String.join("; ", errors)); + } + } + + private static void writeSchema(Path out, String schemaJson) throws CliException { + try { + Path parent = out.toAbsolutePath().getParent(); + if (parent != null) { + Files.createDirectories(parent); + } + Files.writeString(out, schemaJson, StandardCharsets.UTF_8); + } catch (IOException e) { + throw new CliException("failed to write schema to " + out + ": " + e.getMessage(), e); + } + } + + private record MigrationOptions(Path out, boolean check) { + static MigrationOptions parse(String[] args) { + Path out = null; + boolean check = false; + for (int i = 0; i < args.length; i++) { + if ("--check".equals(args[i])) { + check = true; + } else if 
(("-o".equals(args[i]) || "--out".equals(args[i])) && i + 1 < args.length) { + out = Path.of(args[++i]); /* the pre-increment consumes the option value */ + } else { + throw new UsageException("unknown or incomplete migrate option: " + args[i]); + } + } + if (out == null) { + throw new UsageException("-o is required"); + } + return new MigrationOptions(out, check); + } + } +} diff --git a/src/main/java/ai/doctruth/cli/ParseCommand.java b/src/main/java/ai/doctruth/cli/ParseCommand.java new file mode 100644 index 0000000..005a02f --- /dev/null +++ b/src/main/java/ai/doctruth/cli/ParseCommand.java @@ -0,0 +1,77 @@ +package ai.doctruth.cli; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +/** CLI handler for the parse sub-command: parses one document and reports it as JSON or as a short summary. */ final class ParseCommand { + + private final CliContext context; + + ParseCommand(CliContext context) { + this.context = context; + } + + /** Parses the document, writes the JSON to the -o path when one is given, and prints the JSON itself only when --json is set without -o; in every other case the summary is printed. */ void run(String[] args) throws CliException { + var options = ParseOptions.parse(args); + var doc = DocumentParsers.parse(options.document()); + String json = ParsedDocumentJson.toJson(doc); + if (options.out() != null) { + write(options.out(), json); + } + if (options.json() && options.out() == null) { /* JSON goes to stdout only when no output file was requested */ + context.out().println(json); + return; + } + printSummary(options.document(), doc, options); + } + + /** Prints the source path, page count, per-type section counts, the share of text sections that carry a bounding box, and the output path if one was written. */ private void printSummary(Path source, ai.doctruth.ParsedDocument doc, ParseOptions options) { + var stats = ParsedDocumentStats.from(doc); + context.out().println(source); + context.out().println("pages: " + doc.metadata().pageCount()); + context.out().println("sections: " + stats.sections()); + context.out().println("text: " + stats.text()); + context.out().println("tables: " + stats.tables()); + context.out().println("figures: " + stats.figures()); + context.out().println("bbox coverage: " + stats.textWithBbox() + "/" + stats.text()); + if (options.out() != null) { + context.out().println("output: " + options.out()); + } + } + + /** Writes the JSON to out, creating the parent directory when the path has one; IO failures become CliException. */ private static void write(Path out, String json) throws CliException { + try { + Path parent = out.getParent(); + if (parent != 
null) { + Files.createDirectories(parent); + } + Files.writeString(out, json); + } catch (IOException e) { + throw new CliException("failed to write parsed JSON: " + e.getMessage(), e); + } + } + + /** Parsed arguments of the parse sub-command: the document path from args[1] plus --json, --bboxes and -o or --out. NOTE(review): bboxes is stored but nothing visible in ParseCommand reads it, confirm whether that is intended. */ private record ParseOptions(Path document, boolean json, boolean bboxes, Path out) { + static ParseOptions parse(String[] args) { + if (args.length < 2) { + throw new UsageException("usage: doctruth parse <document> [--json] [--bboxes] [-o parsed.json]"); + } + Path document = Path.of(args[1]); + boolean json = false; + boolean bboxes = false; + Path out = null; + var cursor = new ArgCursor(args, 2); /* options start after the document operand */ + while (cursor.hasNext()) { + String arg = cursor.next(); + switch (arg) { + case "--json" -> json = true; + case "--bboxes" -> bboxes = true; + case "-o", "--out" -> out = cursor.nextPath(arg); + default -> throw new UsageException("unknown parse option: " + arg); + } + } + return new ParseOptions(document, json, bboxes, out); + } + } +} diff --git a/src/main/java/ai/doctruth/cli/ParsedDocumentJson.java b/src/main/java/ai/doctruth/cli/ParsedDocumentJson.java new file mode 100644 index 0000000..91c9c28 --- /dev/null +++ b/src/main/java/ai/doctruth/cli/ParsedDocumentJson.java @@ -0,0 +1,81 @@ +package ai.doctruth.cli; + +import ai.doctruth.BoundingBox; +import ai.doctruth.FigureSection; +import ai.doctruth.ParsedDocument; +import ai.doctruth.SourceLocation; +import ai.doctruth.TableSection; +import ai.doctruth.TextSection; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule; + +/** Serializes a ParsedDocument to pretty-printed JSON for the CLI. */ final class ParsedDocumentJson { + + private static final ObjectMapper MAPPER = new ObjectMapper().registerModule(new JavaTimeModule()); + + private ParsedDocumentJson() { + throw new AssertionError("no instances"); + } + + /** Returns the document as pretty-printed JSON; a Jackson serialization failure is rethrown as CliException. */ static String toJson(ParsedDocument doc) throws CliException { + try { + return 
MAPPER.writerWithDefaultPrettyPrinter().writeValueAsString(toNode(doc)); + } catch (com.fasterxml.jackson.core.JsonProcessingException e) { + throw new CliException("failed to serialize parsed document", e); + } + } + + /** Builds the root object: docId, a metadata object (sourceFilename, pageCount and sourcePublishedAt when present) and a sections array in document order. */ private static ObjectNode toNode(ParsedDocument doc) { + ObjectNode root = MAPPER.createObjectNode(); + root.put("docId", doc.docId()); + ObjectNode metadata = MAPPER.createObjectNode(); + metadata.put("sourceFilename", doc.metadata().sourceFilename()); + metadata.put("pageCount", doc.metadata().pageCount()); + doc.metadata().sourcePublishedAt().ifPresent(t -> metadata.put("sourcePublishedAt", t.toString())); + root.set("metadata", metadata); + ArrayNode sections = MAPPER.createArrayNode(); + doc.sections().forEach(section -> { + switch (section) { /* one case per section type, no default branch */ + case TextSection text -> sections.add(textNode(text)); + case TableSection table -> sections.add(tableNode(table)); + case FigureSection figure -> sections.add(figureNode(figure)); + } + }); + root.set("sections", sections); + return root; + } + + /** Text section node: type, location, kind name, text, and boundingBox only when the section has one. */ private static ObjectNode textNode(TextSection section) { + ObjectNode node = base("text", section.location()); + node.put("kind", section.kind().name()); + node.put("text", section.text()); + section.boundingBox().ifPresent(box -> node.set("boundingBox", bbox(box))); + return node; + } + + /** Table section node: type, location and the rows converted by Jackson. */ private static ObjectNode tableNode(TableSection section) { + ObjectNode node = base("table", section.location()); + node.set("rows", MAPPER.valueToTree(section.rows())); + return node; + } + + /** Figure section node: type, location and caption. */ private static ObjectNode figureNode(FigureSection section) { + ObjectNode node = base("figure", section.location()); + node.put("caption", section.caption()); + return node; + } + + /** Shared prefix of every section node: the type discriminator and the Jackson-converted location. */ private static ObjectNode base(String type, SourceLocation location) { + ObjectNode node = MAPPER.createObjectNode(); + node.put("type", type); + node.set("location", MAPPER.valueToTree(location)); + return node; + } + + /** Converts a bounding box to a JSON object through Jackson. */ private static ObjectNode bbox(BoundingBox box) { + return MAPPER.valueToTree(box); 
+ } +} diff --git a/src/main/java/ai/doctruth/cli/ParsedDocumentStats.java b/src/main/java/ai/doctruth/cli/ParsedDocumentStats.java new file mode 100644 index 0000000..653eded --- /dev/null +++ b/src/main/java/ai/doctruth/cli/ParsedDocumentStats.java @@ -0,0 +1,29 @@ +package ai.doctruth.cli; + +import ai.doctruth.FigureSection; +import ai.doctruth.ParsedDocument; +import ai.doctruth.TableSection; +import ai.doctruth.TextSection; + +/** Section counts for a parsed document; textWithBbox is the number of text sections that carry a bounding box. */ record ParsedDocumentStats(int sections, int text, int tables, int figures, int textWithBbox) { + + /** Tallies the sections by type in a single pass over the document. */ static ParsedDocumentStats from(ParsedDocument doc) { + int text = 0; + int tables = 0; + int figures = 0; + int boxes = 0; + for (var section : doc.sections()) { + switch (section) { + case TextSection t -> { + text++; + if (t.boundingBox().isPresent()) { + boxes++; + } + } + case TableSection ignored -> tables++; + case FigureSection ignored -> figures++; + } + } + return new ParsedDocumentStats(doc.sections().size(), text, tables, figures, boxes); + } +} diff --git a/src/main/java/ai/doctruth/cli/ProviderConfig.java b/src/main/java/ai/doctruth/cli/ProviderConfig.java new file mode 100644 index 0000000..d3063b2 --- /dev/null +++ b/src/main/java/ai/doctruth/cli/ProviderConfig.java @@ -0,0 +1,17 @@ +package ai.doctruth.cli; + +import java.net.URI; +import java.util.Optional; + +/** Provider selection from the command line, with CliConfig supplying the defaults. The optionals are typed so that the model resolves to a String and the base URL to a URI. */ record ProviderConfig(String provider, Optional<String> model, Optional<URI> baseUrl, CliConfig config) { + + ProviderConfig { /* a null or blank provider falls back to the configured one; null optionals become empty */ + provider = provider == null || provider.isBlank() ? config.provider() : provider; + model = model == null ? Optional.empty() : model; + baseUrl = baseUrl == null ? 
Optional.empty() : baseUrl; + } + + /** Returns the explicit model, else the configured model, else the given fallback. */ String effectiveModel(String fallback) { + return model.or(() -> config.model()).orElse(fallback); + } +} diff --git a/src/main/java/ai/doctruth/cli/Providers.java b/src/main/java/ai/doctruth/cli/Providers.java new file mode 100644 index 0000000..6d99915 --- /dev/null +++ b/src/main/java/ai/doctruth/cli/Providers.java @@ -0,0 +1,73 @@ +package ai.doctruth.cli; + +import java.net.URI; +import java.util.Map; + +import ai.doctruth.AnthropicProvider; +import ai.doctruth.DeepSeekProvider; +import ai.doctruth.GeminiProvider; +import ai.doctruth.LlmProvider; +import ai.doctruth.OpenAiProvider; + +/** Factory that maps a provider name to a configured LlmProvider. */ final class Providers { + + private static final URI OPENAI_ENDPOINT = URI.create("https://api.openai.com/v1/chat/completions"); + private static final URI ANTHROPIC_ENDPOINT = URI.create("https://api.anthropic.com/v1/messages"); + private static final URI GEMINI_ENDPOINT = URI.create("https://generativelanguage.googleapis.com"); + private static final URI DEEPSEEK_ENDPOINT = URI.create("https://api.deepseek.com/v1/chat/completions"); + + private Providers() { + throw new AssertionError("no instances"); + } + + /** Builds the provider named in options, matching the name case-insensitively; throws CliException for an unknown name or a missing API key. */ static LlmProvider create(ProviderConfig options) throws CliException { + String provider = options.provider().toLowerCase(java.util.Locale.ROOT); /* Locale.ROOT keeps the match stable under default locales with special casing rules, such as the Turkish dotless i */ + return switch (provider) { + case "openai" -> openAi(options); + case "anthropic" -> anthropic(options); + case "gemini" -> gemini(options); + case "deepseek" -> deepSeek(options); + default -> throw new CliException("unsupported provider: " + options.provider()); + }; + } + + /** Reads OPENAI_API_KEY from the configured environment; a custom base URL is normalized by openAiEndpoint. */ private static LlmProvider openAi(ProviderConfig options) throws CliException { + String key = requireKey(options.config().env(), "OPENAI_API_KEY"); + URI endpoint = options.baseUrl().map(Providers::openAiEndpoint).orElse(OPENAI_ENDPOINT); + return new OpenAiProvider(key, endpoint, options.effectiveModel("gpt-4o")); + } + + private static LlmProvider anthropic(ProviderConfig options) throws CliException { + String key = 
requireKey(options.config().env(), "ANTHROPIC_API_KEY"); + return new AnthropicProvider( + key, options.baseUrl().orElse(ANTHROPIC_ENDPOINT), options.effectiveModel("claude-sonnet-4-5")); + } + + private static LlmProvider gemini(ProviderConfig options) throws CliException { + String key = requireKey(options.config().env(), "GOOGLE_API_KEY"); + return new GeminiProvider( + key, options.baseUrl().orElse(GEMINI_ENDPOINT), options.effectiveModel("gemini-1.5-pro")); + } + + private static LlmProvider deepSeek(ProviderConfig options) throws CliException { + String key = requireKey(options.config().env(), "DEEPSEEK_API_KEY"); + return new DeepSeekProvider( + key, options.baseUrl().orElse(DEEPSEEK_ENDPOINT), options.effectiveModel("deepseek-chat")); + } + + private static URI openAiEndpoint(URI base) { + String value = base.toString(); + if (value.endsWith("/chat/completions")) { + return base; + } + return URI.create(value.replaceAll("/+$", "") + "/chat/completions"); + } + + private static String requireKey(Map env, String name) throws CliException { + String value = env.get(name); + if (value == null || value.isBlank()) { + throw new CliException("missing " + name + "; set it with: export " + name + "=..."); + } + return value; + } +} diff --git a/src/main/java/ai/doctruth/cli/SchemaCommand.java b/src/main/java/ai/doctruth/cli/SchemaCommand.java new file mode 100644 index 0000000..26c3438 --- /dev/null +++ b/src/main/java/ai/doctruth/cli/SchemaCommand.java @@ -0,0 +1,59 @@ +package ai.doctruth.cli; + +import java.io.UncheckedIOException; +import java.nio.file.Path; + +import ai.doctruth.JsonSchema; + +final class SchemaCommand { + + private final CliContext context; + + SchemaCommand(CliContext context) { + this.context = context; + } + + void run(String[] args) throws CliException { + var options = SchemaOptions.parse(args); + var schema = readSchema(options.path()); + var summary = SchemaSummary.from(schema); + if (!summary.compatible()) { + throw new 
CliException("schema compatibility check failed: " + String.join("; ", summary.errors())); + } + if (options.json()) { + context.out().println(summary.toJson()); + return; + } + context.out().println("schema compatible: " + options.path()); + context.out().println("fields: " + summary.fieldCount()); + context.out().println("required: " + summary.requiredCount()); + } + + /** Loads a schema from disk, wrapping IllegalArgumentException and UncheckedIOException in CliException. */ static JsonSchema readSchema(Path path) throws CliException { + try { + return JsonSchema.from(path); + } catch (IllegalArgumentException | UncheckedIOException e) { + throw new CliException("failed to read schema " + path + ": " + e.getMessage(), e); + } + } + + /** Parsed arguments of the schema sub-command: the schema path from args[1] and the optional --json flag. */ private record SchemaOptions(Path path, boolean json) { + static SchemaOptions parse(String[] args) { + if (args.length < 2) { + throw new UsageException("usage: doctruth schema <schema.json> [--json]"); + } + Path path = Path.of(args[1]); + boolean json = false; + var cursor = new ArgCursor(args, 2); /* options start after the schema path */ + while (cursor.hasNext()) { + String arg = cursor.next(); + if ("--json".equals(arg)) { + json = true; + } else { + throw new UsageException("unknown schema option: " + arg); + } + } + return new SchemaOptions(path, json); + } + } +} diff --git a/src/main/java/ai/doctruth/cli/SchemaSummary.java b/src/main/java/ai/doctruth/cli/SchemaSummary.java new file mode 100644 index 0000000..23950d8 --- /dev/null +++ b/src/main/java/ai/doctruth/cli/SchemaSummary.java @@ -0,0 +1,44 @@ +package ai.doctruth.cli; + +import java.util.List; + +import ai.doctruth.JsonSchema; +import ai.doctruth.internal.schema.JsonSchemaCompatibility; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.node.ObjectNode; + +/** Compatibility verdict for a schema plus the sizes of its top-level properties object and required array; errors holds the messages that callers join into one string. */ record SchemaSummary(boolean compatible, int fieldCount, int requiredCount, List<String> errors) { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + /** Runs the compatibility check on the schema node; the summary is compatible when the check returns no errors. */ static SchemaSummary from(JsonSchema schema) { + JsonNode node = schema.node(); + var errors = 
JsonSchemaCompatibility.check(node); + return new SchemaSummary(errors.isEmpty(), fieldCount(node), requiredCount(node), errors); + } + + /** Renders the summary as a pretty-printed JSON object with compatible, fieldCount, requiredCount and errors; a Jackson failure becomes CliException. */ String toJson() throws CliException { + ObjectNode node = MAPPER.createObjectNode(); + node.put("compatible", compatible); + node.put("fieldCount", fieldCount); + node.put("requiredCount", requiredCount); + node.set("errors", MAPPER.valueToTree(errors)); + try { + return MAPPER.writerWithDefaultPrettyPrinter().writeValueAsString(node); + } catch (com.fasterxml.jackson.core.JsonProcessingException e) { + throw new CliException("failed to serialize schema summary", e); + } + } + + /** Number of entries in the top-level properties object, or 0 when it is absent or not an object. */ private static int fieldCount(JsonNode node) { + JsonNode properties = node.path("properties"); + return properties.isObject() ? properties.size() : 0; + } + + /** Number of entries in the top-level required array, or 0 when it is absent or not an array. */ private static int requiredCount(JsonNode node) { + JsonNode required = node.path("required"); + return required.isArray() ? required.size() : 0; + } +} diff --git a/src/main/java/ai/doctruth/cli/Usage.java b/src/main/java/ai/doctruth/cli/Usage.java new file mode 100644 index 0000000..0b151a7 --- /dev/null +++ b/src/main/java/ai/doctruth/cli/Usage.java @@ -0,0 +1,30 @@ +package ai.doctruth.cli; + +/** Holder for the top-level help text of the CLI. */ final class Usage { + + private Usage() { + throw new AssertionError("no instances"); + } + + /** Returns the help text. The parse, schema and extract lines name their operands, matching the examples under Common. NOTE(review): audit and completion probably take an operand as well, confirm against their commands. */ static String main() { + return """ + DocTruth - auditable LLM extraction for Java + + Usage: + doctruth init + doctruth parse <document> [--json] [--bboxes] [-o parsed.json] + doctruth schema <schema.json> [--json] + doctruth extract <document> -s <schema.json> [-o out/] + doctruth audit [--json] + doctruth doctor [--json] + doctruth completion + doctruth version + + Common: + doctruth parse contract.pdf + doctruth schema contract.schema.json + doctruth extract contract.pdf -s contract.schema.json + doctruth doctor + """; + } +} diff --git a/src/main/java/ai/doctruth/cli/UsageException.java b/src/main/java/ai/doctruth/cli/UsageException.java new file mode 100644 index 0000000..cfeea7e --- /dev/null +++ b/src/main/java/ai/doctruth/cli/UsageException.java @@ -0,0 
+1,9 @@ +package ai.doctruth.cli; + +final class UsageException extends RuntimeException { + private static final long serialVersionUID = 1L; + + UsageException(String message) { + super(message); + } +} diff --git a/src/main/java/ai/doctruth/internal/audit/ProvOExporter.java b/src/main/java/ai/doctruth/internal/audit/ProvOExporter.java index 13613dc..a79f065 100644 --- a/src/main/java/ai/doctruth/internal/audit/ProvOExporter.java +++ b/src/main/java/ai/doctruth/internal/audit/ProvOExporter.java @@ -41,7 +41,8 @@ * "doctruth:fieldPath": "name", * "prov:value": "Alex Chen", * "doctruth:matchScore": 0.97, - * "doctruth:sourceLocation": {"pageStart": 1, "pageEnd": 1, "lineStart": 3, "lineEnd": 3, "charOffset": 0} + * "doctruth:sourceLocation": {"pageStart": 1, "pageEnd": 1, "lineStart": 3, "lineEnd": 3, "charOffset": 0}, + * "doctruth:boundingBox": {"x0": 10.0, "y0": 20.0, "x1": 110.0, "y1": 40.0} * }, ... * ], * "doctruth:confidence": { @@ -107,6 +108,7 @@ private static ObjectNode derivationEntry(String path, Citation citation) { entry.put("prov:value", citation.exactQuote()); entry.put("doctruth:matchScore", citation.matchScore()); entry.set("doctruth:sourceLocation", locationNode(citation.location())); + citation.boundingBox().ifPresent(box -> entry.set("doctruth:boundingBox", MAPPER.valueToTree(box))); return entry; } diff --git a/src/main/java/ai/doctruth/internal/audit/package-info.java b/src/main/java/ai/doctruth/internal/audit/package-info.java index 8f51851..17261f4 100644 --- a/src/main/java/ai/doctruth/internal/audit/package-info.java +++ b/src/main/java/ai/doctruth/internal/audit/package-info.java @@ -1,7 +1,5 @@ /** * Internal: audit-format exporters. Currently W3C PROV-O JSON-LD via * {@link ai.doctruth.internal.audit.ProvOExporter}. NOT public API. 
- * - * @hidden */ package ai.doctruth.internal.audit; diff --git a/src/main/java/ai/doctruth/internal/citation/CitationMatcher.java b/src/main/java/ai/doctruth/internal/citation/CitationMatcher.java index 4f7c54a..413c428 100644 --- a/src/main/java/ai/doctruth/internal/citation/CitationMatcher.java +++ b/src/main/java/ai/doctruth/internal/citation/CitationMatcher.java @@ -8,6 +8,7 @@ import java.util.Objects; import java.util.Optional; +import ai.doctruth.BoundingBox; import ai.doctruth.Citation; import ai.doctruth.FigureSection; import ai.doctruth.ParsedDocument; @@ -74,7 +75,7 @@ private Citation matchOne(String needle, String path, List sections, S for (var sec : sections) { int idx = sec.text().indexOf(needle); if (idx >= 0) { - return new Citation(sec.location(), needle, 1.0); + return new Citation(sec.location(), needle, 1.0, sec.boundingBox()); } } var best = bestFuzzy(needle, sections); @@ -95,7 +96,7 @@ private Citation matchOne(String needle, String path, List sections, S private static Citation bestFuzzy(String needle, List sections) { Citation best = null; for (var sec : sections) { - var c = bestFuzzyWindow(needle, sec.text(), sec.location()); + var c = bestFuzzyWindow(needle, sec.text(), sec.location(), sec.boundingBox()); if (c == null) { continue; } @@ -106,13 +107,8 @@ private static Citation bestFuzzy(String needle, List sections) { return best; } - /** - * Heuristic fuzzy window: pick a window length of {@code |needle|} (with ±20% jitter - * during scoring) and slide a coarse stride across {@code haystack}. Seed positions - * include matches of {@code needle.substring(0, min(5, len))} plus a uniform stride — - * good enough to find a JaroWinkler peak without full O(n) scan. 
- */ - private static Citation bestFuzzyWindow(String needle, String haystack, SourceLocation loc) { + private static Citation bestFuzzyWindow( + String needle, String haystack, SourceLocation loc, Optional boundingBox) { if (haystack.isEmpty() || needle.isEmpty()) { return null; } @@ -140,7 +136,7 @@ private static Citation bestFuzzyWindow(String needle, String haystack, SourceLo return null; } double clamped = Math.max(0.0, Math.min(1.0, bestScore)); - return new Citation(loc, bestQuote, clamped); + return new Citation(loc, bestQuote, clamped, boundingBox); } private static List candidatePositions(String needle, String haystack) { @@ -171,7 +167,7 @@ private static List candidatePositions(String needle, String haystack) private static List renderedSections(ParsedDocument doc) { var out = new ArrayList(doc.sections().size()); for (var s : doc.sections()) { - out.add(new Rendered(textOf(s), locationOf(s))); + out.add(new Rendered(textOf(s), locationOf(s), boundingBoxOf(s))); } return out; } @@ -201,12 +197,18 @@ private static SourceLocation locationOf(ParsedSection s) { }; } + private static Optional boundingBoxOf(ParsedSection s) { + return switch (s) { + case TextSection ts -> ts.boundingBox(); + case TableSection ignored -> Optional.empty(); + case FigureSection ignored -> Optional.empty(); + }; + } + private static SourceLocation fallbackLocation(ParsedDocument doc) { return new SourceLocation(1, 1, 1, 1, 0); } - // --- traversal ----------------------------------------------------------- - private static void traverse(String path, Object node, List out) { if (node == null) { return; @@ -287,9 +289,7 @@ private static String joinPath(String parent, String child) { return parent.isEmpty() ? child : parent + "." 
+ child; } - // --- carriers ------------------------------------------------------------ - private record Leaf(String path, String value) {} - private record Rendered(String text, SourceLocation location) {} + private record Rendered(String text, SourceLocation location, Optional boundingBox) {} } diff --git a/src/main/java/ai/doctruth/internal/citation/package-info.java b/src/main/java/ai/doctruth/internal/citation/package-info.java index 7f0600e..2b1536f 100644 --- a/src/main/java/ai/doctruth/internal/citation/package-info.java +++ b/src/main/java/ai/doctruth/internal/citation/package-info.java @@ -9,7 +9,5 @@ * *

    Apache Commons Text is confined to this package; no concrete Commons Text type * appears in the public {@code ai.doctruth.*} API. - * - * @hidden */ package ai.doctruth.internal.citation; diff --git a/src/main/java/ai/doctruth/internal/http/package-info.java b/src/main/java/ai/doctruth/internal/http/package-info.java index aaf30a5..3acf552 100644 --- a/src/main/java/ai/doctruth/internal/http/package-info.java +++ b/src/main/java/ai/doctruth/internal/http/package-info.java @@ -5,7 +5,5 @@ *

    Anything under this package may be renamed, moved, or removed without a major * version bump. Downstream consumers must not depend on these types directly — use the * sealed {@link ai.doctruth.LlmProvider} surface instead. - * - * @hidden */ package ai.doctruth.internal.http; diff --git a/src/main/java/ai/doctruth/internal/package-info.java b/src/main/java/ai/doctruth/internal/package-info.java index ffb8609..5f8f000 100644 --- a/src/main/java/ai/doctruth/internal/package-info.java +++ b/src/main/java/ai/doctruth/internal/package-info.java @@ -3,7 +3,5 @@ * *

    Anything under this package may be renamed, moved, or removed without a major * version bump. Downstream consumers must not depend on these types directly. - * - * @hidden */ package ai.doctruth.internal; diff --git a/src/main/java/ai/doctruth/internal/providers/anthropic/package-info.java b/src/main/java/ai/doctruth/internal/providers/anthropic/package-info.java index a24f96a..bd9fa94 100644 --- a/src/main/java/ai/doctruth/internal/providers/anthropic/package-info.java +++ b/src/main/java/ai/doctruth/internal/providers/anthropic/package-info.java @@ -4,7 +4,5 @@ *

    This package owns the only Anthropic-specific HTTP / JSON code in the codebase. The * public {@link ai.doctruth.AnthropicProvider} delegates here and never sees Anthropic wire * types directly, per ADR 0003. - * - * @hidden */ package ai.doctruth.internal.providers.anthropic; diff --git a/src/main/java/ai/doctruth/internal/providers/anthropic/wire/Message.java b/src/main/java/ai/doctruth/internal/providers/anthropic/wire/Message.java index 5a31c4c..678db06 100644 --- a/src/main/java/ai/doctruth/internal/providers/anthropic/wire/Message.java +++ b/src/main/java/ai/doctruth/internal/providers/anthropic/wire/Message.java @@ -1,7 +1,7 @@ package ai.doctruth.internal.providers.anthropic.wire; /** - * One element of {@link MessagesRequest#messages()}. v0.1.0-alpha sends only string content + * One element of the Anthropic request messages list. v0.1.0-alpha sends only string content * with role {@code "user"}; tool-use forcing arrives in Phase 2. * * @hidden diff --git a/src/main/java/ai/doctruth/internal/providers/anthropic/wire/package-info.java b/src/main/java/ai/doctruth/internal/providers/anthropic/wire/package-info.java index eb8f4a8..92ab149 100644 --- a/src/main/java/ai/doctruth/internal/providers/anthropic/wire/package-info.java +++ b/src/main/java/ai/doctruth/internal/providers/anthropic/wire/package-info.java @@ -6,7 +6,5 @@ * {@code input_tokens}) so Jackson maps each record component without custom annotations. * These records exist only to insulate {@link ai.doctruth.AnthropicProvider} from vendor JSON * drift, per ADR 0003. 
- * - * @hidden */ package ai.doctruth.internal.providers.anthropic.wire; diff --git a/src/main/java/ai/doctruth/internal/providers/deepseek/package-info.java b/src/main/java/ai/doctruth/internal/providers/deepseek/package-info.java index 3faa21f..160b1a8 100644 --- a/src/main/java/ai/doctruth/internal/providers/deepseek/package-info.java +++ b/src/main/java/ai/doctruth/internal/providers/deepseek/package-info.java @@ -8,7 +8,5 @@ * *

    Anything under this package may be renamed, moved, or removed without a major * version bump. - * - * @hidden */ package ai.doctruth.internal.providers.deepseek; diff --git a/src/main/java/ai/doctruth/internal/providers/deepseek/wire/package-info.java b/src/main/java/ai/doctruth/internal/providers/deepseek/wire/package-info.java index 5a58e2d..79607b2 100644 --- a/src/main/java/ai/doctruth/internal/providers/deepseek/wire/package-info.java +++ b/src/main/java/ai/doctruth/internal/providers/deepseek/wire/package-info.java @@ -10,7 +10,5 @@ *

    Anything under this package may be renamed, moved, or removed without a major * version bump. Downstream consumers must not depend on these types directly — use the * sealed {@link ai.doctruth.LlmProvider} surface instead. - * - * @hidden */ package ai.doctruth.internal.providers.deepseek.wire; diff --git a/src/main/java/ai/doctruth/internal/providers/gemini/package-info.java b/src/main/java/ai/doctruth/internal/providers/gemini/package-info.java index 9e16583..3948007 100644 --- a/src/main/java/ai/doctruth/internal/providers/gemini/package-info.java +++ b/src/main/java/ai/doctruth/internal/providers/gemini/package-info.java @@ -4,7 +4,5 @@ * *

    Anything under this package may be renamed, moved, or removed without a major version * bump. Wire-shape records live under {@code .wire.*}; the HTTP delegate lives here. - * - * @hidden */ package ai.doctruth.internal.providers.gemini; diff --git a/src/main/java/ai/doctruth/internal/providers/gemini/wire/package-info.java b/src/main/java/ai/doctruth/internal/providers/gemini/wire/package-info.java index fd9e815..2782ba2 100644 --- a/src/main/java/ai/doctruth/internal/providers/gemini/wire/package-info.java +++ b/src/main/java/ai/doctruth/internal/providers/gemini/wire/package-info.java @@ -8,7 +8,5 @@ * annotations. Anything under this package may be renamed, moved, or removed without a * major version bump per ADR 0003 — vendor-specific knowledge stays here and never leaks * through {@link ai.doctruth.LlmProvider}. - * - * @hidden */ package ai.doctruth.internal.providers.gemini.wire; diff --git a/src/main/java/ai/doctruth/internal/providers/openai/package-info.java b/src/main/java/ai/doctruth/internal/providers/openai/package-info.java index 4d1cce0..14ab251 100644 --- a/src/main/java/ai/doctruth/internal/providers/openai/package-info.java +++ b/src/main/java/ai/doctruth/internal/providers/openai/package-info.java @@ -4,7 +4,5 @@ *

    This package owns the only OpenAI-specific HTTP / JSON code in the codebase. The * public {@link ai.doctruth.OpenAiProvider} delegates here and never sees OpenAI wire * types directly, per ADR 0003. - * - * @hidden */ package ai.doctruth.internal.providers.openai; diff --git a/src/main/java/ai/doctruth/internal/providers/openai/wire/package-info.java b/src/main/java/ai/doctruth/internal/providers/openai/wire/package-info.java index 52d8d01..b74d651 100644 --- a/src/main/java/ai/doctruth/internal/providers/openai/wire/package-info.java +++ b/src/main/java/ai/doctruth/internal/providers/openai/wire/package-info.java @@ -6,7 +6,5 @@ * {@code prompt_tokens}, {@code finish_reason}) so Jackson maps each record component * without custom annotations. These records exist only to insulate * {@link ai.doctruth.OpenAiProvider} from vendor JSON drift, per ADR 0003. - * - * @hidden */ package ai.doctruth.internal.providers.openai.wire; diff --git a/src/main/java/ai/doctruth/internal/providers/package-info.java b/src/main/java/ai/doctruth/internal/providers/package-info.java index d604252..ad83ab3 100644 --- a/src/main/java/ai/doctruth/internal/providers/package-info.java +++ b/src/main/java/ai/doctruth/internal/providers/package-info.java @@ -5,7 +5,5 @@ * bump. Vendor wire shapes (request / response records) live under * {@code ai.doctruth.internal.providers..wire.*} and MUST NOT leak through the public * {@link ai.doctruth.LlmProvider} surface, per ADR 0003. - * - * @hidden */ package ai.doctruth.internal.providers; diff --git a/src/main/java/ai/doctruth/internal/render/SectionRenderer.java b/src/main/java/ai/doctruth/internal/render/SectionRenderer.java index bd9045e..c985fd7 100644 --- a/src/main/java/ai/doctruth/internal/render/SectionRenderer.java +++ b/src/main/java/ai/doctruth/internal/render/SectionRenderer.java @@ -24,9 +24,8 @@ * *

    Package-private to internal callers (per CONTRIBUTING.md "Engineering principles" §1 — * the public API surface stays minimal). Public visibility on the static method is the - * compromise that lets {@code ai.doctruth} root-package code call this without exposing - * the class outside the {@code internal} subtree to downstream consumers; the - * {@code @hidden} on {@code package-info} signals the no-stability promise. + * compromise that lets {@code ai.doctruth} root-package code call this without treating + * the class as stable public API for downstream consumers. * * @hidden */ diff --git a/src/main/java/ai/doctruth/internal/render/package-info.java b/src/main/java/ai/doctruth/internal/render/package-info.java index 77ca790..a9f8433 100644 --- a/src/main/java/ai/doctruth/internal/render/package-info.java +++ b/src/main/java/ai/doctruth/internal/render/package-info.java @@ -6,7 +6,5 @@ * place in the codebase that converts a {@link ai.doctruth.ParsedSection} to a flat string, * so changes to that representation happen in exactly one location (per CONTRIBUTING.md * "Engineering principles" §1 — decoupled by default, single source of truth). - * - * @hidden */ package ai.doctruth.internal.render; diff --git a/src/main/java/ai/doctruth/internal/retry/package-info.java b/src/main/java/ai/doctruth/internal/retry/package-info.java index 4bf97aa..50d1cb9 100644 --- a/src/main/java/ai/doctruth/internal/retry/package-info.java +++ b/src/main/java/ai/doctruth/internal/retry/package-info.java @@ -5,7 +5,5 @@ * version bump. Failsafe ({@code dev.failsafe.*}) types are confined to this package by * design (per CONTRIBUTING.md §1 decoupling); they MUST NOT leak through public method * signatures. 
- * - * @hidden */ package ai.doctruth.internal.retry; diff --git a/src/main/java/ai/doctruth/internal/schema/package-info.java b/src/main/java/ai/doctruth/internal/schema/package-info.java index 539be4e..2e98e58 100644 --- a/src/main/java/ai/doctruth/internal/schema/package-info.java +++ b/src/main/java/ai/doctruth/internal/schema/package-info.java @@ -1,6 +1,5 @@ /** * Internal JSON Schema generation for provider structured-output contracts. * - * @hidden */ package ai.doctruth.internal.schema; diff --git a/src/test/java/ai/doctruth/BoundingBoxTest.java b/src/test/java/ai/doctruth/BoundingBoxTest.java new file mode 100644 index 0000000..3b9de90 --- /dev/null +++ b/src/test/java/ai/doctruth/BoundingBoxTest.java @@ -0,0 +1,66 @@ +package ai.doctruth; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +/** Contract tests for {@link BoundingBox}. 
*/ +class BoundingBoxTest { + + @Nested + @DisplayName("happy path") + class HappyPath { + + @Test + @DisplayName("accepts a positive page-normalized rectangle") + void validBox() { + var box = new BoundingBox(10.0, 20.0, 300.0, 400.0); + + assertThat(box.x0()).isEqualTo(10.0); + assertThat(box.y0()).isEqualTo(20.0); + assertThat(box.x1()).isEqualTo(300.0); + assertThat(box.y1()).isEqualTo(400.0); + } + } + + @Nested + @DisplayName("invariants") + class Invariants { + + @Test + @DisplayName("rejects coordinates outside the 0..1000 page scale") + void outsidePageScale() { + assertThatThrownBy(() -> new BoundingBox(-1.0, 0.0, 10.0, 10.0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("0..1000"); + assertThatThrownBy(() -> new BoundingBox(0.0, 0.0, 1001.0, 10.0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("0..1000"); + } + + @Test + @DisplayName("rejects zero or negative width and height") + void notPositive() { + assertThatThrownBy(() -> new BoundingBox(10.0, 0.0, 10.0, 10.0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("positive"); + assertThatThrownBy(() -> new BoundingBox(0.0, 10.0, 10.0, 10.0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("positive"); + } + + @Test + @DisplayName("rejects non-finite coordinates") + void nonFinite() { + assertThatThrownBy(() -> new BoundingBox(Double.NaN, 0.0, 10.0, 10.0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("x0"); + assertThatThrownBy(() -> new BoundingBox(0.0, 0.0, Double.POSITIVE_INFINITY, 10.0)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("x1"); + } + } +} diff --git a/src/test/java/ai/doctruth/CitationTest.java b/src/test/java/ai/doctruth/CitationTest.java index c5efdef..ad05c41 100644 --- a/src/test/java/ai/doctruth/CitationTest.java +++ b/src/test/java/ai/doctruth/CitationTest.java @@ -3,6 +3,8 @@ import static 
org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.util.Optional; + import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -21,6 +23,8 @@ */ class CitationTest { + private static final BoundingBox BBOX = new BoundingBox(10.0, 20.0, 110.0, 40.0); + private static SourceLocation sampleLocation() { return new SourceLocation(1, 1, 3, 3, 0); } @@ -38,6 +42,15 @@ void typicalCitation() { assertThat(citation.location()).isSameAs(loc); assertThat(citation.exactQuote()).isEqualTo("Acme Corp Ltd"); assertThat(citation.matchScore()).isEqualTo(0.97); + assertThat(citation.boundingBox()).isEmpty(); + } + + @Test + @DisplayName("four-arg constructor retains a page-normalized bounding box") + void fourArgRetainsBoundingBox() { + var citation = new Citation(sampleLocation(), "Acme Corp Ltd", 0.97, Optional.of(BBOX)); + + assertThat(citation.boundingBox()).contains(BBOX); } @Test @@ -87,6 +100,14 @@ void nullExactQuote() { .hasMessageContaining("exactQuote"); } + @Test + @DisplayName("rejects null boundingBox optional") + void nullBoundingBoxOptional() { + assertThatThrownBy(() -> new Citation(sampleLocation(), "x", 1.0, null)) + .isInstanceOf(NullPointerException.class) + .hasMessageContaining("boundingBox"); + } + @Test @DisplayName("rejects empty exactQuote with IllegalArgumentException") void emptyExactQuote() { diff --git a/src/test/java/ai/doctruth/CliPackagingContractTest.java b/src/test/java/ai/doctruth/CliPackagingContractTest.java index 4334255..d3bff93 100644 --- a/src/test/java/ai/doctruth/CliPackagingContractTest.java +++ b/src/test/java/ai/doctruth/CliPackagingContractTest.java @@ -15,4 +15,16 @@ void mavenJarManifestUsesDocTruthCliMainClass() throws Exception { assertThat(pom).contains("ai.doctruth.cli.DocTruthCli"); } + + @Test + void mavenBuildAttachesStandaloneCliJar() throws Exception { + String pom = 
Files.readString(Path.of("pom.xml")); + + assertThat(pom) + .contains("maven-shade-plugin") + .contains("all") + .contains("false") + .contains("org.slf4j:slf4j-nop:${slf4j.version}") + .contains("org.apache.logging.log4j:log4j-to-slf4j:${log4j.version}"); + } } diff --git a/src/test/java/ai/doctruth/DocTruthHappyPathTest.java b/src/test/java/ai/doctruth/DocTruthHappyPathTest.java new file mode 100644 index 0000000..61a0a7a --- /dev/null +++ b/src/test/java/ai/doctruth/DocTruthHappyPathTest.java @@ -0,0 +1,228 @@ +package ai.doctruth; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNullPointerException; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.net.URI; +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.util.List; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicInteger; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +class DocTruthHappyPathTest { + + @TempDir + Path tempDir; + + @Test + void documentFirstFlowExtractsWithEvidenceFromParsedDocument() throws Exception { + var calls = new AtomicInteger(); + + var result = DocTruth.withProvider(provider(calls)) + .from(parsedDoc()) + .extract("Extract the candidate.", Candidate.class) + .withEvidence() + .run(); + + assertThat(result.value()).isEqualTo(new Candidate("Alex Chen")); + assertThat(result.citation("name").exactQuote()).contains("Alex Chen"); + assertThat(result.confidence()).containsKey("name"); + 
assertThat(result.provenance().model()).isEqualTo("openai"); + assertThat(calls.get()).isEqualTo(1); + } + + @Test + void documentFirstFlowExtractsWithEvidenceFromPdfPath() throws Exception { + Path pdf = samplePdf(); + + var result = DocTruth.withProvider(provider(new AtomicInteger())) + .fromPdf(pdf) + .extract("Extract the candidate.", Candidate.class) + .withEvidence() + .run(); + + assertThat(result.value()).isEqualTo(new Candidate("Alex Chen")); + assertThat(result.citation("name").location().pageStart()).isEqualTo(1); + } + + @Test + void clientParsesSupportedDocumentInputs() throws Exception { + var client = DocTruth.withProvider(provider(new AtomicInteger())); + Path csv = tempDir.resolve("candidate.csv"); + Files.writeString(csv, "name\nAlex Chen\n"); + + assertThat(client.fromPdf(samplePdf())).isNotNull(); + assertThat(client.fromPdf(samplePdf().toString())).isNotNull(); + assertThat(client.fromCsv(csv)).isNotNull(); + assertThat(client.fromDocx(sampleDocx())).isNotNull(); + assertThat(client.fromXlsx(sampleXlsx())).isNotNull(); + } + + @Test + void documentFirstFlowKeepsAdvancedExtractionOptions() throws Exception { + var result = DocTruth.withProvider(provider(new AtomicInteger())) + .from(parsedDoc()) + .extract("Extract the candidate.", Candidate.class) + .withSourcePublishedAt(Instant.EPOCH) + .withMaxRetries(2) + .withContextStrategy(new PriorityTruncate(List.of("Name"), 100, OverBudgetPolicy.STRICT)) + .run(); + + assertThat(result.value()).isEqualTo(new Candidate("Alex Chen")); + assertThat(result.provenance().sourcePublishedAt()).contains(Instant.EPOCH); + } + + @Test + void documentFirstJsonSchemaFlowExtractsWithEvidence() throws Exception { + var schema = JsonSchema.from(""" + { + "type": "object", + "properties": { + "name": { "type": "string" } + }, + "required": ["name"], + "additionalProperties": false + } + """); + + var result = DocTruth.withProvider(provider(new AtomicInteger())) + .from(parsedDoc()) + .extractJson("Extract the 
candidate.", schema) + .withEvidence() + .requireCitation("name") + .withSourcePublishedAt(Instant.EPOCH) + .withMaxRetries(1) + .withContextStrategy(new PriorityTruncate(List.of("Name"), 100, OverBudgetPolicy.STRICT)) + .runJson(); + + assertThat(result.value().get("name").asText()).isEqualTo("Alex Chen"); + assertThat(result.requireCitation("name").exactQuote()).contains("Alex Chen"); + assertThat(result.provenance().sourcePublishedAt()).contains(Instant.EPOCH); + } + + @Test + void withOpenAiApiKeyCreatesClient() { + assertThat(DocTruth.withOpenAi("test-key")).isNotNull(); + } + + @Test + void providerFactoriesCreateProviderBackedClients() { + assertThat(DocTruth.withProvider(LlmProviders.openAi("test-key"))).isNotNull(); + assertThat(DocTruth.withProvider(LlmProviders.openAiCompatible( + "test-key", URI.create("http://localhost/v1/chat/completions"), "local-model"))) + .isNotNull(); + assertThat(DocTruth.withProvider(LlmProviders.anthropic("test-key"))).isNotNull(); + assertThat(DocTruth.withProvider(LlmProviders.gemini("test-key"))).isNotNull(); + assertThat(DocTruth.withProvider(LlmProviders.deepSeek("test-key"))).isNotNull(); + } + + @Test + void providerFactoriesKeepValidationActionable() { + assertThatThrownBy(() -> LlmProviders.openAi(" ")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("apiKey"); + assertThatNullPointerException() + .isThrownBy(() -> LlmProviders.openAiCompatible("test-key", null, "local-model")) + .withMessageContaining("endpoint"); + } + + @Test + void withOpenAiEnvCreatesClientOrReportsMissingKey() { + var apiKey = System.getenv("OPENAI_API_KEY"); + if (apiKey == null || apiKey.isBlank()) { + assertThatThrownBy(DocTruth::withOpenAi) + .isInstanceOf(IllegalStateException.class) + .hasMessageContaining("OPENAI_API_KEY"); + } else { + assertThat(DocTruth.withOpenAi()).isNotNull(); + } + } + + @Test + void withOpenAiRejectsBlankApiKey() { + assertThatThrownBy(() -> DocTruth.withOpenAi(" ")) + 
.isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("apiKey"); + } + + @Test + void withProviderRejectsNullProvider() { + assertThatNullPointerException() + .isThrownBy(() -> DocTruth.withProvider(null)) + .withMessageContaining("provider"); + } + + private static ParsedDocument parsedDoc() { + var loc = new SourceLocation(1, 1, 1, 1, 0); + var section = new TextSection("Name: Alex Chen", loc); + var meta = new DocumentMetadata("candidate.pdf", 1, Optional.empty()); + return new ParsedDocument("doc-1", List.of(section), meta); + } + + private static LlmProvider provider(AtomicInteger calls) { + return new OpenAiProvider("test", URI.create("http://localhost"), "test-model") { + @Override + public ProviderResponse complete(ProviderRequest request) { + calls.incrementAndGet(); + return new ProviderResponse("{\"name\":\"Alex Chen\"}", new ProviderUsage(10, 3, "test-model")); + } + }; + } + + private Path samplePdf() throws Exception { + Path path = tempDir.resolve("candidate.pdf"); + try (var pdf = new PDDocument()) { + var page = new PDPage(); + pdf.addPage(page); + try (var cs = new PDPageContentStream(pdf, page)) { + cs.beginText(); + cs.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12); + cs.newLineAtOffset(50, 720); + cs.showText("Name: Alex Chen"); + cs.endText(); + } + pdf.save(path.toFile()); + } + return path; + } + + private Path sampleDocx() throws Exception { + Path path = tempDir.resolve("candidate.docx"); + try (var docx = new XWPFDocument()) { + var paragraph = docx.createParagraph(); + paragraph.createRun().setText("Name: Alex Chen"); + try (var out = Files.newOutputStream(path)) { + docx.write(out); + } + } + return path; + } + + private Path sampleXlsx() throws Exception { + Path path = tempDir.resolve("candidate.xlsx"); + try (var workbook = new XSSFWorkbook()) { + var sheet = workbook.createSheet("Candidates"); + sheet.createRow(0).createCell(0).setCellValue("Name"); + 
sheet.createRow(1).createCell(0).setCellValue("Alex Chen"); + try (var out = Files.newOutputStream(path)) { + workbook.write(out); + } + } + return path; + } + + private record Candidate(String name) {} +} diff --git a/src/test/java/ai/doctruth/ExtractionResultAuditJsonTest.java b/src/test/java/ai/doctruth/ExtractionResultAuditJsonTest.java index b785569..21cc25c 100644 --- a/src/test/java/ai/doctruth/ExtractionResultAuditJsonTest.java +++ b/src/test/java/ai/doctruth/ExtractionResultAuditJsonTest.java @@ -112,6 +112,33 @@ void citationsAppearInWasDerivedFrom() throws Exception { assertThat(src.path("lineStart").asInt()).isEqualTo(3); } + @Test + @DisplayName("a citation bounding box is exported when present") + void citationBoundingBoxAppears() throws Exception { + var loc = new SourceLocation(1, 1, 3, 3, 0); + var box = new BoundingBox(10.0, 20.0, 110.0, 40.0); + var citation = new Citation(loc, "Alex Chen", 0.97, Optional.of(box)); + var confidence = new Confidence(0.91, "exact substring match"); + var prov = new Provenance( + "anthropic", + "claude-sonnet-4-5-20250929", + Instant.parse("2026-05-07T07:30:00Z"), + Optional.empty(), + Optional.empty(), + Optional.empty(), + 0); + var result = new ExtractionResult<>( + new Person("Alex Chen", 30), Map.of("name", citation), Map.of("name", confidence), prov); + + JsonNode entry = MAPPER.readTree(result.toAuditJson()) + .path("prov:wasDerivedFrom") + .get(0); + + JsonNode bbox = entry.path("doctruth:boundingBox"); + assertThat(bbox.path("x0").asDouble()).isEqualTo(10.0); + assertThat(bbox.path("y1").asDouble()).isEqualTo(40.0); + } + @Test @DisplayName("each confidence entry appears under 'doctruth:confidence' with score + rationale") void confidenceMapAppears() throws Exception { diff --git a/src/test/java/ai/doctruth/ExtractionResultConvenienceTest.java b/src/test/java/ai/doctruth/ExtractionResultConvenienceTest.java new file mode 100644 index 0000000..8767ccc --- /dev/null +++ 
b/src/test/java/ai/doctruth/ExtractionResultConvenienceTest.java @@ -0,0 +1,92 @@ +package ai.doctruth; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatNullPointerException; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.time.Instant; +import java.util.Map; +import java.util.Optional; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +class ExtractionResultConvenienceTest { + + @TempDir + Path tempDir; + + @Test + void citationReturnsFieldCitationByPath() { + var citation = new Citation(new SourceLocation(1, 1, 2, 2, 10), "Alex Chen", 0.99); + var result = result(Map.of("name", citation)); + + assertThat(result.citation("name")).isEqualTo(citation); + assertThat(result.citation("missing")).isNull(); + assertThat(result.findCitation("name")).contains(citation); + assertThat(result.findCitation("missing")).isEmpty(); + assertThat(result.requireCitation("name")).isEqualTo(citation); + } + + @Test + void citationRejectsNullFieldPath() { + var result = result(Map.of()); + + assertThatNullPointerException().isThrownBy(() -> result.citation(null)).withMessageContaining("fieldPath"); + assertThatNullPointerException() + .isThrownBy(() -> result.findCitation(null)) + .withMessageContaining("fieldPath"); + assertThatNullPointerException() + .isThrownBy(() -> result.requireCitation(null)) + .withMessageContaining("fieldPath"); + } + + @Test + void requireCitationExplainsMissingFieldPath() { + var result = result(Map.of()); + + assertThatThrownBy(() -> result.requireCitation("name")) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("name") + .hasMessageContaining("No citation"); + } + + @Test + void writeAuditPathDelegatesToAuditJsonFileOutput() throws Exception { + var result = result(Map.of("name", new Citation(new SourceLocation(1, 1, 2, 2, 10), "Alex Chen", 
0.99))); + Path path = tempDir.resolve("audit/result.json"); + + result.writeAudit(path); + + assertThat(Files.readString(path)).isEqualTo(result.toAuditJson()); + } + + @Test + void writeAuditStringDelegatesToAuditJsonFileOutput() throws Exception { + var result = result(Map.of("name", new Citation(new SourceLocation(1, 1, 2, 2, 10), "Alex Chen", 0.99))); + Path path = tempDir.resolve("audit-string.json"); + + result.writeAudit(path.toString()); + + assertThat(Files.readString(path)).isEqualTo(result.toAuditJson()); + } + + private static ExtractionResult result(Map citations) { + return new ExtractionResult<>( + new Person("Alex Chen"), + citations, + Map.of(), + new Provenance( + "openai", + "test-model", + Instant.EPOCH, + Optional.empty(), + Optional.empty(), + Optional.empty(), + 0)); + } + + private record Person(String name) {} +} diff --git a/src/test/java/ai/doctruth/PdfDocumentParserTest.java b/src/test/java/ai/doctruth/PdfDocumentParserTest.java index 3f3a60a..65e1de8 100644 --- a/src/test/java/ai/doctruth/PdfDocumentParserTest.java +++ b/src/test/java/ai/doctruth/PdfDocumentParserTest.java @@ -56,6 +56,14 @@ void singlePageProducesOneTextSection() throws Exception { assertThat(ts.location().pageStart()).isEqualTo(1); assertThat(ts.location().pageEnd()).isEqualTo(1); assertThat(ts.kind()).isNotNull(); + assertThat(ts.boundingBox()).hasValueSatisfying(bbox -> { + assertThat(bbox.x0()).isLessThan(bbox.x1()); + assertThat(bbox.y0()).isLessThan(bbox.y1()); + assertThat(bbox.x0()).isBetween(0.0, 1000.0); + assertThat(bbox.x1()).isBetween(0.0, 1000.0); + assertThat(bbox.y0()).isBetween(0.0, 1000.0); + assertThat(bbox.y1()).isBetween(0.0, 1000.0); + }); } @Test diff --git a/src/test/java/ai/doctruth/PublicApiSnapshotTest.java b/src/test/java/ai/doctruth/PublicApiSnapshotTest.java new file mode 100644 index 0000000..6bc2cb4 --- /dev/null +++ b/src/test/java/ai/doctruth/PublicApiSnapshotTest.java @@ -0,0 +1,169 @@ +package ai.doctruth; + +import static 
org.assertj.core.api.Assertions.assertThat; + +import java.io.IOException; +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; +import java.lang.reflect.Modifier; +import java.lang.reflect.RecordComponent; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; + +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Test; + +class PublicApiSnapshotTest { + + private static final Path SNAPSHOT = Path.of("src/test/resources/ai/doctruth/public-api-snapshot.txt"); + + @Test + @DisplayName("public SDK API surface changes are explicit") + void publicApiSurfaceMatchesSnapshot() throws Exception { + String actual = String.join("\n", publicApiLines()) + "\n"; + if (Boolean.getBoolean("doctruth.updatePublicApiSnapshot")) { + Files.createDirectories(SNAPSHOT.getParent()); + Files.writeString(SNAPSHOT, actual); + } + assertThat(Files.readString(SNAPSHOT)).isEqualTo(actual); + } + + private static List publicApiLines() throws Exception { + var lines = new ArrayList(); + for (Class type : publicApiTypes()) { + addType(lines, type); + } + return lines; + } + + private static List> publicApiTypes() throws Exception { + var types = new ArrayList>(); + for (Path source : publicApiSources()) { + Class type = Class.forName(className(source)); + if (Modifier.isPublic(type.getModifiers())) { + types.add(type); + } + } + types.sort(Comparator.comparing(Class::getCanonicalName)); + return types; + } + + private static List publicApiSources() throws IOException { + var roots = List.of(Path.of("src/main/java/ai/doctruth"), Path.of("src/main/java/ai/doctruth/spi")); + var sources = new ArrayList(); + for (Path root : roots) { + try (var files = Files.list(root)) { + files.filter(PublicApiSnapshotTest::isJavaSource).forEach(sources::add); + } + } + return sources; + } + + private static boolean isJavaSource(Path path) { + return 
path.toString().endsWith(".java") + && !path.getFileName().toString().equals("package-info.java"); + } + + private static String className(Path source) { + String path = Path.of("src/main/java").relativize(source).toString(); + return path.substring(0, path.length() - ".java".length()).replace('/', '.'); + } + + private static void addType(List lines, Class type) { + lines.add("TYPE " + kind(type) + " " + type.getCanonicalName() + modifiers(type.getModifiers())); + addPermits(lines, type); + addEnumConstants(lines, type); + addRecordComponents(lines, type); + addConstructors(lines, type); + addMethods(lines, type); + lines.add(""); + } + + private static String kind(Class type) { + if (type.isAnnotation()) return "annotation"; + if (type.isEnum()) return "enum"; + if (type.isRecord()) return "record"; + if (type.isInterface()) return "interface"; + return "class"; + } + + private static String modifiers(int modifiers) { + String text = Modifier.toString(modifiers); + return text.isBlank() ? 
"" : " [" + text + "]"; + } + + private static void addPermits(List lines, Class type) { + Class[] permitted = type.getPermittedSubclasses(); + if (permitted != null && permitted.length > 0) { + lines.add(" permits " + joinTypes(permitted)); + } + } + + private static void addEnumConstants(List lines, Class type) { + Object[] constants = type.getEnumConstants(); + if (constants != null) { + lines.add(" enum-constants " + + String.join( + ", ", Arrays.stream(constants).map(Object::toString).toList())); + } + } + + private static void addRecordComponents(List lines, Class type) { + RecordComponent[] components = type.getRecordComponents(); + if (components != null) { + lines.add(" record-components " + + String.join( + ", ", + Arrays.stream(components) + .map(c -> typeName(c.getType()) + " " + c.getName()) + .toList())); + } + } + + private static void addConstructors(List lines, Class type) { + Arrays.stream(type.getDeclaredConstructors()) + .filter(c -> Modifier.isPublic(c.getModifiers())) + .sorted(Comparator.comparing(PublicApiSnapshotTest::constructorSignature)) + .map(c -> " ctor " + constructorSignature(c)) + .forEach(lines::add); + } + + private static String constructorSignature(Constructor constructor) { + return constructor.getDeclaringClass().getSimpleName() + "(" + joinTypes(constructor.getParameterTypes()) + ")"; + } + + private static void addMethods(List lines, Class type) { + Arrays.stream(type.getDeclaredMethods()) + .filter(m -> Modifier.isPublic(m.getModifiers())) + .filter(m -> !m.isBridge() && !m.isSynthetic()) + .sorted(Comparator.comparing(PublicApiSnapshotTest::methodSignature)) + .map(m -> " method " + methodSignature(m)) + .forEach(lines::add); + } + + private static String methodSignature(Method method) { + return typeName(method.getReturnType()) + + " " + + method.getName() + + "(" + + joinTypes(method.getParameterTypes()) + + ")" + + modifiers(method.getModifiers()); + } + + private static String joinTypes(Class[] types) { + return 
String.join( + ", ", Arrays.stream(types).map(PublicApiSnapshotTest::typeName).toList()); + } + + private static String typeName(Class type) { + if (!type.isArray()) { + return type.getCanonicalName(); + } + return typeName(type.componentType()) + "[]"; + } +} diff --git a/src/test/java/ai/doctruth/TextSectionTest.java b/src/test/java/ai/doctruth/TextSectionTest.java index 7c2bde4..328493b 100644 --- a/src/test/java/ai/doctruth/TextSectionTest.java +++ b/src/test/java/ai/doctruth/TextSectionTest.java @@ -3,6 +3,8 @@ import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatThrownBy; +import java.util.Optional; + import org.junit.jupiter.api.DisplayName; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -22,6 +24,7 @@ class TextSectionTest { private static final SourceLocation LOC = new SourceLocation(1, 1, 1, 1, 0); + private static final BoundingBox BBOX = new BoundingBox(10.0, 20.0, 110.0, 40.0); @Nested @DisplayName("happy path") @@ -45,6 +48,15 @@ void threeArgRetainsKind() { assertThat(section.text()).isEqualTo("Section 1 — Indemnities"); assertThat(section.location()).isEqualTo(LOC); assertThat(section.kind()).isEqualTo(BlockKind.HEADING); + assertThat(section.boundingBox()).isEmpty(); + } + + @Test + @DisplayName("four-arg constructor retains a page-normalized bounding box") + void fourArgRetainsBoundingBox() { + var section = new TextSection("Section 1", LOC, BlockKind.HEADING, Optional.of(BBOX)); + + assertThat(section.boundingBox()).contains(BBOX); } @Test @@ -112,6 +124,14 @@ void nullKind() { .hasMessageContaining("kind"); } + @Test + @DisplayName("rejects null boundingBox optional") + void nullBoundingBoxOptional() { + assertThatThrownBy(() -> new TextSection("hello", LOC, BlockKind.OTHER, null)) + .isInstanceOf(NullPointerException.class) + .hasMessageContaining("boundingBox"); + } + @Test @DisplayName("two-arg convenience constructor still rejects null text") void 
twoArgNullText() { diff --git a/src/test/java/ai/doctruth/cli/CliExtractOptionsTest.java b/src/test/java/ai/doctruth/cli/CliExtractOptionsTest.java new file mode 100644 index 0000000..cb2564f --- /dev/null +++ b/src/test/java/ai/doctruth/cli/CliExtractOptionsTest.java @@ -0,0 +1,136 @@ +package ai.doctruth.cli; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.net.URI; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Map; +import java.util.concurrent.atomic.AtomicReference; + +import ai.doctruth.LlmProvider; +import ai.doctruth.OpenAiProvider; +import ai.doctruth.ProviderRequest; +import ai.doctruth.ProviderResponse; +import ai.doctruth.ProviderUsage; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +class CliExtractOptionsTest { + + @TempDir + Path tempDir; + + @Test + void extractAcceptsProviderPromptBaseUrlAndExplicitRequiredFields() throws Exception { + Path pdf = samplePdf(); + Path schema = schemaFile(); + Path out = tempDir.resolve("run"); + var seenProvider = new AtomicReference(); + var seenPrompt = new AtomicReference(); + var cli = cliWithProviderFactory(options -> { + seenProvider.set(options); + return providerCapturingPrompt(seenPrompt); + }); + + int code = cli.run(new String[] { + "extract", + pdf.toString(), + "-s", + schema.toString(), + "-o", + out.toString(), + "--provider", + "openai", + "--model", + "gpt-test", + "--base-url", + "https://example.test/v1", + "--require", + "partyA,totalValue", + "--prompt", + "Extract contract fields." 
+ }); + + assertThat(code).isZero(); + assertThat(seenProvider.get().model()).contains("gpt-test"); + assertThat(seenProvider.get().baseUrl()).contains(URI.create("https://example.test/v1")); + assertThat(seenPrompt.get()).isEqualTo("Extract contract fields."); + assertThat(Files.readString(out.resolve("audit.json"))) + .contains("partyA") + .contains("totalValue"); + } + + private TestCli cliWithProviderFactory(DocTruthCli.ProviderFactory providers) { + var out = new ByteArrayOutputStream(); + var err = new ByteArrayOutputStream(); + var cli = new DocTruthCli( + Map.of("OPENAI_API_KEY", "test"), + new PrintStream(out, true, StandardCharsets.UTF_8), + new PrintStream(err, true, StandardCharsets.UTF_8), + spec -> "{}", + providers); + return new TestCli(cli, out, err); + } + + private static LlmProvider providerCapturingPrompt(AtomicReference seenPrompt) { + return new OpenAiProvider("test", URI.create("http://localhost"), "test-model") { + @Override + public ProviderResponse complete(ProviderRequest request) { + seenPrompt.set(request.systemPrompt()); + return new ProviderResponse( + "{\"partyA\":\"Acme Industrial Materials Pty Ltd\",\"totalValue\":\"AUD 2,450,000\"}", + new ProviderUsage(1, 1, "test-model")); + } + }; + } + + private Path schemaFile() throws Exception { + Path schema = tempDir.resolve("contract.schema.json"); + Files.writeString(schema, """ + { + "type": "object", + "properties": { + "partyA": { "type": "string" }, + "totalValue": { "type": "string" } + }, + "required": ["partyA", "totalValue"] + } + """); + return schema; + } + + private Path samplePdf() throws Exception { + Path path = tempDir.resolve("contract.pdf"); + try (var pdf = new PDDocument()) { + var page = new PDPage(); + pdf.addPage(page); + try (var cs = new PDPageContentStream(pdf, page)) { + cs.beginText(); + cs.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12); + cs.newLineAtOffset(50, 720); + cs.showText("Party A: Acme Industrial Materials Pty Ltd"); + 
cs.newLineAtOffset(0, -18); + cs.showText("Total Value: AUD 2,450,000"); + cs.endText(); + } + pdf.save(path.toFile()); + } + return path; + } + + private record TestCli(DocTruthCli delegate, ByteArrayOutputStream outBytes, ByteArrayOutputStream errBytes) { + int run(String[] args) { + return delegate.run(args); + } + } +} diff --git a/src/test/java/ai/doctruth/cli/CliSupportTest.java b/src/test/java/ai/doctruth/cli/CliSupportTest.java new file mode 100644 index 0000000..722f2e0 --- /dev/null +++ b/src/test/java/ai/doctruth/cli/CliSupportTest.java @@ -0,0 +1,182 @@ +package ai.doctruth.cli; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.lang.reflect.InvocationTargetException; +import java.net.URI; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import ai.doctruth.BlockKind; +import ai.doctruth.DocumentMetadata; +import ai.doctruth.FigureSection; +import ai.doctruth.ParsedDocument; +import ai.doctruth.SourceLocation; +import ai.doctruth.TableSection; +import ai.doctruth.TextSection; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import org.apache.poi.xwpf.usermodel.XWPFDocument; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +class CliSupportTest { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + @TempDir + Path tempDir; + + @Test + void parsedDocumentJsonHandlesAllSectionTypes() throws Exception { + var loc = new SourceLocation(1, 1, 1, 1, 0); + var doc = new ParsedDocument( + "doc", + java.util.List.of( + new TextSection("hello", loc, BlockKind.BODY), + new TableSection(java.util.List.of(java.util.List.of("a")), loc), + new FigureSection("chart", loc)), + new DocumentMetadata("sample.pdf", 1, Optional.empty())); + + var tree = 
MAPPER.readTree(ParsedDocumentJson.toJson(doc)); + + assertThat(tree.path("sections").get(0).path("type").asText()).isEqualTo("text"); + assertThat(tree.path("sections").get(1).path("type").asText()).isEqualTo("table"); + assertThat(tree.path("sections").get(2).path("type").asText()).isEqualTo("figure"); + } + + @Test + void providerFactoryCreatesSupportedProviders() throws Exception { + var env = Map.of( + "OPENAI_API_KEY", "openai-key", + "ANTHROPIC_API_KEY", "anthropic-key", + "GOOGLE_API_KEY", "gemini-key", + "DEEPSEEK_API_KEY", "deepseek-key"); + var config = new CliConfig("openai", Optional.empty(), tempDir, env); + + assertThat(Providers.create(new ProviderConfig("openai", Optional.of("gpt-test"), Optional.empty(), config)) + .name()) + .isEqualTo("openai"); + assertThat(Providers.create(new ProviderConfig("anthropic", Optional.empty(), Optional.empty(), config)) + .name()) + .isEqualTo("anthropic"); + assertThat(Providers.create(new ProviderConfig("gemini", Optional.empty(), Optional.empty(), config)) + .name()) + .isEqualTo("gemini"); + assertThat(Providers.create(new ProviderConfig("deepseek", Optional.empty(), Optional.empty(), config)) + .name()) + .isEqualTo("deepseek"); + } + + @Test + void providerFactoryHandlesBaseUrlsAndErrors() throws Exception { + var config = new CliConfig("openai", Optional.empty(), tempDir, Map.of("OPENAI_API_KEY", "openai-key")); + + assertThat(Providers.create(new ProviderConfig( + "openai", Optional.empty(), Optional.of(URI.create("https://example.test/v1")), config)) + .name()) + .isEqualTo("openai"); + assertThat(Providers.create(new ProviderConfig( + "openai", + Optional.empty(), + Optional.of(URI.create("https://example.test/v1/chat/completions")), + config)) + .name()) + .isEqualTo("openai"); + assertThatThrownBy( + () -> Providers.create(new ProviderConfig("wat", Optional.empty(), Optional.empty(), config))) + .isInstanceOf(CliException.class) + .hasMessageContaining("unsupported provider"); + assertThatThrownBy(() -> 
Providers.create(new ProviderConfig( + "openai", + Optional.empty(), + Optional.empty(), + new CliConfig("openai", Optional.empty(), tempDir, Map.of("OPENAI_API_KEY", " "))))) + .isInstanceOf(CliException.class) + .hasMessageContaining("missing OPENAI_API_KEY"); + } + + @Test + void documentParsersRouteSupportedFormatsAndReportFailures() throws Exception { + Path docx = writeDocx("default.docx"); + Path xlsx = writeXlsx("skills.xlsx"); + Path csv = writeCsv("iris.csv"); + + assertThat(DocumentParsers.parse(docx).metadata().sourceFilename()).isEqualTo("default.docx"); + assertThat(DocumentParsers.parse(xlsx).sections()).isNotEmpty(); + assertThat(DocumentParsers.parse(csv).sections()).isNotEmpty(); + + Path invalidPdf = tempDir.resolve("broken.pdf"); + Files.writeString(invalidPdf, "not a pdf"); + assertThatThrownBy(() -> DocumentParsers.parse(invalidPdf)) + .isInstanceOf(CliException.class) + .hasMessageContaining("failed to parse"); + assertThatThrownBy(() -> DocumentParsers.parse(tempDir.resolve("README"))) + .isInstanceOf(CliException.class) + .hasMessageContaining("unsupported document format"); + } + + private Path writeDocx(String filename) throws Exception { + Path path = tempDir.resolve(filename); + try (var docx = new XWPFDocument()) { + docx.createParagraph().createRun().setText("hello from docx"); + try (var out = Files.newOutputStream(path)) { + docx.write(out); + } + } + return path; + } + + private Path writeXlsx(String filename) throws Exception { + Path path = tempDir.resolve(filename); + try (var workbook = new XSSFWorkbook()) { + var sheet = workbook.createSheet("Sheet1"); + var row = sheet.createRow(0); + row.createCell(0).setCellValue("skill"); + row.createCell(1).setCellValue("evidence"); + try (var out = Files.newOutputStream(path)) { + workbook.write(out); + } + } + return path; + } + + private Path writeCsv(String filename) throws Exception { + Path path = tempDir.resolve(filename); + Files.write(path, List.of("species,sepal", 
"setosa,5.1")); + return path; + } + + @Test + void utilityConstructorsRejectReflectionInstantiation() throws Exception { + assertUtilityConstructorRejects(Usage.class); + assertUtilityConstructorRejects(Providers.class); + assertUtilityConstructorRejects(DocumentParsers.class); + } + + @Test + void cliConfigLoadsYamlLikeDefaults() throws Exception { + Path configPath = tempDir.resolve("doctruth.yml"); + Files.writeString(configPath, "provider: anthropic\nmodel: claude-test\noutput: build/runs\n"); + + var config = CliConfig.load(configPath, Map.of()); + + assertThat(config.provider()).isEqualTo("anthropic"); + assertThat(config.model()).contains("claude-test"); + assertThat(config.output()).isEqualTo(Path.of("build/runs")); + } + + private static void assertUtilityConstructorRejects(Class type) throws Exception { + var constructor = type.getDeclaredConstructor(); + constructor.setAccessible(true); + assertThatThrownBy(constructor::newInstance) + .isInstanceOf(InvocationTargetException.class) + .cause() + .isInstanceOf(AssertionError.class); + } +} diff --git a/src/test/java/ai/doctruth/cli/DocTruthCliDoctorCompletionTest.java b/src/test/java/ai/doctruth/cli/DocTruthCliDoctorCompletionTest.java new file mode 100644 index 0000000..604faa8 --- /dev/null +++ b/src/test/java/ai/doctruth/cli/DocTruthCliDoctorCompletionTest.java @@ -0,0 +1,110 @@ +package ai.doctruth.cli; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import java.util.Map; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.junit.jupiter.api.Test; + +class DocTruthCliDoctorCompletionTest { + + private static final ObjectMapper MAPPER = new ObjectMapper(); + + @Test + void doctorReportsRuntimeAndConfigurationReadiness() { + var cli = cli(Map.of("OPENAI_API_KEY", "test-key")); + + int code = cli.run(new String[] {"doctor"}); + + assertThat(code).isZero(); + 
assertThat(cli.out()) + .contains("DocTruth doctor") + .contains("java:") + .contains("project:") + .contains("OPENAI_API_KEY: set") + .contains("ready:"); + } + + @Test + void doctorJsonReportsMachineReadableReadiness() throws Exception { + var cli = cli(Map.of()); + + int code = cli.run(new String[] {"doctor", "--json"}); + + assertThat(code).isZero(); + var tree = MAPPER.readTree(cli.out()); + assertThat(tree.path("java").path("version").asText()).isNotBlank(); + assertThat(tree.path("env").path("OPENAI_API_KEY").asBoolean()).isFalse(); + } + + @Test + void completionPrintsShellScript() { + var cli = cli(Map.of()); + + int code = cli.run(new String[] {"completion", "bash"}); + + assertThat(code).isZero(); + assertThat(cli.out()).contains("_doctruth").contains("doctor").contains("completion"); + } + + @Test + void completionSupportsZshAndFish() { + var zsh = cli(Map.of()); + var fish = cli(Map.of()); + + assertThat(zsh.run(new String[] {"completion", "zsh"})).isZero(); + assertThat(fish.run(new String[] {"completion", "fish"})).isZero(); + assertThat(zsh.out()).contains("#compdef doctruth").contains("compadd"); + assertThat(fish.out()).contains("complete -c doctruth"); + } + + @Test + void completionRejectsMissingShell() { + var cli = cli(Map.of()); + + int code = cli.run(new String[] {"completion"}); + + assertThat(code).isEqualTo(2); + assertThat(cli.err()).contains("doctruth completion "); + } + + @Test + void completionRejectsUnsupportedShell() { + var cli = cli(Map.of()); + + int code = cli.run(new String[] {"completion", "powershell"}); + + assertThat(code).isEqualTo(2); + assertThat(cli.err()).contains("supported shells: bash, zsh, fish"); + } + + private static TestCli cli(Map env) { + var out = new ByteArrayOutputStream(); + var err = new ByteArrayOutputStream(); + var cli = new DocTruthCli( + env, + new PrintStream(out, true, StandardCharsets.UTF_8), + new PrintStream(err, true, StandardCharsets.UTF_8), + spec -> "{}", + Providers::create); + return new 
TestCli(cli, out, err); + } + + private record TestCli(DocTruthCli delegate, ByteArrayOutputStream outBytes, ByteArrayOutputStream errBytes) { + int run(String[] args) { + return delegate.run(args); + } + + String err() { + return errBytes.toString(StandardCharsets.UTF_8); + } + + String out() { + return outBytes.toString(StandardCharsets.UTF_8); + } + } +} diff --git a/src/test/java/ai/doctruth/cli/DocTruthCliTest.java b/src/test/java/ai/doctruth/cli/DocTruthCliTest.java index 557e66f..4a7feed 100644 --- a/src/test/java/ai/doctruth/cli/DocTruthCliTest.java +++ b/src/test/java/ai/doctruth/cli/DocTruthCliTest.java @@ -6,134 +6,374 @@ import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.PrintStream; +import java.net.URI; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.Map; +import ai.doctruth.LlmProvider; +import ai.doctruth.OpenAiProvider; +import ai.doctruth.ProviderRequest; +import ai.doctruth.ProviderResponse; +import ai.doctruth.ProviderUsage; + +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageContentStream; +import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; class DocTruthCliTest { + private static final ObjectMapper MAPPER = new ObjectMapper(); + @TempDir Path tempDir; @Test - void migratePydanticExportsSchemaAndChecksCompatibility() throws Exception { - Path schemaPath = tempDir.resolve("resume.schema.json"); - var cli = cliReturning(""" - { - "$defs": { - "Address": { - "type": "object", - "properties": { "city": { "type": "string" } } - } - }, - "type": "object", - "properties": { "address": { "$ref": "#/$defs/Address" } }, - "required": ["address"] - } - """); + void 
helpReturnsZeroAndListsProductCommands() { + var cli = cliWithRealProviders(Map.of()); - int code = cli.run( - new String[] {"migrate", "pydantic", "myapp.schemas:Resume", "--out", schemaPath.toString(), "--check" - }); + int code = cli.run(new String[] {"--help"}); - assertThat(code).isEqualTo(0); - assertThat(Files.readString(schemaPath)).contains("\"$defs\"").contains("\"Address\""); - assertThat(cli.out()).contains("schema compatible").contains(schemaPath.toString()); + assertThat(code).isZero(); + assertThat(cli.out()) + .contains("doctruth parse ") + .contains("doctruth extract -s ") + .doesNotContain("migrate pydantic"); } @Test - void migratePydanticRejectsInvalidModelSpec() { + void versionReturnsZeroAndPrintsVersion() { + var cli = cliWithRealProviders(Map.of()); + + int code = cli.run(new String[] {"version"}); + + assertThat(code).isZero(); + assertThat(cli.out()).contains("DocTruth").contains("0.2.0-alpha"); + } + + @Test + void unknownCommandReturnsUsageError() { + var cli = cliWithRealProviders(Map.of()); + + int code = cli.run(new String[] {"wat"}); + + assertThat(code).isEqualTo(2); + assertThat(cli.err()).contains("unknown command: wat").contains("doctruth --help"); + } + + @Test + void initWritesDefaultConfigAndDirectories() throws Exception { var cli = cliReturning("{}"); - int code = cli.run(new String[] { - "migrate", "pydantic", "Resume", "--out", tempDir.resolve("x.json").toString() - }); + int code = cli.run(new String[] {"init", "--dir", tempDir.toString()}); + + assertThat(code).isZero(); + assertThat(Files.readString(tempDir.resolve("doctruth.yml"))).contains("provider: openai"); + assertThat(Files.isDirectory(tempDir.resolve("schemas"))).isTrue(); + assertThat(Files.isDirectory(tempDir.resolve(".doctruth/runs"))).isTrue(); + } + + @Test + void initRejectsUnknownOption() { + var cli = cliReturning("{}"); + + int code = cli.run(new String[] {"init", "--wat"}); assertThat(code).isEqualTo(2); - assertThat(cli.err()).contains("expected :"); 
+ assertThat(cli.err()).contains("unknown init option"); } @Test - void migratePydanticRejectsMissingOutOption() { + void parsePrintsSummaryWithoutLlmKey() throws Exception { + Path pdf = samplePdf(); var cli = cliReturning("{}"); - int code = cli.run(new String[] {"migrate", "pydantic", "myapp.schemas:Resume"}); + int code = cli.run(new String[] {"parse", pdf.toString(), "--bboxes"}); + + assertThat(code).isZero(); + assertThat(cli.out()).contains("pages: 1").contains("sections:").contains("bbox coverage:"); + } + + @Test + void parseJsonWritesStructuredSections() throws Exception { + Path pdf = samplePdf(); + Path out = tempDir.resolve("parsed.json"); + var cli = cliReturning("{}"); + + int code = cli.run(new String[] {"parse", pdf.toString(), "--json", "-o", out.toString()}); + + assertThat(code).isZero(); + var tree = MAPPER.readTree(Files.readString(out)); + assertThat(tree.path("metadata").path("sourceFilename").asText()) + .isEqualTo(pdf.getFileName().toString()); + assertThat(tree.path("sections")).isNotEmpty(); + } + + @Test + void parseRejectsUnsupportedFormat() throws Exception { + Path file = tempDir.resolve("notes.txt"); + Files.writeString(file, "hello"); + var cli = cliReturning("{}"); + + int code = cli.run(new String[] {"parse", file.toString()}); + + assertThat(code).isEqualTo(1); + assertThat(cli.err()).contains("unsupported document format"); + } + + @Test + void parseRejectsBadUsage() { + var cli = cliReturning("{}"); + + int code = cli.run(new String[] {"parse", "--json"}); + + assertThat(code).isEqualTo(1); + assertThat(cli.err()).contains("unsupported document format"); + } + + @Test + void schemaChecksJsonSchemaByDefault() throws Exception { + Path schema = schemaFile(); + var cli = cliReturning("{}"); + + int code = cli.run(new String[] {"schema", schema.toString()}); + + assertThat(code).isZero(); + assertThat(cli.out()) + .contains("schema compatible") + .contains("fields: 2") + .contains("required: 2"); + } + + @Test + void 
schemaJsonPrintsMachineReadableSummary() throws Exception { + Path schema = schemaFile(); + var cli = cliReturning("{}"); + + int code = cli.run(new String[] {"schema", schema.toString(), "--json"}); + + assertThat(code).isZero(); + var tree = MAPPER.readTree(cli.out()); + assertThat(tree.path("compatible").asBoolean()).isTrue(); + assertThat(tree.path("fieldCount").asInt()).isEqualTo(2); + } + + @Test + void schemaRejectsRemoteRefs() throws Exception { + Path schema = tempDir.resolve("bad.schema.json"); + Files.writeString(schema, "{\"type\":\"object\",\"properties\":{\"x\":{\"$ref\":\"https://example.com/x\"}}}"); + var cli = cliReturning("{}"); + + int code = cli.run(new String[] {"schema", schema.toString()}); + + assertThat(code).isEqualTo(1); + assertThat(cli.err()).contains("schema compatibility check failed"); + } + + @Test + void schemaRejectsUnknownOption() throws Exception { + Path schema = schemaFile(); + var cli = cliReturning("{}"); + + int code = cli.run(new String[] {"schema", schema.toString(), "--wat"}); + + assertThat(code).isEqualTo(2); + assertThat(cli.err()).contains("unknown schema option"); + } + + @Test + void extractWritesResultAndAuditToRunDirectory() throws Exception { + Path pdf = samplePdf(); + Path schema = schemaFile(); + Path out = tempDir.resolve("run"); + var cli = cliWithProvider(cannedProvider()); + + int code = cli.run(new String[] {"extract", pdf.toString(), "-s", schema.toString(), "-o", out.toString()}); + + assertThat(code).isZero(); + assertThat(Files.readString(out.resolve("result.json"))).contains("Acme Industrial Materials Pty Ltd"); + assertThat(Files.readString(out.resolve("audit.json"))).contains("doctruth:fieldPath"); + assertThat(cli.out()).contains("fields: 2").contains("cited: 2").contains("audit:"); + } + + @Test + void extractReportsMissingProviderKey() throws Exception { + Path pdf = samplePdf(); + Path schema = schemaFile(); + var cli = cliWithRealProviders(Map.of()); + + int code = cli.run(new String[] 
{"extract", pdf.toString(), "-s", schema.toString()}); + + assertThat(code).isEqualTo(1); + assertThat(cli.err()).contains("missing OPENAI_API_KEY"); + } + + @Test + void extractRejectsMissingSchemaOption() throws Exception { + Path pdf = samplePdf(); + var cli = cliReturning("{}"); + + int code = cli.run(new String[] {"extract", pdf.toString()}); assertThat(code).isEqualTo(2); - assertThat(cli.err()).contains("--out is required"); + assertThat(cli.err()).contains("-s is required"); } @Test - void helpAndUnknownCommandsReturnUsage() { - var help = cliReturning("{}"); - var unknown = cliReturning("{}"); + void extractAllowUncitedDoesNotRequireFieldMatches() throws Exception { + Path pdf = samplePdf(); + Path schema = schemaFile(); + Path out = tempDir.resolve("loose-run"); + var cli = cliWithProvider(new OpenAiProvider("test", URI.create("http://localhost"), "test-model") { + @Override + public ProviderResponse complete(ProviderRequest request) { + return new ProviderResponse( + "{\"partyA\":\"not in source\",\"totalValue\":\"not in source\"}", + new ProviderUsage(1, 1, "test-model")); + } + }); + + int code = cli.run(new String[] { + "extract", pdf.toString(), "-s", schema.toString(), "-o", out.toString(), "--allow-uncited" + }); + + assertThat(code).isZero(); + assertThat(Files.readString(out.resolve("audit.json"))).contains("not in source"); + } - int helpCode = help.run(new String[] {"--help"}); - int unknownCode = unknown.run(new String[] {"parse"}); + @Test + void auditPrintsReadableCitationSummary() throws Exception { + Path pdf = samplePdf(); + Path schema = schemaFile(); + Path out = tempDir.resolve("run"); + var extract = cliWithProvider(cannedProvider()); + assertThat(extract.run(new String[] {"extract", pdf.toString(), "-s", schema.toString(), "-o", out.toString()})) + .isZero(); + var audit = cliReturning("{}"); + + int code = audit.run(new String[] {"audit", out.resolve("audit.json").toString()}); + + assertThat(code).isZero(); + 
assertThat(audit.out()).contains("fields: 2").contains("partyA").contains("match:"); + } - assertThat(helpCode).isEqualTo(2); - assertThat(help.err()).contains("usage: doctruth migrate pydantic"); - assertThat(unknownCode).isEqualTo(2); - assertThat(unknown.err()).contains("usage: doctruth migrate pydantic"); + @Test + void auditJsonPrintsMachineReadableSummary() throws Exception { + Path pdf = samplePdf(); + Path schema = schemaFile(); + Path out = tempDir.resolve("run-json"); + var extract = cliWithProvider(cannedProvider()); + assertThat(extract.run(new String[] {"extract", pdf.toString(), "-s", schema.toString(), "-o", out.toString()})) + .isZero(); + var audit = cliReturning("{}"); + + int code = audit.run(new String[] {"audit", out.resolve("audit.json").toString(), "--json"}); + + assertThat(code).isZero(); + assertThat(MAPPER.readTree(audit.out()).path("fields").asInt()).isEqualTo(2); } @Test - void migratePydanticCreatesOutputParents() throws Exception { - Path schemaPath = tempDir.resolve("schemas/generated/resume.schema.json"); + void auditRejectsUnknownOption() { + var cli = cliReturning("{}"); + + int code = cli.run(new String[] {"audit", "x.json", "--wat"}); + + assertThat(code).isEqualTo(2); + assertThat(cli.err()).contains("unknown audit option"); + } + + @Test + void migratePydanticSupportsShortOutputOption() throws Exception { + Path schemaPath = tempDir.resolve("resume.schema.json"); var cli = cliReturning("{\"type\":\"object\"}"); - int code = - cli.run(new String[] {"migrate", "pydantic", "myapp.schemas:Resume", "--out", schemaPath.toString()}); + int code = cli.run(new String[] {"migrate", "pydantic", "myapp.schemas:Resume", "-o", schemaPath.toString()}); - assertThat(code).isEqualTo(0); + assertThat(code).isZero(); assertThat(Files.readString(schemaPath)).isEqualTo("{\"type\":\"object\"}"); } + @Test + void migratePydanticRejectsInvalidModelSpec() { + var cli = cliReturning("{}"); + + int code = cli.run(new String[] { + "migrate", "pydantic", 
"Resume", "-o", tempDir.resolve("x.json").toString() + }); + + assertThat(code).isEqualTo(2); + assertThat(cli.err()).contains("expected :"); + } + + @Test + void migratePydanticRejectsMissingOutput() { + var cli = cliReturning("{}"); + + int code = cli.run(new String[] {"migrate", "pydantic", "myapp.schemas:Resume"}); + + assertThat(code).isEqualTo(2); + assertThat(cli.err()).contains("-o is required"); + } + @Test void migratePydanticReportsInvalidExporterJson() { - Path schemaPath = tempDir.resolve("invalid.schema.json"); var cli = cliReturning("{not-json"); - int code = - cli.run(new String[] {"migrate", "pydantic", "myapp.schemas:Resume", "--out", schemaPath.toString()}); + int code = cli.run(new String[] { + "migrate", + "pydantic", + "myapp.schemas:Resume", + "-o", + tempDir.resolve("bad.json").toString() + }); assertThat(code).isEqualTo(1); - assertThat(Files.exists(schemaPath)).isFalse(); assertThat(cli.err()).contains("exported Pydantic schema is not valid JSON"); } @Test - void migratePydanticReportsExporterIoFailure() { - Path schemaPath = tempDir.resolve("resume.schema.json"); + void migratePydanticReportsExporterFailure() { var cli = cliWithExporter(spec -> { throw new IOException("python missing"); }); - int code = - cli.run(new String[] {"migrate", "pydantic", "myapp.schemas:Resume", "--out", schemaPath.toString()}); + int code = cli.run(new String[] { + "migrate", + "pydantic", + "myapp.schemas:Resume", + "-o", + tempDir.resolve("resume.json").toString() + }); assertThat(code).isEqualTo(1); assertThat(cli.err()).contains("failed to export Pydantic schema").contains("python missing"); } @Test - void migratePydanticPreservesInterruptedStatus() { - Path schemaPath = tempDir.resolve("resume.schema.json"); - var cli = cliWithExporter(spec -> { - throw new InterruptedException("cancelled"); - }); + void migratePydanticRejectsRemoteRefsBeforeWritingOutput() { + Path schemaPath = tempDir.resolve("bad.schema.json"); + var cli = cliReturning(""" + { + 
"type": "object", + "properties": { + "address": { "$ref": "https://example.com/schemas/address.json" } + } + } + """); - int code = - cli.run(new String[] {"migrate", "pydantic", "myapp.schemas:Resume", "--out", schemaPath.toString()}); + int code = cli.run( + new String[] {"migrate", "pydantic", "myapp.schemas:Resume", "-o", schemaPath.toString(), "--check"}); assertThat(code).isEqualTo(1); - assertThat(Thread.currentThread().isInterrupted()).isTrue(); - assertThat(cli.err()).contains("Pydantic schema export interrupted"); - Thread.interrupted(); + assertThat(Files.exists(schemaPath)).isFalse(); + assertThat(cli.err()).contains("unsupported $ref"); } @Test @@ -167,52 +407,91 @@ void pythonPydanticExporterReportsNonZeroExit() throws Exception { .hasMessageContaining("no pydantic"); } - @Test - void migratePydanticCheckRejectsRemoteRefsBeforeWritingOutput() { - Path schemaPath = tempDir.resolve("bad.schema.json"); - var cli = cliReturning(""" - { - "type": "object", - "properties": { - "address": { "$ref": "https://example.com/schemas/address.json" } - } - } - """); + private TestCli cliReturning(String schemaJson) { + return cliWithExporter(spec -> schemaJson); + } - int code = cli.run( - new String[] {"migrate", "pydantic", "myapp.schemas:Resume", "--out", schemaPath.toString(), "--check" - }); + private TestCli cliWithExporter(DocTruthCli.PydanticExporter exporter) { + return cliWith(Map.of(), exporter, opts -> cannedProvider()); + } - assertThat(code).isEqualTo(1); - assertThat(Files.exists(schemaPath)).isFalse(); - assertThat(cli.err()).contains("unsupported $ref"); + private TestCli cliWithProvider(LlmProvider provider) { + return cliWith(Map.of("OPENAI_API_KEY", "test"), spec -> "{}", opts -> provider); } - private TestCli cliReturning(String schemaJson) { - return cliWithExporter(spec -> schemaJson); + private TestCli cliWithRealProviders(Map env) { + return cliWith(env, spec -> "{}", Providers::create); } - private TestCli 
cliWithExporter(DocTruthCli.PydanticExporter exporter) { + private TestCli cliWith( + Map env, DocTruthCli.PydanticExporter exporter, DocTruthCli.ProviderFactory providers) { var out = new ByteArrayOutputStream(); var err = new ByteArrayOutputStream(); var cli = new DocTruthCli( - Map.of(), + env, new PrintStream(out, true, StandardCharsets.UTF_8), new PrintStream(err, true, StandardCharsets.UTF_8), - exporter); + exporter, + providers); return new TestCli(cli, out, err); } + private static LlmProvider cannedProvider() { + return new OpenAiProvider("test", URI.create("http://localhost"), "test-model") { + @Override + public ProviderResponse complete(ProviderRequest request) { + return new ProviderResponse( + "{\"partyA\":\"Acme Industrial Materials Pty Ltd\",\"totalValue\":\"AUD 2,450,000\"}", + new ProviderUsage(1, 1, "test-model")); + } + }; + } + + private Path schemaFile() throws IOException { + Path schema = tempDir.resolve("contract.schema.json"); + Files.writeString(schema, """ + { + "type": "object", + "properties": { + "partyA": { "type": "string" }, + "totalValue": { "type": "string" } + }, + "required": ["partyA", "totalValue"], + "additionalProperties": false + } + """); + return schema; + } + + private Path samplePdf() throws IOException { + Path path = tempDir.resolve("contract.pdf"); + try (var pdf = new PDDocument()) { + var page = new PDPage(); + pdf.addPage(page); + try (var cs = new PDPageContentStream(pdf, page)) { + cs.beginText(); + cs.setFont(new PDType1Font(Standard14Fonts.FontName.HELVETICA), 12); + cs.newLineAtOffset(50, 720); + cs.showText("Party A: Acme Industrial Materials Pty Ltd"); + cs.newLineAtOffset(0, -18); + cs.showText("Total Value: AUD 2,450,000"); + cs.endText(); + } + pdf.save(path.toFile()); + } + return path; + } + private record TestCli(DocTruthCli delegate, ByteArrayOutputStream outBytes, ByteArrayOutputStream errBytes) { int run(String[] args) { return delegate.run(args); } - public String err() { + String err() { return 
errBytes.toString(StandardCharsets.UTF_8); } - public String out() { + String out() { return outBytes.toString(StandardCharsets.UTF_8); } } diff --git a/src/test/java/ai/doctruth/internal/citation/CitationMatcherTest.java b/src/test/java/ai/doctruth/internal/citation/CitationMatcherTest.java index 9c198e5..27ba02a 100644 --- a/src/test/java/ai/doctruth/internal/citation/CitationMatcherTest.java +++ b/src/test/java/ai/doctruth/internal/citation/CitationMatcherTest.java @@ -8,6 +8,7 @@ import java.util.Map; import java.util.Optional; +import ai.doctruth.BoundingBox; import ai.doctruth.Citation; import ai.doctruth.DocumentMetadata; import ai.doctruth.ParsedDocument; @@ -61,6 +62,12 @@ private static ParsedDocument doc(List pages) { "doc-1", List.copyOf(sections), new DocumentMetadata("test.pdf", pages.size(), Optional.empty())); } + private static ParsedDocument docWithBox(String text, BoundingBox box) { + var loc = new SourceLocation(1, 1, 1, 1, 0); + var section = new TextSection(text, loc, ai.doctruth.BlockKind.BODY, Optional.of(box)); + return new ParsedDocument("doc-1", List.of(section), new DocumentMetadata("test.pdf", 1, Optional.empty())); + } + @Nested @DisplayName("ExactMatch") class ExactMatch { @@ -80,6 +87,18 @@ void stringFieldExactMatch() { assertThat(c.location().pageStart()).isEqualTo(1); } + @Test + @DisplayName("an exact text-section match carries the section bounding box onto the citation") + void exactMatchCarriesBoundingBox() { + var box = new BoundingBox(10.0, 20.0, 110.0, 40.0); + var doc = docWithBox("Alex Chen, 30 years old", box); + var matcher = new CitationMatcher(); + + Map out = matcher.matchAll(new Person("Alex Chen", 30), doc); + + assertThat(out.get("name").boundingBox()).contains(box); + } + @Test @DisplayName("an integer field whose toString appears verbatim in a section gets matchScore == 1.0") void integerFieldExactMatch() { diff --git a/src/test/resources/ai/doctruth/public-api-snapshot.txt 
b/src/test/resources/ai/doctruth/public-api-snapshot.txt new file mode 100644 index 0000000..bd1d8da --- /dev/null +++ b/src/test/resources/ai/doctruth/public-api-snapshot.txt @@ -0,0 +1,422 @@ +TYPE class ai.doctruth.AnthropicProvider [public] + ctor AnthropicProvider(java.lang.String) + ctor AnthropicProvider(java.lang.String, java.net.URI, java.lang.String) + method ai.doctruth.ProviderResponse complete(ai.doctruth.ProviderRequest) [public] + method java.lang.String apiKey() [public] + method java.lang.String name() [public] + +TYPE enum ai.doctruth.BlockKind [public final] + enum-constants HEADING, BODY, LIST, OTHER + method ai.doctruth.BlockKind valueOf(java.lang.String) [public static] + method ai.doctruth.BlockKind[] values() [public static] + +TYPE record ai.doctruth.BoundingBox [public final] + record-components double x0, double y0, double x1, double y1 + ctor BoundingBox(double, double, double, double) + method boolean equals(java.lang.Object) [public final] + method double x0() [public] + method double x1() [public] + method double y0() [public] + method double y1() [public] + method int hashCode() [public final] + method java.lang.String toString() [public final] + +TYPE record ai.doctruth.Citation [public final] + record-components ai.doctruth.SourceLocation location, java.lang.String exactQuote, double matchScore, java.util.Optional boundingBox + ctor Citation(ai.doctruth.SourceLocation, java.lang.String, double) + ctor Citation(ai.doctruth.SourceLocation, java.lang.String, double, java.util.Optional) + method ai.doctruth.SourceLocation location() [public] + method boolean equals(java.lang.Object) [public final] + method double matchScore() [public] + method int hashCode() [public final] + method java.lang.String exactQuote() [public] + method java.lang.String toString() [public final] + method java.util.Optional boundingBox() [public] + +TYPE record ai.doctruth.Confidence [public final] + record-components double score, java.lang.String rationale + 
ctor Confidence(double, java.lang.String) + method boolean equals(java.lang.Object) [public final] + method double score() [public] + method int hashCode() [public final] + method java.lang.String rationale() [public] + method java.lang.String toString() [public final] + +TYPE interface ai.doctruth.ContextStrategy [public abstract interface] + permits ai.doctruth.PriorityTruncate, ai.doctruth.SlidingWindow, ai.doctruth.Hierarchical + method java.lang.String assemble(ai.doctruth.ParsedDocument) [public abstract] + +TYPE class ai.doctruth.CsvDocumentParser [public final] + method ai.doctruth.ParsedDocument parse(java.nio.file.Path) [public static] + +TYPE class ai.doctruth.DeepSeekProvider [public] + ctor DeepSeekProvider(java.lang.String) + ctor DeepSeekProvider(java.lang.String, java.net.URI, java.lang.String) + method ai.doctruth.ProviderResponse complete(ai.doctruth.ProviderRequest) [public] + method java.lang.String apiKey() [public] + method java.lang.String name() [public] + +TYPE class ai.doctruth.DocTruth [public final] + method ai.doctruth.DocTruth from(ai.doctruth.LlmProvider) [public static] + method ai.doctruth.DocTruthClient withOpenAi() [public static] + method ai.doctruth.DocTruthClient withOpenAi(java.lang.String) [public static] + method ai.doctruth.DocTruthClient withProvider(ai.doctruth.LlmProvider) [public static] + method ai.doctruth.ExtractionBuilder extract(java.lang.String, java.lang.Class) [public] + method ai.doctruth.JsonExtractionBuilder extractJson(java.lang.String, ai.doctruth.JsonSchema) [public] + +TYPE class ai.doctruth.DocTruthClient [public final] + method ai.doctruth.DocTruthDocument from(ai.doctruth.ParsedDocument) [public] + method ai.doctruth.DocTruthDocument fromCsv(java.nio.file.Path) [public] + method ai.doctruth.DocTruthDocument fromDocx(java.nio.file.Path) [public] + method ai.doctruth.DocTruthDocument fromPdf(java.lang.String) [public] + method ai.doctruth.DocTruthDocument fromPdf(java.nio.file.Path) [public] + method 
ai.doctruth.DocTruthDocument fromXlsx(java.nio.file.Path) [public] + +TYPE class ai.doctruth.DocTruthDocument [public final] + method ai.doctruth.DocumentExtractionBuilder extract(java.lang.String, java.lang.Class) [public] + method ai.doctruth.DocumentJsonExtractionBuilder extractJson(java.lang.String, ai.doctruth.JsonSchema) [public] + +TYPE class ai.doctruth.DocumentExtractionBuilder [public final] + method ai.doctruth.DocumentExtractionBuilder withContextStrategy(ai.doctruth.ContextStrategy) [public] + method ai.doctruth.DocumentExtractionBuilder withEvidence() [public] + method ai.doctruth.DocumentExtractionBuilder withMaxRetries(int) [public] + method ai.doctruth.DocumentExtractionBuilder withSourcePublishedAt(java.time.Instant) [public] + method ai.doctruth.ExtractionResult run() [public] + +TYPE class ai.doctruth.DocumentJsonExtractionBuilder [public final] + method ai.doctruth.DocumentJsonExtractionBuilder requireCitation(java.lang.String) [public] + method ai.doctruth.DocumentJsonExtractionBuilder withContextStrategy(ai.doctruth.ContextStrategy) [public] + method ai.doctruth.DocumentJsonExtractionBuilder withEvidence() [public] + method ai.doctruth.DocumentJsonExtractionBuilder withMaxRetries(int) [public] + method ai.doctruth.DocumentJsonExtractionBuilder withSourcePublishedAt(java.time.Instant) [public] + method ai.doctruth.ExtractionResult runJson() [public] + +TYPE record ai.doctruth.DocumentMetadata [public final] + record-components java.lang.String sourceFilename, int pageCount, java.util.Optional sourcePublishedAt + ctor DocumentMetadata(java.lang.String, int, java.util.Optional) + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method int pageCount() [public] + method java.lang.String sourceFilename() [public] + method java.lang.String toString() [public final] + method java.util.Optional sourcePublishedAt() [public] + +TYPE class ai.doctruth.DocxDocumentParser [public final] + method 
ai.doctruth.ParsedDocument parse(java.nio.file.Path) [public static] + +TYPE class ai.doctruth.ExtractionBuilder [public final] + method ai.doctruth.ExtractionBuilder withAuditListener(ai.doctruth.spi.AuditEventListener) [public] + method ai.doctruth.ExtractionBuilder withBitemporal() [public] + method ai.doctruth.ExtractionBuilder withConfidence() [public] + method ai.doctruth.ExtractionBuilder withContextStrategy(ai.doctruth.ContextStrategy) [public] + method ai.doctruth.ExtractionBuilder withFieldConstraint(java.lang.String, java.lang.Class, java.util.function.Predicate, java.lang.String) [public] + method ai.doctruth.ExtractionBuilder withMaxRetries(int) [public] + method ai.doctruth.ExtractionBuilder withObjectConstraint(java.util.function.Predicate, java.lang.String) [public] + method ai.doctruth.ExtractionBuilder withProvenance() [public] + method ai.doctruth.ExtractionBuilder withSourcePublishedAt(java.time.Instant) [public] + method ai.doctruth.ExtractionResult run(ai.doctruth.ParsedDocument) [public] + +TYPE class ai.doctruth.ExtractionException [public] + ctor ExtractionException(java.lang.String, java.lang.String, int) + ctor ExtractionException(java.lang.String, java.lang.String, int, java.lang.Throwable) + method int retries() [public] + method java.lang.String errorCode() [public] + +TYPE record ai.doctruth.ExtractionResult [public final] + record-components java.lang.Object value, java.util.Map citations, java.util.Map confidence, ai.doctruth.Provenance provenance + ctor ExtractionResult(java.lang.Object, java.util.Map, java.util.Map, ai.doctruth.Provenance) + method ai.doctruth.Citation citation(java.lang.String) [public] + method ai.doctruth.Citation requireCitation(java.lang.String) [public] + method ai.doctruth.Provenance provenance() [public] + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method java.lang.Object value() [public] + method java.lang.String toAuditJson() [public] + method 
java.lang.String toAuditJson(ai.doctruth.spi.SignatureProvider) [public] + method java.lang.String toString() [public final] + method java.util.Map citations() [public] + method java.util.Map confidence() [public] + method java.util.Optional findCitation(java.lang.String) [public] + method void toAuditJson(java.nio.file.Path) [public] + method void toAuditJson(java.nio.file.Path, ai.doctruth.spi.SignatureProvider) [public] + method void writeAudit(java.lang.String) [public] + method void writeAudit(java.nio.file.Path) [public] + +TYPE record ai.doctruth.FigureSection [public final] + record-components java.lang.String caption, ai.doctruth.SourceLocation location + ctor FigureSection(java.lang.String, ai.doctruth.SourceLocation) + method ai.doctruth.SourceLocation location() [public] + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method java.lang.String caption() [public] + method java.lang.String toString() [public final] + +TYPE class ai.doctruth.GeminiProvider [public] + ctor GeminiProvider(java.lang.String) + ctor GeminiProvider(java.lang.String, java.net.URI, java.lang.String) + method ai.doctruth.ProviderResponse complete(ai.doctruth.ProviderRequest) [public] + method java.lang.String apiKey() [public] + method java.lang.String name() [public] + +TYPE record ai.doctruth.Hierarchical [public final] + record-components int maxDepth + ctor Hierarchical(int) + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method int maxDepth() [public] + method java.lang.String assemble(ai.doctruth.ParsedDocument) [public] + method java.lang.String toString() [public final] + +TYPE class ai.doctruth.JsonExtractionBuilder [public final] + method ai.doctruth.ExtractionResult runJson(ai.doctruth.ParsedDocument) [public] + method ai.doctruth.JsonExtractionBuilder requireCitation(java.lang.String) [public] + method ai.doctruth.JsonExtractionBuilder withBitemporal() [public] + method 
ai.doctruth.JsonExtractionBuilder withConfidence() [public] + method ai.doctruth.JsonExtractionBuilder withContextStrategy(ai.doctruth.ContextStrategy) [public] + method ai.doctruth.JsonExtractionBuilder withMaxRetries(int) [public] + method ai.doctruth.JsonExtractionBuilder withProvenance() [public] + method ai.doctruth.JsonExtractionBuilder withSourcePublishedAt(java.time.Instant) [public] + +TYPE class ai.doctruth.JsonSchema [public final] + method ai.doctruth.JsonSchema from(java.lang.String) [public static] + method ai.doctruth.JsonSchema from(java.nio.file.Path) [public static] + method com.fasterxml.jackson.databind.JsonNode node() [public] + +TYPE interface ai.doctruth.LlmProvider [public abstract interface] + permits ai.doctruth.AnthropicProvider, ai.doctruth.OpenAiProvider, ai.doctruth.GeminiProvider, ai.doctruth.DeepSeekProvider + method ai.doctruth.ProviderResponse complete(ai.doctruth.ProviderRequest) [public abstract] + method java.lang.String name() [public abstract] + method java.util.Optional region() [public] + +TYPE class ai.doctruth.LlmProviders [public final] + method ai.doctruth.AnthropicProvider anthropic(java.lang.String) [public static] + method ai.doctruth.DeepSeekProvider deepSeek(java.lang.String) [public static] + method ai.doctruth.GeminiProvider gemini(java.lang.String) [public static] + method ai.doctruth.OpenAiProvider openAi(java.lang.String) [public static] + method ai.doctruth.OpenAiProvider openAiCompatible(java.lang.String, java.net.URI, java.lang.String) [public static] + +TYPE class ai.doctruth.OpenAiProvider [public] + ctor OpenAiProvider(java.lang.String) + ctor OpenAiProvider(java.lang.String, java.net.URI, java.lang.String) + method ai.doctruth.ProviderResponse complete(ai.doctruth.ProviderRequest) [public] + method java.lang.String apiKey() [public] + method java.lang.String model() [public] + method java.lang.String name() [public] + method java.net.URI endpoint() [public] + +TYPE enum ai.doctruth.OverBudgetPolicy 
[public final] + enum-constants STRICT, WARN_AND_INCLUDE + method ai.doctruth.OverBudgetPolicy valueOf(java.lang.String) [public static] + method ai.doctruth.OverBudgetPolicy[] values() [public static] + +TYPE class ai.doctruth.ParseException [public] + ctor ParseException(java.lang.String, java.lang.String, java.lang.String, java.util.OptionalInt) + ctor ParseException(java.lang.String, java.lang.String, java.lang.String, java.util.OptionalInt, java.lang.Throwable) + method java.lang.String errorCode() [public] + method java.lang.String sourceName() [public] + method java.util.OptionalInt pageNumber() [public] + +TYPE record ai.doctruth.ParsedDocument [public final] + record-components java.lang.String docId, java.util.List sections, ai.doctruth.DocumentMetadata metadata + ctor ParsedDocument(java.lang.String, java.util.List, ai.doctruth.DocumentMetadata) + method ai.doctruth.DocumentMetadata metadata() [public] + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method java.lang.String docId() [public] + method java.lang.String toString() [public final] + method java.util.List sections() [public] + +TYPE interface ai.doctruth.ParsedSection [public abstract interface] + permits ai.doctruth.TextSection, ai.doctruth.TableSection, ai.doctruth.FigureSection + +TYPE class ai.doctruth.PdfDocumentParser [public final] + method ai.doctruth.ParsedDocument parse(java.nio.file.Path) [public static] + +TYPE record ai.doctruth.PriorityTruncate [public final] + record-components java.util.List prioritySectionPatterns, int maxChars, ai.doctruth.OverBudgetPolicy onOverBudget + ctor PriorityTruncate(java.util.List, int, ai.doctruth.OverBudgetPolicy) + method ai.doctruth.OverBudgetPolicy onOverBudget() [public] + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method int maxChars() [public] + method java.lang.String assemble(ai.doctruth.ParsedDocument) [public] + method java.lang.String 
toString() [public final] + method java.util.List prioritySectionPatterns() [public] + +TYPE record ai.doctruth.Provenance [public final] + record-components java.lang.String model, java.lang.String modelVersion, java.time.Instant extractedAt, java.util.Optional sourcePublishedAt, ai.doctruth.ProvenanceDetails details + ctor Provenance(java.lang.String, java.lang.String, java.time.Instant, java.util.Optional, ai.doctruth.ProvenanceDetails) + ctor Provenance(java.lang.String, java.lang.String, java.time.Instant, java.util.Optional, java.util.Optional, java.util.Optional, int) + method ai.doctruth.ProvenanceDetails details() [public] + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method int retries() [public] + method java.lang.String model() [public] + method java.lang.String modelVersion() [public] + method java.lang.String toString() [public final] + method java.time.Instant extractedAt() [public] + method java.util.Optional region() [public] + method java.util.Optional retainUntil() [public] + method java.util.Optional sourcePublishedAt() [public] + +TYPE record ai.doctruth.ProvenanceDetails [public final] + record-components java.util.Optional region, java.util.Optional retainUntil, int retries + ctor ProvenanceDetails(java.util.Optional, java.util.Optional, int) + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method int retries() [public] + method java.lang.String toString() [public final] + method java.util.Optional region() [public] + method java.util.Optional retainUntil() [public] + +TYPE class ai.doctruth.ProviderException [public] + ctor ProviderException(java.lang.String, java.lang.String, java.lang.String, java.util.OptionalInt, boolean) + ctor ProviderException(java.lang.String, java.lang.String, java.lang.String, java.util.OptionalInt, boolean, java.lang.Throwable) + method boolean retryable() [public] + method java.lang.String errorCode() [public] + 
method java.lang.String providerName() [public] + method java.util.OptionalInt httpStatus() [public] + +TYPE record ai.doctruth.ProviderOptions [public final] + record-components int maxRetries, java.time.Duration timeout + ctor ProviderOptions(int, java.time.Duration) + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method int maxRetries() [public] + method java.lang.String toString() [public final] + method java.time.Duration timeout() [public] + +TYPE record ai.doctruth.ProviderRequest [public final] + record-components java.lang.String systemPrompt, java.lang.String userPrompt, com.fasterxml.jackson.databind.JsonNode responseSchema, ai.doctruth.ProviderOptions options + ctor ProviderRequest(java.lang.String, java.lang.String, com.fasterxml.jackson.databind.JsonNode, ai.doctruth.ProviderOptions) + method ai.doctruth.ProviderOptions options() [public] + method boolean equals(java.lang.Object) [public final] + method com.fasterxml.jackson.databind.JsonNode responseSchema() [public] + method int hashCode() [public final] + method java.lang.String systemPrompt() [public] + method java.lang.String toString() [public final] + method java.lang.String userPrompt() [public] + +TYPE record ai.doctruth.ProviderResponse [public final] + record-components java.lang.String rawJson, ai.doctruth.ProviderUsage usage + ctor ProviderResponse(java.lang.String, ai.doctruth.ProviderUsage) + method ai.doctruth.ProviderUsage usage() [public] + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method java.lang.String rawJson() [public] + method java.lang.String toString() [public final] + +TYPE record ai.doctruth.ProviderUsage [public final] + record-components int inputTokens, int outputTokens, java.lang.String modelVersion + ctor ProviderUsage(int, int, java.lang.String) + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method int inputTokens() 
[public] + method int outputTokens() [public] + method java.lang.String modelVersion() [public] + method java.lang.String toString() [public final] + +TYPE record ai.doctruth.SlidingWindow [public final] + record-components int windowChars, int overlapChars + ctor SlidingWindow(int, int) + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method int overlapChars() [public] + method int windowChars() [public] + method java.lang.String assemble(ai.doctruth.ParsedDocument) [public] + method java.lang.String toString() [public final] + +TYPE record ai.doctruth.SourceLocation [public final] + record-components int pageStart, int pageEnd, int lineStart, int lineEnd, int charOffset + ctor SourceLocation(int, int, int, int, int) + method boolean equals(java.lang.Object) [public final] + method int charOffset() [public] + method int hashCode() [public final] + method int lineEnd() [public] + method int lineStart() [public] + method int pageEnd() [public] + method int pageStart() [public] + method java.lang.String toString() [public final] + +TYPE record ai.doctruth.TableSection [public final] + record-components java.util.List rows, ai.doctruth.SourceLocation location + ctor TableSection(java.util.List, ai.doctruth.SourceLocation) + method ai.doctruth.SourceLocation location() [public] + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method java.lang.String toString() [public final] + method java.util.List rows() [public] + +TYPE record ai.doctruth.TextSection [public final] + record-components java.lang.String text, ai.doctruth.SourceLocation location, ai.doctruth.BlockKind kind, java.util.Optional boundingBox + ctor TextSection(java.lang.String, ai.doctruth.SourceLocation) + ctor TextSection(java.lang.String, ai.doctruth.SourceLocation, ai.doctruth.BlockKind) + ctor TextSection(java.lang.String, ai.doctruth.SourceLocation, ai.doctruth.BlockKind, java.util.Optional) + method 
ai.doctruth.BlockKind kind() [public] + method ai.doctruth.SourceLocation location() [public] + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method java.lang.String text() [public] + method java.lang.String toString() [public final] + method java.util.Optional boundingBox() [public] + +TYPE class ai.doctruth.XlsxDocumentParser [public final] + method ai.doctruth.ParsedDocument parse(java.nio.file.Path) [public static] + +TYPE record ai.doctruth.spi.AuditEvent [public final] + record-components java.lang.String kind, java.time.Instant at, java.util.Map attributes + ctor AuditEvent(java.lang.String, java.time.Instant, java.util.Map) + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method java.lang.String kind() [public] + method java.lang.String toString() [public final] + method java.time.Instant at() [public] + method java.util.Map attributes() [public] + +TYPE interface ai.doctruth.spi.AuditEventListener [public abstract interface] + method void onEvent(ai.doctruth.spi.AuditEvent) [public abstract] + +TYPE record ai.doctruth.spi.OcrBox [public final] + record-components int x, int y, int width, int height + ctor OcrBox(int, int, int, int) + method boolean equals(java.lang.Object) [public final] + method int hashCode() [public final] + method int height() [public] + method int width() [public] + method int x() [public] + method int y() [public] + method java.lang.String toString() [public final] + +TYPE interface ai.doctruth.spi.OcrEngine [public abstract interface] + method ai.doctruth.spi.OcrPageResult ocr(java.awt.image.BufferedImage, int) [public abstract] + +TYPE record ai.doctruth.spi.OcrPageResult [public final] + record-components java.lang.String text, double confidence, java.util.List regions, int pageNumber + ctor OcrPageResult(java.lang.String, double, java.util.List, int) + method ai.doctruth.spi.OcrPageResult empty(int) [public static] + method boolean 
equals(java.lang.Object) [public final] + method double confidence() [public] + method int hashCode() [public final] + method int pageNumber() [public] + method java.lang.String text() [public] + method java.lang.String toString() [public final] + method java.util.List regions() [public] + +TYPE record ai.doctruth.spi.OcrRegion [public final] + record-components java.lang.String text, ai.doctruth.spi.OcrBox box, double confidence + ctor OcrRegion(java.lang.String, ai.doctruth.spi.OcrBox, double) + ctor OcrRegion(java.lang.String, int, int, int, int, double) + method ai.doctruth.spi.OcrBox box() [public] + method boolean equals(java.lang.Object) [public final] + method double confidence() [public] + method int hashCode() [public final] + method int height() [public] + method int width() [public] + method int x() [public] + method int y() [public] + method java.lang.String text() [public] + method java.lang.String toString() [public final] + +TYPE interface ai.doctruth.spi.SignatureProvider [public abstract interface] + method java.lang.String sign(java.lang.String) [public abstract] +