From 84591371c7e1e51c573498dafc8178e23e9ce71b Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 24 Jun 2026 14:48:20 +0530 Subject: [PATCH 1/4] Add deterministic evidence anchor command Signed-off-by: docushell-admin --- .../test_milestone_d_internal_contracts.py | 5 + CHANGELOG.md | 1 + Cargo.lock | 1 + README.md | 11 + crates/ethos-cli/src/cmd/evidence.rs | 67 ++ crates/ethos-cli/src/cmd/mod.rs | 1 + crates/ethos-cli/src/main.rs | 29 + crates/ethos-cli/tests/evidence_anchor.rs | 377 +++++++++++ crates/ethos-core/src/evidence_anchor.rs | 324 ++++++++++ crates/ethos-core/src/lib.rs | 2 + crates/ethos-verify/Cargo.toml | 1 + crates/ethos-verify/src/lib.rs | 596 ++++++++++++++++++ ...terministic-evidence-anchoring-boundary.md | 45 ++ docs/decisions/README.md | 1 + schemas/README.md | 2 + .../ethos-evidence-anchor-report.schema.json | 130 ++++ .../ethos-evidence-anchor-request.schema.json | 77 +++ ...idence-anchor-report-negative.example.json | 36 ++ .../evidence-anchor-report.example.json | 36 ++ .../evidence-anchor-request.example.json | 20 + schemas/validate_examples.py | 7 + 21 files changed, 1769 insertions(+) create mode 100644 crates/ethos-cli/src/cmd/evidence.rs create mode 100644 crates/ethos-cli/tests/evidence_anchor.rs create mode 100644 crates/ethos-core/src/evidence_anchor.rs create mode 100644 docs/decisions/ADR-0012-deterministic-evidence-anchoring-boundary.md create mode 100644 schemas/ethos-evidence-anchor-report.schema.json create mode 100644 schemas/ethos-evidence-anchor-request.schema.json create mode 100644 schemas/examples/evidence-anchor-report-negative.example.json create mode 100644 schemas/examples/evidence-anchor-report.example.json create mode 100644 schemas/examples/evidence-anchor-request.example.json diff --git a/.github/scripts/test_milestone_d_internal_contracts.py b/.github/scripts/test_milestone_d_internal_contracts.py index 153660cf..c8ecd34e 100644 --- a/.github/scripts/test_milestone_d_internal_contracts.py +++ 
b/.github/scripts/test_milestone_d_internal_contracts.py @@ -380,9 +380,14 @@ def discovered_d_contract_schemas() -> list[str]: def discovered_d_request_envelope_schemas() -> list[str]: + current_d_request_names = { + "ethos-crop-element-request.schema.json", + "ethos-sandbox-subprocess-request.schema.json", + } return sorted( str(path.relative_to(ROOT)) for path in (ROOT / "schemas").glob("ethos-*-request.schema.json") + if path.name in current_d_request_names ) diff --git a/CHANGELOG.md b/CHANGELOG.md index 207a5cd0..65b138b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## Unreleased +- boundary-exception: add source-only `ethos evidence anchor` schema and CLI surface for deterministic evidence refs; no hosted, production, Windows, bundled PDFium, benchmark, parser-quality, table-quality, or release-posture boundary change. - boundary-exception: refresh patch `0.1.1` execution status for published evaluation surfaces while retaining hosted, production, Windows, bundled PDFium, benchmark, `ethos-doc`, and `ethos-rag` blockers. - boundary-exception: document bounded patch `0.1.1` public install paths for published evaluation surfaces while retaining hosted, production, Windows, bundled PDFium, benchmark, `ethos-doc`, and `ethos-rag` blockers. - boundary-exception: close patch `0.1.1` Python PyPI publication with exact registry evidence; no public install wording, hosted, production, Windows, bundled PDFium, benchmark, `ethos-doc`, or `ethos-rag` boundary change. 
diff --git a/Cargo.lock b/Cargo.lock index 04bc0a17..b6dffd69 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -258,6 +258,7 @@ dependencies = [ "ethos-doc-core", "serde", "serde_json", + "sha2", ] [[package]] diff --git a/README.md b/README.md index 5eaaba7b..8a94f715 100644 --- a/README.md +++ b/README.md @@ -211,6 +211,17 @@ The command exits `0` and writes a verification report shaped like this: } ``` +## Evidence anchoring + +Ethos can check whether caller-provided evidence refs bind to source document evidence. +This is deterministic source tracing, not semantic validation of an answer. + +```bash +./target/debug/ethos evidence anchor schemas/examples/document.example.json \ + --evidence-refs schemas/examples/evidence-anchor-request.example.json \ + --out /tmp/ethos-evidence-anchor-report.json +``` + ## Try the alpha verification loop From a source checkout, the current verification loop is: diff --git a/crates/ethos-cli/src/cmd/evidence.rs b/crates/ethos-cli/src/cmd/evidence.rs new file mode 100644 index 00000000..fd6fd7fc --- /dev/null +++ b/crates/ethos-cli/src/cmd/evidence.rs @@ -0,0 +1,67 @@ +/* + * Copyright 2026 The Ethos maintainers + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +use ethos_core::error::EthosError; +use ethos_core::evidence_anchor::{EvidenceAnchorReport, EvidenceAnchorRequest}; +use ethos_grounding_opendataloader_json::OdlJsonSource; + +use crate::{ + default_max_input_bytes, read_document, read_file_limited, write_output, EvidenceAnchorArgs, + Failure, +}; + +pub(crate) fn evidence_anchor(args: EvidenceAnchorArgs) -> Result<(), Failure> { + let max_input_bytes = default_max_input_bytes(); + let request_bytes = read_file_limited(&args.evidence_refs, max_input_bytes)?; + let request: EvidenceAnchorRequest = serde_json::from_slice(&request_bytes).map_err(|_| { + Failure::Usage("evidence refs file does not match the evidence anchor request shape".into()) + })?; + + let report = match args.grounding.as_str() { + "ethos-json" => { + let doc = read_document(&args.input)?; + ethos_verify::anchor_evidence(&doc, request) + .map_err(|error| Failure::Usage(error.to_string()))? + } + "opendataloader-json" => { + let bytes = read_file_limited(&args.input, max_input_bytes)?; + let text = String::from_utf8(bytes) + .map_err(|_| Failure::Usage("grounding input is not UTF-8".to_string()))?; + let source = OdlJsonSource::from_json_str(&text) + .map_err(|e| Failure::Usage(format!("opendataloader-json adapter: {e}")))?; + ethos_verify::anchor_evidence(&source, request) + .map_err(|error| Failure::Usage(error.to_string()))? 
+ } + other => { + return Err(Failure::Usage(format!( + "unknown grounding adapter '{other}' (available: ethos-json, opendataloader-json)" + ))); + } + }; + + write_anchor_report(args.out, &report) +} + +fn write_anchor_report( + out: Option, + report: &EvidenceAnchorReport, +) -> Result<(), Failure> { + let value = serde_json::to_value(report).map_err(|e| EthosError::internal(e.to_string()))?; + let mut bytes = + ethos_core::c14n::c14n_bytes(&value).map_err(|e| EthosError::internal(e.message))?; + bytes.push(b'\n'); + write_output(out, &bytes) +} diff --git a/crates/ethos-cli/src/cmd/mod.rs b/crates/ethos-cli/src/cmd/mod.rs index 8ecd402d..9f7c3141 100644 --- a/crates/ethos-cli/src/cmd/mod.rs +++ b/crates/ethos-cli/src/cmd/mod.rs @@ -18,6 +18,7 @@ pub(crate) mod crop; pub(crate) mod crop_artifacts; pub(crate) mod doc; pub(crate) mod doctor; +pub(crate) mod evidence; pub(crate) mod rag; pub(crate) mod security; pub(crate) mod verify; diff --git a/crates/ethos-cli/src/main.rs b/crates/ethos-cli/src/main.rs index 4e85c910..ca7232b0 100644 --- a/crates/ethos-cli/src/main.rs +++ b/crates/ethos-cli/src/main.rs @@ -74,6 +74,11 @@ enum Command { #[command(subcommand)] command: SecurityCommand, }, + /// Deterministic evidence anchoring + Evidence { + #[command(subcommand)] + command: EvidenceCommand, + }, /// Citation evidence verification (ethos-verify) Verify(VerifyArgs), /// Recompute and check a document fingerprint @@ -214,6 +219,12 @@ enum SecurityCommand { Report(SecurityReportArgs), } +#[derive(Subcommand)] +enum EvidenceCommand { + /// Check caller-provided evidence refs against source evidence + Anchor(EvidenceAnchorArgs), +} + #[derive(Args)] pub(crate) struct RagChunkArgs { /// Canonical document (`*.ethos.json`) @@ -232,6 +243,21 @@ pub(crate) struct SecurityReportArgs { pub(crate) out: Option, } +#[derive(Args)] +pub(crate) struct EvidenceAnchorArgs { + /// Grounding input: canonical Ethos document, or foreign output with --grounding + pub(crate) input: 
PathBuf, + /// Evidence refs request JSON. + #[arg(long)] + pub(crate) evidence_refs: PathBuf, + /// Grounding adapter id: ethos-json or opendataloader-json. + #[arg(long, default_value = "ethos-json")] + pub(crate) grounding: String, + /// Output path for evidence_anchor_report.json (default: stdout) + #[arg(long)] + pub(crate) out: Option, +} + #[derive(Args)] pub(crate) struct VerifyArgs { /// Grounding input: canonical Ethos document, or foreign output with --grounding @@ -363,6 +389,9 @@ fn run(cli: Cli) -> Result<(), Failure> { Command::Security { command: SecurityCommand::Report(args), } => cmd::security::security_report(args), + Command::Evidence { + command: EvidenceCommand::Anchor(args), + } => cmd::evidence::evidence_anchor(args), Command::Verify(args) => cmd::verify::verify(args), Command::Fingerprint(args) => cmd::doc::fingerprint(args), Command::Doctor(args) => cmd::doctor::doctor(args), diff --git a/crates/ethos-cli/tests/evidence_anchor.rs b/crates/ethos-cli/tests/evidence_anchor.rs new file mode 100644 index 00000000..896c7b5f --- /dev/null +++ b/crates/ethos-cli/tests/evidence_anchor.rs @@ -0,0 +1,377 @@ +/* + * Copyright 2026 The Ethos maintainers + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +use std::path::{Path, PathBuf}; +use std::process::{Command, Output}; +use std::time::{SystemTime, UNIX_EPOCH}; + +use serde_json::Value; + +fn ethos_bin() -> &'static str { + env!("CARGO_BIN_EXE_ethos") +} + +fn repo_root() -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")).join("../..") +} + +fn document_example() -> PathBuf { + repo_root().join("schemas/examples/document.example.json") +} + +fn opendataloader_example() -> PathBuf { + repo_root().join("examples/verify/opendataloader.json") +} + +fn run_ethos(args: &[&str]) -> Output { + Command::new(ethos_bin()) + .args(args) + .output() + .expect("ethos command runs") +} + +fn parse_success(args: &[&str]) -> Value { + let output = run_ethos(args); + assert!( + output.status.success(), + "ethos failed\nstatus: {:?}\nstderr:\n{}\nstdout:\n{}", + output.status.code(), + String::from_utf8_lossy(&output.stderr), + String::from_utf8_lossy(&output.stdout) + ); + assert_eq!(output.stderr, b""); + serde_json::from_slice(&output.stdout).expect("stdout is JSON") +} + +fn temp_json(name: &str, value: Value) -> PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("clock after unix epoch") + .as_nanos(); + let path = std::env::temp_dir().join(format!("ethos-evidence-anchor-{name}-{nanos}.json")); + std::fs::write( + &path, + serde_json::to_string(&value).expect("temp JSON serializes"), + ) + .expect("temp JSON is writable"); + path +} + +fn request(evidence_refs: Value) -> PathBuf { + temp_json( + "request", + serde_json::json!({ + "artifact_type": "ethos.evidence_anchor_request.v1", + "schema_version": "1.0.0", + "evidence_refs": evidence_refs + }), + ) +} + +fn request_with_fingerprint(source_fingerprint: &str, evidence_refs: Value) -> PathBuf { + temp_json( + "request", + serde_json::json!({ + "artifact_type": "ethos.evidence_anchor_request.v1", + "schema_version": "1.0.0", + "source_fingerprint": source_fingerprint, + "evidence_refs": evidence_refs + }), + ) +} + +#[test] +fn 
help_lists_evidence_anchor() { + let output = run_ethos(&["--help"]); + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("evidence"), "{stdout}"); + + let output = run_ethos(&["evidence", "--help"]); + assert!(output.status.success()); + let stdout = String::from_utf8_lossy(&output.stdout); + assert!(stdout.contains("anchor"), "{stdout}"); +} + +#[test] +fn missing_evidence_refs_exits_usage() { + let output = run_ethos(&["evidence", "anchor", document_example().to_str().unwrap()]); + assert_eq!(output.status.code(), Some(2)); + assert!(output.stdout.is_empty()); +} + +#[test] +fn empty_refs_succeeds_with_empty_anchors() { + let request = request(serde_json::json!([])); + let report = parse_success(&[ + "evidence", + "anchor", + document_example().to_str().unwrap(), + "--evidence-refs", + request.to_str().unwrap(), + ]); + assert_eq!(report["artifact_type"], "ethos.evidence_anchor_report.v1"); + assert_eq!(report["anchors"], serde_json::json!([])); + assert_eq!(report["grounding"]["parser"]["adapter"], Value::Null); +} + +#[test] +fn native_page_text_bbox_and_table_cell_bind() { + let request = request_with_fingerprint( + "sha256:b5d30710d0c25cc38d8dec924ecaf57ae4f81276dd5dc14d75cb3b5b6bde62d3", + serde_json::json!([ + { + "evidence_id": "ev_page", + "evidence_kind": "page", + "required_anchor_level": "page", + "locator": { "page_index": 1 } + }, + { + "evidence_id": "ev_text", + "evidence_kind": "text", + "required_anchor_level": "text", + "locator": { "page_index": 1 }, + "expected_text": "Revenue grew to $12.4M in Q3 2025, driven by enterprise expansion.", + "expected_text_sha256": "sha256:49f675142f930e31d75679b14e23f7c639cb9903029eacb3235c709c14fe4be5", + "text_normalization_profile": "ethos_collapse_whitespace_v1" + }, + { + "evidence_id": "ev_text_bbox", + "evidence_kind": "text_region", + "required_anchor_level": "text_bbox", + "locator": { + "page_index": 1, + "bbox": [7200, 10100, 54000, 
11500], + "coordinate_profile": "ethos_quantized_top_left_v1" + }, + "expected_text": "Revenue grew to $12.4M in Q3 2025" + }, + { + "evidence_id": "ev_cell", + "evidence_kind": "table_cell", + "required_anchor_level": "table_cell", + "locator": { + "table_id": "t0001", + "cell": { "row": 1, "col": 1 } + }, + "expected_text": "$12.4M" + } + ]), + ); + let report = parse_success(&[ + "evidence", + "anchor", + document_example().to_str().unwrap(), + "--evidence-refs", + request.to_str().unwrap(), + ]); + let anchors = report["anchors"].as_array().unwrap(); + assert_eq!(anchors.len(), 4); + for anchor in anchors { + assert_eq!(anchor["anchor_status"], "bound"); + } + assert_eq!(anchors[0]["achieved_anchor_level"], "page"); + assert_eq!(anchors[1]["achieved_anchor_level"], "text"); + assert_eq!(anchors[2]["achieved_anchor_level"], "text_bbox"); + assert_eq!(anchors[3]["achieved_anchor_level"], "table_cell"); +} + +#[test] +fn non_bound_outcomes_still_exit_zero() { + let request = request_with_fingerprint( + "sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + serde_json::json!([ + { + "evidence_id": "ev_stale", + "evidence_kind": "text", + "required_anchor_level": "text", + "locator": { "page_index": 1 }, + "expected_text": "Revenue" + }, + { + "evidence_id": "ev_region", + "evidence_kind": "region", + "required_anchor_level": "bbox", + "locator": { + "page_index": 1, + "bbox": [1, 2, 3, 4], + "coordinate_profile": "ethos_quantized_top_left_v1" + } + } + ]), + ); + let report = parse_success(&[ + "evidence", + "anchor", + document_example().to_str().unwrap(), + "--evidence-refs", + request.to_str().unwrap(), + ]); + assert_eq!(report["anchors"][0]["anchor_status"], "stale_fingerprint"); + assert_eq!(report["anchors"][0]["checks"]["page"], "not_checked"); + assert_eq!( + report["anchors"][1]["anchor_status"], + "unsupported_evidence_kind" + ); +} + +#[test] +fn opendataloader_text_binds_and_bbox_is_capability_limited() { + let request = 
request(serde_json::json!([ + { + "evidence_id": "odl_text", + "evidence_kind": "text", + "required_anchor_level": "text", + "locator": { "page_index": 1 }, + "expected_text": "Revenue grew to $12.4M in Q3 2025." + }, + { + "evidence_id": "odl_bbox", + "evidence_kind": "text_region", + "required_anchor_level": "text_bbox", + "locator": { + "page_index": 1, + "bbox": [72, 101, 540, 115], + "coordinate_profile": "ethos_quantized_top_left_v1" + }, + "expected_text": "Revenue grew to $12.4M in Q3 2025." + }, + { + "evidence_id": "odl_cell", + "evidence_kind": "table_cell", + "required_anchor_level": "table_cell", + "locator": { + "table_id": "odl-t1", + "cell": { "row": 1, "col": 1 } + }, + "expected_text": "$12.4M" + } + ])); + let report = parse_success(&[ + "evidence", + "anchor", + opendataloader_example().to_str().unwrap(), + "--grounding", + "opendataloader-json", + "--evidence-refs", + request.to_str().unwrap(), + ]); + assert_eq!(report["anchors"][0]["anchor_status"], "bound"); + assert_eq!(report["anchors"][1]["anchor_status"], "capability_limited"); + assert_eq!(report["anchors"][1]["achieved_anchor_level"], "text"); + assert_eq!(report["anchors"][2]["anchor_status"], "bound"); +} + +#[test] +fn usage_errors_are_exit_two() { + let cases = [ + request(serde_json::json!([ + { + "evidence_id": "ev", + "evidence_kind": "page", + "required_anchor_level": "page", + "locator": { "page_index": 0 } + } + ])), + request(serde_json::json!([ + { + "evidence_id": "ev", + "evidence_kind": "text_region", + "required_anchor_level": "text_bbox", + "locator": { "page_index": 1, "bbox": [1, 2, 3, 4] }, + "expected_text": "x" + } + ])), + request(serde_json::json!([ + { + "evidence_id": "ev", + "evidence_kind": "page", + "required_anchor_level": "page", + "locator": { "page_index": 1, "page_id": "p0001" } + } + ])), + request(serde_json::json!([ + { + "evidence_id": "ev", + "evidence_kind": "page", + "required_anchor_level": "text_bbox", + "locator": { "page_index": 1 } + } + 
])), + ]; + for request in cases { + let output = run_ethos(&[ + "evidence", + "anchor", + document_example().to_str().unwrap(), + "--evidence-refs", + request.to_str().unwrap(), + ]); + assert_eq!(output.status.code(), Some(2)); + assert!(output.stdout.is_empty()); + } +} + +#[test] +fn unknown_grounding_and_source_shape_exit_two() { + let request = request(serde_json::json!([])); + let output = run_ethos(&[ + "evidence", + "anchor", + document_example().to_str().unwrap(), + "--grounding", + "unknown", + "--evidence-refs", + request.to_str().unwrap(), + ]); + assert_eq!(output.status.code(), Some(2)); + + let bad_source = temp_json("bad-source", serde_json::json!({"not": "a source"})); + let output = run_ethos(&[ + "evidence", + "anchor", + bad_source.to_str().unwrap(), + "--evidence-refs", + request.to_str().unwrap(), + ]); + assert_eq!(output.status.code(), Some(2)); +} + +#[test] +fn repeated_input_is_byte_identical() { + let request = request(serde_json::json!([ + { + "evidence_id": "ev_text", + "evidence_kind": "text", + "required_anchor_level": "text", + "locator": { "page_index": 1 }, + "expected_text": "Revenue grew to $12.4M in Q3 2025" + } + ])); + let document = document_example(); + let args = [ + "evidence", + "anchor", + document.to_str().unwrap(), + "--evidence-refs", + request.to_str().unwrap(), + ]; + let first = run_ethos(&args); + let second = run_ethos(&args); + assert!(first.status.success()); + assert!(second.status.success()); + assert_eq!(first.stdout, second.stdout); +} diff --git a/crates/ethos-core/src/evidence_anchor.rs b/crates/ethos-core/src/evidence_anchor.rs new file mode 100644 index 00000000..79bfb1e0 --- /dev/null +++ b/crates/ethos-core/src/evidence_anchor.rs @@ -0,0 +1,324 @@ +/* + * Copyright 2026 The Ethos maintainers + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//! Evidence-anchor request/report schema types. +//! +//! Evidence anchoring is a deterministic source-tracing primitive: caller-provided +//! evidence refs are checked against a [`crate::grounding::GroundingSource`]. +//! It does not perform semantic answer verification. + +use serde::{Deserialize, Serialize}; + +use crate::grounding::{Capabilities, ParserIdentity}; +use crate::verify_types::CapabilityLimit; + +/// Request artifact type for evidence anchoring. +pub const EVIDENCE_ANCHOR_REQUEST_ARTIFACT_TYPE: &str = "ethos.evidence_anchor_request.v1"; +/// Report artifact type for evidence anchoring. +pub const EVIDENCE_ANCHOR_REPORT_ARTIFACT_TYPE: &str = "ethos.evidence_anchor_report.v1"; + +/// Evidence-anchor request envelope. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct EvidenceAnchorRequest { + /// Artifact type identity. + pub artifact_type: String, + /// Schema version. + pub schema_version: String, + /// Optional source fingerprint the evidence refs were produced against. + #[serde(skip_serializing_if = "Option::is_none")] + pub source_fingerprint: Option, + /// Caller-provided evidence refs in deterministic input order. + pub evidence_refs: Vec, +} + +/// One caller-provided evidence reference. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct EvidenceRef { + /// Caller correlation key. Unique within one request. + pub evidence_id: String, + /// Evidence kind. 
+ pub evidence_kind: EvidenceKind, + /// Minimum anchor level required by the caller. + pub required_anchor_level: AnchorLevel, + /// Source locator. + pub locator: EvidenceLocator, + /// Expected text, when text matching is required. + #[serde(skip_serializing_if = "Option::is_none")] + pub expected_text: Option, + /// SHA-256 of normalized expected text, when supplied by the caller. + #[serde(skip_serializing_if = "Option::is_none")] + pub expected_text_sha256: Option, + /// Text normalization profile for expected text. + #[serde(skip_serializing_if = "Option::is_none")] + pub text_normalization_profile: Option, +} + +/// Supported and accepted evidence kinds. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum EvidenceKind { + /// Page existence. + Page, + /// Text evidence. + Text, + /// Text and/or region evidence. + TextRegion, + /// Table cell evidence. + TableCell, + /// Accepted but unsupported in v1. + Region, + /// Accepted but unsupported in v1. + Other, +} + +/// Required or achieved anchor level. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum AnchorLevel { + /// No anchor. + None, + /// Page anchor. + Page, + /// Text anchor. + Text, + /// Bounding-box anchor. + Bbox, + /// Text plus bounding-box anchor. + TextBbox, + /// Table-cell anchor. + TableCell, +} + +/// Source locator for an evidence ref. +#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct EvidenceLocator { + /// 1-based parser-neutral page index. + #[serde(skip_serializing_if = "Option::is_none")] + pub page_index: Option, + /// Parser-specific page id. + #[serde(skip_serializing_if = "Option::is_none")] + pub page_id: Option, + /// Parser-specific element id. + #[serde(skip_serializing_if = "Option::is_none")] + pub element_id: Option, + /// Parser-specific span id. 
+ #[serde(skip_serializing_if = "Option::is_none")] + pub span_id: Option, + /// Source bbox `[x0, y0, x1, y1]` in integer quanta. + #[serde(skip_serializing_if = "Option::is_none")] + pub bbox: Option<[i64; 4]>, + /// Parser-specific table id. + #[serde(skip_serializing_if = "Option::is_none")] + pub table_id: Option, + /// Table cell address. + #[serde(skip_serializing_if = "Option::is_none")] + pub cell: Option, + /// Coordinate profile for bbox locators. + #[serde(skip_serializing_if = "Option::is_none")] + pub coordinate_profile: Option, +} + +/// 0-based table cell address. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct AnchorCellRef { + /// Row index. + pub row: u32, + /// Column index. + pub col: u32, +} + +/// Supported text normalization profiles. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum TextNormalizationProfile { + /// Collapse ASCII whitespace, matching the existing verifier normalization. + EthosCollapseWhitespaceV1, +} + +/// Supported coordinate profiles. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum CoordinateProfile { + /// Ethos integer quanta with top-left origin. + EthosQuantizedTopLeftV1, +} + +/// Evidence-anchor report envelope. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct EvidenceAnchorReport { + /// Artifact type identity. + pub artifact_type: String, + /// Schema version. + pub schema_version: String, + /// Source fingerprint, when declared by the grounding source. + #[serde(skip_serializing_if = "Option::is_none")] + pub source_fingerprint: Option, + /// Grounding metadata reused from existing verification reports. + pub grounding: EvidenceAnchorGrounding, + /// Per-ref anchor outcomes. + pub anchors: Vec, +} + +/// Grounding metadata embedded in evidence-anchor reports. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct EvidenceAnchorGrounding { + /// Producing parser identity. + pub parser: ParserIdentity, + /// Declared source capabilities. + pub capabilities: Capabilities, +} + +/// One evidence anchor outcome. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct EvidenceAnchor { + /// Caller correlation key. + pub evidence_id: String, + /// Evidence kind. + pub evidence_kind: EvidenceKind, + /// Rollup status. + pub anchor_status: AnchorStatus, + /// Required level from the request. + pub required_anchor_level: AnchorLevel, + /// Best deterministic level achieved. + pub achieved_anchor_level: AnchorLevel, + /// Per-axis checks. + pub checks: AnchorChecks, + /// Capability limits that affected this anchor. + pub capability_limits: Vec, +} + +/// Rollup status for one evidence anchor. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum AnchorStatus { + /// Required evidence bound to source evidence. + Bound, + /// A located target failed the expected content/location check. + Mismatch, + /// Required source target was not found. + NotFound, + /// Request/source fingerprints differ. + StaleFingerprint, + /// The source lacks a capability needed to decide the required anchor. + CapabilityLimited, + /// The evidence kind is accepted but unsupported in v1. + UnsupportedEvidenceKind, +} + +/// Per-axis evidence-anchor checks. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub struct AnchorChecks { + /// Fingerprint check. + pub fingerprint: FingerprintCheck, + /// Page check. + pub page: PageCheck, + /// Text check. + pub text: TextCheck, + /// Bbox check. + pub bbox: BboxCheck, + /// Table-cell check. 
+ pub table_cell: TableCellCheck, +} + +impl Default for AnchorChecks { + fn default() -> Self { + AnchorChecks { + fingerprint: FingerprintCheck::NotChecked, + page: PageCheck::NotChecked, + text: TextCheck::NotChecked, + bbox: BboxCheck::NotChecked, + table_cell: TableCellCheck::NotChecked, + } + } +} + +/// Fingerprint axis result. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum FingerprintCheck { + /// Fingerprints match. + Matched, + /// Fingerprints differ. + Stale, + /// Not checked. + NotChecked, + /// Source cannot declare a fingerprint. + CapabilityLimited, +} + +/// Page axis result. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum PageCheck { + /// Page was found. + Found, + /// Page was not found. + NotFound, + /// Not checked. + NotChecked, +} + +/// Text axis result. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum TextCheck { + /// Text matched. + Matched, + /// Located text mismatched. + Mismatch, + /// Text target was not found. + NotFound, + /// Not checked. + NotChecked, + /// Source lacks required text capability. + CapabilityLimited, +} + +/// Bbox axis result. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum BboxCheck { + /// Bbox is valid. + Valid, + /// Located bbox mismatched. + Invalid, + /// Bbox target was not found. + NotFound, + /// Not checked. + NotChecked, + /// Source lacks required coordinate capability. + CapabilityLimited, +} + +/// Table-cell axis result. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum TableCellCheck { + /// Table cell matched. + Matched, + /// Located table cell mismatched. + Mismatch, + /// Table cell was not found. 
+ NotFound, + /// Not checked. + NotChecked, + /// Source lacks table capability. + CapabilityLimited, +} diff --git a/crates/ethos-core/src/lib.rs b/crates/ethos-core/src/lib.rs index d301e4f3..7ca754be 100644 --- a/crates/ethos-core/src/lib.rs +++ b/crates/ethos-core/src/lib.rs @@ -41,6 +41,8 @@ pub mod grounding; #[cfg(feature = "verify-types")] pub mod codes; #[cfg(feature = "verify-types")] +pub mod evidence_anchor; +#[cfg(feature = "verify-types")] pub mod verify_types; #[cfg(feature = "full")] diff --git a/crates/ethos-verify/Cargo.toml b/crates/ethos-verify/Cargo.toml index ad445eef..7025b1d6 100644 --- a/crates/ethos-verify/Cargo.toml +++ b/crates/ethos-verify/Cargo.toml @@ -27,6 +27,7 @@ publication_status = "approved_for_crates_io_publication" # the canonical model, never backend traits, never ethos-pdf. ethos-core = { workspace = true, features = ["grounding", "verify-types"] } serde = { workspace = true } +sha2 = { workspace = true } [dev-dependencies] serde_json = { workspace = true } diff --git a/crates/ethos-verify/src/lib.rs b/crates/ethos-verify/src/lib.rs index ae521250..06e5855c 100644 --- a/crates/ethos-verify/src/lib.rs +++ b/crates/ethos-verify/src/lib.rs @@ -35,6 +35,12 @@ use std::collections::BTreeMap; use ethos_core::codes::WarningCode; +use ethos_core::evidence_anchor::{ + AnchorChecks, AnchorLevel, AnchorStatus, BboxCheck, CoordinateProfile, EvidenceAnchor, + EvidenceAnchorGrounding, EvidenceAnchorReport, EvidenceAnchorRequest, EvidenceKind, + EvidenceRef, FingerprintCheck, PageCheck, TableCellCheck, TextCheck, TextNormalizationProfile, + EVIDENCE_ANCHOR_REPORT_ARTIFACT_TYPE, +}; use ethos_core::grounding::{ CoordinateOrigin, GroundingCell, GroundingElement, GroundingSource, GroundingSpan, GroundingTable, PageGeometry, @@ -45,6 +51,7 @@ use ethos_core::verify_types::{ VerificationReport, }; use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; /// Citation input accepted by the alpha verifier. 
/// Validation or source-shape error for evidence anchoring.
///
/// The error carries a single human-readable message. Its `Display`
/// implementation writes that message verbatim, with no prefix or suffix.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EvidenceAnchorError {
    message: String,
}

impl EvidenceAnchorError {
    /// Build an error from anything convertible into an owned `String`
    /// (string literals, `String`, `format!` output, ...).
    fn new(message: impl Into<String>) -> Self {
        Self {
            message: message.into(),
        }
    }
}

impl std::fmt::Display for EvidenceAnchorError {
    /// Write the stored message unchanged.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(&self.message)
    }
}

// No `source()` override: the error wraps no underlying cause.
impl std::error::Error for EvidenceAnchorError {}
&request.evidence_refs { + if !ids.insert(evidence_ref.evidence_id.as_str()) { + return Err(EvidenceAnchorError::new(format!( + "duplicate evidence_id '{}'", + evidence_ref.evidence_id + ))); + } + validate_evidence_ref(evidence_ref)?; + } + Ok(()) +} + +fn validate_evidence_ref(evidence_ref: &EvidenceRef) -> Result<(), EvidenceAnchorError> { + if evidence_ref.locator.page_index == Some(0) { + return Err(EvidenceAnchorError::new("page_index must be 1-based")); + } + if evidence_ref.locator.page_index.is_some() && evidence_ref.locator.page_id.is_some() { + return Err(EvidenceAnchorError::new( + "use exactly one of page_index or page_id", + )); + } + if evidence_ref.locator.bbox.is_some() + && evidence_ref.locator.coordinate_profile + != Some(CoordinateProfile::EthosQuantizedTopLeftV1) + { + return Err(EvidenceAnchorError::new( + "bbox requires coordinate_profile ethos_quantized_top_left_v1", + )); + } + if evidence_ref.expected_text_sha256.is_some() { + let Some(expected_text) = evidence_ref.expected_text.as_deref() else { + return Err(EvidenceAnchorError::new( + "expected_text_sha256 requires expected_text", + )); + }; + if evidence_ref.text_normalization_profile + != Some(TextNormalizationProfile::EthosCollapseWhitespaceV1) + { + return Err(EvidenceAnchorError::new( + "expected_text_sha256 requires text_normalization_profile ethos_collapse_whitespace_v1", + )); + } + let expected_hash = format!( + "sha256:{}", + sha256_hex(normalize_expected_text(expected_text).as_bytes()) + ); + if evidence_ref.expected_text_sha256.as_deref() != Some(expected_hash.as_str()) { + return Err(EvidenceAnchorError::new( + "expected_text_sha256 does not match normalized expected_text", + )); + } + } + match evidence_ref.evidence_kind { + EvidenceKind::Page if evidence_ref.required_anchor_level != AnchorLevel::Page => { + return Err(EvidenceAnchorError::new( + "page evidence supports only required_anchor_level page", + )); + } + EvidenceKind::Text if evidence_ref.required_anchor_level != 
AnchorLevel::Text => { + return Err(EvidenceAnchorError::new( + "text evidence supports only required_anchor_level text", + )); + } + EvidenceKind::TextRegion + if !matches!( + evidence_ref.required_anchor_level, + AnchorLevel::Text | AnchorLevel::Bbox | AnchorLevel::TextBbox + ) => + { + return Err(EvidenceAnchorError::new( + "text_region evidence supports only text, bbox, or text_bbox anchor levels", + )); + } + EvidenceKind::TableCell if evidence_ref.required_anchor_level != AnchorLevel::TableCell => { + return Err(EvidenceAnchorError::new( + "table_cell evidence supports only required_anchor_level table_cell", + )); + } + EvidenceKind::TableCell + if evidence_ref.locator.table_id.is_none() || evidence_ref.locator.cell.is_none() => + { + return Err(EvidenceAnchorError::new( + "table_cell evidence requires table_id and cell", + )); + } + EvidenceKind::Region | EvidenceKind::Other => {} + _ => {} + } + if page_locator_required(evidence_ref) + && evidence_ref.locator.page_index.is_none() + && evidence_ref.locator.page_id.is_none() + { + return Err(EvidenceAnchorError::new( + "page_index or page_id is required for this evidence ref", + )); + } + Ok(()) +} + +fn page_locator_required(evidence_ref: &EvidenceRef) -> bool { + matches!(evidence_ref.evidence_kind, EvidenceKind::Page) + || evidence_ref.locator.bbox.is_some() + || (evidence_ref.locator.element_id.is_none() + && evidence_ref.locator.span_id.is_none() + && evidence_ref.locator.table_id.is_none()) +} + +fn fingerprint_check( + request_fingerprint: Option<&str>, + source: &dyn GroundingSource, +) -> FingerprintCheck { + match (request_fingerprint, source.fingerprint()) { + (None, _) => FingerprintCheck::NotChecked, + (Some(_), None) => FingerprintCheck::CapabilityLimited, + (Some(expected), Some(actual)) if expected == actual => FingerprintCheck::Matched, + (Some(_), Some(_)) => FingerprintCheck::Stale, + } +} + +fn anchor_one( + index: &SourceIndex, + fingerprint: FingerprintCheck, + evidence_ref: 
&EvidenceRef, +) -> EvidenceAnchor { + let mut checks = AnchorChecks { + fingerprint, + ..AnchorChecks::default() + }; + let mut capability_limits = Vec::new(); + + if matches!( + evidence_ref.evidence_kind, + EvidenceKind::Region | EvidenceKind::Other + ) { + return anchor_result( + evidence_ref, + AnchorStatus::UnsupportedEvidenceKind, + AnchorLevel::None, + checks, + capability_limits, + ); + } + if fingerprint == FingerprintCheck::Stale { + return anchor_result( + evidence_ref, + AnchorStatus::StaleFingerprint, + AnchorLevel::None, + checks, + capability_limits, + ); + } + if fingerprint == FingerprintCheck::CapabilityLimited { + capability_limits.push(CapabilityLimit::MissingFingerprint); + } + + let page = resolve_page(index, evidence_ref); + checks.page = page.check; + let mut achieved_page = page.check == PageCheck::Found; + let mut text_ok = false; + let mut bbox_ok = false; + let mut table_ok = false; + + match evidence_ref.evidence_kind { + EvidenceKind::Page => {} + EvidenceKind::Text | EvidenceKind::TextRegion => { + if anchor_requires_text(evidence_ref) { + let text = resolve_text(index, evidence_ref, page.page_id.as_deref()); + checks.text = text.check; + text_ok = text.check == TextCheck::Matched; + if text.check == TextCheck::CapabilityLimited { + capability_limits.push(CapabilityLimit::MissingSpans); + } + } + if requires_bbox(evidence_ref) { + let bbox = resolve_bbox(index, evidence_ref, page.page_id.as_deref()); + checks.bbox = bbox; + bbox_ok = bbox == BboxCheck::Valid; + if bbox == BboxCheck::CapabilityLimited { + capability_limits.push(CapabilityLimit::UnknownCoordinateOrigin); + } + } + } + EvidenceKind::TableCell => { + let table = resolve_anchor_table_cell(index, evidence_ref); + checks.table_cell = table.check; + table_ok = table.check == TableCellCheck::Matched; + achieved_page = table.page_found; + if table.check == TableCellCheck::CapabilityLimited { + capability_limits.push(CapabilityLimit::MissingTables); + } + } + 
EvidenceKind::Region | EvidenceKind::Other => {} + } + + capability_limits.sort_by_key(|limit| format!("{limit:?}")); + capability_limits.dedup(); + let achieved_anchor_level = + achieved_anchor_level(evidence_ref, achieved_page, text_ok, bbox_ok, table_ok); + let anchor_status = anchor_status(evidence_ref, &checks, &capability_limits); + anchor_result( + evidence_ref, + anchor_status, + achieved_anchor_level, + checks, + capability_limits, + ) +} + +fn anchor_result( + evidence_ref: &EvidenceRef, + anchor_status: AnchorStatus, + achieved_anchor_level: AnchorLevel, + checks: AnchorChecks, + capability_limits: Vec, +) -> EvidenceAnchor { + EvidenceAnchor { + evidence_id: evidence_ref.evidence_id.clone(), + evidence_kind: evidence_ref.evidence_kind, + anchor_status, + required_anchor_level: evidence_ref.required_anchor_level, + achieved_anchor_level, + checks, + capability_limits, + } +} + +struct PageResolution { + check: PageCheck, + page_id: Option, +} + +fn resolve_page(index: &SourceIndex, evidence_ref: &EvidenceRef) -> PageResolution { + if let Some(page_id) = evidence_ref.locator.page_id.as_deref() { + return if index.pages.iter().any(|page| page.id == page_id) { + PageResolution { + check: PageCheck::Found, + page_id: Some(page_id.to_string()), + } + } else { + PageResolution { + check: PageCheck::NotFound, + page_id: None, + } + }; + } + if let Some(page_index) = evidence_ref.locator.page_index { + return index + .pages + .iter() + .find(|page| page.index == page_index) + .map(|page| PageResolution { + check: PageCheck::Found, + page_id: Some(page.id.clone()), + }) + .unwrap_or(PageResolution { + check: PageCheck::NotFound, + page_id: None, + }); + } + PageResolution { + check: PageCheck::NotChecked, + page_id: None, + } +} + +struct TextResolution { + check: TextCheck, +} + +fn resolve_text( + index: &SourceIndex, + evidence_ref: &EvidenceRef, + page_id: Option<&str>, +) -> TextResolution { + let Some(expected_text) = evidence_ref.expected_text.as_deref() 
else { + return TextResolution { + check: TextCheck::NotFound, + }; + }; + if let Some(span_id) = evidence_ref.locator.span_id.as_deref() { + if !index.capabilities.spans { + return TextResolution { + check: TextCheck::CapabilityLimited, + }; + } + return match index.span(span_id) { + Some(span) => TextResolution { + check: text_check(expected_text, &span.text), + }, + None => TextResolution { + check: TextCheck::NotFound, + }, + }; + } + if let Some(element_id) = evidence_ref.locator.element_id.as_deref() { + return index + .element_by_id + .get(element_id) + .and_then(|position| index.elements.get(*position)) + .and_then(|element| element.text.as_deref()) + .map(|actual| TextResolution { + check: text_check(expected_text, actual), + }) + .unwrap_or(TextResolution { + check: TextCheck::NotFound, + }); + } + let Some(page_id) = page_id else { + return TextResolution { + check: TextCheck::NotFound, + }; + }; + if index + .elements + .iter() + .filter(|element| element.page == page_id) + .filter_map(|element| element.text.as_deref()) + .any(|actual| text_check(expected_text, actual) == TextCheck::Matched) + { + return TextResolution { + check: TextCheck::Matched, + }; + } + if index + .spans + .iter() + .filter(|span| span.page == page_id) + .any(|span| text_check(expected_text, &span.text) == TextCheck::Matched) + { + return TextResolution { + check: TextCheck::Matched, + }; + } + TextResolution { + check: if index.elements.iter().any(|element| element.page == page_id) + || index.spans.iter().any(|span| span.page == page_id) + { + TextCheck::Mismatch + } else { + TextCheck::NotFound + }, + } +} + +fn resolve_bbox( + index: &SourceIndex, + evidence_ref: &EvidenceRef, + page_id: Option<&str>, +) -> BboxCheck { + let Some(bbox) = evidence_ref.locator.bbox else { + return BboxCheck::NotChecked; + }; + if index.capabilities.coordinate_origin != CoordinateOrigin::TopLeft { + return BboxCheck::CapabilityLimited; + } + let Some(page_id) = page_id else { + return 
BboxCheck::NotFound; + }; + let tolerance = VerificationConfig::default_v1() + .matching + .bbox_containment_tolerance_q + .unwrap_or(0); + if index + .elements + .iter() + .any(|element| element.page == page_id && contains_bbox(element.bbox, bbox, tolerance)) + || index + .spans + .iter() + .any(|span| span.page == page_id && contains_bbox(span.bbox, bbox, tolerance)) + || index + .tables + .iter() + .any(|table| table.page == page_id && contains_bbox(table.bbox, bbox, tolerance)) + { + BboxCheck::Valid + } else { + BboxCheck::NotFound + } +} + +struct TableResolution { + check: TableCellCheck, + page_found: bool, +} + +fn resolve_anchor_table_cell(index: &SourceIndex, evidence_ref: &EvidenceRef) -> TableResolution { + if !index.capabilities.tables { + return TableResolution { + check: TableCellCheck::CapabilityLimited, + page_found: false, + }; + } + let Some(table_id) = evidence_ref.locator.table_id.as_deref() else { + return TableResolution { + check: TableCellCheck::NotFound, + page_found: false, + }; + }; + let Some(cell_ref) = evidence_ref.locator.cell else { + return TableResolution { + check: TableCellCheck::NotFound, + page_found: false, + }; + }; + let Some(table) = index.table(table_id) else { + return TableResolution { + check: TableCellCheck::NotFound, + page_found: false, + }; + }; + let page_found = index.pages.iter().any(|page| page.id == table.page); + let Some(cell) = table + .cells + .iter() + .find(|cell| table_cell_covers(cell, cell_ref.row, cell_ref.col)) + else { + return TableResolution { + check: TableCellCheck::NotFound, + page_found, + }; + }; + let check = match evidence_ref.expected_text.as_deref() { + Some(expected) => { + if text_check(expected, &cell.text) == TextCheck::Matched { + TableCellCheck::Matched + } else { + TableCellCheck::Mismatch + } + } + None => TableCellCheck::Matched, + }; + TableResolution { check, page_found } +} + +fn anchor_requires_text(evidence_ref: &EvidenceRef) -> bool { + matches!( + 
evidence_ref.required_anchor_level, + AnchorLevel::Text | AnchorLevel::TextBbox + ) +} + +fn requires_bbox(evidence_ref: &EvidenceRef) -> bool { + matches!( + evidence_ref.required_anchor_level, + AnchorLevel::Bbox | AnchorLevel::TextBbox + ) +} + +fn text_check(expected: &str, actual: &str) -> TextCheck { + if normalize_expected_text(actual).contains(&normalize_expected_text(expected)) { + TextCheck::Matched + } else { + TextCheck::Mismatch + } +} + +fn normalize_expected_text(input: &str) -> String { + normalize_quote(input) +} + +fn sha256_hex(bytes: &[u8]) -> String { + let mut hasher = Sha256::new(); + hasher.update(bytes); + format!("{:x}", hasher.finalize()) +} + +fn achieved_anchor_level( + evidence_ref: &EvidenceRef, + page_ok: bool, + text_ok: bool, + bbox_ok: bool, + table_ok: bool, +) -> AnchorLevel { + match evidence_ref.evidence_kind { + EvidenceKind::Page if page_ok => AnchorLevel::Page, + EvidenceKind::Text if text_ok => AnchorLevel::Text, + EvidenceKind::TextRegion if text_ok && bbox_ok => AnchorLevel::TextBbox, + EvidenceKind::TextRegion if text_ok => AnchorLevel::Text, + EvidenceKind::TextRegion if bbox_ok => AnchorLevel::Bbox, + EvidenceKind::TableCell if table_ok => AnchorLevel::TableCell, + _ => AnchorLevel::None, + } +} + +fn anchor_status( + evidence_ref: &EvidenceRef, + checks: &AnchorChecks, + capability_limits: &[CapabilityLimit], +) -> AnchorStatus { + if checks.page == PageCheck::NotFound + || checks.text == TextCheck::NotFound + || checks.bbox == BboxCheck::NotFound + || checks.table_cell == TableCellCheck::NotFound + { + return AnchorStatus::NotFound; + } + if checks.text == TextCheck::Mismatch + || checks.bbox == BboxCheck::Invalid + || checks.table_cell == TableCellCheck::Mismatch + { + return AnchorStatus::Mismatch; + } + if checks.fingerprint == FingerprintCheck::CapabilityLimited + || checks.text == TextCheck::CapabilityLimited + || checks.bbox == BboxCheck::CapabilityLimited + || checks.table_cell == 
TableCellCheck::CapabilityLimited + || !capability_limits.is_empty() + { + return AnchorStatus::CapabilityLimited; + } + let bound = match evidence_ref.required_anchor_level { + AnchorLevel::Page => checks.page == PageCheck::Found, + AnchorLevel::Text => checks.text == TextCheck::Matched, + AnchorLevel::Bbox => checks.bbox == BboxCheck::Valid, + AnchorLevel::TextBbox => { + checks.text == TextCheck::Matched && checks.bbox == BboxCheck::Valid + } + AnchorLevel::TableCell => checks.table_cell == TableCellCheck::Matched, + AnchorLevel::None => false, + }; + if bound { + AnchorStatus::Bound + } else { + AnchorStatus::NotFound + } +} + impl CitationInput { /// Claims in deterministic input order. pub fn claims(&self) -> &[Claim] { diff --git a/docs/decisions/ADR-0012-deterministic-evidence-anchoring-boundary.md b/docs/decisions/ADR-0012-deterministic-evidence-anchoring-boundary.md new file mode 100644 index 00000000..aee0c429 --- /dev/null +++ b/docs/decisions/ADR-0012-deterministic-evidence-anchoring-boundary.md @@ -0,0 +1,45 @@ +# ADR-0012: Deterministic Evidence Anchoring Boundary + +- Status: **Accepted** +- Date: 2026-06-24 +- Decider: Gate Zero decider +- Governs: `ethos evidence anchor`, evidence-anchor request/report schemas, and parser-agnostic source binding. + +## Context + +`ethos verify --citations` checks AI-style citation claims against source evidence. That remains +useful, but document pipelines also need a lower-level primitive: check whether caller-provided +evidence refs bind to source evidence before any semantic answer workflow exists. + +The boundary must remain parser- and app-agnostic. Native Ethos JSON and supported foreign parser +outputs enter through `GroundingSource`; callers own what they do with the resulting anchor report. + +## Decision + +Add `ethos evidence anchor --evidence-refs ` as a narrow deterministic source-tracing +command. 
+ +The command: + +- consumes one source document representation per invocation; +- consumes caller-provided evidence refs; +- emits a deterministic evidence-anchor report; +- reuses `GroundingSource`, `ParserIdentity`, `Capabilities`, `CoordinateOrigin`, and + `CapabilityLimit`; +- supports native Ethos JSON and OpenDataLoader-style JSON in v1; +- reports stale fingerprints, missing evidence, mismatches, unsupported v1 kinds, and source + capability limits explicitly. + +The command does not perform semantic support checks, AI answer verification, RAG workflow, +source-map validation, evidence export, crop rendering, batch document-set processing, or +production-readiness gating. + +## Consequences + +- Evidence anchoring becomes a first-class source-bound primitive rather than being squeezed through + citation-claim input. +- Existing `ethos verify` report semantics and goldens must remain unchanged. +- Public docs must describe evidence anchoring generically and preserve the public beta evaluation + posture. +- Future parser adapters can participate when they expose the required data through + `GroundingSource`; missing capabilities must remain explicit diagnostics. diff --git a/docs/decisions/README.md b/docs/decisions/README.md index 39bdf066..4dc8b89d 100644 --- a/docs/decisions/README.md +++ b/docs/decisions/README.md @@ -15,5 +15,6 @@ Every closing PRD §15 open question gets an ADR here. Output-changing merges re | 0008 | Gate Zero G2 footprint policy | Accepted | | 0009 | G3 geometry fingerprint policy | Accepted | | 0010 | Deterministic table candidates | Accepted | +| 0012 | Deterministic evidence anchoring boundary | Accepted | Format: Status / Date / Governs, Context, Decision, Consequences. Statuses: Proposed → Accepted | Rejected | Superseded-by-NNNN. 
diff --git a/schemas/README.md b/schemas/README.md index 7328fdfc..d92e7c9f 100644 --- a/schemas/README.md +++ b/schemas/README.md @@ -11,6 +11,8 @@ bumps and downstream sign-off; output-changing heuristics are semver events (PRD | `ethos-security-report.schema.json` | `security_report.json` | | `ethos-citations.schema.json` | citation input consumed by `ethos verify --citations` | | `ethos-verification-report.schema.json` | `verification_report.json` | +| `ethos-evidence-anchor-request.schema.json` | evidence refs consumed by `ethos evidence anchor --evidence-refs` | +| `ethos-evidence-anchor-report.schema.json` | `evidence_anchor_report.json` emitted by `ethos evidence anchor` | | `ethos-verification-config.schema.json` | verification config (its c14n hash stamps reports) | | `ethos-crop-descriptor.schema.json` | crop descriptor JSON emitted by `ethos crop_element` and `ethos verify --crop-dir` | | `ethos-crop-element-request.schema.json` | source-only request envelope for Milestone D `crop_element` v1 contract work | diff --git a/schemas/ethos-evidence-anchor-report.schema.json b/schemas/ethos-evidence-anchor-report.schema.json new file mode 100644 index 00000000..826eed6d --- /dev/null +++ b/schemas/ethos-evidence-anchor-report.schema.json @@ -0,0 +1,130 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "urn:ethos:schema:evidence-anchor-report:1", + "title": "Ethos evidence anchor report", + "description": "Deterministic report showing whether caller-provided evidence refs bind to source document evidence. 
This is source tracing, not semantic answer verification.", + "type": "object", + "required": ["artifact_type", "schema_version", "grounding", "anchors"], + "additionalProperties": false, + "properties": { + "artifact_type": { "const": "ethos.evidence_anchor_report.v1" }, + "schema_version": { "const": "1.0.0" }, + "source_fingerprint": { "$ref": "#/$defs/fingerprint" }, + "grounding": { + "type": "object", + "required": ["parser", "capabilities"], + "additionalProperties": false, + "properties": { + "parser": { + "type": "object", + "required": ["name", "version"], + "additionalProperties": false, + "properties": { + "name": { "type": "string" }, + "version": { "type": "string" }, + "adapter": { "type": "string" }, + "adapter_version": { "type": "string" } + } + }, + "capabilities": { + "type": "object", + "required": [ + "spans", + "char_offsets", + "tables", + "fingerprint", + "coordinate_origin", + "crop_support" + ], + "additionalProperties": false, + "properties": { + "spans": { "type": "boolean" }, + "char_offsets": { "type": "boolean" }, + "tables": { "type": "boolean" }, + "fingerprint": { "type": "boolean" }, + "coordinate_origin": { "enum": ["top-left", "bottom-left", "unknown"] }, + "crop_support": { "type": "boolean" } + } + } + } + }, + "anchors": { + "type": "array", + "items": { "$ref": "#/$defs/anchor" } + } + }, + "$defs": { + "fingerprint": { "type": "string", "pattern": "^sha256:[0-9a-f]{64}$" }, + "capability_limit": { + "enum": [ + "missing_spans", + "missing_char_offsets", + "missing_tables", + "missing_fingerprint", + "unknown_coordinate_origin", + "missing_crop_support" + ] + }, + "checks": { + "type": "object", + "required": ["fingerprint", "page", "text", "bbox", "table_cell"], + "additionalProperties": false, + "properties": { + "fingerprint": { + "enum": ["matched", "stale", "not_checked", "capability_limited"] + }, + "page": { "enum": ["found", "not_found", "not_checked"] }, + "text": { + "enum": ["matched", "mismatch", "not_found", 
"not_checked", "capability_limited"] + }, + "bbox": { + "enum": ["valid", "invalid", "not_found", "not_checked", "capability_limited"] + }, + "table_cell": { + "enum": ["matched", "mismatch", "not_found", "not_checked", "capability_limited"] + } + } + }, + "anchor": { + "type": "object", + "required": [ + "evidence_id", + "evidence_kind", + "anchor_status", + "required_anchor_level", + "achieved_anchor_level", + "checks", + "capability_limits" + ], + "additionalProperties": false, + "properties": { + "evidence_id": { "type": "string", "minLength": 1 }, + "evidence_kind": { + "enum": ["page", "text", "text_region", "table_cell", "region", "other"] + }, + "anchor_status": { + "enum": [ + "bound", + "mismatch", + "not_found", + "stale_fingerprint", + "capability_limited", + "unsupported_evidence_kind" + ] + }, + "required_anchor_level": { + "enum": ["page", "text", "bbox", "text_bbox", "table_cell"] + }, + "achieved_anchor_level": { + "enum": ["none", "page", "text", "bbox", "text_bbox", "table_cell"] + }, + "checks": { "$ref": "#/$defs/checks" }, + "capability_limits": { + "type": "array", + "items": { "$ref": "#/$defs/capability_limit" }, + "uniqueItems": true + } + } + } + } +} diff --git a/schemas/ethos-evidence-anchor-request.schema.json b/schemas/ethos-evidence-anchor-request.schema.json new file mode 100644 index 00000000..7bd6dfba --- /dev/null +++ b/schemas/ethos-evidence-anchor-request.schema.json @@ -0,0 +1,77 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "urn:ethos:schema:evidence-anchor-request:1", + "title": "Ethos evidence anchor request", + "description": "Caller-provided evidence refs to check against one source document representation. 
Evidence anchoring is deterministic source tracing, not semantic answer verification.", + "type": "object", + "required": ["artifact_type", "schema_version", "evidence_refs"], + "additionalProperties": false, + "properties": { + "artifact_type": { "const": "ethos.evidence_anchor_request.v1" }, + "schema_version": { "const": "1.0.0" }, + "source_fingerprint": { "$ref": "#/$defs/fingerprint" }, + "evidence_refs": { + "type": "array", + "items": { "$ref": "#/$defs/evidence_ref" } + } + }, + "$defs": { + "fingerprint": { "type": "string", "pattern": "^sha256:[0-9a-f]{64}$" }, + "bbox": { + "type": "array", + "items": { "type": "integer" }, + "minItems": 4, + "maxItems": 4 + }, + "cell": { + "type": "object", + "required": ["row", "col"], + "additionalProperties": false, + "properties": { + "row": { "type": "integer", "minimum": 0 }, + "col": { "type": "integer", "minimum": 0 } + } + }, + "locator": { + "type": "object", + "additionalProperties": false, + "properties": { + "page_index": { "type": "integer", "minimum": 1 }, + "page_id": { "type": "string" }, + "element_id": { "type": "string" }, + "span_id": { "type": "string" }, + "bbox": { "$ref": "#/$defs/bbox" }, + "table_id": { "type": "string" }, + "cell": { "$ref": "#/$defs/cell" }, + "coordinate_profile": { "const": "ethos_quantized_top_left_v1" } + }, + "dependentRequired": { + "bbox": ["coordinate_profile"] + }, + "not": { + "required": ["page_index", "page_id"] + } + }, + "evidence_ref": { + "type": "object", + "required": ["evidence_id", "evidence_kind", "required_anchor_level", "locator"], + "additionalProperties": false, + "properties": { + "evidence_id": { "type": "string", "minLength": 1 }, + "evidence_kind": { + "enum": ["page", "text", "text_region", "table_cell", "region", "other"] + }, + "required_anchor_level": { + "enum": ["page", "text", "bbox", "text_bbox", "table_cell"] + }, + "locator": { "$ref": "#/$defs/locator" }, + "expected_text": { "type": "string" }, + "expected_text_sha256": { "$ref": 
"#/$defs/fingerprint" }, + "text_normalization_profile": { "const": "ethos_collapse_whitespace_v1" } + }, + "dependentRequired": { + "expected_text_sha256": ["expected_text", "text_normalization_profile"] + } + } + } +} diff --git a/schemas/examples/evidence-anchor-report-negative.example.json b/schemas/examples/evidence-anchor-report-negative.example.json new file mode 100644 index 00000000..eea2610b --- /dev/null +++ b/schemas/examples/evidence-anchor-report-negative.example.json @@ -0,0 +1,36 @@ +{ + "artifact_type": "ethos.evidence_anchor_report.v1", + "schema_version": "1.0.0", + "source_fingerprint": "sha256:b5d30710d0c25cc38d8dec924ecaf57ae4f81276dd5dc14d75cb3b5b6bde62d3", + "grounding": { + "parser": { + "name": "ethos", + "version": "0.1.0" + }, + "capabilities": { + "spans": true, + "char_offsets": true, + "tables": true, + "fingerprint": true, + "coordinate_origin": "top-left", + "crop_support": false + } + }, + "anchors": [ + { + "evidence_id": "ev_missing", + "evidence_kind": "text", + "anchor_status": "not_found", + "required_anchor_level": "text", + "achieved_anchor_level": "none", + "checks": { + "fingerprint": "matched", + "page": "not_found", + "text": "not_found", + "bbox": "not_checked", + "table_cell": "not_checked" + }, + "capability_limits": [] + } + ] +} diff --git a/schemas/examples/evidence-anchor-report.example.json b/schemas/examples/evidence-anchor-report.example.json new file mode 100644 index 00000000..72bf551d --- /dev/null +++ b/schemas/examples/evidence-anchor-report.example.json @@ -0,0 +1,36 @@ +{ + "artifact_type": "ethos.evidence_anchor_report.v1", + "schema_version": "1.0.0", + "source_fingerprint": "sha256:b5d30710d0c25cc38d8dec924ecaf57ae4f81276dd5dc14d75cb3b5b6bde62d3", + "grounding": { + "parser": { + "name": "ethos", + "version": "0.1.0" + }, + "capabilities": { + "spans": true, + "char_offsets": true, + "tables": true, + "fingerprint": true, + "coordinate_origin": "top-left", + "crop_support": false + } + }, + "anchors": 
[ + { + "evidence_id": "ev_001", + "evidence_kind": "text_region", + "anchor_status": "bound", + "required_anchor_level": "text_bbox", + "achieved_anchor_level": "text_bbox", + "checks": { + "fingerprint": "matched", + "page": "found", + "text": "matched", + "bbox": "valid", + "table_cell": "not_checked" + }, + "capability_limits": [] + } + ] +} diff --git a/schemas/examples/evidence-anchor-request.example.json b/schemas/examples/evidence-anchor-request.example.json new file mode 100644 index 00000000..bc9d7467 --- /dev/null +++ b/schemas/examples/evidence-anchor-request.example.json @@ -0,0 +1,20 @@ +{ + "artifact_type": "ethos.evidence_anchor_request.v1", + "schema_version": "1.0.0", + "source_fingerprint": "sha256:b5d30710d0c25cc38d8dec924ecaf57ae4f81276dd5dc14d75cb3b5b6bde62d3", + "evidence_refs": [ + { + "evidence_id": "ev_001", + "evidence_kind": "text_region", + "required_anchor_level": "text_bbox", + "locator": { + "page_index": 1, + "bbox": [7200, 10100, 54000, 11500], + "coordinate_profile": "ethos_quantized_top_left_v1" + }, + "expected_text": "Revenue grew to $12.4M in Q3 2025, driven by enterprise expansion.", + "expected_text_sha256": "sha256:49f675142f930e31d75679b14e23f7c639cb9903029eacb3235c709c14fe4be5", + "text_normalization_profile": "ethos_collapse_whitespace_v1" + } + ] +} diff --git a/schemas/validate_examples.py b/schemas/validate_examples.py index 542e2f01..c7af0a3f 100644 --- a/schemas/validate_examples.py +++ b/schemas/validate_examples.py @@ -78,6 +78,13 @@ EXAMPLES / "verification-report.example.json", EXAMPLES / "verification-report-negative.example.json", ]), + ("ethos-evidence-anchor-request.schema.json", [ + EXAMPLES / "evidence-anchor-request.example.json", + ]), + ("ethos-evidence-anchor-report.schema.json", [ + EXAMPLES / "evidence-anchor-report.example.json", + EXAMPLES / "evidence-anchor-report-negative.example.json", + ]), ("ethos-verification-config.schema.json", [EXAMPLES / "verification-config.example.json"]), 
("ethos-crop-descriptor.schema.json", [EXAMPLES / "crop-descriptor.example.json"]), ("ethos-crop-element-request.schema.json", [ From e6334d12fdbff3087c32d98b1053694962edad2c Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 24 Jun 2026 15:06:35 +0530 Subject: [PATCH 2/4] Harden evidence anchor validation Signed-off-by: docushell-admin --- crates/ethos-cli/tests/evidence_anchor.rs | 79 +++++++++++++++++++ crates/ethos-core/src/evidence_anchor.rs | 4 + crates/ethos-verify/src/lib.rs | 36 ++++++++- ...terministic-evidence-anchoring-boundary.md | 13 +++ .../ethos-evidence-anchor-request.schema.json | 2 +- 5 files changed, 131 insertions(+), 3 deletions(-) diff --git a/crates/ethos-cli/tests/evidence_anchor.rs b/crates/ethos-cli/tests/evidence_anchor.rs index 896c7b5f..a8dea1df 100644 --- a/crates/ethos-cli/tests/evidence_anchor.rs +++ b/crates/ethos-cli/tests/evidence_anchor.rs @@ -278,6 +278,31 @@ fn opendataloader_text_binds_and_bbox_is_capability_limited() { #[test] fn usage_errors_are_exit_two() { let cases = [ + request(serde_json::json!([ + { + "evidence_id": "ev", + "evidence_kind": "text", + "required_anchor_level": "text", + "locator": { "page_index": 1 } + } + ])), + request(serde_json::json!([ + { + "evidence_id": "ev", + "evidence_kind": "text_region", + "required_anchor_level": "bbox", + "locator": { "page_index": 1 } + } + ])), + request(serde_json::json!([ + { + "evidence_id": "ev", + "evidence_kind": "text", + "required_anchor_level": "text", + "locator": { "page_index": 1 }, + "expected_text": " " + } + ])), request(serde_json::json!([ { "evidence_id": "ev", @@ -325,6 +350,60 @@ fn usage_errors_are_exit_two() { } } +#[test] +fn table_cell_expected_text_uses_exact_normalized_match() { + let request = request(serde_json::json!([ + { + "evidence_id": "ev_cell", + "evidence_kind": "table_cell", + "required_anchor_level": "table_cell", + "locator": { + "table_id": "t0001", + "cell": { "row": 1, "col": 1 } + }, + "expected_text": "12.4" + } + ])); + 
let report = parse_success(&[ + "evidence", + "anchor", + document_example().to_str().unwrap(), + "--evidence-refs", + request.to_str().unwrap(), + ]); + assert_eq!(report["anchors"][0]["anchor_status"], "mismatch"); + assert_eq!(report["anchors"][0]["checks"]["table_cell"], "mismatch"); +} + +#[test] +fn text_mismatch_takes_precedence_over_bbox_capability_limit() { + let request = request(serde_json::json!([ + { + "evidence_id": "odl_mismatch_limited", + "evidence_kind": "text_region", + "required_anchor_level": "text_bbox", + "locator": { + "page_index": 1, + "bbox": [72, 101, 540, 115], + "coordinate_profile": "ethos_quantized_top_left_v1" + }, + "expected_text": "not present in the source" + } + ])); + let report = parse_success(&[ + "evidence", + "anchor", + opendataloader_example().to_str().unwrap(), + "--grounding", + "opendataloader-json", + "--evidence-refs", + request.to_str().unwrap(), + ]); + assert_eq!(report["anchors"][0]["checks"]["text"], "mismatch"); + assert_eq!(report["anchors"][0]["checks"]["bbox"], "capability_limited"); + assert_eq!(report["anchors"][0]["anchor_status"], "mismatch"); +} + #[test] fn unknown_grounding_and_source_shape_exit_two() { let request = request(serde_json::json!([])); diff --git a/crates/ethos-core/src/evidence_anchor.rs b/crates/ethos-core/src/evidence_anchor.rs index 79bfb1e0..3d08591f 100644 --- a/crates/ethos-core/src/evidence_anchor.rs +++ b/crates/ethos-core/src/evidence_anchor.rs @@ -162,6 +162,7 @@ pub enum CoordinateProfile { /// Evidence-anchor report envelope. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] pub struct EvidenceAnchorReport { /// Artifact type identity. pub artifact_type: String, @@ -178,6 +179,7 @@ pub struct EvidenceAnchorReport { /// Grounding metadata embedded in evidence-anchor reports. 
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] pub struct EvidenceAnchorGrounding { /// Producing parser identity. pub parser: ParserIdentity, @@ -187,6 +189,7 @@ pub struct EvidenceAnchorGrounding { /// One evidence anchor outcome. #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] pub struct EvidenceAnchor { /// Caller correlation key. pub evidence_id: String, @@ -224,6 +227,7 @@ pub enum AnchorStatus { /// Per-axis evidence-anchor checks. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] pub struct AnchorChecks { /// Fingerprint check. pub fingerprint: FingerprintCheck, diff --git a/crates/ethos-verify/src/lib.rs b/crates/ethos-verify/src/lib.rs index 06e5855c..78169070 100644 --- a/crates/ethos-verify/src/lib.rs +++ b/crates/ethos-verify/src/lib.rs @@ -168,6 +168,13 @@ fn validate_evidence_ref(evidence_ref: &EvidenceRef) -> Result<(), EvidenceAncho "bbox requires coordinate_profile ethos_quantized_top_left_v1", )); } + if let Some(expected_text) = evidence_ref.expected_text.as_deref() { + if normalize_expected_text(expected_text).is_empty() { + return Err(EvidenceAnchorError::new( + "expected_text must not be empty after normalization", + )); + } + } if evidence_ref.expected_text_sha256.is_some() { let Some(expected_text) = evidence_ref.expected_text.as_deref() else { return Err(EvidenceAnchorError::new( @@ -227,6 +234,16 @@ fn validate_evidence_ref(evidence_ref: &EvidenceRef) -> Result<(), EvidenceAncho EvidenceKind::Region | EvidenceKind::Other => {} _ => {} } + if anchor_requires_text(evidence_ref) && evidence_ref.expected_text.is_none() { + return Err(EvidenceAnchorError::new( + "required_anchor_level text or text_bbox requires expected_text", + )); + } + if requires_bbox(evidence_ref) && evidence_ref.locator.bbox.is_none() { + return Err(EvidenceAnchorError::new( + "required_anchor_level bbox or text_bbox requires 
locator.bbox", + )); + } if page_locator_required(evidence_ref) && evidence_ref.locator.page_index.is_none() && evidence_ref.locator.page_id.is_none() @@ -333,7 +350,7 @@ fn anchor_one( EvidenceKind::Region | EvidenceKind::Other => {} } - capability_limits.sort_by_key(|limit| format!("{limit:?}")); + capability_limits.sort_by_key(|limit| capability_limit_order(*limit)); capability_limits.dedup(); let achieved_anchor_level = achieved_anchor_level(evidence_ref, achieved_page, text_ok, bbox_ok, table_ok); @@ -563,7 +580,7 @@ fn resolve_anchor_table_cell(index: &SourceIndex, evidence_ref: &EvidenceRef) -> }; let check = match evidence_ref.expected_text.as_deref() { Some(expected) => { - if text_check(expected, &cell.text) == TextCheck::Matched { + if table_cell_text_matches(expected, &cell.text) { TableCellCheck::Matched } else { TableCellCheck::Mismatch @@ -596,10 +613,25 @@ fn text_check(expected: &str, actual: &str) -> TextCheck { } } +fn table_cell_text_matches(expected: &str, actual: &str) -> bool { + normalize_expected_text(actual) == normalize_expected_text(expected) +} + fn normalize_expected_text(input: &str) -> String { normalize_quote(input) } +fn capability_limit_order(limit: CapabilityLimit) -> u8 { + match limit { + CapabilityLimit::MissingSpans => 0, + CapabilityLimit::MissingCharOffsets => 1, + CapabilityLimit::MissingTables => 2, + CapabilityLimit::MissingFingerprint => 3, + CapabilityLimit::UnknownCoordinateOrigin => 4, + CapabilityLimit::MissingCropSupport => 5, + } +} + fn sha256_hex(bytes: &[u8]) -> String { let mut hasher = Sha256::new(); hasher.update(bytes); diff --git a/docs/decisions/ADR-0012-deterministic-evidence-anchoring-boundary.md b/docs/decisions/ADR-0012-deterministic-evidence-anchoring-boundary.md index aee0c429..4e1a3cc6 100644 --- a/docs/decisions/ADR-0012-deterministic-evidence-anchoring-boundary.md +++ b/docs/decisions/ADR-0012-deterministic-evidence-anchoring-boundary.md @@ -30,6 +30,19 @@ The command: - reports stale 
fingerprints, missing evidence, mismatches, unsupported v1 kinds, and source capability limits explicitly. +Per-ref rollup is fail-closed and ordered: + +1. accepted-but-unsupported v1 evidence kinds report `unsupported_evidence_kind`; +2. stale source fingerprints short-circuit page/text/bbox/table checks and report + `stale_fingerprint`; +3. missing required targets report `not_found`; +4. checked mismatches report `mismatch`; +5. capability gaps report `capability_limited` only when no checked required axis failed; +6. only fully satisfied required axes report `bound`. + +This preserves the trust boundary for mixed outcomes. For example, if text mismatches while bbox +checking is capability-limited, the anchor reports `mismatch`, not `capability_limited`. + The command does not perform semantic support checks, AI answer verification, RAG workflow, source-map validation, evidence export, crop rendering, batch document-set processing, or production-readiness gating. diff --git a/schemas/ethos-evidence-anchor-request.schema.json b/schemas/ethos-evidence-anchor-request.schema.json index 7bd6dfba..ad89e5a6 100644 --- a/schemas/ethos-evidence-anchor-request.schema.json +++ b/schemas/ethos-evidence-anchor-request.schema.json @@ -65,7 +65,7 @@ "enum": ["page", "text", "bbox", "text_bbox", "table_cell"] }, "locator": { "$ref": "#/$defs/locator" }, - "expected_text": { "type": "string" }, + "expected_text": { "type": "string", "minLength": 1 }, "expected_text_sha256": { "$ref": "#/$defs/fingerprint" }, "text_normalization_profile": { "const": "ethos_collapse_whitespace_v1" } }, From c3c4cdb19a9d401cde983ca20be96d908a77a4f4 Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 24 Jun 2026 15:09:21 +0530 Subject: [PATCH 3/4] Fix package candidate verify dependencies Signed-off-by: docushell-admin --- .github/scripts/package_publication_candidate_activation.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/.github/scripts/package_publication_candidate_activation.py b/.github/scripts/package_publication_candidate_activation.py index f1dc154f..2e6136cb 100644 --- a/.github/scripts/package_publication_candidate_activation.py +++ b/.github/scripts/package_publication_candidate_activation.py @@ -36,7 +36,7 @@ CANDIDATE_NORMAL_DEPENDENCIES = { CORE_PACKAGE: ["serde", "serde_json", "sha2", "thiserror"], "ethos-pdf": [CORE_PACKAGE, "serde", "serde_json"], - "ethos-verify": [CORE_PACKAGE, "serde"], + "ethos-verify": [CORE_PACKAGE, "serde", "sha2"], CONSUMER_PACKAGE: [CORE_PACKAGE, "ethos-pdf", "ethos-verify"], } IGNORE_NAMES = { @@ -242,6 +242,9 @@ def generated_manifest(package: str) -> str: [dependencies.serde] version = "1" features = ["derive"] + +[dependencies.sha2] +version = "0.10" """ if package == "ethos-pdf": From 1814b1597fedcdb8d25acbcc7428aca59f4b1c3d Mon Sep 17 00:00:00 2001 From: docushell-admin Date: Wed, 24 Jun 2026 15:17:07 +0530 Subject: [PATCH 4/4] Refactor evidence anchor validation Signed-off-by: docushell-admin --- crates/ethos-verify/src/lib.rs | 35 +++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/crates/ethos-verify/src/lib.rs b/crates/ethos-verify/src/lib.rs index 78169070..e4a6a401 100644 --- a/crates/ethos-verify/src/lib.rs +++ b/crates/ethos-verify/src/lib.rs @@ -152,22 +152,35 @@ fn validate_anchor_request(request: &EvidenceAnchorRequest) -> Result<(), Eviden } fn validate_evidence_ref(evidence_ref: &EvidenceRef) -> Result<(), EvidenceAnchorError> { - if evidence_ref.locator.page_index == Some(0) { + validate_locator(evidence_ref)?; + validate_expected_text(evidence_ref)?; + validate_kind_level_compat(evidence_ref)?; + validate_required_anchor_inputs(evidence_ref)?; + validate_required_page_locator(evidence_ref)?; + Ok(()) +} + +fn validate_locator(evidence_ref: &EvidenceRef) -> Result<(), EvidenceAnchorError> { + let locator = &evidence_ref.locator; + if locator.page_index == Some(0) { return 
Err(EvidenceAnchorError::new("page_index must be 1-based")); } - if evidence_ref.locator.page_index.is_some() && evidence_ref.locator.page_id.is_some() { + if locator.page_index.is_some() && locator.page_id.is_some() { return Err(EvidenceAnchorError::new( "use exactly one of page_index or page_id", )); } - if evidence_ref.locator.bbox.is_some() - && evidence_ref.locator.coordinate_profile - != Some(CoordinateProfile::EthosQuantizedTopLeftV1) + if locator.bbox.is_some() + && locator.coordinate_profile != Some(CoordinateProfile::EthosQuantizedTopLeftV1) { return Err(EvidenceAnchorError::new( "bbox requires coordinate_profile ethos_quantized_top_left_v1", )); } + Ok(()) +} + +fn validate_expected_text(evidence_ref: &EvidenceRef) -> Result<(), EvidenceAnchorError> { if let Some(expected_text) = evidence_ref.expected_text.as_deref() { if normalize_expected_text(expected_text).is_empty() { return Err(EvidenceAnchorError::new( @@ -198,6 +211,10 @@ fn validate_evidence_ref(evidence_ref: &EvidenceRef) -> Result<(), EvidenceAncho )); } } + Ok(()) +} + +fn validate_kind_level_compat(evidence_ref: &EvidenceRef) -> Result<(), EvidenceAnchorError> { match evidence_ref.evidence_kind { EvidenceKind::Page if evidence_ref.required_anchor_level != AnchorLevel::Page => { return Err(EvidenceAnchorError::new( @@ -234,6 +251,10 @@ fn validate_evidence_ref(evidence_ref: &EvidenceRef) -> Result<(), EvidenceAncho EvidenceKind::Region | EvidenceKind::Other => {} _ => {} } + Ok(()) +} + +fn validate_required_anchor_inputs(evidence_ref: &EvidenceRef) -> Result<(), EvidenceAnchorError> { if anchor_requires_text(evidence_ref) && evidence_ref.expected_text.is_none() { return Err(EvidenceAnchorError::new( "required_anchor_level text or text_bbox requires expected_text", @@ -244,6 +265,10 @@ fn validate_evidence_ref(evidence_ref: &EvidenceRef) -> Result<(), EvidenceAncho "required_anchor_level bbox or text_bbox requires locator.bbox", )); } + Ok(()) +} + +fn 
validate_required_page_locator(evidence_ref: &EvidenceRef) -> Result<(), EvidenceAnchorError> { if page_locator_required(evidence_ref) && evidence_ref.locator.page_index.is_none() && evidence_ref.locator.page_id.is_none()