From 5bbbcf33b82b7c36395b1e06d458dac030e0da8b Mon Sep 17 00:00:00 2001 From: Aryama Srivastav Date: Sat, 14 Mar 2026 23:01:01 +0530 Subject: [PATCH 1/4] feat: add semantic key-based mapping between extracted JSON and PDF widgets --- src/filler.py | 74 +++++++++++ src/semantic_mapper.py | 212 +++++++++++++++++++++++++++++++ src/test/test_semantic_mapper.py | 113 ++++++++++++++++ 3 files changed, 399 insertions(+) create mode 100644 src/filler.py create mode 100644 src/semantic_mapper.py create mode 100644 src/test/test_semantic_mapper.py diff --git a/src/filler.py b/src/filler.py new file mode 100644 index 0000000..d9c517d --- /dev/null +++ b/src/filler.py @@ -0,0 +1,74 @@ +from pdfrw import PdfReader, PdfWriter +from src.llm import LLM +from src.semantic_mapper import SemanticMapper +from datetime import datetime + + +class Filler: + def __init__(self): + pass + + def fill_form(self, pdf_form: str, llm: LLM, template_config: dict = None): + """ + Fill a PDF form with values extracted by LLM. + + Fields are matched semantically (JSON key ↔ PDF widget name) first. + Any unmatched fields fall back to visual-order positional assignment + (top-to-bottom, left-to-right). + + Parameters + ---------- + pdf_form : path to the input PDF template + llm : configured LLM instance (main_loop not yet called) + template_config : optional per-template mapping hints, e.g. + { + "field_mappings": {"Employee's name": "EmployeeName"}, + "aliases": {"Employee's name": ["name"]}, + "required_fields": ["Employee's name", "Date"] + } + """ + output_pdf = ( + pdf_form[:-4] + + "_" + + datetime.now().strftime("%Y%m%d_%H%M%S") + + "_filled.pdf" + ) + + # ── 1. Extract structured data from LLM ────────────────────────────── + t2j = llm.main_loop() + textbox_answers = t2j.get_data() # {json_key: value} + + # ── 2. 
Collect PDF widgets in visual order (global across pages) ────── + pdf = PdfReader(pdf_form) + ordered_annots = [] + pdf_field_names = [] + + for page in pdf.pages: + if page.Annots: + sorted_annots = sorted( + page.Annots, key=lambda a: (-float(a.Rect[1]), float(a.Rect[0])) + ) + for annot in sorted_annots: + if annot.Subtype == "/Widget" and annot.T: + # pdfrw wraps field names in parens: e.g. '(EmployeeName)' + pdf_field_names.append(annot.T[1:-1]) + ordered_annots.append(annot) + + # ── 3. Semantic mapping ─────────────────────────────────────────────── + mapper = SemanticMapper(template_config) + result = mapper.map(textbox_answers, pdf_field_names) + print(result.report()) + + # ── 4. Fill: semantic matches first, positional fallback for the rest ─ + positional_idx = 0 + for annot, pdf_field in zip(ordered_annots, pdf_field_names): + if pdf_field in result.matched: + annot.V = f"{result.matched[pdf_field]}" + annot.AP = None + elif positional_idx < len(result.positional_values): + annot.V = f"{result.positional_values[positional_idx]}" + annot.AP = None + positional_idx += 1 + + PdfWriter().write(output_pdf, pdf) + return output_pdf diff --git a/src/semantic_mapper.py b/src/semantic_mapper.py new file mode 100644 index 0000000..0eaadcd --- /dev/null +++ b/src/semantic_mapper.py @@ -0,0 +1,212 @@ +""" +Semantic Mapping Layer +---------------------- +Matches extracted JSON keys to PDF form field names using: + 1. Explicit mappings from a per-template config + 2. Case-insensitive exact match + 3. Alias match (from template config) + 4. Fuzzy token-overlap (Jaccard similarity) + 5. Positional fallback for any remaining unmatched pairs + +Returns a MappingResult with matched values, warnings, and a printable report. 
+""" + +import re +from dataclasses import dataclass, field + + +@dataclass +class MappingResult: + """Holds the outcome of one semantic mapping run.""" + + matched: dict # {pdf_field_name: value} — semantically placed + positional_values: list # values for JSON keys that had no semantic PDF match + unmapped_json_keys: list + unmapped_pdf_fields: list + ambiguous: list # [(json_key, [candidate_pdf_fields])] + warnings: list # human-readable warning strings + + def report(self) -> str: + lines = [ + "=== Semantic Mapping Report ===", + f" Matched (semantic): {len(self.matched)}", + f" Positional fallback: {len(self.positional_values)}", + f" Unmapped JSON keys: {len(self.unmapped_json_keys)}", + f" Unmapped PDF fields: {len(self.unmapped_pdf_fields)}", + f" Ambiguous: {len(self.ambiguous)}", + ] + if self.matched: + lines.append("\n Semantic matches:") + for pdf_f, val in self.matched.items(): + lines.append(f" {pdf_f!r} ← {val!r}") + if self.ambiguous: + lines.append("\n Ambiguous (best candidate used):") + for json_key, candidates in self.ambiguous: + lines.append(f" {json_key!r} → {candidates}") + if self.unmapped_json_keys: + lines.append("\n Unmapped JSON keys (positional fallback):") + for k in self.unmapped_json_keys: + lines.append(f" - {k!r}") + if self.unmapped_pdf_fields: + lines.append("\n Unmapped PDF fields (left blank):") + for f in self.unmapped_pdf_fields: + lines.append(f" - {f!r}") + if self.warnings: + lines.append("\n Warnings:") + for w in self.warnings: + lines.append(f" ⚠ {w}") + lines.append("================================") + return "\n".join(lines) + + +class SemanticMapper: + """ + Maps extracted JSON keys to PDF widget field names. 
+ + template_config schema (all keys optional): + { + "field_mappings": {"Employee's name": "EmployeeName"}, + "aliases": {"Employee's name": ["name", "worker name"]}, + "required_fields": ["Employee's name", "Date"] + } + """ + + FUZZY_THRESHOLD = 0.35 # Jaccard threshold for a fuzzy hit + AMBIGUITY_MARGIN = 0.05 # Scores within this of the top are ambiguous + + def __init__(self, template_config: dict = None): + cfg = template_config or {} + self._explicit: dict = cfg.get("field_mappings", {}) # json_key → pdf_field + self._aliases: dict = cfg.get("aliases", {}) # json_key → [alias…] + self._required: list = cfg.get("required_fields", []) + + # ── public ─────────────────────────────────────────────────────────────── + + def map(self, extracted: dict, pdf_field_names: list) -> MappingResult: + """ + Match extracted JSON keys to PDF widget field names. + + Parameters + ---------- + extracted : dict returned by LLM.get_data() {json_key: value} + pdf_field_names : ordered list of PDF widget names (annot.T stripped) + + Returns + ------- + MappingResult + """ + matched: dict = {} # pdf_field_name → value + used_pdf: set = set() + used_json: set = set() + ambiguous: list = [] + warnings: list = [] + + # ── Pass 1: explicit config mappings ───────────────────────────────── + for json_key, pdf_field in self._explicit.items(): + if ( + json_key in extracted + and pdf_field in pdf_field_names + and pdf_field not in used_pdf + ): + matched[pdf_field] = extracted[json_key] + used_pdf.add(pdf_field) + used_json.add(json_key) + + # ── Pass 2: exact / alias / fuzzy for remaining keys ───────────────── + remaining_pdf = [f for f in pdf_field_names if f not in used_pdf] + + for json_key, value in extracted.items(): + if json_key in used_json: + continue + + result = self._find_match(json_key, remaining_pdf) + + if result is None: + continue # will end up in positional fallback + + if isinstance(result, list): + # ambiguous: multiple close candidates — use the first, warn + 
ambiguous.append((json_key, result)) + best = result[0] + else: + best = result + + matched[best] = value + used_pdf.add(best) + used_json.add(json_key) + remaining_pdf = [f for f in remaining_pdf if f != best] + + # ── Required-field warnings ─────────────────────────────────────────── + for req in self._required: + if req not in used_json: + warnings.append(f"Required field not mapped: {req!r}") + + unmapped_json = [k for k in extracted if k not in used_json] + unmapped_pdf = [f for f in pdf_field_names if f not in used_pdf] + positional_vals = [extracted[k] for k in unmapped_json] + + return MappingResult( + matched=matched, + positional_values=positional_vals, + unmapped_json_keys=unmapped_json, + unmapped_pdf_fields=unmapped_pdf, + ambiguous=ambiguous, + warnings=warnings, + ) + + # ── private ─────────────────────────────────────────────────────────────── + + def _normalize(self, s: str) -> set: + """Split camelCase/PascalCase, lowercase, strip punctuation, return token set.""" + # Insert space before each uppercase letter that follows a lowercase letter + # so "EmployeeEmail" → "Employee Email" + s = re.sub(r"([a-z])([A-Z])", r"\1 \2", s) + s = s.lower() + s = re.sub(r"[^a-z0-9\s]", " ", s) + return set(s.split()) + + def _similarity(self, a: str, b: str) -> float: + """Jaccard similarity between token sets of two strings.""" + ta = self._normalize(a) + tb = self._normalize(b) + if not ta or not tb: + return 0.0 + return len(ta & tb) / len(ta | tb) + + def _find_match(self, json_key: str, pdf_fields: list): + """ + Returns + ------- + str : single unambiguous best match + list[str] : multiple candidates above threshold (ambiguous) + None : no match found + """ + # 1. Exact match (case-insensitive) + for pdf_f in pdf_fields: + if json_key.strip().lower() == pdf_f.strip().lower(): + return pdf_f + + # 2. 
Alias exact match + for alias in self._aliases.get(json_key, []): + for pdf_f in pdf_fields: + if alias.strip().lower() == pdf_f.strip().lower(): + return pdf_f + + # 3. Fuzzy token-overlap — try json_key AND any aliases vs each pdf field + candidates_to_try = [json_key] + self._aliases.get(json_key, []) + scored = [] + for pdf_f in pdf_fields: + best_score = max( + self._similarity(c, pdf_f) for c in candidates_to_try + ) + if best_score >= self.FUZZY_THRESHOLD: + scored.append((best_score, pdf_f)) + + if not scored: + return None + + scored.sort(key=lambda x: -x[0]) + top_score = scored[0][0] + top_candidates = [f for s, f in scored if top_score - s < self.AMBIGUITY_MARGIN] + + return top_candidates[0] if len(top_candidates) == 1 else top_candidates diff --git a/src/test/test_semantic_mapper.py b/src/test/test_semantic_mapper.py new file mode 100644 index 0000000..54e00cd --- /dev/null +++ b/src/test/test_semantic_mapper.py @@ -0,0 +1,113 @@ +from src.semantic_mapper import SemanticMapper, MappingResult + + +# ── helpers ─────────────────────────────────────────────────────────────────── + +def make_mapper(config=None): + return SemanticMapper(config) + + +# ── exact match ─────────────────────────────────────────────────────────────── + +def test_exact_case_insensitive_match(): + mapper = make_mapper() + extracted = {"Employee's name": "John Doe"} + pdf_fields = ["employee's name"] + result = mapper.map(extracted, pdf_fields) + assert "employee's name" in result.matched + assert result.matched["employee's name"] == "John Doe" + assert result.unmapped_json_keys == [] + + +# ── explicit config mapping ─────────────────────────────────────────────────── + +def test_explicit_config_mapping(): + config = {"field_mappings": {"Employee's name": "EmployeeName"}} + mapper = make_mapper(config) + extracted = {"Employee's name": "Jane Doe"} + pdf_fields = ["EmployeeName", "Date"] + result = mapper.map(extracted, pdf_fields) + assert result.matched["EmployeeName"] == "Jane 
Doe" + + +# ── alias match ─────────────────────────────────────────────────────────────── + +def test_alias_match(): + config = {"aliases": {"Employee's name": ["worker name"]}} + mapper = make_mapper(config) + extracted = {"Employee's name": "Alice"} + pdf_fields = ["worker name"] + result = mapper.map(extracted, pdf_fields) + assert result.matched["worker name"] == "Alice" + + +# ── fuzzy match ─────────────────────────────────────────────────────────────── + +def test_fuzzy_match(): + mapper = make_mapper() + extracted = {"employee email": "test@test.com"} + pdf_fields = ["EmployeeEmail"] + result = mapper.map(extracted, pdf_fields) + assert "EmployeeEmail" in result.matched + assert result.matched["EmployeeEmail"] == "test@test.com" + + +# ── positional fallback ─────────────────────────────────────────────────────── + +def test_positional_fallback_for_unmatched_key(): + mapper = make_mapper() + extracted = {"xqzwrandom": "some_value"} + pdf_fields = ["Text1"] + result = mapper.map(extracted, pdf_fields) + assert "xqzwrandom" in result.unmapped_json_keys + assert "some_value" in result.positional_values + + +# ── required field warning ──────────────────────────────────────────────────── + +def test_required_field_warning_when_missing(): + config = {"required_fields": ["Date"]} + mapper = make_mapper(config) + extracted = {"Employee's name": "Bob"} + pdf_fields = ["employee s name"] + result = mapper.map(extracted, pdf_fields) + assert any("Date" in w for w in result.warnings) + + +def test_no_warning_when_required_field_matched(): + config = { + "required_fields": ["Employee's name"], + "field_mappings": {"Employee's name": "EmployeeName"}, + } + mapper = make_mapper(config) + extracted = {"Employee's name": "Bob"} + pdf_fields = ["EmployeeName"] + result = mapper.map(extracted, pdf_fields) + assert result.warnings == [] + + +# ── multiple fields, partial semantic match ─────────────────────────────────── + +def test_mixed_semantic_and_positional(): + mapper 
= make_mapper() + extracted = { + "employee email": "a@b.com", + "zzznomatch": "fallback_value", + } + pdf_fields = ["EmployeeEmail", "SomeOtherField"] + result = mapper.map(extracted, pdf_fields) + assert result.matched["EmployeeEmail"] == "a@b.com" + assert "fallback_value" in result.positional_values + + +# ── report output ───────────────────────────────────────────────────────────── + +def test_report_contains_key_sections(): + mapper = make_mapper() + extracted = {"employee name": "John"} + pdf_fields = ["employee name", "Date"] + result = mapper.map(extracted, pdf_fields) + report = result.report() + assert "Semantic Mapping Report" in report + assert "Matched" in report + assert "Unmapped PDF fields" in report From 623d12ff00e2cb94409e3035925c09395cb2dc03 Mon Sep 17 00:00:00 2001 From: Aryama Srivastav Date: Sat, 14 Mar 2026 23:14:23 +0530 Subject: [PATCH 2/4] fix: resolve semantic mapper and filler typing diagnostics --- src/filler.py | 7 ++++--- src/semantic_mapper.py | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/filler.py b/src/filler.py index d9c517d..2b557b7 100644 --- a/src/filler.py +++ b/src/filler.py @@ -1,5 +1,6 @@ +from typing import Any, Optional + from pdfrw import PdfReader, PdfWriter -from src.llm import LLM from src.semantic_mapper import SemanticMapper from datetime import datetime @@ -8,7 +9,7 @@ class Filler: def __init__(self): pass - def fill_form(self, pdf_form: str, llm: LLM, template_config: dict = None): + def fill_form(self, pdf_form: str, llm: Any, template_config: Optional[dict] = None): """ Fill a PDF form with values extracted by LLM. 
@@ -43,7 +44,7 @@ def fill_form(self, pdf_form: str, llm: LLM, template_config: dict = None): ordered_annots = [] pdf_field_names = [] - for page in pdf.pages: + for page in (pdf.pages or []): # type: ignore[operator] if page.Annots: sorted_annots = sorted( page.Annots, key=lambda a: (-float(a.Rect[1]), float(a.Rect[0])) diff --git a/src/semantic_mapper.py b/src/semantic_mapper.py index 0eaadcd..0634c57 100644 --- a/src/semantic_mapper.py +++ b/src/semantic_mapper.py @@ -12,7 +12,8 @@ """ import re -from dataclasses import dataclass, field +from dataclasses import dataclass +from typing import Optional @dataclass @@ -74,7 +75,7 @@ class SemanticMapper: FUZZY_THRESHOLD = 0.35 # Jaccard threshold for a fuzzy hit AMBIGUITY_MARGIN = 0.05 # Scores within this of the top are ambiguous - def __init__(self, template_config: dict = None): + def __init__(self, template_config: Optional[dict] = None): cfg = template_config or {} self._explicit: dict = cfg.get("field_mappings", {}) # json_key → pdf_field self._aliases: dict = cfg.get("aliases", {}) # json_key → [alias…] From b595b3301e3e931823fd9f93ffd29636218829b1 Mon Sep 17 00:00:00 2001 From: Aryama Srivastav Date: Tue, 17 Mar 2026 21:18:39 +0530 Subject: [PATCH 3/4] feat(validation): add Input -> Extraction -> JSON -> PDF gates with per-run report --- src/filler.py | 67 ++++++++++---- src/validation_gates.py | 164 +++++++++++++++++++++++++++++++++ tests/test_validation_gates.py | 57 ++++++++++++ 3 files changed, 269 insertions(+), 19 deletions(-) create mode 100644 src/validation_gates.py create mode 100644 tests/test_validation_gates.py diff --git a/src/filler.py b/src/filler.py index 2b557b7..d2b46c8 100644 --- a/src/filler.py +++ b/src/filler.py @@ -3,13 +3,20 @@ from pdfrw import PdfReader, PdfWriter from src.semantic_mapper import SemanticMapper from datetime import datetime +from src.validation_gates import ValidationGates class Filler: def __init__(self): pass - def fill_form(self, pdf_form: str, llm: Any, template_config: 
Optional[dict] = None): + def fill_form( + self, + pdf_form: str, + llm: Any, + template_config: Optional[dict] = None, + strict_validation: bool = True, + ): """ Fill a PDF form with values extracted by LLM. @@ -17,28 +24,34 @@ def fill_form(self, pdf_form: str, llm: Any, template_config: Optional[dict] = N Any unmatched fields fall back to visual-order positional assignment (top-to-bottom, left-to-right). - Parameters - ---------- - pdf_form : path to the input PDF template - llm : configured LLM instance (main_loop not yet called) - template_config : optional per-template mapping hints, e.g. - { - "field_mappings": {"Employee's name": "EmployeeName"}, - "aliases": {"Employee's name": ["name"]}, - "required_fields": ["Employee's name", "Date"] - } + Validation gates: + 1) Input -> Extraction + 2) Extraction -> JSON + 3) JSON -> PDF + + If strict_validation=True and any gate fails, PDF is not written. """ - output_pdf = ( - pdf_form[:-4] - + "_" - + datetime.now().strftime("%Y%m%d_%H%M%S") - + "_filled.pdf" - ) + cfg = template_config or {} + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + + output_pdf = pdf_form[:-4] + "_" + ts + "_filled.pdf" + validation_report_path = pdf_form[:-4] + "_" + ts + "_validation_report.json" + + validation_report = ValidationGates.new_report(source_pdf=pdf_form) # ── 1. Extract structured data from LLM ────────────────────────────── t2j = llm.main_loop() textbox_answers = t2j.get_data() # {json_key: value} + gate_1 = ValidationGates.input_to_extraction(pdf_form, llm, textbox_answers) + validation_report.add_gate(gate_1) + + gate_2 = ValidationGates.extraction_to_json( + textbox_answers if isinstance(textbox_answers, dict) else {}, + cfg, + ) + validation_report.add_gate(gate_2) + # ── 2. 
Collect PDF widgets in visual order (global across pages) ────── pdf = PdfReader(pdf_form) ordered_annots = [] @@ -51,15 +64,29 @@ def fill_form(self, pdf_form: str, llm: Any, template_config: Optional[dict] = N ) for annot in sorted_annots: if annot.Subtype == "/Widget" and annot.T: - # pdfrw wraps field names in parens: e.g. '(EmployeeName)' pdf_field_names.append(annot.T[1:-1]) ordered_annots.append(annot) # ── 3. Semantic mapping ─────────────────────────────────────────────── - mapper = SemanticMapper(template_config) + mapper = SemanticMapper(cfg) result = mapper.map(textbox_answers, pdf_field_names) print(result.report()) + gate_3 = ValidationGates.json_to_pdf( + textbox_answers if isinstance(textbox_answers, dict) else {}, + pdf_field_names, + result, + cfg, + ) + validation_report.add_gate(gate_3) + + # Block final output if validation fails + if strict_validation and not validation_report.passed: + validation_report.write(validation_report_path, output_pdf=None) + raise ValueError( + f"Validation failed. Report generated at: {validation_report_path}" + ) + # ── 4. 
Fill: semantic matches first, positional fallback for the rest ─ positional_idx = 0 for annot, pdf_field in zip(ordered_annots, pdf_field_names): @@ -72,4 +99,6 @@ def fill_form(self, pdf_form: str, llm: Any, template_config: Optional[dict] = N positional_idx += 1 PdfWriter().write(output_pdf, pdf) + validation_report.write(validation_report_path, output_pdf=output_pdf) + return output_pdf diff --git a/src/validation_gates.py b/src/validation_gates.py new file mode 100644 index 0000000..23557cb --- /dev/null +++ b/src/validation_gates.py @@ -0,0 +1,164 @@ +from __future__ import annotations + +import json +import os +from dataclasses import dataclass, field, asdict +from datetime import datetime +from typing import Any + + +@dataclass +class GateResult: + name: str + passed: bool + reason_codes: list[str] = field(default_factory=list) + details: dict[str, Any] = field(default_factory=dict) + + +@dataclass +class ValidationReport: + run_id: str + created_at: str + source_pdf: str + gates: list[GateResult] = field(default_factory=list) + output_pdf: str | None = None + + @property + def passed(self) -> bool: + return all(g.passed for g in self.gates) + + def add_gate(self, gate: GateResult) -> None: + self.gates.append(gate) + + def to_dict(self) -> dict[str, Any]: + payload = asdict(self) + payload["passed"] = self.passed + return payload + + def write(self, report_path: str, output_pdf: str | None = None) -> str: + if output_pdf: + self.output_pdf = output_pdf + with open(report_path, "w", encoding="utf-8") as f: + json.dump(self.to_dict(), f, indent=2) + return report_path + + +class ValidationGates: + @staticmethod + def input_to_extraction(pdf_form: str, llm: Any, extracted: Any) -> GateResult: + reasons: list[str] = [] + details: dict[str, Any] = {} + + if not pdf_form or not os.path.exists(pdf_form): + reasons.append("INPUT_PDF_NOT_FOUND") + + if llm is None or not hasattr(llm, "main_loop"): + reasons.append("LLM_NOT_CONFIGURED") + + if not 
isinstance(extracted, dict): + reasons.append("EXTRACTION_NOT_DICT") + elif not extracted: + reasons.append("EXTRACTION_EMPTY") + + details["extracted_key_count"] = len(extracted) if isinstance(extracted, dict) else 0 + + return GateResult( + name="Input -> Extraction", + passed=len(reasons) == 0, + reason_codes=reasons, + details=details, + ) + + @staticmethod + def extraction_to_json(extracted: dict[str, Any], template_config: dict[str, Any]) -> GateResult: + reasons: list[str] = [] + details: dict[str, Any] = {} + required_fields: list[str] = template_config.get("required_fields", []) if template_config else [] + + missing_required = [k for k in required_fields if not extracted.get(k)] + if missing_required: + reasons.append("MANDATORY_FIELDS_MISSING") + + non_empty_count = sum(1 for _, v in extracted.items() if v not in (None, "", [])) + total_count = len(extracted) + completeness_ratio = (non_empty_count / total_count) if total_count else 0.0 + + min_ratio = float(template_config.get("min_completeness_ratio", 0.8)) if template_config else 0.8 + if total_count == 0 or completeness_ratio < min_ratio: + reasons.append("COMPLETENESS_BELOW_THRESHOLD") + + details.update( + { + "total_extracted_fields": total_count, + "non_empty_fields": non_empty_count, + "completeness_ratio": round(completeness_ratio, 4), + "min_required_ratio": min_ratio, + "missing_required_fields": missing_required, + } + ) + + return GateResult( + name="Extraction -> JSON", + passed=len(reasons) == 0, + reason_codes=reasons, + details=details, + ) + + @staticmethod + def json_to_pdf( + extracted: dict[str, Any], + pdf_field_names: list[str], + mapping_result: Any, + template_config: dict[str, Any], + ) -> GateResult: + reasons: list[str] = [] + details: dict[str, Any] = {} + + matched: dict[str, Any] = getattr(mapping_result, "matched", {}) or {} + positional_values: list[Any] = getattr(mapping_result, "positional_values", []) or [] + + unmatched_pdf_fields = [f for f in pdf_field_names if f 
not in matched] + required_pdf_fields: list[str] = template_config.get("required_pdf_fields", []) if template_config else [] + missing_required_pdf = [f for f in required_pdf_fields if f not in matched] + + if not pdf_field_names: + reasons.append("PDF_WIDGETS_NOT_FOUND") + + if missing_required_pdf: + reasons.append("MANDATORY_PDF_FIELDS_UNMATCHED") + + if positional_values: + reasons.append("POSITIONAL_FALLBACK_USED") + + if unmatched_pdf_fields: + reasons.append("PDF_FIELDS_UNMATCHED") + + # Simple mismatch signal between extracted volume and semantic matches + if len(extracted) > 0 and len(matched) == 0: + reasons.append("JSON_TO_PDF_MAPPING_EMPTY") + + details.update( + { + "pdf_field_count": len(pdf_field_names), + "matched_pdf_fields_count": len(matched), + "unmatched_pdf_fields": unmatched_pdf_fields, + "missing_required_pdf_fields": missing_required_pdf, + "positional_fallback_count": len(positional_values), + } + ) + + return GateResult( + name="JSON -> PDF", + passed=len(reasons) == 0, + reason_codes=reasons, + details=details, + ) + + @staticmethod + def new_report(source_pdf: str) -> ValidationReport: + run_id = datetime.now().strftime("%Y%m%d_%H%M%S") + return ValidationReport( + run_id=run_id, + created_at=datetime.now().isoformat(), + source_pdf=source_pdf, + ) \ No newline at end of file diff --git a/tests/test_validation_gates.py b/tests/test_validation_gates.py new file mode 100644 index 0000000..2f74adf --- /dev/null +++ b/tests/test_validation_gates.py @@ -0,0 +1,57 @@ +from types import SimpleNamespace + +from src.validation_gates import ValidationGates + + +def test_input_to_extraction_pass(tmp_path): + pdf = tmp_path / "form.pdf" + pdf.write_text("dummy") + llm = SimpleNamespace(main_loop=lambda: None) + extracted = {"name": "Aryama"} + + gate = ValidationGates.input_to_extraction(str(pdf), llm, extracted) + + assert gate.passed is True + assert gate.reason_codes == [] + + +def test_input_to_extraction_fail_missing_pdf_and_bad_extraction(): 
+ gate = ValidationGates.input_to_extraction( + "C:/does-not-exist/form.pdf", + llm=None, + extracted=[], + ) + + assert gate.passed is False + assert "INPUT_PDF_NOT_FOUND" in gate.reason_codes + assert "LLM_NOT_CONFIGURED" in gate.reason_codes + assert "EXTRACTION_NOT_DICT" in gate.reason_codes + + +def test_extraction_to_json_mandatory_and_completeness_checks(): + extracted = {"name": "Aryama", "email": ""} + cfg = {"required_fields": ["name", "email"], "min_completeness_ratio": 1.0} + + gate = ValidationGates.extraction_to_json(extracted, cfg) + + assert gate.passed is False + assert "MANDATORY_FIELDS_MISSING" in gate.reason_codes + assert "COMPLETENESS_BELOW_THRESHOLD" in gate.reason_codes + + +def test_json_to_pdf_detects_mismatch_and_positional(): + extracted = {"name": "Aryama", "email": "a@x.com"} + pdf_fields = ["full_name", "email_address"] + + mapping_result = SimpleNamespace( + matched={"full_name": "Aryama"}, + positional_values=["a@x.com"], + ) + cfg = {"required_pdf_fields": ["full_name", "email_address"]} + + gate = ValidationGates.json_to_pdf(extracted, pdf_fields, mapping_result, cfg) + + assert gate.passed is False + assert "MANDATORY_PDF_FIELDS_UNMATCHED" in gate.reason_codes + assert "POSITIONAL_FALLBACK_USED" in gate.reason_codes + assert "PDF_FIELDS_UNMATCHED" in gate.reason_codes \ No newline at end of file From f4cf5542af62004ff6136d492ba9c055d4de55a3 Mon Sep 17 00:00:00 2001 From: Aryama Srivastav Date: Tue, 17 Mar 2026 22:40:56 +0530 Subject: [PATCH 4/4] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/filler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/filler.py b/src/filler.py index d2b46c8..3c54089 100644 --- a/src/filler.py +++ b/src/filler.py @@ -1,9 +1,9 @@ from typing import Any, Optional from pdfrw import PdfReader, PdfWriter -from src.semantic_mapper import SemanticMapper +from semantic_mapper import 
SemanticMapper from datetime import datetime -from src.validation_gates import ValidationGates +from validation_gates import ValidationGates class Filler: