From 5bbbcf33b82b7c36395b1e06d458dac030e0da8b Mon Sep 17 00:00:00 2001
From: Aryama Srivastav <coderbooth@gmail.com>
Date: Sat, 14 Mar 2026 23:01:01 +0530
Subject: [PATCH 1/4] feat: add semantic key-based mapping between extracted
 JSON and PDF widgets

---
 src/filler.py                    |  74 +++++++++++
 src/semantic_mapper.py           | 212 +++++++++++++++++++++++++++++++
 src/test/test_semantic_mapper.py | 113 ++++++++++++++++
 3 files changed, 399 insertions(+)
 create mode 100644 src/filler.py
 create mode 100644 src/semantic_mapper.py
 create mode 100644 src/test/test_semantic_mapper.py

diff --git a/src/filler.py b/src/filler.py
new file mode 100644
index 0000000..d9c517d
--- /dev/null
+++ b/src/filler.py
@@ -0,0 +1,74 @@
+from pdfrw import PdfReader, PdfWriter
+from src.llm import LLM
+from src.semantic_mapper import SemanticMapper
+from datetime import datetime
+
+
+class Filler:
+    def __init__(self):
+        pass
+
+    def fill_form(self, pdf_form: str, llm: LLM, template_config: dict = None):
+        """
+        Fill a PDF form with values extracted by LLM.
+
+        Fields are matched semantically (JSON key ↔ PDF widget name) first.
+        Any unmatched fields fall back to visual-order positional assignment
+        (top-to-bottom, left-to-right).
+
+        Parameters
+        ----------
+        pdf_form        : path to the input PDF template
+        llm             : configured LLM instance (main_loop not yet called)
+        template_config : optional per-template mapping hints, e.g.
+                          {
+                            "field_mappings": {"Employee's name": "EmployeeName"},
+                            "aliases":        {"Employee's name": ["name"]},
+                            "required_fields": ["Employee's name", "Date"]
+                          }
+        """
+        output_pdf = (
+            pdf_form[:-4]
+            + "_"
+            + datetime.now().strftime("%Y%m%d_%H%M%S")
+            + "_filled.pdf"
+        )
+
+        # ── 1. Extract structured data from LLM ──────────────────────────────
+        t2j = llm.main_loop()
+        textbox_answers = t2j.get_data()  # {json_key: value}
+
+        # ── 2. Collect PDF widgets in visual order (global across pages) ──────
+        pdf = PdfReader(pdf_form)
+        ordered_annots = []
+        pdf_field_names = []
+
+        for page in pdf.pages:
+            if page.Annots:
+                sorted_annots = sorted(
+                    page.Annots, key=lambda a: (-float(a.Rect[1]), float(a.Rect[0]))
+                )
+                for annot in sorted_annots:
+                    if annot.Subtype == "/Widget" and annot.T:
+                        # pdfrw wraps field names in parens: e.g. '(EmployeeName)'
+                        pdf_field_names.append(annot.T[1:-1])
+                        ordered_annots.append(annot)
+
+        # ── 3. Semantic mapping ───────────────────────────────────────────────
+        mapper = SemanticMapper(template_config)
+        result = mapper.map(textbox_answers, pdf_field_names)
+        print(result.report())
+
+        # ── 4. Fill: semantic matches first, positional fallback for the rest ─
+        positional_idx = 0
+        for annot, pdf_field in zip(ordered_annots, pdf_field_names):
+            if pdf_field in result.matched:
+                annot.V = f"{result.matched[pdf_field]}"
+                annot.AP = None
+            elif positional_idx < len(result.positional_values):
+                annot.V = f"{result.positional_values[positional_idx]}"
+                annot.AP = None
+                positional_idx += 1
+
+        PdfWriter().write(output_pdf, pdf)
+        return output_pdf
diff --git a/src/semantic_mapper.py b/src/semantic_mapper.py
new file mode 100644
index 0000000..0eaadcd
--- /dev/null
+++ b/src/semantic_mapper.py
@@ -0,0 +1,212 @@
+"""
+Semantic Mapping Layer
+----------------------
+Matches extracted JSON keys to PDF form field names using:
+  1. Explicit mappings from a per-template config
+  2. Case-insensitive exact match
+  3. Alias match (from template config)
+  4. Fuzzy token-overlap (Jaccard similarity)
+  5. Positional fallback for any remaining unmatched pairs
+
+Returns a MappingResult with matched values, warnings, and a printable report.
+"""
+
+import re
+from dataclasses import dataclass, field
+
+
+@dataclass
+class MappingResult:
+    """Holds the outcome of one semantic mapping run."""
+
+    matched: dict           # {pdf_field_name: value}  — semantically placed
+    positional_values: list # values for JSON keys that had no semantic PDF match
+    unmapped_json_keys: list
+    unmapped_pdf_fields: list
+    ambiguous: list         # [(json_key, [candidate_pdf_fields])]
+    warnings: list          # human-readable warning strings
+
+    def report(self) -> str:
+        lines = [
+            "=== Semantic Mapping Report ===",
+            f"  Matched (semantic):   {len(self.matched)}",
+            f"  Positional fallback:  {len(self.positional_values)}",
+            f"  Unmapped JSON keys:   {len(self.unmapped_json_keys)}",
+            f"  Unmapped PDF fields:  {len(self.unmapped_pdf_fields)}",
+            f"  Ambiguous:            {len(self.ambiguous)}",
+        ]
+        if self.matched:
+            lines.append("\n  Semantic matches:")
+            for pdf_f, val in self.matched.items():
+                lines.append(f"    {pdf_f!r}  ←  {val!r}")
+        if self.ambiguous:
+            lines.append("\n  Ambiguous (best candidate used):")
+            for json_key, candidates in self.ambiguous:
+                lines.append(f"    {json_key!r}  →  {candidates}")
+        if self.unmapped_json_keys:
+            lines.append("\n  Unmapped JSON keys (positional fallback):")
+            for k in self.unmapped_json_keys:
+                lines.append(f"    - {k!r}")
+        if self.unmapped_pdf_fields:
+            lines.append("\n  Unmapped PDF fields (left blank):")
+            for f in self.unmapped_pdf_fields:
+                lines.append(f"    - {f!r}")
+        if self.warnings:
+            lines.append("\n  Warnings:")
+            for w in self.warnings:
+                lines.append(f"    ⚠  {w}")
+        lines.append("================================")
+        return "\n".join(lines)
+
+
+class SemanticMapper:
+    """
+    Maps extracted JSON keys to PDF widget field names.
+
+    template_config schema (all keys optional):
+    {
+        "field_mappings": {"Employee's name": "EmployeeName"},
+        "aliases":         {"Employee's name": ["name", "worker name"]},
+        "required_fields": ["Employee's name", "Date"]
+    }
+    """
+
+    FUZZY_THRESHOLD = 0.35          # Jaccard threshold for a fuzzy hit
+    AMBIGUITY_MARGIN = 0.05         # Scores within this of the top are ambiguous
+
+    def __init__(self, template_config: dict = None):
+        cfg = template_config or {}
+        self._explicit: dict  = cfg.get("field_mappings", {})  # json_key → pdf_field
+        self._aliases: dict   = cfg.get("aliases", {})         # json_key → [alias…]
+        self._required: list  = cfg.get("required_fields", [])
+
+    # ── public ───────────────────────────────────────────────────────────────
+
+    def map(self, extracted: dict, pdf_field_names: list) -> MappingResult:
+        """
+        Match extracted JSON keys to PDF widget field names.
+
+        Parameters
+        ----------
+        extracted       : dict returned by LLM.get_data()   {json_key: value}
+        pdf_field_names : ordered list of PDF widget names (annot.T stripped)
+
+        Returns
+        -------
+        MappingResult
+        """
+        matched: dict    = {}   # pdf_field_name → value
+        used_pdf: set    = set()
+        used_json: set   = set()
+        ambiguous: list  = []
+        warnings: list   = []
+
+        # ── Pass 1: explicit config mappings ─────────────────────────────────
+        for json_key, pdf_field in self._explicit.items():
+            if (
+                json_key in extracted
+                and pdf_field in pdf_field_names
+                and pdf_field not in used_pdf
+            ):
+                matched[pdf_field] = extracted[json_key]
+                used_pdf.add(pdf_field)
+                used_json.add(json_key)
+
+        # ── Pass 2: exact / alias / fuzzy for remaining keys ─────────────────
+        remaining_pdf = [f for f in pdf_field_names if f not in used_pdf]
+
+        for json_key, value in extracted.items():
+            if json_key in used_json:
+                continue
+
+            result = self._find_match(json_key, remaining_pdf)
+
+            if result is None:
+                continue  # will end up in positional fallback
+
+            if isinstance(result, list):
+                # ambiguous: multiple close candidates — use the first, warn
+                ambiguous.append((json_key, result))
+                best = result[0]
+            else:
+                best = result
+
+            matched[best] = value
+            used_pdf.add(best)
+            used_json.add(json_key)
+            remaining_pdf = [f for f in remaining_pdf if f != best]
+
+        # ── Required-field warnings ───────────────────────────────────────────
+        for req in self._required:
+            if req not in used_json:
+                warnings.append(f"Required field not mapped: {req!r}")
+
+        unmapped_json   = [k for k in extracted if k not in used_json]
+        unmapped_pdf    = [f for f in pdf_field_names if f not in used_pdf]
+        positional_vals = [extracted[k] for k in unmapped_json]
+
+        return MappingResult(
+            matched=matched,
+            positional_values=positional_vals,
+            unmapped_json_keys=unmapped_json,
+            unmapped_pdf_fields=unmapped_pdf,
+            ambiguous=ambiguous,
+            warnings=warnings,
+        )
+
+    # ── private ───────────────────────────────────────────────────────────────
+
+    def _normalize(self, s: str) -> set:
+        """Return the set of lowercase alphanumeric tokens found in ``s``."""
+        # Only lowercase→uppercase boundaries are split ("EmployeeEmail" →
+        # "Employee Email"); runs like "PDFField" or "Text1" stay one token.
+        spaced = re.sub(r"([a-z])([A-Z])", r"\1 \2", s)
+        # Lowercase after splitting, then turn every other character into a gap.
+        cleaned = re.sub(r"[^a-z0-9\s]", " ", spaced.lower())
+        return set(cleaned.split())
+
+    def _similarity(self, a: str, b: str) -> float:
+        """Return |A ∩ B| / |A ∪ B| for the normalized token sets of a and b."""
+        left, right = self._normalize(a), self._normalize(b)
+        if not (left and right):
+            # An empty token set has no overlap to measure.
+            return 0.0
+        return len(left.intersection(right)) / len(left.union(right))
+
+    def _find_match(self, json_key: str, pdf_fields: list):
+        """Find the PDF field for ``json_key``: exact, then alias, then fuzzy.
+
+        Returns
+        -------
+        str        : exact/alias hit, or the only fuzzy candidate near the top
+        list[str]  : fuzzy candidates within AMBIGUITY_MARGIN of the top, best first
+        None       : no field is equal or reaches FUZZY_THRESHOLD
+        """
+        # 1. Exact match (case-insensitive, surrounding whitespace ignored)
+        for pdf_f in pdf_fields:
+            if json_key.strip().lower() == pdf_f.strip().lower():
+                return pdf_f
+
+        # 2. Alias exact match — aliases in config order, first hit wins
+        for alias in self._aliases.get(json_key, []):
+            for pdf_f in pdf_fields:
+                if alias.strip().lower() == pdf_f.strip().lower():
+                    return pdf_f
+
+        # 3. Fuzzy token-overlap — try json_key AND any aliases vs each pdf field
+        candidates_to_try = [json_key] + self._aliases.get(json_key, [])
+        scored = []
+        for pdf_f in pdf_fields:
+            best_score = max(
+                self._similarity(c, pdf_f) for c in candidates_to_try
+            )
+            if best_score >= self.FUZZY_THRESHOLD:
+                scored.append((best_score, pdf_f))
+
+        if not scored:
+            return None
+
+        scored.sort(key=lambda x: -x[0])  # stable: ties keep pdf_fields order
+        top_score = scored[0][0]
+        top_candidates = [f for s, f in scored if top_score - s < self.AMBIGUITY_MARGIN]
+        return top_candidates[0] if len(top_candidates) == 1 else top_candidates
diff --git a/src/test/test_semantic_mapper.py b/src/test/test_semantic_mapper.py
new file mode 100644
index 0000000..54e00cd
--- /dev/null
+++ b/src/test/test_semantic_mapper.py
@@ -0,0 +1,113 @@
+from src.semantic_mapper import SemanticMapper, MappingResult
+
+
+# ── helpers ───────────────────────────────────────────────────────────────────
+
+def make_mapper(config=None):
+    return SemanticMapper(config)
+
+
+# ── exact match ───────────────────────────────────────────────────────────────
+
+def test_exact_case_insensitive_match():
+    mapper = make_mapper()
+    extracted = {"Employee's name": "John Doe"}
+    pdf_fields = ["employee's name"]
+    result = mapper.map(extracted, pdf_fields)
+    assert "employee's name" in result.matched
+    assert result.matched["employee's name"] == "John Doe"
+    assert result.unmapped_json_keys == []
+
+
+# ── explicit config mapping ───────────────────────────────────────────────────
+
+def test_explicit_config_mapping():
+    config = {"field_mappings": {"Employee's name": "EmployeeName"}}
+    mapper = make_mapper(config)
+    extracted = {"Employee's name": "Jane Doe"}
+    pdf_fields = ["EmployeeName", "Date"]
+    result = mapper.map(extracted, pdf_fields)
+    assert result.matched["EmployeeName"] == "Jane Doe"
+
+
+# ── alias match ───────────────────────────────────────────────────────────────
+
+def test_alias_match():
+    config = {"aliases": {"Employee's name": ["worker name"]}}
+    mapper = make_mapper(config)
+    extracted = {"Employee's name": "Alice"}
+    pdf_fields = ["worker name"]
+    result = mapper.map(extracted, pdf_fields)
+    assert result.matched["worker name"] == "Alice"
+
+
+# ── fuzzy match ───────────────────────────────────────────────────────────────
+
+def test_fuzzy_match():
+    mapper = make_mapper()
+    extracted = {"employee email": "test@test.com"}
+    pdf_fields = ["EmployeeEmail"]
+    result = mapper.map(extracted, pdf_fields)
+    assert "EmployeeEmail" in result.matched
+    assert result.matched["EmployeeEmail"] == "test@test.com"
+
+
+# ── positional fallback ───────────────────────────────────────────────────────
+
+def test_positional_fallback_for_unmatched_key():
+    mapper = make_mapper()
+    extracted = {"xqzwrandom": "some_value"}
+    pdf_fields = ["Text1"]
+    result = mapper.map(extracted, pdf_fields)
+    assert "xqzwrandom" in result.unmapped_json_keys
+    assert "some_value" in result.positional_values
+
+
+# ── required field warning ────────────────────────────────────────────────────
+
+def test_required_field_warning_when_missing():
+    config = {"required_fields": ["Date"]}
+    mapper = make_mapper(config)
+    extracted = {"Employee's name": "Bob"}
+    pdf_fields = ["employee s name"]
+    result = mapper.map(extracted, pdf_fields)
+    assert any("Date" in w for w in result.warnings)
+
+
+def test_no_warning_when_required_field_matched():
+    config = {
+        "required_fields": ["Employee's name"],
+        "field_mappings": {"Employee's name": "EmployeeName"},
+    }
+    mapper = make_mapper(config)
+    extracted = {"Employee's name": "Bob"}
+    pdf_fields = ["EmployeeName"]
+    result = mapper.map(extracted, pdf_fields)
+    assert result.warnings == []
+
+
+# ── multiple fields, partial semantic match ───────────────────────────────────
+
+def test_mixed_semantic_and_positional():
+    mapper = make_mapper()
+    extracted = {
+        "employee email": "a@b.com",
+        "zzznomatch": "fallback_value",
+    }
+    pdf_fields = ["EmployeeEmail", "SomeOtherField"]
+    result = mapper.map(extracted, pdf_fields)
+    assert result.matched["EmployeeEmail"] == "a@b.com"
+    assert "fallback_value" in result.positional_values
+
+
+# ── report output ─────────────────────────────────────────────────────────────
+
+def test_report_contains_key_sections():
+    mapper = make_mapper()
+    extracted = {"employee name": "John"}
+    pdf_fields = ["employee name", "Date"]
+    result = mapper.map(extracted, pdf_fields)
+    report = result.report()
+    assert "Semantic Mapping Report" in report
+    assert "Matched" in report
+    assert "Unmapped PDF fields" in report

From 623d12ff00e2cb94409e3035925c09395cb2dc03 Mon Sep 17 00:00:00 2001
From: Aryama Srivastav <coderbooth@gmail.com>
Date: Sat, 14 Mar 2026 23:14:23 +0530
Subject: [PATCH 2/4] fix: resolve semantic mapper and filler typing
 diagnostics

---
 src/filler.py          | 7 ++++---
 src/semantic_mapper.py | 5 +++--
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/filler.py b/src/filler.py
index d9c517d..2b557b7 100644
--- a/src/filler.py
+++ b/src/filler.py
@@ -1,5 +1,6 @@
+from typing import Any, Optional
+
 from pdfrw import PdfReader, PdfWriter
-from src.llm import LLM
 from src.semantic_mapper import SemanticMapper
 from datetime import datetime
 
@@ -8,7 +9,7 @@ class Filler:
     def __init__(self):
         pass
 
-    def fill_form(self, pdf_form: str, llm: LLM, template_config: dict = None):
+    def fill_form(self, pdf_form: str, llm: Any, template_config: Optional[dict] = None):
         """
         Fill a PDF form with values extracted by LLM.
 
@@ -43,7 +44,7 @@ def fill_form(self, pdf_form: str, llm: LLM, template_config: dict = None):
         ordered_annots = []
         pdf_field_names = []
 
-        for page in pdf.pages:
+        for page in (pdf.pages or []):  # type: ignore[operator]
             if page.Annots:
                 sorted_annots = sorted(
                     page.Annots, key=lambda a: (-float(a.Rect[1]), float(a.Rect[0]))
diff --git a/src/semantic_mapper.py b/src/semantic_mapper.py
index 0eaadcd..0634c57 100644
--- a/src/semantic_mapper.py
+++ b/src/semantic_mapper.py
@@ -12,7 +12,8 @@
 """
 
 import re
-from dataclasses import dataclass, field
+from dataclasses import dataclass
+from typing import Optional
 
 
 @dataclass
@@ -74,7 +75,7 @@ class SemanticMapper:
     FUZZY_THRESHOLD = 0.35          # Jaccard threshold for a fuzzy hit
     AMBIGUITY_MARGIN = 0.05         # Scores within this of the top are ambiguous
 
-    def __init__(self, template_config: dict = None):
+    def __init__(self, template_config: Optional[dict] = None):
         cfg = template_config or {}
         self._explicit: dict  = cfg.get("field_mappings", {})  # json_key → pdf_field
         self._aliases: dict   = cfg.get("aliases", {})         # json_key → [alias…]

From b595b3301e3e931823fd9f93ffd29636218829b1 Mon Sep 17 00:00:00 2001
From: Aryama Srivastav <coderbooth@gmail.com>
Date: Tue, 17 Mar 2026 21:18:39 +0530
Subject: [PATCH 3/4] feat(validation): add Input->Extraction->JSON->PDF gates
 with per-run report

---
 src/filler.py                  |  67 ++++++++++----
 src/validation_gates.py        | 164 +++++++++++++++++++++++++++++++++
 tests/test_validation_gates.py |  57 ++++++++++++
 3 files changed, 269 insertions(+), 19 deletions(-)
 create mode 100644 src/validation_gates.py
 create mode 100644 tests/test_validation_gates.py

diff --git a/src/filler.py b/src/filler.py
index 2b557b7..d2b46c8 100644
--- a/src/filler.py
+++ b/src/filler.py
@@ -3,13 +3,20 @@
 from pdfrw import PdfReader, PdfWriter
 from src.semantic_mapper import SemanticMapper
 from datetime import datetime
+from src.validation_gates import ValidationGates
 
 
 class Filler:
     def __init__(self):
         pass
 
-    def fill_form(self, pdf_form: str, llm: Any, template_config: Optional[dict] = None):
+    def fill_form(
+        self,
+        pdf_form: str,
+        llm: Any,
+        template_config: Optional[dict] = None,
+        strict_validation: bool = True,
+    ):
         """
         Fill a PDF form with values extracted by LLM.
 
@@ -17,28 +24,34 @@ def fill_form(self, pdf_form: str, llm: Any, template_config: Optional[dict] = N
         Any unmatched fields fall back to visual-order positional assignment
         (top-to-bottom, left-to-right).
 
-        Parameters
-        ----------
-        pdf_form        : path to the input PDF template
-        llm             : configured LLM instance (main_loop not yet called)
-        template_config : optional per-template mapping hints, e.g.
-                          {
-                            "field_mappings": {"Employee's name": "EmployeeName"},
-                            "aliases":        {"Employee's name": ["name"]},
-                            "required_fields": ["Employee's name", "Date"]
-                          }
+        Validation gates:
+          1) Input -> Extraction
+          2) Extraction -> JSON
+          3) JSON -> PDF
+
+        If strict_validation=True and any gate fails, PDF is not written.
         """
-        output_pdf = (
-            pdf_form[:-4]
-            + "_"
-            + datetime.now().strftime("%Y%m%d_%H%M%S")
-            + "_filled.pdf"
-        )
+        cfg = template_config or {}
+        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
+
+        output_pdf = pdf_form[:-4] + "_" + ts + "_filled.pdf"
+        validation_report_path = pdf_form[:-4] + "_" + ts + "_validation_report.json"
+
+        validation_report = ValidationGates.new_report(source_pdf=pdf_form)
 
         # ── 1. Extract structured data from LLM ──────────────────────────────
         t2j = llm.main_loop()
         textbox_answers = t2j.get_data()  # {json_key: value}
 
+        gate_1 = ValidationGates.input_to_extraction(pdf_form, llm, textbox_answers)
+        validation_report.add_gate(gate_1)
+
+        gate_2 = ValidationGates.extraction_to_json(
+            textbox_answers if isinstance(textbox_answers, dict) else {},
+            cfg,
+        )
+        validation_report.add_gate(gate_2)
+
         # ── 2. Collect PDF widgets in visual order (global across pages) ──────
         pdf = PdfReader(pdf_form)
         ordered_annots = []
@@ -51,15 +64,29 @@ def fill_form(self, pdf_form: str, llm: Any, template_config: Optional[dict] = N
                 )
                 for annot in sorted_annots:
                     if annot.Subtype == "/Widget" and annot.T:
-                        # pdfrw wraps field names in parens: e.g. '(EmployeeName)'
                         pdf_field_names.append(annot.T[1:-1])
                         ordered_annots.append(annot)
 
         # ── 3. Semantic mapping ───────────────────────────────────────────────
-        mapper = SemanticMapper(template_config)
-        result = mapper.map(textbox_answers, pdf_field_names)
+        mapper = SemanticMapper(cfg)
+        # A non-dict extraction is already reported by gate 1. Feed the
+        # mapper an empty dict in that case: mapper.map() calls .items() on
+        # its input, so the raw value would raise AttributeError before the
+        # strict-validation check below could write the report.
+        extracted = textbox_answers if isinstance(textbox_answers, dict) else {}
+        result = mapper.map(extracted, pdf_field_names)
         print(result.report())
 
+        gate_3 = ValidationGates.json_to_pdf(extracted, pdf_field_names, result, cfg)
+        validation_report.add_gate(gate_3)
+
+        # Block final output if validation fails
+        if strict_validation and not validation_report.passed:
+            validation_report.write(validation_report_path, output_pdf=None)
+            raise ValueError(
+                f"Validation failed. Report generated at: {validation_report_path}"
+            )
+
         # ── 4. Fill: semantic matches first, positional fallback for the rest ─
         positional_idx = 0
         for annot, pdf_field in zip(ordered_annots, pdf_field_names):
@@ -72,4 +99,6 @@ def fill_form(self, pdf_form: str, llm: Any, template_config: Optional[dict] = N
                 positional_idx += 1
 
         PdfWriter().write(output_pdf, pdf)
+        validation_report.write(validation_report_path, output_pdf=output_pdf)
+
         return output_pdf
diff --git a/src/validation_gates.py b/src/validation_gates.py
new file mode 100644
index 0000000..23557cb
--- /dev/null
+++ b/src/validation_gates.py
@@ -0,0 +1,164 @@
+from __future__ import annotations
+
+import json
+import os
+from dataclasses import dataclass, field, asdict
+from datetime import datetime
+from typing import Any
+
+
+@dataclass
+class GateResult:
+    name: str
+    passed: bool
+    reason_codes: list[str] = field(default_factory=list)
+    details: dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class ValidationReport:
+    run_id: str
+    created_at: str
+    source_pdf: str
+    gates: list[GateResult] = field(default_factory=list)
+    output_pdf: str | None = None
+
+    @property
+    def passed(self) -> bool:
+        return all(g.passed for g in self.gates)
+
+    def add_gate(self, gate: GateResult) -> None:
+        self.gates.append(gate)
+
+    def to_dict(self) -> dict[str, Any]:
+        payload = asdict(self)
+        payload["passed"] = self.passed
+        return payload
+
+    def write(self, report_path: str, output_pdf: str | None = None) -> str:
+        if output_pdf:
+            self.output_pdf = output_pdf
+        with open(report_path, "w", encoding="utf-8") as f:
+            json.dump(self.to_dict(), f, indent=2)
+        return report_path
+
+
+class ValidationGates:
+    @staticmethod
+    def input_to_extraction(pdf_form: str, llm: Any, extracted: Any) -> GateResult:
+        """Gate 1: check the source PDF path, the LLM handle and the extraction.
+
+        Reason codes, in reporting order: INPUT_PDF_NOT_FOUND,
+        LLM_NOT_CONFIGURED, then EXTRACTION_NOT_DICT or EXTRACTION_EMPTY.
+        """
+        is_mapping = isinstance(extracted, dict)
+        failures: list[str] = []
+        if not (pdf_form and os.path.exists(pdf_form)):
+            failures.append("INPUT_PDF_NOT_FOUND")
+        if not hasattr(llm, "main_loop"):  # also true for llm=None
+            failures.append("LLM_NOT_CONFIGURED")
+        if not is_mapping:
+            failures.append("EXTRACTION_NOT_DICT")
+        elif not extracted:
+            failures.append("EXTRACTION_EMPTY")
+
+        return GateResult(
+            name="Input -> Extraction",
+            passed=not failures,
+            reason_codes=failures,
+            details={"extracted_key_count": len(extracted) if is_mapping else 0},
+        )
+
+    @staticmethod
+    def extraction_to_json(extracted: dict[str, Any], template_config: dict[str, Any]) -> GateResult:
+        """Gate 2: check required keys and overall completeness of ``extracted``.
+
+        A value counts as empty only when it is None, "" or []; real but falsy
+        values such as 0 or False are present. Empty ``extracted`` always fails.
+        """
+        cfg = template_config or {}
+        required_fields: list[str] = cfg.get("required_fields", [])
+        empty_values = (None, "", [])
+        reasons: list[str] = []
+
+        # Same emptiness rule as the count below: 0/False are values, not gaps.
+        missing_required = [k for k in required_fields if extracted.get(k) in empty_values]
+        if missing_required:
+            reasons.append("MANDATORY_FIELDS_MISSING")
+
+        total_count = len(extracted)
+        non_empty_count = sum(1 for v in extracted.values() if v not in empty_values)
+        completeness_ratio = (non_empty_count / total_count) if total_count else 0.0
+        min_ratio = float(cfg.get("min_completeness_ratio", 0.8))
+        if total_count == 0 or completeness_ratio < min_ratio:
+            reasons.append("COMPLETENESS_BELOW_THRESHOLD")
+
+        details: dict[str, Any] = {
+            "total_extracted_fields": total_count,
+            "non_empty_fields": non_empty_count,
+            "completeness_ratio": round(completeness_ratio, 4),
+            "min_required_ratio": min_ratio,
+            "missing_required_fields": missing_required,
+        }
+        return GateResult(
+            name="Extraction -> JSON", passed=not reasons, reason_codes=reasons, details=details
+        )
+
+    @staticmethod
+    def json_to_pdf(
+        extracted: dict[str, Any],
+        pdf_field_names: list[str],
+        mapping_result: Any,
+        template_config: dict[str, Any],
+    ) -> GateResult:
+        """Gate 3: judge how well the mapping result covers the PDF widgets.
+
+        Only ``matched`` and ``positional_values`` are read from
+        ``mapping_result``; a missing or falsy attribute counts as empty.
+        The gate passes only when no reason code applies, so any positional
+        fallback value or any PDF field without a semantic match fails it.
+        """
+        cfg = template_config or {}
+        semantic: dict[str, Any] = getattr(mapping_result, "matched", {}) or {}
+        by_position: list[Any] = getattr(mapping_result, "positional_values", []) or []
+
+        without_match = [name for name in pdf_field_names if name not in semantic]
+        required_unmatched = [
+            name for name in cfg.get("required_pdf_fields", []) if name not in semantic
+        ]
+
+        # (reason code, failed?) pairs, listed in the order codes are reported.
+        checks = [
+            ("PDF_WIDGETS_NOT_FOUND", not pdf_field_names),
+            ("MANDATORY_PDF_FIELDS_UNMATCHED", required_unmatched),
+            ("POSITIONAL_FALLBACK_USED", by_position),
+            ("PDF_FIELDS_UNMATCHED", without_match),
+            # Something was extracted, yet nothing was placed semantically.
+            ("JSON_TO_PDF_MAPPING_EMPTY", len(extracted) > 0 and len(semantic) == 0),
+        ]
+        failures = [code for code, failed in checks if failed]
+
+        # Diagnostics attached to the gate result.
+        details: dict[str, Any] = {
+            "pdf_field_count": len(pdf_field_names),
+            "matched_pdf_fields_count": len(semantic),
+            "unmatched_pdf_fields": without_match,
+            "missing_required_pdf_fields": required_unmatched,
+            "positional_fallback_count": len(by_position),
+        }
+
+        return GateResult(
+            name="JSON -> PDF",
+            passed=not failures,
+            reason_codes=failures,
+            details=details,
+        )
+
+    @staticmethod
+    def new_report(source_pdf: str) -> ValidationReport:
+        """Create an empty report whose run_id and created_at share one clock reading."""
+        # Two separate now() calls could straddle a second and disagree.
+        now = datetime.now()
+        return ValidationReport(
+            run_id=now.strftime("%Y%m%d_%H%M%S"), created_at=now.isoformat(), source_pdf=source_pdf
+        )
\ No newline at end of file
diff --git a/tests/test_validation_gates.py b/tests/test_validation_gates.py
new file mode 100644
index 0000000..2f74adf
--- /dev/null
+++ b/tests/test_validation_gates.py
@@ -0,0 +1,57 @@
+from types import SimpleNamespace
+
+from src.validation_gates import ValidationGates
+
+
+def test_input_to_extraction_pass(tmp_path):
+    pdf = tmp_path / "form.pdf"
+    pdf.write_text("dummy")
+    llm = SimpleNamespace(main_loop=lambda: None)
+    extracted = {"name": "Aryama"}
+
+    gate = ValidationGates.input_to_extraction(str(pdf), llm, extracted)
+
+    assert gate.passed is True
+    assert gate.reason_codes == []
+
+
+def test_input_to_extraction_fail_missing_pdf_and_bad_extraction():
+    gate = ValidationGates.input_to_extraction(
+        "C:/does-not-exist/form.pdf",
+        llm=None,
+        extracted=[],
+    )
+
+    assert gate.passed is False
+    assert "INPUT_PDF_NOT_FOUND" in gate.reason_codes
+    assert "LLM_NOT_CONFIGURED" in gate.reason_codes
+    assert "EXTRACTION_NOT_DICT" in gate.reason_codes
+
+
+def test_extraction_to_json_mandatory_and_completeness_checks():
+    extracted = {"name": "Aryama", "email": ""}
+    cfg = {"required_fields": ["name", "email"], "min_completeness_ratio": 1.0}
+
+    gate = ValidationGates.extraction_to_json(extracted, cfg)
+
+    assert gate.passed is False
+    assert "MANDATORY_FIELDS_MISSING" in gate.reason_codes
+    assert "COMPLETENESS_BELOW_THRESHOLD" in gate.reason_codes
+
+
+def test_json_to_pdf_detects_mismatch_and_positional():
+    extracted = {"name": "Aryama", "email": "a@x.com"}
+    pdf_fields = ["full_name", "email_address"]
+
+    mapping_result = SimpleNamespace(
+        matched={"full_name": "Aryama"},
+        positional_values=["a@x.com"],
+    )
+    cfg = {"required_pdf_fields": ["full_name", "email_address"]}
+
+    gate = ValidationGates.json_to_pdf(extracted, pdf_fields, mapping_result, cfg)
+
+    assert gate.passed is False
+    assert "MANDATORY_PDF_FIELDS_UNMATCHED" in gate.reason_codes
+    assert "POSITIONAL_FALLBACK_USED" in gate.reason_codes
+    assert "PDF_FIELDS_UNMATCHED" in gate.reason_codes
\ No newline at end of file

From f4cf5542af62004ff6136d492ba9c055d4de55a3 Mon Sep 17 00:00:00 2001
From: Aryama Srivastav <coderbooth@gmail.com>
Date: Tue, 17 Mar 2026 22:40:56 +0530
Subject: [PATCH 4/4] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 src/filler.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/filler.py b/src/filler.py
index d2b46c8..3c54089 100644
--- a/src/filler.py
+++ b/src/filler.py
@@ -1,9 +1,9 @@
 from typing import Any, Optional
 
 from pdfrw import PdfReader, PdfWriter
-from src.semantic_mapper import SemanticMapper
+from semantic_mapper import SemanticMapper
 from datetime import datetime
-from src.validation_gates import ValidationGates
+from validation_gates import ValidationGates
 
 
 class Filler: