diff --git a/api/db/repositories.py b/api/db/repositories.py index 6608718..3ea9625 100644 --- a/api/db/repositories.py +++ b/api/db/repositories.py @@ -11,6 +11,13 @@ def create_template(session: Session, template: Template) -> Template: def get_template(session: Session, template_id: int) -> Template | None: return session.get(Template, template_id) + +def get_templates_by_ids(session: Session, template_ids: list[int]) -> list[Template]: + if not template_ids: + return [] + statement = select(Template).where(Template.id.in_(template_ids)) + return list(session.exec(statement).all()) + # Forms def create_form(session: Session, form: FormSubmission) -> FormSubmission: session.add(form) diff --git a/api/main.py b/api/main.py index d0b8c79..469103b 100644 --- a/api/main.py +++ b/api/main.py @@ -1,7 +1,9 @@ from fastapi import FastAPI from api.routes import templates, forms +from api.errors.handlers import register_exception_handlers app = FastAPI() +register_exception_handlers(app) app.include_router(templates.router) app.include_router(forms.router) \ No newline at end of file diff --git a/api/routes/forms.py b/api/routes/forms.py index f3430ed..43dd3c9 100644 --- a/api/routes/forms.py +++ b/api/routes/forms.py @@ -1,8 +1,11 @@ +import os + from fastapi import APIRouter, Depends +from fastapi.responses import FileResponse from sqlmodel import Session from api.deps import get_db -from api.schemas.forms import FormFill, FormFillResponse -from api.db.repositories import create_form, get_template +from api.schemas.forms import FormBatchFill, FormBatchFillResponse, FormFill, FormFillResponse +from api.db.repositories import create_form, get_template, get_templates_by_ids from api.db.models import FormSubmission from api.errors.base import AppError from src.controller import Controller @@ -23,3 +26,43 @@ def fill_form(form: FormFill, db: Session = Depends(get_db)): return create_form(db, submission) +@router.post("/fill-batch", response_model=FormBatchFillResponse) +def fill_forms_batch(form: FormBatchFill, db: Session = Depends(get_db)): + templates = get_templates_by_ids(db, form.template_ids) + if not templates: + raise AppError("No templates found for provided template_ids", status_code=404) + + template_ids_found = {tpl.id for tpl in templates} + template_ids_missing = [tid for tid in form.template_ids if tid not in template_ids_found] + if template_ids_missing: + raise AppError( + f"Template(s) not found: {template_ids_missing}", + status_code=404, + ) + + controller = Controller() + batch_result = controller.fill_multiple_forms( + incident_record=form.incident_record, + templates=templates, + ) + + batch_id = batch_result["batch_id"] + return { + **batch_result, + "download_url": f"/forms/batch-download/{batch_id}", + } + + +@router.get("/batch-download/{batch_id}") +def download_batch_package(batch_id: str): + zip_path = os.path.join("src", "outputs", "batches", f"{batch_id}.zip") + if not os.path.exists(zip_path): + raise AppError("Batch package not found", status_code=404) + + return FileResponse( + path=zip_path, + media_type="application/zip", + filename=f"{batch_id}.zip", + ) + + diff --git a/api/schemas/forms.py b/api/schemas/forms.py index 3cce650..ffdd448 100644 --- a/api/schemas/forms.py +++ b/api/schemas/forms.py @@ -12,4 +12,39 @@ class FormFillResponse(BaseModel): output_pdf_path: str class Config: - from_attributes = True \ No newline at end of file + from_attributes = True + + +class BatchTemplateMappingReport(BaseModel): + compatible: bool + missing_fields: list[str] + extra_fields: list[str] + unmapped_fields: list[str] + type_mismatches: dict[str, str] + dependency_violations: list[list[str] | tuple[str, str]] + warnings: list[str] + matched_fields: list[str] + + +class BatchTemplateResult(BaseModel): + template_id: int + template_name: str + status: str + output_pdf_path: str | None + error: str | None + mapping_report: BatchTemplateMappingReport + + +class FormBatchFill(BaseModel): + template_ids: list[int] + incident_record: dict + + +class FormBatchFillResponse(BaseModel): + batch_id: str + total_templates: int + successful_count: int + failed_count: int + package_zip_path: str + download_url: str + results: list[BatchTemplateResult] \ No newline at end of file diff --git a/src/batch_orchestrator.py b/src/batch_orchestrator.py new file mode 100644 index 0000000..24abaec --- /dev/null +++ b/src/batch_orchestrator.py @@ -0,0 +1,276 @@ +import json +import os +import re +import uuid +from dataclasses import dataclass +from datetime import datetime +from typing import Any, Callable +from zipfile import ZIP_DEFLATED, ZipFile + + +@dataclass +class BatchTemplateResult: + template_id: int + template_name: str + status: str + output_pdf_path: str | None + error: str | None + mapping_report: dict[str, Any] + + +class BatchOrchestrator: + def __init__( + self, + fill_single_form_fn: Callable[[str, dict[str, Any], dict[str, Any]], str], + ): + self.fill_single_form_fn = fill_single_form_fn + + def run_batch( + self, + incident_record: dict[str, Any], + templates: list[Any], + output_root: str = "src/outputs/batches", + ) -> dict[str, Any]: + batch_id = f"batch_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:8]}" + output_dir = os.path.join(output_root, batch_id) + os.makedirs(output_dir, exist_ok=True) + + results: list[BatchTemplateResult] = [] + successful_outputs: list[str] = [] + + for template in templates: + template_fields = self._normalize_template_fields(getattr(template, "fields", {})) + mapping_report = self._build_mapping_report(template_fields, incident_record) + + if not mapping_report["compatible"]: + results.append( + BatchTemplateResult( + template_id=template.id, + template_name=template.name, + status="failed_validation", + output_pdf_path=None, + error="Compatibility validation failed", + mapping_report=mapping_report, + ) + ) + continue + + try: + output_pdf_path = self.fill_single_form_fn( + template.pdf_path, + incident_record, + template_fields, + ) + successful_outputs.append(output_pdf_path) + results.append( + BatchTemplateResult( + template_id=template.id, + template_name=template.name, + status="success", + output_pdf_path=output_pdf_path, + error=None, + mapping_report=mapping_report, + ) + ) + except Exception as exc: + results.append( + BatchTemplateResult( + template_id=template.id, + template_name=template.name, + status="failed_runtime", + output_pdf_path=None, + error=str(exc), + mapping_report=mapping_report, + ) + ) + + report_payload = { + "batch_id": batch_id, + "created_at": datetime.now().isoformat(), + "total_templates": len(templates), + "successful_count": len([r for r in results if r.status == "success"]), + "failed_count": len([r for r in results if r.status != "success"]), + "results": [ + { + "template_id": r.template_id, + "template_name": r.template_name, + "status": r.status, + "output_pdf_path": r.output_pdf_path, + "error": r.error, + "mapping_report": r.mapping_report, + } + for r in results + ], + } + + report_json_path = os.path.join(output_dir, "batch_report.json") + with open(report_json_path, "w", encoding="utf-8") as f: + json.dump(report_payload, f, indent=2) + + zip_path = os.path.join(output_root, f"{batch_id}.zip") + self._build_batch_zip(zip_path, successful_outputs, report_json_path) + + report_payload["package_zip_path"] = zip_path + return report_payload + + @staticmethod + def _build_batch_zip( + zip_path: str, + successful_outputs: list[str], + report_json_path: str, + ) -> None: + os.makedirs(os.path.dirname(zip_path), exist_ok=True) + with ZipFile(zip_path, "w", compression=ZIP_DEFLATED) as zip_file: + for output_file in successful_outputs: + if os.path.exists(output_file): + zip_file.write(output_file, arcname=os.path.basename(output_file)) + zip_file.write(report_json_path, arcname="batch_report.json") + + @staticmethod + def _normalize_template_fields(fields: Any) -> dict[str, Any]: + if isinstance(fields, dict): + return fields + if isinstance(fields, list): + return {str(field): "text" for field in fields} + return {} + + @staticmethod + def _build_mapping_report( + template_fields: dict[str, Any], + incident_record: dict[str, Any], + ) -> dict[str, Any]: + normalized_record = { + BatchOrchestrator._normalize_key(str(k)): v for k, v in incident_record.items() + } + normalized_to_original = { + BatchOrchestrator._normalize_key(str(k)): str(k) for k in incident_record.keys() + } + + missing_fields: set[str] = set() + matched_fields: set[str] = set() + type_mismatches: dict[str, str] = {} + + normalized_template_tokens: set[str] = set() + + for field_name, field_meta in template_fields.items(): + aliases = BatchOrchestrator._infer_aliases(field_meta) + candidate_names = [field_name, *aliases] + candidate_tokens = [BatchOrchestrator._normalize_key(name) for name in candidate_names] + normalized_template_tokens.update(candidate_tokens) + + matched_key = next( + (token for token in candidate_tokens if token in normalized_record), + None, + ) + + required = BatchOrchestrator._infer_required(field_meta) + if matched_key is None: + if required: + missing_fields.add(field_name) + continue + + matched_fields.add(field_name) + expected_type = BatchOrchestrator._infer_field_type(field_meta) + value = normalized_record[matched_key] + issue = BatchOrchestrator._validate_value_type(expected_type, value) + if issue: + type_mismatches[field_name] = issue + + extra_fields = { + original + for token, original in normalized_to_original.items() + if token not in normalized_template_tokens + } + + warnings = [] + if extra_fields: + warnings.append(f"Found {len(extra_fields)} field(s) not in template schema") + if type_mismatches: + warnings.append(f"Found {len(type_mismatches)} type mismatch(es)") + + compatible = not missing_fields and not type_mismatches + + return { + "compatible": compatible, + "missing_fields": sorted(missing_fields), + "extra_fields": sorted(extra_fields), + "unmapped_fields": sorted(extra_fields), + "type_mismatches": type_mismatches, + "dependency_violations": [], + "warnings": warnings, + "matched_fields": sorted(matched_fields), + } + + @staticmethod + def _infer_field_type(field_meta: Any) -> str: + if isinstance(field_meta, dict): + raw_type = str(field_meta.get("type", "text")).lower() + else: + raw_type = str(field_meta).lower() + + if raw_type in {"email"}: + return "email" + if raw_type in {"phone", "tel", "telephone"}: + return "phone" + if raw_type in {"date", "datetime"}: + return "date" + if raw_type in {"number", "int", "float", "integer"}: + return "number" + if raw_type in {"checkbox", "bool", "boolean"}: + return "checkbox" + if raw_type in {"dropdown", "select", "choice"}: + return "dropdown" + return "text" + + @staticmethod + def _infer_required(field_meta: Any) -> bool: + if isinstance(field_meta, dict): + return bool(field_meta.get("required", False)) + return False + + @staticmethod + def _infer_aliases(field_meta: Any) -> list[str]: + if isinstance(field_meta, dict): + aliases = field_meta.get("aliases", []) + if isinstance(aliases, list): + return [str(alias) for alias in aliases] + return [] + + @staticmethod + def _normalize_key(value: str) -> str: + return "".join(ch.lower() for ch in value if ch.isalnum()) + + @staticmethod + def _validate_value_type(expected_type: str, value: Any) -> str | None: + value_str = str(value).strip() + + if expected_type == "email": + if "@" not in value_str or "." not in value_str.split("@")[-1]: + return f"Invalid email format: {value}" + + if expected_type == "phone": + digits = "".join(ch for ch in value_str if ch.isdigit()) + if len(digits) < 7: + return f"Invalid phone format (need 7+ digits): {value}" + + if expected_type == "date": + date_patterns = [ + r"^\d{1,2}/\d{1,2}/\d{4}$", + r"^\d{4}-\d{1,2}-\d{1,2}$", + r"^[A-Za-z]+ \d{1,2}, \d{4}$", + ] + if not any(re.match(pattern, value_str) for pattern in date_patterns): + return f"Invalid date format: {value}" + + if expected_type == "number": + try: + float(value_str) + except ValueError: + return f"Invalid number: {value}" + + if expected_type == "checkbox": + valid = {"yes", "no", "true", "false", "1", "0", "checked", "unchecked"} + if value_str.lower() not in valid: + return f"Invalid checkbox value: {value}" + + return None \ No newline at end of file diff --git a/src/controller.py b/src/controller.py index d31ec9c..18d4bdd 100644 --- a/src/controller.py +++ b/src/controller.py @@ -8,4 +8,7 @@ def fill_form(self, user_input: str, fields: list, pdf_form_path: str): return self.file_manipulator.fill_form(user_input, fields, pdf_form_path) def create_template(self, pdf_path: str): - return self.file_manipulator.create_template(pdf_path) \ No newline at end of file + return self.file_manipulator.create_template(pdf_path) + + def fill_multiple_forms(self, incident_record: dict, templates: list): + return self.file_manipulator.fill_multiple_forms(incident_record, templates) \ No newline at end of file diff --git a/src/file_manipulator.py b/src/file_manipulator.py index b7815cc..f95a6d3 100644 --- a/src/file_manipulator.py +++ b/src/file_manipulator.py @@ -1,18 +1,21 @@ import os from src.filler import Filler from src.llm import LLM -from commonforms import prepare_form +from src.batch_orchestrator import BatchOrchestrator class FileManipulator: def __init__(self): self.filler = Filler() self.llm = LLM() + self.batch_orchestrator = BatchOrchestrator(self.filler.fill_form_from_record) def create_template(self, pdf_path: str): """ By using commonforms, we create an editable .pdf template and we store it. """ + from commonforms import prepare_form + template_path = pdf_path[:-4] + "_template.pdf" prepare_form(pdf_path, template_path) return template_path @@ -45,3 +48,13 @@ def fill_form(self, user_input: str, fields: list, pdf_form_path: str): print(f"An error occurred during PDF generation: {e}") # Re-raise the exception so the frontend can handle it raise e + + def fill_multiple_forms(self, incident_record: dict, templates: list): + """Fill multiple templates from one structured incident record.""" + print("[BATCH] Received request for multi-document generation.") + print(f"[BATCH] Templates requested: {len(templates)}") + + return self.batch_orchestrator.run_batch( + incident_record=incident_record, + templates=templates, + ) diff --git a/src/filler.py b/src/filler.py index e31e535..e372f25 100644 --- a/src/filler.py +++ b/src/filler.py @@ -1,6 +1,7 @@ from pdfrw import PdfReader, PdfWriter from src.llm import LLM from datetime import datetime +from typing import Any class Filler: @@ -29,7 +30,7 @@ def fill_form(self, pdf_form: str, llm: LLM): pdf = PdfReader(pdf_form) # Loop through pages - for page in pdf.pages: + for page in pdf.pages: # type: ignore[operator] if page.Annots: sorted_annots = sorted( page.Annots, key=lambda a: (-float(a.Rect[1]), float(a.Rect[0])) @@ -50,3 +51,82 @@ def fill_form(self, pdf_form: str, llm: LLM): # Your main.py expects this function to return the path return output_pdf + + def fill_form_from_record( + self, + pdf_form: str, + incident_record: dict[str, Any], + template_fields: dict[str, Any], + ) -> str: + """Fill a PDF from a pre-extracted incident record. + + This bypasses LLM extraction and maps one structured incident object + into a specific template's field set. + """ + output_pdf = ( + pdf_form[:-4] + + "_" + + datetime.now().strftime("%Y%m%d_%H%M%S") + + "_filled.pdf" + ) + + ordered_template_fields = list(template_fields.keys()) + normalized_record = { + self._normalize_key(str(k)): v for k, v in incident_record.items() + } + + pdf = PdfReader(pdf_form) + field_index = 0 + + for page in pdf.pages: # type: ignore[operator] + if not page.Annots: + continue + + sorted_annots = sorted( + page.Annots, key=lambda a: (-float(a.Rect[1]), float(a.Rect[0])) + ) + + for annot in sorted_annots: + if annot.Subtype != "/Widget" or not annot.T: + continue + + pdf_field_name = str(annot.T)[1:-1] + expected_template_field = ( + ordered_template_fields[field_index] + if field_index < len(ordered_template_fields) + else None + ) + + value = self._resolve_value_for_pdf_field( + pdf_field_name=pdf_field_name, + expected_template_field=expected_template_field, + normalized_record=normalized_record, + ) + + annot.V = "" if value is None else f"{value}" + annot.AP = None + field_index += 1 + + PdfWriter().write(output_pdf, pdf) + return output_pdf + + @staticmethod + def _normalize_key(field_name: str) -> str: + return "".join(ch.lower() for ch in field_name if ch.isalnum()) + + def _resolve_value_for_pdf_field( + self, + pdf_field_name: str, + expected_template_field: str | None, + normalized_record: dict[str, Any], + ) -> Any: + pdf_key = self._normalize_key(pdf_field_name) + if pdf_key in normalized_record: + return normalized_record[pdf_key] + + if expected_template_field: + template_key = self._normalize_key(expected_template_field) + if template_key in normalized_record: + return normalized_record[template_key] + + return None diff --git a/src/inputs/input.txt b/src/inputs/input.txt index faa55cd..e68385c 100644 --- a/src/inputs/input.txt +++ b/src/inputs/input.txt @@ -1 +1,10 @@ -Officer Voldemort here, at an incident reported at 456 Oak Street. Two victims, Mark Smith and Jane Doe. Medical aid rendered for minor lacerations. Handed off to Sheriff's Deputy Alvarez. End of transmission. +UC Vaccine Declination Statement + +Name/SID: Sarah Johnson, SID 4527891 +Job Title: Research Scientist +Department: Microbiology +Phone Number: 831-555-0142 +Email: sjohnson@ucsc.edu +Date: 03/15/2026 + +Signature: ________________________ \ No newline at end of file diff --git a/tests/test_batch_orchestrator.py b/tests/test_batch_orchestrator.py new file mode 100644 index 0000000..90e14bb --- /dev/null +++ b/tests/test_batch_orchestrator.py @@ -0,0 +1,97 @@ +import os +from types import SimpleNamespace +from zipfile import ZipFile + +from src.batch_orchestrator import BatchOrchestrator + + +def test_run_batch_processes_templates_independently(tmp_path): + output_root = tmp_path / "batches" + generated = [] + + def fill_single_form(pdf_path, incident_record, template_fields): + if "fail" in pdf_path: + raise RuntimeError("Simulated fill failure") + + output_file = tmp_path / f"{os.path.basename(pdf_path)}.filled.pdf" + output_file.write_text("pdf-bytes", encoding="utf-8") + generated.append(str(output_file)) + return str(output_file) + + orchestrator = BatchOrchestrator(fill_single_form) + + templates = [ + SimpleNamespace( + id=1, + name="Fire Incident Form", + pdf_path="fire.pdf", + fields={"incident_id": {"type": "text", "required": True}}, + ), + SimpleNamespace( + id=2, + name="Medical Incident Form", + pdf_path="medical_fail.pdf", + fields={"incident_id": {"type": "text", "required": True}}, + ), + SimpleNamespace( + id=3, + name="Insurance Claim Form", + pdf_path="insurance.pdf", + fields={"incident_id": {"type": "text", "required": True}}, + ), + ] + + result = orchestrator.run_batch( + incident_record={"incident_id": "INC-123"}, + templates=templates, + output_root=str(output_root), + ) + + assert result["total_templates"] == 3 + assert result["successful_count"] == 2 + assert result["failed_count"] == 1 + assert os.path.exists(result["package_zip_path"]) + + statuses = {item["template_id"]: item["status"] for item in result["results"]} + assert statuses[1] == "success" + assert statuses[2] == "failed_runtime" + assert statuses[3] == "success" + + with ZipFile(result["package_zip_path"], "r") as zip_file: + names = set(zip_file.namelist()) + assert "batch_report.json" in names + assert len([name for name in names if name.endswith(".pdf")]) == 2 + + +def test_run_batch_reports_validation_failures(tmp_path): + output_root = tmp_path / "batches" + + def fill_single_form(pdf_path, incident_record, template_fields): + output_file = tmp_path / "unused.pdf" + output_file.write_text("unused", encoding="utf-8") + return str(output_file) + + orchestrator = BatchOrchestrator(fill_single_form) + + templates = [ + SimpleNamespace( + id=7, + name="Fire Validation Form", + pdf_path="fire_validation.pdf", + fields={ + "incident_id": {"type": "text", "required": True}, + "location": {"type": "text", "required": True}, + }, + ), + ] + + result = orchestrator.run_batch( + incident_record={"incident_id": "INC-7"}, + templates=templates, + output_root=str(output_root), + ) + + assert result["successful_count"] == 0 + assert result["failed_count"] == 1 + assert result["results"][0]["status"] == "failed_validation" + assert "location" in result["results"][0]["mapping_report"]["missing_fields"] diff --git a/tests/test_forms_batch.py b/tests/test_forms_batch.py new file mode 100644 index 0000000..f533ab0 --- /dev/null +++ b/tests/test_forms_batch.py @@ -0,0 +1,109 @@ +from pathlib import Path +from types import SimpleNamespace + +import api.routes.forms as forms_route + + +def test_fill_batch_endpoint(client, monkeypatch): + templates = [ + SimpleNamespace(id=1, name="Fire Form", pdf_path="fire.pdf", fields={"incident_id": "text"}), + SimpleNamespace(id=2, name="Medical Form", pdf_path="medical.pdf", fields={"incident_id": "text"}), + ] + + def fake_get_templates_by_ids(db, template_ids): + return templates + + def fake_fill_multiple_forms(self, incident_record, templates): + return { + "batch_id": "batch_abc123", + "total_templates": 2, + "successful_count": 1, + "failed_count": 1, + "package_zip_path": "src/outputs/batches/batch_abc123.zip", + "results": [ + { + "template_id": 1, + "template_name": "Fire Form", + "status": "success", + "output_pdf_path": "fire_filled.pdf", + "error": None, + "mapping_report": { + "compatible": True, + "missing_fields": [], + "extra_fields": [], + "unmapped_fields": [], + "type_mismatches": {}, + "dependency_violations": [], + "warnings": [], + "matched_fields": ["incident_id"], + }, + }, + { + "template_id": 2, + "template_name": "Medical Form", + "status": "failed_runtime", + "output_pdf_path": None, + "error": "failed", + "mapping_report": { + "compatible": True, + "missing_fields": [], + "extra_fields": [], + "unmapped_fields": [], + "type_mismatches": {}, + "dependency_violations": [], + "warnings": [], + "matched_fields": ["incident_id"], + }, + }, + ], + } + + monkeypatch.setattr(forms_route, "get_templates_by_ids", fake_get_templates_by_ids) + monkeypatch.setattr(forms_route.Controller, "fill_multiple_forms", fake_fill_multiple_forms) + + payload = { + "template_ids": [1, 2], + "incident_record": {"incident_id": "INC-42"}, + } + response = client.post("/forms/fill-batch", json=payload) + + assert response.status_code == 200 + body = response.json() + assert body["batch_id"] == "batch_abc123" + assert body["total_templates"] == 2 + assert body["successful_count"] == 1 + assert body["failed_count"] == 1 + assert body["download_url"] == "/forms/batch-download/batch_abc123" + + +def test_batch_download_endpoint(client): + batch_id = "batch_test_download" + zip_path = Path("src/outputs/batches") + zip_path.mkdir(parents=True, exist_ok=True) + + target_file = zip_path / f"{batch_id}.zip" + target_file.write_bytes(b"zip-content") + + response = client.get(f"/forms/batch-download/{batch_id}") + + assert response.status_code == 200 + assert response.headers["content-type"] == "application/zip" + + +def test_fill_batch_endpoint_missing_template(client, monkeypatch): + templates = [ + SimpleNamespace(id=1, name="Fire Form", pdf_path="fire.pdf", fields={"incident_id": "text"}), + ] + + def fake_get_templates_by_ids(db, template_ids): + return templates + + monkeypatch.setattr(forms_route, "get_templates_by_ids", fake_get_templates_by_ids) + + payload = { + "template_ids": [1, 2], + "incident_record": {"incident_id": "INC-404"}, + } + response = client.post("/forms/fill-batch", json=payload) + + assert response.status_code == 404