class CompatibilityChecker:
    """Validates extracted data against template schemas before filling.

    The checker looks templates up in a ``TemplateRegistry`` and produces a
    ``CompatibilityReport`` describing unknown fields, missing required
    fields, value/type problems and unmet field dependencies.
    """

    # Values (compared case-insensitively) accepted for CHECKBOX fields.
    _CHECKBOX_VALUES = frozenset(
        {"yes", "no", "true", "false", "1", "0", "checked", "unchecked"}
    )

    def __init__(self, registry: "TemplateRegistry"):
        """Initialize compatibility checker with a template registry.

        Args:
            registry: TemplateRegistry containing registered templates.
        """
        self.registry = registry

    def check_compatibility(
        self,
        template_id: str,
        extracted_data: Dict[str, Any],
    ) -> "CompatibilityReport":
        """Check if extracted data is compatible with a template.

        Any unknown field, missing required field, type mismatch or unmet
        dependency marks the report as incompatible.

        Args:
            template_id: ID of the template to validate against.
            extracted_data: Extracted field data to validate.

        Returns:
            CompatibilityReport with detailed validation results.

        Raises:
            ValueError: If template_id not found in registry.
        """
        template = self.registry.get_template(template_id)
        if template is None:
            raise ValueError(f"Template {template_id} not found in registry")

        report = CompatibilityReport(template_id=template_id, compatible=True)

        # Canonical template fields that at least one extracted key mapped to.
        matched_template_fields = set()

        for extracted_name, extracted_value in extracted_data.items():
            canonical_name = template.resolve_field_name(extracted_name)

            if canonical_name is None:
                # Key is neither a field name nor an alias of this template.
                report.extra_fields.add(extracted_name)
                report.unmapped_fields.add(extracted_name)
                report.compatible = False
                continue

            matched_template_fields.add(canonical_name)
            report.matched_fields.add(canonical_name)

            type_issue = self._validate_field_type(
                canonical_name,
                extracted_value,
                template.fields[canonical_name],
            )
            if type_issue:
                report.type_mismatches[canonical_name] = type_issue
                report.compatible = False

        # Required fields that no extracted key resolved to.
        missing_required = template.get_required_fields() - matched_template_fields
        if missing_required:
            report.missing_fields = missing_required
            report.compatible = False

        # Iterate in sorted order: set iteration order depends on string hash
        # randomisation, which would make the violation list differ between
        # runs for identical input.
        for matched_field in sorted(matched_template_fields):
            for dependency in template.fields[matched_field].dependencies:
                if dependency not in matched_template_fields:
                    report.dependency_violations.append((matched_field, dependency))
                    report.compatible = False

        # Human-readable roll-ups of the problems recorded above.
        if report.extra_fields:
            report.warnings.append(
                f"Found {len(report.extra_fields)} field(s) not in template schema"
            )

        if report.type_mismatches:
            report.warnings.append(
                f"Found {len(report.type_mismatches)} type mismatch(es)"
            )

        return report

    def check_family_compatibility(
        self,
        family: str,
        extracted_data: Dict[str, Any],
    ) -> Dict[str, "CompatibilityReport"]:
        """Check extracted data against all templates in a family.

        Args:
            family: Template family name.
            extracted_data: Extracted field data to validate.

        Returns:
            Dictionary mapping template_id to CompatibilityReport for each
            template in the family.

        Raises:
            ValueError: If family not found in registry.
        """
        templates = self.registry.get_templates_by_family(family)
        if not templates:
            raise ValueError(f"Family {family} not found in registry")

        return {
            template.template_id: self.check_compatibility(
                template.template_id,
                extracted_data,
            )
            for template in templates
        }

    def find_compatible_templates(
        self,
        extracted_data: Dict[str, Any],
        tolerance: int = 0,
    ) -> "list[tuple[str, CompatibilityReport]]":
        """Find all compatible templates for extracted data.

        Args:
            extracted_data: Extracted field data.
            tolerance: Maximum number of incompatibilities allowed.
                Default 0 means only fully compatible templates.

        Returns:
            List of (template_id, CompatibilityReport) tuples sorted by
            compatibility score (best first).
        """
        results = []

        for template in self.registry.list_templates():
            report = self.check_compatibility(template.template_id, extracted_data)

            # Every individual problem counts once towards the tolerance.
            incompatibility_count = (
                len(report.missing_fields)
                + len(report.extra_fields)
                + len(report.type_mismatches)
                + len(report.dependency_violations)
            )

            if incompatibility_count <= tolerance:
                results.append((template.template_id, report))

        results.sort(
            key=lambda item: (
                not item[1].compatible,  # Compatible templates first
                len(item[1].missing_fields),  # Fewer missing fields
                len(item[1].type_mismatches),  # Fewer type mismatches
                item[0],  # Deterministic tie-breaker: template_id
            ),
        )

        return results

    def _validate_field_type(
        self,
        field_name: str,
        value: Any,
        field_schema,
    ) -> Optional[str]:
        """Validate that a field value matches the expected type.

        The value is converted with ``str()`` and stripped before checking.
        ``None`` is always accepted (nothing to validate).

        Args:
            field_name: Name of the field being validated (currently unused;
                kept for interface stability).
            value: The value to validate.
            field_schema: The FieldSchema with expected type information.

        Returns:
            Error message if validation fails, None if valid.
        """
        if value is None:
            return None

        value_str = str(value).strip()
        field_type = field_schema.field_type

        if field_type == FieldType.EMAIL:
            # Minimal shape check: an "@" and a dot in the part after the last "@".
            if "@" not in value_str or "." not in value_str.split("@")[-1]:
                return f"Invalid email format: {value}"

        elif field_type == FieldType.PHONE:
            # Basic phone validation: at least 7 digits, separators ignored.
            digits = "".join(c for c in value_str if c.isdigit())
            if len(digits) < 7:
                return f"Invalid phone format (need 7+ digits): {value}"

        elif field_type == FieldType.DATE:
            if not self._is_valid_date_format(value_str):
                return f"Invalid date format: {value}"

        elif field_type == FieldType.NUMBER:
            import math

            try:
                number = float(value_str)
            except ValueError:
                return f"Invalid number: {value}"
            # float() also parses "nan", "inf" and "infinity"; none of these
            # is a usable number for a form field.
            if not math.isfinite(number):
                return f"Invalid number: {value}"

        elif field_type == FieldType.CHECKBOX:
            if value_str.lower() not in self._CHECKBOX_VALUES:
                return f"Invalid checkbox value: {value}"

        elif field_type == FieldType.DROPDOWN:
            # Without a configured value list every value is accepted.
            if field_schema.expected_values:
                if value_str not in field_schema.expected_values:
                    return f"Invalid dropdown value: {value}. Expected one of: {', '.join(field_schema.expected_values)}"

        elif field_type == FieldType.TEXT:
            if field_schema.max_length and len(value_str) > field_schema.max_length:
                return f"Text exceeds max length ({field_schema.max_length}): {value}"

        return None

    @staticmethod
    def _is_valid_date_format(value: str) -> bool:
        """Check if a value is a date in one of the supported formats.

        Accepts common formats like:
        - YYYY-MM-DD
        - MM/DD/YYYY
        - DD/MM/YYYY
        - Month DD, YYYY

        Numeric formats must also denote a real calendar date; for the
        slash format either day/month order is accepted. For the
        ``Month DD, YYYY`` form the month word is not checked against a
        month list (abbreviations stay accepted), only the day range is.

        Args:
            value: Candidate date string; callers are expected to strip it.

        Returns:
            True if the value matches a supported format, False otherwise.
        """
        import re
        from datetime import date

        def is_real_date(year: int, month: int, day: int) -> bool:
            try:
                date(year, month, day)
            except ValueError:
                return False
            return True

        # fullmatch, not match + "$": "$" also matches before a trailing newline.
        iso = re.fullmatch(r"(\d{4})-(\d{1,2})-(\d{1,2})", value)
        if iso:
            year, month, day = (int(part) for part in iso.groups())
            return is_real_date(year, month, day)

        slashed = re.fullmatch(r"(\d{1,2})/(\d{1,2})/(\d{4})", value)
        if slashed:
            first, second, year = (int(part) for part in slashed.groups())
            # Ambiguous order: valid if it works as MM/DD or as DD/MM.
            return is_real_date(year, first, second) or is_real_date(year, second, first)

        named = re.fullmatch(r"[A-Za-z]+ (\d{1,2}), \d{4}", value)
        if named:
            return 1 <= int(named.group(1)) <= 31

        return False
+ """ + if not self.compatibility_checker: + raise ValueError("Template registry not configured in Filler instance") + + report = self.compatibility_checker.check_compatibility( + template_id, + extracted_data, + ) + + return { + "compatible": report.compatible, + "missing_fields": sorted(report.missing_fields), + "extra_fields": sorted(report.extra_fields), + "unmapped_fields": sorted(report.unmapped_fields), + "type_mismatches": report.type_mismatches, + "dependency_violations": report.dependency_violations, + "warnings": report.warnings, + "matched_fields": sorted(report.matched_fields), + "summary": report.summary(), + } def fill_form(self, pdf_form: str, llm: LLM): """ diff --git a/src/inputs/input.txt b/src/inputs/input.txt index faa55cd..e68385c 100644 --- a/src/inputs/input.txt +++ b/src/inputs/input.txt @@ -1 +1,10 @@ -Officer Voldemort here, at an incident reported at 456 Oak Street. Two victims, Mark Smith and Jane Doe. Medical aid rendered for minor lacerations. Handed off to Sheriff's Deputy Alvarez. End of transmission. +UC Vaccine Declination Statement + +Name/SID: Sarah Johnson, SID 4527891 +Job Title: Research Scientist +Department: Microbiology +Phone Number: 831-555-0142 +Email: sjohnson@ucsc.edu +Date: 03/15/2026 + +Signature: ________________________ \ No newline at end of file diff --git a/src/template_schema.py b/src/template_schema.py new file mode 100644 index 0000000..36f1726 --- /dev/null +++ b/src/template_schema.py @@ -0,0 +1,290 @@ +"""Template schema registry and compatibility checking for FireForm. + +Provides template metadata management describing required fields, aliases, +expected data types, and form versioning. Enables predictable behavior and +reproducibility across multiple PDF templates. 
from dataclasses import dataclass, field
from enum import Enum
from typing import Dict, List, Optional, Set, Tuple


class FieldType(Enum):
    """Enumeration of supported field data types."""

    TEXT = "text"
    EMAIL = "email"
    PHONE = "phone"
    DATE = "date"
    NUMBER = "number"
    CHECKBOX = "checkbox"
    DROPDOWN = "dropdown"
    UNKNOWN = "unknown"


@dataclass
class FieldSchema:
    """Schema definition for a single form field.

    Attributes:
        name: Primary field name as it appears in the PDF form.
        field_type: Expected data type of the field.
        required: Whether the field must be present in extraction.
        aliases: Alternative names this field might appear as in extracted
            data.
        expected_values: Enumerated valid values for dropdown/select fields.
        dependencies: Other fields that must be present if this field is
            filled.
        max_length: Maximum character length for text fields.
        description: Human-readable description of the field.
    """

    name: str
    field_type: FieldType = FieldType.TEXT
    required: bool = False
    aliases: List[str] = field(default_factory=list)
    expected_values: Optional[List[str]] = None
    dependencies: List[str] = field(default_factory=list)
    max_length: Optional[int] = None
    description: str = ""

    def get_all_names(self) -> Set[str]:
        """Return all possible names (primary + aliases) for this field."""
        return {self.name, *self.aliases}


@dataclass
class TemplateSchema:
    """Complete schema definition for a PDF template.

    Attributes:
        template_id: Unique identifier for this template version.
        template_name: Human-readable name of the template.
        version: Semantic version of the template (e.g., '1.0.0').
        family: Template family for grouping related templates.
        fields: Mapping of canonical field names to their schemas.
        description: Description of the template purpose and use cases.
        created_at: ISO timestamp when template schema was created.
        metadata: Additional custom metadata about the template.
    """

    template_id: str
    template_name: str
    version: str
    family: str
    fields: Dict[str, FieldSchema] = field(default_factory=dict)
    description: str = ""
    created_at: str = ""
    metadata: Dict[str, str] = field(default_factory=dict)

    def get_required_fields(self) -> Set[str]:
        """Return set of all required field names (keys of ``fields``)."""
        return {name for name, schema in self.fields.items() if schema.required}

    def get_all_field_names(self) -> Set[str]:
        """Return set of all possible field names (primary + all aliases)."""
        all_names: Set[str] = set()
        for schema in self.fields.values():
            all_names |= schema.get_all_names()
        return all_names

    def resolve_field_name(self, extracted_name: str) -> Optional[str]:
        """Map an extracted field name to the canonical field name in schema.

        Resolution order: exact key of ``fields``, then any field's aliases,
        then any field's ``FieldSchema.name``. The last step only matters when
        a schema is stored under a key different from its ``name``; it keeps
        this method consistent with ``get_all_field_names()``.

        Args:
            extracted_name: Field name from extraction.

        Returns:
            Canonical field name (a key of ``fields``) if found, None
            otherwise.
        """
        if extracted_name in self.fields:
            return extracted_name

        for canonical_name, schema in self.fields.items():
            if extracted_name in schema.aliases:
                return canonical_name

        # Checked last so every name that resolved before resolves the same way.
        for canonical_name, schema in self.fields.items():
            if extracted_name == schema.name:
                return canonical_name

        return None


@dataclass
class CompatibilityReport:
    """Results of compatibility check between extracted data and template schema.

    Attributes:
        template_id: ID of the template being checked against.
        compatible: Whether extracted data is compatible with template.
        missing_fields: Required fields that are missing from extraction.
        extra_fields: Extracted fields that don't match any template field.
        unmapped_fields: Extracted fields that couldn't be mapped to template
            fields.
        type_mismatches: Maps a field name to the description of its value
            problem.
        dependency_violations: List of (field, missing_dependency) tuples for
            unmet dependencies.
        warnings: Messages that should be reviewed.
        matched_fields: Canonical template fields that an extracted field
            name resolved to. A field listed here may still appear in
            ``type_mismatches``.
    """

    template_id: str
    compatible: bool
    missing_fields: Set[str] = field(default_factory=set)
    extra_fields: Set[str] = field(default_factory=set)
    unmapped_fields: Set[str] = field(default_factory=set)
    type_mismatches: Dict[str, str] = field(default_factory=dict)
    dependency_violations: List[Tuple[str, str]] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    matched_fields: Set[str] = field(default_factory=set)

    def summary(self) -> str:
        """Generate human-readable summary of compatibility check."""
        lines = [
            f"Template: {self.template_id}",
            f"Status: {'✓ Compatible' if self.compatible else '✗ Incompatible'}",
        ]

        # Sets are sorted so the text is stable across runs.
        if self.missing_fields:
            lines.append(f"Missing Required Fields: {', '.join(sorted(self.missing_fields))}")

        if self.extra_fields:
            lines.append(f"Extra Fields: {', '.join(sorted(self.extra_fields))}")

        if self.unmapped_fields:
            lines.append(f"Unmapped Fields: {', '.join(sorted(self.unmapped_fields))}")

        lines.extend(
            f"Type Mismatch in {field_name}: {issue}"
            for field_name, issue in self.type_mismatches.items()
        )
        lines.extend(
            f"Dependency Violation: {field_name} requires {missing_dep}"
            for field_name, missing_dep in self.dependency_violations
        )
        lines.extend(f"⚠ Warning: {warning}" for warning in self.warnings)

        lines.append(f"Matched Fields: {len(self.matched_fields)}")

        return "\n".join(lines)


class TemplateRegistry:
    """Registry for managing multiple template schemas organized by family."""

    def __init__(self):
        """Initialize empty template registry."""
        # template_id -> schema
        self._templates: Dict[str, TemplateSchema] = {}
        # family name -> template_ids in registration order
        self._families: Dict[str, List[str]] = {}

    def _indexed_family(self, template_id: str) -> Optional[str]:
        """Return the family a template is currently indexed under, if any."""
        for family, template_ids in self._families.items():
            if template_id in template_ids:
                return family
        return None

    def _unindex(self, template_id: str) -> None:
        """Remove a template from the family index, dropping empty families."""
        family = self._indexed_family(template_id)
        if family is None:
            return
        members = self._families[family]
        members.remove(template_id)
        if not members:
            del self._families[family]

    def register_template(self, schema: TemplateSchema) -> None:
        """Register a new template schema.

        Args:
            schema: TemplateSchema to register.

        Raises:
            ValueError: If template_id already registered.
        """
        if schema.template_id in self._templates:
            raise ValueError(f"Template {schema.template_id} already registered")

        self._templates[schema.template_id] = schema
        self._families.setdefault(schema.family, []).append(schema.template_id)

    def get_template(self, template_id: str) -> Optional[TemplateSchema]:
        """Get a template schema by ID.

        Args:
            template_id: ID of the template.

        Returns:
            TemplateSchema if found, None otherwise.
        """
        return self._templates.get(template_id)

    def get_templates_by_family(self, family: str) -> List[TemplateSchema]:
        """Get all templates in a specific family.

        Args:
            family: Family name to retrieve.

        Returns:
            List of TemplateSchema objects in the family (empty if the family
            is unknown).
        """
        return [self._templates[tid] for tid in self._families.get(family, [])]

    def list_families(self) -> List[str]:
        """Get all template families in the registry.

        Returns:
            Sorted list of family names.
        """
        return sorted(self._families)

    def list_templates(self) -> List[TemplateSchema]:
        """Get all registered templates.

        Returns:
            List of all TemplateSchema objects.
        """
        return list(self._templates.values())

    def update_template(self, schema: TemplateSchema) -> None:
        """Update an existing template schema.

        Args:
            schema: Updated TemplateSchema.

        Raises:
            ValueError: If template_id not found.
        """
        if schema.template_id not in self._templates:
            raise ValueError(f"Template {schema.template_id} not found")

        # Take the previous family from the index, not from the stored schema:
        # the caller may have mutated the registered object in place, in which
        # case the stored schema already carries the new family.
        old_family = self._indexed_family(schema.template_id)

        self._templates[schema.template_id] = schema

        if old_family != schema.family:
            self._unindex(schema.template_id)
            self._families.setdefault(schema.family, []).append(schema.template_id)

    def delete_template(self, template_id: str) -> None:
        """Delete a template from the registry.

        Args:
            template_id: ID of template to delete.

        Raises:
            ValueError: If template_id not found.
        """
        if template_id not in self._templates:
            raise ValueError(f"Template {template_id} not found")

        del self._templates[template_id]
        # Index-based removal also works if the schema's family attribute was
        # changed in place after registration.
        self._unindex(template_id)
fields={ + "employee_name": FieldSchema( + name="employee_name", + field_type=FieldType.TEXT, + required=True, + aliases=["full_name", "applicant_name"], + max_length=100, + ), + "email": FieldSchema( + name="email", + field_type=FieldType.EMAIL, + required=True, + aliases=["email_address", "contact_email"], + ), + "phone": FieldSchema( + name="phone", + field_type=FieldType.PHONE, + required=True, + aliases=["phone_number", "contact_phone"], + ), + "position": FieldSchema( + name="position", + field_type=FieldType.TEXT, + required=True, + aliases=["job_title"], + ), + "start_date": FieldSchema( + name="start_date", + field_type=FieldType.DATE, + required=False, + aliases=["employment_start"], + ), + "years_experience": FieldSchema( + name="years_experience", + field_type=FieldType.NUMBER, + required=False, + aliases=["experience_years"], + ), + }, + ) + reg.register_template(employment_schema) + + # Template 2: Medical Form + medical_schema = TemplateSchema( + template_id="med-form-v1", + template_name="Medical History", + version="1.0.0", + family="medical", + description="Patient medical history form", + fields={ + "patient_name": FieldSchema( + name="patient_name", + field_type=FieldType.TEXT, + required=True, + aliases=["full_name"], + ), + "date_of_birth": FieldSchema( + name="date_of_birth", + field_type=FieldType.DATE, + required=True, + aliases=["birthdate", "dob"], + ), + "allergies": FieldSchema( + name="allergies", + field_type=FieldType.TEXT, + required=False, + ), + }, + ) + reg.register_template(medical_schema) + + return reg + + +def test_template_registration(registry): + """Test that templates are properly registered in the registry.""" + assert len(registry.list_templates()) == 2 + assert registry.get_template("emp-form-v1") is not None + assert registry.get_template("med-form-v1") is not None + assert registry.get_template("nonexistent") is None + + +def test_template_family_grouping(registry): + """Test that templates are grouped by family 
correctly.""" + employment_templates = registry.get_templates_by_family("employment") + assert len(employment_templates) == 1 + assert employment_templates[0].template_id == "emp-form-v1" + + medical_templates = registry.get_templates_by_family("medical") + assert len(medical_templates) == 1 + assert medical_templates[0].template_id == "med-form-v1" + + +def test_template_families_listing(registry): + """Test listing all template families.""" + families = registry.list_families() + assert "employment" in families + assert "medical" in families + + +def test_field_name_resolution(registry): + """Test resolving extracted field names to canonical names.""" + template = registry.get_template("emp-form-v1") + + # Exact match + assert template.resolve_field_name("employee_name") == "employee_name" + + # Alias match + assert template.resolve_field_name("full_name") == "employee_name" + assert template.resolve_field_name("applicant_name") == "employee_name" + + # Non-match + assert template.resolve_field_name("nonexistent_field") is None + + +def test_compatibility_check_valid_data(registry): + """Test that valid extracted data passes compatibility check.""" + checker = CompatibilityChecker(registry) + + extracted_data = { + "employee_name": "John Doe", + "email": "john@example.com", + "phone": "555-1234567", + "position": "Software Engineer", + } + + report = checker.check_compatibility("emp-form-v1", extracted_data) + assert report.compatible + assert len(report.missing_fields) == 0 + assert len(report.extra_fields) == 0 + + +def test_compatibility_check_missing_required_fields(registry): + """Test that missing required fields trigger incompatibility.""" + checker = CompatibilityChecker(registry) + + extracted_data = { + "employee_name": "John Doe", + # Missing required: email, phone, position + } + + report = checker.check_compatibility("emp-form-v1", extracted_data) + assert not report.compatible + assert "email" in report.missing_fields + assert "phone" in 
report.missing_fields + assert "position" in report.missing_fields + + +def test_compatibility_check_extra_fields(registry): + """Test that extra fields not in template are reported.""" + checker = CompatibilityChecker(registry) + + extracted_data = { + "employee_name": "John Doe", + "email": "john@example.com", + "phone": "555-1234567", + "position": "Software Engineer", + "salary": "100000", # Extra field + "department": "Engineering", # Extra field + } + + report = checker.check_compatibility("emp-form-v1", extracted_data) + assert not report.compatible + assert "salary" in report.extra_fields + assert "department" in report.extra_fields + + +def test_compatibility_check_with_aliases(registry): + """Test that aliased field names are correctly mapped.""" + checker = CompatibilityChecker(registry) + + # Using aliases instead of canonical names + extracted_data = { + "full_name": "John Doe", # Alias for employee_name + "email_address": "john@example.com", # Alias for email + "contact_phone": "555-1234567", # Alias for phone + "job_title": "Software Engineer", # Alias for position + } + + report = checker.check_compatibility("emp-form-v1", extracted_data) + assert report.compatible + assert len(report.missing_fields) == 0 + assert len(report.extra_fields) == 0 + + +def test_compatibility_check_type_validation_email(registry): + """Test that email field type is validated.""" + checker = CompatibilityChecker(registry) + + extracted_data = { + "employee_name": "John Doe", + "email": "invalid-email", # Not a valid email + "phone": "555-1234567", + "position": "Software Engineer", + } + + report = checker.check_compatibility("emp-form-v1", extracted_data) + assert not report.compatible + assert "email" in report.type_mismatches + + +def test_compatibility_check_type_validation_phone(registry): + """Test that phone field type is validated.""" + checker = CompatibilityChecker(registry) + + extracted_data = { + "employee_name": "John Doe", + "email": "john@example.com", + 
"phone": "123", # Too short for valid phone + "position": "Software Engineer", + } + + report = checker.check_compatibility("emp-form-v1", extracted_data) + assert not report.compatible + assert "phone" in report.type_mismatches + + +def test_compatibility_check_type_validation_date(registry): + """Test that date field type is validated.""" + checker = CompatibilityChecker(registry) + + extracted_data = { + "patient_name": "Jane Doe", + "date_of_birth": "not-a-date", # Invalid date format + } + + report = checker.check_compatibility("med-form-v1", extracted_data) + assert not report.compatible + assert "date_of_birth" in report.type_mismatches + + +def test_compatibility_check_type_validation_date_valid_formats(registry): + """Test that various valid date formats are accepted.""" + checker = CompatibilityChecker(registry) + + valid_dates = [ + "01/15/1990", + "1990-01-15", + "January 15, 1990", + ] + + for date_value in valid_dates: + extracted_data = { + "patient_name": "Jane Doe", + "date_of_birth": date_value, + } + + report = checker.check_compatibility("med-form-v1", extracted_data) + assert report.compatible, f"Date format '{date_value}' should be valid" + + +def test_compatibility_check_type_validation_number(registry): + """Test that number field type is validated.""" + checker = CompatibilityChecker(registry) + + extracted_data = { + "employee_name": "John Doe", + "email": "john@example.com", + "phone": "555-1234567", + "position": "Software Engineer", + "years_experience": "not-a-number", + } + + report = checker.check_compatibility("emp-form-v1", extracted_data) + assert not report.compatible + assert "years_experience" in report.type_mismatches + + +def test_compatibility_check_matched_fields(registry): + """Test that successfully matched fields are tracked.""" + checker = CompatibilityChecker(registry) + + extracted_data = { + "employee_name": "John Doe", + "email": "john@example.com", + "phone": "555-1234567", + "position": "Software Engineer", + } + 
+ report = checker.check_compatibility("emp-form-v1", extracted_data) + assert len(report.matched_fields) == 4 + assert "employee_name" in report.matched_fields + + +def test_family_compatibility_check(registry): + """Test checking extracted data against all templates in a family.""" + checker = CompatibilityChecker(registry) + + extracted_data = { + "employee_name": "John Doe", + "email": "john@example.com", + "phone": "555-1234567", + "position": "Software Engineer", + } + + reports = checker.check_family_compatibility("employment", extracted_data) + assert len(reports) == 1 + assert "emp-form-v1" in reports + assert reports["emp-form-v1"].compatible + + +def test_find_compatible_templates(registry): + """Test finding templates compatible with extracted data.""" + checker = CompatibilityChecker(registry) + + extracted_data = { + "patient_name": "Jane Doe", + "date_of_birth": "01/15/1990", + } + + compatible = checker.find_compatible_templates(extracted_data) + assert len(compatible) > 0 + assert compatible[0][0] == "med-form-v1" + assert compatible[0][1].compatible + + +def test_compatibility_report_summary(registry): + """Test that compatibility report generates readable summary.""" + checker = CompatibilityChecker(registry) + + extracted_data = { + "employee_name": "John Doe", + "email": "invalid-email", + "phone": "555-1234567", + "position": "Software Engineer", + "extra_field": "value", + } + + report = checker.check_compatibility("emp-form-v1", extracted_data) + summary = report.summary() + + assert "emp-form-v1" in summary + assert "Incompatible" in summary + assert "email" in summary.lower() + + +def test_template_update(registry): + """Test updating an existing template in the registry.""" + template = registry.get_template("emp-form-v1") + original_version = template.version + + template.version = "1.1.0" + registry.update_template(template) + + updated = registry.get_template("emp-form-v1") + assert updated.version == "1.1.0" + + +def 
test_template_delete(registry): + """Test deleting a template from the registry.""" + registry.delete_template("emp-form-v1") + + assert registry.get_template("emp-form-v1") is None + assert len(registry.list_templates()) == 1 + + +def test_template_schema_get_required_fields(registry): + """Test getting all required fields from a template.""" + template = registry.get_template("emp-form-v1") + required = template.get_required_fields() + + assert "employee_name" in required + assert "email" in required + assert "phone" in required + assert "position" in required + + +def test_field_schema_get_all_names(registry): + """Test getting all possible field names (primary + aliases).""" + template = registry.get_template("emp-form-v1") + field_schema = template.fields["employee_name"] + + all_names = field_schema.get_all_names() + assert "employee_name" in all_names + assert "full_name" in all_names + assert "applicant_name" in all_names + + +def test_compatibility_check_invalid_template(): + """Test that checking against non-existent template raises error.""" + registry = TemplateRegistry() + checker = CompatibilityChecker(registry) + + extracted_data = {"field": "value"} + + with pytest.raises(ValueError, match="not found"): + checker.check_compatibility("nonexistent", extracted_data) + + +def test_duplicate_template_registration(): + """Test that registering duplicate template ID raises error.""" + registry = TemplateRegistry() + + schema = TemplateSchema( + template_id="test-v1", + template_name="Test", + version="1.0.0", + family="test", + ) + + registry.register_template(schema) + + with pytest.raises(ValueError, match="already registered"): + registry.register_template(schema)