fireform-core · Aryama-srivastav · Mar 17, 2026 · Mar 17, 2026
diff --git a/src/backend.py b/src/backend.py
@@ -4,14 +4,18 @@
 from json_manager import JsonManager
 from input_manager import InputManager
 from pdfrw import PdfReader, PdfWriter
+from extraction_quality import ExtractionQualityProcessor  # absolute, matching the sibling imports
 
 
 
 class textToJSON():
-    def __init__(self, transcript_text, target_fields, json={}):
+    def __init__(self, transcript_text, target_fields, json=None):
+        if json is None:
+            json = {}
         self.__transcript_text = transcript_text # str
         self.__target_fields = target_fields # List, contains the template field.
         self.__json = json # dictionary
+        self.__quality = ExtractionQualityProcessor() # normalizes values and collects quality flags
         self.type_check_all()
         self.main_loop()
 
@@ -71,6 +75,8 @@ def main_loop(self): #FUTURE -> Refactor this to its own class
         print("----------------------------------")
         print("\t[LOG] Resulting JSON created from the input text:")
         print(json.dumps(self.__json, indent=2))
+        print("\t[LOG] Extraction quality report:")
+        print(json.dumps(self.__quality.build_report(), indent=2))
         print("--------- extracted data ---------")
 
         return None
@@ -80,22 +86,13 @@ def add_response_to_json(self, field, value):
             this method adds the following value under the specified field, 
             or under a new field if the field doesn't exist, to the json dict 
         """
-        value = value.strip().replace('"', '')
-        parsed_value = None
-        plural = False
-
-        if value != "-1":
-            parsed_value = value       
-
-        if ";" in value:
-            parsed_value = self.handle_plural_values(value)
-            plural = True
-
-
-        if field in self.__json.keys():
-            self.__json[field].append(parsed_value)
-        else: 
-            self.__json[field] = parsed_value
+        # Normalize the raw value and fold it into whatever is already stored
+        # for this field; the processor also records quality flags.
+        self.__json[field] = self.__quality.process(
+            field=field,
+            raw_value=value,
+            existing_value=self.__json.get(field),
+        )
 
         return
 

diff --git a/src/extraction_quality.py b/src/extraction_quality.py
@@ -0,0 +1,145 @@
+import re
+
+
+MISSING_VALUE_SENTINEL = "__MISSING__"
+
+# Words that mark an extracted value as uncertain; compiled once at import.
+_AMBIGUOUS_TOKEN_RE = re.compile(r"\b(or|maybe|possibly|unclear|unknown)\b")
+
+
+class ExtractionQualityProcessor:
+    """Post-processing for extracted field values.
+
+    Guarantees:
+    - missing values use one sentinel (MISSING_VALUE_SENTINEL)
+    - plural values use a normalized list format
+    - duplicates are merged deterministically (order-preserving unique)
+    - ambiguous values are flagged for review
+
+    The ``*_fields`` sets accumulate field names across ``process`` calls
+    and are exposed, sorted, through ``build_report``.
+    """
+
+    def __init__(self, missing_sentinel=MISSING_VALUE_SENTINEL):
+        """Create a processor with empty report state.
+
+        Args:
+            missing_sentinel: Value returned in place of missing data.
+        """
+        self.missing_sentinel = missing_sentinel
+        self.duplicate_fields = set()
+        self.ambiguous_fields = set()
+        self.plural_normalized_fields = set()
+        self.missing_fields = set()
+
+    def process(self, field, raw_value, existing_value=None):
+        """Normalize ``raw_value`` and merge it with ``existing_value``.
+
+        Args:
+            field: Field name used as the key in the quality report.
+            raw_value: Freshly extracted value; ``None``, ``""`` and
+                ``"-1"`` all count as missing.
+            existing_value: Value already stored for ``field``, if any.
+
+        Returns:
+            The value to store for ``field``: the missing sentinel, a
+            single value, or a list of values.
+        """
+        normalized_value, was_plural, is_ambiguous = self._normalize_value(raw_value)
+
+        if was_plural:
+            self.plural_normalized_fields.add(field)
+        if is_ambiguous:
+            self.ambiguous_fields.add(field)
+
+        if existing_value is None:
+            result = normalized_value
+        else:
+            result, had_duplicate = self._merge_values(existing_value, normalized_value)
+            if had_duplicate:
+                self.duplicate_fields.add(field)
+
+        # Track "missing" from the value the caller will actually store, so
+        # the report never lists a field that ended up with real data.
+        if self._is_missing(result):
+            self.missing_fields.add(field)
+        else:
+            self.missing_fields.discard(field)
+        return result
+
+    def build_report(self):
+        """Return the collected quality flags as a dict of sorted lists."""
+        return {
+            "missing_sentinel": self.missing_sentinel,
+            "duplicate_fields": sorted(self.duplicate_fields),
+            "ambiguous_fields": sorted(self.ambiguous_fields),
+            "plural_normalized_fields": sorted(self.plural_normalized_fields),
+            "missing_fields": sorted(self.missing_fields),
+        }
+
+    def _normalize_value(self, raw_value):
+        """Return ``(value, was_plural, is_ambiguous)`` for one raw value."""
+        if raw_value is None:
+            return self.missing_sentinel, False, False
+
+        # Drop quotes before stripping so whitespace that sat inside the
+        # quotes (e.g. '" -1 "') is trimmed as well.
+        value = str(raw_value).replace('"', "").strip()
+        if value in ("", "-1"):
+            return self.missing_sentinel, False, False
+
+        if ";" in value:
+            items = self._normalize_plural_values(value)
+            if not items:
+                return self.missing_sentinel, True, False
+            ambiguous = any(self._is_ambiguous_token(item) for item in items)
+            return items, True, ambiguous
+
+        return value, False, self._is_ambiguous_token(value)
+
+    def _normalize_plural_values(self, raw_plural_value):
+        """Split a ``;``-separated string into unique, non-missing items."""
+        parts = (part.strip() for part in raw_plural_value.split(";"))
+        return self._unique_ordered(part for part in parts if part and part != "-1")
+
+    def _merge_values(self, existing_value, new_value):
+        """Merge ``new_value`` into ``existing_value``.
+
+        Returns:
+            ``(merged, had_duplicate)``; ``merged`` collapses to a scalar
+            when only one item remains.
+        """
+        # Do not replace known values with missing sentinel.
+        if self._is_missing(new_value):
+            return existing_value, False
+
+        # Missing existing value should be replaced by real data.
+        if self._is_missing(existing_value):
+            return new_value, False
+
+        existing_items = existing_value if isinstance(existing_value, list) else [existing_value]
+        new_items = new_value if isinstance(new_value, list) else [new_value]
+
+        merged = list(existing_items)
+        had_duplicate = False
+        for item in new_items:
+            if item in merged:
+                had_duplicate = True
+            else:
+                merged.append(item)
+
+        if len(merged) == 1:
+            return merged[0], had_duplicate
+        return merged, had_duplicate
+
+    def _is_missing(self, value):
+        """Return True when ``value`` equals the missing sentinel."""
+        return value == self.missing_sentinel
+
+    def _is_ambiguous_token(self, token):
+        """Return True when ``token`` contains a hedging word such as "or"."""
+        return bool(_AMBIGUOUS_TOKEN_RE.search(token.strip().lower()))
+
+    def _unique_ordered(self, values):
+        """Return ``values`` without repeats, keeping first-seen order."""
+        return list(dict.fromkeys(values))
diff --git a/src/test/test_extraction_quality.py b/src/test/test_extraction_quality.py
@@ -0,0 +1,54 @@
+"""Unit tests for ``ExtractionQualityProcessor`` normalization and reporting."""
+
+from src.extraction_quality import ExtractionQualityProcessor, MISSING_VALUE_SENTINEL
+
+
+def test_missing_values_use_single_sentinel():
+    """A "-1" extraction becomes the sentinel and is listed as missing."""
+    quality = ExtractionQualityProcessor()
+
+    assert quality.process("Phone", "-1") == MISSING_VALUE_SENTINEL
+
+    summary = quality.build_report()
+    assert summary["missing_sentinel"] == MISSING_VALUE_SENTINEL
+    assert summary["missing_fields"] == ["Phone"]
+
+
+def test_plural_values_normalize_to_deduplicated_list():
+    """Semicolon-separated input becomes an ordered list without repeats."""
+    quality = ExtractionQualityProcessor()
+
+    victims = quality.process("Victims", "Jane Doe; John Doe; Jane Doe")
+
+    assert victims == ["Jane Doe", "John Doe"]
+    assert quality.build_report()["plural_normalized_fields"] == ["Victims"]
+
+
+def test_duplicate_merge_is_deterministic():
+    """Re-extracting the same value keeps a scalar and flags the duplicate."""
+    quality = ExtractionQualityProcessor()
+
+    first = quality.process("Officer", "Alvarez")
+    second = quality.process("Officer", "Alvarez", existing_value=first)
+
+    assert second == "Alvarez"
+    assert quality.build_report()["duplicate_fields"] == ["Officer"]
+
+
+def test_duplicate_merge_promotes_to_list_when_values_differ():
+    """A different second value turns the field into a two-item list."""
+    quality = ExtractionQualityProcessor()
+
+    first = quality.process("Officer", "Alvarez")
+    second = quality.process("Officer", "Martinez", existing_value=first)
+
+    assert second == ["Alvarez", "Martinez"]
+
+
+def test_ambiguous_values_are_flagged_for_review():
+    """A value containing "or" puts the field on the ambiguous list."""
+    quality = ExtractionQualityProcessor()
+
+    quality.process("Incident Type", "Fire or smoke event")
+
+    assert quality.build_report()["ambiguous_fields"] == ["Incident Type"]