diff --git a/src/backend.py b/src/backend.py index 6e45f24..d0012c4 100644 --- a/src/backend.py +++ b/src/backend.py @@ -4,14 +4,18 @@ from json_manager import JsonManager from input_manager import InputManager from pdfrw import PdfReader, PdfWriter +from .extraction_quality import ExtractionQualityProcessor class textToJSON(): - def __init__(self, transcript_text, target_fields, json={}): + def __init__(self, transcript_text, target_fields, json=None): + if json is None: + json = {} self.__transcript_text = transcript_text # str self.__target_fields = target_fields # List, contains the template field. self.__json = json # dictionary + self.__quality = ExtractionQualityProcessor() self.type_check_all() self.main_loop() @@ -71,6 +75,8 @@ def main_loop(self): #FUTURE -> Refactor this to its own class print("----------------------------------") print("\t[LOG] Resulting JSON created from the input text:") print(json.dumps(self.__json, indent=2)) + print("\t[LOG] Extraction quality report:") + print(json.dumps(self.__quality.build_report(), indent=2)) print("--------- extracted data ---------") return None @@ -80,22 +86,13 @@ def add_response_to_json(self, field, value): this method adds the following value under the specified field, or under a new field if the field doesn't exist, to the json dict """ - value = value.strip().replace('"', '') - parsed_value = None - plural = False - - if value != "-1": - parsed_value = value - - if ";" in value: - parsed_value = self.handle_plural_values(value) - plural = True - - - if field in self.__json.keys(): - self.__json[field].append(parsed_value) - else: - self.__json[field] = parsed_value + existing_value = self.__json.get(field) + normalized_value = self.__quality.process( + field=field, + raw_value=value, + existing_value=existing_value, + ) + self.__json[field] = normalized_value return diff --git a/src/extraction_quality.py b/src/extraction_quality.py new file mode 100644 index 0000000..f993882 --- /dev/null +++ 
b/src/extraction_quality.py
@@ -0,0 +1,147 @@
+import re
+
+
+MISSING_VALUE_SENTINEL = "__MISSING__"
+
+# Words that mark an extracted value as uncertain and worth a review.
+_AMBIGUOUS_PATTERN = re.compile(r"\b(or|maybe|possibly|unclear|unknown)\b")
+
+
+class ExtractionQualityProcessor:
+    """Post-processing for extracted field values.
+
+    Guarantees:
+    - missing values use one sentinel (MISSING_VALUE_SENTINEL)
+    - plural values use a normalized list format
+    - duplicates are merged deterministically (order-preserving unique)
+    - ambiguous values are flagged for review
+
+    The ``*_fields`` sets collect field names across ``process`` calls and
+    are exposed, sorted, by ``build_report``.
+    """
+
+    def __init__(self, missing_sentinel=MISSING_VALUE_SENTINEL):
+        self.missing_sentinel = missing_sentinel
+        self.duplicate_fields = set()
+        self.ambiguous_fields = set()
+        self.plural_normalized_fields = set()
+        self.missing_fields = set()
+
+    def process(self, field, raw_value, existing_value=None):
+        """Normalize ``raw_value`` and merge it into ``existing_value``.
+
+        Args:
+            field: Field name, used only for report bookkeeping.
+            raw_value: Extracted value; ``None``, ``""`` and ``"-1"`` are
+                treated as missing.
+            existing_value: Value already stored for ``field``, or ``None``
+                when nothing has been stored yet.
+
+        Returns:
+            The value to store: a string, a list of strings, or the
+            missing sentinel.
+        """
+        normalized_value, was_plural, is_ambiguous = self._normalize_value(raw_value)
+
+        if was_plural:
+            self.plural_normalized_fields.add(field)
+        if is_ambiguous:
+            self.ambiguous_fields.add(field)
+
+        if existing_value is None:
+            result = normalized_value
+        else:
+            result, had_duplicate = self._merge_values(existing_value, normalized_value)
+            if had_duplicate:
+                self.duplicate_fields.add(field)
+
+        # Track the stored value, not the latest input: a missing input must
+        # not flag a field that keeps a known value, and real data arriving
+        # later must clear an earlier missing flag.
+        if self._is_missing(result):
+            self.missing_fields.add(field)
+        else:
+            self.missing_fields.discard(field)
+        return result
+
+    def build_report(self):
+        """Return the sentinel and the sorted field names flagged so far."""
+        return {
+            "missing_sentinel": self.missing_sentinel,
+            "duplicate_fields": sorted(self.duplicate_fields),
+            "ambiguous_fields": sorted(self.ambiguous_fields),
+            "plural_normalized_fields": sorted(self.plural_normalized_fields),
+            "missing_fields": sorted(self.missing_fields),
+        }
+
+    def _normalize_value(self, raw_value):
+        """Return ``(value, was_plural, is_ambiguous)`` for ``raw_value``."""
+        if raw_value is None:
+            return self.missing_sentinel, False, False
+
+        # Drop quotes before trimming so padding inside the quotes
+        # (e.g. '" -1 "') is removed as well.
+        value = str(raw_value).replace('"', "").strip()
+        if value in ("", "-1"):
+            return self.missing_sentinel, False, False
+
+        if ";" in value:
+            items = self._normalize_plural_values(value)
+            if not items:
+                return self.missing_sentinel, True, False
+            ambiguous = any(self._is_ambiguous_token(item) for item in items)
+            return items, True, ambiguous
+
+        return value, False, self._is_ambiguous_token(value)
+
+    def _normalize_plural_values(self, raw_plural_value):
+        """Split on ``;`` and return the unique non-missing items in order."""
+        parts = [part.strip() for part in raw_plural_value.split(";")]
+        parts = [part for part in parts if part and part != "-1"]
+        return self._unique_ordered(parts)
+
+    def _merge_values(self, existing_value, new_value):
+        """Merge ``new_value`` into ``existing_value``.
+
+        Returns ``(merged, had_duplicate)``; ``had_duplicate`` is True when
+        an item of ``new_value`` was already present.
+        """
+        # Do not replace known values with missing sentinel.
+        if self._is_missing(new_value):
+            return existing_value, False
+
+        # Missing existing value should be replaced by real data.
+        if self._is_missing(existing_value):
+            return new_value, False
+
+        existing_is_list = isinstance(existing_value, list)
+        new_is_list = isinstance(new_value, list)
+        existing_items = existing_value if existing_is_list else [existing_value]
+        new_items = new_value if new_is_list else [new_value]
+
+        merged = list(existing_items)
+        had_duplicate = False
+
+        for item in new_items:
+            if item in merged:
+                had_duplicate = True
+                continue
+            merged.append(item)
+
+        # Collapse to a scalar only when both inputs were scalars, so a value
+        # already in list form keeps its shape after a duplicate merge.
+        if len(merged) == 1 and not (existing_is_list or new_is_list):
+            return merged[0], had_duplicate
+        return merged, had_duplicate
+
+    def _is_missing(self, value):
+        """Return True when ``value`` is the missing sentinel."""
+        return value == self.missing_sentinel
+
+    def _is_ambiguous_token(self, token):
+        """Return True when ``token`` contains a hedging word."""
+        return bool(_AMBIGUOUS_PATTERN.search(token.strip().lower()))
+
+    def _unique_ordered(self, values):
+        """Return ``values`` without repeats, keeping first-seen order."""
+        return list(dict.fromkeys(values))
diff --git a/src/test/test_extraction_quality.py b/src/test/test_extraction_quality.py
new file mode 100644
index 0000000..c001930
--- /dev/null
+++ b/src/test/test_extraction_quality.py
@@ -0,0 +1,78 @@
+from src.extraction_quality import ExtractionQualityProcessor, MISSING_VALUE_SENTINEL
+
+
+def test_missing_values_use_single_sentinel():
+    processor = ExtractionQualityProcessor()
+
+    value = processor.process("Phone", "-1")
+
+    assert value == MISSING_VALUE_SENTINEL
+    report = processor.build_report()
+    assert report["missing_sentinel"] == MISSING_VALUE_SENTINEL
+    assert report["missing_fields"] == ["Phone"]
+
+
+def test_plural_values_normalize_to_deduplicated_list():
+    processor = ExtractionQualityProcessor()
+
+    value = processor.process("Victims", "Jane Doe; John Doe; Jane Doe")
+
+    assert value == ["Jane Doe", "John Doe"]
+    report = processor.build_report()
+    assert report["plural_normalized_fields"] == ["Victims"]
+
+
+def test_duplicate_merge_is_deterministic():
+    processor = ExtractionQualityProcessor()
+
+    existing = processor.process("Officer", "Alvarez")
+    merged = processor.process("Officer", "Alvarez", existing_value=existing)
+
+    assert merged == "Alvarez"
+    report = processor.build_report()
+    assert report["duplicate_fields"] == ["Officer"]
+
+
+def test_duplicate_merge_promotes_to_list_when_values_differ():
+    processor = ExtractionQualityProcessor()
+
+    existing = processor.process("Officer", "Alvarez")
+    merged = processor.process("Officer", "Martinez", existing_value=existing)
+
+    assert merged == ["Alvarez", "Martinez"]
+
+
+def test_ambiguous_values_are_flagged_for_review():
+    processor = ExtractionQualityProcessor()
+
+    processor.process("Incident Type", "Fire or smoke event")
+
+    report = processor.build_report()
+    assert report["ambiguous_fields"] == ["Incident Type"]
+
+
+def test_missing_flag_tracks_the_stored_value():
+    processor = ExtractionQualityProcessor()
+
+    first = processor.process("Phone", "-1")
+    second = processor.process("Phone", "555-0100", existing_value=first)
+    third = processor.process("Phone", "-1", existing_value=second)
+
+    assert third == "555-0100"
+    assert processor.build_report()["missing_fields"] == []
+
+
+def test_padding_inside_quotes_is_trimmed():
+    processor = ExtractionQualityProcessor()
+
+    assert processor.process("Name", '" Jane Doe "') == "Jane Doe"
+    assert processor.process("Phone", '" -1 "') == MISSING_VALUE_SENTINEL
+
+
+def test_list_value_keeps_list_shape_on_duplicate_merge():
+    processor = ExtractionQualityProcessor()
+
+    existing = processor.process("Victims", "Jane Doe; Jane Doe")
+    merged = processor.process("Victims", "Jane Doe", existing_value=existing)
+
+    assert merged == ["Jane Doe"]