import re


MISSING_VALUE_SENTINEL = "__MISSING__"

# Compiled once at import time; matched against lower-cased tokens.
_AMBIGUITY_PATTERN = re.compile(r"\b(or|maybe|possibly|unclear|unknown)\b")


class ExtractionQualityProcessor:
    """Normalize and merge extracted values from noisy conversational outputs.

    Each call to :meth:`process` normalizes one raw value for a field,
    optionally merges it with a value already stored for that field, and
    records quality flags (missing, duplicate, ambiguous, plural) that
    :meth:`build_report` later exposes as sorted lists of field names.
    """

    def __init__(self, missing_sentinel=MISSING_VALUE_SENTINEL):
        """Create a processor that marks absent values with ``missing_sentinel``."""
        self.missing_sentinel = missing_sentinel
        self.duplicate_fields = set()
        self.ambiguous_fields = set()
        self.plural_normalized_fields = set()
        self.missing_fields = set()

    def process(self, field, raw_value, existing_value=None):
        """Normalize ``raw_value`` and merge it with ``existing_value``.

        Args:
            field: Name of the field the value belongs to; used as the key
                in every quality flag set.
            raw_value: Value to normalize. ``None``, an empty string and
                ``"-1"`` are treated as missing; a value containing ``;``
                is split into a de-duplicated list.
            existing_value: Value already stored for ``field`` (a scalar or
                a list), or ``None`` when nothing is stored yet.

        Returns:
            The merged value: the missing sentinel, a single string, or a
            list of strings.
        """
        normalized_value, was_plural, is_ambiguous = self._normalize_value(raw_value)

        if was_plural:
            self.plural_normalized_fields.add(field)
        if is_ambiguous:
            self.ambiguous_fields.add(field)

        if existing_value is None:
            merged_value, had_duplicate = normalized_value, False
        else:
            merged_value, had_duplicate = self._merge_values(
                existing_value, normalized_value
            )

        if had_duplicate:
            self.duplicate_fields.add(field)

        if self._is_missing(merged_value):
            self.missing_fields.add(field)
        else:
            # A field that now holds a real value must not stay reported as
            # missing because of an earlier empty answer.
            self.missing_fields.discard(field)

        return merged_value

    def build_report(self):
        """Return the collected quality flags as a dict of sorted field lists."""
        return {
            "missing_sentinel": self.missing_sentinel,
            "duplicate_fields": sorted(self.duplicate_fields),
            "ambiguous_fields": sorted(self.ambiguous_fields),
            "plural_normalized_fields": sorted(self.plural_normalized_fields),
            "missing_fields": sorted(self.missing_fields),
        }

    def _normalize_value(self, raw_value):
        """Return ``(normalized_value, was_plural, is_ambiguous)`` for ``raw_value``."""
        if raw_value is None:
            return self.missing_sentinel, False, False

        # Remove quotes BEFORE trimming: trimming first leaves the padding
        # that sat inside the quotes, so '" -1 "' and '"  "' would slip past
        # the missing-value check below.
        value = str(raw_value).replace('"', "").strip()

        if value in ("", "-1"):
            return self.missing_sentinel, False, False

        if ";" in value:
            plural_values = self._normalize_plural_values(value)
            if not plural_values:
                # Every part was empty or "-1" (e.g. ";" or "-1; -1").
                return self.missing_sentinel, True, False
            is_ambiguous = any(
                self._is_ambiguous_token(item) for item in plural_values
            )
            return plural_values, True, is_ambiguous

        return value, False, self._is_ambiguous_token(value)

    def _normalize_plural_values(self, raw_plural_value):
        """Split on ``;``, drop empty and ``"-1"`` parts, de-duplicate in order."""
        parts = (part.strip() for part in raw_plural_value.split(";"))
        return self._unique_ordered(
            [part for part in parts if part and part != "-1"]
        )

    def _merge_values(self, existing_value, new_value):
        """Merge ``new_value`` into ``existing_value``.

        Returns:
            ``(merged, had_duplicate)`` where ``had_duplicate`` is True when
            at least one new item was already present. A merge result with
            exactly one item is returned as a scalar, not a list.
        """
        if self._is_missing(new_value):
            return existing_value, False

        if self._is_missing(existing_value):
            return new_value, False

        existing_values = (
            existing_value if isinstance(existing_value, list) else [existing_value]
        )
        new_values = new_value if isinstance(new_value, list) else [new_value]

        # Copy so the caller's list is never mutated in place.
        merged = list(existing_values)
        had_duplicate = False

        for value in new_values:
            if value in merged:
                had_duplicate = True
                continue
            merged.append(value)

        if len(merged) == 1:
            return merged[0], had_duplicate

        return merged, had_duplicate

    def _is_missing(self, value):
        """Return True when ``value`` equals the configured missing sentinel."""
        return value == self.missing_sentinel

    def _is_ambiguous_token(self, token):
        """Return True when ``token`` contains a hedging word such as "or" or "maybe"."""
        return bool(_AMBIGUITY_PATTERN.search(token.strip().lower()))

    def _unique_ordered(self, values):
        """Return ``values`` without duplicates, keeping first-seen order."""
        return list(dict.fromkeys(values))
def add_response_to_json(self, field, value):
    """
    this method adds the following value under the specified field,
    or under a new field if the field doesn't exist, to the json dict

    The raw value is passed through the extraction-quality processor
    together with whatever is already stored for ``field``; the processor's
    merged result replaces the stored value.
    """
    if self._json is None:
        # The constructor defaults ``json`` to None; start from an empty dict
        # instead of failing on ``None.get`` for a default-constructed object.
        self._json = {}

    self._json[field] = self._quality.process(
        field=field,
        raw_value=value,
        existing_value=self._json.get(field),
    )


def get_quality_report(self):
    """Return the report built by the extraction-quality processor."""
    return self._quality.build_report()
from src.extraction_quality import ExtractionQualityProcessor, MISSING_VALUE_SENTINEL
from src.llm import LLM


def test_missing_values_use_consistent_sentinel():
    """A "-1" answer is stored as the sentinel and reported as missing."""
    processor = ExtractionQualityProcessor()

    value = processor.process("Phone", "-1")

    assert value == MISSING_VALUE_SENTINEL
    report = processor.build_report()
    assert report["missing_sentinel"] == MISSING_VALUE_SENTINEL
    assert report["missing_fields"] == ["Phone"]


def test_plural_values_are_normalized_and_deduplicated():
    """A ";"-separated answer becomes an ordered list without repeats."""
    processor = ExtractionQualityProcessor()

    value = processor.process("Victims", "Jane Doe; John Doe; Jane Doe")

    assert value == ["Jane Doe", "John Doe"]
    assert processor.build_report()["plural_normalized_fields"] == ["Victims"]


def test_duplicate_merge_is_deterministic():
    """Merging an identical value keeps the scalar and flags the duplicate."""
    processor = ExtractionQualityProcessor()

    first = processor.process("Officer", "Alvarez")
    merged = processor.process("Officer", "Alvarez", existing_value=first)

    assert merged == "Alvarez"
    assert processor.build_report()["duplicate_fields"] == ["Officer"]


def test_ambiguity_is_flagged_for_review():
    """A value containing a hedging word ("or") is reported as ambiguous."""
    processor = ExtractionQualityProcessor()

    processor.process("Incident Type", "Fire or smoke event")

    assert processor.build_report()["ambiguous_fields"] == ["Incident Type"]


def test_llm_add_response_uses_quality_pipeline():
    """LLM.add_response_to_json routes values through the quality processor."""
    # Pass an explicit dict: the constructor's ``json`` parameter defaults to
    # None, which add_response_to_json cannot call ``.get`` on.
    llm = LLM(json={})

    llm.add_response_to_json("Victims", "Jane Doe; John Doe; Jane Doe")
    llm.add_response_to_json("Victims", "John Doe")
    llm.add_response_to_json("Phone", "-1")

    data = llm.get_data()
    report = llm.get_quality_report()

    assert data["Victims"] == ["Jane Doe", "John Doe"]
    assert data["Phone"] == MISSING_VALUE_SENTINEL
    assert report["plural_normalized_fields"] == ["Victims"]
    # The second "John Doe" was already present, so the merge is a duplicate.
    assert report["duplicate_fields"] == ["Victims"]
    assert report["missing_fields"] == ["Phone"]