From f8e01e744f3d2686f4463ebfccb977fe06e47817 Mon Sep 17 00:00:00 2001 From: Aryama Srivastav Date: Sat, 7 Mar 2026 22:18:11 +0530 Subject: [PATCH 1/3] Update input.txt with UC Vaccine Declination Statement for Sarah Johnson. --- src/inputs/input.txt | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/inputs/input.txt b/src/inputs/input.txt index faa55cd..e68385c 100644 --- a/src/inputs/input.txt +++ b/src/inputs/input.txt @@ -1 +1,10 @@ -Officer Voldemort here, at an incident reported at 456 Oak Street. Two victims, Mark Smith and Jane Doe. Medical aid rendered for minor lacerations. Handed off to Sheriff's Deputy Alvarez. End of transmission. +UC Vaccine Declination Statement + +Name/SID: Sarah Johnson, SID 4527891 +Job Title: Research Scientist +Department: Microbiology +Phone Number: 831-555-0142 +Email: sjohnson@ucsc.edu +Date: 03/15/2026 + +Signature: ________________________ \ No newline at end of file From ab4605d8d7b887c7c24ec8e48ae28d15df4a515b Mon Sep 17 00:00:00 2001 From: Aryama Srivastav Date: Wed, 18 Mar 2026 22:20:07 +0530 Subject: [PATCH 2/3] feat: add extraction quality controls for duplicates ambiguity and plural normalization --- src/extraction_quality.py | 109 +++++++++++++++++++ src/llm.py | 27 ++--- src/test/test_extraction_quality_controls.py | 55 ++++++++++ 3 files changed, 178 insertions(+), 13 deletions(-) create mode 100644 src/extraction_quality.py create mode 100644 src/test/test_extraction_quality_controls.py diff --git a/src/extraction_quality.py b/src/extraction_quality.py new file mode 100644 index 0000000..d33f08f --- /dev/null +++ b/src/extraction_quality.py @@ -0,0 +1,109 @@ +import re + + +MISSING_VALUE_SENTINEL = "__MISSING__" + + +class ExtractionQualityProcessor: + """Normalize and merge extracted values from noisy conversational outputs.""" + + def __init__(self, missing_sentinel=MISSING_VALUE_SENTINEL): + self.missing_sentinel = missing_sentinel + self.duplicate_fields = set() + self.ambiguous_fields = set() + self.plural_normalized_fields = set() + self.missing_fields = set() + + def process(self, field, raw_value, existing_value=None): + normalized_value, was_plural, is_ambiguous = self._normalize_value(raw_value) + + if was_plural: + self.plural_normalized_fields.add(field) + if is_ambiguous: + self.ambiguous_fields.add(field) + if self._is_missing(normalized_value): + self.missing_fields.add(field) + + if existing_value is None: + return normalized_value + + merged_value, had_duplicate = self._merge_values(existing_value, normalized_value) + if had_duplicate: + self.duplicate_fields.add(field) + + return merged_value + + def build_report(self): + return { + "missing_sentinel": self.missing_sentinel, + "duplicate_fields": sorted(self.duplicate_fields), + "ambiguous_fields": sorted(self.ambiguous_fields), + "plural_normalized_fields": sorted(self.plural_normalized_fields), + "missing_fields": sorted(self.missing_fields), + } + + def _normalize_value(self, raw_value): + if raw_value is None: + return self.missing_sentinel, False, False + + value = str(raw_value).strip().replace('"', "") + + if value == "" or value == "-1": + return self.missing_sentinel, False, False + + if ";" in value: + plural_values = self._normalize_plural_values(value) + if not plural_values: + return self.missing_sentinel, True, False + is_ambiguous = any(self._is_ambiguous_token(item) for item in plural_values) + return plural_values, True, is_ambiguous + + return value, False, self._is_ambiguous_token(value) + + def _normalize_plural_values(self, raw_plural_value): + values = [part.strip() for part in raw_plural_value.split(";")] + values = [value for value in values if value and value != "-1"] + return self._unique_ordered(values) + + def _merge_values(self, existing_value, new_value): + if self._is_missing(new_value): + return existing_value, False + + if self._is_missing(existing_value): + return new_value, False + + existing_values = existing_value if isinstance(existing_value, list) else [existing_value] + new_values = new_value if isinstance(new_value, list) else [new_value] + + merged = list(existing_values) + had_duplicate = False + + for value in new_values: + if value in merged: + had_duplicate = True + continue + merged.append(value) + + if len(merged) == 1: + return merged[0], had_duplicate + + return merged, had_duplicate + + def _is_missing(self, value): + return value == self.missing_sentinel + + def _is_ambiguous_token(self, token): + token = token.strip().lower() + return bool(re.search(r"\b(or|maybe|possibly|unclear|unknown)\b", token)) + + def _unique_ordered(self, values): + seen = set() + out = [] + + for value in values: + if value in seen: + continue + seen.add(value) + out.append(value) + + return out diff --git a/src/llm.py b/src/llm.py index 70937f9..1058104 100644 --- a/src/llm.py +++ b/src/llm.py @@ -1,6 +1,7 @@ import json import os import requests +from src.extraction_quality import ExtractionQualityProcessor class LLM: @@ -10,6 +11,7 @@ def __init__(self, transcript_text=None, target_fields=None, json=None): self._transcript_text = transcript_text # str self._target_fields = target_fields # List, contains the template field. self._json = json # dictionary + self._quality = ExtractionQualityProcessor() def type_check_all(self): if type(self._transcript_text) is not str: @@ -79,6 +81,8 @@ def main_loop(self): print("----------------------------------") print("\t[LOG] Resulting JSON created from the input text:") print(json.dumps(self._json, indent=2)) + print("\t[LOG] Extraction quality report:") + print(json.dumps(self._quality.build_report(), indent=2)) print("--------- extracted data ---------") return self @@ -88,22 +92,19 @@ def add_response_to_json(self, field, value): this method adds the following value under the specified field, or under a new field if the field doesn't exist, to the json dict """ - value = value.strip().replace('"', "") - parsed_value = None - - if value != "-1": - parsed_value = value - - if ";" in value: - parsed_value = self.handle_plural_values(value) - - if field in self._json.keys(): - self._json[field].append(parsed_value) - else: - self._json[field] = parsed_value + existing_value = self._json.get(field) + normalized_value = self._quality.process( + field=field, + raw_value=value, + existing_value=existing_value, + ) + self._json[field] = normalized_value return + def get_quality_report(self): + return self._quality.build_report() + def handle_plural_values(self, plural_value): """ This method handles plural values. diff --git a/src/test/test_extraction_quality_controls.py b/src/test/test_extraction_quality_controls.py new file mode 100644 index 0000000..91a2e1d --- /dev/null +++ b/src/test/test_extraction_quality_controls.py @@ -0,0 +1,55 @@ +from src.extraction_quality import ExtractionQualityProcessor, MISSING_VALUE_SENTINEL +from src.llm import LLM + + +def test_missing_values_use_consistent_sentinel(): + processor = ExtractionQualityProcessor() + + value = processor.process("Phone", "-1") + + assert value == MISSING_VALUE_SENTINEL + report = processor.build_report() + assert report["missing_sentinel"] == MISSING_VALUE_SENTINEL + assert report["missing_fields"] == ["Phone"] + + +def test_plural_values_are_normalized_and_deduplicated(): + processor = ExtractionQualityProcessor() + + value = processor.process("Victims", "Jane Doe; John Doe; Jane Doe") + + assert value == ["Jane Doe", "John Doe"] + assert processor.build_report()["plural_normalized_fields"] == ["Victims"] + + +def test_duplicate_merge_is_deterministic(): + processor = ExtractionQualityProcessor() + + first = processor.process("Officer", "Alvarez") + merged = processor.process("Officer", "Alvarez", existing_value=first) + + assert merged == "Alvarez" + assert processor.build_report()["duplicate_fields"] == ["Officer"] + + +def test_ambiguity_is_flagged_for_review(): + processor = ExtractionQualityProcessor() + + processor.process("Incident Type", "Fire or smoke event") + + assert processor.build_report()["ambiguous_fields"] == ["Incident Type"] + + +def test_llm_add_response_uses_quality_pipeline(): + llm = LLM() + + llm.add_response_to_json("Victims", "Jane Doe; John Doe; Jane Doe") + llm.add_response_to_json("Victims", "John Doe") + llm.add_response_to_json("Phone", "-1") + + data = llm.get_data() + report = llm.get_quality_report() + + assert data["Victims"] == ["Jane Doe", "John Doe"] + assert data["Phone"] == MISSING_VALUE_SENTINEL + assert report["plural_normalized_fields"] == ["Victims"] From 942462a2b469d82ddc8e17167ae30e5d0d3748f2 Mon Sep 17 00:00:00 2001 From: Aryama Srivastav Date: Wed, 18 Mar 2026 22:38:24 +0530 Subject: [PATCH 3/3] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- src/extraction_quality.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/extraction_quality.py b/src/extraction_quality.py index d33f08f..f8eecaa 100644 --- a/src/extraction_quality.py +++ b/src/extraction_quality.py @@ -21,16 +21,22 @@ def process(self, field, raw_value, existing_value=None): self.plural_normalized_fields.add(field) if is_ambiguous: self.ambiguous_fields.add(field) - if self._is_missing(normalized_value): - self.missing_fields.add(field) if existing_value is None: - return normalized_value + merged_value = normalized_value + had_duplicate = False + else: + merged_value, had_duplicate = self._merge_values(existing_value, normalized_value) - merged_value, had_duplicate = self._merge_values(existing_value, normalized_value) if had_duplicate: self.duplicate_fields.add(field) + if self._is_missing(merged_value): + self.missing_fields.add(field) + else: + # Ensure the field is not reported as missing if the final merged value is present + self.missing_fields.discard(field) + return merged_value def build_report(self):