Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions src/extraction_quality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import re


# Value stored for a field when the extractor produced no usable answer.
MISSING_VALUE_SENTINEL = "__MISSING__"


class ExtractionQualityProcessor:
    """Normalize and merge extracted values from noisy conversational outputs.

    Each call to :meth:`process` cleans one raw response for a field, merges
    it with the value already stored for that field (if any), and records
    quality flags (duplicate, ambiguous, plural, missing) per field name.
    :meth:`build_report` exposes those flags as sorted lists.
    """

    # Raw marker that is treated as "no value" during normalization.
    _RAW_MISSING_MARKER = "-1"

    # Compiled once at class creation instead of on every token check.
    # Tokens are lower-cased before matching, so the pattern is lowercase.
    _AMBIGUITY_PATTERN = re.compile(r"\b(or|maybe|possibly|unclear|unknown)\b")

    def __init__(self, missing_sentinel=MISSING_VALUE_SENTINEL):
        """Create a processor with empty quality-flag sets.

        Args:
            missing_sentinel: Value returned and compared against to mark a
                field as missing.
        """
        self.missing_sentinel = missing_sentinel
        self.duplicate_fields = set()
        self.ambiguous_fields = set()
        self.plural_normalized_fields = set()
        self.missing_fields = set()

    def process(self, field, raw_value, existing_value=None):
        """Normalize ``raw_value`` and merge it into ``existing_value``.

        Args:
            field: Field name used as the key in the quality-flag sets.
            raw_value: Raw extractor output; ``None`` is treated as missing,
                anything else is converted with ``str()``.
            existing_value: Value already stored for the field, or ``None``
                when nothing has been stored yet.

        Returns:
            The merged value: the missing sentinel, a single string, or a
            list of strings.
        """
        normalized_value, was_plural, is_ambiguous = self._normalize_value(raw_value)

        if was_plural:
            self.plural_normalized_fields.add(field)
        if is_ambiguous:
            self.ambiguous_fields.add(field)

        if existing_value is None:
            merged_value, had_duplicate = normalized_value, False
        else:
            merged_value, had_duplicate = self._merge_values(
                existing_value, normalized_value
            )

        if had_duplicate:
            self.duplicate_fields.add(field)

        if self._is_missing(merged_value):
            self.missing_fields.add(field)
        else:
            # A field that was missing earlier stops being reported as
            # missing once the merged value is present.
            self.missing_fields.discard(field)

        return merged_value

    def build_report(self):
        """Return the sentinel plus each flag set as a sorted list."""
        return {
            "missing_sentinel": self.missing_sentinel,
            "duplicate_fields": sorted(self.duplicate_fields),
            "ambiguous_fields": sorted(self.ambiguous_fields),
            "plural_normalized_fields": sorted(self.plural_normalized_fields),
            "missing_fields": sorted(self.missing_fields),
        }

    def _normalize_value(self, raw_value):
        """Clean one raw value.

        Returns:
            A ``(value, was_plural, is_ambiguous)`` tuple, where ``value`` is
            the missing sentinel, a string, or a list of strings.
        """
        if raw_value is None:
            return self.missing_sentinel, False, False

        # Remove quotes BEFORE stripping: stripping first leaves whitespace
        # that sat inside the quotes (e.g. '" -1 "' -> ' -1 '), which then
        # is neither recognised as missing nor returned trimmed.
        value = str(raw_value).replace('"', "").strip()

        if value == "" or value == self._RAW_MISSING_MARKER:
            return self.missing_sentinel, False, False

        if ";" in value:
            plural_values = self._normalize_plural_values(value)
            if not plural_values:
                # Every part was empty or the missing marker.
                return self.missing_sentinel, True, False
            is_ambiguous = any(
                self._is_ambiguous_token(item) for item in plural_values
            )
            return plural_values, True, is_ambiguous

        return value, False, self._is_ambiguous_token(value)

    def _normalize_plural_values(self, raw_plural_value):
        """Split a ``;``-separated value into trimmed, de-duplicated parts.

        Empty parts and parts equal to the raw missing marker are dropped.
        Repeats inside the same value are removed without being flagged.
        """
        parts = (part.strip() for part in raw_plural_value.split(";"))
        kept = [part for part in parts if part and part != self._RAW_MISSING_MARKER]
        return self._unique_ordered(kept)

    def _merge_values(self, existing_value, new_value):
        """Merge ``new_value`` into ``existing_value`` without mutating either.

        Returns:
            A ``(merged, had_duplicate)`` tuple. ``merged`` is a single item
            when only one distinct value remains, otherwise a list.
            ``had_duplicate`` is true when any new item was already present.
        """
        if self._is_missing(new_value):
            return existing_value, False

        if self._is_missing(existing_value):
            return new_value, False

        existing_values = (
            existing_value if isinstance(existing_value, list) else [existing_value]
        )
        new_values = new_value if isinstance(new_value, list) else [new_value]

        merged = list(existing_values)  # copy so the caller's list is untouched
        had_duplicate = False

        for value in new_values:
            if value in merged:
                had_duplicate = True
                continue
            merged.append(value)

        if len(merged) == 1:
            return merged[0], had_duplicate

        return merged, had_duplicate

    def _is_missing(self, value):
        """Return whether ``value`` equals the configured missing sentinel."""
        return value == self.missing_sentinel

    def _is_ambiguous_token(self, token):
        """Return whether ``token`` contains one of the hedging words."""
        return bool(self._AMBIGUITY_PATTERN.search(token.strip().lower()))

    def _unique_ordered(self, values):
        """Return ``values`` without repeats, keeping first-seen order."""
        return list(dict.fromkeys(values))
11 changes: 10 additions & 1 deletion src/inputs/input.txt
Original file line number Diff line number Diff line change
@@ -1 +1,10 @@
Officer Voldemort here, at an incident reported at 456 Oak Street. Two victims, Mark Smith and Jane Doe. Medical aid rendered for minor lacerations. Handed off to Sheriff's Deputy Alvarez. End of transmission.
UC Vaccine Declination Statement

Name/SID: Sarah Johnson, SID 4527891
Job Title: Research Scientist
Department: Microbiology
Phone Number: 831-555-0142
Email: sjohnson@ucsc.edu
Date: 03/15/2026
Comment on lines +3 to +8

Signature: ________________________
27 changes: 14 additions & 13 deletions src/llm.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import json
import os
import requests
from src.extraction_quality import ExtractionQualityProcessor


class LLM:
Expand All @@ -10,6 +11,7 @@ def __init__(self, transcript_text=None, target_fields=None, json=None):
self._transcript_text = transcript_text # str
self._target_fields = target_fields # List, contains the template field.
self._json = json # dictionary
self._quality = ExtractionQualityProcessor()

def type_check_all(self):
if type(self._transcript_text) is not str:
Expand Down Expand Up @@ -79,6 +81,8 @@ def main_loop(self):
print("----------------------------------")
print("\t[LOG] Resulting JSON created from the input text:")
print(json.dumps(self._json, indent=2))
print("\t[LOG] Extraction quality report:")
print(json.dumps(self._quality.build_report(), indent=2))
print("--------- extracted data ---------")

return self
Expand All @@ -88,22 +92,19 @@ def add_response_to_json(self, field, value):
this method adds the following value under the specified field,
or under a new field if the field doesn't exist, to the json dict
"""
value = value.strip().replace('"', "")
parsed_value = None

if value != "-1":
parsed_value = value

if ";" in value:
parsed_value = self.handle_plural_values(value)

if field in self._json.keys():
self._json[field].append(parsed_value)
else:
self._json[field] = parsed_value
existing_value = self._json.get(field)
normalized_value = self._quality.process(
field=field,
raw_value=value,
existing_value=existing_value,
)
self._json[field] = normalized_value

return

def get_quality_report(self):
    """Return the report built by this instance's quality processor."""
    report = self._quality.build_report()
    return report

def handle_plural_values(self, plural_value):
"""
This method handles plural values.
Expand Down
55 changes: 55 additions & 0 deletions src/test/test_extraction_quality_controls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from src.extraction_quality import ExtractionQualityProcessor, MISSING_VALUE_SENTINEL
from src.llm import LLM


def test_missing_values_use_consistent_sentinel():
    """A "-1" response maps to the shared sentinel and is reported missing."""
    quality = ExtractionQualityProcessor()

    result = quality.process("Phone", "-1")
    summary = quality.build_report()

    assert result == MISSING_VALUE_SENTINEL
    assert summary["missing_sentinel"] == MISSING_VALUE_SENTINEL
    assert summary["missing_fields"] == ["Phone"]


def test_plural_values_are_normalized_and_deduplicated():
    """A ";"-separated response becomes an ordered list without repeats."""
    quality = ExtractionQualityProcessor()

    victims = quality.process("Victims", "Jane Doe; John Doe; Jane Doe")
    flagged_plural = quality.build_report()["plural_normalized_fields"]

    assert victims == ["Jane Doe", "John Doe"]
    assert flagged_plural == ["Victims"]


def test_duplicate_merge_is_deterministic():
    """Merging an identical value keeps one copy and flags the field."""
    quality = ExtractionQualityProcessor()

    stored = quality.process("Officer", "Alvarez")
    result = quality.process("Officer", "Alvarez", existing_value=stored)
    flagged_duplicates = quality.build_report()["duplicate_fields"]

    assert result == "Alvarez"
    assert flagged_duplicates == ["Officer"]


def test_ambiguity_is_flagged_for_review():
    """A value containing the word "or" marks its field as ambiguous."""
    quality = ExtractionQualityProcessor()

    quality.process("Incident Type", "Fire or smoke event")
    flagged_ambiguous = quality.build_report()["ambiguous_fields"]

    assert flagged_ambiguous == ["Incident Type"]


def test_llm_add_response_uses_quality_pipeline():
    """LLM.add_response_to_json routes values through the quality processor."""
    # Pass an explicit dict: the constructor stores ``json`` as given, and
    # add_response_to_json calls ``.get`` on it, which fails on None.
    llm = LLM(json={})

    llm.add_response_to_json("Victims", "Jane Doe; John Doe; Jane Doe")
    llm.add_response_to_json("Victims", "John Doe")
    llm.add_response_to_json("Phone", "-1")

    data = llm.get_data()
    report = llm.get_quality_report()

    assert data["Victims"] == ["Jane Doe", "John Doe"]
    assert data["Phone"] == MISSING_VALUE_SENTINEL
    assert report["plural_normalized_fields"] == ["Victims"]
    # The second "Victims" call repeats a name already stored for the field.
    assert report["duplicate_fields"] == ["Victims"]
    assert report["missing_fields"] == ["Phone"]