Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 14 additions & 17 deletions src/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,18 @@
from json_manager import JsonManager
from input_manager import InputManager
from pdfrw import PdfReader, PdfWriter
from .extraction_quality import ExtractionQualityProcessor



class textToJSON():
def __init__(self, transcript_text, target_fields, json=None):
    """Store the inputs, then validate them and run the extraction loop.

    Args:
        transcript_text: The transcript to extract values from (str).
        target_fields: List containing the template fields.
        json: Optional dictionary to accumulate results into. A fresh
            dictionary is created when omitted.
    """
    # A `{}` default would be evaluated once and shared by every instance
    # built without an explicit dictionary, so default to None instead.
    if json is None:
        json = {}
    self.__transcript_text = transcript_text  # str
    self.__target_fields = target_fields  # List, contains the template field.
    self.__json = json  # dictionary
    # Normalizes extracted values and builds the extraction quality report.
    self.__quality = ExtractionQualityProcessor()
    self.type_check_all()
    self.main_loop()

Expand Down Expand Up @@ -71,6 +75,8 @@ def main_loop(self): #FUTURE -> Refactor this to its own class
print("----------------------------------")
print("\t[LOG] Resulting JSON created from the input text:")
print(json.dumps(self.__json, indent=2))
print("\t[LOG] Extraction quality report:")
print(json.dumps(self.__quality.build_report(), indent=2))
print("--------- extracted data ---------")

return None
Expand All @@ -80,22 +86,13 @@ def add_response_to_json(self, field, value):
this method adds the following value under the specified field,
or under a new field if the field doesn't exist, to the json dict
"""
value = value.strip().replace('"', '')
parsed_value = None
plural = False

if value != "-1":
parsed_value = value

if ";" in value:
parsed_value = self.handle_plural_values(value)
plural = True


if field in self.__json.keys():
self.__json[field].append(parsed_value)
else:
self.__json[field] = parsed_value
existing_value = self.__json.get(field)
normalized_value = self.__quality.process(
field=field,
raw_value=value,
existing_value=existing_value,
)
self.__json[field] = normalized_value

return

Expand Down
113 changes: 113 additions & 0 deletions src/extraction_quality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import re


MISSING_VALUE_SENTINEL = "__MISSING__"


class ExtractionQualityProcessor:
    """Post-processing for extracted field values.

    Guarantees:
    - missing values use one sentinel (MISSING_VALUE_SENTINEL)
    - plural values use a normalized list format
    - duplicates are merged deterministically (order-preserving unique)
    - ambiguous values are flagged for review

    The processor also records, per field name, which of these situations
    occurred; ``build_report`` returns that bookkeeping.
    """

    # Words that mark a value as needing review. Compiled once for reuse;
    # tokens are lower-cased before matching.
    _AMBIGUOUS_TOKEN_RE = re.compile(r"\b(or|maybe|possibly|unclear|unknown)\b")

    def __init__(self, missing_sentinel=MISSING_VALUE_SENTINEL):
        """Create a processor with empty tracking sets.

        Args:
            missing_sentinel: Value used to represent "no data extracted".
        """
        self.missing_sentinel = missing_sentinel
        self.duplicate_fields = set()
        self.ambiguous_fields = set()
        self.plural_normalized_fields = set()
        self.missing_fields = set()

    def process(self, field, raw_value, existing_value=None):
        """Normalize ``raw_value`` and merge it with ``existing_value``.

        Args:
            field: Field name; used as the key in the tracking sets.
            raw_value: Raw extracted value (converted with ``str``), or None.
            existing_value: Value already stored for ``field``, or None when
                nothing has been stored yet.

        Returns:
            The value to store for ``field``: the sentinel, a single string,
            or a list of strings.
        """
        normalized_value, was_plural, is_ambiguous = self._normalize_value(raw_value)

        if was_plural:
            self.plural_normalized_fields.add(field)
        if is_ambiguous:
            self.ambiguous_fields.add(field)

        if existing_value is None:
            final_value = normalized_value
        else:
            final_value, had_duplicate = self._merge_values(
                existing_value, normalized_value
            )
            if had_duplicate:
                self.duplicate_fields.add(field)

        # Track "missing" against the value that is actually returned for
        # storage: a missing extraction must not flag a field that keeps a
        # known value, and a field filled in later must stop being reported.
        if self._is_missing(final_value):
            self.missing_fields.add(field)
        else:
            self.missing_fields.discard(field)

        return final_value

    def build_report(self):
        """Return the tracking state as a dict of sorted field-name lists."""
        return {
            "missing_sentinel": self.missing_sentinel,
            "duplicate_fields": sorted(self.duplicate_fields),
            "ambiguous_fields": sorted(self.ambiguous_fields),
            "plural_normalized_fields": sorted(self.plural_normalized_fields),
            "missing_fields": sorted(self.missing_fields),
        }

    def _normalize_value(self, raw_value):
        """Return ``(normalized_value, was_plural, is_ambiguous)``.

        None, the empty string and "-1" map to the sentinel. A value
        containing ";" is split into a de-duplicated list.
        """
        if raw_value is None:
            return self.missing_sentinel, False, False

        # Drop quotes first, then trim, so whitespace that sat inside the
        # quotes (e.g. '" "' or '"-1 "') cannot defeat the missing check.
        value = str(raw_value).replace('"', "").strip()
        if value in ("", "-1"):
            return self.missing_sentinel, False, False

        if ";" in value:
            items = self._normalize_plural_values(value)
            if not items:
                # Every part was empty or "-1": still plural, but missing.
                return self.missing_sentinel, True, False
            ambiguous = any(self._is_ambiguous_token(item) for item in items)
            return items, True, ambiguous

        return value, False, self._is_ambiguous_token(value)

    def _normalize_plural_values(self, raw_plural_value):
        """Split on ";", trim parts, drop empty/"-1" parts, de-duplicate."""
        parts = (part.strip() for part in raw_plural_value.split(";"))
        return self._unique_ordered(
            [part for part in parts if part and part != "-1"]
        )

    def _merge_values(self, existing_value, new_value):
        """Merge ``new_value`` into ``existing_value``.

        Returns:
            ``(merged, had_duplicate)``. ``merged`` is a single item when
            only one distinct item remains, otherwise a list in first-seen
            order. ``had_duplicate`` is True when any new item was already
            present.
        """
        # Do not replace known values with missing sentinel.
        if self._is_missing(new_value):
            return existing_value, False

        # Missing existing value should be replaced by real data.
        if self._is_missing(existing_value):
            return new_value, False

        existing_items = (
            existing_value if isinstance(existing_value, list) else [existing_value]
        )
        new_items = new_value if isinstance(new_value, list) else [new_value]

        # Copy so the caller's list is never mutated in place.
        merged = list(existing_items)
        had_duplicate = False

        for item in new_items:
            if item in merged:
                had_duplicate = True
                continue
            merged.append(item)

        if len(merged) == 1:
            return merged[0], had_duplicate
        return merged, had_duplicate

    def _is_missing(self, value):
        """Return True when ``value`` equals the configured sentinel."""
        return value == self.missing_sentinel

    def _is_ambiguous_token(self, token):
        """Return True when ``token`` contains one of the review keywords."""
        return bool(self._AMBIGUOUS_TOKEN_RE.search(token.strip().lower()))

    def _unique_ordered(self, values):
        """Return ``values`` without repeats, keeping first-seen order."""
        # dict keys preserve insertion order, giving an ordered de-duplication.
        return list(dict.fromkeys(values))
51 changes: 51 additions & 0 deletions src/test/test_extraction_quality.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from src.extraction_quality import ExtractionQualityProcessor, MISSING_VALUE_SENTINEL


def test_missing_values_use_single_sentinel():
    """A raw "-1" becomes the sentinel and the field is reported missing."""
    quality = ExtractionQualityProcessor()

    result = quality.process("Phone", "-1")

    assert result == MISSING_VALUE_SENTINEL
    summary = quality.build_report()
    assert summary["missing_sentinel"] == MISSING_VALUE_SENTINEL
    assert summary["missing_fields"] == ["Phone"]


def test_plural_values_normalize_to_deduplicated_list():
    """A ";"-separated value becomes an ordered list without repeats."""
    quality = ExtractionQualityProcessor()

    result = quality.process("Victims", "Jane Doe; John Doe; Jane Doe")

    assert result == ["Jane Doe", "John Doe"]
    summary = quality.build_report()
    assert summary["plural_normalized_fields"] == ["Victims"]


def test_duplicate_merge_is_deterministic():
    """Re-processing the same value keeps one copy and flags the field."""
    quality = ExtractionQualityProcessor()

    stored = quality.process("Officer", "Alvarez")
    combined = quality.process("Officer", "Alvarez", existing_value=stored)

    assert combined == "Alvarez"
    summary = quality.build_report()
    assert summary["duplicate_fields"] == ["Officer"]


def test_duplicate_merge_promotes_to_list_when_values_differ():
    """Two different values for one field are merged into an ordered list."""
    quality = ExtractionQualityProcessor()

    stored = quality.process("Officer", "Alvarez")
    combined = quality.process("Officer", "Martinez", existing_value=stored)

    assert combined == ["Alvarez", "Martinez"]


def test_ambiguous_values_are_flagged_for_review():
    """A value containing "or" puts its field in the ambiguous report."""
    quality = ExtractionQualityProcessor()

    quality.process("Incident Type", "Fire or smoke event")

    summary = quality.build_report()
    assert summary["ambiguous_fields"] == ["Incident Type"]