From 5672c86bf39afc701fe71ee91757b1d9a7b9b8cb Mon Sep 17 00:00:00 2001
From: Julius Scheuerer <95489434+JuliusScheuerer@users.noreply.github.com>
Date: Wed, 25 Mar 2026 20:18:09 +0100
Subject: [PATCH] Deduplicate overlapping entity detections

When Presidio's built-in recognizers and custom German recognizers both
match the same text span (e.g., IBAN_CODE + DE_IBAN), the entity
appeared twice in the review panel. Add _deduplicate_overlapping() to
text_handler that keeps the highest-confidence result per character
range, with span length as tiebreaker for equal scores.
---
 .../document/text_handler.py             | 33 ++++++-
 tests/test_document/test_text_handler.py | 89 +++++++++++++++++++
 2 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/src/document_anonymizer/document/text_handler.py b/src/document_anonymizer/document/text_handler.py
index 6cb7c3a..b3fa6ac 100644
--- a/src/document_anonymizer/document/text_handler.py
+++ b/src/document_anonymizer/document/text_handler.py
@@ -7,6 +7,34 @@
 from document_anonymizer.anonymization.strategies import AnonymizationStrategy
 
 
+def _deduplicate_overlapping(
+    results: list[RecognizerResult],
+) -> list[RecognizerResult]:
+    """Remove overlapping entity detections, keeping the highest-confidence one.
+
+    When Presidio's built-in recognizers and custom German recognizers both
+    match the same text span (e.g., IBAN_CODE + DE_IBAN), this keeps only
+    the highest-scoring result for each character range.
+
+    Tiebreaker when scores are equal: longer span wins (more specific match).
+    """
+    if len(results) <= 1:
+        return results
+
+    # Sort by score descending, then span length descending (tiebreaker)
+    sorted_results = sorted(results, key=lambda r: (-r.score, -(r.end - r.start)))
+
+    accepted: list[RecognizerResult] = []
+    for candidate in sorted_results:
+        overlaps = any(
+            candidate.start < a.end and candidate.end > a.start for a in accepted
+        )
+        if not overlaps:
+            accepted.append(candidate)
+
+    return accepted
+
+
 def detect_pii_in_text(
     engine: AnalyzerEngine,
     text: str,
@@ -22,10 +50,11 @@ def detect_pii_in_text(
         score_threshold: Minimum confidence score.
 
     Returns:
-        List of detected PII entities.
+        Deduplicated list of detected PII entities above the score threshold.
     """
     results = engine.analyze(text=text, language=language)
-    return [r for r in results if r.score >= score_threshold]
+    filtered = [r for r in results if r.score >= score_threshold]
+    return _deduplicate_overlapping(filtered)
 
 
 def anonymize_plain_text(
diff --git a/tests/test_document/test_text_handler.py b/tests/test_document/test_text_handler.py
index ff56223..cb22cac 100644
--- a/tests/test_document/test_text_handler.py
+++ b/tests/test_document/test_text_handler.py
@@ -2,8 +2,11 @@
 
 from unittest.mock import MagicMock, patch
 
+from presidio_analyzer import RecognizerResult
+
 from document_anonymizer.anonymization.strategies import AnonymizationStrategy
 from document_anonymizer.document.text_handler import (
+    _deduplicate_overlapping,
     anonymize_plain_text,
     detect_pii_in_text,
 )
@@ -70,3 +73,89 @@ def test_passes_entity_strategies(self) -> None:
             )
             _, kwargs = mock_anon.call_args
             assert kwargs["entity_strategies"] == entity_strats
+
+
+def _make_result(
+    entity_type: str, start: int, end: int, score: float
+) -> RecognizerResult:
+    """Helper to create a RecognizerResult for tests."""
+    return RecognizerResult(entity_type=entity_type, start=start, end=end, score=score)
+
+
+class TestDeduplicateOverlapping:
+    def test_empty_list(self) -> None:
+        assert _deduplicate_overlapping([]) == []
+
+    def test_single_result_unchanged(self) -> None:
+        result = _make_result("PERSON", 0, 10, 0.9)
+        assert _deduplicate_overlapping([result]) == [result]
+
+    def test_exact_overlap_keeps_higher_score(self) -> None:
+        """IBAN_CODE (0.4) and DE_IBAN (0.85) on same span → keep DE_IBAN."""
+        builtin = _make_result("IBAN_CODE", 10, 32, 0.4)
+        custom = _make_result("DE_IBAN", 10, 32, 0.85)
+        results = _deduplicate_overlapping([builtin, custom])
+        assert len(results) == 1
+        assert results[0].entity_type == "DE_IBAN"
+
+    def test_partial_overlap_keeps_higher_score(self) -> None:
+        """Two entities that partially overlap — higher score wins."""
+        a = _make_result("PERSON", 0, 15, 0.7)
+        b = _make_result("LOCATION", 10, 25, 0.9)
+        results = _deduplicate_overlapping([a, b])
+        assert len(results) == 1
+        assert results[0].entity_type == "LOCATION"
+
+    def test_non_overlapping_both_kept(self) -> None:
+        """Two entities at different positions are both kept."""
+        a = _make_result("PERSON", 0, 10, 0.9)
+        b = _make_result("DE_IBAN", 20, 42, 0.85)
+        results = _deduplicate_overlapping([a, b])
+        assert len(results) == 2
+
+    def test_equal_score_longer_span_wins(self) -> None:
+        """Same score on overlapping spans — longer span is preferred."""
+        short = _make_result("PHONE_NUMBER", 5, 15, 0.6)
+        long = _make_result("DE_PHONE", 5, 20, 0.6)
+        results = _deduplicate_overlapping([short, long])
+        assert len(results) == 1
+        assert results[0].entity_type == "DE_PHONE"
+
+    def test_adjacent_not_overlapping(self) -> None:
+        """Entities that touch but don't overlap are both kept."""
+        a = _make_result("PERSON", 0, 10, 0.9)
+        b = _make_result("LOCATION", 10, 20, 0.8)
+        results = _deduplicate_overlapping([a, b])
+        assert len(results) == 2
+
+    def test_three_way_overlap(self) -> None:
+        """Three overlapping entities — only the highest-scoring survives."""
+        a = _make_result("PERSON", 0, 10, 0.5)
+        b = _make_result("LOCATION", 2, 12, 0.7)
+        c = _make_result("ORG", 5, 15, 0.9)
+        results = _deduplicate_overlapping([a, b, c])
+        assert len(results) == 1
+        assert results[0].entity_type == "ORG"
+
+    def test_phone_dedup_de_phone_vs_phone_number(self) -> None:
+        """DE_PHONE vs PHONE_NUMBER on same span — higher score wins."""
+        builtin = _make_result("PHONE_NUMBER", 0, 14, 0.4)
+        custom = _make_result("DE_PHONE", 0, 14, 0.6)
+        results = _deduplicate_overlapping([builtin, custom])
+        assert len(results) == 1
+        assert results[0].entity_type == "DE_PHONE"
+
+    def test_detect_pii_deduplicates(self) -> None:
+        """Integration: detect_pii_in_text returns deduplicated results."""
+        mock_analyzer = MagicMock()
+        overlap_a = MagicMock(score=0.85, entity_type="DE_IBAN", start=0, end=22)
+        overlap_b = MagicMock(score=0.4, entity_type="IBAN_CODE", start=0, end=22)
+        separate = MagicMock(score=0.9, entity_type="PERSON", start=30, end=45)
+        mock_analyzer.analyze.return_value = [overlap_a, overlap_b, separate]
+
+        results = detect_pii_in_text(mock_analyzer, "test text")
+        assert len(results) == 2
+        entity_types = {r.entity_type for r in results}
+        assert "IBAN_CODE" not in entity_types
+        assert "DE_IBAN" in entity_types
+        assert "PERSON" in entity_types