Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions src/document_anonymizer/api/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from pydantic import BaseModel, Field

from document_anonymizer.anonymization.strategies import AnonymizationStrategy
from document_anonymizer.constants import DEFAULT_SCORE_THRESHOLD

_EXAMPLE_TEXT = (
"Herr Max Mustermann, geboren am 15.03.1985, "
Expand All @@ -26,10 +27,10 @@ class DetectionRequest(BaseModel):
examples=["de"],
)
score_threshold: float = Field(
default=0.35,
default=DEFAULT_SCORE_THRESHOLD,
ge=0.0,
le=1.0,
examples=[0.35],
examples=[DEFAULT_SCORE_THRESHOLD],
)


Expand Down Expand Up @@ -74,10 +75,10 @@ class AnonymizeRequest(BaseModel):
examples=[{"PERSON": "fake", "DE_IBAN": "mask"}],
)
score_threshold: float = Field(
default=0.35,
default=DEFAULT_SCORE_THRESHOLD,
ge=0.0,
le=1.0,
examples=[0.35],
examples=[DEFAULT_SCORE_THRESHOLD],
)


Expand Down
24 changes: 24 additions & 0 deletions src/document_anonymizer/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Shared constants for confidence scoring and detection thresholds."""

from typing import Final

__all__ = [
    "DEFAULT_SCORE_THRESHOLD",
    "RECOGNIZER_BASE_SCORE_HIGH",
    "RECOGNIZER_BASE_SCORE_LOW",
    "TIER_HIGH_THRESHOLD",
    "TIER_MEDIUM_THRESHOLD",
]

# All values below are confidence scores and are declared ``Final`` so that
# type checkers flag any accidental reassignment by importing modules.

# Minimum confidence score for a detection to be included in results.
# Lower values catch more entities but increase false positives.
DEFAULT_SCORE_THRESHOLD: Final[float] = 0.35

# Tier boundaries for the entity review panel.
# High-confidence entities (>= TIER_HIGH_THRESHOLD) are shown expanded by default.
# Scores >= TIER_MEDIUM_THRESHOLD but below TIER_HIGH_THRESHOLD form the medium
# tier; keep TIER_MEDIUM_THRESHOLD < TIER_HIGH_THRESHOLD when retuning.
TIER_HIGH_THRESHOLD: Final[float] = 0.7
TIER_MEDIUM_THRESHOLD: Final[float] = 0.5

# Base confidence scores for pattern-based recognizers.
# Higher base = more confident the pattern alone is a real match.
# Context words boost these scores at runtime.
# NOTE: RECOGNIZER_BASE_SCORE_LOW is deliberately below
# DEFAULT_SCORE_THRESHOLD, so a low-base match is dropped at the default
# threshold unless its score is boosted; RECOGNIZER_BASE_SCORE_HIGH is above
# it, so a high-base match passes on the pattern alone.
RECOGNIZER_BASE_SCORE_HIGH: Final[float] = 0.5
RECOGNIZER_BASE_SCORE_LOW: Final[float] = 0.3
4 changes: 3 additions & 1 deletion src/document_anonymizer/detection/recognizers/german_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

from presidio_analyzer import Pattern, PatternRecognizer

from document_anonymizer.constants import RECOGNIZER_BASE_SCORE_LOW

# DD.MM.YYYY — standard German date format
_DATE_PATTERN = r"\b(?:0[1-9]|[12]\d|3[01])\.(?:0[1-9]|1[0-2])\.\d{4}\b"

Expand Down Expand Up @@ -37,7 +39,7 @@ class GermanDateRecognizer(PatternRecognizer):

def __init__(self) -> None:
patterns = [
Pattern("german_date_full", _DATE_PATTERN, 0.3),
Pattern("german_date_full", _DATE_PATTERN, RECOGNIZER_BASE_SCORE_LOW),
Pattern("german_date_short", _DATE_SHORT_PATTERN, 0.2),
]
super().__init__(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

from presidio_analyzer import Pattern, PatternRecognizer

from document_anonymizer.constants import RECOGNIZER_BASE_SCORE_HIGH

_HANDELSREG_PATTERN = r"\bHR[AB]\s?\d{3,6}\s?[A-Z]?\b"

_CONTEXT_WORDS = [
Expand All @@ -30,7 +32,11 @@ class GermanHandelsregisterRecognizer(PatternRecognizer):

def __init__(self) -> None:
patterns = [
Pattern("german_handelsregister", _HANDELSREG_PATTERN, 0.5),
Pattern(
"german_handelsregister",
_HANDELSREG_PATTERN,
RECOGNIZER_BASE_SCORE_HIGH,
),
]
super().__init__(
supported_entity="DE_HANDELSREGISTER",
Expand Down
4 changes: 3 additions & 1 deletion src/document_anonymizer/detection/recognizers/german_iban.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from presidio_analyzer import Pattern, PatternRecognizer

from document_anonymizer.constants import RECOGNIZER_BASE_SCORE_HIGH

# DE + 2 check digits + 18 digits (bank code + account number)
# Allows optional spaces every 4 characters
_IBAN_PATTERN = r"\bDE\d{2}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{2}\b"
Expand Down Expand Up @@ -42,7 +44,7 @@ class GermanIbanRecognizer(PatternRecognizer):
"""Detects German IBANs with context boosting and checksum validation."""

ENTITIES: ClassVar[list[str]] = ["DE_IBAN"]
DEFAULT_SCORE = 0.5
DEFAULT_SCORE = RECOGNIZER_BASE_SCORE_HIGH

def __init__(self) -> None:
patterns = [
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

from presidio_analyzer import Pattern, PatternRecognizer

from document_anonymizer.constants import RECOGNIZER_BASE_SCORE_LOW

# Restricted alphanumeric: letters that cannot be confused with digits
_VALID_LETTERS = "CFGHJKLMNPRTVWXYZ"

Expand Down Expand Up @@ -63,7 +65,7 @@ class GermanIdCardRecognizer(PatternRecognizer):

def __init__(self) -> None:
patterns = [
Pattern("german_id_card", _ID_PATTERN, 0.3),
Pattern("german_id_card", _ID_PATTERN, RECOGNIZER_BASE_SCORE_LOW),
]
super().__init__(
supported_entity="DE_ID_CARD",
Expand Down
11 changes: 9 additions & 2 deletions src/document_anonymizer/detection/recognizers/german_phone.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,11 @@

from presidio_analyzer import Pattern, PatternRecognizer

from document_anonymizer.constants import (
RECOGNIZER_BASE_SCORE_HIGH,
RECOGNIZER_BASE_SCORE_LOW,
)

# International format: +49 followed by area code and number
_INTL_PATTERN = r"\+49\s?\(?\d{2,4}\)?\s?\d{3,8}(?:[\s-]?\d{1,5})?\b"

Expand Down Expand Up @@ -37,8 +42,10 @@ class GermanPhoneRecognizer(PatternRecognizer):

def __init__(self) -> None:
patterns = [
Pattern("german_phone_intl", _INTL_PATTERN, 0.5),
Pattern("german_phone_domestic", _DOMESTIC_PATTERN, 0.3),
Pattern("german_phone_intl", _INTL_PATTERN, RECOGNIZER_BASE_SCORE_HIGH),
Pattern(
"german_phone_domestic", _DOMESTIC_PATTERN, RECOGNIZER_BASE_SCORE_LOW
),
Pattern("german_phone_mobile", _MOBILE_PATTERN, 0.6),
]
super().__init__(
Expand Down
8 changes: 6 additions & 2 deletions src/document_anonymizer/detection/recognizers/german_tax.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

from presidio_analyzer import Pattern, PatternRecognizer

from document_anonymizer.constants import RECOGNIZER_BASE_SCORE_LOW

# Steuer-ID: exactly 11 digits, no leading zero
_STEUER_ID_PATTERN = r"\b[1-9]\d{10}\b"

Expand Down Expand Up @@ -67,12 +69,14 @@ def __init__(self) -> None:
Pattern(
"steuer_id",
_STEUER_ID_PATTERN,
0.3, # Low base — 11 digits are common; context boosts it
# Low base — 11 digits are common; context boosts it
RECOGNIZER_BASE_SCORE_LOW,
),
Pattern(
"steuernummer",
_STEUERNUMMER_PATTERN,
0.3, # Below default threshold; requires context words
# Below default threshold; requires context words
RECOGNIZER_BASE_SCORE_LOW,
),
]
super().__init__(
Expand Down
7 changes: 4 additions & 3 deletions src/document_anonymizer/document/pdf_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from document_anonymizer.anonymization.engine import anonymize_text
from document_anonymizer.anonymization.strategies import AnonymizationStrategy
from document_anonymizer.constants import DEFAULT_SCORE_THRESHOLD
from document_anonymizer.document.text_handler import detect_pii_in_text

if TYPE_CHECKING:
Expand Down Expand Up @@ -82,7 +83,7 @@ def detect_pii_in_pdf(
analyzer: AnalyzerEngine,
pdf_bytes: bytes,
language: str = "de",
score_threshold: float = 0.35,
score_threshold: float = DEFAULT_SCORE_THRESHOLD,
) -> list[PdfDetection]:
"""Detect PII in a PDF with page and position metadata.

Expand Down Expand Up @@ -136,7 +137,7 @@ def redact_pdf(
analyzer: AnalyzerEngine,
pdf_bytes: bytes,
language: str = "de",
score_threshold: float = 0.35,
score_threshold: float = DEFAULT_SCORE_THRESHOLD,
) -> tuple[bytes, list[PdfDetection]]:
"""Physically redact PII from a PDF document.

Expand Down Expand Up @@ -284,7 +285,7 @@ def anonymize_pdf_text(
pdf_bytes: bytes,
strategy: AnonymizationStrategy = AnonymizationStrategy.REPLACE,
language: str = "de",
score_threshold: float = 0.35,
score_threshold: float = DEFAULT_SCORE_THRESHOLD,
) -> tuple[str, list[RecognizerResult]]:
"""Extract text from PDF, detect and anonymize PII.

Expand Down
5 changes: 3 additions & 2 deletions src/document_anonymizer/document/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from presidio_anonymizer import AnonymizerEngine

from document_anonymizer.anonymization.strategies import AnonymizationStrategy
from document_anonymizer.constants import DEFAULT_SCORE_THRESHOLD
from document_anonymizer.document.pdf_handler import PdfDetection, redact_pdf
from document_anonymizer.document.text_handler import anonymize_plain_text

Expand All @@ -15,7 +16,7 @@ def process_text(
strategy: AnonymizationStrategy = AnonymizationStrategy.REPLACE,
entity_strategies: dict[str, AnonymizationStrategy] | None = None,
language: str = "de",
score_threshold: float = 0.35,
score_threshold: float = DEFAULT_SCORE_THRESHOLD,
) -> tuple[str, list[RecognizerResult]]:
"""Process plain text: detect and anonymize PII."""
return anonymize_plain_text(
Expand All @@ -33,7 +34,7 @@ def process_pdf(
analyzer: AnalyzerEngine,
pdf_bytes: bytes,
language: str = "de",
score_threshold: float = 0.35,
score_threshold: float = DEFAULT_SCORE_THRESHOLD,
) -> tuple[bytes, list[PdfDetection]]:
"""Process PDF: detect PII and apply physical redaction."""
return redact_pdf(
Expand Down
5 changes: 3 additions & 2 deletions src/document_anonymizer/document/text_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from document_anonymizer.anonymization.engine import anonymize_text
from document_anonymizer.anonymization.strategies import AnonymizationStrategy
from document_anonymizer.constants import DEFAULT_SCORE_THRESHOLD


def _deduplicate_overlapping(
Expand Down Expand Up @@ -39,7 +40,7 @@ def detect_pii_in_text(
engine: AnalyzerEngine,
text: str,
language: str = "de",
score_threshold: float = 0.35,
score_threshold: float = DEFAULT_SCORE_THRESHOLD,
) -> list[RecognizerResult]:
"""Detect PII entities in plain text.

Expand All @@ -64,7 +65,7 @@ def anonymize_plain_text(
strategy: AnonymizationStrategy = AnonymizationStrategy.REPLACE,
entity_strategies: dict[str, AnonymizationStrategy] | None = None,
language: str = "de",
score_threshold: float = 0.35,
score_threshold: float = DEFAULT_SCORE_THRESHOLD,
) -> tuple[str, list[RecognizerResult]]:
"""Detect and anonymize PII in plain text.

Expand Down
15 changes: 10 additions & 5 deletions src/document_anonymizer/web/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
from document_anonymizer.anonymization.engine import anonymize_text
from document_anonymizer.anonymization.strategies import AnonymizationStrategy
from document_anonymizer.api.dependencies import get_analyzer, get_anonymizer
from document_anonymizer.constants import (
DEFAULT_SCORE_THRESHOLD,
TIER_HIGH_THRESHOLD,
TIER_MEDIUM_THRESHOLD,
)
from document_anonymizer.document.pdf_handler import (
IncompleteRedactionError,
PdfPageLimitExceededError,
Expand Down Expand Up @@ -130,9 +135,9 @@ def _template_response(

def _score_to_tier(score: float) -> Tier:
"""Map a confidence score to a review tier."""
if score >= 0.7:
if score >= TIER_HIGH_THRESHOLD:
return "high"
if score >= 0.5:
if score >= TIER_MEDIUM_THRESHOLD:
return "medium"
return "low"

Expand Down Expand Up @@ -363,7 +368,7 @@ def _normalize_line_endings(text: str) -> str:
async def detect_form(
request: Request,
text: Annotated[str, Form(max_length=_MAX_TEXT_LENGTH)] = "",
score_threshold: Annotated[float, Form(ge=0.0, le=1.0)] = 0.35,
score_threshold: Annotated[float, Form(ge=0.0, le=1.0)] = DEFAULT_SCORE_THRESHOLD,
file: UploadFile | None = File(default=None), # noqa: B008
analyzer: AnalyzerEngine = Depends(get_analyzer), # noqa: B008
) -> HTMLResponse:
Expand Down Expand Up @@ -489,7 +494,7 @@ async def anonymize_form(
request: Request,
text: Annotated[str, Form(max_length=_MAX_TEXT_LENGTH)] = "",
strategy: str = Form(default="replace"),
score_threshold: Annotated[float, Form(ge=0.0, le=1.0)] = 0.35,
score_threshold: Annotated[float, Form(ge=0.0, le=1.0)] = DEFAULT_SCORE_THRESHOLD,
is_pdf: bool = Form(default=False),
pdf_b64: str = Form(default=""),
selected_entities: str = Form(default=""),
Expand Down Expand Up @@ -603,7 +608,7 @@ async def anonymize_form(
async def redact_pdf_form(
request: Request,
pdf_b64: str = Form(...),
score_threshold: Annotated[float, Form(ge=0.0, le=1.0)] = 0.35,
score_threshold: Annotated[float, Form(ge=0.0, le=1.0)] = DEFAULT_SCORE_THRESHOLD,
selected_entities: str = Form(default=""),
analyzer: AnalyzerEngine = Depends(get_analyzer), # noqa: B008
) -> Response:
Expand Down
Loading