From 7ebd36f47f9294b36e72495418fca6c996aec2ec Mon Sep 17 00:00:00 2001 From: Julius Scheuerer <95489434+JuliusScheuerer@users.noreply.github.com> Date: Wed, 25 Mar 2026 20:37:10 +0100 Subject: [PATCH] Extract hardcoded confidence thresholds to named constants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create constants.py with DEFAULT_SCORE_THRESHOLD (0.35), TIER_HIGH_THRESHOLD (0.7), TIER_MEDIUM_THRESHOLD (0.5), RECOGNIZER_BASE_SCORE_HIGH (0.5), and RECOGNIZER_BASE_SCORE_LOW (0.3). Replace all 20+ magic number occurrences across 11 files with these named constants. No logic changes — only literal replacements. --- src/document_anonymizer/api/schemas.py | 9 +++---- src/document_anonymizer/constants.py | 24 +++++++++++++++++++ .../detection/recognizers/german_date.py | 4 +++- .../recognizers/german_handelsreg.py | 8 ++++++- .../detection/recognizers/german_iban.py | 4 +++- .../detection/recognizers/german_id_card.py | 4 +++- .../detection/recognizers/german_phone.py | 11 +++++++-- .../detection/recognizers/german_tax.py | 8 +++++-- .../document/pdf_handler.py | 7 +++--- src/document_anonymizer/document/processor.py | 5 ++-- .../document/text_handler.py | 5 ++-- src/document_anonymizer/web/routes.py | 15 ++++++++---- 12 files changed, 80 insertions(+), 24 deletions(-) create mode 100644 src/document_anonymizer/constants.py diff --git a/src/document_anonymizer/api/schemas.py b/src/document_anonymizer/api/schemas.py index 360092e..b5887a4 100644 --- a/src/document_anonymizer/api/schemas.py +++ b/src/document_anonymizer/api/schemas.py @@ -3,6 +3,7 @@ from pydantic import BaseModel, Field from document_anonymizer.anonymization.strategies import AnonymizationStrategy +from document_anonymizer.constants import DEFAULT_SCORE_THRESHOLD _EXAMPLE_TEXT = ( "Herr Max Mustermann, geboren am 15.03.1985, " @@ -26,10 +27,10 @@ class DetectionRequest(BaseModel): examples=["de"], ) score_threshold: float = Field( - default=0.35, + default=DEFAULT_SCORE_THRESHOLD, ge=0.0, le=1.0, - examples=[0.35], + examples=[DEFAULT_SCORE_THRESHOLD], ) @@ -74,10 +75,10 @@ class AnonymizeRequest(BaseModel): examples=[{"PERSON": "fake", "DE_IBAN": "mask"}], ) score_threshold: float = Field( - default=0.35, + default=DEFAULT_SCORE_THRESHOLD, ge=0.0, le=1.0, - examples=[0.35], + examples=[DEFAULT_SCORE_THRESHOLD], ) diff --git a/src/document_anonymizer/constants.py b/src/document_anonymizer/constants.py new file mode 100644 index 0000000..424a7e4 --- /dev/null +++ b/src/document_anonymizer/constants.py @@ -0,0 +1,24 @@ +"""Shared constants for confidence scoring and detection thresholds.""" + +__all__ = [ + "DEFAULT_SCORE_THRESHOLD", + "RECOGNIZER_BASE_SCORE_HIGH", + "RECOGNIZER_BASE_SCORE_LOW", + "TIER_HIGH_THRESHOLD", + "TIER_MEDIUM_THRESHOLD", +] + +# Minimum confidence score for a detection to be included in results. +# Lower values catch more entities but increase false positives. +DEFAULT_SCORE_THRESHOLD = 0.35 + +# Tier boundaries for the entity review panel. +# High-confidence entities (>= TIER_HIGH_THRESHOLD) are shown expanded by default. +TIER_HIGH_THRESHOLD = 0.7 +TIER_MEDIUM_THRESHOLD = 0.5 + +# Base confidence scores for pattern-based recognizers. +# Higher base = more confident the pattern alone is a real match. +# Context words boost these scores at runtime. +RECOGNIZER_BASE_SCORE_HIGH = 0.5 +RECOGNIZER_BASE_SCORE_LOW = 0.3 diff --git a/src/document_anonymizer/detection/recognizers/german_date.py b/src/document_anonymizer/detection/recognizers/german_date.py index acf6308..4aef64c 100644 --- a/src/document_anonymizer/detection/recognizers/german_date.py +++ b/src/document_anonymizer/detection/recognizers/german_date.py @@ -8,6 +8,8 @@ from presidio_analyzer import Pattern, PatternRecognizer +from document_anonymizer.constants import RECOGNIZER_BASE_SCORE_LOW + # DD.MM.YYYY — standard German date format _DATE_PATTERN = r"\b(?:0[1-9]|[12]\d|3[01])\.(?:0[1-9]|1[0-2])\.\d{4}\b" @@ -37,7 +39,7 @@ class GermanDateRecognizer(PatternRecognizer): def __init__(self) -> None: patterns = [ - Pattern("german_date_full", _DATE_PATTERN, 0.3), + Pattern("german_date_full", _DATE_PATTERN, RECOGNIZER_BASE_SCORE_LOW), Pattern("german_date_short", _DATE_SHORT_PATTERN, 0.2), ] super().__init__( diff --git a/src/document_anonymizer/detection/recognizers/german_handelsreg.py b/src/document_anonymizer/detection/recognizers/german_handelsreg.py index b266d24..e372262 100644 --- a/src/document_anonymizer/detection/recognizers/german_handelsreg.py +++ b/src/document_anonymizer/detection/recognizers/german_handelsreg.py @@ -8,6 +8,8 @@ from presidio_analyzer import Pattern, PatternRecognizer +from document_anonymizer.constants import RECOGNIZER_BASE_SCORE_HIGH + _HANDELSREG_PATTERN = r"\bHR[AB]\s?\d{3,6}\s?[A-Z]?\b" _CONTEXT_WORDS = [ @@ -30,7 +32,11 @@ class GermanHandelsregisterRecognizer(PatternRecognizer): def __init__(self) -> None: patterns = [ - Pattern("german_handelsregister", _HANDELSREG_PATTERN, 0.5), + Pattern( + "german_handelsregister", + _HANDELSREG_PATTERN, + RECOGNIZER_BASE_SCORE_HIGH, + ), ] super().__init__( supported_entity="DE_HANDELSREGISTER", diff --git a/src/document_anonymizer/detection/recognizers/german_iban.py b/src/document_anonymizer/detection/recognizers/german_iban.py index 742122d..3c96355 100644 --- a/src/document_anonymizer/detection/recognizers/german_iban.py +++ b/src/document_anonymizer/detection/recognizers/german_iban.py @@ -4,6 +4,8 @@ from presidio_analyzer import Pattern, PatternRecognizer +from document_anonymizer.constants import RECOGNIZER_BASE_SCORE_HIGH + # DE + 2 check digits + 18 digits (bank code + account number) # Allows optional spaces every 4 characters _IBAN_PATTERN = r"\bDE\d{2}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{4}\s?\d{2}\b" @@ -42,7 +44,7 @@ class GermanIbanRecognizer(PatternRecognizer): """Detects German IBANs with context boosting and checksum validation.""" ENTITIES: ClassVar[list[str]] = ["DE_IBAN"] - DEFAULT_SCORE = 0.5 + DEFAULT_SCORE = RECOGNIZER_BASE_SCORE_HIGH def __init__(self) -> None: patterns = [ diff --git a/src/document_anonymizer/detection/recognizers/german_id_card.py b/src/document_anonymizer/detection/recognizers/german_id_card.py index 2076d45..890817b 100644 --- a/src/document_anonymizer/detection/recognizers/german_id_card.py +++ b/src/document_anonymizer/detection/recognizers/german_id_card.py @@ -13,6 +13,8 @@ from presidio_analyzer import Pattern, PatternRecognizer +from document_anonymizer.constants import RECOGNIZER_BASE_SCORE_LOW + # Restricted alphanumeric: letters that cannot be confused with digits _VALID_LETTERS = "CFGHJKLMNPRTVWXYZ" @@ -63,7 +65,7 @@ class GermanIdCardRecognizer(PatternRecognizer): def __init__(self) -> None: patterns = [ - Pattern("german_id_card", _ID_PATTERN, 0.3), + Pattern("german_id_card", _ID_PATTERN, RECOGNIZER_BASE_SCORE_LOW), ] super().__init__( supported_entity="DE_ID_CARD", diff --git a/src/document_anonymizer/detection/recognizers/german_phone.py b/src/document_anonymizer/detection/recognizers/german_phone.py index ad9b1cd..bec9ed4 100644 --- a/src/document_anonymizer/detection/recognizers/german_phone.py +++ b/src/document_anonymizer/detection/recognizers/german_phone.py @@ -4,6 +4,11 @@ from presidio_analyzer import Pattern, PatternRecognizer +from document_anonymizer.constants import ( + RECOGNIZER_BASE_SCORE_HIGH, + RECOGNIZER_BASE_SCORE_LOW, +) + # International format: +49 followed by area code and number _INTL_PATTERN = r"\+49\s?\(?\d{2,4}\)?\s?\d{3,8}(?:[\s-]?\d{1,5})?\b" @@ -37,8 +42,10 @@ class GermanPhoneRecognizer(PatternRecognizer): def __init__(self) -> None: patterns = [ - Pattern("german_phone_intl", _INTL_PATTERN, 0.5), - Pattern("german_phone_domestic", _DOMESTIC_PATTERN, 0.3), + Pattern("german_phone_intl", _INTL_PATTERN, RECOGNIZER_BASE_SCORE_HIGH), + Pattern( + "german_phone_domestic", _DOMESTIC_PATTERN, RECOGNIZER_BASE_SCORE_LOW + ), Pattern("german_phone_mobile", _MOBILE_PATTERN, 0.6), ] super().__init__( diff --git a/src/document_anonymizer/detection/recognizers/german_tax.py b/src/document_anonymizer/detection/recognizers/german_tax.py index 3c8c53d..e91b35b 100644 --- a/src/document_anonymizer/detection/recognizers/german_tax.py +++ b/src/document_anonymizer/detection/recognizers/german_tax.py @@ -4,6 +4,8 @@ from presidio_analyzer import Pattern, PatternRecognizer +from document_anonymizer.constants import RECOGNIZER_BASE_SCORE_LOW + # Steuer-ID: exactly 11 digits, no leading zero _STEUER_ID_PATTERN = r"\b[1-9]\d{10}\b" @@ -67,12 +69,14 @@ def __init__(self) -> None: Pattern( "steuer_id", _STEUER_ID_PATTERN, - 0.3, # Low base — 11 digits are common; context boosts it + # Low base — 11 digits are common; context boosts it + RECOGNIZER_BASE_SCORE_LOW, ), Pattern( "steuernummer", _STEUERNUMMER_PATTERN, - 0.3, # Below default threshold; requires context words + # Below default threshold; requires context words + RECOGNIZER_BASE_SCORE_LOW, ), ] super().__init__( diff --git a/src/document_anonymizer/document/pdf_handler.py b/src/document_anonymizer/document/pdf_handler.py index c5f6608..7e3d373 100644 --- a/src/document_anonymizer/document/pdf_handler.py +++ b/src/document_anonymizer/document/pdf_handler.py @@ -15,6 +15,7 @@ from document_anonymizer.anonymization.engine import anonymize_text from document_anonymizer.anonymization.strategies import AnonymizationStrategy +from document_anonymizer.constants import DEFAULT_SCORE_THRESHOLD from document_anonymizer.document.text_handler import detect_pii_in_text if TYPE_CHECKING: @@ -82,7 +83,7 @@ def detect_pii_in_pdf( analyzer: AnalyzerEngine, pdf_bytes: bytes, language: str = "de", - score_threshold: float = 0.35, + score_threshold: float = DEFAULT_SCORE_THRESHOLD, ) -> list[PdfDetection]: """Detect PII in a PDF with page and position metadata. @@ -136,7 +137,7 @@ def redact_pdf( analyzer: AnalyzerEngine, pdf_bytes: bytes, language: str = "de", - score_threshold: float = 0.35, + score_threshold: float = DEFAULT_SCORE_THRESHOLD, ) -> tuple[bytes, list[PdfDetection]]: """Physically redact PII from a PDF document. @@ -284,7 +285,7 @@ def anonymize_pdf_text( pdf_bytes: bytes, strategy: AnonymizationStrategy = AnonymizationStrategy.REPLACE, language: str = "de", - score_threshold: float = 0.35, + score_threshold: float = DEFAULT_SCORE_THRESHOLD, ) -> tuple[str, list[RecognizerResult]]: """Extract text from PDF, detect and anonymize PII. diff --git a/src/document_anonymizer/document/processor.py b/src/document_anonymizer/document/processor.py index de65f35..a204ed0 100644 --- a/src/document_anonymizer/document/processor.py +++ b/src/document_anonymizer/document/processor.py @@ -4,6 +4,7 @@ from presidio_anonymizer import AnonymizerEngine from document_anonymizer.anonymization.strategies import AnonymizationStrategy +from document_anonymizer.constants import DEFAULT_SCORE_THRESHOLD from document_anonymizer.document.pdf_handler import PdfDetection, redact_pdf from document_anonymizer.document.text_handler import anonymize_plain_text @@ -15,7 +16,7 @@ def process_text( strategy: AnonymizationStrategy = AnonymizationStrategy.REPLACE, entity_strategies: dict[str, AnonymizationStrategy] | None = None, language: str = "de", - score_threshold: float = 0.35, + score_threshold: float = DEFAULT_SCORE_THRESHOLD, ) -> tuple[str, list[RecognizerResult]]: """Process plain text: detect and anonymize PII.""" return anonymize_plain_text( @@ -33,7 +34,7 @@ def process_pdf( analyzer: AnalyzerEngine, pdf_bytes: bytes, language: str = "de", - score_threshold: float = 0.35, + score_threshold: float = DEFAULT_SCORE_THRESHOLD, ) -> tuple[bytes, list[PdfDetection]]: """Process PDF: detect PII and apply physical redaction.""" return redact_pdf( diff --git a/src/document_anonymizer/document/text_handler.py b/src/document_anonymizer/document/text_handler.py index b3fa6ac..d00de5a 100644 --- a/src/document_anonymizer/document/text_handler.py +++ b/src/document_anonymizer/document/text_handler.py @@ -5,6 +5,7 @@ from document_anonymizer.anonymization.engine import anonymize_text from document_anonymizer.anonymization.strategies import AnonymizationStrategy +from document_anonymizer.constants import DEFAULT_SCORE_THRESHOLD def _deduplicate_overlapping( @@ -39,7 +40,7 @@ def detect_pii_in_text( engine: AnalyzerEngine, text: str, language: str = "de", - score_threshold: float = 0.35, + score_threshold: float = DEFAULT_SCORE_THRESHOLD, ) -> list[RecognizerResult]: """Detect PII entities in plain text. @@ -64,7 +65,7 @@ def anonymize_plain_text( strategy: AnonymizationStrategy = AnonymizationStrategy.REPLACE, entity_strategies: dict[str, AnonymizationStrategy] | None = None, language: str = "de", - score_threshold: float = 0.35, + score_threshold: float = DEFAULT_SCORE_THRESHOLD, ) -> tuple[str, list[RecognizerResult]]: """Detect and anonymize PII in plain text. diff --git a/src/document_anonymizer/web/routes.py b/src/document_anonymizer/web/routes.py index 76a4483..b576624 100644 --- a/src/document_anonymizer/web/routes.py +++ b/src/document_anonymizer/web/routes.py @@ -19,6 +19,11 @@ from document_anonymizer.anonymization.engine import anonymize_text from document_anonymizer.anonymization.strategies import AnonymizationStrategy from document_anonymizer.api.dependencies import get_analyzer, get_anonymizer +from document_anonymizer.constants import ( + DEFAULT_SCORE_THRESHOLD, + TIER_HIGH_THRESHOLD, + TIER_MEDIUM_THRESHOLD, +) from document_anonymizer.document.pdf_handler import ( IncompleteRedactionError, PdfPageLimitExceededError, @@ -130,9 +135,9 @@ def _template_response( def _score_to_tier(score: float) -> Tier: """Map a confidence score to a review tier.""" - if score >= 0.7: + if score >= TIER_HIGH_THRESHOLD: return "high" - if score >= 0.5: + if score >= TIER_MEDIUM_THRESHOLD: return "medium" return "low" @@ -363,7 +368,7 @@ def _normalize_line_endings(text: str) -> str: async def detect_form( request: Request, text: Annotated[str, Form(max_length=_MAX_TEXT_LENGTH)] = "", - score_threshold: Annotated[float, Form(ge=0.0, le=1.0)] = 0.35, + score_threshold: Annotated[float, Form(ge=0.0, le=1.0)] = DEFAULT_SCORE_THRESHOLD, file: UploadFile | None = File(default=None), # noqa: B008 analyzer: AnalyzerEngine = Depends(get_analyzer), # noqa: B008 ) -> HTMLResponse: @@ -489,7 +494,7 @@ async def anonymize_form( request: Request, text: Annotated[str, Form(max_length=_MAX_TEXT_LENGTH)] = "", strategy: str = Form(default="replace"), - score_threshold: Annotated[float, Form(ge=0.0, le=1.0)] = 0.35, + score_threshold: Annotated[float, Form(ge=0.0, le=1.0)] = DEFAULT_SCORE_THRESHOLD, is_pdf: bool = Form(default=False), pdf_b64: str = Form(default=""), selected_entities: str = Form(default=""), @@ -603,7 +608,7 @@ async def anonymize_form( async def redact_pdf_form( request: Request, pdf_b64: str = Form(...), - score_threshold: Annotated[float, Form(ge=0.0, le=1.0)] = 0.35, + score_threshold: Annotated[float, Form(ge=0.0, le=1.0)] = DEFAULT_SCORE_THRESHOLD, selected_entities: str = Form(default=""), analyzer: AnalyzerEngine = Depends(get_analyzer), # noqa: B008 ) -> Response: