From 879d3e5e5b4ce4a17904aa2e5aec9a885e83d660 Mon Sep 17 00:00:00 2001 From: Julius Scheuerer <95489434+JuliusScheuerer@users.noreply.github.com> Date: Thu, 5 Mar 2026 13:07:30 +0100 Subject: [PATCH 1/2] Add i18n support for web UI (German + English) Introduce a JSON-based translation system with cookie-persisted language switching. All UI strings are now translatable via server-side Jinja2 _() and client-side window.__t() helpers. New files: - i18n module with LRU-cached translation loading and TypeGuard narrowing - Translation files (de.json, en.json) with 52 keys each - Client-side i18n.js helper with global placeholder replacement - Unit tests (19) and integration tests (17) for i18n Key changes: - Language detection: query param > cookie > default (German) - Secure cookie (httponly, secure, samesite=lax) set on explicit switch - XSS prevention: html.escape on all interpolated values, breakout escaping in embedded JSON - TypeGuard is_supported_lang() eliminates type: ignore comments - Existing tests pinned to explicit language via cookies --- src/document_anonymizer/i18n/__init__.py | 88 ++++++++ .../i18n/translations/de.json | 54 +++++ .../i18n/translations/en.json | 54 +++++ src/document_anonymizer/web/routes.py | 206 +++++++++++++----- src/document_anonymizer/web/static/js/app.js | 18 +- src/document_anonymizer/web/static/js/i18n.js | 31 +++ .../web/static/js/review.js | 26 ++- .../web/templates/anonymized.html | 14 +- .../web/templates/base.html | 19 +- .../web/templates/index.html | 22 +- .../web/templates/results.html | 24 +- tests/test_i18n/__init__.py | 0 tests/test_i18n/test_translations.py | 108 +++++++++ tests/test_web/test_highlighted_text.py | 15 ++ tests/test_web/test_i18n_integration.py | 109 +++++++++ tests/test_web/test_review_helpers.py | 22 +- tests/test_web/test_review_routes.py | 5 +- tests/test_web/test_routes.py | 3 + 18 files changed, 700 insertions(+), 118 deletions(-) create mode 100644 src/document_anonymizer/i18n/__init__.py create 
mode 100644 src/document_anonymizer/i18n/translations/de.json create mode 100644 src/document_anonymizer/i18n/translations/en.json create mode 100644 src/document_anonymizer/web/static/js/i18n.js create mode 100644 tests/test_i18n/__init__.py create mode 100644 tests/test_i18n/test_translations.py create mode 100644 tests/test_web/test_i18n_integration.py diff --git a/src/document_anonymizer/i18n/__init__.py b/src/document_anonymizer/i18n/__init__.py new file mode 100644 index 0000000..83d7c88 --- /dev/null +++ b/src/document_anonymizer/i18n/__init__.py @@ -0,0 +1,88 @@ +"""Internationalization support for the web UI.""" + +from __future__ import annotations + +import json +from functools import lru_cache +from pathlib import Path +from typing import TYPE_CHECKING, Any, Literal, TypeGuard, get_args + +if TYPE_CHECKING: + from collections.abc import Mapping + +import structlog +from jinja2 import pass_context + +logger = structlog.get_logger(__name__) + +Lang = Literal["de", "en"] +SUPPORTED_LANGUAGES: set[str] = {"de", "en"} +DEFAULT_LANGUAGE: Lang = "de" + +# Ensure Lang type and SUPPORTED_LANGUAGES stay in sync +if set(get_args(Lang)) != SUPPORTED_LANGUAGES: # pragma: no cover + msg = "Lang type and SUPPORTED_LANGUAGES are out of sync" + raise RuntimeError(msg) + +_TRANSLATIONS_DIR = Path(__file__).parent / "translations" + + +def is_supported_lang(value: str) -> TypeGuard[Lang]: + """Check if a string is a supported language code (narrows type to Lang).""" + return value in SUPPORTED_LANGUAGES + + +def _load_translations(lang: str) -> dict[str, str]: + """Normalize unsupported lang codes to default, return cached.""" + if lang not in SUPPORTED_LANGUAGES: + lang = DEFAULT_LANGUAGE + return _load_translations_cached(lang) + + +@lru_cache(maxsize=2) # maxsize matches len(SUPPORTED_LANGUAGES) +def _load_translations_cached(lang: str) -> dict[str, str]: + """Load and cache a translation file.""" + path = _TRANSLATIONS_DIR / f"{lang}.json" + try: + with 
path.open(encoding="utf-8") as f: + data: dict[str, str] = json.load(f) + except (OSError, json.JSONDecodeError, UnicodeDecodeError): + logger.exception("translation_file_load_failed", lang=lang) + return {} + return data + + +def get_translations(lang: str) -> dict[str, str]: + """Get all translations for a language (public API).""" + return _load_translations(lang) + + +def translate( + key: str, lang: Lang = DEFAULT_LANGUAGE, **kwargs: str | int | float +) -> str: + """Look up a translation key and interpolate any kwargs.""" + translations = _load_translations(lang) + template = translations.get(key) + if template is None: + logger.warning("translation_key_missing", key=key, lang=lang) + return key + if kwargs: + try: + return template.format(**kwargs) + except (KeyError, IndexError, ValueError): + logger.warning("translation_format_error", key=key, lang=lang) + return template + return template + + +@pass_context +def jinja_translate( + context: Mapping[str, Any], key: str, **kwargs: str | int | float +) -> str: + """Jinja2 global function: {{ _("key", arg=val) }}. + + The @pass_context decorator injects a jinja2.runtime.Context (Mapping-like). + """ + raw_lang = context.get("lang", DEFAULT_LANGUAGE) + lang = raw_lang if is_supported_lang(raw_lang) else DEFAULT_LANGUAGE + return translate(key, lang=lang, **kwargs) diff --git a/src/document_anonymizer/i18n/translations/de.json b/src/document_anonymizer/i18n/translations/de.json new file mode 100644 index 0000000..205db16 --- /dev/null +++ b/src/document_anonymizer/i18n/translations/de.json @@ -0,0 +1,54 @@ +{ + "brand.name": "Dokument-Anonymisierer", + "brand.subtitle": "Deutsche PII-Erkennung & Schwärzung", + "index.heading": "Dokument anonymisieren", + "index.subtitle": "Text eingeben oder Datei hochladen. 
Alle Daten werden nur im Arbeitsspeicher verarbeitet — keine Persistenz.", + "index.text_label": "Text", + "index.example_btn": "Beispieltext laden", + "index.file_label": "Oder Datei hochladen (TXT, PDF)", + "index.threshold_label": "Konfidenzschwelle", + "index.detect_btn": "PII erkennen", + "index.loading": "Analysiere...", + "index.placeholder": "Geben Sie hier den zu anonymisierenden Text ein...\n\nBeispiel: Herr Max Mustermann, geboren am 15.03.1985, wohnhaft in 10115 Berlin, Musterstraße 42. IBAN: DE89 3704 0044 0532 0130 00. Steuer-ID: 12345679811. Tel: +49 30 12345678.", + "results.heading": "Erkennungsergebnisse", + "results.entities_found": "{count} Entität(en) gefunden", + "results.preview_heading": "Erkannte PII im Text", + "results.review_heading": "Entitäten prüfen und auswählen", + "results.tier_high": "Hohe Konfidenz", + "results.tier_medium": "Mittlere Konfidenz", + "results.tier_low": "Niedrige Konfidenz", + "results.strategy_label": "Strategie:", + "results.anonymize_btn": "Anonymisieren", + "results.redact_btn": "PDF schwärzen", + "results.loading": "Anonymisiere...", + "results.no_entities": "Keine PII-Entitäten erkannt.", + "anonymized.heading": "Ergebnis", + "anonymized.entities_anonymized": "{count} Entität(en) anonymisiert", + "anonymized.strategy": "Strategie: {strategy}", + "anonymized.original_heading": "Original (mit Markierungen)", + "anonymized.result_heading": "Anonymisiert", + "anonymized.copy_btn": "Text kopieren", + "anonymized.download_pdf_btn": "PDF mit Schwärzungen herunterladen", + "noscript.text": "JavaScript ist für die interaktive Oberfläche erforderlich. 
Bitte aktivieren Sie JavaScript oder nutzen Sie die", + "noscript.link": "REST API", + "footer.text": "Zero-Persistence-Architektur · Keine externen API-Aufrufe · Physische PDF-Schwärzung", + "error.no_input": "Bitte Text eingeben oder Datei hochladen.", + "error.unknown_strategy": "Unbekannte Strategie: {strategy}", + "error.detection_failed": "Fehler bei der PII-Erkennung. (Referenz: {request_id})", + "error.anonymization_failed": "Fehler bei der Anonymisierung. (Referenz: {request_id})", + "error.entity_parse_failed": "Entitätsauswahl konnte nicht verarbeitet werden.", + "error.entity_invalid_format": "Entitätsauswahl hat ein ungültiges Format.", + "error.entity_skipped": "{skipped} von {total} ausgewählten Entitäten konnten nicht verarbeitet werden. Bitte erneut versuchen.", + "error.pdf_redaction_failed": "PDF-Schwärzung fehlgeschlagen. (Referenz: {request_id})", + "error.invalid_pdf": "Ungültige PDF-Daten. Bitte laden Sie die Datei erneut hoch.", + "error.incomplete_redaction": "Unvollständige Schwärzung: {unredacted} von {total} erkannten PII-Entitäten konnten im PDF nicht visuell lokalisiert werden. Manuelle Überprüfung empfohlen.", + "common.confidence": "Konfidenz: {score}", + "review.entities_selected": "{count} von {total} Entitäten ausgewählt", + "review.no_selection_warning": "Keine Entitäten ausgewählt. Trotzdem fortfahren?", + "review.entity_load_error": "Fehler beim Laden der Entitätsdaten. 
Bitte führen Sie die Erkennung erneut durch.", + "review.download_error": "Fehler beim Herunterladen der PDF.", + "app.example_loaded": "Geladen!", + "app.example_error": "Fehler!", + "app.copied": "Kopiert!", + "app.copy_failed": "Kopieren fehlgeschlagen" +} diff --git a/src/document_anonymizer/i18n/translations/en.json b/src/document_anonymizer/i18n/translations/en.json new file mode 100644 index 0000000..ffcbced --- /dev/null +++ b/src/document_anonymizer/i18n/translations/en.json @@ -0,0 +1,54 @@ +{ + "brand.name": "Document Anonymizer", + "brand.subtitle": "German PII Detection & Redaction", + "index.heading": "Anonymize document", + "index.subtitle": "Enter text or upload a file. All data is processed in memory only — no persistence.", + "index.text_label": "Text", + "index.example_btn": "Load example text", + "index.file_label": "Or upload file (TXT, PDF)", + "index.threshold_label": "Confidence threshold", + "index.detect_btn": "Detect PII", + "index.loading": "Analyzing...", + "index.placeholder": "Enter text to anonymize here...\n\nExample: Herr Max Mustermann, born on 15.03.1985, residing at 10115 Berlin, Musterstraße 42. IBAN: DE89 3704 0044 0532 0130 00. Tax ID: 12345679811. 
Tel: +49 30 12345678.", + "results.heading": "Detection results", + "results.entities_found": "{count} entity(ies) found", + "results.preview_heading": "Detected PII in text", + "results.review_heading": "Review and select entities", + "results.tier_high": "High confidence", + "results.tier_medium": "Medium confidence", + "results.tier_low": "Low confidence", + "results.strategy_label": "Strategy:", + "results.anonymize_btn": "Anonymize", + "results.redact_btn": "Redact PDF", + "results.loading": "Anonymizing...", + "results.no_entities": "No PII entities detected.", + "anonymized.heading": "Result", + "anonymized.entities_anonymized": "{count} entity(ies) anonymized", + "anonymized.strategy": "Strategy: {strategy}", + "anonymized.original_heading": "Original (with highlights)", + "anonymized.result_heading": "Anonymized", + "anonymized.copy_btn": "Copy text", + "anonymized.download_pdf_btn": "Download redacted PDF", + "noscript.text": "JavaScript is required for the interactive interface. Please enable JavaScript or use the", + "noscript.link": "REST API", + "footer.text": "Zero-persistence architecture · No external API calls · Physical PDF redaction", + "error.no_input": "Please enter text or upload a file.", + "error.unknown_strategy": "Unknown strategy: {strategy}", + "error.detection_failed": "PII detection failed. (Reference: {request_id})", + "error.anonymization_failed": "Anonymization failed. (Reference: {request_id})", + "error.entity_parse_failed": "Entity selection could not be processed.", + "error.entity_invalid_format": "Entity selection has an invalid format.", + "error.entity_skipped": "{skipped} of {total} selected entities could not be processed. Please try again.", + "error.pdf_redaction_failed": "PDF redaction failed. (Reference: {request_id})", + "error.invalid_pdf": "Invalid PDF data. 
Please upload the file again.", + "error.incomplete_redaction": "Incomplete redaction: {unredacted} of {total} detected PII entities could not be visually located in the PDF. Manual review recommended.", + "common.confidence": "Confidence: {score}", + "review.entities_selected": "{count} of {total} entities selected", + "review.no_selection_warning": "No entities selected. Continue anyway?", + "review.entity_load_error": "Error loading entity data. Please run detection again.", + "review.download_error": "Error downloading PDF.", + "app.example_loaded": "Loaded!", + "app.example_error": "Error!", + "app.copied": "Copied!", + "app.copy_failed": "Copy failed" +} diff --git a/src/document_anonymizer/web/routes.py b/src/document_anonymizer/web/routes.py index e60c616..d649690 100644 --- a/src/document_anonymizer/web/routes.py +++ b/src/document_anonymizer/web/routes.py @@ -28,6 +28,14 @@ redact_pdf_with_entities, ) from document_anonymizer.document.text_handler import detect_pii_in_text +from document_anonymizer.i18n import ( + DEFAULT_LANGUAGE, + Lang, + get_translations, + is_supported_lang, + jinja_translate, + translate, +) from document_anonymizer.security.validation import ( FileValidationError, validate_file_content, @@ -55,10 +63,62 @@ class _EntityHighlight(TypedDict): _TEMPLATE_DIR = Path(__file__).parent / "templates" templates = Jinja2Templates(directory=str(_TEMPLATE_DIR)) +templates.env.globals["_"] = jinja_translate web_router = APIRouter(tags=["web"]) +def _get_lang(request: Request) -> Lang: + """Extract language preference: query param > cookie > default.""" + query_lang = request.query_params.get("lang", "") + if is_supported_lang(query_lang): + return query_lang + cookie_lang = request.cookies.get("lang", "") + if is_supported_lang(cookie_lang): + return cookie_lang + return DEFAULT_LANGUAGE + + +def _template_response( + request: Request, + template_name: str, + context: dict[str, object] | None = None, + *, + status_code: int = 200, +) -> 
HTMLResponse: + """Create a TemplateResponse with i18n context. + + Includes translations_json for client-side ``window.__t()``. + Sets a lang cookie when the user switches language via query param. + Escapes `` Tier: """Map a confidence score to a review tier.""" if score >= 0.7: @@ -106,7 +166,9 @@ def _group_entities_by_tier( _MAX_ENTITY_TEXT_LENGTH = 1000 -def _parse_selected_entities_json(json_str: str, context: str) -> list[dict] | None: # type: ignore[type-arg] +def _parse_selected_entities_json( + json_str: str, context: str, lang: Lang = DEFAULT_LANGUAGE +) -> list[dict] | None: # type: ignore[type-arg] """Parse and validate the shared JSON envelope for selected entities. Returns None if json_str is empty (no review panel interaction). @@ -119,19 +181,23 @@ def _parse_selected_entities_json(json_str: str, context: str) -> list[dict] | N raw = json.loads(json_str) except (json.JSONDecodeError, ValueError): logger.warning(f"selected_entities_{context}_invalid_json") - msg = "Entitätsauswahl konnte nicht verarbeitet werden." - raise ValueError(msg) from None + raise ValueError(translate("error.entity_parse_failed", lang=lang)) from None if not isinstance(raw, list) or len(raw) > _MAX_SELECTED_ENTITIES: count = len(raw) if isinstance(raw, list) else 0 logger.warning(f"selected_entities_{context}_bad_format", count=count) - msg = "Entitätsauswahl hat ein ungültiges Format." 
- raise ValueError(msg) + raise ValueError(translate("error.entity_invalid_format", lang=lang)) return raw -def _report_skipped(skipped: int, total: int, accepted: int, context: str) -> None: +def _report_skipped( + skipped: int, + total: int, + accepted: int, + context: str, + lang: Lang = DEFAULT_LANGUAGE, +) -> None: """Log and raise if any items were skipped during entity reconstruction.""" if skipped == 0: return @@ -141,15 +207,13 @@ def _report_skipped(skipped: int, total: int, accepted: int, context: str) -> No total=total, accepted=accepted, ) - msg = ( - f"{skipped} von {total} ausgewählten Entitäten konnten nicht " - f"verarbeitet werden. Bitte erneut versuchen." + raise ValueError( + translate("error.entity_skipped", lang=lang, skipped=skipped, total=total) ) - raise ValueError(msg) def _reconstruct_recognizer_results( - json_str: str, text: str + json_str: str, text: str, lang: Lang = DEFAULT_LANGUAGE ) -> list[RecognizerResult] | None: """Deserialize selected entities JSON back to Presidio RecognizerResult objects. @@ -157,7 +221,7 @@ def _reconstruct_recognizer_results( Raises ValueError if json_str is non-empty but malformed, so callers can distinguish "no selection made" from "selection corrupted". """ - raw = _parse_selected_entities_json(json_str, context="text") + raw = _parse_selected_entities_json(json_str, context="text", lang=lang) if raw is None: return None @@ -210,19 +274,21 @@ def _reconstruct_recognizer_results( ) ) - _report_skipped(skipped, total=len(raw), accepted=len(results), context="text") + _report_skipped( + skipped, total=len(raw), accepted=len(results), context="text", lang=lang + ) return results def _reconstruct_selected_entities_for_pdf( - json_str: str, + json_str: str, lang: Lang = DEFAULT_LANGUAGE ) -> list[RedactionTarget] | None: """Parse selected entities JSON into the format needed by redact_pdf_with_entities. Returns None only if json_str is empty (no review panel interaction). 
Raises ValueError if json_str is non-empty but malformed. """ - raw = _parse_selected_entities_json(json_str, context="pdf") + raw = _parse_selected_entities_json(json_str, context="pdf", lang=lang) if raw is None: return None @@ -242,7 +308,9 @@ def _reconstruct_selected_entities_for_pdf( continue targets.append(RedactionTarget(text=text)) - _report_skipped(skipped, total=len(raw), accepted=len(targets), context="pdf") + _report_skipped( + skipped, total=len(raw), accepted=len(targets), context="pdf", lang=lang + ) return targets @@ -261,7 +329,7 @@ async def _require_htmx_header(request: Request) -> None: async def index(request: Request) -> HTMLResponse: """Main page with upload/paste interface.""" strategies = [s.value for s in AnonymizationStrategy] - return templates.TemplateResponse(request, "index.html", {"strategies": strategies}) + return _template_response(request, "index.html", {"strategies": strategies}) _MAX_TEXT_LENGTH = 100_000 @@ -292,6 +360,7 @@ async def detect_form( ) -> HTMLResponse: """Handle detection form submission, return results fragment.""" start = time.perf_counter() + lang = _get_lang(request) is_pdf = False pdf_b64 = "" @@ -301,20 +370,19 @@ async def detect_form( try: mime_type = validate_file_content(content, filename=file.filename) except FileValidationError as e: - return templates.TemplateResponse( - request, "error_fragment.html", {"error": str(e)} - ) + # error_fragment.html receives pre-translated error strings + return _template_response(request, "error_fragment.html", {"error": str(e)}) if mime_type == "application/pdf": try: validate_pdf_structure(content) text = extract_text_from_pdf(content) except FileValidationError as e: - return templates.TemplateResponse( + return _template_response( request, "error_fragment.html", {"error": str(e)} ) except PdfPageLimitExceededError as e: - return templates.TemplateResponse( + return _template_response( request, "error_fragment.html", {"error": str(e)} ) is_pdf = True @@ -325,10 
+393,10 @@ async def detect_form( text = _normalize_line_endings(text) if not text.strip(): - return templates.TemplateResponse( + return _template_response( request, "error_fragment.html", - {"error": "Bitte Text eingeben oder Datei hochladen."}, + {"error": translate("error.no_input", lang=lang)}, ) try: @@ -355,10 +423,10 @@ async def detect_form( # Replace breakout (XSS). entities_json = json.dumps(entities, ensure_ascii=False).replace(" HTMLResponse: """Handle anonymization form submission.""" text = _normalize_line_endings(text) + lang = _get_lang(request) try: strat = AnonymizationStrategy(strategy) except ValueError: - return templates.TemplateResponse( + return _template_response( request, "error_fragment.html", - {"error": f"Unbekannte Strategie: {html.escape(strategy)}"}, + { + "error": translate( + "error.unknown_strategy", + lang=lang, + strategy=html.escape(strategy), + ) + }, ) try: @@ -418,9 +497,11 @@ async def anonymize_form( # Use pre-selected entities if provided, otherwise re-detect try: - detections = _reconstruct_recognizer_results(selected_entities, text) + detections = _reconstruct_recognizer_results( + selected_entities, text, lang=lang + ) except ValueError as e: - return templates.TemplateResponse( + return _template_response( request, "error_fragment.html", {"error": str(e)}, @@ -445,11 +526,13 @@ async def anonymize_form( ) for idx, r in enumerate(sorted_detections) ] - highlighted_original = _build_highlighted_text(text, entities_for_highlight) + highlighted_original = _build_highlighted_text( + text, entities_for_highlight, lang=lang + ) elapsed_ms = (time.perf_counter() - start) * 1000 - return templates.TemplateResponse( + return _template_response( request, "anonymized.html", { @@ -468,10 +551,14 @@ async def anonymize_form( except Exception: logger.exception("anonymize_form_error") request_id = getattr(request.state, "request_id", "unknown") - return templates.TemplateResponse( + return _template_response( request, 
"error_fragment.html", - {"error": f"Fehler bei der Anonymisierung. (Referenz: {request_id})"}, + { + "error": translate( + "error.anonymization_failed", lang=lang, request_id=request_id + ) + }, ) @@ -484,6 +571,7 @@ async def redact_pdf_form( analyzer: AnalyzerEngine = Depends(get_analyzer), # noqa: B008 ) -> Response: """Handle PDF redaction — returns redacted PDF for download.""" + lang = _get_lang(request) try: pdf_bytes = base64.b64decode(pdf_b64) validate_file_content(pdf_bytes) @@ -491,9 +579,11 @@ async def redact_pdf_form( # Use pre-selected entities if provided try: - selected = _reconstruct_selected_entities_for_pdf(selected_entities) + selected = _reconstruct_selected_entities_for_pdf( + selected_entities, lang=lang + ) except ValueError as e: - return templates.TemplateResponse( + return _template_response( request, "error_fragment.html", {"error": str(e)}, @@ -507,21 +597,21 @@ async def redact_pdf_form( analyzer, pdf_bytes, score_threshold=score_threshold ) except binascii.Error: - return templates.TemplateResponse( + return _template_response( request, "error_fragment.html", - {"error": "Ungültige PDF-Daten. Bitte laden Sie die Datei erneut hoch."}, + {"error": translate("error.invalid_pdf", lang=lang)}, status_code=400, ) except FileValidationError as e: - return templates.TemplateResponse( + return _template_response( request, "error_fragment.html", {"error": str(e)}, status_code=400, ) except PdfPageLimitExceededError as e: - return templates.TemplateResponse( + return _template_response( request, "error_fragment.html", {"error": str(e)}, @@ -533,15 +623,15 @@ async def redact_pdf_form( unredacted=e.unredacted_count, total=e.total_count, ) - return templates.TemplateResponse( + return _template_response( request, "error_fragment.html", { - "error": ( - f"Unvollständige Schwärzung: {e.unredacted_count} von " - f"{e.total_count} erkannten PII-Entitäten konnten im PDF " - f"nicht visuell lokalisiert werden. " - f"Manuelle Überprüfung empfohlen." 
+ "error": translate( + "error.incomplete_redaction", + lang=lang, + unredacted=e.unredacted_count, + total=e.total_count, ), }, status_code=422, @@ -549,10 +639,14 @@ async def redact_pdf_form( except Exception: logger.exception("redact_pdf_error") request_id = getattr(request.state, "request_id", "unknown") - return templates.TemplateResponse( + return _template_response( request, "error_fragment.html", - {"error": f"PDF-Schwärzung fehlgeschlagen. (Referenz: {request_id})"}, + { + "error": translate( + "error.pdf_redaction_failed", lang=lang, request_id=request_id + ) + }, status_code=500, ) @@ -563,12 +657,17 @@ async def redact_pdf_form( ) -def _build_highlighted_text(text: str, entities: list[_EntityHighlight]) -> str: +def _build_highlighted_text( + text: str, + entities: list[_EntityHighlight], + lang: Lang = DEFAULT_LANGUAGE, +) -> str: """Build HTML with color-coded PII highlights. Walks through the text segment by segment, escaping non-entity gaps and entity content individually, then wrapping entities in tags. - Overlapping entities are skipped (the first span by start position wins). + Overlapping entities are skipped (earliest start position wins; for ties, + the longest span wins). 
""" if not entities: return html.escape(text) @@ -599,7 +698,8 @@ def _build_highlighted_text(text: str, entities: list[_EntityHighlight]) -> str: original = text[start:end] safe_type = html.escape(entity_type.lower().replace("_", "-")) css_class = f"entity-{safe_type}" - tooltip = f"{html.escape(entity_type)} (Konfidenz: {score:.0%})" + confidence = translate("common.confidence", lang=lang, score=f"{score:.0%}") + tooltip = f"{html.escape(entity_type)} ({html.escape(confidence)})" parts.append( f' boolean @@ -23,12 +30,12 @@ try { entities = JSON.parse(dataEl.textContent); } catch (e) { + console.error("Failed to parse entities data:", e); var panel = document.getElementById("review-panel"); if (panel) { var errDiv = document.createElement("div"); errDiv.className = "bg-red-50 border border-red-200 rounded-lg p-4"; - errDiv.textContent = - "Fehler beim Laden der Entit\u00e4tsdaten. Bitte f\u00fchren Sie die Erkennung erneut durch."; + errDiv.textContent = __t("review.entity_load_error"); panel.prepend(errDiv); } var anonymizeBtn = document.getElementById("anonymize-btn"); @@ -152,7 +159,7 @@ if (e.detail && e.detail.path === "/anonymize-form") { var count = countSelected(); if (count === 0) { - if (!confirm("Keine Entitäten ausgewählt. Trotzdem fortfahren?")) { + if (!confirm(__t("review.no_selection_warning"))) { e.preventDefault(); } } @@ -246,7 +253,7 @@ var total = entities.length; var counter = document.getElementById("selection-counter"); if (counter) { - counter.textContent = count + " von " + total + " Entitäten ausgewählt"; + counter.textContent = __t("review.entities_selected", {count: count, total: total}); } } @@ -265,8 +272,6 @@ return "low"; } - var DOWNLOAD_ERROR = "Fehler beim Herunterladen der PDF."; - /** * Extract plain text from an HTML error fragment returned by the server. * Uses DOMParser to safely parse without injecting into the live document. 
@@ -274,9 +279,10 @@ function extractTextFromHtml(html) { try { var doc = new DOMParser().parseFromString(html, "text/html"); - return doc.body.textContent.trim() || DOWNLOAD_ERROR; - } catch (_) { - return DOWNLOAD_ERROR; + return doc.body.textContent.trim() || __t("review.download_error"); + } catch (err) { + console.error("Failed to parse error HTML:", err); + return __t("review.download_error"); } } @@ -312,7 +318,7 @@ URL.revokeObjectURL(url); }) .catch(function (err) { - alert(err.message || DOWNLOAD_ERROR); + alert(err.message || __t("review.download_error")); }) .finally(function () { if (btn) btn.disabled = false; diff --git a/src/document_anonymizer/web/templates/anonymized.html b/src/document_anonymizer/web/templates/anonymized.html index 5fd8444..625942d 100644 --- a/src/document_anonymizer/web/templates/anonymized.html +++ b/src/document_anonymizer/web/templates/anonymized.html @@ -1,9 +1,9 @@
-

Ergebnis

+

{{ _("anonymized.heading") }}

- {{ entities_found }} Entität{{ "en" if entities_found != 1 else "" }} anonymisiert - · Strategie: {{ strategy }} + {{ _("anonymized.entities_anonymized", count=entities_found) }} + · {{ _("anonymized.strategy", strategy=strategy) }} · {{ processing_time_ms }} ms
@@ -11,11 +11,11 @@

Ergebnis

-

Original (mit Markierungen)

+

{{ _("anonymized.original_heading") }}

{{ highlighted_original | safe }}
-

Anonymisiert

+

{{ _("anonymized.result_heading") }}

{{ anonymized_text }}
@@ -25,7 +25,7 @@

Anonymisiert

id="copy-btn" class="btn btn-secondary text-sm" > - Text kopieren + {{ _("anonymized.copy_btn") }} {% if is_pdf and pdf_b64 %}
@@ -33,7 +33,7 @@

Anonymisiert

{% endif %} diff --git a/src/document_anonymizer/web/templates/base.html b/src/document_anonymizer/web/templates/base.html index 4b2e9bd..08bf7a2 100644 --- a/src/document_anonymizer/web/templates/base.html +++ b/src/document_anonymizer/web/templates/base.html @@ -1,5 +1,5 @@ - + @@ -10,8 +10,10 @@ - {% block title %}Document Anonymizer{% endblock %} + {% block title %}{{ _("brand.name") }}{% endblock %} + + @@ -24,16 +26,21 @@
- Document Anonymizer + {{ _("brand.name") }} - German PII Detection & Redaction +
+ {{ _("brand.subtitle") }} + | + DE + EN +
@@ -44,7 +51,7 @@
diff --git a/src/document_anonymizer/web/templates/index.html b/src/document_anonymizer/web/templates/index.html index 9a45847..1455951 100644 --- a/src/document_anonymizer/web/templates/index.html +++ b/src/document_anonymizer/web/templates/index.html @@ -1,40 +1,38 @@ {% extends "base.html" %} -{% block title %}Document Anonymizer{% endblock %} +{% block title %}{{ _("brand.name") }}{% endblock %} {% block content %}
-

Dokument anonymisieren

-

Text eingeben oder Datei hochladen. Alle Daten werden nur im Arbeitsspeicher verarbeitet — keine Persistenz.

+

{{ _("index.heading") }}

+

{{ _("index.subtitle") }}

- - Beispieltext laden + + {{ _("index.example_btn") }}
- +
Dokument anonymisieren
-

Analysiere...

+

{{ _("index.loading") }}

diff --git a/src/document_anonymizer/web/templates/results.html b/src/document_anonymizer/web/templates/results.html index 4f57c85..fcac789 100644 --- a/src/document_anonymizer/web/templates/results.html +++ b/src/document_anonymizer/web/templates/results.html @@ -2,9 +2,9 @@
-

Erkennungsergebnisse

+

{{ _("results.heading") }}

- {{ entity_count }} Entität{{ "en" if entity_count != 1 else "" }} gefunden + {{ _("results.entities_found", count=entity_count) }} · {{ processing_time_ms }} ms
@@ -12,18 +12,18 @@

Erkennungsergebnisse

{% if entities %}
-

Erkannte PII im Text

+

{{ _("results.preview_heading") }}

{{ highlighted_text | safe }}
-

Entitäten prüfen und auswählen

+

{{ _("results.review_heading") }}

{% set tier_config = [ - ("high", "Hohe Konfidenz", "tier-count-badge--high", "≥ 70%"), - ("medium", "Mittlere Konfidenz", "tier-count-badge--medium", "50–69%"), - ("low", "Niedrige Konfidenz", "tier-count-badge--low", "35–49%") + ("high", _("results.tier_high"), "tier-count-badge--high", "≥ 70%"), + ("medium", _("results.tier_medium"), "tier-count-badge--medium", "50–69%"), + ("low", _("results.tier_low"), "tier-count-badge--low", "35–49%") ] %} {% for tier_key, tier_label, badge_class, tier_range in tier_config %} @@ -75,7 +75,7 @@

Entitäten prüfen und auswählen

- + {% if is_pdf and pdf_b64 %} @@ -101,7 +101,7 @@

Entitäten prüfen und auswählen

{% endif %} @@ -109,13 +109,13 @@

Entitäten prüfen und auswählen

-

Anonymisiere...

+

{{ _("results.loading") }}

{% else %} -

Keine PII-Entitäten erkannt.

+

{{ _("results.no_entities") }}

{% endif %}
diff --git a/tests/test_i18n/__init__.py b/tests/test_i18n/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_i18n/test_translations.py b/tests/test_i18n/test_translations.py new file mode 100644 index 0000000..95ffd80 --- /dev/null +++ b/tests/test_i18n/test_translations.py @@ -0,0 +1,108 @@ +"""Tests for the i18n translation module.""" + +from typing import get_args +from unittest.mock import patch + +from document_anonymizer.i18n import ( + DEFAULT_LANGUAGE, + SUPPORTED_LANGUAGES, + Lang, + get_translations, + jinja_translate, + translate, +) + + +class TestTranslate: + def test_returns_german_string(self) -> None: + result = translate("brand.name", lang="de") + assert result == "Dokument-Anonymisierer" + + def test_returns_english_string(self) -> None: + result = translate("brand.name", lang="en") + assert result == "Document Anonymizer" + + def test_missing_key_returns_key(self) -> None: + result = translate("nonexistent.key", lang="de") + assert result == "nonexistent.key" + + def test_missing_key_logs_warning(self) -> None: + with patch("document_anonymizer.i18n.logger") as mock_logger: + translate("nonexistent.key", lang="de") + mock_logger.warning.assert_called_once_with( + "translation_key_missing", key="nonexistent.key", lang="de" + ) + + def test_format_interpolation(self) -> None: + result = translate("results.entities_found", lang="de", count=5) + assert "5" in result + + def test_format_error_returns_template(self) -> None: + """Missing format arg should return the template without interpolation.""" + result = translate("results.entities_found", lang="de", wrong_key="x") + assert "{count}" in result + + def test_format_error_logs_warning(self) -> None: + with patch("document_anonymizer.i18n.logger") as mock_logger: + translate("results.entities_found", lang="de", wrong_key="x") + mock_logger.warning.assert_called_with( + "translation_format_error", key="results.entities_found", lang="de" + ) + + def 
test_unsupported_lang_falls_back_to_default(self) -> None: + result = translate("brand.name", lang="fr") # type: ignore[arg-type] + assert result == translate("brand.name", lang=DEFAULT_LANGUAGE) + + def test_default_lang_is_german(self) -> None: + assert DEFAULT_LANGUAGE == "de" + result = translate("brand.name") + assert result == "Dokument-Anonymisierer" + + +class TestGetTranslations: + def test_loads_german(self) -> None: + data = get_translations("de") + assert "brand.name" in data + + def test_loads_english(self) -> None: + data = get_translations("en") + assert "brand.name" in data + + def test_unsupported_lang_falls_back(self) -> None: + data = get_translations("fr") + assert data == get_translations(DEFAULT_LANGUAGE) + + +class TestJinjaTranslate: + def test_german_context(self) -> None: + result = jinja_translate({"lang": "de"}, "brand.name") + assert result == "Dokument-Anonymisierer" + + def test_english_context(self) -> None: + result = jinja_translate({"lang": "en"}, "brand.name") + assert result == "Document Anonymizer" + + def test_unsupported_lang_falls_back(self) -> None: + result = jinja_translate({"lang": "fr"}, "brand.name") + assert result == translate("brand.name", lang=DEFAULT_LANGUAGE) + + def test_missing_lang_uses_default(self) -> None: + result = jinja_translate({}, "brand.name") + assert result == translate("brand.name", lang=DEFAULT_LANGUAGE) + + +class TestTranslationFiles: + def test_de_and_en_have_same_keys(self) -> None: + de = get_translations("de") + en = get_translations("en") + assert set(de.keys()) == set(en.keys()), ( + f"Key mismatch: de_only={set(de) - set(en)}, en_only={set(en) - set(de)}" + ) + + def test_supported_languages_matches_lang_type(self) -> None: + assert set(get_args(Lang)) == SUPPORTED_LANGUAGES + + def test_all_supported_languages_load(self) -> None: + for lang in SUPPORTED_LANGUAGES: + data = get_translations(lang) + assert len(data) > 0, f"Empty translations for {lang}" diff --git 
a/tests/test_web/test_highlighted_text.py b/tests/test_web/test_highlighted_text.py index 8361928..716906f 100644 --- a/tests/test_web/test_highlighted_text.py +++ b/tests/test_web/test_highlighted_text.py @@ -74,6 +74,21 @@ def test_overlapping_entities_first_wins(self) -> None: assert result.count("entity-highlight") == 1 assert "Max Mustermann" in result + def test_english_tooltip(self) -> None: + """Verify lang='en' produces English tooltip text.""" + text = "Max Mustermann" + entities = [_ent("PERSON", 0, 14, 0.85, "Max Mustermann")] + result = _build_highlighted_text(text, entities, lang="en") # type: ignore[arg-type] + assert "Confidence:" in result + assert "Konfidenz:" not in result + + def test_german_tooltip_default(self) -> None: + """Default lang produces German tooltip text.""" + text = "Max Mustermann" + entities = [_ent("PERSON", 0, 14, 0.85, "Max Mustermann")] + result = _build_highlighted_text(text, entities) # type: ignore[arg-type] + assert "Konfidenz:" in result + def test_adjacent_entities_not_merged(self) -> None: """Adjacent (non-overlapping) entities should each get their own highlight.""" text = "AB" diff --git a/tests/test_web/test_i18n_integration.py b/tests/test_web/test_i18n_integration.py new file mode 100644 index 0000000..dda22a5 --- /dev/null +++ b/tests/test_web/test_i18n_integration.py @@ -0,0 +1,109 @@ +"""Integration tests for i18n in web routes.""" + +from fastapi.testclient import TestClient + +from document_anonymizer.api.app import app + +_HTMX_HEADERS = {"HX-Request": "true"} +client = TestClient(app) + + +class TestLanguageSwitching: + def test_default_language_is_german(self) -> None: + r = client.get("/") + assert r.status_code == 200 + assert "Dokument anonymisieren" in r.text + + def test_english_via_query_param(self) -> None: + r = client.get("/?lang=en") + assert r.status_code == 200 + assert "Anonymize document" in r.text + + def test_german_via_query_param(self) -> None: + r = client.get("/?lang=de") + assert 
r.status_code == 200 + assert "Dokument anonymisieren" in r.text + + def test_unsupported_lang_falls_back_to_default(self) -> None: + r = client.get("/?lang=fr") + assert r.status_code == 200 + assert "Dokument anonymisieren" in r.text + + def test_lang_cookie_set_on_explicit_switch(self) -> None: + r = client.get("/?lang=en") + assert "lang" in r.cookies + assert r.cookies["lang"] == "en" + + def test_unsupported_lang_does_not_set_cookie(self) -> None: + r = client.get("/?lang=fr") + assert "lang" not in r.cookies + + def test_cookie_persists_language(self) -> None: + """After setting lang=en via query param, subsequent requests use English.""" + r = client.get("/", cookies={"lang": "en"}) + assert "Anonymize document" in r.text + + def test_query_param_overrides_cookie(self) -> None: + """Query param ?lang= should take precedence over cookie.""" + r = client.get("/?lang=en", cookies={"lang": "de"}) + assert "Anonymize document" in r.text + r = client.get("/?lang=de", cookies={"lang": "en"}) + assert "Dokument anonymisieren" in r.text + + +class TestEnglishErrorMessages: + def test_empty_text_error_in_english(self) -> None: + r = client.post( + "/detect", + headers=_HTMX_HEADERS, + data={"text": "", "score_threshold": "0.35"}, + cookies={"lang": "en"}, + ) + assert r.status_code == 200 + assert "Please enter text or upload a file" in r.text + + def test_invalid_strategy_error_in_english(self) -> None: + r = client.post( + "/anonymize-form", + headers=_HTMX_HEADERS, + data={ + "text": "Max Mustermann", + "strategy": "nonexistent", + "score_threshold": "0.35", + "is_pdf": "false", + "pdf_b64": "", + }, + cookies={"lang": "en"}, + ) + assert r.status_code == 200 + assert "Unknown strategy" in r.text + + +class TestTranslationsJsonSecurity: + def test_translations_json_escapes_script_breakout(self) -> None: + """The translations_json block must escape XSS.""" + r = client.get("/") + assert r.status_code == 200 + # translations JSON is in a ")[0] + assert " None: + r = 
client.get("/") + assert "?lang=en" in r.text + assert "?lang=de" in r.text + + def test_html_lang_attribute_german(self) -> None: + r = client.get("/") + assert 'lang="de"' in r.text + + def test_html_lang_attribute_english(self) -> None: + r = client.get("/?lang=en") + assert 'lang="en"' in r.text diff --git a/tests/test_web/test_review_helpers.py b/tests/test_web/test_review_helpers.py index c4a42c4..c018588 100644 --- a/tests/test_web/test_review_helpers.py +++ b/tests/test_web/test_review_helpers.py @@ -130,14 +130,14 @@ def test_out_of_bounds_raises_valueerror(self) -> None: data = [ {"entity_type": "PERSON", "start": 0, "end": 100, "score": 0.9}, ] - with pytest.raises(ValueError, match="konnten nicht"): + with pytest.raises(ValueError): _reconstruct_recognizer_results(json.dumps(data), "short") def test_negative_start_raises_valueerror(self) -> None: data = [ {"entity_type": "PERSON", "start": -1, "end": 5, "score": 0.9}, ] - with pytest.raises(ValueError, match="konnten nicht"): + with pytest.raises(ValueError): _reconstruct_recognizer_results(json.dumps(data), "hello world") def test_zero_length_entity_raises_valueerror(self) -> None: @@ -145,14 +145,14 @@ def test_zero_length_entity_raises_valueerror(self) -> None: data = [ {"entity_type": "PERSON", "start": 0, "end": 0, "score": 0.9}, ] - with pytest.raises(ValueError, match="konnten nicht"): + with pytest.raises(ValueError): _reconstruct_recognizer_results(json.dumps(data), "hello") def test_missing_fields_raises_valueerror(self) -> None: data = [ {"entity_type": "PERSON"}, # missing start/end/score ] - with pytest.raises(ValueError, match="konnten nicht"): + with pytest.raises(ValueError): _reconstruct_recognizer_results(json.dumps(data), "hello world") def test_non_dict_items_raises_valueerror(self) -> None: @@ -162,7 +162,7 @@ def test_non_dict_items_raises_valueerror(self) -> None: None, {"entity_type": "PERSON", "start": 0, "end": 5, "score": 0.9}, ] - with pytest.raises(ValueError, match="konnten 
nicht"): + with pytest.raises(ValueError): _reconstruct_recognizer_results(json.dumps(data), "hello") def test_multiple_entities(self) -> None: @@ -194,21 +194,21 @@ def test_xss_entity_type_raises_valueerror(self) -> None: "score": 0.9, }, ] - with pytest.raises(ValueError, match="konnten nicht"): + with pytest.raises(ValueError): _reconstruct_recognizer_results(json.dumps(data), "hello") def test_score_above_one_raises_valueerror(self) -> None: data = [ {"entity_type": "PERSON", "start": 0, "end": 5, "score": 1.5}, ] - with pytest.raises(ValueError, match="konnten nicht"): + with pytest.raises(ValueError): _reconstruct_recognizer_results(json.dumps(data), "hello") def test_score_below_zero_raises_valueerror(self) -> None: data = [ {"entity_type": "PERSON", "start": 0, "end": 5, "score": -0.1}, ] - with pytest.raises(ValueError, match="konnten nicht"): + with pytest.raises(ValueError): _reconstruct_recognizer_results(json.dumps(data), "hello") def test_valid_entity_type_formats(self) -> None: @@ -236,7 +236,7 @@ def test_invalid_json_raises_valueerror(self) -> None: def test_missing_text_field_raises_valueerror(self) -> None: data = [{"entity_type": "PERSON"}] # no "text" key - with pytest.raises(ValueError, match="konnten nicht"): + with pytest.raises(ValueError): _reconstruct_selected_entities_for_pdf(json.dumps(data)) def test_exceeding_max_entities_raises_valueerror(self) -> None: @@ -247,13 +247,13 @@ def test_exceeding_max_entities_raises_valueerror(self) -> None: def test_oversized_text_raises_valueerror(self) -> None: """Entity text exceeding 1000 chars should raise ValueError.""" data = [{"text": "x" * 1001}] - with pytest.raises(ValueError, match="konnten nicht"): + with pytest.raises(ValueError): _reconstruct_selected_entities_for_pdf(json.dumps(data)) def test_empty_text_raises_valueerror(self) -> None: """Empty text values should raise ValueError.""" data = [{"text": ""}, {"text": " "}] - with pytest.raises(ValueError, match="konnten nicht"): + with 
pytest.raises(ValueError): _reconstruct_selected_entities_for_pdf(json.dumps(data)) def test_non_string_text_coerced(self) -> None: diff --git a/tests/test_web/test_review_routes.py b/tests/test_web/test_review_routes.py index 2f1fad4..71863ad 100644 --- a/tests/test_web/test_review_routes.py +++ b/tests/test_web/test_review_routes.py @@ -127,6 +127,7 @@ def test_anonymize_malformed_entities_returns_error(self) -> None: "pdf_b64": "", "selected_entities": "{not valid json}", }, + cookies={"lang": "de"}, ) assert r.status_code == 200 assert "konnte nicht verarbeitet werden" in r.text @@ -191,9 +192,9 @@ def test_anonymize_xss_entity_type_returns_error(self) -> None: "pdf_b64": "", "selected_entities": selected, }, + cookies={"lang": "de"}, ) assert r.status_code == 200 - # Skipped items now raise ValueError, returning error to user assert "konnten nicht" in r.text assert "