JuliusScheuerer · JuliusScheuerer · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026 · Mar 5, 2026
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1 @@
+*.woff2 binary
diff --git a/.gitignore b/.gitignore
@@ -27,3 +27,9 @@ coverage.xml
 .copier-answers.yml
 # Generated test documents
 test_documents/
+# Claude Code / AI tooling
+CLAUDE.md
+.claude/
+.playwright-mcp/
+# Design artifacts
+design-*.png
diff --git a/CLAUDE.md b/CLAUDE.md
diff --git a/src/document_anonymizer/security/middleware.py b/src/document_anonymizer/security/middleware.py
@@ -7,6 +7,8 @@
 from starlette.requests import Request
 from starlette.responses import Response
 
+_STATIC_PATH_PREFIX = "/static/"
+
 
 class SecurityHeadersMiddleware(BaseHTTPMiddleware):
     """Add security headers to all responses."""
@@ -44,9 +46,21 @@ async def dispatch(
             response.headers["Permissions-Policy"] = (
                 "camera=(), microphone=(), geolocation=()"
             )
-            # Prevent browsers from caching PII-containing responses
-            response.headers["Cache-Control"] = "no-store, no-cache, must-revalidate"
-            response.headers["Pragma"] = "no-cache"
+            # SECURITY INVARIANT: Only successful /static/ responses may be
+            # cached.  All other responses may contain PII and MUST use
+            # no-store.  If you add routes under /static/, ensure they
+            # serve no user data.
+            is_cacheable_static = (
+                request.url.path.startswith(_STATIC_PATH_PREFIX)
+                and response.status_code == 200
+            )
+            if is_cacheable_static:
+                response.headers["Cache-Control"] = "public, max-age=86400"
+            else:
+                response.headers["Cache-Control"] = (
+                    "no-store, no-cache, must-revalidate"
+                )
+                response.headers["Pragma"] = "no-cache"
             response.headers["X-Request-ID"] = request_id
             return response
         finally:

diff --git a/src/document_anonymizer/web/routes.py b/src/document_anonymizer/web/routes.py
@@ -168,27 +168,37 @@ def _reconstruct_recognizer_results(
     for item in raw:
         if not isinstance(item, dict):
             skipped += 1
+            logger.debug("entity_skip_not_dict")
             continue
         try:
             start = int(item["start"])
             end = int(item["end"])
             score = float(item["score"])
             if not (0.0 <= score <= 1.0):
                 skipped += 1
+                logger.debug("entity_skip_score_range", score=score)
                 continue
             entity_type = str(item["entity_type"])
         except (KeyError, ValueError, TypeError):
             skipped += 1
+            logger.debug("entity_skip_parse_error")
             continue
 
         # Validate bounds
         if start < 0 or end <= start or end > text_len:
             skipped += 1
+            logger.debug(
+                "entity_skip_bounds",
+                start=start,
+                end=end,
+                text_len=text_len,
+            )
             continue
 
         # Validate entity type format (prevent XSS in CSS classes)
         if not _ENTITY_TYPE_RE.match(entity_type):
             skipped += 1
+            logger.debug("entity_skip_type_format", entity_type_len=len(entity_type))
             continue
 
         results.append(
@@ -221,10 +231,14 @@ def _reconstruct_selected_entities_for_pdf(
     for item in raw:
         if not isinstance(item, dict) or "text" not in item:
             skipped += 1
+            logger.debug("pdf_entity_skip_invalid_item")
             continue
         text = str(item["text"]).strip()
         if not text or len(text) > _MAX_ENTITY_TEXT_LENGTH:
             skipped += 1
+            logger.debug(
+                "pdf_entity_skip_text_validation", text_len=len(str(item["text"]))
+            )
             continue
         targets.append(RedactionTarget(text=text))
 
@@ -253,6 +267,17 @@ async def index(request: Request) -> HTMLResponse:
 _MAX_TEXT_LENGTH = 100_000
 
 
+def _normalize_line_endings(text: str) -> str:
+    """Return ``text`` with every CRLF pair and lone CR rewritten as LF.
+
+    Form posts may arrive with CRLF newlines, whereas an HTML parser reading
+    the text back out of a hidden input's value attribute yields LF only.
+    Converting here keeps entity offsets valid from detect to anonymize.
+    """
+    crlf_collapsed = text.replace("\r\n", "\n")  # pairs first: one LF each
+    return crlf_collapsed.replace("\r", "\n")
+
+
 @web_router.post(
     "/detect",
     response_class=HTMLResponse,
@@ -297,6 +322,8 @@ async def detect_form(
         else:
             text = content.decode("utf-8", errors="replace")
 
+    text = _normalize_line_endings(text)
+
     if not text.strip():
         return templates.TemplateResponse(
             request,
@@ -375,6 +402,8 @@ async def anonymize_form(
     anonymizer: AnonymizerEngine = Depends(get_anonymizer),  # noqa: B008
 ) -> HTMLResponse:
     """Handle anonymization form submission."""
+    text = _normalize_line_endings(text)
+
     try:
         strat = AnonymizationStrategy(strategy)
     except ValueError: