Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.woff2 binary
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,9 @@ coverage.xml
.copier-answers.yml
# Generated test documents
test_documents/
# Claude Code / AI tooling
CLAUDE.md
.claude/
.playwright-mcp/
# Design artifacts
design-*.png
76 changes: 0 additions & 76 deletions CLAUDE.md

This file was deleted.

20 changes: 17 additions & 3 deletions src/document_anonymizer/security/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from starlette.requests import Request
from starlette.responses import Response

# URL path prefix compared against the request path to decide whether a
# response may be given a cacheable Cache-Control header.
_STATIC_PATH_PREFIX = "/static/"


class SecurityHeadersMiddleware(BaseHTTPMiddleware):
"""Add security headers to all responses."""
Expand Down Expand Up @@ -44,9 +46,21 @@ async def dispatch(
response.headers["Permissions-Policy"] = (
"camera=(), microphone=(), geolocation=()"
)
# Prevent browsers from caching PII-containing responses
response.headers["Cache-Control"] = "no-store, no-cache, must-revalidate"
response.headers["Pragma"] = "no-cache"
# SECURITY INVARIANT: Only successful /static/ responses may be
# cached. All other responses may contain PII and MUST use
# no-store. If you add routes under /static/, ensure they
# serve no user data.
is_cacheable_static = (
request.url.path.startswith(_STATIC_PATH_PREFIX)
and response.status_code == 200
)
if is_cacheable_static:
response.headers["Cache-Control"] = "public, max-age=86400"
else:
response.headers["Cache-Control"] = (
"no-store, no-cache, must-revalidate"
)
response.headers["Pragma"] = "no-cache"
response.headers["X-Request-ID"] = request_id
return response
finally:
Expand Down
29 changes: 29 additions & 0 deletions src/document_anonymizer/web/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,27 +168,37 @@ def _reconstruct_recognizer_results(
for item in raw:
if not isinstance(item, dict):
skipped += 1
logger.debug("entity_skip_not_dict")
continue
try:
start = int(item["start"])
end = int(item["end"])
score = float(item["score"])
if not (0.0 <= score <= 1.0):
skipped += 1
logger.debug("entity_skip_score_range", score=score)
continue
entity_type = str(item["entity_type"])
except (KeyError, ValueError, TypeError):
skipped += 1
logger.debug("entity_skip_parse_error")
continue

# Validate bounds
if start < 0 or end <= start or end > text_len:
skipped += 1
logger.debug(
"entity_skip_bounds",
start=start,
end=end,
text_len=text_len,
)
continue

# Validate entity type format (prevent XSS in CSS classes)
if not _ENTITY_TYPE_RE.match(entity_type):
skipped += 1
logger.debug("entity_skip_type_format", entity_type_len=len(entity_type))
continue

results.append(
Expand Down Expand Up @@ -221,10 +231,14 @@ def _reconstruct_selected_entities_for_pdf(
for item in raw:
if not isinstance(item, dict) or "text" not in item:
skipped += 1
logger.debug("pdf_entity_skip_invalid_item")
continue
text = str(item["text"]).strip()
if not text or len(text) > _MAX_ENTITY_TEXT_LENGTH:
skipped += 1
logger.debug(
"pdf_entity_skip_text_validation", text_len=len(str(item["text"]))
)
continue
targets.append(RedactionTarget(text=text))

Expand Down Expand Up @@ -253,6 +267,17 @@ async def index(request: Request) -> HTMLResponse:
# Upper bound of 100,000 on text length.
# NOTE(review): presumably applied to submitted text in the form handlers;
# the enforcing code is not visible here -- confirm unit (characters vs bytes).
_MAX_TEXT_LENGTH = 100_000


def _normalize_line_endings(text: str) -> str:
"""Normalize CRLF and CR line endings to LF.

Browser form submissions may encode line endings as CRLF, but when text
is later embedded in an HTML hidden input's value attribute, the HTML
parser normalizes CRLF and CR to LF. Normalizing upfront ensures entity
positions remain valid across the detect -> anonymize round-trip.
"""
return text.replace("\r\n", "\n").replace("\r", "\n")


@web_router.post(
"/detect",
response_class=HTMLResponse,
Expand Down Expand Up @@ -297,6 +322,8 @@ async def detect_form(
else:
text = content.decode("utf-8", errors="replace")

text = _normalize_line_endings(text)

if not text.strip():
return templates.TemplateResponse(
request,
Expand Down Expand Up @@ -375,6 +402,8 @@ async def anonymize_form(
anonymizer: AnonymizerEngine = Depends(get_anonymizer), # noqa: B008
) -> HTMLResponse:
"""Handle anonymization form submission."""
text = _normalize_line_endings(text)

try:
strat = AnonymizationStrategy(strategy)
except ValueError:
Expand Down
Loading