diff --git a/docs/src/pages/create.astro b/docs/src/pages/create.astro index 6773eb6..d5262ad 100644 --- a/docs/src/pages/create.astro +++ b/docs/src/pages/create.astro @@ -39,11 +39,11 @@ const jsonLd = graph([

1 · PDF

-
+
+ +
@@ -56,11 +56,11 @@ const jsonLd = graph([
@@ -82,10 +82,13 @@ const jsonLd = graph([

- Want to add BGE-M3 semantic embeddings? Install the CLI - (brew install cvfile/tap/cv) and run - cv pack --embed-with bge-m3. The model is ~285 MB so it - runs once locally on your machine, not on every visitor's browser. + Want to add BGE-M3 semantic embeddings? Generate them with the Python + package (pip install "cvfile[embed]"): + run embed(markdown) then pack with + pack(..., embeddings=encode_embeddings(payload)). The model is + ~285 MB so it runs once locally on your machine, not on every visitor's + browser. The cv CLI is reader-only (extract, inspect, validate, + search) and does not generate embeddings.

@@ -65,7 +81,6 @@ const jsonLd = graph([ await viewer.loadFromBytes(buf); } - dz.addEventListener('click', () => picker.click()); picker.addEventListener('change', () => { const file = picker.files?.[0]; if (file) void handle(file); diff --git a/integrations/cvfile-haystack/pyproject.toml b/integrations/cvfile-haystack/pyproject.toml index 2a72390..39f3e1f 100644 --- a/integrations/cvfile-haystack/pyproject.toml +++ b/integrations/cvfile-haystack/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ "Programming Language :: Python :: 3.13", ] dependencies = [ - "cvfile>=0.1.0,<1", + "cvfile>=0.1,<2", "haystack-ai>=2.8,<3", ] diff --git a/integrations/cvfile-haystack/src/haystack_integrations/components/converters/cvfile/converter.py b/integrations/cvfile-haystack/src/haystack_integrations/components/converters/cvfile/converter.py index 1150f4b..2c201d5 100644 --- a/integrations/cvfile-haystack/src/haystack_integrations/components/converters/cvfile/converter.py +++ b/integrations/cvfile-haystack/src/haystack_integrations/components/converters/cvfile/converter.py @@ -28,13 +28,41 @@ def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]: "mime_type": payload.mime_type, "payload": payload.name, "relationship": payload.relationship, - "language": payload.language or file.metadata.primary_language, + "language": payload.language, "primary": payload.name == file.metadata.primary_payload, "cv_version": file.metadata.version, "cv_generator": file.metadata.generator, } +def _resolve_chunks(file: CvFile) -> list: + """Decode the file's embeddings.cbor into text-resolved chunks. + + Delegates to the core SDK so chunk text slicing uses UTF-8 byte offsets + (spec §5.1) and stays the single source of truth. Returns an empty list + when the embed extra is not installed or the file carries no embeddings. 
+ """ + try: + from cvfile.embed import resolve_embedding_chunks + except ImportError: + return [] + return resolve_embedding_chunks(file) + + +def _chunk_meta(chunk: Any, file: CvFile) -> dict[str, Any]: + return { + "language": file.metadata.primary_language, + "cv_version": file.metadata.version, + "cv_generator": file.metadata.generator, + "chunk_id": chunk.id, + "chunk_offset": chunk.text_offset, + "chunk_length": chunk.text_length, + "embedding_model": chunk.model, + "embedding_dimension": chunk.dimension, + "embedding_metric": chunk.metric, + } + + @component class CVFileToDocument: """Convert ``.cv`` files into Haystack ``Document`` objects. @@ -48,18 +76,32 @@ class CVFileToDocument: Set ``primary_only=True`` to emit only the payload marked as ``primaryPayload`` in the file's XMP metadata (usually the canonical Markdown copy), and skip all alternates. + + Set ``mode="chunks"`` to emit one ``Document`` per pre-computed embedding + chunk instead of one per payload. Each chunk ``Document`` carries its vector + on ``Document.embedding`` and its text is sliced from the markdown using + UTF-8 byte offsets. Files without an embeddings payload fall back to a single + Markdown ``Document``. In ``mode="chunks"`` the ``primary_only`` flag is + ignored (chunks already index a single text payload). """ - def __init__(self, primary_only: bool = False) -> None: + def __init__(self, primary_only: bool = False, *, mode: str = "payloads") -> None: """Create a CVFileToDocument component. :param primary_only: If ``True``, emit only the payload marked as ``primaryPayload`` in the file's XMP metadata. If ``False`` (default), emit one ``Document`` per textual payload (the primary plus any - language alternates and supplements). + language alternates and supplements). Ignored in ``mode="chunks"``. + :param mode: + ``"payloads"`` (default) emits one ``Document`` per textual payload. + ``"chunks"`` emits one ``Document`` per pre-computed embedding chunk + with its vector attached. 
""" + if mode not in ("payloads", "chunks"): + raise ValueError("mode must be 'payloads' or 'chunks'") self.primary_only = primary_only + self.mode = mode @component.output_types(documents=list[Document]) def run( @@ -105,6 +147,10 @@ def run( stream_meta = bytestream.meta or {} source_label = stream_meta.get("file_path") or stream_meta.get("file_name") or str(source) + if self.mode == "chunks": + documents.extend(self._chunk_documents(file, stream_meta, source_meta, source_label)) + continue + for payload in file.payloads: if not _is_text_payload(payload): continue @@ -115,3 +161,28 @@ def run( documents.append(Document(content=payload.text(), meta=merged)) return {"documents": documents} + + @staticmethod + def _chunk_documents( + file: CvFile, + stream_meta: dict[str, Any], + source_meta: dict[str, Any], + source_label: str, + ) -> list[Document]: + chunks = _resolve_chunks(file) + if not chunks: + primary = next( + (p for p in file.payloads if p.name == file.metadata.primary_payload and _is_text_payload(p)), + None, + ) + if primary is None: + return [] + payload_meta = _payload_meta(primary, file) + merged = {**stream_meta, **payload_meta, **source_meta, "source": source_label} + return [Document(content=primary.text(), meta=merged)] + + out: list[Document] = [] + for chunk in chunks: + merged = {**stream_meta, **_chunk_meta(chunk, file), **source_meta, "source": source_label} + out.append(Document(content=chunk.text, meta=merged, embedding=list(chunk.vector))) + return out diff --git a/integrations/cvfile-haystack/tests/test_converter.py b/integrations/cvfile-haystack/tests/test_converter.py index 797dc73..3c8af06 100644 --- a/integrations/cvfile-haystack/tests/test_converter.py +++ b/integrations/cvfile-haystack/tests/test_converter.py @@ -11,6 +11,7 @@ from haystack_integrations.components.converters.cvfile import CVFileToDocument FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv" +UNICODE_FIXTURE = 
Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv" @pytest.fixture(scope="module") @@ -75,3 +76,33 @@ def test_unreadable_source_is_skipped(tmp_path: Path) -> None: not_a_cv.write_bytes(b"not a real cv file") result = converter.run(sources=[not_a_cv]) assert result["documents"] == [] + + +def test_chunks_mode_attaches_a_vector_per_chunk() -> None: + if not FIXTURE.exists(): + pytest.skip(f"fixture not found: {FIXTURE}") + docs = CVFileToDocument(mode="chunks").run(sources=[FIXTURE])["documents"] + assert len(docs) >= 1 + for doc in docs: + assert doc.embedding is not None + assert len(doc.embedding) == doc.meta["embedding_dimension"] + assert all(isinstance(v, float) for v in doc.embedding) + assert doc.content.strip(), "chunk text should not be empty" + + +def test_invalid_mode_rejected() -> None: + with pytest.raises(ValueError): + CVFileToDocument(mode="bogus") + + +def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None: + if not UNICODE_FIXTURE.exists(): + pytest.skip(f"fixture not found: {UNICODE_FIXTURE}") + docs = CVFileToDocument(mode="chunks").run(sources=[UNICODE_FIXTURE])["documents"] + joined = "".join(d.content for d in docs) + assert "Élodie" in joined + assert "工程師" in joined + assert "🚀" in joined + assert "经验" in joined + for doc in docs: + assert doc.content == doc.content.encode("utf-8").decode("utf-8") diff --git a/integrations/langchain-cvfile/pyproject.toml b/integrations/langchain-cvfile/pyproject.toml index 1b7ebd8..7842cfd 100644 --- a/integrations/langchain-cvfile/pyproject.toml +++ b/integrations/langchain-cvfile/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ "Programming Language :: Python :: 3.13", ] dependencies = [ - "cvfile>=0.1.0,<1", + "cvfile>=0.1,<2", "langchain-core>=0.3,<1", ] diff --git a/integrations/langchain-cvfile/src/langchain_cvfile/loader.py b/integrations/langchain-cvfile/src/langchain_cvfile/loader.py index ee27ed6..c048e55 100644 --- 
a/integrations/langchain-cvfile/src/langchain_cvfile/loader.py +++ b/integrations/langchain-cvfile/src/langchain_cvfile/loader.py @@ -29,7 +29,7 @@ def _payload_to_document(payload: ExtractedPayload, file: CvFile, source: str) - "mime_type": payload.mime_type, "payload": payload.name, "relationship": payload.relationship, - "language": payload.language or file.metadata.primary_language, + "language": payload.language, "primary": payload.name == file.metadata.primary_payload, "cv_version": file.metadata.version, "cv_generator": file.metadata.generator, @@ -37,28 +37,86 @@ def _payload_to_document(payload: ExtractedPayload, file: CvFile, source: str) - ) +def _resolve_chunks(file: CvFile) -> list: + """Decode the file's embeddings.cbor into text-resolved chunks. + + Delegates to the core SDK so chunk text slicing uses UTF-8 byte offsets + (spec §5.1) and stays the single source of truth. Returns an empty list + when the embed extra is not installed or the file carries no embeddings. + """ + try: + from cvfile.embed import resolve_embedding_chunks + except ImportError: + return [] + return resolve_embedding_chunks(file) + + class CVFileLoader(BaseLoader): - """Load a ``.cv`` file and emit one ``Document`` per embedded text payload. + """Load a ``.cv`` file and emit ``Document`` objects. A ``.cv`` file is a PDF/A-3u with Markdown, HTML, and optional JSON - payloads attached via PDF Associated Files. This loader returns one - ``Document`` per textual payload (the visual PDF layer is intentionally - skipped: the embedded Markdown is a cleaner text representation of the - same content, which is the whole point of the format). - - The payload marked as ``primaryPayload`` in the file's XMP metadata is - flagged in ``metadata["primary"] = True`` so downstream code can keep - just the canonical text and drop alternates if needed. + payloads attached via PDF Associated Files. 
The visual PDF layer is + intentionally skipped: the embedded Markdown is a cleaner text + representation of the same content, which is the whole point of the format. + + Two modes are supported: + + - ``mode="payloads"`` (default): one ``Document`` per textual payload. The + payload marked as ``primaryPayload`` in the file's XMP metadata is flagged + in ``metadata["primary"] = True`` so downstream code can keep just the + canonical text and drop alternates if needed. + - ``mode="chunks"``: one ``Document`` per pre-computed embedding chunk, with + the chunk's vector attached as ``metadata["embedding"]`` and the chunk + text sliced from the markdown using UTF-8 byte offsets. Falls back to a + single Markdown ``Document`` when the file carries no embeddings. """ - def __init__(self, file_path: Union[str, Path]) -> None: + def __init__(self, file_path: Union[str, Path], *, mode: str = "payloads") -> None: + if mode not in ("payloads", "chunks"): + raise ValueError("mode must be 'payloads' or 'chunks'") self.file_path = Path(file_path) + self.mode = mode def lazy_load(self) -> Iterator[Document]: data = self.file_path.read_bytes() file = extract(data) source = str(self.file_path) + + if self.mode == "chunks": + yield from self._lazy_load_chunks(file, source) + return + for payload in file.payloads: if not _is_text_payload(payload): continue yield _payload_to_document(payload, file, source) + + def _lazy_load_chunks(self, file: CvFile, source: str) -> Iterator[Document]: + chunks = _resolve_chunks(file) + if not chunks: + # No precomputed embeddings: fall back to the primary text payload. 
+ primary = next( + (p for p in file.payloads if p.name == file.metadata.primary_payload and _is_text_payload(p)), + None, + ) + if primary is not None: + yield _payload_to_document(primary, file, source) + return + + for chunk in chunks: + yield Document( + page_content=chunk.text, + metadata={ + "source": source, + "language": file.metadata.primary_language, + "cv_version": file.metadata.version, + "cv_generator": file.metadata.generator, + "chunk_id": chunk.id, + "chunk_offset": chunk.text_offset, + "chunk_length": chunk.text_length, + "embedding": list(chunk.vector), + "embedding_model": chunk.model, + "embedding_dimension": chunk.dimension, + "embedding_metric": chunk.metric, + }, + ) diff --git a/integrations/langchain-cvfile/tests/test_loader.py b/integrations/langchain-cvfile/tests/test_loader.py index f2c99b6..fb76457 100644 --- a/integrations/langchain-cvfile/tests/test_loader.py +++ b/integrations/langchain-cvfile/tests/test_loader.py @@ -10,6 +10,7 @@ from langchain_cvfile import CVFileLoader FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv" +UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv" @pytest.fixture(scope="module") @@ -46,3 +47,36 @@ def test_lazy_load_is_streaming(loader: CVFileLoader) -> None: it = loader.lazy_load() first = next(it) assert isinstance(first, Document) + + +def test_chunks_mode_attaches_a_vector_per_chunk() -> None: + if not FIXTURE.exists(): + pytest.skip(f"fixture not found: {FIXTURE}") + docs = CVFileLoader(FIXTURE, mode="chunks").load() + assert len(docs) >= 1 + for doc in docs: + emb = doc.metadata.get("embedding") + assert isinstance(emb, list) and len(emb) == doc.metadata["embedding_dimension"] + assert all(isinstance(v, float) for v in emb) + assert doc.metadata["embedding_model"] + assert doc.page_content.strip(), "chunk text should not be empty" + + +def test_invalid_mode_rejected() -> None: + with pytest.raises(ValueError): + 
CVFileLoader(FIXTURE, mode="bogus") + + +def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None: + if not UNICODE_FIXTURE.exists(): + pytest.skip(f"fixture not found: {UNICODE_FIXTURE}") + docs = CVFileLoader(UNICODE_FIXTURE, mode="chunks").load() + joined = "".join(d.page_content for d in docs) + # Multibyte characters survive intact: a code-point slice would mojibake these. + assert "Élodie" in joined + assert "工程師" in joined + assert "🚀" in joined + assert "经验" in joined + # Every chunk decodes to valid text (no broken surrogate / partial byte runs). + for doc in docs: + assert doc.page_content == doc.page_content.encode("utf-8").decode("utf-8") diff --git a/integrations/llama-index-readers-cvfile/pyproject.toml b/integrations/llama-index-readers-cvfile/pyproject.toml index 9277c13..2d2fe3f 100644 --- a/integrations/llama-index-readers-cvfile/pyproject.toml +++ b/integrations/llama-index-readers-cvfile/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ "Programming Language :: Python :: 3.13", ] dependencies = [ - "cvfile>=0.1.0,<1", + "cvfile>=0.1,<2", "llama-index-core>=0.11,<0.15", ] diff --git a/integrations/llama-index-readers-cvfile/src/llama_index/readers/cvfile/base.py b/integrations/llama-index-readers-cvfile/src/llama_index/readers/cvfile/base.py index c34d891..b9b31f0 100644 --- a/integrations/llama-index-readers-cvfile/src/llama_index/readers/cvfile/base.py +++ b/integrations/llama-index-readers-cvfile/src/llama_index/readers/cvfile/base.py @@ -32,7 +32,7 @@ def _payload_to_document( "mime_type": payload.mime_type, "payload": payload.name, "relationship": payload.relationship, - "language": payload.language or file.metadata.primary_language, + "language": payload.language, "primary": payload.name == file.metadata.primary_payload, "cv_version": file.metadata.version, "cv_generator": file.metadata.generator, @@ -42,15 +42,41 @@ def _payload_to_document( return Document(text=payload.text(), metadata=metadata) +def _resolve_chunks(file: CvFile) -> 
list: + """Decode the file's embeddings.cbor into text-resolved chunks. + + Delegates to the core SDK so chunk text slicing uses UTF-8 byte offsets + (spec §5.1) and stays the single source of truth. Returns an empty list + when the embed extra is not installed or the file carries no embeddings. + """ + try: + from cvfile.embed import resolve_embedding_chunks + except ImportError: + return [] + return resolve_embedding_chunks(file) + + class CVFileReader(BaseReader): - """Read a ``.cv`` file and emit one ``Document`` per embedded text payload. + """Read a ``.cv`` file and emit ``Document`` objects. A ``.cv`` file is a PDF/A-3u carrying Markdown, HTML, and optional JSON - payloads via PDF Associated Files. This reader returns one ``Document`` - per textual payload (the visual PDF layer is skipped because the embedded - Markdown is a cleaner text representation of the same content). + payloads via PDF Associated Files. The visual PDF layer is skipped because + the embedded Markdown is a cleaner text representation of the same content. + + Two modes are supported: + + - ``mode="payloads"`` (default): one ``Document`` per textual payload. + - ``mode="chunks"``: one ``Document`` per pre-computed embedding chunk, with + the chunk's vector attached on ``Document.embedding`` and the chunk text + sliced from the markdown using UTF-8 byte offsets. Falls back to a single + Markdown ``Document`` when the file carries no embeddings. 
""" + def __init__(self, *, mode: str = "payloads") -> None: + if mode not in ("payloads", "chunks"): + raise ValueError("mode must be 'payloads' or 'chunks'") + self.mode = mode + def load_data( self, file: Path, @@ -59,8 +85,54 @@ def load_data( path = Path(file) cv_file = extract(path.read_bytes()) source = str(path) + + if self.mode == "chunks": + return self._load_chunks(cv_file, source, extra_info) + return [ _payload_to_document(payload, cv_file, source, extra_info) for payload in cv_file.payloads if _is_text_payload(payload) ] + + def _load_chunks( + self, + cv_file: CvFile, + source: str, + extra_info: Optional[dict], + ) -> list[Document]: + chunks = _resolve_chunks(cv_file) + if not chunks: + primary = next( + ( + p + for p in cv_file.payloads + if p.name == cv_file.metadata.primary_payload and _is_text_payload(p) + ), + None, + ) + if primary is None: + return [] + return [_payload_to_document(primary, cv_file, source, extra_info)] + + out: list[Document] = [] + for chunk in chunks: + metadata: dict = { + "source": source, + "file_name": Path(source).name, + "language": cv_file.metadata.primary_language, + "cv_version": cv_file.metadata.version, + "cv_generator": cv_file.metadata.generator, + "chunk_id": chunk.id, + "chunk_offset": chunk.text_offset, + "chunk_length": chunk.text_length, + "embedding_model": chunk.model, + "embedding_dimension": chunk.dimension, + "embedding_metric": chunk.metric, + } + if extra_info: + metadata.update(extra_info) + doc = Document(text=chunk.text, metadata=metadata) + doc.embedding = list(chunk.vector) + out.append(doc) + return out diff --git a/integrations/llama-index-readers-cvfile/tests/test_reader.py b/integrations/llama-index-readers-cvfile/tests/test_reader.py index 4d86c25..0aa1492 100644 --- a/integrations/llama-index-readers-cvfile/tests/test_reader.py +++ b/integrations/llama-index-readers-cvfile/tests/test_reader.py @@ -10,6 +10,7 @@ from llama_index.readers.cvfile import CVFileReader FIXTURE = 
Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv" +UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv" @pytest.fixture(scope="module") @@ -45,3 +46,33 @@ def test_primary_is_text_content(reader: CVFileReader) -> None: def test_extra_info_is_merged(reader: CVFileReader) -> None: docs = reader.load_data(file=FIXTURE, extra_info={"tenant": "acme"}) assert all(d.metadata.get("tenant") == "acme" for d in docs) + + +def test_chunks_mode_attaches_a_vector_per_chunk() -> None: + if not FIXTURE.exists(): + pytest.skip(f"fixture not found: {FIXTURE}") + docs = CVFileReader(mode="chunks").load_data(file=FIXTURE) + assert len(docs) >= 1 + for doc in docs: + assert doc.embedding is not None + assert len(doc.embedding) == doc.metadata["embedding_dimension"] + assert all(isinstance(v, float) for v in doc.embedding) + assert doc.text.strip(), "chunk text should not be empty" + + +def test_invalid_mode_rejected() -> None: + with pytest.raises(ValueError): + CVFileReader(mode="bogus") + + +def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None: + if not UNICODE_FIXTURE.exists(): + pytest.skip(f"fixture not found: {UNICODE_FIXTURE}") + docs = CVFileReader(mode="chunks").load_data(file=UNICODE_FIXTURE) + joined = "".join(d.text for d in docs) + assert "Élodie" in joined + assert "工程師" in joined + assert "🚀" in joined + assert "经验" in joined + for doc in docs: + assert doc.text == doc.text.encode("utf-8").decode("utf-8") diff --git a/integrations/tests/fixtures/build_unicode_fixture.py b/integrations/tests/fixtures/build_unicode_fixture.py new file mode 100644 index 0000000..e04717a --- /dev/null +++ b/integrations/tests/fixtures/build_unicode_fixture.py @@ -0,0 +1,99 @@ +"""Build a non-ASCII .cv fixture so the byte-offset chunk path is covered. 
+ +The markdown mixes accented Latin, CJK and emoji so that UTF-8 byte offsets +diverge from code-point indices: a chunker that sliced on str indices would +return garbled text here, while byte-offset slicing recovers the exact source. + +Run with the cvfile SDK (and its [embed] extra) on PYTHONPATH: + + python integrations/tests/fixtures/build_unicode_fixture.py +""" + +from __future__ import annotations + +import hashlib +import io +import struct +from pathlib import Path + +import pypdf +from cvfile import extract, pack, validate +from cvfile.embed import EmbedOptions, embed + +_EMBED_DIMENSION = 8 + +# Heading-led sections so the section chunker emits one chunk per section, and +# every section contains multibyte characters before later sections start. +UNICODE_MD = """# Élodie Gauthier · 工程師 🚀 + +Ingénieure logicielle à Montréal, Québec. Café ☕ et résolution de problèmes. + +## Expérience 经验 + +* Conçu des systèmes distribués 分布式系统 à grande échelle +* Mentorat d'équipe 团队 et révisions de code 🔍 + +## Compétences + +* Python, Rust, Go — performance et fiabilité +* Langues : français, English, 中文 🌏 +""" + +UNICODE_HTML = ( + '\n' + 'Élodie Gauthier\n' + "

Élodie Gauthier · 工程師 🚀

Ingénieure logicielle.

" +) + + +class DeterministicBackend: + """Offline, reproducible embedding backend (see build_python_sample.py).""" + + model = "fixture/deterministic-hash" + model_revision = "v1" + metric = "cosine" + normalized = False + + def embed(self, texts: list[str]) -> tuple[list[tuple[float, ...]], int]: + vectors: list[tuple[float, ...]] = [] + for text in texts: + digest = hashlib.sha256(text.encode("utf-8")).digest() + raw = (digest * ((_EMBED_DIMENSION * 4) // len(digest) + 1))[: _EMBED_DIMENSION * 4] + vectors.append(struct.unpack(f"<{_EMBED_DIMENSION}f", raw)) + return vectors, _EMBED_DIMENSION + + +def make_blank_pdf() -> bytes: + writer = pypdf.PdfWriter() + writer.add_blank_page(width=300, height=400) + buf = io.BytesIO() + writer.write(buf) + return buf.getvalue() + + +def main() -> None: + out_dir = Path(__file__).resolve().parent + out_dir.mkdir(parents=True, exist_ok=True) + + embeddings = embed(UNICODE_MD, EmbedOptions(chunking="section", backend=DeterministicBackend())) + + cv = pack( + pdf=make_blank_pdf(), + markdown=UNICODE_MD, + html=UNICODE_HTML, + embeddings=embeddings, + metadata={"primary_language": "fr", "generator": "cvfile-integrations/unicode-fixture"}, + ) + + out_path = out_dir / "unicode.cv" + out_path.write_bytes(cv) + print(f"Wrote {out_path} ({len(cv)} bytes)") + + file = extract(cv) + print(f" payloads: {[p.name for p in file.payloads]}") + report = validate(cv) + print(f" validate: ok={report.ok} issues={len(report.issues)}") + + +if __name__ == "__main__": + main() diff --git a/integrations/tests/fixtures/unicode.cv b/integrations/tests/fixtures/unicode.cv new file mode 100644 index 0000000..6f666e1 Binary files /dev/null and b/integrations/tests/fixtures/unicode.cv differ diff --git a/packages/embed-js/src/chunk.ts b/packages/embed-js/src/chunk.ts index e2855e3..abf9c32 100644 --- a/packages/embed-js/src/chunk.ts +++ b/packages/embed-js/src/chunk.ts @@ -5,8 +5,18 @@ * carries the byte offset and length into the original UTF-8 source 
so a * downstream consumer can map a vector hit back to the exact substring * without re-tokenising. Pre-heading content becomes a "preamble" chunk. + * + * Per spec §5.1, `textOffset`/`textLength` are UTF-8 *byte* offsets into the + * markdown source. We encode the document once with `TextEncoder`, track a + * byte cursor while iterating lines (counting the trailing `\n` byte), and + * derive each chunk's `text` by decoding the corresponding byte slice. This + * keeps the offsets in agreement with the Go and Python SDKs for any + * non-ASCII résumé. */ +const encoder = new TextEncoder(); +const decoder = new TextDecoder(); + export type ChunkingMode = 'document' | 'section' | 'paragraph'; export interface MarkdownChunk { @@ -22,26 +32,38 @@ export interface ChunkOptions { const HEADING = /^(#{1,6})\s+(.+?)\s*$/; +/** A source line plus its UTF-8 byte offset and byte length (including any trailing `\n`). */ +interface ByteLine { + text: string; + offset: number; + byteLength: number; +} + export function chunkMarkdown(markdown: string, opts: ChunkOptions = {}): MarkdownChunk[] { const mode = opts.mode ?? 
'section'; + const bytes = encoder.encode(markdown); if (mode === 'document') { - return [{ id: 'document', textOffset: 0, textLength: markdown.length, text: markdown }]; + return [documentChunk(bytes)]; } if (mode === 'paragraph') { - return paragraphChunks(markdown); + return paragraphChunks(bytes); } - return sectionChunks(markdown); + return sectionChunks(bytes); +} + +function documentChunk(bytes: Uint8Array): MarkdownChunk { + return { id: 'document', textOffset: 0, textLength: bytes.byteLength, text: sliceText(bytes, 0, bytes.byteLength) }; } -function sectionChunks(markdown: string): MarkdownChunk[] { - const lines = splitWithOffsets(markdown); +function sectionChunks(bytes: Uint8Array): MarkdownChunk[] { + const lines = splitWithByteOffsets(bytes); const sections: MarkdownChunk[] = []; let current: { id: string; start: number; end: number } | null = null; const ids = new Set(); function flush(end: number): void { if (!current) return; - const text = markdown.slice(current.start, end); + const text = sliceText(bytes, current.start, end); if (text.trim().length === 0) { current = null; return; @@ -52,63 +74,93 @@ function sectionChunks(markdown: string): MarkdownChunk[] { for (const line of lines) { const match = HEADING.exec(line.text); + const lineEnd = line.offset + line.byteLength; if (match) { flush(line.offset); const id = uniqueId(slugify(match[2] ?? 
`section-${sections.length + 1}`), ids); ids.add(id); - current = { id, start: line.offset, end: line.offset + line.text.length }; + current = { id, start: line.offset, end: lineEnd }; continue; } if (current === null) { const id = uniqueId('preamble', ids); ids.add(id); - current = { id, start: line.offset, end: line.offset + line.text.length }; + current = { id, start: line.offset, end: lineEnd }; } else { - current.end = line.offset + line.text.length; + current.end = lineEnd; } } - flush(markdown.length); + flush(bytes.byteLength); if (sections.length === 0) { - return [{ id: 'document', textOffset: 0, textLength: markdown.length, text: markdown }]; + return [documentChunk(bytes)]; } return sections; } -function paragraphChunks(markdown: string): MarkdownChunk[] { +function paragraphChunks(bytes: Uint8Array): MarkdownChunk[] { const out: MarkdownChunk[] = []; const ids = new Set(); + const separator = encoder.encode('\n\n'); let cursor = 0; let i = 0; - while (cursor < markdown.length) { - let end = markdown.indexOf('\n\n', cursor); - if (end === -1) end = markdown.length; - const text = markdown.slice(cursor, end); + while (cursor < bytes.byteLength) { + let end = indexOfBytes(bytes, separator, cursor); + if (end === -1) end = bytes.byteLength; + const text = sliceText(bytes, cursor, end); if (text.trim().length > 0) { const id = uniqueId(slugify(text.split('\n')[0] ?? 
`p-${i}`), ids); ids.add(id); - out.push({ id, textOffset: cursor, textLength: text.length, text }); + out.push({ id, textOffset: cursor, textLength: end - cursor, text }); i += 1; } - cursor = end + 2; + cursor = end + separator.byteLength; } if (out.length === 0) { - return [{ id: 'document', textOffset: 0, textLength: markdown.length, text: markdown }]; + return [documentChunk(bytes)]; } return out; } -function splitWithOffsets(s: string): { text: string; offset: number }[] { - const lines: { text: string; offset: number }[] = []; - let offset = 0; - for (const line of s.split('\n')) { - const withNl = line + (offset + line.length < s.length ? '\n' : ''); - lines.push({ text: withNl, offset }); - offset += withNl.length; +/** Decode the UTF-8 byte slice `[start, end)` back into a string. */ +function sliceText(bytes: Uint8Array, start: number, end: number): string { + return decoder.decode(bytes.subarray(start, end)); +} + +/** Split UTF-8 bytes into lines, each tagged with its byte offset and byte length (newline included). */ +function splitWithByteOffsets(bytes: Uint8Array): ByteLine[] { + const newline = 0x0a; // '\n' + const lines: ByteLine[] = []; + let start = 0; + for (let i = 0; i < bytes.byteLength; i += 1) { + if (bytes[i] === newline) { + const byteLength = i - start + 1; + lines.push({ text: sliceText(bytes, start, i + 1), offset: start, byteLength }); + start = i + 1; + } + } + if (start < bytes.byteLength) { + lines.push({ text: sliceText(bytes, start, bytes.byteLength), offset: start, byteLength: bytes.byteLength - start }); } return lines; } +/** Find the byte index of `needle` in `haystack` at or after `from`, or -1. 
*/ +function indexOfBytes(haystack: Uint8Array, needle: Uint8Array, from: number): number { + const last = haystack.byteLength - needle.byteLength; + for (let i = from; i <= last; i += 1) { + let matched = true; + for (let j = 0; j < needle.byteLength; j += 1) { + if (haystack[i + j] !== needle[j]) { + matched = false; + break; + } + } + if (matched) return i; + } + return -1; +} + function slugify(s: string): string { return ( s diff --git a/packages/embed-js/tests/chunk.test.ts b/packages/embed-js/tests/chunk.test.ts index 63c24ea..99fd07e 100644 --- a/packages/embed-js/tests/chunk.test.ts +++ b/packages/embed-js/tests/chunk.test.ts @@ -1,6 +1,13 @@ import { describe, expect, it } from 'vitest'; import { chunkMarkdown } from '../src/chunk.js'; +const encoder = new TextEncoder(); +const decoder = new TextDecoder(); + +function byteSlice(source: string, offset: number, length: number): string { + return decoder.decode(encoder.encode(source).subarray(offset, offset + length)); +} + const sample = `# Jane Doe Senior engineer. @@ -24,8 +31,39 @@ describe('chunkMarkdown', () => { it('preserves byte offsets that index back into the source', () => { const chunks = chunkMarkdown(sample); for (const c of chunks) { - const slice = sample.slice(c.textOffset, c.textOffset + c.textLength); - expect(slice).toBe(c.text); + expect(byteSlice(sample, c.textOffset, c.textLength)).toBe(c.text); + } + }); + + it('emits UTF-8 byte offsets that slice back correctly for multibyte content', () => { + const multibyte = `# Résumé de Zoé 🚀 + +Ingénieure logicielle. 日本語 も少し. + +## Expérience + +- Société Générale, 2020 à 2024 — café ☕ inclus + +## Compétences + +TypeScript, Go, Python. Naïve façade. +`; + const chunks = chunkMarkdown(multibyte); + expect(chunks.length).toBeGreaterThan(1); + const totalBytes = new TextEncoder().encode(multibyte).byteLength; + for (const c of chunks) { + // Offsets address bytes, not UTF-16 code units. 
+ expect(c.textOffset).toBeGreaterThanOrEqual(0); + expect(c.textOffset + c.textLength).toBeLessThanOrEqual(totalBytes); + expect(byteSlice(multibyte, c.textOffset, c.textLength)).toBe(c.text); + } + // Heading slug retains ASCII slugification of the multibyte title. + expect(chunks[0]?.id).toBe('r-sum-de-zo'); + // First chunk's byte length exceeds its UTF-16 length because of the emoji + accents. + const first = chunks[0]; + expect(first).toBeDefined(); + if (first) { + expect(first.textLength).toBeGreaterThan(first.text.length); } }); diff --git a/packages/sdk-js/src/digest.ts b/packages/sdk-js/src/digest.ts index 43051a1..b435c8e 100644 --- a/packages/sdk-js/src/digest.ts +++ b/packages/sdk-js/src/digest.ts @@ -16,3 +16,94 @@ function bytesToHex(bytes: Uint8Array): string { } return hex; } + +/** + * MD5 of the given bytes, returned as a lowercase hex string. + * + * MD5 is cryptographically broken and is used here ONLY because the PDF + * specification mandates an MD5 /CheckSum on embedded-file /Params (spec §4.1); + * Web Crypto does not expose MD5, so we provide a small RFC 1321 implementation. 
+ */ +export function md5Hex(bytes: Uint8Array): string { + const digest = md5(bytes); + return bytesToHex(digest); +} + +const S = [ + 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, + 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, + 21, +]; + +const K = (() => { + const out = new Uint32Array(64); + for (let i = 0; i < 64; i += 1) { + out[i] = Math.floor(Math.abs(Math.sin(i + 1)) * 2 ** 32) >>> 0; + } + return out; +})(); + +function md5(input: Uint8Array): Uint8Array { + const originalBitLen = input.length * 8; + const padLen = ((input.length + 8) >> 6) + 1; // number of 64-byte blocks + const msg = new Uint8Array(padLen * 64); + msg.set(input); + msg[input.length] = 0x80; + const view = new DataView(msg.buffer); + view.setUint32(msg.length - 8, originalBitLen >>> 0, true); + view.setUint32(msg.length - 4, Math.floor(originalBitLen / 2 ** 32) >>> 0, true); + + let a0 = 0x67452301; + let b0 = 0xefcdab89; + let c0 = 0x98badcfe; + let d0 = 0x10325476; + + const m = new Uint32Array(16); + for (let off = 0; off < msg.length; off += 64) { + for (let i = 0; i < 16; i += 1) { + m[i] = view.getUint32(off + i * 4, true); + } + let a = a0; + let b = b0; + let c = c0; + let d = d0; + for (let i = 0; i < 64; i += 1) { + let f: number; + let g: number; + if (i < 16) { + f = (b & c) | (~b & d); + g = i; + } else if (i < 32) { + f = (d & b) | (~d & c); + g = (5 * i + 1) % 16; + } else if (i < 48) { + f = b ^ c ^ d; + g = (3 * i + 5) % 16; + } else { + f = c ^ (b | ~d); + g = (7 * i) % 16; + } + f = (f + a + K[i]! + m[g]!) 
>>> 0; + a = d; + d = c; + c = b; + b = (b + rotl(f, S[i]!)) >>> 0; + } + a0 = (a0 + a) >>> 0; + b0 = (b0 + b) >>> 0; + c0 = (c0 + c) >>> 0; + d0 = (d0 + d) >>> 0; + } + + const out = new Uint8Array(16); + const outView = new DataView(out.buffer); + outView.setUint32(0, a0, true); + outView.setUint32(4, b0, true); + outView.setUint32(8, c0, true); + outView.setUint32(12, d0, true); + return out; +} + +function rotl(x: number, c: number): number { + return ((x << c) | (x >>> (32 - c))) >>> 0; +} diff --git a/packages/sdk-js/src/embeddings.ts b/packages/sdk-js/src/embeddings.ts index 717dfb2..a860e79 100644 --- a/packages/sdk-js/src/embeddings.ts +++ b/packages/sdk-js/src/embeddings.ts @@ -96,9 +96,26 @@ function toCborSpace(space: EmbeddingSpace): CborSpace { } function fromCborSpace(raw: CborSpace): EmbeddingSpace { - if (typeof raw.dimension !== 'number' || raw.dimension <= 0) { - throw new Error('Invalid embedding space: dimension must be a positive integer'); + if (typeof raw !== 'object' || raw === null) { + throw new Error('Invalid embedding space: not a map'); } + if (typeof raw.normalized !== 'boolean') { + throw new Error(`Invalid embedding space "${raw.model}": normalized must be a boolean`); + } + if (!Array.isArray(raw.chunks)) { + throw new Error(`Invalid embedding space "${raw.model}": chunks must be an array`); + } + // Mirror the encode-side guarantees (validateSpace) so attacker-supplied CBOR + // cannot smuggle untyped model/metric/chunking/dimension values past readers. + // Scalars are checked before decoding chunks so the dimension used for vector + // length checks is known-valid. 
+ validateScalars({ + model: raw.model, + modelRevision: raw['model-revision'], + dimension: raw.dimension, + metric: raw.metric, + chunking: raw.chunking, + }); return { model: raw.model, modelRevision: raw['model-revision'], @@ -140,19 +157,40 @@ function fromCborChunk(raw: CborChunk, dimension: number): EmbeddingChunk { } function validateSpace(space: EmbeddingSpace): void { - if (!space.model) throw new Error('Embedding space missing model'); - if (!space.modelRevision) throw new Error(`Embedding space "${space.model}" missing modelRevision`); - if (!Number.isInteger(space.dimension) || space.dimension <= 0) { + validateScalars({ + model: space.model, + modelRevision: space.modelRevision, + dimension: space.dimension, + metric: space.metric, + chunking: space.chunking, + }); + if (!Array.isArray(space.chunks) || space.chunks.length === 0) { + throw new Error(`Embedding space "${space.model}" must contain at least one chunk`); + } +} + +interface SpaceScalars { + model: unknown; + modelRevision: unknown; + dimension: unknown; + metric: unknown; + chunking: unknown; +} + +/** Validates the scalar header fields shared by the encode and decode paths. 
*/ +function validateScalars(space: SpaceScalars): void { + if (!space.model || typeof space.model !== 'string') throw new Error('Embedding space missing model'); + if (!space.modelRevision || typeof space.modelRevision !== 'string') { + throw new Error(`Embedding space "${space.model}" missing modelRevision`); + } + if (!Number.isInteger(space.dimension) || (space.dimension as number) <= 0) { throw new Error(`Embedding space "${space.model}" dimension must be a positive integer`); } if (space.metric !== 'cosine' && space.metric !== 'dot' && space.metric !== 'euclidean') { - throw new Error(`Embedding space "${space.model}" has invalid metric "${space.metric}"`); + throw new Error(`Embedding space "${space.model}" has invalid metric "${String(space.metric)}"`); } if (space.chunking !== 'document' && space.chunking !== 'section' && space.chunking !== 'paragraph') { - throw new Error(`Embedding space "${space.model}" has invalid chunking "${space.chunking}"`); - } - if (!Array.isArray(space.chunks) || space.chunks.length === 0) { - throw new Error(`Embedding space "${space.model}" must contain at least one chunk`); + throw new Error(`Embedding space "${space.model}" has invalid chunking "${String(space.chunking)}"`); } } diff --git a/packages/sdk-js/src/pack.ts b/packages/sdk-js/src/pack.ts index 6136e7c..41b0266 100644 --- a/packages/sdk-js/src/pack.ts +++ b/packages/sdk-js/src/pack.ts @@ -1,6 +1,6 @@ -import { AFRelationship, PDFArray, PDFDocument, PDFHexString, PDFName, PDFString } from 'pdf-lib'; +import { AFRelationship, PDFArray, PDFDict, PDFDocument, PDFHexString, PDFName, PDFStream, PDFString } from 'pdf-lib'; import { CV_SPEC_VERSION, DEFAULT_GENERATOR, DEFAULT_PAYLOAD_NAMES, PAYLOAD_MIME_TYPES } from './constants.js'; -import { sha256Hex } from './digest.js'; +import { md5Hex, sha256Hex } from './digest.js'; import { encodeEmbeddings, type EmbeddingsPayload } from './embeddings.js'; import { toBytes, toUint8Array } from './normalize.js'; import { 
setMetadataXml } from './pdf.js'; @@ -35,8 +35,10 @@ export async function pack(input: PackInput): Promise { } } + const payloadBytesByName = new Map(); for (const p of payloads) { const bytes = toBytes(p.data); + payloadBytesByName.set(p.name, bytes); pdfDoc.attach(bytes, p.name, { mimeType: p.mimeType, description: p.description ?? defaultDescription(p), @@ -72,9 +74,90 @@ export async function pack(input: PackInput): Promise { setTrailerId(pdfDoc); } + // Materialize the embedded-file streams so we can amend their /Params before + // serialization. flush() is idempotent (each embeddable guards re-embedding). + await pdfDoc.flush(); + setEmbeddedFileChecksums(pdfDoc, payloadBytesByName, created, modified); + setInfoDates(pdfDoc, created, modified); + return pdfDoc.save({ useObjectStreams: false }); } +const PARAMS = PDFName.of('Params'); +const CHECKSUM = PDFName.of('CheckSum'); +const CREATION_DATE = PDFName.of('CreationDate'); +const MOD_DATE = PDFName.of('ModDate'); + +/** + * Set the spec-mandated MD5 /CheckSum (spec §4.1) on each embedded-file + * stream's /Params, computed over the unwrapped payload bytes. pdf-lib emits + * /Size and /ModDate but never /CheckSum. Dates are rewritten with an explicit + * UTC offset so they agree with the XMP UTC dateTime (PDF/A-3 date hygiene). + */ +function setEmbeddedFileChecksums( + pdfDoc: PDFDocument, + payloadBytesByName: Map, + created: Date, + modified: Date, +): void { + const afRaw = pdfDoc.catalog.get(PDFName.of('AF')); + if (!afRaw) return; + const afArray = pdfDoc.context.lookup(afRaw, PDFArray); + + for (let i = 0; i < afArray.size(); i += 1) { + const filespec = pdfDoc.context.lookup(afArray.get(i), PDFDict); + const name = filespecName(filespec); + const efRaw = filespec.get(PDFName.of('EF')); + if (!efRaw) continue; + const efDict = pdfDoc.context.lookup(efRaw, PDFDict); + const streamRef = efDict.get(PDFName.of('F')) ?? 
efDict.get(PDFName.of('UF')); + const stream = pdfDoc.context.lookup(streamRef); + if (!(stream instanceof PDFStream)) continue; + + const bytes = name ? payloadBytesByName.get(name) : undefined; + if (!bytes) continue; + + let params = stream.dict.get(PARAMS); + if (!(params instanceof PDFDict)) { + params = pdfDoc.context.obj({}); + stream.dict.set(PARAMS, params); + } + const paramsDict = params as PDFDict; + paramsDict.set(CHECKSUM, PDFHexString.of(md5Hex(bytes))); + paramsDict.set(CREATION_DATE, PDFString.of(pdfDate(created))); + paramsDict.set(MOD_DATE, PDFString.of(pdfDate(modified))); + } +} + +function filespecName(filespec: PDFDict): string | undefined { + const uf = filespec.get(PDFName.of('UF')); + if (uf instanceof PDFHexString) return uf.decodeText(); + if (uf instanceof PDFString) return uf.asString(); + const f = filespec.get(PDFName.of('F')); + if (f instanceof PDFString) return f.asString(); + if (f instanceof PDFHexString) return f.decodeText(); + return undefined; +} + +/** + * Set the document Info CreationDate/ModDate to the cv created/modified values + * with an explicit UTC offset so they match the XMP UTC dateTime. + */ +function setInfoDates(pdfDoc: PDFDocument, created: Date, modified: Date): void { + const info = pdfDoc.context.lookup(pdfDoc.context.trailerInfo.Info, PDFDict); + info.set(CREATION_DATE, PDFString.of(pdfDate(created))); + info.set(MOD_DATE, PDFString.of(pdfDate(modified))); +} + +/** PDF date string in UTC with an explicit "+00'00'" offset (ISO 32000 §7.9.4). 
*/ +function pdfDate(d: Date): string { + const p = (n: number, w = 2): string => String(n).padStart(w, '0'); + return ( + `D:${p(d.getUTCFullYear(), 4)}${p(d.getUTCMonth() + 1)}${p(d.getUTCDate())}` + + `${p(d.getUTCHours())}${p(d.getUTCMinutes())}${p(d.getUTCSeconds())}+00'00'` + ); +} + function addPdfaOutputIntent(pdfDoc: PDFDocument): void { const existing = pdfDoc.catalog.lookup(PDFName.of('OutputIntents')); if (existing instanceof PDFArray && existing.size() > 0) { @@ -169,6 +252,7 @@ function collectPayloads(input: PackInput): { payloads: Payload[]; embeddingSumm const seen = new Set(); for (const p of out) { + assertPortableName(p.name); if (seen.has(p.name)) { throw new Error(`Duplicate payload name: ${p.name}`); } @@ -177,6 +261,24 @@ function collectPayloads(input: PackInput): { payloads: Payload[]; embeddingSumm return { payloads: out, embeddingSummaries }; } +/** Matches the POSIX-portable filename charset required by spec §4.4. */ +export const PORTABLE_NAME_RE = /^[A-Za-z0-9._/-]+$/; + +/** + * Reject payload names that are not POSIX-portable (spec §4.4) or that contain + * "." / ".." path segments, which would allow path traversal on extraction. + */ +export function assertPortableName(name: string): void { + if (!PORTABLE_NAME_RE.test(name)) { + throw new Error(`Payload name "${name}" is not POSIX-portable; allowed charset is [A-Za-z0-9._/-] (spec §4.4)`); + } + for (const segment of name.split('/')) { + if (segment === '.' 
|| segment === '..') { + throw new Error(`Payload name "${name}" contains a "${segment}" path segment (spec §4.4)`); + } + } +} + function resolveEmbeddings( input: PackInput['embeddings'], summaryOut: EmbeddingSpaceSummary[], diff --git a/packages/sdk-js/src/pdf.ts b/packages/sdk-js/src/pdf.ts index abb4f90..822b1cc 100644 --- a/packages/sdk-js/src/pdf.ts +++ b/packages/sdk-js/src/pdf.ts @@ -4,6 +4,7 @@ import { PDFDocument, PDFHexString, PDFName, + PDFNumber, PDFRawStream, PDFStream, PDFString, @@ -36,6 +37,10 @@ export async function loadDocument(bytes: Uint8Array): Promise { return PDFDocument.load(bytes, { updateMetadata: false, throwOnInvalidObject: false, + // Parse encrypted files instead of throwing, so the validator can surface + // the authoritative /Encrypt trailer entry as the documented spec-§3.4 + // error code rather than a generic parse failure. + ignoreEncryption: true, }); } @@ -217,9 +222,13 @@ function decodeStream(stream: PDFStream): Uint8Array | null { } } + const decodeParms = collectDecodeParms(stream.dict, filters.length); + let bytes: Uint8Array = raw; - for (const f of filters) { + for (let i = 0; i < filters.length; i += 1) { + const f = filters[i]!; if (f === 'FlateDecode') { + assertNoPredictor(decodeParms[i]); bytes = pako.inflate(bytes); } else { return null; @@ -228,6 +237,44 @@ function decodeStream(stream: PDFStream): Uint8Array | null { return bytes; } +const DECODE_PARMS_NAME = PDFName.of('DecodeParms'); +const PREDICTOR_NAME = PDFName.of('Predictor'); + +/** + * Returns the /DecodeParms dictionary that applies to each filter, in filter + * order. /DecodeParms may be a single dict (one filter) or an array aligned + * with /Filter; missing entries are undefined (no parameters). 
+ */ +function collectDecodeParms(dict: PDFDict, filterCount: number): (PDFDict | undefined)[] { + const out: (PDFDict | undefined)[] = new Array(filterCount).fill(undefined); + const parms = dict.get(DECODE_PARMS_NAME); + if (parms instanceof PDFDict) { + out[0] = parms; + } else if (parms instanceof PDFArray) { + for (let i = 0; i < parms.size() && i < filterCount; i += 1) { + const entry = parms.get(i); + if (entry instanceof PDFDict) out[i] = entry; + } + } + return out; +} + +/** + * FlateDecode streams may declare a PNG/TIFF /Predictor in /DecodeParms. This + * SDK does not implement predictor reversal, so rather than silently returning + * garbage we reject such streams with a clear, actionable error. + */ +function assertNoPredictor(parms: PDFDict | undefined): void { + if (!parms) return; + const predictor = parms.get(PREDICTOR_NAME); + if (predictor instanceof PDFNumber && predictor.asNumber() > 1) { + throw new Error( + `Unsupported FlateDecode /DecodeParms /Predictor ${predictor.asNumber()}; ` + + 'PNG/TIFF predictors are not supported by this SDK', + ); + } +} + export function toExtractedPayload(raw: RawPayload, language?: string): ExtractedPayload { const out: ExtractedPayload = { name: raw.name, diff --git a/packages/sdk-js/src/security.ts b/packages/sdk-js/src/security.ts index 873b1d0..7b77f71 100644 --- a/packages/sdk-js/src/security.ts +++ b/packages/sdk-js/src/security.ts @@ -1,28 +1,27 @@ -import { PDFArray, PDFDict, PDFDocument, PDFHexString, PDFName, PDFObject, PDFString } from 'pdf-lib'; +import { PDFArray, PDFDict, PDFDocument, PDFHexString, PDFName, PDFObject, PDFRef, PDFString } from 'pdf-lib'; import type { ValidationIssue } from './types.js'; const TYPE = PDFName.of('Type'); -const SUBTYPE = PDFName.of('Subtype'); const S_KEY = PDFName.of('S'); const JS_KEY = PDFName.of('JS'); const JAVASCRIPT_KEY = PDFName.of('JavaScript'); const F_KEY = PDFName.of('F'); const UF_KEY = PDFName.of('UF'); const EF_KEY = PDFName.of('EF'); -const 
NAMES_KEY = PDFName.of('Names'); -const ACTION = PDFName.of('Action'); -const FILESPEC = PDFName.of('Filespec'); - -const SUBMIT_FORM = PDFName.of('SubmitForm'); -const LAUNCH = PDFName.of('Launch'); -const IMPORT_DATA = PDFName.of('ImportData'); -const JS_ACTION = PDFName.of('JavaScript'); +const SUBMIT_FORM = 'SubmitForm'; +const LAUNCH = 'Launch'; +const IMPORT_DATA = 'ImportData'; +const JS_ACTION = 'JavaScript'; /** - * Walk the entire indirect-object graph and report any construct prohibited - * by the .cv spec §3.4. Each rule maps to a stable error code so consumers - * can pattern-match without parsing free-text messages. + * Walk the entire object graph from the catalog and report any construct + * prohibited by the .cv spec §3.4. The walk descends through every PDFDict and + * PDFArray value, resolving indirect references, so that forbidden actions + * carried as DIRECT/inline children (e.g. catalog /OpenAction, page /Annots/A, + * /AA, AcroForm field actions) are caught as well as indirect ones. Each rule + * maps to a stable error code so consumers can pattern-match without parsing + * free-text messages. Mirrors the Python reference impl in _security.py. 
*/ export function scanForbiddenConstructs(pdfDoc: PDFDocument): ValidationIssue[] { const issues: ValidationIssue[] = []; @@ -35,44 +34,58 @@ export function scanForbiddenConstructs(pdfDoc: PDFDocument): ValidationIssue[] }); } - for (const [, obj] of pdfDoc.context.enumerateIndirectObjects()) { - if (!(obj instanceof PDFDict)) continue; - inspectDict(pdfDoc, obj, issues); - } + const seen = new Set(); + walk(pdfDoc, pdfDoc.catalog, seen, issues); return dedupe(issues); } +function walk(pdfDoc: PDFDocument, value: PDFObject | undefined, seen: Set, issues: ValidationIssue[]): void { + const obj = resolve(pdfDoc, value); + if (obj === undefined || seen.has(obj)) return; + seen.add(obj); + + if (obj instanceof PDFDict) { + inspectDict(pdfDoc, obj, issues); + for (const [, child] of obj.entries()) { + walk(pdfDoc, child, seen, issues); + } + } else if (obj instanceof PDFArray) { + for (let i = 0; i < obj.size(); i += 1) { + walk(pdfDoc, obj.get(i), seen, issues); + } + } +} + function inspectDict(pdfDoc: PDFDocument, dict: PDFDict, issues: ValidationIssue[]): void { const type = nameOf(dict.get(TYPE)); + const subtype = nameOf(dict.get(S_KEY)); - if (type === 'Action' || dict.get(S_KEY) instanceof PDFName) { - inspectAction(pdfDoc, dict, issues); + if (type === 'Action' || subtype) { + inspectAction(pdfDoc, dict, subtype, issues); } if (type === 'Filespec') { inspectFilespec(dict, issues); } - // /Names tree for document-level JavaScript: catalog→/Names→/JavaScript - // surfaces as a dict with a JavaScript key whose entry is a name tree. - // Any presence of /JavaScript on a Names dict is forbidden. 
- const namesEntry = dict.get(NAMES_KEY); - if (dict.get(JAVASCRIPT_KEY) || (namesEntry instanceof PDFDict && namesEntry.get(JAVASCRIPT_KEY))) { - if (!issues.some((i) => i.code === 'javascript-names-tree')) { - issues.push({ - code: 'javascript-names-tree', - level: 'error', - message: 'Document declares /JavaScript names entries; JavaScript actions are forbidden (spec §3.4)', - }); - } + // A /JavaScript entry on any dict (catalog→/Names→/JavaScript name tree, or + // the leaf nodes thereof) signals document-level JavaScript, which is forbidden. + if (dict.get(JAVASCRIPT_KEY) !== undefined) { + issues.push({ + code: 'javascript-names-tree', + level: 'error', + message: 'Document declares /JavaScript names entries; JavaScript actions are forbidden (spec §3.4)', + }); } } -function inspectAction(pdfDoc: PDFDocument, dict: PDFDict, issues: ValidationIssue[]): void { - const subtype = dict.get(S_KEY); - if (!(subtype instanceof PDFName)) return; - +function inspectAction( + pdfDoc: PDFDocument, + dict: PDFDict, + subtype: string | undefined, + issues: ValidationIssue[], +): void { if (subtype === JS_ACTION || dict.get(JS_KEY) !== undefined) { issues.push({ code: 'javascript-action', @@ -101,7 +114,7 @@ function inspectAction(pdfDoc: PDFDocument, dict: PDFDict, issues: ValidationIss } if (subtype === SUBMIT_FORM) { - const fEntry = pdfDoc.context.lookup(dict.get(F_KEY)); + const fEntry = resolve(pdfDoc, dict.get(F_KEY)); const target = filespecTarget(fEntry); if (!target || !target.toLowerCase().startsWith('mailto:')) { issues.push({ @@ -119,16 +132,13 @@ function inspectFilespec(dict: PDFDict, issues: ValidationIssue[]): void { if (dict.get(EF_KEY) !== undefined) return; // No /EF means the filespec points outside the container. const target = filespecTarget(dict); - if (!issues.some((i) => i.code === 'external-filespec' && i.payload === target)) { - issues.push({ - code: 'external-filespec', - level: 'error', - message: target - ? 
`External /Filespec "${target}" (spec §3.4)` - : 'External /Filespec with no /EF (spec §3.4)', - payload: target, - }); - } + const issue: ValidationIssue = { + code: 'external-filespec', + level: 'error', + message: target ? `External /Filespec "${target}" (spec §3.4)` : 'External /Filespec with no /EF (spec §3.4)', + }; + if (target !== undefined) issue.payload = target; + issues.push(issue); } function filespecTarget(value: PDFObject | undefined): string | undefined { @@ -155,6 +165,14 @@ function filespecTarget(value: PDFObject | undefined): string | undefined { return undefined; } +function resolve(pdfDoc: PDFDocument, value: PDFObject | undefined): PDFObject | undefined { + if (value === undefined) return undefined; + if (value instanceof PDFRef) { + return pdfDoc.context.lookup(value) ?? undefined; + } + return value; +} + function nameOf(value: PDFObject | undefined): string | undefined { return value instanceof PDFName ? value.asString().slice(1) : undefined; } diff --git a/packages/sdk-js/src/validate.ts b/packages/sdk-js/src/validate.ts index b8e7c82..90b4b41 100644 --- a/packages/sdk-js/src/validate.ts +++ b/packages/sdk-js/src/validate.ts @@ -1,5 +1,7 @@ +import { CV_SPEC_VERSION } from './constants.js'; import { sha256Hex } from './digest.js'; import { toUint8Array } from './normalize.js'; +import { PORTABLE_NAME_RE } from './pack.js'; import { loadDocument, readAssociatedFiles, readMetadataXml } from './pdf.js'; import { scanForbiddenConstructs } from './security.js'; import type { BinaryInput, ValidationIssue, ValidationLevel, ValidationReport } from './types.js'; @@ -20,15 +22,6 @@ export async function validate(input: BinaryInput, opts: ValidateOptions = {}): const issues: ValidationIssue[] = []; const bytes = await toUint8Array(input); - if (looksEncrypted(bytes)) { - issues.push({ - code: 'encrypted-document', - level: 'error', - message: 'Trailer declares /Encrypt; encryption is forbidden in cv 0.x (spec §3.4)', - }); - return { ok: false, 
level, issues }; - } - let pdfDoc; try { pdfDoc = await loadDocument(bytes); @@ -37,6 +30,17 @@ export async function validate(input: BinaryInput, opts: ValidateOptions = {}): return { ok: false, level, issues }; } + // An /Encrypt trailer entry is authoritative; encrypted files carry encrypted + // streams that cannot be meaningfully inspected, so reject immediately. + if (pdfDoc.context.trailerInfo.Encrypt) { + issues.push({ + code: 'encrypted-document', + level: 'error', + message: 'Document declares an /Encrypt dictionary; encryption is forbidden in cv 0.x (spec §3.4)', + }); + return { ok: false, level, issues }; + } + for (const issue of scanForbiddenConstructs(pdfDoc)) { issues.push(issue); } @@ -53,6 +57,9 @@ export async function validate(input: BinaryInput, opts: ValidateOptions = {}): return { ok: false, level, issues }; } + const newerVersionIssue = checkVersion(meta.version); + if (newerVersionIssue) issues.push(newerVersionIssue); + const payloads = readAssociatedFiles(pdfDoc); if (payloads.length === 0) { issues.push({ code: 'no-payloads', level: 'error', message: 'No /AF Associated Files present' }); @@ -67,6 +74,14 @@ export async function validate(input: BinaryInput, opts: ValidateOptions = {}): payload: payload.name, }); } + if (!isPortableName(payload.name)) { + issues.push({ + code: 'filename-not-portable', + level: 'error', + message: `Payload name "${payload.name}" is not POSIX-portable (spec §4.4)`, + payload: payload.name, + }); + } } if (!payloads.some((p) => p.name === meta.primaryPayload)) { @@ -120,14 +135,34 @@ export async function validate(input: BinaryInput, opts: ValidateOptions = {}): return { ok, level, issues }; } +function isPortableName(name: string): boolean { + if (!PORTABLE_NAME_RE.test(name)) return false; + return name.split('/').every((segment) => segment !== '.' && segment !== '..'); +} + +/** + * The highest cv MAJOR version this SDK fully understands. 
The 0.x pre-stable + * series and the 1.x stable series are normatively identical (spec §12), so the + * SDK knows both; a MAJOR of 2 or greater is "newer". + */ +const KNOWN_MAJOR = 1; + /** - * Byte-level pre-check for an /Encrypt trailer entry. pdf-lib refuses to - * parse encrypted PDFs at load time, so without this the validator would - * surface a generic parse failure instead of the documented spec-§3.4 code. + * Emit a "newer-format-version" warning when the file's cv:version MAJOR exceeds + * what this SDK knows (spec §8.3). Both "0.1" and "1.0" are known; only a MAJOR + * of 2 or greater warns. Extraction is never blocked: this is a warning only. */ -function looksEncrypted(bytes: Uint8Array): boolean { - // Search the last 4 KiB where the trailer lives. - const tail = bytes.subarray(Math.max(0, bytes.length - 4096)); - const text = new TextDecoder('latin1').decode(tail); - return /\/Encrypt\b/.test(text); +function checkVersion(version: string): ValidationIssue | null { + const major = parseMajor(version); + if (major === null || major <= KNOWN_MAJOR) return null; + return { + code: 'newer-format-version', + level: 'warning', + message: `cv:version "${version}" has a newer MAJOR than this SDK knows (${CV_SPEC_VERSION}); rendering may be incomplete (spec §8.3)`, + }; +} + +function parseMajor(version: string): number | null { + const major = Number.parseInt(version.split('.')[0] ?? '', 10); + return Number.isNaN(major) ? 
null : major; } diff --git a/packages/sdk-js/tests/fixes.test.ts b/packages/sdk-js/tests/fixes.test.ts new file mode 100644 index 0000000..67c2551 --- /dev/null +++ b/packages/sdk-js/tests/fixes.test.ts @@ -0,0 +1,256 @@ +import { encode as cborEncode } from 'cbor-x'; +import { PDFDocument, PDFName, PDFNumber, PDFString, StandardFonts } from 'pdf-lib'; +import * as pako from 'pako'; +import { describe, expect, it } from 'vitest'; +import { decodeEmbeddings, encodeEmbeddings, pack, validate, type EmbeddingsPayload } from '../src/index.js'; + +async function blankPdf(text = 'Sample CV'): Promise { + const pdf = await PDFDocument.create(); + const page = pdf.addPage([300, 400]); + const font = await pdf.embedFont(StandardFonts.Helvetica); + page.drawText(text, { x: 30, y: 350, size: 18, font }); + return pdf.save(); +} + +const sampleEmbeddings: EmbeddingsPayload = { + formatVersion: 1, + spaces: [ + { + model: 'BAAI/bge-m3', + modelRevision: 'rev1', + dimension: 4, + metric: 'cosine', + normalized: true, + chunking: 'section', + chunks: [{ id: 'a', textOffset: 0, textLength: 10, vector: new Float32Array([0.1, 0.2, 0.3, 0.4]) }], + }, + ], +}; + +// Fix 1 — inline /OpenAction JavaScript action must be rejected. +describe('security: inline OpenAction JavaScript (fix 1)', () => { + it('rejects a catalog /OpenAction stored as a direct (inline) JavaScript action', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + + // Re-open and inject an INLINE OpenAction so it is NOT an indirect object. 
+ const doc = await PDFDocument.load(cv, { updateMetadata: false }); + const openAction = doc.context.obj({ + Type: 'Action', + S: 'JavaScript', + JS: PDFString.of('app.alert("pwned");'), + }); + doc.catalog.set(PDFName.of('OpenAction'), openAction); + const tampered = await doc.save({ useObjectStreams: false }); + + const report = await validate(tampered); + expect(report.ok).toBe(false); + expect(report.issues.map((i) => i.code)).toContain('javascript-action'); + }); + + it('still passes a clean file (no false positives)', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + const report = await validate(cv); + expect(report.ok).toBe(true); + }); +}); + +// Fix 2 — embedded-file /Params must carry an MD5 /CheckSum. +describe('pack: embedded-file /CheckSum (fix 2)', () => { + it('writes a /CheckSum entry into each embedded-file /Params', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + const text = new TextDecoder('latin1').decode(cv); + expect(text).toMatch(/\/CheckSum/); + }); +}); + +// Fix 3 — non-portable payload names rejected on write and flagged on read. 
+describe('filename portability (fix 3)', () => { + it('rejects path-traversal payload names at pack time', async () => { + await expect( + pack({ + pdf: await blankPdf(), + payloads: [{ data: 'x', name: '../../etc/passwd', mimeType: 'text/plain', relationship: 'Supplement' }], + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }), + ).rejects.toThrow(/portable|path segment/i); + }); + + it('rejects non-portable charset payload names at pack time', async () => { + await expect( + pack({ + pdf: await blankPdf(), + payloads: [{ data: 'x', name: 'résumé .md', mimeType: 'text/plain', relationship: 'Supplement' }], + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }), + ).rejects.toThrow(/portable/i); + }); + + it('flags a non-portable name on the read side', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + // Inject a filespec with a non-portable name directly into /AF. + const doc = await PDFDocument.load(cv, { updateMetadata: false }); + const stream = doc.context.flateStream(new TextEncoder().encode('bad'), { + Type: 'EmbeddedFile', + Subtype: 'text/plain', + }); + const streamRef = doc.context.register(stream); + const filespec = doc.context.obj({ + Type: 'Filespec', + F: PDFString.of('../evil.txt'), + UF: PDFString.of('../evil.txt'), + EF: { F: streamRef }, + AFRelationship: 'Supplement', + }); + const afRef = doc.context.register(filespec); + const af = doc.catalog.lookup(PDFName.of('AF')); + (af as { push(x: unknown): void }).push(afRef); + const tampered = await doc.save({ useObjectStreams: false }); + + const report = await validate(tampered); + expect(report.issues.map((i) => i.code)).toContain('filename-not-portable'); + expect(report.ok).toBe(false); + }); +}); + +// Fix 4 — decode-side validation mirrors encode-side guarantees. 
+describe('embeddings: decode validation (fix 4)', () => { + it('round-trips a valid space', () => { + const decoded = decodeEmbeddings(encodeEmbeddings(sampleEmbeddings)); + expect(decoded.spaces[0]!.metric).toBe('cosine'); + }); + + it('rejects a malformed space with an invalid metric from attacker CBOR', () => { + // Hand-build CBOR that bypasses the encode-side validation. + const malformed = cborEncode({ + 'format-version': 1, + spaces: [ + { + model: 'evil', + 'model-revision': 'r', + dimension: 4, + metric: 'totally-not-a-metric', + normalized: true, + chunking: 'section', + chunks: [{ id: 'a', 'text-offset': 0, 'text-length': 1, vector: new Uint8Array(16) }], + }, + ], + }); + expect(() => decodeEmbeddings(malformed)).toThrow(/metric/i); + }); + + it('rejects a malformed space with an invalid chunking', () => { + const malformed = cborEncode({ + 'format-version': 1, + spaces: [ + { + model: 'evil', + 'model-revision': 'r', + dimension: 4, + metric: 'cosine', + normalized: true, + chunking: 'invalid-chunking', + chunks: [{ id: 'a', 'text-offset': 0, 'text-length': 1, vector: new Uint8Array(16) }], + }, + ], + }); + expect(() => decodeEmbeddings(malformed)).toThrow(/chunking/i); + }); +}); + +// Fix 5 — FlateDecode /Predictor must be rejected, not silently mis-decoded. +describe('pdf: FlateDecode /Predictor rejection (fix 5)', () => { + it('rejects an embedded-file stream with /DecodeParms /Predictor 12', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + const doc = await PDFDocument.load(cv, { updateMetadata: false }); + + // Build a flate stream and slap a predictor on it, then add to /AF. 
+ const compressed = pako.deflate(new TextEncoder().encode('predicted')); + const stream = doc.context.stream(compressed, { + Type: 'EmbeddedFile', + Subtype: 'text/plain', + Filter: 'FlateDecode', + DecodeParms: doc.context.obj({ Predictor: PDFNumber.of(12), Columns: PDFNumber.of(4) }), + }); + const streamRef = doc.context.register(stream); + const filespec = doc.context.obj({ + Type: 'Filespec', + F: PDFString.of('predicted.txt'), + UF: PDFString.of('predicted.txt'), + EF: { F: streamRef }, + AFRelationship: 'Supplement', + }); + const afRef = doc.context.register(filespec); + const af = doc.catalog.lookup(PDFName.of('AF')); + (af as { push(x: unknown): void }).push(afRef); + const tampered = await doc.save({ useObjectStreams: false }); + + await expect(validate(tampered)).rejects.toThrow(/Predictor/i); + }); +}); + +// Fix 6 — newer MAJOR cv:version surfaces a warning but does not block. +describe('validate: newer-format-version warning (fix 6)', () => { + it('warns (but stays ok) when cv:version MAJOR is 2', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + const text = new TextDecoder('latin1').decode(cv); + expect(text).toContain('0.1'); + + // Rewrite the cv:version to a future major. The XMP stream is uncompressed + // metadata so we can patch the bytes in place (same byte length). + const patched = patchBytes(cv, '0.1', '2.0'); + const report = await validate(patched); + expect(report.issues.map((i) => i.code)).toContain('newer-format-version'); + const versionIssue = report.issues.find((i) => i.code === 'newer-format-version'); + expect(versionIssue?.level).toBe('warning'); + // A newer-version warning alone must not block (extraction still works). 
+ expect(report.ok).toBe(true); + }); + + it('does not warn for known versions 0.1 and 1.0', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + const v10 = patchBytes(cv, '0.1', '1.0'); + const report = await validate(v10); + expect(report.issues.map((i) => i.code)).not.toContain('newer-format-version'); + }); +}); + +function patchBytes(bytes: Uint8Array, from: string, to: string): Uint8Array { + if (from.length !== to.length) throw new Error('patchBytes requires equal lengths'); + const text = new TextDecoder('latin1').decode(bytes); + const idx = text.indexOf(from); + if (idx < 0) throw new Error(`pattern not found: ${from}`); + const out = new Uint8Array(bytes); + const enc = new TextEncoder().encode(to); + out.set(enc, idx); + return out; +} diff --git a/packages/sdk-js/tests/fixtures/python-produced.cv b/packages/sdk-js/tests/fixtures/python-produced.cv index 24f0566..d850014 100644 Binary files a/packages/sdk-js/tests/fixtures/python-produced.cv and b/packages/sdk-js/tests/fixtures/python-produced.cv differ diff --git a/packages/sdk-js/tests/interop.test.ts b/packages/sdk-js/tests/interop.test.ts index 3aa9445..c2e53ff 100644 --- a/packages/sdk-js/tests/interop.test.ts +++ b/packages/sdk-js/tests/interop.test.ts @@ -47,11 +47,19 @@ describe('JS reads Python-produced .cv (interop)', () => { expect(await extractHtml(bytes)).toBe(PY_HTML); }); - it('extracts all three Python-produced payloads', async () => { + it('extracts all Python-produced payloads (incl. 
precomputed embeddings)', async () => { const bytes = new Uint8Array(await readFile(FIXTURE)); const file = await extract(bytes); const names = file.payloads.map((p) => p.name).sort(); - expect(names).toEqual(['resume.html', 'resume.json', 'resume.md']); + expect(names).toEqual(['embeddings.cbor', 'resume.html', 'resume.json', 'resume.md']); + }); + + it('surfaces the embedding-space summary in metadata', async () => { + const bytes = new Uint8Array(await readFile(FIXTURE)); + const meta = await inspect(bytes); + expect(meta.embeddings.length).toBeGreaterThan(0); + expect(meta.embeddings[0].chunks).toBeGreaterThan(0); + expect(meta.embeddings[0].dimension).toBeGreaterThan(0); }); it('validates the Python-produced file', async () => { diff --git a/packages/server-middleware-node/examples/public/jane.cv b/packages/server-middleware-node/examples/public/jane.cv index ab9a10a..0f6f8e7 100644 Binary files a/packages/server-middleware-node/examples/public/jane.cv and b/packages/server-middleware-node/examples/public/jane.cv differ diff --git a/packages/server-middleware-node/src/conneg.ts b/packages/server-middleware-node/src/conneg.ts index 451b452..18b6e15 100644 --- a/packages/server-middleware-node/src/conneg.ts +++ b/packages/server-middleware-node/src/conneg.ts @@ -4,6 +4,7 @@ export interface NegotiationInput { accept?: string | undefined; acceptLanguage?: string | undefined; formatQuery?: string | undefined; + defaultFormat?: ServeFormat | undefined; } export interface NegotiationResult { @@ -11,14 +12,9 @@ export interface NegotiationResult { language: string | undefined; } -const FORMAT_BY_MIME: Record = { - 'text/markdown': 'markdown', - 'text/x-markdown': 'markdown', - 'text/html': 'html', - 'application/xhtml+xml': 'html', - 'application/pdf': 'pdf', - 'application/vnd.cv+pdf': 'pdf', -}; +const MARKDOWN_MIMES = new Set(['text/markdown', 'text/x-markdown', 'application/vnd.cv+markdown']); +const PDF_MIMES = new Set(['application/pdf', 
'application/vnd.cv+pdf']); +const HTML_MIMES = new Set(['text/html', 'application/xhtml+xml']); const FORMAT_BY_QUERY: Record = { md: 'markdown', @@ -37,19 +33,34 @@ export function parseAccept(header: string | undefined | null): ParsedAccept[] { if (!header) return []; return header .split(',') - .map((part) => { + .map((part): ParsedAccept | null => { const [type, ...params] = part.trim().split(';').map((s) => s.trim()); - let q = 1; - for (const p of params) { - const m = p.match(/^q\s*=\s*(\d*\.?\d+)/i); - if (m) q = Number(m[1]); - } + const q = parseQ(params); + // A malformed q (present but unparseable) marks the type as unusable per RFC 9110. + if (q === null) return null; return { type: (type ?? '').toLowerCase(), q }; }) - .filter((p) => p.type) + .filter((p): p is ParsedAccept => p !== null && p.type !== '' && p.q > 0) .sort((a, b) => b.q - a.q); } +/** + * Resolve the q-value of a media-range's parameters. + * Returns 1 when absent, the clamped [0,1] value when valid, and null when a + * q parameter is present but cannot be parsed (signalling a malformed type). + */ +function parseQ(params: string[]): number | null { + for (const p of params) { + if (!/^q\s*=/i.test(p)) continue; + const m = p.match(/^q\s*=\s*(\d*\.?\d+)\s*$/i); + if (!m) return null; + const value = Number(m[1]); + if (Number.isNaN(value)) return null; + return Math.min(1, Math.max(0, value)); + } + return 1; +} + export function parseAcceptLanguage(header: string | undefined | null): string[] { if (!header) return []; return header @@ -71,6 +82,7 @@ export function parseAcceptLanguage(header: string | undefined | null): string[] export function negotiate(input: NegotiationInput): NegotiationResult { const language = parseAcceptLanguage(input.acceptLanguage)[0]; + // An explicit ?format= query is the only override and wins over Accept. 
if (input.formatQuery) { const fromQuery = FORMAT_BY_QUERY[input.formatQuery.toLowerCase()]; if (fromQuery) { @@ -78,21 +90,52 @@ export function negotiate(input: NegotiationInput): NegotiationResult { } } - const accepts = parseAccept(input.accept); - for (const a of accepts) { - const direct = FORMAT_BY_MIME[a.type]; - if (direct) { - return { format: direct, language }; - } - if (a.type === '*/*' || a.type === 'application/*') { - return { format: 'pdf', language }; - } - if (a.type === 'text/*') { - return { format: 'html', language }; - } + const fromAccept = negotiateFromAccept(input.accept); + const format = fromAccept ?? input.defaultFormat ?? 'pdf'; + return { format, language }; +} + +/** + * Map an Accept header to a format following the .cv contract: + * - markdown only when it is an explicit, top, non-wildcard preference; + * - html when an HTML type (or text/*) is a top preference and no catch-all wildcard is present (a deliberate fetch); + * - pdf for an explicit top PDF type, or whenever a catch-all wildcard is present (the browser case); + * - undefined when the header expresses no usable preference (caller falls back). + */ +function negotiateFromAccept(header: string | undefined): ServeFormat | undefined { + const accepts = parseAccept(header); + if (accepts.length === 0) return undefined; + + const topQ = accepts[0]!.q; + const top = accepts.filter((a) => a.q === topQ); + const hasWildcard = accepts.some((a) => a.type === '*/*' || a.type === 'application/*'); + + // Markdown wins only as an explicit, top, non-wildcard preference. + if (top.some((a) => MARKDOWN_MIMES.has(a.type))) { + return 'markdown'; + } + + // An explicit, top preference for the PDF type also serves PDF. + if (top.some((a) => PDF_MIMES.has(a.type))) { + return 'pdf'; + } + + // A deliberate HTML fetch: text/html requested without a catch-all wildcard. + if (top.some((a) => HTML_MIMES.has(a.type)) && !hasWildcard) { + return 'html'; + } + + // Browser case (text/html + */*) or any wildcard: serve the visual PDF. 
+ if (hasWildcard || top.some((a) => HTML_MIMES.has(a.type))) { + return 'pdf'; + } + + // text/* (without a more specific match) is a deliberate text fetch -> html. + if (top.some((a) => a.type === 'text/*')) { + return 'html'; } - return { format: 'pdf', language }; + return undefined; } export interface BuildLinkHeaderInput { diff --git a/packages/server-middleware-node/src/fastify.ts b/packages/server-middleware-node/src/fastify.ts index 7710749..6104903 100644 --- a/packages/server-middleware-node/src/fastify.ts +++ b/packages/server-middleware-node/src/fastify.ts @@ -12,10 +12,12 @@ export const cvFastifyPlugin: FastifyPluginAsync = async ( const handler = cvHandler(opts); const route = `${opts.prefix ?? ''}/*`; fastify.get(route, async (request: FastifyRequest, reply: FastifyReply) => { - if (!request.url.toLowerCase().includes('.cv')) { + const { pathname } = new URL(request.url, `http://${request.headers.host ?? 'localhost'}`); + if (!decodeURIComponent(pathname).toLowerCase().endsWith('.cv')) { reply.code(404).send('Not found'); return; } + reply.hijack(); await handler(request.raw, reply.raw); }); }; diff --git a/packages/server-middleware-node/src/handler.ts b/packages/server-middleware-node/src/handler.ts index 82f57bc..cd3e053 100644 --- a/packages/server-middleware-node/src/handler.ts +++ b/packages/server-middleware-node/src/handler.ts @@ -2,14 +2,14 @@ import type { IncomingMessage, ServerResponse } from 'node:http'; import { readFile, stat } from 'node:fs/promises'; import { normalize, resolve, sep } from 'node:path'; import { isCvFile } from '@cvfile/sdk'; -import { buildLinkHeader, PDF_PRIMARY_MIME } from './conneg.js'; -import { serveCv } from './serve.js'; +import type { ServeFormat } from './conneg.js'; +import { buildCvResponse } from './response.js'; export interface CvHandlerOptions { root?: string; loader?: (logicalPath: string) => Promise; cacheControl?: string; - defaultFormat?: 'pdf' | 'markdown' | 'html'; + defaultFormat?: 
ServeFormat; } export type CvHandler = (req: IncomingMessage, res: ServerResponse) => Promise; @@ -26,45 +26,45 @@ export function cvHandler(options: CvHandlerOptions = {}): CvHandler { try { const url = new URL(req.url ?? '/', `http://${req.headers.host ?? 'localhost'}`); const logical = decodeURIComponent(url.pathname); - const formatQuery = url.searchParams.get('format') ?? defaultFormat; - const bytes = await load(logical, { baseRoot, loader }); - if (!bytes) { + const loaded = await load(logical, { baseRoot, loader }); + if (!loaded) { res.statusCode = 404; res.end('Not found'); return; } - if (!(await isCvFile(bytes))) { + if (!(await isCvFile(loaded.bytes))) { res.statusCode = 415; res.end('Not a .cv file'); return; } - const result = await serveCv({ - bytes, + const built = await buildCvResponse({ + bytes: loaded.bytes, + selfUrl: url.pathname, accept: req.headers['accept'], acceptLanguage: req.headers['accept-language'], - formatQuery: formatQuery ?? undefined, + formatQuery: url.searchParams.get('format') ?? undefined, + defaultFormat, + cacheControl, + lastModified: loaded.lastModified, + ifNoneMatch: req.headers['if-none-match'], + ifModifiedSince: req.headers['if-modified-since'], }); - const link = buildLinkHeader({ selfUrl: url.pathname, cvMime: PDF_PRIMARY_MIME }); - - res.setHeader('Content-Type', result.contentType); - res.setHeader('Content-Length', String(result.body.length)); - res.setHeader('Vary', 'Accept, Accept-Language'); - res.setHeader('Link', link); - res.setHeader('Cache-Control', cacheControl); - if (result.language) { - res.setHeader('Content-Language', result.language); + for (const [name, value] of Object.entries(built.headers)) { + res.setHeader(name, value); + } + res.statusCode = built.status; + res.end(built.status === 304 ? 
undefined : Buffer.from(built.body)); + } catch { + if (res.headersSent) { + res.end(); + return; } - const filename = filenameForFormat(logical, result.format); - res.setHeader('Content-Disposition', `inline; filename="${filename}"`); - res.statusCode = 200; - res.end(Buffer.from(result.body)); - } catch (err) { res.statusCode = 500; - res.end(`cvHandler error: ${(err as Error).message}`); + res.end('Internal Server Error'); } }; } @@ -74,10 +74,15 @@ interface LoadOpts { loader?: ((logicalPath: string) => Promise) | undefined; } -async function load(logicalPath: string, { baseRoot, loader }: LoadOpts): Promise { +interface LoadedFile { + bytes: Uint8Array; + lastModified?: Date | undefined; +} + +async function load(logicalPath: string, { baseRoot, loader }: LoadOpts): Promise { if (loader) { const bytes = await loader(logicalPath); - return bytes ?? null; + return bytes ? { bytes } : null; } if (!baseRoot) return null; const safe = normalize(logicalPath).replace(/^[/\\]+/, ''); @@ -88,7 +93,7 @@ async function load(logicalPath: string, { baseRoot, loader }: LoadOpts): Promis try { const s = await stat(full); if (!s.isFile()) return null; - return new Uint8Array(await readFile(full)); + return { bytes: new Uint8Array(await readFile(full)), lastModified: s.mtime }; } catch { return null; } @@ -99,12 +104,3 @@ function isWithin(parent: string, child: string): boolean { const base = resolve(parent); return rel === base || rel.startsWith(base + sep); } - -function filenameForFormat(logical: string, format: 'pdf' | 'markdown' | 'html'): string { - const base = logical.split('/').pop() ?? 
'document'; - const stem = base.replace(/\.cv$/i, '').replace(/\.(pdf|md|html)$/i, '') || 'document'; - if (format === 'markdown') return `${stem}.md`; - if (format === 'html') return `${stem}.html`; - return `${stem}.cv`; -} - diff --git a/packages/server-middleware-node/src/hono.ts b/packages/server-middleware-node/src/hono.ts index 5e2db6c..9932405 100644 --- a/packages/server-middleware-node/src/hono.ts +++ b/packages/server-middleware-node/src/hono.ts @@ -1,15 +1,16 @@ import type { Context, MiddlewareHandler } from 'hono'; import { isCvFile } from '@cvfile/sdk'; -import { buildLinkHeader, PDF_PRIMARY_MIME } from './conneg.js'; -import { serveCv } from './serve.js'; +import type { ServeFormat } from './conneg.js'; +import { buildCvResponse } from './response.js'; export interface CvHonoOptions { loader: (logicalPath: string) => Promise; cacheControl?: string; + defaultFormat?: ServeFormat; } export function cvHono(options: CvHonoOptions): MiddlewareHandler { - const { loader, cacheControl = 'public, max-age=300' } = options; + const { loader, cacheControl = 'public, max-age=300', defaultFormat } = options; return async (c: Context) => { const url = new URL(c.req.url); const logical = decodeURIComponent(url.pathname); @@ -21,22 +22,24 @@ export function cvHono(options: CvHonoOptions): MiddlewareHandler { if (!(await isCvFile(bytes))) { return c.text('Not a .cv file', 415); } - const result = await serveCv({ + + const built = await buildCvResponse({ bytes, + selfUrl: url.pathname, accept: c.req.header('accept'), acceptLanguage: c.req.header('accept-language'), formatQuery: c.req.query('format') ?? 
undefined, + defaultFormat, + cacheControl, + ifNoneMatch: c.req.header('if-none-match'), + ifModifiedSince: c.req.header('if-modified-since'), }); - const link = buildLinkHeader({ selfUrl: url.pathname, cvMime: PDF_PRIMARY_MIME }); - const headers: Record = { - 'Content-Type': result.contentType, - Vary: 'Accept, Accept-Language', - Link: link, - 'Cache-Control': cacheControl, - }; - if (result.language) headers['Content-Language'] = result.language; - const view = new Uint8Array(result.body.byteLength); - view.set(result.body); - return c.newResponse(view.buffer, 200, headers); + + if (built.status === 304) { + return c.body(null, 304, built.headers); + } + const view = new Uint8Array(built.body.byteLength); + view.set(built.body); + return c.newResponse(view.buffer, 200, built.headers); }; } diff --git a/packages/server-middleware-node/src/response.ts b/packages/server-middleware-node/src/response.ts new file mode 100644 index 0000000..0dfa208 --- /dev/null +++ b/packages/server-middleware-node/src/response.ts @@ -0,0 +1,161 @@ +import { createHash } from 'node:crypto'; +import { buildLinkHeader, PDF_PRIMARY_MIME, type ServeFormat } from './conneg.js'; +import { serveCv } from './serve.js'; + +export interface BuildResponseInput { + bytes: Uint8Array; + selfUrl: string; + accept?: string | undefined; + acceptLanguage?: string | undefined; + formatQuery?: string | undefined; + defaultFormat?: ServeFormat | undefined; + cacheControl: string; + lastModified?: Date | undefined; + ifNoneMatch?: string | undefined; + ifModifiedSince?: string | undefined; +} + +export interface BuiltResponse { + status: 200 | 304; + headers: Record; + format: ServeFormat; + body: Uint8Array; +} + +/** + * Negotiate the format and assemble the full set of response headers shared by + * every adapter. The same URL yields a different body per negotiated format, so + * the ETag is keyed on both the bytes and the format. 
Returns a 304 (empty body) + * when the conditional request headers match. + */ +export async function buildCvResponse(input: BuildResponseInput): Promise { + const result = await serveCv({ + bytes: input.bytes, + accept: input.accept, + acceptLanguage: input.acceptLanguage, + formatQuery: input.formatQuery, + defaultFormat: input.defaultFormat, + }); + + const etag = computeETag(result.body, result.format); + const lastModified = input.lastModified?.toUTCString(); + + const headers: Record = { + 'Content-Type': result.contentType, + Vary: 'Accept, Accept-Language', + Link: buildLinkHeader({ selfUrl: input.selfUrl, cvMime: PDF_PRIMARY_MIME }), + 'Cache-Control': input.cacheControl, + ETag: etag, + 'Content-Disposition': contentDisposition(input.selfUrl, result.format), + }; + if (result.language) { + headers['Content-Language'] = result.language; + } + if (lastModified) { + headers['Last-Modified'] = lastModified; + } + + const notModified = isNotModified({ + etag, + lastModified: input.lastModified, + ifNoneMatch: input.ifNoneMatch, + ifModifiedSince: input.ifModifiedSince, + }); + if (notModified) { + return { status: 304, headers, format: result.format, body: new Uint8Array(0) }; + } + + headers['Content-Length'] = String(result.body.length); + return { status: 200, headers, format: result.format, body: result.body }; +} + +/** Weak ETag keyed on the negotiated body so each format gets a distinct tag. 
*/ +function computeETag(body: Uint8Array, format: ServeFormat): string { + const hash = createHash('sha1').update(body).digest('base64url'); + return `W/"${format}-${body.length.toString(16)}-${hash}"`; +} + +interface NotModifiedInput { + etag: string; + lastModified?: Date | undefined; + ifNoneMatch?: string | undefined; + ifModifiedSince?: string | undefined; +} + +function isNotModified({ etag, lastModified, ifNoneMatch, ifModifiedSince }: NotModifiedInput): boolean { + if (ifNoneMatch) { + return etagMatches(ifNoneMatch, etag); + } + if (ifModifiedSince && lastModified) { + const since = Date.parse(ifModifiedSince); + if (!Number.isNaN(since)) { + // Compare at second resolution, matching HTTP-date granularity. + return Math.floor(lastModified.getTime() / 1000) <= Math.floor(since / 1000); + } + } + return false; +} + +function etagMatches(ifNoneMatch: string, etag: string): boolean { + if (ifNoneMatch.trim() === '*') return true; + const normalize = (tag: string): string => tag.trim().replace(/^W\//, ''); + const target = normalize(etag); + return ifNoneMatch.split(',').some((candidate) => normalize(candidate) === target); +} + +/** + * Build a header-injection-safe Content-Disposition value. Control characters, + * CR/LF, quotes and backslashes are stripped from the ASCII filename; non-ASCII + * names also get an RFC 5987 filename* form. + */ +export function contentDisposition(selfUrl: string, format: ServeFormat): string { + const filename = filenameForFormat(selfUrl, format); + const asciiSafe = sanitizeAsciiFilename(filename); + const base = `inline; filename="${asciiSafe}"`; + if (!hasNonAscii(filename)) return base; + return `${base}; filename*=UTF-8''${encodeRFC5987(filename)}`; +} + +/** + * Produce a safe quoted-string filename: drop control chars (CR/LF/DEL), + * double quotes and backslashes; replace any remaining non-ASCII with '_'. 
+ */ +function sanitizeAsciiFilename(filename: string): string { + let out = ''; + for (const ch of filename) { + const code = ch.codePointAt(0) ?? 0; + if (code < 0x20 || code === 0x7f) continue; // control chars incl. CR/LF + if (ch === '"' || ch === '\\') continue; // quoted-string delimiters + out += code > 0x7e ? '_' : ch; // collapse non-ASCII to underscore + } + return out || 'document'; +} + +function hasNonAscii(value: string): boolean { + for (const ch of value) { + const code = ch.codePointAt(0) ?? 0; + if (code < 0x20 || code > 0x7e) return true; + } + return false; +} + +function filenameForFormat(selfUrl: string, format: ServeFormat): string { + const pathname = selfUrl.split('?')[0] ?? selfUrl; + const base = decodeOrRaw(pathname.split('/').pop() ?? 'document'); + const stem = base.replace(/\.cv$/i, '').replace(/\.(pdf|md|html)$/i, '') || 'document'; + if (format === 'markdown') return `${stem}.md`; + if (format === 'html') return `${stem}.html`; + return `${stem}.cv`; +} + +function decodeOrRaw(value: string): string { + try { + return decodeURIComponent(value); + } catch { + return value; + } +} + +function encodeRFC5987(value: string): string { + return encodeURIComponent(value).replace(/['()*]/g, (c) => `%${c.charCodeAt(0).toString(16).toUpperCase()}`); +} diff --git a/packages/server-middleware-node/src/serve.ts b/packages/server-middleware-node/src/serve.ts index 52a1f80..07d368d 100644 --- a/packages/server-middleware-node/src/serve.ts +++ b/packages/server-middleware-node/src/serve.ts @@ -7,6 +7,7 @@ export interface ServeRequest { accept?: string | undefined; acceptLanguage?: string | undefined; formatQuery?: string | undefined; + defaultFormat?: ServeFormat | undefined; } export interface ServeResponse { @@ -23,6 +24,7 @@ export async function serveCv(req: ServeRequest): Promise { accept: req.accept, acceptLanguage: req.acceptLanguage, formatQuery: req.formatQuery, + defaultFormat: req.defaultFormat, }); if (decision.format === 'pdf') { @@ 
-42,7 +44,7 @@ export async function serveCv(req: ServeRequest): Promise { if (md) { return { format: 'markdown', - contentType: `text/markdown; charset=utf-8; cv-language=${md.language ?? preferLang}`, + contentType: 'text/markdown; charset=utf-8', ...(md.language !== undefined ? { language: md.language } : {}), body: md.bytes, }; @@ -55,7 +57,7 @@ export async function serveCv(req: ServeRequest): Promise { if (html) { return { format: 'html', - contentType: `text/html; charset=utf-8; cv-language=${html.language ?? preferLang}`, + contentType: 'text/html; charset=utf-8', ...(html.language !== undefined ? { language: html.language } : {}), body: html.bytes, }; diff --git a/packages/server-middleware-node/tests/conneg.test.ts b/packages/server-middleware-node/tests/conneg.test.ts index 9297015..8d3b0c2 100644 --- a/packages/server-middleware-node/tests/conneg.test.ts +++ b/packages/server-middleware-node/tests/conneg.test.ts @@ -17,6 +17,21 @@ describe('parseAccept', () => { expect(parseAccept(null)).toEqual([]); expect(parseAccept('')).toEqual([]); }); + + it('drops entries with q=0 (not acceptable)', () => { + const parsed = parseAccept('text/markdown;q=0, text/html;q=0.8'); + expect(parsed.map((p) => p.type)).toEqual(['text/html']); + }); + + it('clamps q to [0,1]', () => { + const parsed = parseAccept('text/html;q=5, text/markdown;q=0.5'); + expect(parsed[0]!.q).toBe(1); + }); + + it('skips a type whose q is malformed', () => { + const parsed = parseAccept('text/html;q=abc, text/markdown'); + expect(parsed.map((p) => p.type)).toEqual(['text/markdown']); + }); }); describe('parseAcceptLanguage', () => { @@ -35,10 +50,18 @@ describe('negotiate', () => { expect(negotiate({ accept: 'text/markdown' }).format).toBe('markdown'); }); - it('text/html returns html', () => { + it('deliberate text/html (no wildcard) returns html', () => { + expect(negotiate({ accept: 'text/html' }).format).toBe('html'); expect(negotiate({ accept: 'text/html,application/xhtml+xml' 
}).format).toBe('html'); }); + it('browser request (text/html + */*) returns pdf', () => { + expect(negotiate({ accept: 'text/html,*/*' }).format).toBe('pdf'); + expect( + negotiate({ accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' }).format, + ).toBe('pdf'); + }); + it('application/pdf returns pdf', () => { expect(negotiate({ accept: 'application/pdf' }).format).toBe('pdf'); }); @@ -47,20 +70,33 @@ describe('negotiate', () => { expect(negotiate({ accept: 'application/vnd.cv+pdf' }).format).toBe('pdf'); }); - it('q-values pick the highest preference', () => { - const r = negotiate({ accept: 'text/html;q=0.5, text/markdown;q=0.9' }); - expect(r.format).toBe('markdown'); + it('markdown only wins as a top, non-wildcard preference', () => { + expect(negotiate({ accept: 'text/html;q=0.5, text/markdown;q=0.9' }).format).toBe('markdown'); + // markdown not at the top and no wildcard present -> the deliberate text/html fetch wins (html) + expect(negotiate({ accept: 'text/html, text/markdown;q=0.5' }).format).toBe('html'); + }); + + it('q=0 markdown falls through to pdf', () => { + expect(negotiate({ accept: 'text/markdown;q=0' }).format).toBe('pdf'); }); - it('default is pdf when nothing matches', () => { + it('default is pdf when nothing usable matches', () => { expect(negotiate({}).format).toBe('pdf'); expect(negotiate({ accept: '*/*' }).format).toBe('pdf'); }); - it('text/* falls through to html', () => { + it('text/* falls through to html (deliberate text fetch)', () => { expect(negotiate({ accept: 'text/*' }).format).toBe('html'); }); + it('defaultFormat is the final fallback only, never overrides an explicit Accept', () => { + // explicit Accept beats defaultFormat + expect(negotiate({ accept: 'application/pdf', defaultFormat: 'markdown' }).format).toBe('pdf'); + // no Accept -> defaultFormat applies; a wildcard still counts as a usable preference (pdf) + expect(negotiate({ defaultFormat: 'markdown' }).format).toBe('markdown'); + expect(negotiate({ accept: '*/*', defaultFormat: 'markdown' }).format).toBe('pdf'); + }); + 
it('captures accept-language', () => { expect(negotiate({ accept: 'text/markdown', acceptLanguage: 'fr-CA, en;q=0.5' }).language).toBe('fr-ca'); }); diff --git a/packages/server-middleware-node/tests/handler.test.ts b/packages/server-middleware-node/tests/handler.test.ts index 051c63a..1c5371e 100644 --- a/packages/server-middleware-node/tests/handler.test.ts +++ b/packages/server-middleware-node/tests/handler.test.ts @@ -54,11 +54,29 @@ describe('cvHandler over HTTP', () => { expect(res.headers.get('content-type')).toBe('application/vnd.cv+pdf'); expect(res.headers.get('vary')).toBe('Accept, Accept-Language'); expect(res.headers.get('link')).toContain('text/markdown'); + expect(res.headers.get('content-disposition')).toBe('inline; filename="jane.cv"'); + expect(res.headers.get('etag')).toMatch(/^W\/"pdf-/); + expect(res.headers.get('last-modified')).toBeTruthy(); const body = new Uint8Array(await res.arrayBuffer()); const header = new TextDecoder().decode(body.slice(0, 4)); expect(header).toBe('%PDF'); }); + it('serves PDF for a real browser Accept (text/html + */*)', async () => { + const res = await fetch(`${baseUrl}/jane.cv`, { + headers: { accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' }, + }); + expect(res.status).toBe(200); + expect(res.headers.get('content-type')).toBe('application/vnd.cv+pdf'); + const body = new Uint8Array(await res.arrayBuffer()); + expect(new TextDecoder().decode(body.slice(0, 4))).toBe('%PDF'); + }); + + it('q=0 markdown falls through to PDF', async () => { + const res = await fetch(`${baseUrl}/jane.cv`, { headers: { accept: 'text/markdown;q=0' } }); + expect(res.headers.get('content-type')).toBe('application/vnd.cv+pdf'); + }); + it('serves markdown when Accept is text/markdown', async () => { const res = await fetch(`${baseUrl}/jane.cv`, { headers: { accept: 'text/markdown' } }); expect(res.status).toBe(200); @@ -81,6 +99,39 @@ describe('cvHandler over HTTP', () => { expect(await 
res.text()).toBe(SAMPLE_MD); }); + it('markdown Content-Type carries no cv-language parameter', async () => { + const res = await fetch(`${baseUrl}/jane.cv`, { headers: { accept: 'text/markdown' } }); + expect(res.headers.get('content-type')).toBe('text/markdown; charset=utf-8'); + }); + + it('ETag varies by negotiated format', async () => { + const pdf = await fetch(`${baseUrl}/jane.cv`); + const md = await fetch(`${baseUrl}/jane.cv`, { headers: { accept: 'text/markdown' } }); + expect(pdf.headers.get('etag')).not.toBe(md.headers.get('etag')); + await pdf.arrayBuffer(); + await md.text(); + }); + + it('honors If-None-Match with a 304', async () => { + const first = await fetch(`${baseUrl}/jane.cv`, { headers: { accept: 'text/markdown' } }); + const etag = first.headers.get('etag')!; + await first.text(); + const second = await fetch(`${baseUrl}/jane.cv`, { + headers: { accept: 'text/markdown', 'if-none-match': etag }, + }); + expect(second.status).toBe(304); + expect(second.headers.get('etag')).toBe(etag); + expect(await second.text()).toBe(''); + }); + + it('honors If-Modified-Since with a 304', async () => { + const first = await fetch(`${baseUrl}/jane.cv`); + const lastModified = first.headers.get('last-modified')!; + await first.arrayBuffer(); + const second = await fetch(`${baseUrl}/jane.cv`, { headers: { 'if-modified-since': lastModified } }); + expect(second.status).toBe(304); + }); + it('returns 404 for missing file', async () => { const res = await fetch(`${baseUrl}/nope.cv`); expect(res.status).toBe(404); @@ -91,3 +142,82 @@ describe('cvHandler over HTTP', () => { expect([404, 415]).toContain(res.status); }); }); + +describe('Content-Disposition sanitization', () => { + let injServer: Server; + let injBase: string; + let cvBytes: Uint8Array; + + beforeAll(async () => { + cvBytes = await pack({ + pdf: await makeBlankPdf(), + markdown: SAMPLE_MD, + html: SAMPLE_HTML, + metadata: { primaryLanguage: 'en' }, + }); + const handler = cvHandler({ loader: async 
() => cvBytes }); + injServer = createServer((req, res) => void handler(req, res)); + await new Promise((resolve) => injServer.listen(0, '127.0.0.1', resolve)); + const addr = injServer.address(); + if (typeof addr !== 'object' || !addr) throw new Error('no address'); + injBase = `http://127.0.0.1:${addr.port}`; + }); + + afterAll(async () => { + await new Promise((resolve) => injServer.close(() => resolve())); + }); + + it('strips quotes from the filename', async () => { + const res = await fetch(`${injBase}/a%22b.cv`); + const cd = res.headers.get('content-disposition')!; + expect(cd).toBe('inline; filename="ab.cv"'); + expect(cd).not.toContain('a"b'); + await res.arrayBuffer(); + }); + + it('emits RFC 5987 filename* for non-ASCII names', async () => { + const res = await fetch(`${injBase}/${encodeURIComponent('café')}.cv`); + const cd = res.headers.get('content-disposition')!; + expect(cd).toContain("filename*=UTF-8''caf%C3%A9.cv"); + await res.arrayBuffer(); + }); + + it('never lets CR/LF reach the header value', async () => { + // %0d%0a in the path would be header-injection if interpolated raw. 
+ const res = await fetch(`${injBase}/a%0d%0aX-Injected:1b.cv`); + const cd = res.headers.get('content-disposition')!; + expect(cd).not.toMatch(/[\r\n]/); + expect(res.headers.get('x-injected')).toBeNull(); + await res.arrayBuffer(); + }); +}); + +describe('defaultFormat as final fallback', () => { + let mdServer: Server; + let mdBase: string; + + beforeAll(async () => { + const handler = cvHandler({ root: tmpRoot, defaultFormat: 'markdown' }); + mdServer = createServer((req, res) => void handler(req, res)); + await new Promise((resolve) => mdServer.listen(0, '127.0.0.1', resolve)); + const addr = mdServer.address(); + if (typeof addr !== 'object' || !addr) throw new Error('no address'); + mdBase = `http://127.0.0.1:${addr.port}`; + }); + + afterAll(async () => { + await new Promise((resolve) => mdServer.close(() => resolve())); + }); + + it('does not override an explicit Accept', async () => { + const res = await fetch(`${mdBase}/jane.cv`, { headers: { accept: 'application/pdf' } }); + expect(res.headers.get('content-type')).toBe('application/vnd.cv+pdf'); + await res.arrayBuffer(); + }); + + it('applies when there is no usable Accept', async () => { + const res = await fetch(`${mdBase}/jane.cv`, { headers: { accept: '' } }); + expect(res.headers.get('content-type')).toContain('text/markdown'); + await res.text(); + }); +}); diff --git a/packages/viewer-web/demo/index.html b/packages/viewer-web/demo/index.html index a06a49b..4da9b88 100644 --- a/packages/viewer-web/demo/index.html +++ b/packages/viewer-web/demo/index.html @@ -49,6 +49,19 @@ cursor: pointer; font-weight: 600; } + .drop button:focus-visible { outline: 2px solid var(--fg); outline-offset: 2px; } + /* Visually hidden but focusable: the styled button proxies clicks/keys to it. 
*/ + .visually-hidden { + position: absolute; + width: 1px; + height: 1px; + margin: -1px; + padding: 0; + overflow: hidden; + clip: rect(0 0 0 0); + clip-path: inset(50%); + border: 0; + } .viewer { margin-top: 2rem; } footer { max-width: 920px; @@ -72,8 +85,15 @@

.cv viewer

Drop a .cv file here
or pick one from disk
- - + +