cvfile · ilanoh · May 28, 2026 · May 28, 2026 · May 28, 2026 · May 28, 2026
diff --git a/docs/src/pages/create.astro b/docs/src/pages/create.astro
@@ -39,11 +39,11 @@ const jsonLd = graph([
 
   <section class="cv-fieldset">
     <h2 style="margin: 0 0 0.5rem;">1 · PDF</h2>
-    <div id="pdf-drop" class="dropzone">
+    <label id="pdf-drop" class="dropzone">
       <strong id="pdf-label">Drop your PDF here</strong>
       <div style="color: var(--muted); margin-top: 0.4rem;">or click to choose</div>
-      <input id="pdf-picker" type="file" accept="application/pdf,.pdf" hidden />
-    </div>
+      <input id="pdf-picker" class="visually-hidden-file" type="file" accept="application/pdf,.pdf" />
+    </label>
   </section>
 
   <section class="cv-fieldset">
@@ -56,11 +56,11 @@ const jsonLd = graph([
       <textarea id="md-text" rows="10" placeholder="# Jane Doe&#10;&#10;## Experience&#10;&#10;### Senior Engineer · Acme Inc.&#10;*Jan 2023 — present*&#10;&#10;Led the platform team..."></textarea>
     </div>
     <div id="md-drop-panel" class="tab-panel" hidden>
-      <div id="md-drop" class="dropzone">
+      <label id="md-drop" class="dropzone">
         <strong id="md-label">Drop your .md here</strong>
         <div style="color: var(--muted); margin-top: 0.4rem;">or click to choose</div>
-        <input id="md-picker" type="file" accept=".md,text/markdown,text/plain" hidden />
-      </div>
+        <input id="md-picker" class="visually-hidden-file" type="file" accept=".md,text/markdown,text/plain" />
+      </label>
     </div>
   </section>
 
@@ -82,10 +82,13 @@ const jsonLd = graph([
   </div>
 
   <p style="color: var(--muted); font-size: 0.9rem; margin-top: 2rem;">
-    Want to add BGE-M3 semantic embeddings? Install the CLI
-    (<a href="/install/"><code>brew install cvfile/tap/cv</code></a>) and run
-    <code>cv pack --embed-with bge-m3</code>. The model is ~285 MB so it
-    runs once locally on your machine, not on every visitor's browser.
+    Want to add BGE-M3 semantic embeddings? Generate them with the Python
+    package (<a href="/install/"><code>pip install "cvfile[embed]"</code></a>):
+    run <code>embed(markdown)</code> then pack with
+    <code>pack(..., embeddings=encode_embeddings(payload))</code>. The model is
+    ~285 MB so it runs once locally on your machine, not on every visitor's
+    browser. The <code>cv</code> CLI is reader-only (extract, inspect, validate,
+    search) and does not generate embeddings.
   </p>
 
   <style is:global>
@@ -114,9 +117,23 @@ const jsonLd = graph([
     .row { display: grid; grid-template-columns: 1fr 1fr; gap: 0.75rem; }
     @media (max-width: 600px) { .row { grid-template-columns: 1fr; } }
     .dropzone {
+      display: block;
       border: 2px dashed var(--border); border-radius: 10px; padding: 1.75rem;
       text-align: center; cursor: pointer; transition: background 120ms;
     }
+    /* Visually hide the native file input while keeping it focusable and in the
+       accessibility tree, so the styled label acts as the visible control. */
+    .cv-fieldset input.visually-hidden-file {
+      position: absolute; width: 1px; height: 1px;
+      padding: 0; margin: -1px; border: 0;
+      overflow: hidden; clip: rect(0, 0, 0, 0); white-space: nowrap;
+    }
+    /* The focused input itself is invisible, so draw the keyboard focus ring on
+       the label that wraps it. */
+    .dropzone:focus-within {
+      outline: 2px solid var(--accent);
+      outline-offset: 2px;
+    }
     .dropzone.over { background: rgba(127,127,127,0.08); }
     .dropzone.loaded { border-style: solid; border-color: var(--accent); }
     .tabs { display: flex; gap: 0.25rem; margin-bottom: 0.75rem; border-bottom: 1px solid var(--border); }
@@ -138,7 +155,6 @@ const jsonLd = graph([
 
   <script>
     import { pack } from '@cvfile/sdk';
-    import { marked } from 'marked';
 
     const pdfDrop = document.getElementById('pdf-drop')!;
     const pdfPicker = document.getElementById('pdf-picker') as HTMLInputElement;
@@ -156,7 +172,6 @@ const jsonLd = graph([
     const buildBtn = document.getElementById('build') as HTMLButtonElement;
 
     function bindDrop(zone: HTMLElement, picker: HTMLInputElement, onFile: (f: File) => void): void {
-      zone.addEventListener('click', () => picker.click());
       picker.addEventListener('change', () => { const f = picker.files?.[0]; if (f) onFile(f); });
       zone.addEventListener('dragover', (e) => { e.preventDefault(); zone.classList.add('over'); });
       zone.addEventListener('dragleave', () => zone.classList.remove('over'));
@@ -191,11 +206,27 @@ const jsonLd = graph([
       });
     });
 
-    function renderHtml(md: string, title: string, lang: string): string {
-      const body = marked.parse(md, { async: false }) as string;
-      const safeTitle = title.replace(/[&<>"']/g, (c) =>
+    function escapeHtml(s: string): string {
+      return s.replace(/[&<>"']/g, (c) =>
         ({ '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' })[c]!
       );
+    }
+
+    async function renderHtml(md: string, title: string, lang: string): Promise<string> {
+      // Lazy-load marked so its ~500KB bundle stays out of the initial chunk.
+      const { marked } = await import('marked');
+      // Spec §7.3: never bake raw, unsanitised HTML into resume.html. Escape
+      // any inline/block HTML the author put in the Markdown instead of
+      // passing it through verbatim.
+      marked.use({
+        renderer: {
+          html(token: string | { text: string }): string {
+            return escapeHtml(typeof token === 'string' ? token : token.text);
+          },
+        },
+      });
+      const body = marked.parse(md, { async: false }) as string;
+      const safeTitle = escapeHtml(title);
       return `<!doctype html>
 <html lang="${lang}">
 <head>
@@ -248,7 +279,7 @@ ${body}
       try {
         const lang = (langInput.value.trim() || 'en');
         const title = firstHeading(md);
-        const html = renderHtml(md, title, lang);
+        const html = await renderHtml(md, title, lang);
 
         setStatus('Packing .cv…');
         const cvBytes = await pack({

diff --git a/docs/src/pages/view.astro b/docs/src/pages/view.astro
@@ -33,14 +33,30 @@ const jsonLd = graph([
     </a>
   </div>
 
-  <div
+  <label
     id="dropzone"
-    style="border: 2px dashed var(--border); border-radius: 12px; padding: 2rem; text-align: center; cursor: pointer; margin: 0 0 1.5rem;"
+    style="display: block; border: 2px dashed var(--border); border-radius: 12px; padding: 2rem; text-align: center; cursor: pointer; margin: 0 0 1.5rem;"
   >
     <strong>Drop a .cv file here</strong>
     <div style="color: var(--muted); margin-top: 0.5rem;">or click to choose</div>
-    <input id="picker" type="file" accept=".cv,application/pdf,application/vnd.cv+pdf" hidden />
-  </div>
+    <input id="picker" class="visually-hidden-file" type="file" accept=".cv,application/pdf,application/vnd.cv+pdf" />
+  </label>
+
+  <style>
+    /* Visually hide the native file input while keeping it focusable and in the
+       accessibility tree, so the styled label acts as the visible control. */
+    .visually-hidden-file {
+      position: absolute; width: 1px; height: 1px;
+      padding: 0; margin: -1px; border: 0;
+      overflow: hidden; clip: rect(0, 0, 0, 0); white-space: nowrap;
+    }
+    /* The focused input itself is invisible, so draw the keyboard focus ring on
+       the label that wraps it. */
+    #dropzone:focus-within {
+      outline: 2px solid var(--accent, currentColor);
+      outline-offset: 2px;
+    }
+  </style>
 
   <cv-embed id="viewer" view="auto" theme="auto" style="display: none;"></cv-embed>
 
@@ -65,7 +81,6 @@ const jsonLd = graph([
       await viewer.loadFromBytes(buf);
     }
 
-    dz.addEventListener('click', () => picker.click());
     picker.addEventListener('change', () => {
       const file = picker.files?.[0];
       if (file) void handle(file);

diff --git a/integrations/cvfile-haystack/pyproject.toml b/integrations/cvfile-haystack/pyproject.toml
@@ -21,7 +21,7 @@ classifiers = [
   "Programming Language :: Python :: 3.13",
 ]
 dependencies = [
-  "cvfile>=0.1.0,<1",
+  "cvfile>=0.1,<2",
   "haystack-ai>=2.8,<3",
 ]
 

diff --git a/...tions/cvfile-haystack/src/haystack_integrations/components/converters/cvfile/converter.py b/...tions/cvfile-haystack/src/haystack_integrations/components/converters/cvfile/converter.py
@@ -28,13 +28,41 @@ def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]:
         "mime_type": payload.mime_type,
         "payload": payload.name,
         "relationship": payload.relationship,
-        "language": payload.language or file.metadata.primary_language,
+        "language": payload.language,
         "primary": payload.name == file.metadata.primary_payload,
         "cv_version": file.metadata.version,
         "cv_generator": file.metadata.generator,
     }
 
 
+def _resolve_chunks(file: CvFile) -> list:
+    """Decode the file's embeddings.cbor into text-resolved chunks.
+
+    Delegates to the core SDK so chunk text slicing uses UTF-8 byte offsets
+    (spec §5.1) and stays the single source of truth. Returns an empty list
+    when the embed extra is not installed or the file carries no embeddings.
+    """
+    try:
+        from cvfile.embed import resolve_embedding_chunks
+    except ImportError:
+        return []
+    return resolve_embedding_chunks(file)
+
+
+def _chunk_meta(chunk: Any, file: CvFile) -> dict[str, Any]:
+    return {
+        "language": file.metadata.primary_language,
+        "cv_version": file.metadata.version,
+        "cv_generator": file.metadata.generator,
+        "chunk_id": chunk.id,
+        "chunk_offset": chunk.text_offset,
+        "chunk_length": chunk.text_length,
+        "embedding_model": chunk.model,
+        "embedding_dimension": chunk.dimension,
+        "embedding_metric": chunk.metric,
+    }
+
+
 @component
 class CVFileToDocument:
     """Convert ``.cv`` files into Haystack ``Document`` objects.
@@ -48,18 +76,32 @@ class CVFileToDocument:
     Set ``primary_only=True`` to emit only the payload marked as
     ``primaryPayload`` in the file's XMP metadata (usually the canonical
     Markdown copy), and skip all alternates.
+
+    Set ``mode="chunks"`` to emit one ``Document`` per pre-computed embedding
+    chunk instead of one per payload. Each chunk ``Document`` carries its vector
+    on ``Document.embedding`` and its text is sliced from the markdown using
+    UTF-8 byte offsets. Files without an embeddings payload fall back to a single
+    Markdown ``Document``. In ``mode="chunks"`` the ``primary_only`` flag is
+    ignored (chunks already index a single text payload).
     """
 
-    def __init__(self, primary_only: bool = False) -> None:
+    def __init__(self, primary_only: bool = False, *, mode: str = "payloads") -> None:
         """Create a CVFileToDocument component.
 
         :param primary_only:
             If ``True``, emit only the payload marked as ``primaryPayload``
             in the file's XMP metadata. If ``False`` (default), emit one
             ``Document`` per textual payload (the primary plus any
-            language alternates and supplements).
+            language alternates and supplements). Ignored in ``mode="chunks"``.
+        :param mode:
+            ``"payloads"`` (default) emits one ``Document`` per textual payload.
+            ``"chunks"`` emits one ``Document`` per pre-computed embedding chunk
+            with its vector attached.
         """
+        if mode not in ("payloads", "chunks"):
+            raise ValueError("mode must be 'payloads' or 'chunks'")
         self.primary_only = primary_only
+        self.mode = mode
 
     @component.output_types(documents=list[Document])
     def run(
@@ -105,6 +147,10 @@ def run(
             stream_meta = bytestream.meta or {}
             source_label = stream_meta.get("file_path") or stream_meta.get("file_name") or str(source)
 
+            if self.mode == "chunks":
+                documents.extend(self._chunk_documents(file, stream_meta, source_meta, source_label))
+                continue
+
             for payload in file.payloads:
                 if not _is_text_payload(payload):
                     continue
@@ -115,3 +161,28 @@ def run(
                 documents.append(Document(content=payload.text(), meta=merged))
 
         return {"documents": documents}
+
+    @staticmethod
+    def _chunk_documents(
+        file: CvFile,
+        stream_meta: dict[str, Any],
+        source_meta: dict[str, Any],
+        source_label: str,
+    ) -> list[Document]:
+        chunks = _resolve_chunks(file)
+        if not chunks:
+            primary = next(
+                (p for p in file.payloads if p.name == file.metadata.primary_payload and _is_text_payload(p)),
+                None,
+            )
+            if primary is None:
+                return []
+            payload_meta = _payload_meta(primary, file)
+            merged = {**stream_meta, **payload_meta, **source_meta, "source": source_label}
+            return [Document(content=primary.text(), meta=merged)]
+
+        out: list[Document] = []
+        for chunk in chunks:
+            merged = {**stream_meta, **_chunk_meta(chunk, file), **source_meta, "source": source_label}
+            out.append(Document(content=chunk.text, meta=merged, embedding=list(chunk.vector)))
+        return out
diff --git a/integrations/cvfile-haystack/tests/test_converter.py b/integrations/cvfile-haystack/tests/test_converter.py
@@ -11,6 +11,7 @@
 from haystack_integrations.components.converters.cvfile import CVFileToDocument
 
 FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv"
+UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv"
 
 
 @pytest.fixture(scope="module")
@@ -75,3 +76,33 @@ def test_unreadable_source_is_skipped(tmp_path: Path) -> None:
     not_a_cv.write_bytes(b"not a real cv file")
     result = converter.run(sources=[not_a_cv])
     assert result["documents"] == []
+
+
+def test_chunks_mode_attaches_a_vector_per_chunk() -> None:
+    if not FIXTURE.exists():
+        pytest.skip(f"fixture not found: {FIXTURE}")
+    docs = CVFileToDocument(mode="chunks").run(sources=[FIXTURE])["documents"]
+    assert len(docs) >= 1
+    for doc in docs:
+        assert doc.embedding is not None
+        assert len(doc.embedding) == doc.meta["embedding_dimension"]
+        assert all(isinstance(v, float) for v in doc.embedding)
+        assert doc.content.strip(), "chunk text should not be empty"
+
+
+def test_invalid_mode_rejected() -> None:
+    with pytest.raises(ValueError):
+        CVFileToDocument(mode="bogus")
+
+
+def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None:
+    if not UNICODE_FIXTURE.exists():
+        pytest.skip(f"fixture not found: {UNICODE_FIXTURE}")
+    docs = CVFileToDocument(mode="chunks").run(sources=[UNICODE_FIXTURE])["documents"]
+    joined = "".join(d.content for d in docs)
+    assert "Élodie" in joined
+    assert "工程師" in joined
+    assert "🚀" in joined
+    assert "经验" in joined
+    for doc in docs:
+        assert doc.content == doc.content.encode("utf-8").decode("utf-8")
diff --git a/integrations/langchain-cvfile/pyproject.toml b/integrations/langchain-cvfile/pyproject.toml
@@ -21,7 +21,7 @@ classifiers = [
   "Programming Language :: Python :: 3.13",
 ]
 dependencies = [
-  "cvfile>=0.1.0,<1",
+  "cvfile>=0.1,<2",
   "langchain-core>=0.3,<1",
 ]