Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 47 additions & 16 deletions docs/src/pages/create.astro
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ const jsonLd = graph([

<section class="cv-fieldset">
<h2 style="margin: 0 0 0.5rem;">1 · PDF</h2>
<div id="pdf-drop" class="dropzone">
<label id="pdf-drop" class="dropzone">
<strong id="pdf-label">Drop your PDF here</strong>
<div style="color: var(--muted); margin-top: 0.4rem;">or click to choose</div>
<input id="pdf-picker" type="file" accept="application/pdf,.pdf" hidden />
</div>
<input id="pdf-picker" class="visually-hidden-file" type="file" accept="application/pdf,.pdf" />
</label>
</section>

<section class="cv-fieldset">
Expand All @@ -56,11 +56,11 @@ const jsonLd = graph([
<textarea id="md-text" rows="10" placeholder="# Jane Doe&#10;&#10;## Experience&#10;&#10;### Senior Engineer · Acme Inc.&#10;*Jan 2023 — present*&#10;&#10;Led the platform team..."></textarea>
</div>
<div id="md-drop-panel" class="tab-panel" hidden>
<div id="md-drop" class="dropzone">
<label id="md-drop" class="dropzone">
<strong id="md-label">Drop your .md here</strong>
<div style="color: var(--muted); margin-top: 0.4rem;">or click to choose</div>
<input id="md-picker" type="file" accept=".md,text/markdown,text/plain" hidden />
</div>
<input id="md-picker" class="visually-hidden-file" type="file" accept=".md,text/markdown,text/plain" />
</label>
</div>
</section>

Expand All @@ -82,10 +82,13 @@ const jsonLd = graph([
</div>

<p style="color: var(--muted); font-size: 0.9rem; margin-top: 2rem;">
Want to add BGE-M3 semantic embeddings? Install the CLI
(<a href="/install/"><code>brew install cvfile/tap/cv</code></a>) and run
<code>cv pack --embed-with bge-m3</code>. The model is ~285 MB so it
runs once locally on your machine, not on every visitor's browser.
Want to add BGE-M3 semantic embeddings? Generate them with the Python
package (<a href="/install/"><code>pip install "cvfile[embed]"</code></a>):
run <code>embed(markdown)</code>, then pack with
<code>pack(..., embeddings=encode_embeddings(payload))</code>. The model is
~285 MB, so it runs once locally on your machine, not in every visitor's
browser. The <code>cv</code> CLI is reader-only (extract, inspect, validate,
search) and does not generate embeddings.
</p>

<style is:global>
Expand Down Expand Up @@ -114,9 +117,23 @@ const jsonLd = graph([
.row { display: grid; grid-template-columns: 1fr 1fr; gap: 0.75rem; }
@media (max-width: 600px) { .row { grid-template-columns: 1fr; } }
.dropzone {
display: block;
border: 2px dashed var(--border); border-radius: 10px; padding: 1.75rem;
text-align: center; cursor: pointer; transition: background 120ms;
}
/* Visually hide the native file input while keeping it focusable and in the
accessibility tree, so the styled label acts as the visible control. */
.cv-fieldset input.visually-hidden-file {
position: absolute;
width: 1px;
height: 1px;
padding: 0;
margin: -1px;
overflow: hidden;
clip: rect(0, 0, 0, 0);
white-space: nowrap;
border: 0;
}
.dropzone.over { background: rgba(127,127,127,0.08); }
.dropzone.loaded { border-style: solid; border-color: var(--accent); }
.tabs { display: flex; gap: 0.25rem; margin-bottom: 0.75rem; border-bottom: 1px solid var(--border); }
Expand All @@ -138,7 +155,6 @@ const jsonLd = graph([

<script>
import { pack } from '@cvfile/sdk';
import { marked } from 'marked';

const pdfDrop = document.getElementById('pdf-drop')!;
const pdfPicker = document.getElementById('pdf-picker') as HTMLInputElement;
Expand All @@ -156,7 +172,6 @@ const jsonLd = graph([
const buildBtn = document.getElementById('build') as HTMLButtonElement;

function bindDrop(zone: HTMLElement, picker: HTMLInputElement, onFile: (f: File) => void): void {
zone.addEventListener('click', () => picker.click());
picker.addEventListener('change', () => { const f = picker.files?.[0]; if (f) onFile(f); });
zone.addEventListener('dragover', (e) => { e.preventDefault(); zone.classList.add('over'); });
zone.addEventListener('dragleave', () => zone.classList.remove('over'));
Expand Down Expand Up @@ -191,11 +206,27 @@ const jsonLd = graph([
});
});

function renderHtml(md: string, title: string, lang: string): string {
const body = marked.parse(md, { async: false }) as string;
const safeTitle = title.replace(/[&<>"']/g, (c) =>
/**
 * Escape the five HTML-significant characters (`&`, `<`, `>`, `"`, `'`) by
 * replacing each with its entity, leaving every other character untouched.
 *
 * @param s - Raw text that may contain markup characters.
 * @returns The text with those characters replaced by entities.
 */
function escapeHtml(s: string): string {
  const entities: Record<string, string> = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;',
    '"': '&quot;',
    "'": '&#39;',
  };
  // The character class only matches keys of `entities`, so the fallback is
  // never taken; it merely keeps the lookup type-safe without an assertion.
  return s.replace(/[&<>"']/g, (ch) => entities[ch] ?? ch);
}

async function renderHtml(md: string, title: string, lang: string): Promise<string> {
// Lazy-load marked so its ~500KB bundle stays out of the initial chunk.
const { marked } = await import('marked');
// Spec §7.3: never bake raw, unsanitised HTML into resume.html. Escape
// any inline/block HTML the author put in the Markdown instead of
// passing it through verbatim.
marked.use({
renderer: {
html(token: string | { text: string }): string {
return escapeHtml(typeof token === 'string' ? token : token.text);
},
},
});
const body = marked.parse(md, { async: false }) as string;
const safeTitle = escapeHtml(title);
return `<!doctype html>
<html lang="${lang}">
<head>
Expand Down Expand Up @@ -248,7 +279,7 @@ ${body}
try {
const lang = (langInput.value.trim() || 'en');
const title = firstHeading(md);
const html = renderHtml(md, title, lang);
const html = await renderHtml(md, title, lang);

setStatus('Packing .cv…');
const cvBytes = await pack({
Expand Down
25 changes: 20 additions & 5 deletions docs/src/pages/view.astro
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,30 @@ const jsonLd = graph([
</a>
</div>

<div
<label
id="dropzone"
style="border: 2px dashed var(--border); border-radius: 12px; padding: 2rem; text-align: center; cursor: pointer; margin: 0 0 1.5rem;"
style="display: block; border: 2px dashed var(--border); border-radius: 12px; padding: 2rem; text-align: center; cursor: pointer; margin: 0 0 1.5rem;"
>
<strong>Drop a .cv file here</strong>
<div style="color: var(--muted); margin-top: 0.5rem;">or click to choose</div>
<input id="picker" type="file" accept=".cv,application/pdf,application/vnd.cv+pdf" hidden />
</div>
<input id="picker" class="visually-hidden-file" type="file" accept=".cv,application/pdf,application/vnd.cv+pdf" />
</label>

<style>
/* Visually hide the native file input while keeping it focusable and in the
accessibility tree, so the styled label acts as the visible control. */
.visually-hidden-file {
position: absolute;
width: 1px;
height: 1px;
padding: 0;
margin: -1px;
overflow: hidden;
clip: rect(0, 0, 0, 0);
white-space: nowrap;
border: 0;
}
</style>

<cv-embed id="viewer" view="auto" theme="auto" style="display: none;"></cv-embed>

Expand All @@ -65,7 +81,6 @@ const jsonLd = graph([
await viewer.loadFromBytes(buf);
}

dz.addEventListener('click', () => picker.click());
picker.addEventListener('change', () => {
const file = picker.files?.[0];
if (file) void handle(file);
Expand Down
2 changes: 1 addition & 1 deletion integrations/cvfile-haystack/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ classifiers = [
"Programming Language :: Python :: 3.13",
]
dependencies = [
"cvfile>=0.1.0,<1",
"cvfile>=0.1,<2",
"haystack-ai>=2.8,<3",
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,41 @@ def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]:
"mime_type": payload.mime_type,
"payload": payload.name,
"relationship": payload.relationship,
"language": payload.language or file.metadata.primary_language,
"language": payload.language,
"primary": payload.name == file.metadata.primary_payload,
"cv_version": file.metadata.version,
"cv_generator": file.metadata.generator,
}


def _resolve_chunks(file: CvFile) -> list:
    """Return the text-resolved embedding chunks carried by ``file``.

    The decoding is handed to ``cvfile.embed.resolve_embedding_chunks`` so the
    core SDK remains the single source of truth for slicing chunk text by
    UTF-8 byte offsets (spec §5.1).

    :param file: The parsed ``.cv`` file to read embeddings from.
    :returns: Whatever the SDK helper returns for ``file``, or an empty list
        when ``cvfile.embed`` cannot be imported (embed extra not installed).
        The helper is expected to return an empty list for files without
        embeddings -- behaviour defined by the SDK, not visible here.
    """
    try:
        # Imported lazily: the embed extra is optional.
        from cvfile.embed import resolve_embedding_chunks
    except ImportError:
        resolved = []
    else:
        resolved = resolve_embedding_chunks(file)
    return resolved


def _chunk_meta(chunk: Any, file: CvFile) -> dict[str, Any]:
    """Build the ``Document.meta`` entries describing one embedding chunk.

    :param chunk: A resolved chunk exposing ``id``, ``text_offset``,
        ``text_length``, ``model``, ``dimension`` and ``metric``.
    :param file: The ``.cv`` file the chunk came from; supplies the
        file-level language, version and generator.
    :returns: A new dict with the file-level keys first, followed by the
        chunk-level keys.
    """
    file_info = file.metadata
    meta: dict[str, Any] = {
        "language": file_info.primary_language,
        "cv_version": file_info.version,
        "cv_generator": file_info.generator,
    }
    # Chunk-specific fields are appended after the file-level ones so the
    # key order stays: language, version, generator, then chunk details.
    meta.update(
        chunk_id=chunk.id,
        chunk_offset=chunk.text_offset,
        chunk_length=chunk.text_length,
        embedding_model=chunk.model,
        embedding_dimension=chunk.dimension,
        embedding_metric=chunk.metric,
    )
    return meta


@component
class CVFileToDocument:
"""Convert ``.cv`` files into Haystack ``Document`` objects.
Expand All @@ -48,18 +76,32 @@ class CVFileToDocument:
Set ``primary_only=True`` to emit only the payload marked as
``primaryPayload`` in the file's XMP metadata (usually the canonical
Markdown copy), and skip all alternates.

Set ``mode="chunks"`` to emit one ``Document`` per pre-computed embedding
chunk instead of one per payload. Each chunk ``Document`` carries its vector
on ``Document.embedding`` and its text is sliced from the markdown using
UTF-8 byte offsets. Files without an embeddings payload fall back to a single
Markdown ``Document``. In ``mode="chunks"`` the ``primary_only`` flag is
ignored (chunks already index a single text payload).
"""

def __init__(self, primary_only: bool = False) -> None:
def __init__(self, primary_only: bool = False, *, mode: str = "payloads") -> None:
"""Create a CVFileToDocument component.

:param primary_only:
If ``True``, emit only the payload marked as ``primaryPayload``
in the file's XMP metadata. If ``False`` (default), emit one
``Document`` per textual payload (the primary plus any
language alternates and supplements).
language alternates and supplements). Ignored in ``mode="chunks"``.
:param mode:
``"payloads"`` (default) emits one ``Document`` per textual payload.
``"chunks"`` emits one ``Document`` per pre-computed embedding chunk
with its vector attached.
"""
if mode not in ("payloads", "chunks"):
raise ValueError("mode must be 'payloads' or 'chunks'")
self.primary_only = primary_only
self.mode = mode

@component.output_types(documents=list[Document])
def run(
Expand Down Expand Up @@ -105,6 +147,10 @@ def run(
stream_meta = bytestream.meta or {}
source_label = stream_meta.get("file_path") or stream_meta.get("file_name") or str(source)

if self.mode == "chunks":
documents.extend(self._chunk_documents(file, stream_meta, source_meta, source_label))
continue

for payload in file.payloads:
if not _is_text_payload(payload):
continue
Expand All @@ -115,3 +161,28 @@ def run(
documents.append(Document(content=payload.text(), meta=merged))

return {"documents": documents}

@staticmethod
def _chunk_documents(
    file: CvFile,
    stream_meta: dict[str, Any],
    source_meta: dict[str, Any],
    source_label: str,
) -> list[Document]:
    """Produce the ``mode="chunks"`` documents for a single ``.cv`` file.

    :param file: The parsed ``.cv`` file.
    :param stream_meta: Metadata taken from the byte stream; lowest precedence.
    :param source_meta: Caller-supplied metadata; overrides stream and
        chunk/payload metadata on key collisions.
    :param source_label: Value stored under the ``"source"`` key, which
        always wins over the other metadata dicts.
    :returns: One ``Document`` per resolved chunk, each with its vector on
        ``embedding``. When no chunks are resolved, a single ``Document`` for
        the primary payload if it is textual, otherwise an empty list.
    """

    def _to_document(chunk: Any) -> Document:
        # Later dicts win on duplicate keys: stream < chunk < source < "source".
        meta = {**stream_meta, **_chunk_meta(chunk, file), **source_meta, "source": source_label}
        return Document(content=chunk.text, meta=meta, embedding=list(chunk.vector))

    resolved = _resolve_chunks(file)
    if resolved:
        return [_to_document(chunk) for chunk in resolved]

    # No embeddings available: fall back to the primary payload, provided it
    # is a text payload.
    for candidate in file.payloads:
        if candidate.name == file.metadata.primary_payload and _is_text_payload(candidate):
            fallback_meta = {
                **stream_meta,
                **_payload_meta(candidate, file),
                **source_meta,
                "source": source_label,
            }
            return [Document(content=candidate.text(), meta=fallback_meta)]
    return []
31 changes: 31 additions & 0 deletions integrations/cvfile-haystack/tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from haystack_integrations.components.converters.cvfile import CVFileToDocument

FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv"
UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv"


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -75,3 +76,33 @@ def test_unreadable_source_is_skipped(tmp_path: Path) -> None:
not_a_cv.write_bytes(b"not a real cv file")
result = converter.run(sources=[not_a_cv])
assert result["documents"] == []


def test_chunks_mode_attaches_a_vector_per_chunk() -> None:
    """Every document emitted in chunks mode has a float vector and text."""
    if not FIXTURE.exists():
        pytest.skip(f"fixture not found: {FIXTURE}")

    converter = CVFileToDocument(mode="chunks")
    result = converter.run(sources=[FIXTURE])
    chunk_docs = result["documents"]

    assert len(chunk_docs) >= 1
    for chunk_doc in chunk_docs:
        vector = chunk_doc.embedding
        assert vector is not None
        # The vector length must agree with the dimension recorded in meta.
        assert len(vector) == chunk_doc.meta["embedding_dimension"]
        assert all(isinstance(component, float) for component in vector)
        assert chunk_doc.content.strip(), "chunk text should not be empty"


def test_invalid_mode_rejected() -> None:
    """Constructing the converter with an unknown mode raises ``ValueError``."""
    unsupported_mode = "bogus"
    with pytest.raises(ValueError):
        CVFileToDocument(mode=unsupported_mode)


def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None:
    """Non-ASCII snippets survive chunking of the unicode fixture intact."""
    if not UNICODE_FIXTURE.exists():
        pytest.skip(f"fixture not found: {UNICODE_FIXTURE}")

    converter = CVFileToDocument(mode="chunks")
    documents = converter.run(sources=[UNICODE_FIXTURE])["documents"]
    combined = "".join(document.content for document in documents)

    # Accented Latin, CJK and emoji snippets must all appear in the joined text.
    for snippet in ("Élodie", "工程師", "🚀", "经验"):
        assert snippet in combined

    # Each chunk's text must round-trip through UTF-8 unchanged; encoding
    # raises if the text contains code points that cannot be encoded.
    for document in documents:
        round_tripped = document.content.encode("utf-8").decode("utf-8")
        assert document.content == round_tripped
2 changes: 1 addition & 1 deletion integrations/langchain-cvfile/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ classifiers = [
"Programming Language :: Python :: 3.13",
]
dependencies = [
"cvfile>=0.1.0,<1",
"cvfile>=0.1,<2",
"langchain-core>=0.3,<1",
]

Expand Down
Loading
Loading