From 0066ca27fa12f3d0506df1ad684e49c3bb51641a Mon Sep 17 00:00:00 2001 From: Ilan Date: Thu, 28 May 2026 23:35:39 +0300 Subject: [PATCH 1/8] fix(go): stop silent Pack corruption, catch inline forbidden actions, harden HF/search Reader path bugs and one critical writer bug found in the end-to-end audit: * Pack() previously returned no error while pdfcpu's WriteContext silently dropped every newly added payload object, emitting a file that passed IsCvFile but had zero payloads and failed its own validator (and panicked on minimal PDFs). The Go writer is roadmapped for v0.2, so Pack now returns a clear "writer not implemented in v0.1" error before any mutation and the broken write path is removed. No code path can emit a corrupt file or panic. * scanForbiddenConstructs only walked indirect objects, so an inline /OpenAction JavaScript (or /AA, annotation /A, AcroForm action) bypassed validation. Rewritten as a recursive catalog/trailer graph walk with a visited set, mirroring the Python implementation. All 7 malicious vectors still rejected; a new inline OpenAction test confirms the gap is closed. * parseHFMatrix/meanPool used unchecked type assertions and panicked on a ragged HuggingFace response; now return descriptive errors. * SearchSemantic panicked on a chunk vector whose length mismatched the space dimension; mismatched chunks are now skipped. * validate now warns (newer-format-version) when cv:version MAJOR exceeds the known major (spec 8.3); still accepts 0.1 and 1.0. * Dropped the fragile last-4KiB /Encrypt byte scan in favor of the authoritative parsed trailer Encrypt check. * cv version no longer prints the spec version twice; reports a distinct CLI version. extract help documents that --format defaults to pdf. * gofmt across the module. --- .../viewer-web/demo/{ => public}/sample.cv | Bin sdks/go/cmd/cv/main.go | 8 +- sdks/go/cv/cv_test.go | 58 +++-- sdks/go/cv/embed_hf.go | 41 +++- sdks/go/cv/embed_hf_test.go | 39 ++++ sdks/go/cv/pack.go | 218 ++---------------- sdks/go/cv/pdf.go | 141 ----------- sdks/go/cv/search.go | 5 + sdks/go/cv/search_test.go | 27 +++ sdks/go/cv/security.go | 64 +++-- sdks/go/cv/security_test.go | 39 ++++ sdks/go/cv/validate.go | 89 ++++--- sdks/go/cv/validate_test.go | 38 +++ sdks/go/middleware/conneg.go | 12 +- 14 files changed, 361 insertions(+), 418 deletions(-) rename packages/viewer-web/demo/{ => public}/sample.cv (100%) create mode 100644 sdks/go/cv/embed_hf_test.go create mode 100644 sdks/go/cv/validate_test.go diff --git a/packages/viewer-web/demo/sample.cv b/packages/viewer-web/demo/public/sample.cv similarity index 100% rename from packages/viewer-web/demo/sample.cv rename to packages/viewer-web/demo/public/sample.cv diff --git a/sdks/go/cmd/cv/main.go b/sdks/go/cmd/cv/main.go index 25da21e..9b34e8f 100644 --- a/sdks/go/cmd/cv/main.go +++ b/sdks/go/cmd/cv/main.go @@ -12,10 +12,14 @@ import ( cv "github.com/cvfile/cv/sdks/go/cv" ) +// cliVersion is the version of this command-line tool, distinct from the +// .cv spec version (cv.SpecVersion) and any SDK package version. +const cliVersion = "0.1.0" + const usage = `cv — the .cv open file format CLI (v0.1) Usage: - cv extract [--format md|html|pdf] + cv extract [--format pdf|md|html] (--format defaults to pdf) cv inspect [--json] cv validate [--strict] cv search "" [--k 5] [--model BAAI/bge-m3] @@ -50,7 +54,7 @@ func main() { case "search": os.Exit(cmdSearch(rest)) case "version", "--version", "-v": - fmt.Printf("cv %s (sdk %s)\n", cv.SpecVersion, cv.SpecVersion) + fmt.Printf("cv %s (spec %s)\n", cliVersion, cv.SpecVersion) case "help", "--help", "-h": fmt.Print(usage) default: diff --git a/sdks/go/cv/cv_test.go b/sdks/go/cv/cv_test.go index ded5201..b9f74f3 100644 --- a/sdks/go/cv/cv_test.go +++ b/sdks/go/cv/cv_test.go @@ -1,6 +1,7 @@ package cv import ( + "errors" "os" "path/filepath" "runtime" @@ -21,9 +22,8 @@ Senior software engineer. const sampleHTML = `

Jane Doe

` -// minimalPDF is a minimal PDF with /Resources containing /Font and -// /ProcSet so that pdfcpu's writePagesDict has writable maps to populate. -// Built by hand once for tests; unused fields trimmed to the minimum. +// minimalPDF is a minimal hand-built PDF used to confirm the reader rejects +// plain PDFs as .cv and that Pack refuses minimal input without panicking. var minimalPDF = []byte("%PDF-1.7\n" + "1 0 obj<>endobj\n" + "2 0 obj<>endobj\n" + @@ -31,16 +31,10 @@ var minimalPDF = []byte("%PDF-1.7\n" + "xref\n0 4\n0000000000 65535 f \n0000000009 00000 n \n0000000054 00000 n \n0000000099 00000 n \n" + "trailer<>\nstartxref\n195\n%%EOF\n") -// Go-side Pack() depends on pdfcpu's writer, which currently chokes on -// some minimal input PDFs (nil-map panic in writePagesDict). The reader -// path through pdfcpu is solid, so v0.1 of cv-go ships as a consumer -// SDK. Pack() round-trip tests will return when the writer issue is -// resolved (likely by fronting pdfcpu with a hand-rolled incremental -// updater that appends /AF + /Metadata without touching the page tree). -// -// In the meantime, the killer property for Go adopters — "Go can read -// .cv files produced by any compliant SDK" — is exercised by the -// interop tests below. +// Go-side Pack() (the writer) is deferred to v0.2: v0.1 of cv-go ships as a +// consumer SDK. The reader path through pdfcpu is solid, and "Go can read .cv +// files produced by any compliant SDK" is exercised by the interop tests below. +// Pack() must refuse up front so it can never emit a corrupt file or panic. func TestIsCvFileRejectsPlainPDF(t *testing.T) { if IsCvFile(minimalPDF) { @@ -48,6 +42,44 @@ func TestIsCvFileRejectsPlainPDF(t *testing.T) { } } +// TestPackReturnsNotImplementedForNormalPDF asserts Pack refuses a normal input +// PDF with ErrPackNotImplemented instead of emitting a corrupt .cv file. +func TestPackReturnsNotImplementedForNormalPDF(t *testing.T) { + pdf, err := os.ReadFile(repoFixturePath("packages/sdk-js/examples/out/jane-doe.pdf")) + if err != nil { + t.Skipf("input PDF fixture missing at %s", "packages/sdk-js/examples/out/jane-doe.pdf") + } + out, err := Pack(PackInput{ + PDF: pdf, + Markdown: []byte(sampleMD), + HTML: []byte(sampleHTML), + Metadata: Metadata{PrimaryLanguage: "en"}, + }) + if !errors.Is(err, ErrPackNotImplemented) { + t.Fatalf("Pack err = %v, want ErrPackNotImplemented", err) + } + if out != nil { + t.Errorf("Pack returned %d bytes; want nil (no corrupt file)", len(out)) + } +} + +// TestPackDoesNotPanicOnMinimalPDF asserts Pack returns the not-implemented +// error rather than panicking on a minimal PDF (the old writer path panicked +// with "assignment to entry in nil map"). +func TestPackDoesNotPanicOnMinimalPDF(t *testing.T) { + out, err := Pack(PackInput{ + PDF: minimalPDF, + Markdown: []byte(sampleMD), + Metadata: Metadata{PrimaryLanguage: "en"}, + }) + if !errors.Is(err, ErrPackNotImplemented) { + t.Fatalf("Pack err = %v, want ErrPackNotImplemented", err) + } + if out != nil { + t.Errorf("Pack returned %d bytes; want nil", len(out)) + } +} + func TestInteropReadsJSProducedFile(t *testing.T) { fixture := repoFixturePath("packages/sdk-js/examples/out/jane-doe.cv") data, err := os.ReadFile(fixture) diff --git a/sdks/go/cv/embed_hf.go b/sdks/go/cv/embed_hf.go index 2e8f1e7..b4c024e 100644 --- a/sdks/go/cv/embed_hf.go +++ b/sdks/go/cv/embed_hf.go @@ -125,15 +125,27 @@ func parseHFMatrix(raw []byte, expected int) ([][]float32, error) { switch first[0].(type) { case float64: out := make([][]float32, len(arr)) - for i, row := range arr { - out[i] = toFloat32(row.([]any)) + for i, raw := range arr { + row, ok := raw.([]any) + if !ok { + return nil, fmt.Errorf("HF response: row %d is %T, expected an array of numbers", i, raw) + } + out[i] = toFloat32(row) } return out, nil case []any: // Token-level embeddings: mean-pool per input. out := make([][]float32, len(arr)) - for i, row := range arr { - out[i] = meanPool(row.([]any)) + for i, raw := range arr { + row, ok := raw.([]any) + if !ok { + return nil, fmt.Errorf("HF response: row %d is %T, expected a token matrix", i, raw) + } + vec, err := meanPool(row) + if err != nil { + return nil, fmt.Errorf("HF response: row %d: %w", i, err) + } + out[i] = vec } return out, nil } @@ -150,15 +162,24 @@ func toFloat32(arr []any) []float32 { return out } -func meanPool(tokens []any) []float32 { +func meanPool(tokens []any) ([]float32, error) { if len(tokens) == 0 { - return nil + return nil, nil + } + first, ok := tokens[0].([]any) + if !ok { + return nil, fmt.Errorf("token 0 is %T, expected an array of numbers", tokens[0]) } - first, _ := tokens[0].([]any) dim := len(first) sum := make([]float64, dim) - for _, t := range tokens { - row, _ := t.([]any) + for ti, t := range tokens { + row, ok := t.([]any) + if !ok { + return nil, fmt.Errorf("token %d is %T, expected an array of numbers", ti, t) + } + if len(row) != dim { + return nil, fmt.Errorf("token %d has length %d, expected %d (ragged matrix)", ti, len(row), dim) + } for i, v := range row { f, _ := v.(float64) sum[i] += f @@ -169,7 +190,7 @@ func meanPool(tokens []any) []float32 { for i, v := range sum { out[i] = float32(v / n) } - return out + return out, nil } func normalize(v []float32) []float32 { diff --git a/sdks/go/cv/embed_hf_test.go b/sdks/go/cv/embed_hf_test.go new file mode 100644 index 0000000..4bc3952 --- /dev/null +++ b/sdks/go/cv/embed_hf_test.go @@ -0,0 +1,39 @@ +package cv + +import ( + "strings" + "testing" +) + +// TestParseHFMatrixRaggedRowReturnsError feeds a ragged HF response like +// [[0.1,0.2],0.5] where the second row is a scalar instead of an array. The +// parser must return a descriptive error rather than panic on a type assertion. +func TestParseHFMatrixRaggedRowReturnsError(t *testing.T) { + raw := []byte(`[[0.1,0.2],0.5]`) + out, err := parseHFMatrix(raw, 2) + if err == nil { + t.Fatalf("expected error for ragged matrix, got %v", out) + } + if !strings.Contains(err.Error(), "row 1") { + t.Errorf("error %q should mention the offending row", err) + } +} + +// TestParseHFMatrixRaggedTokenMatrixReturnsError feeds a token-level (3D) +// response whose second input is not a token matrix. +func TestParseHFMatrixRaggedTokenMatrixReturnsError(t *testing.T) { + raw := []byte(`[[[0.1,0.2],[0.3,0.4]],0.5]`) + out, err := parseHFMatrix(raw, 2) + if err == nil { + t.Fatalf("expected error for ragged token matrix, got %v", out) + } +} + +// TestMeanPoolRaggedTokensReturnsError exercises meanPool directly with tokens +// of differing length. +func TestMeanPoolRaggedTokensReturnsError(t *testing.T) { + tokens := []any{[]any{0.1, 0.2}, []any{0.3}} + if _, err := meanPool(tokens); err == nil { + t.Fatal("expected error for ragged token rows") + } +} diff --git a/sdks/go/cv/pack.go b/sdks/go/cv/pack.go index 72e29ac..dbf61c7 100644 --- a/sdks/go/cv/pack.go +++ b/sdks/go/cv/pack.go @@ -1,209 +1,23 @@ package cv -import ( - "crypto/sha256" - "encoding/hex" - "encoding/json" - "fmt" - "time" +import "errors" + +// ErrPackNotImplemented is returned by Pack in the Go SDK v0.1. The Go SDK +// ships as a reader (extract / inspect / validate / search); the writer that +// embeds /AF payloads and the cv: XMP packet without corrupting the PDF page +// tree is planned for v0.2 (see ROADMAP Phase 1.7-1.8). Refusing up front is +// the honest behaviour: the previous pdfcpu WriteContext path silently dropped +// the newly added indirect objects, producing files that failed their own +// validator, and panicked on minimal input PDFs. +var ErrPackNotImplemented = errors.New( + "cv: Pack (writer) is not implemented in the Go SDK v0.1; use the JS or Python SDK to create .cv files (Go writer is planned for v0.2)", ) // Pack builds a .cv file from the input PDF and one or more representations. +// +// Not implemented in the Go SDK v0.1: this always returns ErrPackNotImplemented +// without mutating any PDF, so it can never emit a corrupt file or panic. The +// signature is kept stable for the v0.2 writer. func Pack(in PackInput) ([]byte, error) { - payloads, err := collectPayloads(in) - if err != nil { - return nil, err - } - if len(payloads) == 0 { - return nil, fmt.Errorf("at least one payload (markdown, html, json, embeddings, or payloads) is required") - } - - primaryLanguage := in.Metadata.PrimaryLanguage - if primaryLanguage == "" { - return nil, fmt.Errorf("metadata.PrimaryLanguage is required") - } - - primaryPayload := in.Metadata.PrimaryPayload - if primaryPayload == "" { - primaryPayload = defaultPrimary(payloads) - } - if !containsPayload(payloads, primaryPayload) { - return nil, fmt.Errorf("primary payload %q not found among payloads", primaryPayload) - } - - created := in.Metadata.Created - if created.IsZero() { - created = time.Now().UTC() - } - modified := in.Metadata.Modified - if modified.IsZero() { - modified = created - } - - integrity := in.Metadata.Integrity - if integrity == "" { - integrity = "sha-256" - } - - var integrityList []IntegrityEntry - if integrity == "sha-256" { - for _, p := range payloads { - h := sha256.Sum256(p.Data) - integrityList = append(integrityList, IntegrityEntry{ - Payload: p.Name, - Algorithm: "sha-256", - Digest: hex.EncodeToString(h[:]), - }) - } - } - - ctx, err := loadContext(in.PDF) - if err != nil { - return nil, err - } - - for _, p := range payloads { - desc := p.Description - if desc == "" { - desc = defaultDescription(p) - } - rel := p.Relationship - if rel == "" { - rel = RelAlternative - } - if err := addAssociatedFile(ctx, p.Name, p.Data, p.MimeType, desc, rel, created, modified); err != nil { - return nil, err - } - } - - var alternates []AlternateMeta - for _, p := range payloads { - if p.Name == primaryPayload { - continue - } - rel := p.Relationship - if rel == "" { - rel = RelAlternative - } - if rel != RelAlternative { - continue - } - lang := p.Language - if lang == "" { - lang = primaryLanguage - } - alternates = append(alternates, AlternateMeta{ - Payload: p.Name, - Language: lang, - MimeType: p.MimeType, - }) - } - - meta := Metadata{ - Version: SpecVersion, - PrimaryLanguage: primaryLanguage, - PrimaryPayload: primaryPayload, - Created: created, - Modified: modified, - Generator: in.Metadata.Generator, - } - xmp := buildXMP(meta, alternates, integrityList, in.Metadata.Embeddings) - if err := setMetadataXML(ctx, xmp); err != nil { - return nil, err - } - - return writeContext(ctx) -} - -func collectPayloads(in PackInput) ([]Payload, error) { - var out []Payload - if len(in.Markdown) > 0 { - out = append(out, Payload{ - Data: in.Markdown, - Name: NameMarkdown, - MimeType: MimeMarkdown, - Relationship: RelAlternative, - }) - } - if len(in.HTML) > 0 { - out = append(out, Payload{ - Data: in.HTML, - Name: NameHTML, - MimeType: MimeHTML, - Relationship: RelAlternative, - }) - } - if in.JSON != nil { - body, err := json.MarshalIndent(in.JSON, "", " ") - if err != nil { - return nil, fmt.Errorf("marshal json payload: %w", err) - } - out = append(out, Payload{ - Data: body, - Name: NameJSON, - MimeType: MimeJSON, - Relationship: RelAlternative, - }) - } - if len(in.Embeddings) > 0 { - out = append(out, Payload{ - Data: in.Embeddings, - Name: NameEmbeddings, - MimeType: MimeEmbeddings, - Relationship: RelData, - }) - } - out = append(out, in.Payloads...) - - seen := make(map[string]bool, len(out)) - for _, p := range out { - if seen[p.Name] { - return nil, fmt.Errorf("duplicate payload name: %s", p.Name) - } - seen[p.Name] = true - } - return out, nil -} - -func defaultPrimary(payloads []Payload) string { - for _, p := range payloads { - if p.Name == NameMarkdown { - return NameMarkdown - } - } - for _, p := range payloads { - if p.Name == NameHTML { - return NameHTML - } - } - for _, p := range payloads { - rel := p.Relationship - if rel == "" || rel == RelAlternative { - return p.Name - } - } - return payloads[0].Name -} - -func defaultDescription(p Payload) string { - switch p.MimeType { - case MimeMarkdown: - return "Markdown representation" - case MimeHTML: - return "HTML representation" - case MimeJSON: - return "JSON Resume representation" - case MimeEmbeddings: - return "Pre-computed embeddings" - } - return p.Name -} - -func containsPayload(payloads []Payload, name string) bool { - for _, p := range payloads { - if p.Name == name { - return true - } - } - return false + return nil, ErrPackNotImplemented } diff --git a/sdks/go/cv/pdf.go b/sdks/go/cv/pdf.go index 415e822..a40b60f 100644 --- a/sdks/go/cv/pdf.go +++ b/sdks/go/cv/pdf.go @@ -2,11 +2,8 @@ package cv import ( "bytes" - "crypto/rand" - "encoding/hex" "fmt" "strings" - "time" "github.com/pdfcpu/pdfcpu/pkg/api" "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model" @@ -24,110 +21,6 @@ func loadContext(pdfBytes []byte) (*model.Context, error) { return ctx, nil } -// writeContext serializes a pdfcpu Context to bytes. -func writeContext(ctx *model.Context) ([]byte, error) { - ensureTrailerID(ctx) - var buf bytes.Buffer - if err := api.WriteContext(ctx, &buf); err != nil { - return nil, fmt.Errorf("write PDF: %w", err) - } - return buf.Bytes(), nil -} - -// addAssociatedFile attaches a payload to the catalog's /AF array. -func addAssociatedFile( - ctx *model.Context, - name string, - data []byte, - mimeType string, - description string, - relationship AFRelationship, - created, modified time.Time, -) error { - xRef := ctx.XRefTable - - streamRef, err := xRef.NewEmbeddedStreamDict(bytes.NewReader(data), modified) - if err != nil { - return fmt.Errorf("new embedded stream: %w", err) - } - - if err := mutateStreamDict(xRef, streamRef, func(d pdfTypes.Dict) { - d.InsertName("Subtype", mimeType) - d.Insert("Params", pdfTypes.Dict{ - "CreationDate": pdfTypes.StringLiteral(pdfDate(created)), - "ModDate": pdfTypes.StringLiteral(pdfDate(modified)), - "Size": pdfTypes.Integer(len(data)), - }) - }); err != nil { - return err - } - - filespec := pdfTypes.Dict{ - "Type": pdfTypes.Name("Filespec"), - "F": pdfTypes.StringLiteral(name), - "UF": pdfTypes.StringLiteral(name), - "Desc": pdfTypes.StringLiteral(description), - "AFRelationship": pdfTypes.Name(string(relationship)), - "EF": pdfTypes.Dict{ - "F": *streamRef, - "UF": *streamRef, - }, - } - filespecRef, err := xRef.IndRefForNewObject(filespec) - if err != nil { - return fmt.Errorf("ind ref for filespec: %w", err) - } - - rootDict, err := xRef.Catalog() - if err != nil { - return fmt.Errorf("get catalog: %w", err) - } - - af, ok := rootDict.Find("AF") - if !ok { - rootDict.Insert("AF", pdfTypes.Array{*filespecRef}) - return nil - } - obj, err := xRef.Dereference(af) - if err != nil { - return fmt.Errorf("deref AF: %w", err) - } - arr, ok := obj.(pdfTypes.Array) - if !ok { - return fmt.Errorf("/AF is not an array") - } - arr = append(arr, *filespecRef) - rootDict.Update("AF", arr) - return nil -} - -// setMetadataXML installs the cv: XMP packet on the catalog's /Metadata stream. -func setMetadataXML(ctx *model.Context, xml string) error { - xRef := ctx.XRefTable - rootDict, err := xRef.Catalog() - if err != nil { - return err - } - - streamRef, err := xRef.NewEmbeddedStreamDict(strings.NewReader(xml), time.Now()) - if err != nil { - return err - } - if err := mutateStreamDict(xRef, streamRef, func(d pdfTypes.Dict) { - d.InsertName("Type", "Metadata") - d.InsertName("Subtype", "XML") - }); err != nil { - return err - } - - if _, ok := rootDict.Find("Metadata"); ok { - rootDict.Update("Metadata", *streamRef) - } else { - rootDict.Insert("Metadata", *streamRef) - } - return nil -} - // readMetadataXML returns the catalog's /Metadata stream as a string. func readMetadataXML(ctx *model.Context) (string, error) { rootDict, err := ctx.XRefTable.Catalog() @@ -258,23 +151,6 @@ func parseFilespec(ctx *model.Context, fs pdfTypes.Dict) (*rawPayload, error) { }, nil } -// mutateStreamDict resolves an indirect reference to a stream and mutates its -// dictionary in-place via the supplied callback. Re-stores the modified stream -// back into the xref table. -func mutateStreamDict(xRef *model.XRefTable, ref *pdfTypes.IndirectRef, fn func(pdfTypes.Dict)) error { - entry, ok := xRef.FindTableEntryForIndRef(ref) - if !ok || entry == nil { - return fmt.Errorf("xref entry not found for %v", ref) - } - sd, ok := entry.Object.(pdfTypes.StreamDict) - if !ok { - return fmt.Errorf("indirect ref %v is not a StreamDict", ref) - } - fn(sd.Dict) - entry.Object = sd - return nil -} - func stringValue(d pdfTypes.Dict, key string) string { v, ok := d.Find(key) if !ok { @@ -372,20 +248,3 @@ func decodePDFString(s string) string { } return b.String() } - -func pdfDate(t time.Time) string { - return t.UTC().Format("D:20060102150405Z") -} - -func ensureTrailerID(ctx *model.Context) { - if len(ctx.ID) > 0 { - return - } - id := make([]byte, 16) - _, _ = rand.Read(id) - hexID := strings.ToUpper(hex.EncodeToString(id)) - ctx.ID = pdfTypes.Array{ - pdfTypes.HexLiteral(hexID), - pdfTypes.HexLiteral(hexID), - } -} diff --git a/sdks/go/cv/search.go b/sdks/go/cv/search.go index 6133c65..b6aabd6 100644 --- a/sdks/go/cv/search.go +++ b/sdks/go/cv/search.go @@ -43,6 +43,11 @@ func SearchSemantic(payload *EmbeddingsPayload, queryVector []float32, opts Sear hits := make([]SearchHit, 0, len(space.Chunks)) for _, c := range space.Chunks { + // Skip chunks whose vector does not match the space dimension: scoring + // them would index out of range against the query vector. + if len(c.Vector) != space.Dimension { + continue + } hits = append(hits, SearchHit{ SpaceModel: space.Model, ChunkID: c.ID, diff --git a/sdks/go/cv/search_test.go b/sdks/go/cv/search_test.go index 097f6c3..2310ef4 100644 --- a/sdks/go/cv/search_test.go +++ b/sdks/go/cv/search_test.go @@ -5,6 +5,33 @@ import ( "testing" ) +// TestSearchSemanticMismatchedChunkVectorDoesNotPanic constructs a payload whose +// chunks include a vector shorter than the space dimension. SearchSemantic must +// skip it instead of panicking with index out of range. +func TestSearchSemanticMismatchedChunkVectorDoesNotPanic(t *testing.T) { + payload := &EmbeddingsPayload{ + FormatVersion: 1, + Spaces: []EmbeddingSpace{ + { + Model: "test/model", + Dimension: 3, + Metric: "cosine", + Chunks: []EmbeddingChunk{ + {ID: "good", Vector: []float32{0.1, 0.2, 0.3}}, + {ID: "ragged", Vector: []float32{0.5}}, // wrong length + }, + }, + }, + } + hits, err := SearchSemantic(payload, []float32{1, 0, 0}, SearchOptions{K: 5}) + if err != nil { + t.Fatalf("SearchSemantic: %v", err) + } + if len(hits) != 1 || hits[0].ChunkID != "good" { + t.Errorf("expected only the well-sized chunk, got %+v", hits) + } +} + // TestSearchSemanticOnRealBGEM3 exercises the full path the cv search CLI // takes: open a .cv with real BGE-M3 embeddings, decode, embed a query via // the HF Inference API, rank chunks. Skips when fixture or HF_TOKEN is missing. diff --git a/sdks/go/cv/security.go b/sdks/go/cv/security.go index 1b6f019..7dba339 100644 --- a/sdks/go/cv/security.go +++ b/sdks/go/cv/security.go @@ -8,9 +8,13 @@ import ( pdfTypes "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/types" ) -// scanForbiddenConstructs walks every indirect object in the xref table and -// reports constructs prohibited by .cv spec §3.4. Error codes match the JS -// and Python SDKs so cross-language tests share expectations. +// scanForbiddenConstructs walks the object graph from the catalog (resolving +// indirect refs and descending into every dict/array value) and reports +// constructs prohibited by .cv spec §3.4. Walking the graph rather than just +// the xref table means forbidden actions stored as DIRECT (inline) children — +// e.g. a catalog /OpenAction << /S /JavaScript /JS (...) >> — are still caught. +// Error codes match the JS and Python SDKs so cross-language tests share +// expectations. This mirrors the Python _security.py implementation. func scanForbiddenConstructs(ctx *model.Context) []ValidationIssue { var issues []ValidationIssue @@ -22,22 +26,52 @@ func scanForbiddenConstructs(ctx *model.Context) []ValidationIssue { }) } - for _, entry := range ctx.Table { - if entry == nil || entry.Object == nil { - continue + root, err := ctx.Catalog() + if err == nil && root != nil { + seen := map[int]struct{}{} + walkSecurityObject(ctx, root, seen, &issues) + } + + return dedupeIssues(issues) +} + +// walkSecurityObject recursively descends an object, resolving indirect refs. +// The visited set is keyed by indirect-object number to avoid cycles; direct +// (inline) dicts and arrays are always descended. +func walkSecurityObject(ctx *model.Context, obj pdfTypes.Object, seen map[int]struct{}, issues *[]ValidationIssue) { + if obj == nil { + return + } + if ref, ok := obj.(pdfTypes.IndirectRef); ok { + num := ref.ObjectNumber.Value() + if _, done := seen[num]; done { + return } - dict, ok := entry.Object.(pdfTypes.Dict) - if !ok { - if sd, isStream := entry.Object.(pdfTypes.StreamDict); isStream { - dict = sd.Dict - } else { - continue - } + seen[num] = struct{}{} + resolved, err := ctx.Dereference(ref) + if err != nil { + return } - inspectSecurityDict(ctx, dict, &issues) + walkSecurityObject(ctx, resolved, seen, issues) + return } - return dedupeIssues(issues) + switch v := obj.(type) { + case pdfTypes.Dict: + inspectSecurityDict(ctx, v, issues) + for _, value := range v { + walkSecurityObject(ctx, value, seen, issues) + } + case pdfTypes.StreamDict: + inspectSecurityDict(ctx, v.Dict, issues) + for _, value := range v.Dict { + walkSecurityObject(ctx, value, seen, issues) + } + case pdfTypes.Array: + for _, item := range v { + walkSecurityObject(ctx, item, seen, issues) + } + } } func inspectSecurityDict(ctx *model.Context, d pdfTypes.Dict, issues *[]ValidationIssue) { diff --git a/sdks/go/cv/security_test.go b/sdks/go/cv/security_test.go index 99ae05b..8061934 100644 --- a/sdks/go/cv/security_test.go +++ b/sdks/go/cv/security_test.go @@ -5,6 +5,8 @@ import ( "os" "path/filepath" "testing" + + "github.com/pdfcpu/pdfcpu/pkg/pdfcpu/model" ) type maliciousFixture struct { @@ -57,6 +59,43 @@ func TestSecurityRejectsMaliciousFixtures(t *testing.T) { } } +// mustLoadInlineJSContext returns a context for a PDF that carries a forbidden +// JavaScript action as a DIRECT (inline) child of the catalog's /OpenAction. +// The old xref-only scan never looked inside inline dicts, so this slipped +// through; the graph walk catches it. +func mustLoadInlineJSContext(t *testing.T) *model.Context { + t.Helper() + pdf := []byte("%PDF-1.7\n" + + "1 0 obj<>>>endobj\n" + + "2 0 obj<>endobj\n" + + "3 0 obj<>/ProcSet[/PDF/Text]>>>>endobj\n" + + "xref\n0 4\n0000000000 65535 f \n0000000009 00000 n \n0000000089 00000 n \n0000000134 00000 n \n" + + "trailer<>\nstartxref\n229\n%%EOF\n") + ctx, err := loadContext(pdf) + if err != nil { + t.Fatalf("loadContext: %v", err) + } + return ctx +} + +// TestSecurityCatchesInlineOpenActionJS asserts that a JavaScript action stored +// as a DIRECT (inline) catalog /OpenAction is rejected with javascript-action. +func TestSecurityCatchesInlineOpenActionJS(t *testing.T) { + ctx := mustLoadInlineJSContext(t) + issues := scanForbiddenConstructs(ctx) + found := false + codes := make([]string, 0, len(issues)) + for _, i := range issues { + codes = append(codes, i.Code) + if i.Code == "javascript-action" { + found = true + } + } + if !found { + t.Errorf("inline /OpenAction JavaScript not rejected; got codes %v", codes) + } +} + func TestSecurityPayloadSizeCap(t *testing.T) { fixture := repoFixturePath("packages/sdk-js/examples/out/jane-doe.cv") data, err := os.ReadFile(fixture) diff --git a/sdks/go/cv/validate.go b/sdks/go/cv/validate.go index 4b55b8a..f57ba86 100644 --- a/sdks/go/cv/validate.go +++ b/sdks/go/cv/validate.go @@ -1,15 +1,24 @@ package cv import ( - "bytes" "crypto/sha256" "encoding/hex" + "errors" "fmt" + "strconv" + "strings" + + "github.com/pdfcpu/pdfcpu/pkg/pdfcpu" ) // DefaultMaxPayloadBytes is the per-payload decompressed-byte cap (spec §7.3). const DefaultMaxPayloadBytes = 16 * 1024 * 1024 +// knownMaxMajor is the highest cv: format MAJOR this SDK was built to read. +// Both "0.1" and "1.0" are accepted; a file declaring major >= 2 triggers a +// warning (spec §8.3) but is still extracted. +const knownMaxMajor = 1 + // ValidateOptions controls the strictness of validation. type ValidateOptions struct { Strict bool @@ -29,21 +38,21 @@ func Validate(data []byte, opts ValidateOptions) *ValidationReport { } var issues []ValidationIssue - if looksEncrypted(data) { - issues = append(issues, ValidationIssue{ - Code: "encrypted-document", - Level: "error", - Message: "Trailer declares /Encrypt; encryption is forbidden in cv 0.x (spec §3.4)", - }) - return &ValidationReport{OK: false, Level: level, Issues: issues} - } - ctx, err := loadContext(data) if err != nil { + // pdfcpu refuses to build a context for an encrypted document and tells + // us so via its parse error; classify that as the spec-§3.4 encryption + // rejection rather than a generic parse failure. + code := "pdf-parse-failed" + msg := err.Error() + if isEncryptionError(err) { + code = "encrypted-document" + msg = "Trailer declares /Encrypt; encryption is forbidden in cv 0.x (spec §3.4)" + } issues = append(issues, ValidationIssue{ - Code: "pdf-parse-failed", + Code: code, Level: "error", - Message: err.Error(), + Message: msg, }) return &ValidationReport{OK: false, Level: level, Issues: issues} } @@ -70,6 +79,17 @@ func Validate(data []byte, opts ValidateOptions) *ValidationReport { return &ValidationReport{OK: false, Level: level, Issues: issues} } + // Spec §8.3: accept files from the same MAJOR line (this SDK knows 0.x and + // 1.0), but warn when the file declares a newer MAJOR we were not built for. + // Extraction continues regardless; this is informational only. + if major, ok := majorVersion(meta.Version); ok && major > knownMaxMajor { + issues = append(issues, ValidationIssue{ + Code: "newer-format-version", + Level: "warning", + Message: fmt.Sprintf("File declares cv:version %q (major %d); this SDK knows up to major %d (spec §8.3)", meta.Version, major, knownMaxMajor), + }) + } + rawList, err := readAssociatedFiles(ctx) if err != nil { issues = append(issues, ValidationIssue{ @@ -168,25 +188,36 @@ func Validate(data []byte, opts ValidateOptions) *ValidationReport { return &ValidationReport{OK: ok, Level: level, Issues: issues} } -var encryptToken = []byte("/Encrypt") - -// looksEncrypted is a byte-level pre-check on the trailer region: pdfcpu can -// load encrypted PDFs but our policy is to refuse them outright with the -// documented spec-§3.4 code regardless of parser behaviour. -func looksEncrypted(data []byte) bool { - tail := data - if len(data) > 4096 { - tail = data[len(data)-4096:] - } - idx := bytes.Index(tail, encryptToken) - if idx < 0 { +// isEncryptionError reports whether a pdfcpu read error stems from the document +// being encrypted. pdfcpu signals this through its parse error (it has no clean +// exported sentinel for the malformed-encryption case), so we match the stable +// "encryption" wording it uses. This keys off the parser's own diagnosis, not a +// byte scan of payload content, so it cannot false-positive on a payload that +// merely contains the literal "/Encrypt". +func isEncryptionError(err error) bool { + if err == nil { return false } - // Confirm word boundary (next byte is space, slash, newline, or EOF). - end := idx + len(encryptToken) - if end >= len(tail) { + if errors.Is(err, pdfcpu.ErrWrongPassword) || errors.Is(err, pdfcpu.ErrUnknownEncryption) { return true } - c := tail[end] - return c == ' ' || c == '\n' || c == '\r' || c == '\t' || c == '/' || c == '<' || c == '[' + return strings.Contains(strings.ToLower(err.Error()), "encryption") +} + +// majorVersion parses the MAJOR component of a "MAJOR.MINOR" cv:version string. +// Returns (0, false) when the string is empty or the major is not an integer. +func majorVersion(version string) (int, bool) { + version = strings.TrimSpace(version) + if version == "" { + return 0, false + } + majorStr := version + if i := strings.IndexByte(version, '.'); i >= 0 { + majorStr = version[:i] + } + major, err := strconv.Atoi(majorStr) + if err != nil { + return 0, false + } + return major, true } diff --git a/sdks/go/cv/validate_test.go b/sdks/go/cv/validate_test.go new file mode 100644 index 0000000..4148b21 --- /dev/null +++ b/sdks/go/cv/validate_test.go @@ -0,0 +1,38 @@ +package cv + +import "testing" + +func TestMajorVersion(t *testing.T) { + cases := []struct { + in string + wantMajor int + wantOK bool + }{ + {"0.1", 0, true}, + {"1.0", 1, true}, + {"2.0", 2, true}, + {"10.3", 10, true}, + {"3", 3, true}, + {"", 0, false}, + {"x.y", 0, false}, + } + for _, c := range cases { + major, ok := majorVersion(c.in) + if ok != c.wantOK || major != c.wantMajor { + t.Errorf("majorVersion(%q) = (%d, %t), want (%d, %t)", c.in, major, ok, c.wantMajor, c.wantOK) + } + } +} + +// TestNewerFormatVersionThreshold confirms the warning fires only for majors the +// SDK was not built for: 0.x and 1.0 are known, major >= 2 is newer. +func TestNewerFormatVersionThreshold(t *testing.T) { + for _, v := range []string{"0.1", "1.0"} { + if major, ok := majorVersion(v); ok && major > knownMaxMajor { + t.Errorf("version %q should be treated as known (major %d <= %d)", v, major, knownMaxMajor) + } + } + if major, ok := majorVersion("2.0"); !ok || major <= knownMaxMajor { + t.Errorf("version 2.0 should be flagged as newer (major %d > %d)", major, knownMaxMajor) + } +} diff --git a/sdks/go/middleware/conneg.go b/sdks/go/middleware/conneg.go index 425c255..1c2d4cc 100644 --- a/sdks/go/middleware/conneg.go +++ b/sdks/go/middleware/conneg.go @@ -39,12 +39,12 @@ type NegotiateInput struct { } var formatByMIME = map[string]ServeFormat{ - "text/markdown": FormatMarkdown, - "text/x-markdown": FormatMarkdown, - "text/html": FormatHTML, - "application/xhtml+xml": FormatHTML, - "application/pdf": FormatPDF, - "application/vnd.cv+pdf": FormatPDF, + "text/markdown": FormatMarkdown, + "text/x-markdown": FormatMarkdown, + "text/html": FormatHTML, + "application/xhtml+xml": FormatHTML, + "application/pdf": FormatPDF, + "application/vnd.cv+pdf": FormatPDF, } var formatByQuery = map[string]ServeFormat{ From 188eccce0926ea22171a472a296c980d7293437c Mon Sep 17 00:00:00 2001 From: Ilan Date: Thu, 28 May 2026 23:35:58 +0300 Subject: [PATCH 2/8] fix(sdk-js): close inline-action validator gap, add CheckSum, validate filenames and decode path * scanForbiddenConstructs now recursively walks the catalog/trailer object graph (resolving refs, visited set) instead of only enumerating indirect objects, so inline /OpenAction JavaScript and similar are caught. New fixes.test.ts injects an inline action and asserts rejection. * Embedded-file /Params now carries the spec-mandated /CheckSum (md5 of the unwrapped payload bytes) via a new md5 in digest.ts; Info and Params dates are written with a UTC offset to match the XMP dateTime (PDF/A-3). * pack() rejects non-portable payload names (outside [A-Za-z0-9._/-], or with . / .. segments); validate() flags filename-not-portable on read. * CBOR decode (fromCborSpace) now applies the same scalar validation as the encode path so readers get the same guarantees against malformed input. * decodeStream rejects unsupported /DecodeParms predictors instead of inflating to garbage. * validate emits newer-format-version warning for MAJOR >= 2 (accepts 0.1/1.0). * Removed the last-4KiB /Encrypt byte prefilter; rely on the parsed trailerInfo.Encrypt check (parse with ignoreEncryption). --- packages/sdk-js/src/digest.ts | 91 ++++++++++ packages/sdk-js/src/embeddings.ts | 58 +++++-- packages/sdk-js/src/pack.ts | 106 +++++++++++- packages/sdk-js/src/pdf.ts | 49 +++++- packages/sdk-js/src/security.ts | 110 +++++++----- packages/sdk-js/src/validate.ts | 69 ++++++-- packages/sdk-js/tests/fixes.test.ts | 256 ++++++++++++++++++++++++++++ 7 files changed, 663 insertions(+), 76 deletions(-) create mode 100644 packages/sdk-js/tests/fixes.test.ts diff --git a/packages/sdk-js/src/digest.ts b/packages/sdk-js/src/digest.ts index 43051a1..b435c8e 100644 --- a/packages/sdk-js/src/digest.ts +++ b/packages/sdk-js/src/digest.ts @@ -16,3 +16,94 @@ function bytesToHex(bytes: Uint8Array): string { } return hex; } + +/** + * MD5 of the given bytes, returned as a lowercase hex string. + * + * MD5 is cryptographically broken and is used here ONLY because the PDF + * specification mandates an MD5 /CheckSum on embedded-file /Params (spec §4.1); + * Web Crypto does not expose MD5, so we provide a small RFC 1321 implementation. + */ +export function md5Hex(bytes: Uint8Array): string { + const digest = md5(bytes); + return bytesToHex(digest); +} + +const S = [ + 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, + 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, + 21, +]; + +const K = (() => { + const out = new Uint32Array(64); + for (let i = 0; i < 64; i += 1) { + out[i] = Math.floor(Math.abs(Math.sin(i + 1)) * 2 ** 32) >>> 0; + } + return out; +})(); + +function md5(input: Uint8Array): Uint8Array { + const originalBitLen = input.length * 8; + const padLen = ((input.length + 8) >> 6) + 1; // number of 64-byte blocks + const msg = new Uint8Array(padLen * 64); + msg.set(input); + msg[input.length] = 0x80; + const view = new DataView(msg.buffer); + view.setUint32(msg.length - 8, originalBitLen >>> 0, true); + view.setUint32(msg.length - 4, Math.floor(originalBitLen / 2 ** 32) >>> 0, true); + + let a0 = 0x67452301; + let b0 = 0xefcdab89; + let c0 = 0x98badcfe; + let d0 = 0x10325476; + + const m = new Uint32Array(16); + for (let off = 0; off < msg.length; off += 64) { + for (let i = 0; i < 16; i += 1) { + m[i] = view.getUint32(off + i * 4, true); + } + let a = a0; + let b = b0; + let c = c0; + let d = d0; + for (let i = 0; i < 64; i += 1) { + let f: number; + let g: number; + if (i < 16) { + f = (b & c) | (~b & d); + g = i; + } else if (i < 32) { + f = (d & b) | (~d & c); + g = (5 * i + 1) % 16; + } else if (i < 48) { + f = b ^ c ^ d; + g = (3 * i + 5) % 16; + } else { + f = c ^ (b | ~d); + g = (7 * i) % 16; + } + f = (f + a + K[i]! + m[g]!) >>> 0; + a = d; + d = c; + c = b; + b = (b + rotl(f, S[i]!)) >>> 0; + } + a0 = (a0 + a) >>> 0; + b0 = (b0 + b) >>> 0; + c0 = (c0 + c) >>> 0; + d0 = (d0 + d) >>> 0; + } + + const out = new Uint8Array(16); + const outView = new DataView(out.buffer); + outView.setUint32(0, a0, true); + outView.setUint32(4, b0, true); + outView.setUint32(8, c0, true); + outView.setUint32(12, d0, true); + return out; +} + +function rotl(x: number, c: number): number { + return ((x << c) | (x >>> (32 - c))) >>> 0; +} diff --git a/packages/sdk-js/src/embeddings.ts b/packages/sdk-js/src/embeddings.ts index 717dfb2..a860e79 100644 --- a/packages/sdk-js/src/embeddings.ts +++ b/packages/sdk-js/src/embeddings.ts @@ -96,9 +96,26 @@ function toCborSpace(space: EmbeddingSpace): CborSpace { } function fromCborSpace(raw: CborSpace): EmbeddingSpace { - if (typeof raw.dimension !== 'number' || raw.dimension <= 0) { - throw new Error('Invalid embedding space: dimension must be a positive integer'); + if (typeof raw !== 'object' || raw === null) { + throw new Error('Invalid embedding space: not a map'); } + if (typeof raw.normalized !== 'boolean') { + throw new Error(`Invalid embedding space "${raw.model}": normalized must be a boolean`); + } + if (!Array.isArray(raw.chunks)) { + throw new Error(`Invalid embedding space "${raw.model}": chunks must be an array`); + } + // Mirror the encode-side guarantees (validateSpace) so attacker-supplied CBOR + // cannot smuggle untyped model/metric/chunking/dimension values past readers. + // Scalars are checked before decoding chunks so the dimension used for vector + // length checks is known-valid. + validateScalars({ + model: raw.model, + modelRevision: raw['model-revision'], + dimension: raw.dimension, + metric: raw.metric, + chunking: raw.chunking, + }); return { model: raw.model, modelRevision: raw['model-revision'], @@ -140,19 +157,40 @@ function fromCborChunk(raw: CborChunk, dimension: number): EmbeddingChunk { } function validateSpace(space: EmbeddingSpace): void { - if (!space.model) throw new Error('Embedding space missing model'); - if (!space.modelRevision) throw new Error(`Embedding space "${space.model}" missing modelRevision`); - if (!Number.isInteger(space.dimension) || space.dimension <= 0) { + validateScalars({ + model: space.model, + modelRevision: space.modelRevision, + dimension: space.dimension, + metric: space.metric, + chunking: space.chunking, + }); + if (!Array.isArray(space.chunks) || space.chunks.length === 0) { + throw new Error(`Embedding space "${space.model}" must contain at least one chunk`); + } +} + +interface SpaceScalars { + model: unknown; + modelRevision: unknown; + dimension: unknown; + metric: unknown; + chunking: unknown; +} + +/** Validates the scalar header fields shared by the encode and decode paths. */ +function validateScalars(space: SpaceScalars): void { + if (!space.model || typeof space.model !== 'string') throw new Error('Embedding space missing model'); + if (!space.modelRevision || typeof space.modelRevision !== 'string') { + throw new Error(`Embedding space "${space.model}" missing modelRevision`); + } + if (!Number.isInteger(space.dimension) || (space.dimension as number) <= 0) { throw new Error(`Embedding space "${space.model}" dimension must be a positive integer`); } if (space.metric !== 'cosine' && space.metric !== 'dot' && space.metric !== 'euclidean') { - throw new Error(`Embedding space "${space.model}" has invalid metric "${space.metric}"`); + throw new Error(`Embedding space "${space.model}" has invalid metric "${String(space.metric)}"`); } if (space.chunking !== 'document' && space.chunking !== 'section' && space.chunking !== 'paragraph') { - throw new Error(`Embedding space "${space.model}" has invalid chunking "${space.chunking}"`); - } - if (!Array.isArray(space.chunks) || space.chunks.length === 0) { - throw new Error(`Embedding space "${space.model}" must contain at least one chunk`); + throw new Error(`Embedding space "${space.model}" has invalid chunking "${String(space.chunking)}"`); } } diff --git a/packages/sdk-js/src/pack.ts b/packages/sdk-js/src/pack.ts index 6136e7c..41b0266 100644 --- a/packages/sdk-js/src/pack.ts +++ b/packages/sdk-js/src/pack.ts @@ -1,6 +1,6 @@ -import { AFRelationship, PDFArray, PDFDocument, PDFHexString, PDFName, PDFString } from 'pdf-lib'; +import { AFRelationship, PDFArray, PDFDict, PDFDocument, PDFHexString, PDFName, PDFStream, PDFString } from 'pdf-lib'; import { CV_SPEC_VERSION, DEFAULT_GENERATOR, DEFAULT_PAYLOAD_NAMES, PAYLOAD_MIME_TYPES } from './constants.js'; -import { sha256Hex } from './digest.js'; +import { md5Hex, sha256Hex } from './digest.js'; import { encodeEmbeddings, type EmbeddingsPayload } from './embeddings.js'; import { toBytes, toUint8Array } from './normalize.js'; import { setMetadataXml } from './pdf.js'; @@ -35,8 +35,10 @@ export async function pack(input: PackInput): Promise { } } + const payloadBytesByName = new Map(); for (const p of payloads) { const bytes = toBytes(p.data); + payloadBytesByName.set(p.name, bytes); pdfDoc.attach(bytes, p.name, { mimeType: p.mimeType, description: p.description ?? defaultDescription(p), @@ -72,9 +74,90 @@ export async function pack(input: PackInput): Promise { setTrailerId(pdfDoc); } + // Materialize the embedded-file streams so we can amend their /Params before + // serialization. flush() is idempotent (each embeddable guards re-embedding). + await pdfDoc.flush(); + setEmbeddedFileChecksums(pdfDoc, payloadBytesByName, created, modified); + setInfoDates(pdfDoc, created, modified); + return pdfDoc.save({ useObjectStreams: false }); } +const PARAMS = PDFName.of('Params'); +const CHECKSUM = PDFName.of('CheckSum'); +const CREATION_DATE = PDFName.of('CreationDate'); +const MOD_DATE = PDFName.of('ModDate'); + +/** + * Set the spec-mandated MD5 /CheckSum (spec §4.1) on each embedded-file + * stream's /Params, computed over the unwrapped payload bytes. pdf-lib emits + * /Size and /ModDate but never /CheckSum. Dates are rewritten with an explicit + * UTC offset so they agree with the XMP UTC dateTime (PDF/A-3 date hygiene). + */ +function setEmbeddedFileChecksums( + pdfDoc: PDFDocument, + payloadBytesByName: Map, + created: Date, + modified: Date, +): void { + const afRaw = pdfDoc.catalog.get(PDFName.of('AF')); + if (!afRaw) return; + const afArray = pdfDoc.context.lookup(afRaw, PDFArray); + + for (let i = 0; i < afArray.size(); i += 1) { + const filespec = pdfDoc.context.lookup(afArray.get(i), PDFDict); + const name = filespecName(filespec); + const efRaw = filespec.get(PDFName.of('EF')); + if (!efRaw) continue; + const efDict = pdfDoc.context.lookup(efRaw, PDFDict); + const streamRef = efDict.get(PDFName.of('F')) ?? efDict.get(PDFName.of('UF')); + const stream = pdfDoc.context.lookup(streamRef); + if (!(stream instanceof PDFStream)) continue; + + const bytes = name ? payloadBytesByName.get(name) : undefined; + if (!bytes) continue; + + let params = stream.dict.get(PARAMS); + if (!(params instanceof PDFDict)) { + params = pdfDoc.context.obj({}); + stream.dict.set(PARAMS, params); + } + const paramsDict = params as PDFDict; + paramsDict.set(CHECKSUM, PDFHexString.of(md5Hex(bytes))); + paramsDict.set(CREATION_DATE, PDFString.of(pdfDate(created))); + paramsDict.set(MOD_DATE, PDFString.of(pdfDate(modified))); + } +} + +function filespecName(filespec: PDFDict): string | undefined { + const uf = filespec.get(PDFName.of('UF')); + if (uf instanceof PDFHexString) return uf.decodeText(); + if (uf instanceof PDFString) return uf.asString(); + const f = filespec.get(PDFName.of('F')); + if (f instanceof PDFString) return f.asString(); + if (f instanceof PDFHexString) return f.decodeText(); + return undefined; +} + +/** + * Set the document Info CreationDate/ModDate to the cv created/modified values + * with an explicit UTC offset so they match the XMP UTC dateTime. + */ +function setInfoDates(pdfDoc: PDFDocument, created: Date, modified: Date): void { + const info = pdfDoc.context.lookup(pdfDoc.context.trailerInfo.Info, PDFDict); + info.set(CREATION_DATE, PDFString.of(pdfDate(created))); + info.set(MOD_DATE, PDFString.of(pdfDate(modified))); +} + +/** PDF date string in UTC with an explicit "+00'00'" offset (ISO 32000 §7.9.4). */ +function pdfDate(d: Date): string { + const p = (n: number, w = 2): string => String(n).padStart(w, '0'); + return ( + `D:${p(d.getUTCFullYear(), 4)}${p(d.getUTCMonth() + 1)}${p(d.getUTCDate())}` + + `${p(d.getUTCHours())}${p(d.getUTCMinutes())}${p(d.getUTCSeconds())}+00'00'` + ); +} + function addPdfaOutputIntent(pdfDoc: PDFDocument): void { const existing = pdfDoc.catalog.lookup(PDFName.of('OutputIntents')); if (existing instanceof PDFArray && existing.size() > 0) { @@ -169,6 +252,7 @@ function collectPayloads(input: PackInput): { payloads: Payload[]; embeddingSumm const seen = new Set(); for (const p of out) { + assertPortableName(p.name); if (seen.has(p.name)) { throw new Error(`Duplicate payload name: ${p.name}`); } @@ -177,6 +261,24 @@ function collectPayloads(input: PackInput): { payloads: Payload[]; embeddingSumm return { payloads: out, embeddingSummaries }; } +/** Matches the POSIX-portable filename charset required by spec §4.4. */ +export const PORTABLE_NAME_RE = /^[A-Za-z0-9._/-]+$/; + +/** + * Reject payload names that are not POSIX-portable (spec §4.4) or that contain + * "." / ".." path segments, which would allow path traversal on extraction. + */ +export function assertPortableName(name: string): void { + if (!PORTABLE_NAME_RE.test(name)) { + throw new Error(`Payload name "${name}" is not POSIX-portable; allowed charset is [A-Za-z0-9._/-] (spec §4.4)`); + } + for (const segment of name.split('/')) { + if (segment === '.' || segment === '..') { + throw new Error(`Payload name "${name}" contains a "${segment}" path segment (spec §4.4)`); + } + } +} + function resolveEmbeddings( input: PackInput['embeddings'], summaryOut: EmbeddingSpaceSummary[], diff --git a/packages/sdk-js/src/pdf.ts b/packages/sdk-js/src/pdf.ts index abb4f90..822b1cc 100644 --- a/packages/sdk-js/src/pdf.ts +++ b/packages/sdk-js/src/pdf.ts @@ -4,6 +4,7 @@ import { PDFDocument, PDFHexString, PDFName, + PDFNumber, PDFRawStream, PDFStream, PDFString, @@ -36,6 +37,10 @@ export async function loadDocument(bytes: Uint8Array): Promise { return PDFDocument.load(bytes, { updateMetadata: false, throwOnInvalidObject: false, + // Parse encrypted files instead of throwing, so the validator can surface + // the authoritative /Encrypt trailer entry as the documented spec-§3.4 + // error code rather than a generic parse failure. + ignoreEncryption: true, }); } @@ -217,9 +222,13 @@ function decodeStream(stream: PDFStream): Uint8Array | null { } } + const decodeParms = collectDecodeParms(stream.dict, filters.length); + let bytes: Uint8Array = raw; - for (const f of filters) { + for (let i = 0; i < filters.length; i += 1) { + const f = filters[i]!; if (f === 'FlateDecode') { + assertNoPredictor(decodeParms[i]); bytes = pako.inflate(bytes); } else { return null; @@ -228,6 +237,44 @@ function decodeStream(stream: PDFStream): Uint8Array | null { return bytes; } +const DECODE_PARMS_NAME = PDFName.of('DecodeParms'); +const PREDICTOR_NAME = PDFName.of('Predictor'); + +/** + * Returns the /DecodeParms dictionary that applies to each filter, in filter + * order. /DecodeParms may be a single dict (one filter) or an array aligned + * with /Filter; missing entries are undefined (no parameters). + */ +function collectDecodeParms(dict: PDFDict, filterCount: number): (PDFDict | undefined)[] { + const out: (PDFDict | undefined)[] = new Array(filterCount).fill(undefined); + const parms = dict.get(DECODE_PARMS_NAME); + if (parms instanceof PDFDict) { + out[0] = parms; + } else if (parms instanceof PDFArray) { + for (let i = 0; i < parms.size() && i < filterCount; i += 1) { + const entry = parms.get(i); + if (entry instanceof PDFDict) out[i] = entry; + } + } + return out; +} + +/** + * FlateDecode streams may declare a PNG/TIFF /Predictor in /DecodeParms. This + * SDK does not implement predictor reversal, so rather than silently returning + * garbage we reject such streams with a clear, actionable error. + */ +function assertNoPredictor(parms: PDFDict | undefined): void { + if (!parms) return; + const predictor = parms.get(PREDICTOR_NAME); + if (predictor instanceof PDFNumber && predictor.asNumber() > 1) { + throw new Error( + `Unsupported FlateDecode /DecodeParms /Predictor ${predictor.asNumber()}; ` + + 'PNG/TIFF predictors are not supported by this SDK', + ); + } +} + export function toExtractedPayload(raw: RawPayload, language?: string): ExtractedPayload { const out: ExtractedPayload = { name: raw.name, diff --git a/packages/sdk-js/src/security.ts b/packages/sdk-js/src/security.ts index 873b1d0..7b77f71 100644 --- a/packages/sdk-js/src/security.ts +++ b/packages/sdk-js/src/security.ts @@ -1,28 +1,27 @@ -import { PDFArray, PDFDict, PDFDocument, PDFHexString, PDFName, PDFObject, PDFString } from 'pdf-lib'; +import { PDFArray, PDFDict, PDFDocument, PDFHexString, PDFName, PDFObject, PDFRef, PDFString } from 'pdf-lib'; import type { ValidationIssue } from './types.js'; const TYPE = PDFName.of('Type'); -const SUBTYPE = PDFName.of('Subtype'); const S_KEY = PDFName.of('S'); const JS_KEY = PDFName.of('JS'); const JAVASCRIPT_KEY = PDFName.of('JavaScript'); const F_KEY = PDFName.of('F'); const UF_KEY = PDFName.of('UF'); const EF_KEY = PDFName.of('EF'); -const NAMES_KEY = PDFName.of('Names'); -const ACTION = PDFName.of('Action'); -const FILESPEC = PDFName.of('Filespec'); - -const SUBMIT_FORM = PDFName.of('SubmitForm'); -const LAUNCH = PDFName.of('Launch'); -const IMPORT_DATA = PDFName.of('ImportData'); -const JS_ACTION = PDFName.of('JavaScript'); +const SUBMIT_FORM = 'SubmitForm'; +const LAUNCH = 'Launch'; +const IMPORT_DATA = 'ImportData'; +const JS_ACTION = 'JavaScript'; /** - * Walk the entire indirect-object graph and report any construct prohibited - * by the .cv spec §3.4. Each rule maps to a stable error code so consumers - * can pattern-match without parsing free-text messages. + * Walk the entire object graph from the catalog and report any construct + * prohibited by the .cv spec §3.4. The walk descends through every PDFDict and + * PDFArray value, resolving indirect references, so that forbidden actions + * carried as DIRECT/inline children (e.g. catalog /OpenAction, page /Annots/A, + * /AA, AcroForm field actions) are caught as well as indirect ones. Each rule + * maps to a stable error code so consumers can pattern-match without parsing + * free-text messages. Mirrors the Python reference impl in _security.py. */ export function scanForbiddenConstructs(pdfDoc: PDFDocument): ValidationIssue[] { const issues: ValidationIssue[] = []; @@ -35,44 +34,58 @@ export function scanForbiddenConstructs(pdfDoc: PDFDocument): ValidationIssue[] }); } - for (const [, obj] of pdfDoc.context.enumerateIndirectObjects()) { - if (!(obj instanceof PDFDict)) continue; - inspectDict(pdfDoc, obj, issues); - } + const seen = new Set(); + walk(pdfDoc, pdfDoc.catalog, seen, issues); return dedupe(issues); } +function walk(pdfDoc: PDFDocument, value: PDFObject | undefined, seen: Set, issues: ValidationIssue[]): void { + const obj = resolve(pdfDoc, value); + if (obj === undefined || seen.has(obj)) return; + seen.add(obj); + + if (obj instanceof PDFDict) { + inspectDict(pdfDoc, obj, issues); + for (const [, child] of obj.entries()) { + walk(pdfDoc, child, seen, issues); + } + } else if (obj instanceof PDFArray) { + for (let i = 0; i < obj.size(); i += 1) { + walk(pdfDoc, obj.get(i), seen, issues); + } + } +} + function inspectDict(pdfDoc: PDFDocument, dict: PDFDict, issues: ValidationIssue[]): void { const type = nameOf(dict.get(TYPE)); + const subtype = nameOf(dict.get(S_KEY)); - if (type === 'Action' || dict.get(S_KEY) instanceof PDFName) { - inspectAction(pdfDoc, dict, issues); + if (type === 'Action' || subtype) { + inspectAction(pdfDoc, dict, subtype, issues); } if (type === 'Filespec') { inspectFilespec(dict, issues); } - // /Names tree for document-level JavaScript: catalog→/Names→/JavaScript - // surfaces as a dict with a JavaScript key whose entry is a name tree. - // Any presence of /JavaScript on a Names dict is forbidden. - const namesEntry = dict.get(NAMES_KEY); - if (dict.get(JAVASCRIPT_KEY) || (namesEntry instanceof PDFDict && namesEntry.get(JAVASCRIPT_KEY))) { - if (!issues.some((i) => i.code === 'javascript-names-tree')) { - issues.push({ - code: 'javascript-names-tree', - level: 'error', - message: 'Document declares /JavaScript names entries; JavaScript actions are forbidden (spec §3.4)', - }); - } + // A /JavaScript entry on any dict (catalog→/Names→/JavaScript name tree, or + // the leaf nodes thereof) signals document-level JavaScript, which is forbidden. + if (dict.get(JAVASCRIPT_KEY) !== undefined) { + issues.push({ + code: 'javascript-names-tree', + level: 'error', + message: 'Document declares /JavaScript names entries; JavaScript actions are forbidden (spec §3.4)', + }); } } -function inspectAction(pdfDoc: PDFDocument, dict: PDFDict, issues: ValidationIssue[]): void { - const subtype = dict.get(S_KEY); - if (!(subtype instanceof PDFName)) return; - +function inspectAction( + pdfDoc: PDFDocument, + dict: PDFDict, + subtype: string | undefined, + issues: ValidationIssue[], +): void { if (subtype === JS_ACTION || dict.get(JS_KEY) !== undefined) { issues.push({ code: 'javascript-action', @@ -101,7 +114,7 @@ function inspectAction(pdfDoc: PDFDocument, dict: PDFDict, issues: ValidationIss } if (subtype === SUBMIT_FORM) { - const fEntry = pdfDoc.context.lookup(dict.get(F_KEY)); + const fEntry = resolve(pdfDoc, dict.get(F_KEY)); const target = filespecTarget(fEntry); if (!target || !target.toLowerCase().startsWith('mailto:')) { issues.push({ @@ -119,16 +132,13 @@ function inspectFilespec(dict: PDFDict, issues: ValidationIssue[]): void { if (dict.get(EF_KEY) !== undefined) return; // No /EF means the filespec points outside the container. const target = filespecTarget(dict); - if (!issues.some((i) => i.code === 'external-filespec' && i.payload === target)) { - issues.push({ - code: 'external-filespec', - level: 'error', - message: target - ? `External /Filespec "${target}" (spec §3.4)` - : 'External /Filespec with no /EF (spec §3.4)', - payload: target, - }); - } + const issue: ValidationIssue = { + code: 'external-filespec', + level: 'error', + message: target ? `External /Filespec "${target}" (spec §3.4)` : 'External /Filespec with no /EF (spec §3.4)', + }; + if (target !== undefined) issue.payload = target; + issues.push(issue); } function filespecTarget(value: PDFObject | undefined): string | undefined { @@ -155,6 +165,14 @@ function filespecTarget(value: PDFObject | undefined): string | undefined { return undefined; } +function resolve(pdfDoc: PDFDocument, value: PDFObject | undefined): PDFObject | undefined { + if (value === undefined) return undefined; + if (value instanceof PDFRef) { + return pdfDoc.context.lookup(value) ?? undefined; + } + return value; +} + function nameOf(value: PDFObject | undefined): string | undefined { return value instanceof PDFName ? value.asString().slice(1) : undefined; } diff --git a/packages/sdk-js/src/validate.ts b/packages/sdk-js/src/validate.ts index b8e7c82..90b4b41 100644 --- a/packages/sdk-js/src/validate.ts +++ b/packages/sdk-js/src/validate.ts @@ -1,5 +1,7 @@ +import { CV_SPEC_VERSION } from './constants.js'; import { sha256Hex } from './digest.js'; import { toUint8Array } from './normalize.js'; +import { PORTABLE_NAME_RE } from './pack.js'; import { loadDocument, readAssociatedFiles, readMetadataXml } from './pdf.js'; import { scanForbiddenConstructs } from './security.js'; import type { BinaryInput, ValidationIssue, ValidationLevel, ValidationReport } from './types.js'; @@ -20,15 +22,6 @@ export async function validate(input: BinaryInput, opts: ValidateOptions = {}): const issues: ValidationIssue[] = []; const bytes = await toUint8Array(input); - if (looksEncrypted(bytes)) { - issues.push({ - code: 'encrypted-document', - level: 'error', - message: 'Trailer declares /Encrypt; encryption is forbidden in cv 0.x (spec §3.4)', - }); - return { ok: false, level, issues }; - } - let pdfDoc; try { pdfDoc = await loadDocument(bytes); @@ -37,6 +30,17 @@ export async function validate(input: BinaryInput, opts: ValidateOptions = {}): return { ok: false, level, issues }; } + // An /Encrypt trailer entry is authoritative; encrypted files carry encrypted + // streams that cannot be meaningfully inspected, so reject immediately. + if (pdfDoc.context.trailerInfo.Encrypt) { + issues.push({ + code: 'encrypted-document', + level: 'error', + message: 'Document declares an /Encrypt dictionary; encryption is forbidden in cv 0.x (spec §3.4)', + }); + return { ok: false, level, issues }; + } + for (const issue of scanForbiddenConstructs(pdfDoc)) { issues.push(issue); } @@ -53,6 +57,9 @@ export async function validate(input: BinaryInput, opts: ValidateOptions = {}): return { ok: false, level, issues }; } + const newerVersionIssue = checkVersion(meta.version); + if (newerVersionIssue) issues.push(newerVersionIssue); + const payloads = readAssociatedFiles(pdfDoc); if (payloads.length === 0) { issues.push({ code: 'no-payloads', level: 'error', message: 'No /AF Associated Files present' }); @@ -67,6 +74,14 @@ export async function validate(input: BinaryInput, opts: ValidateOptions = {}): payload: payload.name, }); } + if (!isPortableName(payload.name)) { + issues.push({ + code: 'filename-not-portable', + level: 'error', + message: `Payload name "${payload.name}" is not POSIX-portable (spec §4.4)`, + payload: payload.name, + }); + } } if (!payloads.some((p) => p.name === meta.primaryPayload)) { @@ -120,14 +135,34 @@ export async function validate(input: BinaryInput, opts: ValidateOptions = {}): return { ok, level, issues }; } +function isPortableName(name: string): boolean { + if (!PORTABLE_NAME_RE.test(name)) return false; + return name.split('/').every((segment) => segment !== '.' && segment !== '..'); +} + +/** + * The highest cv MAJOR version this SDK fully understands. The 0.x pre-stable + * series and the 1.x stable series are normatively identical (spec §12), so the + * SDK knows both; a MAJOR of 2 or greater is "newer". + */ +const KNOWN_MAJOR = 1; + /** - * Byte-level pre-check for an /Encrypt trailer entry. pdf-lib refuses to - * parse encrypted PDFs at load time, so without this the validator would - * surface a generic parse failure instead of the documented spec-§3.4 code. + * Emit a "newer-format-version" warning when the file's cv:version MAJOR exceeds + * what this SDK knows (spec §8.3). Both "0.1" and "1.0" are known; only a MAJOR + * of 2 or greater warns. Extraction is never blocked: this is a warning only. */ -function looksEncrypted(bytes: Uint8Array): boolean { - // Search the last 4 KiB where the trailer lives. - const tail = bytes.subarray(Math.max(0, bytes.length - 4096)); - const text = new TextDecoder('latin1').decode(tail); - return /\/Encrypt\b/.test(text); +function checkVersion(version: string): ValidationIssue | null { + const major = parseMajor(version); + if (major === null || major <= KNOWN_MAJOR) return null; + return { + code: 'newer-format-version', + level: 'warning', + message: `cv:version "${version}" has a newer MAJOR than this SDK knows (${CV_SPEC_VERSION}); rendering may be incomplete (spec §8.3)`, + }; +} + +function parseMajor(version: string): number | null { + const major = Number.parseInt(version.split('.')[0] ?? '', 10); + return Number.isNaN(major) ? null : major; } diff --git a/packages/sdk-js/tests/fixes.test.ts b/packages/sdk-js/tests/fixes.test.ts new file mode 100644 index 0000000..67c2551 --- /dev/null +++ b/packages/sdk-js/tests/fixes.test.ts @@ -0,0 +1,256 @@ +import { encode as cborEncode } from 'cbor-x'; +import { PDFDocument, PDFName, PDFNumber, PDFString, StandardFonts } from 'pdf-lib'; +import * as pako from 'pako'; +import { describe, expect, it } from 'vitest'; +import { decodeEmbeddings, encodeEmbeddings, pack, validate, type EmbeddingsPayload } from '../src/index.js'; + +async function blankPdf(text = 'Sample CV'): Promise { + const pdf = await PDFDocument.create(); + const page = pdf.addPage([300, 400]); + const font = await pdf.embedFont(StandardFonts.Helvetica); + page.drawText(text, { x: 30, y: 350, size: 18, font }); + return pdf.save(); +} + +const sampleEmbeddings: EmbeddingsPayload = { + formatVersion: 1, + spaces: [ + { + model: 'BAAI/bge-m3', + modelRevision: 'rev1', + dimension: 4, + metric: 'cosine', + normalized: true, + chunking: 'section', + chunks: [{ id: 'a', textOffset: 0, textLength: 10, vector: new Float32Array([0.1, 0.2, 0.3, 0.4]) }], + }, + ], +}; + +// Fix 1 — inline /OpenAction JavaScript action must be rejected. +describe('security: inline OpenAction JavaScript (fix 1)', () => { + it('rejects a catalog /OpenAction stored as a direct (inline) JavaScript action', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + + // Re-open and inject an INLINE OpenAction so it is NOT an indirect object. + const doc = await PDFDocument.load(cv, { updateMetadata: false }); + const openAction = doc.context.obj({ + Type: 'Action', + S: 'JavaScript', + JS: PDFString.of('app.alert("pwned");'), + }); + doc.catalog.set(PDFName.of('OpenAction'), openAction); + const tampered = await doc.save({ useObjectStreams: false }); + + const report = await validate(tampered); + expect(report.ok).toBe(false); + expect(report.issues.map((i) => i.code)).toContain('javascript-action'); + }); + + it('still passes a clean file (no false positives)', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + const report = await validate(cv); + expect(report.ok).toBe(true); + }); +}); + +// Fix 2 — embedded-file /Params must carry an MD5 /CheckSum. +describe('pack: embedded-file /CheckSum (fix 2)', () => { + it('writes a /CheckSum entry into each embedded-file /Params', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + const text = new TextDecoder('latin1').decode(cv); + expect(text).toMatch(/\/CheckSum/); + }); +}); + +// Fix 3 — non-portable payload names rejected on write and flagged on read. +describe('filename portability (fix 3)', () => { + it('rejects path-traversal payload names at pack time', async () => { + await expect( + pack({ + pdf: await blankPdf(), + payloads: [{ data: 'x', name: '../../etc/passwd', mimeType: 'text/plain', relationship: 'Supplement' }], + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }), + ).rejects.toThrow(/portable|path segment/i); + }); + + it('rejects non-portable charset payload names at pack time', async () => { + await expect( + pack({ + pdf: await blankPdf(), + payloads: [{ data: 'x', name: 'résumé .md', mimeType: 'text/plain', relationship: 'Supplement' }], + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }), + ).rejects.toThrow(/portable/i); + }); + + it('flags a non-portable name on the read side', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + // Inject a filespec with a non-portable name directly into /AF. + const doc = await PDFDocument.load(cv, { updateMetadata: false }); + const stream = doc.context.flateStream(new TextEncoder().encode('bad'), { + Type: 'EmbeddedFile', + Subtype: 'text/plain', + }); + const streamRef = doc.context.register(stream); + const filespec = doc.context.obj({ + Type: 'Filespec', + F: PDFString.of('../evil.txt'), + UF: PDFString.of('../evil.txt'), + EF: { F: streamRef }, + AFRelationship: 'Supplement', + }); + const afRef = doc.context.register(filespec); + const af = doc.catalog.lookup(PDFName.of('AF')); + (af as { push(x: unknown): void }).push(afRef); + const tampered = await doc.save({ useObjectStreams: false }); + + const report = await validate(tampered); + expect(report.issues.map((i) => i.code)).toContain('filename-not-portable'); + expect(report.ok).toBe(false); + }); +}); + +// Fix 4 — decode-side validation mirrors encode-side guarantees. +describe('embeddings: decode validation (fix 4)', () => { + it('round-trips a valid space', () => { + const decoded = decodeEmbeddings(encodeEmbeddings(sampleEmbeddings)); + expect(decoded.spaces[0]!.metric).toBe('cosine'); + }); + + it('rejects a malformed space with an invalid metric from attacker CBOR', () => { + // Hand-build CBOR that bypasses the encode-side validation. + const malformed = cborEncode({ + 'format-version': 1, + spaces: [ + { + model: 'evil', + 'model-revision': 'r', + dimension: 4, + metric: 'totally-not-a-metric', + normalized: true, + chunking: 'section', + chunks: [{ id: 'a', 'text-offset': 0, 'text-length': 1, vector: new Uint8Array(16) }], + }, + ], + }); + expect(() => decodeEmbeddings(malformed)).toThrow(/metric/i); + }); + + it('rejects a malformed space with an invalid chunking', () => { + const malformed = cborEncode({ + 'format-version': 1, + spaces: [ + { + model: 'evil', + 'model-revision': 'r', + dimension: 4, + metric: 'cosine', + normalized: true, + chunking: 'invalid-chunking', + chunks: [{ id: 'a', 'text-offset': 0, 'text-length': 1, vector: new Uint8Array(16) }], + }, + ], + }); + expect(() => decodeEmbeddings(malformed)).toThrow(/chunking/i); + }); +}); + +// Fix 5 — FlateDecode /Predictor must be rejected, not silently mis-decoded. +describe('pdf: FlateDecode /Predictor rejection (fix 5)', () => { + it('rejects an embedded-file stream with /DecodeParms /Predictor 12', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + const doc = await PDFDocument.load(cv, { updateMetadata: false }); + + // Build a flate stream and slap a predictor on it, then add to /AF. + const compressed = pako.deflate(new TextEncoder().encode('predicted')); + const stream = doc.context.stream(compressed, { + Type: 'EmbeddedFile', + Subtype: 'text/plain', + Filter: 'FlateDecode', + DecodeParms: doc.context.obj({ Predictor: PDFNumber.of(12), Columns: PDFNumber.of(4) }), + }); + const streamRef = doc.context.register(stream); + const filespec = doc.context.obj({ + Type: 'Filespec', + F: PDFString.of('predicted.txt'), + UF: PDFString.of('predicted.txt'), + EF: { F: streamRef }, + AFRelationship: 'Supplement', + }); + const afRef = doc.context.register(filespec); + const af = doc.catalog.lookup(PDFName.of('AF')); + (af as { push(x: unknown): void }).push(afRef); + const tampered = await doc.save({ useObjectStreams: false }); + + await expect(validate(tampered)).rejects.toThrow(/Predictor/i); + }); +}); + +// Fix 6 — newer MAJOR cv:version surfaces a warning but does not block. +describe('validate: newer-format-version warning (fix 6)', () => { + it('warns (but stays ok) when cv:version MAJOR is 2', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + const text = new TextDecoder('latin1').decode(cv); + expect(text).toContain('0.1'); + + // Rewrite the cv:version to a future major. The XMP stream is uncompressed + // metadata so we can patch the bytes in place (same byte length). + const patched = patchBytes(cv, '0.1', '2.0'); + const report = await validate(patched); + expect(report.issues.map((i) => i.code)).toContain('newer-format-version'); + const versionIssue = report.issues.find((i) => i.code === 'newer-format-version'); + expect(versionIssue?.level).toBe('warning'); + // A newer-version warning alone must not block (extraction still works). + expect(report.ok).toBe(true); + }); + + it('does not warn for known versions 0.1 and 1.0', async () => { + const cv = await pack({ + pdf: await blankPdf(), + markdown: '# Hi\n', + metadata: { primaryLanguage: 'en' }, + }); + const v10 = patchBytes(cv, '0.1', '1.0'); + const report = await validate(v10); + expect(report.issues.map((i) => i.code)).not.toContain('newer-format-version'); + }); +}); + +function patchBytes(bytes: Uint8Array, from: string, to: string): Uint8Array { + if (from.length !== to.length) throw new Error('patchBytes requires equal lengths'); + const text = new TextDecoder('latin1').decode(bytes); + const idx = text.indexOf(from); + if (idx < 0) throw new Error(`pattern not found: ${from}`); + const out = new Uint8Array(bytes); + const enc = new TextEncoder().encode(to); + out.set(enc, idx); + return out; +} From 6a9e62549a52cab56d196a2dd52ba081a0f3613d Mon Sep 17 00:00:00 2001 From: Ilan Date: Thu, 28 May 2026 23:36:14 +0300 Subject: [PATCH 3/8] fix(viewer-web,embed): UTF-8 byte offsets, portable worker, lazy pdf.js, light-DOM crawler text embed-js: * chunk.ts now computes text-offset/text-length as UTF-8 byte offsets (encode once, track a byte cursor, decode byte slices) per spec 5.1, instead of UTF-16 code-unit offsets that disagreed with the Go/Python SDKs on any non-ASCII resume. New multibyte test covers accents, CJK, emoji. viewer-web: * Replaced the Vite-only `?url` pdf.js worker import (which broke every non-Vite consumer) with a portable new URL(..., import.meta.url) plus a configurable worker-src; verified the compiled dist no longer contains ?url. * Enabled tsup code splitting and removed the static render-pdf import so pdf.js is now a lazily loaded chunk (entry dropped from ~17KB pulling pdf.js to a small shell). * The extracted clean text is now projected into the light DOM (a visually-hidden child + cleanText getter) so crawlers can index it, instead of being buried in the shadow root. * Language-aware payload selection (pickPayloadByLanguage), mirroring the SDK. * Hardened src fetch: credentials omit, redirect follow, timeout, size cap. * Demo: sample.cv moved under public/ so it ships (was a 404), fetch guarded on res.ok, native file input replaced with a hidden-input + styled-button. * render-markdown: dropped the misleading partial DOMPurify blocklist (relies on the safe html profile) and adds rel="noopener noreferrer" to links. * Added vitest + happy-dom test setup for viewer-web (new payload-selection and light-DOM tests); lockfile updated for the test devDeps. --- packages/embed-js/src/chunk.ts | 104 +++-- packages/embed-js/tests/chunk.test.ts | 42 +- packages/viewer-web/demo/index.html | 24 +- packages/viewer-web/demo/main.ts | 3 + .../dist-demo/assets/index-CGHtsZ-C.js | 431 ++++++++++++++++++ .../dist-demo/assets/index-L4DwlYUA.js | 294 ------------ ...pdf-Ns90kvla.js => render-pdf-Dr51mmem.js} | 10 +- packages/viewer-web/dist-demo/index.html | 31 +- packages/viewer-web/dist-demo/sample.cv | Bin 0 -> 7638 bytes packages/viewer-web/package.json | 11 +- packages/viewer-web/src/cv-embed.ts | 91 +++- packages/viewer-web/src/payload-selection.ts | 29 ++ packages/viewer-web/src/render-markdown.ts | 24 +- packages/viewer-web/src/render-pdf.ts | 18 +- .../tests/clean-text-light-dom.test.ts | 77 ++++ .../tests/payload-selection.test.ts | 80 ++++ packages/viewer-web/tsup.config.ts | 6 +- packages/viewer-web/vitest.config.ts | 8 + pnpm-lock.yaml | 99 +++- 19 files changed, 1032 insertions(+), 350 deletions(-) create mode 100644 packages/viewer-web/dist-demo/assets/index-CGHtsZ-C.js delete mode 100644 packages/viewer-web/dist-demo/assets/index-L4DwlYUA.js rename packages/viewer-web/dist-demo/assets/{render-pdf-Ns90kvla.js => render-pdf-Dr51mmem.js} (94%) create mode 100644 packages/viewer-web/dist-demo/sample.cv create mode 100644 packages/viewer-web/src/payload-selection.ts create mode 100644 packages/viewer-web/tests/clean-text-light-dom.test.ts create mode 100644 packages/viewer-web/tests/payload-selection.test.ts create mode 100644 packages/viewer-web/vitest.config.ts diff --git a/packages/embed-js/src/chunk.ts b/packages/embed-js/src/chunk.ts index e2855e3..abf9c32 100644 --- a/packages/embed-js/src/chunk.ts +++ b/packages/embed-js/src/chunk.ts @@ -5,8 +5,18 @@ * carries the byte offset and length into the original UTF-8 source so a * downstream consumer can map a vector hit back to the exact substring * without re-tokenising. Pre-heading content becomes a "preamble" chunk. + * + * Per spec §5.1, `textOffset`/`textLength` are UTF-8 *byte* offsets into the + * markdown source. We encode the document once with `TextEncoder`, track a + * byte cursor while iterating lines (counting the trailing `\n` byte), and + * derive each chunk's `text` by decoding the corresponding byte slice. This + * keeps the offsets in agreement with the Go and Python SDKs for any + * non-ASCII résumé. */ +const encoder = new TextEncoder(); +const decoder = new TextDecoder(); + export type ChunkingMode = 'document' | 'section' | 'paragraph'; export interface MarkdownChunk { @@ -22,26 +32,38 @@ export interface ChunkOptions { const HEADING = /^(#{1,6})\s+(.+?)\s*$/; +/** A source line plus its UTF-8 byte offset and byte length (including any trailing `\n`). */ +interface ByteLine { + text: string; + offset: number; + byteLength: number; +} + export function chunkMarkdown(markdown: string, opts: ChunkOptions = {}): MarkdownChunk[] { const mode = opts.mode ?? 'section'; + const bytes = encoder.encode(markdown); if (mode === 'document') { - return [{ id: 'document', textOffset: 0, textLength: markdown.length, text: markdown }]; + return [documentChunk(bytes)]; } if (mode === 'paragraph') { - return paragraphChunks(markdown); + return paragraphChunks(bytes); } - return sectionChunks(markdown); + return sectionChunks(bytes); +} + +function documentChunk(bytes: Uint8Array): MarkdownChunk { + return { id: 'document', textOffset: 0, textLength: bytes.byteLength, text: sliceText(bytes, 0, bytes.byteLength) }; } -function sectionChunks(markdown: string): MarkdownChunk[] { - const lines = splitWithOffsets(markdown); +function sectionChunks(bytes: Uint8Array): MarkdownChunk[] { + const lines = splitWithByteOffsets(bytes); const sections: MarkdownChunk[] = []; let current: { id: string; start: number; end: number } | null = null; const ids = new Set(); function flush(end: number): void { if (!current) return; - const text = markdown.slice(current.start, end); + const text = sliceText(bytes, current.start, end); if (text.trim().length === 0) { current = null; return; @@ -52,63 +74,93 @@ function sectionChunks(markdown: string): MarkdownChunk[] { for (const line of lines) { const match = HEADING.exec(line.text); + const lineEnd = line.offset + line.byteLength; if (match) { flush(line.offset); const id = uniqueId(slugify(match[2] ?? `section-${sections.length + 1}`), ids); ids.add(id); - current = { id, start: line.offset, end: line.offset + line.text.length }; + current = { id, start: line.offset, end: lineEnd }; continue; } if (current === null) { const id = uniqueId('preamble', ids); ids.add(id); - current = { id, start: line.offset, end: line.offset + line.text.length }; + current = { id, start: line.offset, end: lineEnd }; } else { - current.end = line.offset + line.text.length; + current.end = lineEnd; } } - flush(markdown.length); + flush(bytes.byteLength); if (sections.length === 0) { - return [{ id: 'document', textOffset: 0, textLength: markdown.length, text: markdown }]; + return [documentChunk(bytes)]; } return sections; } -function paragraphChunks(markdown: string): MarkdownChunk[] { +function paragraphChunks(bytes: Uint8Array): MarkdownChunk[] { const out: MarkdownChunk[] = []; const ids = new Set(); + const separator = encoder.encode('\n\n'); let cursor = 0; let i = 0; - while (cursor < markdown.length) { - let end = markdown.indexOf('\n\n', cursor); - if (end === -1) end = markdown.length; - const text = markdown.slice(cursor, end); + while (cursor < bytes.byteLength) { + let end = indexOfBytes(bytes, separator, cursor); + if (end === -1) end = bytes.byteLength; + const text = sliceText(bytes, cursor, end); if (text.trim().length > 0) { const id = uniqueId(slugify(text.split('\n')[0] ?? `p-${i}`), ids); ids.add(id); - out.push({ id, textOffset: cursor, textLength: text.length, text }); + out.push({ id, textOffset: cursor, textLength: end - cursor, text }); i += 1; } - cursor = end + 2; + cursor = end + separator.byteLength; } if (out.length === 0) { - return [{ id: 'document', textOffset: 0, textLength: markdown.length, text: markdown }]; + return [documentChunk(bytes)]; } return out; } -function splitWithOffsets(s: string): { text: string; offset: number }[] { - const lines: { text: string; offset: number }[] = []; - let offset = 0; - for (const line of s.split('\n')) { - const withNl = line + (offset + line.length < s.length ? '\n' : ''); - lines.push({ text: withNl, offset }); - offset += withNl.length; +/** Decode the UTF-8 byte slice `[start, end)` back into a string. */ +function sliceText(bytes: Uint8Array, start: number, end: number): string { + return decoder.decode(bytes.subarray(start, end)); +} + +/** Split UTF-8 bytes into lines, each tagged with its byte offset and byte length (newline included). */ +function splitWithByteOffsets(bytes: Uint8Array): ByteLine[] { + const newline = 0x0a; // '\n' + const lines: ByteLine[] = []; + let start = 0; + for (let i = 0; i < bytes.byteLength; i += 1) { + if (bytes[i] === newline) { + const byteLength = i - start + 1; + lines.push({ text: sliceText(bytes, start, i + 1), offset: start, byteLength }); + start = i + 1; + } + } + if (start < bytes.byteLength) { + lines.push({ text: sliceText(bytes, start, bytes.byteLength), offset: start, byteLength: bytes.byteLength - start }); } return lines; } +/** Find the byte index of `needle` in `haystack` at or after `from`, or -1. */ +function indexOfBytes(haystack: Uint8Array, needle: Uint8Array, from: number): number { + const last = haystack.byteLength - needle.byteLength; + for (let i = from; i <= last; i += 1) { + let matched = true; + for (let j = 0; j < needle.byteLength; j += 1) { + if (haystack[i + j] !== needle[j]) { + matched = false; + break; + } + } + if (matched) return i; + } + return -1; +} + function slugify(s: string): string { return ( s diff --git a/packages/embed-js/tests/chunk.test.ts b/packages/embed-js/tests/chunk.test.ts index 63c24ea..99fd07e 100644 --- a/packages/embed-js/tests/chunk.test.ts +++ b/packages/embed-js/tests/chunk.test.ts @@ -1,6 +1,13 @@ import { describe, expect, it } from 'vitest'; import { chunkMarkdown } from '../src/chunk.js'; +const encoder = new TextEncoder(); +const decoder = new TextDecoder(); + +function byteSlice(source: string, offset: number, length: number): string { + return decoder.decode(encoder.encode(source).subarray(offset, offset + length)); +} + const sample = `# Jane Doe Senior engineer. @@ -24,8 +31,39 @@ describe('chunkMarkdown', () => { it('preserves byte offsets that index back into the source', () => { const chunks = chunkMarkdown(sample); for (const c of chunks) { - const slice = sample.slice(c.textOffset, c.textOffset + c.textLength); - expect(slice).toBe(c.text); + expect(byteSlice(sample, c.textOffset, c.textLength)).toBe(c.text); + } + }); + + it('emits UTF-8 byte offsets that slice back correctly for multibyte content', () => { + const multibyte = `# Résumé de Zoé 🚀 + +Ingénieure logicielle. 日本語 も少し. + +## Expérience + +- Société Générale, 2020 à 2024 — café ☕ inclus + +## Compétences + +TypeScript, Go, Python. Naïve façade. +`; + const chunks = chunkMarkdown(multibyte); + expect(chunks.length).toBeGreaterThan(1); + const totalBytes = new TextEncoder().encode(multibyte).byteLength; + for (const c of chunks) { + // Offsets address bytes, not UTF-16 code units. + expect(c.textOffset).toBeGreaterThanOrEqual(0); + expect(c.textOffset + c.textLength).toBeLessThanOrEqual(totalBytes); + expect(byteSlice(multibyte, c.textOffset, c.textLength)).toBe(c.text); + } + // Heading slug retains ASCII slugification of the multibyte title. + expect(chunks[0]?.id).toBe('r-sum-de-zo'); + // First chunk's byte length exceeds its UTF-16 length because of the emoji + accents. + const first = chunks[0]; + expect(first).toBeDefined(); + if (first) { + expect(first.textLength).toBeGreaterThan(first.text.length); } }); diff --git a/packages/viewer-web/demo/index.html b/packages/viewer-web/demo/index.html index a06a49b..4da9b88 100644 --- a/packages/viewer-web/demo/index.html +++ b/packages/viewer-web/demo/index.html @@ -49,6 +49,19 @@ cursor: pointer; font-weight: 600; } + .drop button:focus-visible { outline: 2px solid var(--fg); outline-offset: 2px; } + /* Visually hidden but focusable: the styled button proxies clicks/keys to it. */ + .visually-hidden { + position: absolute; + width: 1px; + height: 1px; + margin: -1px; + padding: 0; + overflow: hidden; + clip: rect(0 0 0 0); + clip-path: inset(50%); + border: 0; + } .viewer { margin-top: 2rem; } footer { max-width: 920px; @@ -72,8 +85,15 @@

.cv viewer

Drop a .cv file here
or pick one from disk
- - + +
@@ -82,10 +82,13 @@ const jsonLd = graph([

- Want to add BGE-M3 semantic embeddings? Install the CLI - (brew install cvfile/tap/cv) and run - cv pack --embed-with bge-m3. The model is ~285 MB so it - runs once locally on your machine, not on every visitor's browser. + Want to add BGE-M3 semantic embeddings? Generate them with the Python + package (pip install "cvfile[embed]"): + run embed(markdown) then pack with + pack(..., embeddings=encode_embeddings(payload)). The model is + ~285 MB so it runs once locally on your machine, not on every visitor's + browser. The cv CLI is reader-only (extract, inspect, validate, + search) and does not generate embeddings.

@@ -65,7 +81,6 @@ const jsonLd = graph([ await viewer.loadFromBytes(buf); } - dz.addEventListener('click', () => picker.click()); picker.addEventListener('change', () => { const file = picker.files?.[0]; if (file) void handle(file); diff --git a/spec/cv-1.0.md b/spec/cv-1.0.md index d1898c5..dd4b4a2 100644 --- a/spec/cv-1.0.md +++ b/spec/cv-1.0.md @@ -191,9 +191,11 @@ prefix: cv | --- | --- | --- | | `cv:modified` | xs:dateTime | Last modification | | `cv:generator` | text | Producer string, e.g. `"@cvfile/sdk/0.1.0"` | -| `cv:alternates` | rdf:Bag of struct | One entry per alternate payload: `{ payload, language, mimeType }` | -| `cv:integrity` | rdf:Bag of struct | One entry per payload: `{ payload, algorithm, digest }` | -| `cv:embeddings` | rdf:Bag of struct | One entry per embedding space: `{ model, dimension, metric, chunks }` | +| `cv:alternates` | Text (JSON-encoded array) | One entry per alternate payload: `{ payload, language, mimeType }` | +| `cv:integrity` | Text (JSON-encoded array) | One entry per payload: `{ payload, algorithm, digest }` | +| `cv:embeddings` | Text (JSON-encoded array) | One entry per embedding space: `{ model, dimension, metric, chunks }` | + +These three properties are simple XMP `Text` values whose content is a JSON-encoded array of objects (UTF-8, XML-escaped), declared with `pdfaProperty:valueType="Text"` in the PDF/A extension schema. They are not `rdf:Bag` containers. Consumers parse the element text as JSON. ### 6.4 Integrity diff --git a/tools/cv-detector/go/detector.go b/tools/cv-detector/go/detector.go index 794014c..9b74cbc 100644 --- a/tools/cv-detector/go/detector.go +++ b/tools/cv-detector/go/detector.go @@ -168,13 +168,25 @@ func Unwrap(pdfBytes []byte, payloadName string) (*UnwrappedPayload, error) { return nil, nil } +// innerTag reads a cv XMP field. RDF allows two equivalent serialisations: +// the element form 1.0 and the attribute form +// cv:version="1.0". We try the element form first, then fall back to the +// attribute form so both shapes are detected identically. func innerTag(data []byte, tag string) string { - pat := regexp.MustCompile(`<` + regexp.QuoteMeta(tag) + `>([^<]*)`) - m := pat.FindSubmatch(data) - if m == nil { - return "" + q := regexp.QuoteMeta(tag) + elem := regexp.MustCompile(`<` + q + `>([^<]*)`) + if m := elem.FindSubmatch(data); m != nil { + return string(bytes.TrimSpace(m[1])) + } + attr := regexp.MustCompile(q + `\s*=\s*"([^"]*)"|` + q + `\s*=\s*'([^']*)'`) + if m := attr.FindSubmatch(data); m != nil { + val := m[1] + if len(val) == 0 { + val = m[2] + } + return string(bytes.TrimSpace(val)) } - return string(bytes.TrimSpace(m[1])) + return "" } func stringEntry(d pdfTypes.Dict, key string) string { diff --git a/tools/cv-detector/go/detector_test.go b/tools/cv-detector/go/detector_test.go index eeec12d..e739450 100644 --- a/tools/cv-detector/go/detector_test.go +++ b/tools/cv-detector/go/detector_test.go @@ -52,6 +52,40 @@ func TestDetectRejectsGarbage(t *testing.T) { } } +func TestDetectAttributeFormXMP(t *testing.T) { + // RDF attribute-form serialisation: fields are attributes on the + // rdf:Description element rather than child elements. + xmp := `%PDF-1.7 + + + + + + + +%%EOF` + det := Detect([]byte(xmp)) + if !det.IsCvFile { + t.Fatal("expected IsCvFile=true for attribute-form XMP") + } + if det.Version != "1.0" { + t.Errorf("Version = %q, want 1.0", det.Version) + } + if det.PrimaryPayload != "resume.md" { + t.Errorf("PrimaryPayload = %q, want resume.md", det.PrimaryPayload) + } + if det.PrimaryLanguage != "en" { + t.Errorf("PrimaryLanguage = %q, want en", det.PrimaryLanguage) + } + if det.Generator != "cvfile.org/create" { + t.Errorf("Generator = %q, want cvfile.org/create", det.Generator) + } +} + func TestUnwrapReturnsPrimaryMarkdown(t *testing.T) { data, err := os.ReadFile(fixturePath(t)) if err != nil { diff --git a/tools/cv-detector/python/src/cvfile_cv_detector/__init__.py b/tools/cv-detector/python/src/cvfile_cv_detector/__init__.py index 261d1ad..d6a6d80 100644 --- a/tools/cv-detector/python/src/cvfile_cv_detector/__init__.py +++ b/tools/cv-detector/python/src/cvfile_cv_detector/__init__.py @@ -131,16 +131,32 @@ def unwrap(pdf_bytes: bytes, payload_name: str | None = None) -> UnwrappedPayloa return None -_TAG_RE: dict[str, re.Pattern[str]] = {} +_TAG_RE: dict[str, tuple[re.Pattern[str], re.Pattern[str]]] = {} def _inner(text: str, tag: str) -> str | None: - pat = _TAG_RE.get(tag) - if pat is None: - pat = re.compile(rf"<{re.escape(tag)}>([^<]*)") - _TAG_RE[tag] = pat - m = pat.search(text) - return m.group(1).strip() if m else None + """Read a cv XMP field. + + RDF allows two equivalent serialisations: the element form + ``1.0`` and the attribute form + ``cv:version="1.0"``. Try the element form first, then fall back to the + attribute form so both shapes are detected identically. + """ + pats = _TAG_RE.get(tag) + if pats is None: + q = re.escape(tag) + elem = re.compile(rf"<{q}>([^<]*)") + attr = re.compile(rf"""{q}\s*=\s*"([^"]*)"|{q}\s*=\s*'([^']*)'""") + pats = (elem, attr) + _TAG_RE[tag] = pats + elem, attr = pats + m = elem.search(text) + if m: + return m.group(1).strip() + m = attr.search(text) + if m: + return (m.group(1) or m.group(2) or "").strip() + return None def _name_to_mime(name: str) -> str: diff --git a/tools/cv-detector/python/tests/test_detector.py b/tools/cv-detector/python/tests/test_detector.py index c884de4..7a5c509 100644 --- a/tools/cv-detector/python/tests/test_detector.py +++ b/tools/cv-detector/python/tests/test_detector.py @@ -37,6 +37,30 @@ def test_detect_rejects_garbage() -> None: assert det.is_cv_file is False +def test_detect_attribute_form_xmp() -> None: + # RDF attribute-form serialisation: cv fields are attributes on the + # rdf:Description element rather than child elements. + xmp = ( + b"%PDF-1.7\n" + b'\n' + b'\n' + b'\n' + b'\n' + b"\n\n" + b'\n%%EOF' + ) + det = detect(xmp) + assert det.is_cv_file is True + assert det.version == "1.0" + assert det.primary_payload == "resume.md" + assert det.primary_language == "en" + assert det.generator == "cvfile.org/create" + + def test_unwrap_returns_primary_markdown(cv_bytes: bytes) -> None: payload = unwrap(cv_bytes) assert payload is not None diff --git a/tools/cv-detector/typescript/src/index.ts b/tools/cv-detector/typescript/src/index.ts index 8510678..c999ea0 100644 --- a/tools/cv-detector/typescript/src/index.ts +++ b/tools/cv-detector/typescript/src/index.ts @@ -135,10 +135,19 @@ function bytesToLatin1(bytes: Uint8Array): string { return out; } +// Reads a cv XMP field. RDF allows two equivalent serialisations: the element +// form 1.0 and the attribute form cv:version="1.0". +// Try the element form first, then fall back to the attribute form so both +// shapes are detected identically. function innerTag(text: string, tag: string): string | undefined { - const re = new RegExp(`<${escapeRegex(tag)}>([^<]*)`); - const m = re.exec(text); - return m ? m[1]!.trim() : undefined; + const q = escapeRegex(tag); + const elem = new RegExp(`<${q}>([^<]*)`); + const em = elem.exec(text); + if (em) return em[1]!.trim(); + const attr = new RegExp(`${q}\\s*=\\s*"([^"]*)"|${q}\\s*=\\s*'([^']*)'`); + const am = attr.exec(text); + if (am) return (am[1] ?? am[2] ?? '').trim(); + return undefined; } function escapeRegex(s: string): string { diff --git a/tools/cv-detector/typescript/tests/detector.test.ts b/tools/cv-detector/typescript/tests/detector.test.ts index c5b4bb2..f08d6d4 100644 --- a/tools/cv-detector/typescript/tests/detector.test.ts +++ b/tools/cv-detector/typescript/tests/detector.test.ts @@ -37,6 +37,34 @@ describe('detect', () => { it('rejects garbage', () => { expect(detect(new TextEncoder().encode('hello world')).isCvFile).toBe(false); }); + + it('detects RDF attribute-form XMP', () => { + // Fields serialised as attributes on rdf:Description rather than as + // child elements. + const xmp = new TextEncoder().encode( + [ + '%PDF-1.7', + '', + '', + '', + '', + '', + '', + '', + '%%EOF', + ].join('\n'), + ); + const det = detect(xmp); + expect(det.isCvFile).toBe(true); + expect(det.version).toBe('1.0'); + expect(det.primaryPayload).toBe('resume.md'); + expect(det.primaryLanguage).toBe('en'); + expect(det.generator).toBe('cvfile.org/create'); + }); }); describe('unwrap', () => { diff --git a/tools/verapdf-runner/run.sh b/tools/verapdf-runner/run.sh index 6450b30..7318174 100755 --- a/tools/verapdf-runner/run.sh +++ b/tools/verapdf-runner/run.sh @@ -10,6 +10,16 @@ FILE="${1:-}" FLAVOUR="${2:-3u}" FORMAT="${VERAPDF_FORMAT:-text}" +# Pin a specific tagged image for reproducible runs (override via VERAPDF_IMAGE). +VERAPDF_IMAGE="${VERAPDF_IMAGE:-verapdf/cli:v1.28.2}" + +# Default to the host architecture (fast, native). Set VERAPDF_PLATFORM to +# force one, e.g. VERAPDF_PLATFORM=linux/amd64 on an arm host if needed. +PLATFORM_ARG=() +if [[ -n "${VERAPDF_PLATFORM:-}" ]]; then + PLATFORM_ARG=(--platform "$VERAPDF_PLATFORM") +fi + if [[ -z "$FILE" ]]; then echo "Usage: $0 [3u|3a|3b]" >&2 exit 64 @@ -24,9 +34,9 @@ ABS_FILE="$(cd "$(dirname "$FILE")" && pwd)/$(basename "$FILE")" DIR="$(dirname "$ABS_FILE")" NAME="$(basename "$ABS_FILE")" -docker run --rm --platform linux/amd64 \ +docker run --rm "${PLATFORM_ARG[@]}" \ -v "$DIR:/data" \ - verapdf/cli:latest \ + "$VERAPDF_IMAGE" \ --flavour "$FLAVOUR" \ --format "$FORMAT" \ --nonpdfext \ From 1b388ae1c97137e0cb75c09afcfd491990209673 Mon Sep 17 00:00:00 2001 From: Ilan Date: Thu, 28 May 2026 23:37:35 +0300 Subject: [PATCH 7/8] fix(integrations): load embeddings and per-chunk vectors, widen pin, regenerate fixtures * The standalone langchain, llama-index, and haystack packages only emitted text payloads and silently dropped embeddings.cbor, so RAG users got no precomputed vectors (the format's main selling point). They now expose a chunks mode that loads per-chunk vectors, delegating to the SDK's resolve_embedding_chunks (one source of truth) with UTF-8 byte-offset slicing. * Widened the cvfile dependency pin from <1 to <2 so a future 1.0 SDK installs. * Dropped the per-alternate primary-language fallback that mislabeled alternates lacking an explicit language. * Regenerated the shared python-produced.cv fixture so it contains an embeddings.cbor payload (the old one had none, so no test exercised the vector path) and added a non-ASCII unicode.cv fixture; interop.test.ts asserts the embeddings summary surfaces. --- integrations/cvfile-haystack/pyproject.toml | 2 +- .../components/converters/cvfile/converter.py | 77 +++++++++++++- .../cvfile-haystack/tests/test_converter.py | 31 ++++++ integrations/langchain-cvfile/pyproject.toml | 2 +- .../src/langchain_cvfile/loader.py | 80 ++++++++++++-- .../langchain-cvfile/tests/test_loader.py | 34 ++++++ .../llama-index-readers-cvfile/pyproject.toml | 2 +- .../src/llama_index/readers/cvfile/base.py | 82 ++++++++++++++- .../tests/test_reader.py | 31 ++++++ .../tests/fixtures/build_unicode_fixture.py | 99 ++++++++++++++++++ integrations/tests/fixtures/unicode.cv | Bin 0 -> 9756 bytes .../sdk-js/tests/fixtures/python-produced.cv | Bin 8835 -> 10256 bytes packages/sdk-js/tests/interop.test.ts | 12 ++- 13 files changed, 428 insertions(+), 24 deletions(-) create mode 100644 integrations/tests/fixtures/build_unicode_fixture.py create mode 100644 integrations/tests/fixtures/unicode.cv diff --git a/integrations/cvfile-haystack/pyproject.toml b/integrations/cvfile-haystack/pyproject.toml index 2a72390..39f3e1f 100644 --- a/integrations/cvfile-haystack/pyproject.toml +++ b/integrations/cvfile-haystack/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ "Programming Language :: Python :: 3.13", ] dependencies = [ - "cvfile>=0.1.0,<1", + "cvfile>=0.1,<2", "haystack-ai>=2.8,<3", ] diff --git a/integrations/cvfile-haystack/src/haystack_integrations/components/converters/cvfile/converter.py b/integrations/cvfile-haystack/src/haystack_integrations/components/converters/cvfile/converter.py index 1150f4b..2c201d5 100644 --- a/integrations/cvfile-haystack/src/haystack_integrations/components/converters/cvfile/converter.py +++ b/integrations/cvfile-haystack/src/haystack_integrations/components/converters/cvfile/converter.py @@ -28,13 +28,41 @@ def _payload_meta(payload: ExtractedPayload, file: CvFile) -> dict[str, Any]: "mime_type": payload.mime_type, "payload": payload.name, "relationship": payload.relationship, - "language": payload.language or file.metadata.primary_language, + "language": payload.language, "primary": payload.name == file.metadata.primary_payload, "cv_version": file.metadata.version, "cv_generator": file.metadata.generator, } +def _resolve_chunks(file: CvFile) -> list: + """Decode the file's embeddings.cbor into text-resolved chunks. + + Delegates to the core SDK so chunk text slicing uses UTF-8 byte offsets + (spec §5.1) and stays the single source of truth. Returns an empty list + when the embed extra is not installed or the file carries no embeddings. + """ + try: + from cvfile.embed import resolve_embedding_chunks + except ImportError: + return [] + return resolve_embedding_chunks(file) + + +def _chunk_meta(chunk: Any, file: CvFile) -> dict[str, Any]: + return { + "language": file.metadata.primary_language, + "cv_version": file.metadata.version, + "cv_generator": file.metadata.generator, + "chunk_id": chunk.id, + "chunk_offset": chunk.text_offset, + "chunk_length": chunk.text_length, + "embedding_model": chunk.model, + "embedding_dimension": chunk.dimension, + "embedding_metric": chunk.metric, + } + + @component class CVFileToDocument: """Convert ``.cv`` files into Haystack ``Document`` objects. @@ -48,18 +76,32 @@ class CVFileToDocument: Set ``primary_only=True`` to emit only the payload marked as ``primaryPayload`` in the file's XMP metadata (usually the canonical Markdown copy), and skip all alternates. + + Set ``mode="chunks"`` to emit one ``Document`` per pre-computed embedding + chunk instead of one per payload. Each chunk ``Document`` carries its vector + on ``Document.embedding`` and its text is sliced from the markdown using + UTF-8 byte offsets. Files without an embeddings payload fall back to a single + Markdown ``Document``. In ``mode="chunks"`` the ``primary_only`` flag is + ignored (chunks already index a single text payload). """ - def __init__(self, primary_only: bool = False) -> None: + def __init__(self, primary_only: bool = False, *, mode: str = "payloads") -> None: """Create a CVFileToDocument component. :param primary_only: If ``True``, emit only the payload marked as ``primaryPayload`` in the file's XMP metadata. If ``False`` (default), emit one ``Document`` per textual payload (the primary plus any - language alternates and supplements). + language alternates and supplements). Ignored in ``mode="chunks"``. + :param mode: + ``"payloads"`` (default) emits one ``Document`` per textual payload. + ``"chunks"`` emits one ``Document`` per pre-computed embedding chunk + with its vector attached. """ + if mode not in ("payloads", "chunks"): + raise ValueError("mode must be 'payloads' or 'chunks'") self.primary_only = primary_only + self.mode = mode @component.output_types(documents=list[Document]) def run( @@ -105,6 +147,10 @@ def run( stream_meta = bytestream.meta or {} source_label = stream_meta.get("file_path") or stream_meta.get("file_name") or str(source) + if self.mode == "chunks": + documents.extend(self._chunk_documents(file, stream_meta, source_meta, source_label)) + continue + for payload in file.payloads: if not _is_text_payload(payload): continue @@ -115,3 +161,28 @@ def run( documents.append(Document(content=payload.text(), meta=merged)) return {"documents": documents} + + @staticmethod + def _chunk_documents( + file: CvFile, + stream_meta: dict[str, Any], + source_meta: dict[str, Any], + source_label: str, + ) -> list[Document]: + chunks = _resolve_chunks(file) + if not chunks: + primary = next( + (p for p in file.payloads if p.name == file.metadata.primary_payload and _is_text_payload(p)), + None, + ) + if primary is None: + return [] + payload_meta = _payload_meta(primary, file) + merged = {**stream_meta, **payload_meta, **source_meta, "source": source_label} + return [Document(content=primary.text(), meta=merged)] + + out: list[Document] = [] + for chunk in chunks: + merged = {**stream_meta, **_chunk_meta(chunk, file), **source_meta, "source": source_label} + out.append(Document(content=chunk.text, meta=merged, embedding=list(chunk.vector))) + return out diff --git a/integrations/cvfile-haystack/tests/test_converter.py b/integrations/cvfile-haystack/tests/test_converter.py index 797dc73..3c8af06 100644 --- a/integrations/cvfile-haystack/tests/test_converter.py +++ b/integrations/cvfile-haystack/tests/test_converter.py @@ -11,6 +11,7 @@ from haystack_integrations.components.converters.cvfile import CVFileToDocument FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv" +UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv" @pytest.fixture(scope="module") @@ -75,3 +76,33 @@ def test_unreadable_source_is_skipped(tmp_path: Path) -> None: not_a_cv.write_bytes(b"not a real cv file") result = converter.run(sources=[not_a_cv]) assert result["documents"] == [] + + +def test_chunks_mode_attaches_a_vector_per_chunk() -> None: + if not FIXTURE.exists(): + pytest.skip(f"fixture not found: {FIXTURE}") + docs = CVFileToDocument(mode="chunks").run(sources=[FIXTURE])["documents"] + assert len(docs) >= 1 + for doc in docs: + assert doc.embedding is not None + assert len(doc.embedding) == doc.meta["embedding_dimension"] + assert all(isinstance(v, float) for v in doc.embedding) + assert doc.content.strip(), "chunk text should not be empty" + + +def test_invalid_mode_rejected() -> None: + with pytest.raises(ValueError): + CVFileToDocument(mode="bogus") + + +def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None: + if not UNICODE_FIXTURE.exists(): + pytest.skip(f"fixture not found: {UNICODE_FIXTURE}") + docs = CVFileToDocument(mode="chunks").run(sources=[UNICODE_FIXTURE])["documents"] + joined = "".join(d.content for d in docs) + assert "Élodie" in joined + assert "工程師" in joined + assert "🚀" in joined + assert "经验" in joined + for doc in docs: + assert doc.content == doc.content.encode("utf-8").decode("utf-8") diff --git a/integrations/langchain-cvfile/pyproject.toml b/integrations/langchain-cvfile/pyproject.toml index 1b7ebd8..7842cfd 100644 --- a/integrations/langchain-cvfile/pyproject.toml +++ b/integrations/langchain-cvfile/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ "Programming Language :: Python :: 3.13", ] dependencies = [ - "cvfile>=0.1.0,<1", + "cvfile>=0.1,<2", "langchain-core>=0.3,<1", ] diff --git a/integrations/langchain-cvfile/src/langchain_cvfile/loader.py b/integrations/langchain-cvfile/src/langchain_cvfile/loader.py index ee27ed6..c048e55 100644 --- a/integrations/langchain-cvfile/src/langchain_cvfile/loader.py +++ b/integrations/langchain-cvfile/src/langchain_cvfile/loader.py @@ -29,7 +29,7 @@ def _payload_to_document(payload: ExtractedPayload, file: CvFile, source: str) - "mime_type": payload.mime_type, "payload": payload.name, "relationship": payload.relationship, - "language": payload.language or file.metadata.primary_language, + "language": payload.language, "primary": payload.name == file.metadata.primary_payload, "cv_version": file.metadata.version, "cv_generator": file.metadata.generator, @@ -37,28 +37,86 @@ def _payload_to_document(payload: ExtractedPayload, file: CvFile, source: str) - ) +def _resolve_chunks(file: CvFile) -> list: + """Decode the file's embeddings.cbor into text-resolved chunks. + + Delegates to the core SDK so chunk text slicing uses UTF-8 byte offsets + (spec §5.1) and stays the single source of truth. Returns an empty list + when the embed extra is not installed or the file carries no embeddings. + """ + try: + from cvfile.embed import resolve_embedding_chunks + except ImportError: + return [] + return resolve_embedding_chunks(file) + + class CVFileLoader(BaseLoader): - """Load a ``.cv`` file and emit one ``Document`` per embedded text payload. + """Load a ``.cv`` file and emit ``Document`` objects. A ``.cv`` file is a PDF/A-3u with Markdown, HTML, and optional JSON - payloads attached via PDF Associated Files. This loader returns one - ``Document`` per textual payload (the visual PDF layer is intentionally - skipped: the embedded Markdown is a cleaner text representation of the - same content, which is the whole point of the format). - - The payload marked as ``primaryPayload`` in the file's XMP metadata is - flagged in ``metadata["primary"] = True`` so downstream code can keep - just the canonical text and drop alternates if needed. + payloads attached via PDF Associated Files. The visual PDF layer is + intentionally skipped: the embedded Markdown is a cleaner text + representation of the same content, which is the whole point of the format. + + Two modes are supported: + + - ``mode="payloads"`` (default): one ``Document`` per textual payload. The + payload marked as ``primaryPayload`` in the file's XMP metadata is flagged + in ``metadata["primary"] = True`` so downstream code can keep just the + canonical text and drop alternates if needed. + - ``mode="chunks"``: one ``Document`` per pre-computed embedding chunk, with + the chunk's vector attached as ``metadata["embedding"]`` and the chunk + text sliced from the markdown using UTF-8 byte offsets. Falls back to a + single Markdown ``Document`` when the file carries no embeddings. """ - def __init__(self, file_path: Union[str, Path]) -> None: + def __init__(self, file_path: Union[str, Path], *, mode: str = "payloads") -> None: + if mode not in ("payloads", "chunks"): + raise ValueError("mode must be 'payloads' or 'chunks'") self.file_path = Path(file_path) + self.mode = mode def lazy_load(self) -> Iterator[Document]: data = self.file_path.read_bytes() file = extract(data) source = str(self.file_path) + + if self.mode == "chunks": + yield from self._lazy_load_chunks(file, source) + return + for payload in file.payloads: if not _is_text_payload(payload): continue yield _payload_to_document(payload, file, source) + + def _lazy_load_chunks(self, file: CvFile, source: str) -> Iterator[Document]: + chunks = _resolve_chunks(file) + if not chunks: + # No precomputed embeddings: fall back to the primary text payload. + primary = next( + (p for p in file.payloads if p.name == file.metadata.primary_payload and _is_text_payload(p)), + None, + ) + if primary is not None: + yield _payload_to_document(primary, file, source) + return + + for chunk in chunks: + yield Document( + page_content=chunk.text, + metadata={ + "source": source, + "language": file.metadata.primary_language, + "cv_version": file.metadata.version, + "cv_generator": file.metadata.generator, + "chunk_id": chunk.id, + "chunk_offset": chunk.text_offset, + "chunk_length": chunk.text_length, + "embedding": list(chunk.vector), + "embedding_model": chunk.model, + "embedding_dimension": chunk.dimension, + "embedding_metric": chunk.metric, + }, + ) diff --git a/integrations/langchain-cvfile/tests/test_loader.py b/integrations/langchain-cvfile/tests/test_loader.py index f2c99b6..fb76457 100644 --- a/integrations/langchain-cvfile/tests/test_loader.py +++ b/integrations/langchain-cvfile/tests/test_loader.py @@ -10,6 +10,7 @@ from langchain_cvfile import CVFileLoader FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv" +UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv" @pytest.fixture(scope="module") @@ -46,3 +47,36 @@ def test_lazy_load_is_streaming(loader: CVFileLoader) -> None: it = loader.lazy_load() first = next(it) assert isinstance(first, Document) + + +def test_chunks_mode_attaches_a_vector_per_chunk() -> None: + if not FIXTURE.exists(): + pytest.skip(f"fixture not found: {FIXTURE}") + docs = CVFileLoader(FIXTURE, mode="chunks").load() + assert len(docs) >= 1 + for doc in docs: + emb = doc.metadata.get("embedding") + assert isinstance(emb, list) and len(emb) == doc.metadata["embedding_dimension"] + assert all(isinstance(v, float) for v in emb) + assert doc.metadata["embedding_model"] + assert doc.page_content.strip(), "chunk text should not be empty" + + +def test_invalid_mode_rejected() -> None: + with pytest.raises(ValueError): + CVFileLoader(FIXTURE, mode="bogus") + + +def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None: + if not UNICODE_FIXTURE.exists(): + pytest.skip(f"fixture not found: {UNICODE_FIXTURE}") + docs = CVFileLoader(UNICODE_FIXTURE, mode="chunks").load() + joined = "".join(d.page_content for d in docs) + # Multibyte characters survive intact: a code-point slice would mojibake these. + assert "Élodie" in joined + assert "工程師" in joined + assert "🚀" in joined + assert "经验" in joined + # Every chunk decodes to valid text (no broken surrogate / partial byte runs). + for doc in docs: + assert doc.page_content == doc.page_content.encode("utf-8").decode("utf-8") diff --git a/integrations/llama-index-readers-cvfile/pyproject.toml b/integrations/llama-index-readers-cvfile/pyproject.toml index 9277c13..2d2fe3f 100644 --- a/integrations/llama-index-readers-cvfile/pyproject.toml +++ b/integrations/llama-index-readers-cvfile/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ "Programming Language :: Python :: 3.13", ] dependencies = [ - "cvfile>=0.1.0,<1", + "cvfile>=0.1,<2", "llama-index-core>=0.11,<0.15", ] diff --git a/integrations/llama-index-readers-cvfile/src/llama_index/readers/cvfile/base.py b/integrations/llama-index-readers-cvfile/src/llama_index/readers/cvfile/base.py index c34d891..b9b31f0 100644 --- a/integrations/llama-index-readers-cvfile/src/llama_index/readers/cvfile/base.py +++ b/integrations/llama-index-readers-cvfile/src/llama_index/readers/cvfile/base.py @@ -32,7 +32,7 @@ def _payload_to_document( "mime_type": payload.mime_type, "payload": payload.name, "relationship": payload.relationship, - "language": payload.language or file.metadata.primary_language, + "language": payload.language, "primary": payload.name == file.metadata.primary_payload, "cv_version": file.metadata.version, "cv_generator": file.metadata.generator, @@ -42,15 +42,41 @@ def _payload_to_document( return Document(text=payload.text(), metadata=metadata) +def _resolve_chunks(file: CvFile) -> list: + """Decode the file's embeddings.cbor into text-resolved chunks. + + Delegates to the core SDK so chunk text slicing uses UTF-8 byte offsets + (spec §5.1) and stays the single source of truth. Returns an empty list + when the embed extra is not installed or the file carries no embeddings. + """ + try: + from cvfile.embed import resolve_embedding_chunks + except ImportError: + return [] + return resolve_embedding_chunks(file) + + class CVFileReader(BaseReader): - """Read a ``.cv`` file and emit one ``Document`` per embedded text payload. + """Read a ``.cv`` file and emit ``Document`` objects. A ``.cv`` file is a PDF/A-3u carrying Markdown, HTML, and optional JSON - payloads via PDF Associated Files. This reader returns one ``Document`` - per textual payload (the visual PDF layer is skipped because the embedded - Markdown is a cleaner text representation of the same content). + payloads via PDF Associated Files. The visual PDF layer is skipped because + the embedded Markdown is a cleaner text representation of the same content. + + Two modes are supported: + + - ``mode="payloads"`` (default): one ``Document`` per textual payload. + - ``mode="chunks"``: one ``Document`` per pre-computed embedding chunk, with + the chunk's vector attached on ``Document.embedding`` and the chunk text + sliced from the markdown using UTF-8 byte offsets. Falls back to a single + Markdown ``Document`` when the file carries no embeddings. """ + def __init__(self, *, mode: str = "payloads") -> None: + if mode not in ("payloads", "chunks"): + raise ValueError("mode must be 'payloads' or 'chunks'") + self.mode = mode + def load_data( self, file: Path, @@ -59,8 +85,54 @@ def load_data( path = Path(file) cv_file = extract(path.read_bytes()) source = str(path) + + if self.mode == "chunks": + return self._load_chunks(cv_file, source, extra_info) + return [ _payload_to_document(payload, cv_file, source, extra_info) for payload in cv_file.payloads if _is_text_payload(payload) ] + + def _load_chunks( + self, + cv_file: CvFile, + source: str, + extra_info: Optional[dict], + ) -> list[Document]: + chunks = _resolve_chunks(cv_file) + if not chunks: + primary = next( + ( + p + for p in cv_file.payloads + if p.name == cv_file.metadata.primary_payload and _is_text_payload(p) + ), + None, + ) + if primary is None: + return [] + return [_payload_to_document(primary, cv_file, source, extra_info)] + + out: list[Document] = [] + for chunk in chunks: + metadata: dict = { + "source": source, + "file_name": Path(source).name, + "language": cv_file.metadata.primary_language, + "cv_version": cv_file.metadata.version, + "cv_generator": cv_file.metadata.generator, + "chunk_id": chunk.id, + "chunk_offset": chunk.text_offset, + "chunk_length": chunk.text_length, + "embedding_model": chunk.model, + "embedding_dimension": chunk.dimension, + "embedding_metric": chunk.metric, + } + if extra_info: + metadata.update(extra_info) + doc = Document(text=chunk.text, metadata=metadata) + doc.embedding = list(chunk.vector) + out.append(doc) + return out diff --git a/integrations/llama-index-readers-cvfile/tests/test_reader.py b/integrations/llama-index-readers-cvfile/tests/test_reader.py index 4d86c25..0aa1492 100644 --- a/integrations/llama-index-readers-cvfile/tests/test_reader.py +++ b/integrations/llama-index-readers-cvfile/tests/test_reader.py @@ -10,6 +10,7 @@ from llama_index.readers.cvfile import CVFileReader FIXTURE = Path(__file__).parents[3] / "packages" / "sdk-js" / "tests" / "fixtures" / "python-produced.cv" +UNICODE_FIXTURE = Path(__file__).parents[2] / "tests" / "fixtures" / "unicode.cv" @pytest.fixture(scope="module") @@ -45,3 +46,33 @@ def test_primary_is_text_content(reader: CVFileReader) -> None: def test_extra_info_is_merged(reader: CVFileReader) -> None: docs = reader.load_data(file=FIXTURE, extra_info={"tenant": "acme"}) assert all(d.metadata.get("tenant") == "acme" for d in docs) + + +def test_chunks_mode_attaches_a_vector_per_chunk() -> None: + if not FIXTURE.exists(): + pytest.skip(f"fixture not found: {FIXTURE}") + docs = CVFileReader(mode="chunks").load_data(file=FIXTURE) + assert len(docs) >= 1 + for doc in docs: + assert doc.embedding is not None + assert len(doc.embedding) == doc.metadata["embedding_dimension"] + assert all(isinstance(v, float) for v in doc.embedding) + assert doc.text.strip(), "chunk text should not be empty" + + +def test_invalid_mode_rejected() -> None: + with pytest.raises(ValueError): + CVFileReader(mode="bogus") + + +def test_non_ascii_chunk_text_slices_on_byte_offsets() -> None: + if not UNICODE_FIXTURE.exists(): + pytest.skip(f"fixture not found: {UNICODE_FIXTURE}") + docs = CVFileReader(mode="chunks").load_data(file=UNICODE_FIXTURE) + joined = "".join(d.text for d in docs) + assert "Élodie" in joined + assert "工程師" in joined + assert "🚀" in joined + assert "经验" in joined + for doc in docs: + assert doc.text == doc.text.encode("utf-8").decode("utf-8") diff --git a/integrations/tests/fixtures/build_unicode_fixture.py b/integrations/tests/fixtures/build_unicode_fixture.py new file mode 100644 index 0000000..e04717a --- /dev/null +++ b/integrations/tests/fixtures/build_unicode_fixture.py @@ -0,0 +1,99 @@ +"""Build a non-ASCII .cv fixture so the byte-offset chunk path is covered. + +The markdown mixes accented Latin, CJK and emoji so that UTF-8 byte offsets +diverge from code-point indices: a chunker that sliced on str indices would +return garbled text here, while byte-offset slicing recovers the exact source. + +Run with the cvfile SDK (and its [embed] extra) on PYTHONPATH: + + python integrations/tests/fixtures/build_unicode_fixture.py +""" + +from __future__ import annotations + +import hashlib +import io +import struct +from pathlib import Path + +import pypdf +from cvfile import extract, pack, validate +from cvfile.embed import EmbedOptions, embed + +_EMBED_DIMENSION = 8 + +# Heading-led sections so the section chunker emits one chunk per section, and +# every section contains multibyte characters before later sections start. +UNICODE_MD = """# Élodie Gauthier · 工程師 🚀 + +Ingénieure logicielle à Montréal, Québec. Café ☕ et résolution de problèmes. + +## Expérience 经验 + +* Conçu des systèmes distribués 分布式系统 à grande échelle +* Mentorat d'équipe 团队 et révisions de code 🔍 + +## Compétences + +* Python, Rust, Go — performance et fiabilité +* Langues : français, English, 中文 🌏 +""" + +UNICODE_HTML = ( + '\n' + 'Élodie Gauthier\n' + "

Élodie Gauthier · 工程師 🚀

Ingénieure logicielle.

" +) + + +class DeterministicBackend: + """Offline, reproducible embedding backend (see build_python_sample.py).""" + + model = "fixture/deterministic-hash" + model_revision = "v1" + metric = "cosine" + normalized = False + + def embed(self, texts: list[str]) -> tuple[list[tuple[float, ...]], int]: + vectors: list[tuple[float, ...]] = [] + for text in texts: + digest = hashlib.sha256(text.encode("utf-8")).digest() + raw = (digest * ((_EMBED_DIMENSION * 4) // len(digest) + 1))[: _EMBED_DIMENSION * 4] + vectors.append(struct.unpack(f"<{_EMBED_DIMENSION}f", raw)) + return vectors, _EMBED_DIMENSION + + +def make_blank_pdf() -> bytes: + writer = pypdf.PdfWriter() + writer.add_blank_page(width=300, height=400) + buf = io.BytesIO() + writer.write(buf) + return buf.getvalue() + + +def main() -> None: + out_dir = Path(__file__).resolve().parent + out_dir.mkdir(parents=True, exist_ok=True) + + embeddings = embed(UNICODE_MD, EmbedOptions(chunking="section", backend=DeterministicBackend())) + + cv = pack( + pdf=make_blank_pdf(), + markdown=UNICODE_MD, + html=UNICODE_HTML, + embeddings=embeddings, + metadata={"primary_language": "fr", "generator": "cvfile-integrations/unicode-fixture"}, + ) + + out_path = out_dir / "unicode.cv" + out_path.write_bytes(cv) + print(f"Wrote {out_path} ({len(cv)} bytes)") + + file = extract(cv) + print(f" payloads: {[p.name for p in file.payloads]}") + report = validate(cv) + print(f" validate: ok={report.ok} issues={len(report.issues)}") + + +if __name__ == "__main__": + main() diff --git a/integrations/tests/fixtures/unicode.cv b/integrations/tests/fixtures/unicode.cv new file mode 100644 index 0000000000000000000000000000000000000000..6f666e1f0ab53b7666542ff599e48f603a72b116 GIT binary patch literal 9756 zcmds7O^h5z6(*ceA98>LAskYSWpH3;`gi(gGCPsIKgkBKceCp#5s44gUDY$~?dhI$ z_w3FXSqh3sKm-Lv2?|8WC?s+=zzL8e3_@ZrkvM?F1rVnY#|jckgpd$Y_<2>;)6+9! zJ9g~I#%u4+^sDz?eO2||tM{sU|N7!`RjujL{x|;e+uutnQc&z3mzqsUK054CBro6$ zhjCky*KwPsNCSoqNj|uY9!4hnj%@x{Rrtp-NnWKHCLn}VrffrMwImu5rj*G}D(5;S zc_Hpc8B!(rivdYNv7YN1i!#crs3dRDH0~!J00YFp4!x5AABqQ{MX3QM_|+8!8So3( z)^S3iCozbzcB7w;Kht zpHKw$5_ka(LyGpkk5=O-OZIkgIEU`<@9nzOtDyz#@9m;Do_`L}3_)WWhkZ5^h)~o^ zVmI7-rAyPAB<vb47(6S2V5GOc8s3*zG=s-f#ewTvnm|X%E^S~@=k)|Fxu$tQ^N@x#cU@EvZ zg2=(;4I1)?v=j7@d@#&t5&>Zw%zSB?IU97u`~hMC=ojCF)#Mnh8H|^9mQL0U3rA5M z?YcM$?|1Ad)Nve449j_UJ0pIZ+tfy+>CMe^5hXwj>Nqe48*dX9nX% z3=Q{sVc>DAhTyy%k(#$%qdbBKQ9J#D=f+84fx2T|7YpS5jxErl2C3_HL&YBKGmhHOwgsd@sPk1|;J9Mzx*q8(VB+tj4|%mj4m)5b~Yzwv+xgNXN-T=+swlUHM-2%qL#0 z-L7aq_|vK1-GA>hi`9R=@SU}{etPW4>nc-woDOU8kH?zmwr!Lqa z{AxX6k-pB*_cKb+nDqj295M%73gFCjgkOno6Mn_hw@I+0)UK53JhXba@EOz8tWw_4 zyn6t~w*?!YOWQ#-UwP;3uPP`Y^OXndtIBF`fp&s>Pb75n#F3-ki7kk!mAfHm4jO}Q zkH^SCH;mH8V7`JOeWVTe&d72Fah7arzT$2Bu<=!ULrC=0N=rhhnGnCRvAB$cFg(py zI$73h$nwt4PHji8#YtOMA+XDeCTm(1II8Iow!T3%O80RcxfV9RNzmhI3Db(*xDNrl z0==*w(E+50m<_4a2M&TR?}fF=Q7Q*S&h>(3A<*dIBx~sqgSotT%oUGVJn*GFduTmS zaF(fDpeZwOC9rH7u4D+rE8zF~NywoSPo^OS@6S?Mt*Isa#A}Ec(<;SNHjJh`Zn`$W zzzzyzQCj0xPy<7k2e=2QloBdnZSZWGvgEkMvgoQ(uWI&DO=+llL$$xul&6?R)6K|` zRR{j?ePl~aWm(>DoE(kgu$3ER5M>mSO=dc&+>Zj5J67}1Tf&hGU7OkRw#$DifgfB-sNg6`^^yQer6JFVp~!6c z3CA=|o|Ra~!!U-FN+i6sE-A~68cSkr8c|r4IJvxaf-#OIN@{SP4b!ys@RvXPXg|*G z>Io#`p@GC_4vVEhf-sjiu#~V5Bb{XTlS!^F?7pnboaAC#C{Nb}`Q$M^<`Se5P+^8a zHoPpHT{77}4%=}OWS#CLfV6|FTHT!FB>@}`WRr}Ft78jeRkM8ytB^69s^j`TRqJ)v zQe155mf{eidIs@{X6ueeiKRfB?bcmPY@zMs9Qr_We6{E`!}cA|bqvF>h-wl=#ij`b zvr1gWbsSAGTn{^rV(FxA`GDGTJrnD`YhY7@r0ufMk2mWY+wCXk^lC?->(ucar>L>8YS6P?r`GS3m8oBNE;5}_L!2V5@CQ?o&kAF%A?YJWg|a(D{>nqpZ6ff)`&2i;;;(jfsK_`o+80c zKD#-I^yWfRxw*~ih9e8GEQsV2vN`plQewb){Ab%E^0rX{b3#?ldQnxQ$t_P6p8z&b zoAl9X`jt*(X~?71<`de=H*W!UvS|YBg%oV#u+P`8brfuKO2xSGef2;cO<{W&wwQ&( z(NH+0s%U>-O=O&AmzPB(nJ6DC99A8?Z*8Ntx^iS~T|V=H5}P*?7WP>N%xi+&D9+X+6bkU(?liutOu1btnRdg~LlY&E3H%y6||f?;4D< z%BdN=c^4tXX@)K`>g&4{Q(5~}uE>oTqz)~tR}BkgxQ$|;`-g|47>Ya`!X*JzesIGy z%J8R#e?u%yUKmY5yHa1d5m$!Yg0OMSt_!Q0P+p`>4gC79Nl{==c&@ypez!Cv1b7}w5Iux^Gm7~R~#wW>IxU^k}xSTjyibl|?twIfxy#DuDiAeP!4^AiOA0@oV<2kSDfsc*uCSsL95R(b7;;5Hr3CqK+}5k<;O z%_!zJuZkgd@>;%JRm>JQf>J8rg^s1sImBeeFSr*=6&REnf(Umga2X>Pw=wWg+~>eO z@qA^6-PX+97!RN-mlULGl2ZI2vkrHd$VUZDg%<|c4~hz!s=#IQG|hzj-YFWKBF@m* z8$Q!~jsbUCld`I+!h1whw7LmzQ%un;eMX;V%*twYyCj<>7}Ae~zdFU=x`8>6H{uvx zY!cUOD{!5I*sDfx<+id2FDNy2-OzPgH$k@v|7=|`OnAvag$rL3nlwW*SQ}8_S%H6b j!vMMkGz-3|t!ts;n7Fco`zXG4j^>!s{{2g9%hLY<;z9+X literal 0 HcmV?d00001 diff --git a/packages/sdk-js/tests/fixtures/python-produced.cv b/packages/sdk-js/tests/fixtures/python-produced.cv index 24f056625064afb0eeea166383db8453605703f2..d8500148950e09288cdaf7ae81c145d962e3800c 100644 GIT binary patch delta 1805 zcmbtV&5ImG6wl;iHJ!MN1ldK_r2NQ|n9NRncl9_7fz9qocbCLKAX%lms(Poh+udt- z&#)*mL=T>ng+@?w2!0Utrp!OEC>j(Get{Y1I)#4*VsHQimW-pB8~ zs(SCu%kx{yTT2fh3?MiIR;$2i z!sAxuZ1B7Wh+(TT*U)n9Y%>Hk>{HuU!qeJUo@9_IA#J3wEuhU^>ef47mOi;#mz)f{ z^d%OWb4EDRj*p>&W(QJGsD$<;V)f4Up6_n&a_LUHr1r2!=@1HSqM3($-+_)48cA$B zo>ZQq+<5$a-*_pnmnQS&edCX`A~h>pvpV#&QbDwuo{(OgSd}Mrw3;;lfh^xB&D{1H z+II@jfDA_nJ!A{wyFMgBF)E11J!b9zxtw6cMct}*pEOIf8(Qx~ew>7ER&MDe4dSTe zryH)+>E-uz7%RQL*?!OuHnV1;D@tcN34pe#+404ne*56jZw^i-OMm?S^?jG8 zo4Zf^SSmnQqd0TD^)6r^Z_2p|@Yx%HZ~pq>^+VgWBi~om+t>b_@y^d3sr>WB^0f=E z-m|$&y!Gvzr6aX-e1?j>OBTZUA^)S~<4>j{UFTv`!^Bww!3A%w!yH zG&8NhKsEn)?;&{FRI62)UjVh5RapQC6{Xw--64Co*^h(dpN^qD4rQIc$6nig0E?$F ztRhlHBH!ANhS%?YwZAmH&VR5TA6|D39&q;NuZ$m_7(_H%Y$^X}eDM@Bg+^>7xK~{G9;8|=kpHQDZ0!9BP^)q#zy9z;Z9*v~j9_9DOesZ_ z8VVYU5!97%La2GnYZF1Z`56srx)McZ#&uA;8wk+DSSi|Dnz>1~nP}f~Y{splN9UIo GtbYN-#|yFm delta 630 zcmbu6KT88K7{<9;M58?s6&HsLR?($-^Cow>augh@lT@*jaKm9wIgk!#sDG9Y?o!A{ za2NDO!NITK?w9Zj7%R%r8ypPe<$Zp6o;QJv4sP3{jvJB2exl=EtYZrB*bAIlV8Eo} zaBOyFR+oR(7HJ%%HwgvF^R>$B-#!oXsQNbHTPn>He*Q4+I_84eFKI}QTDZH zpOSII730`7~d6Nu4$}#H}E(V04WR$Y>81Q+)r?WUcPcjmQKfl7w z;U*mljtqtH#ydcX0u`dI9x4F>kPCr<)x!GFQjWf@K1zFv$XYb~&j|WGBO { expect(await extractHtml(bytes)).toBe(PY_HTML); }); - it('extracts all three Python-produced payloads', async () => { + it('extracts all Python-produced payloads (incl. precomputed embeddings)', async () => { const bytes = new Uint8Array(await readFile(FIXTURE)); const file = await extract(bytes); const names = file.payloads.map((p) => p.name).sort(); - expect(names).toEqual(['resume.html', 'resume.json', 'resume.md']); + expect(names).toEqual(['embeddings.cbor', 'resume.html', 'resume.json', 'resume.md']); + }); + + it('surfaces the embedding-space summary in metadata', async () => { + const bytes = new Uint8Array(await readFile(FIXTURE)); + const meta = await inspect(bytes); + expect(meta.embeddings.length).toBeGreaterThan(0); + expect(meta.embeddings[0].chunks).toBeGreaterThan(0); + expect(meta.embeddings[0].dimension).toBeGreaterThan(0); }); it('validates the Python-produced file', async () => { From da957bf01459261d3aa6be527aadcad4ecd469b5 Mon Sep 17 00:00:00 2001 From: Ilan Date: Thu, 28 May 2026 23:44:40 +0300 Subject: [PATCH 8/8] fix(python): support Python 3.10 (assert_never is 3.11+) The server handler imported assert_never from typing, which only exists in Python 3.11+, breaking test collection on 3.10 (the package declares requires-python >=3.10). Guard the import behind sys.version_info with a NoReturn fallback for 3.10. pytest/mypy/ruff all clean. --- sdks/python/src/cvfile/server/_handler.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/sdks/python/src/cvfile/server/_handler.py b/sdks/python/src/cvfile/server/_handler.py index e7969dc..fd9d64d 100644 --- a/sdks/python/src/cvfile/server/_handler.py +++ b/sdks/python/src/cvfile/server/_handler.py @@ -8,8 +8,16 @@ from __future__ import annotations import html as _html +import sys from dataclasses import dataclass, field -from typing import TYPE_CHECKING, assert_never +from typing import TYPE_CHECKING, NoReturn + +if sys.version_info >= (3, 11): + from typing import assert_never +else: # assert_never landed in typing only in Python 3.11; we support 3.10+. + + def assert_never(value: object, /) -> NoReturn: + raise AssertionError(f"Unhandled value: {value!r}") from cvfile.extract import extract from cvfile.server._conneg import (