diff --git a/docs/src/pages/create.astro b/docs/src/pages/create.astro
index 6773eb6..d5262ad 100644
--- a/docs/src/pages/create.astro
+++ b/docs/src/pages/create.astro
@@ -39,11 +39,11 @@ const jsonLd = graph([
1 · PDF
-
- Want to add BGE-M3 semantic embeddings? Install the CLI
- (brew install cvfile/tap/cv) and run
- cv pack --embed-with bge-m3. The model is ~285 MB so it
- runs once locally on your machine, not on every visitor's browser.
+ Want to add BGE-M3 semantic embeddings? Generate them with the Python
+ package (pip install "cvfile[embed]"):
+ run embed(markdown) then pack with
+ pack(..., embeddings=encode_embeddings(payload)). The model is
+ ~285 MB so it runs once locally on your machine, not on every visitor's
+ browser. The cv CLI is reader-only (extract, inspect, validate,
+ search) and does not generate embeddings.
Ingénieure logicielle.
" +) + + +class DeterministicBackend: + """Offline, reproducible embedding backend (see build_python_sample.py).""" + + model = "fixture/deterministic-hash" + model_revision = "v1" + metric = "cosine" + normalized = False + + def embed(self, texts: list[str]) -> tuple[list[tuple[float, ...]], int]: + vectors: list[tuple[float, ...]] = [] + for text in texts: + digest = hashlib.sha256(text.encode("utf-8")).digest() + raw = (digest * ((_EMBED_DIMENSION * 4) // len(digest) + 1))[: _EMBED_DIMENSION * 4] + vectors.append(struct.unpack(f"<{_EMBED_DIMENSION}f", raw)) + return vectors, _EMBED_DIMENSION + + +def make_blank_pdf() -> bytes: + writer = pypdf.PdfWriter() + writer.add_blank_page(width=300, height=400) + buf = io.BytesIO() + writer.write(buf) + return buf.getvalue() + + +def main() -> None: + out_dir = Path(__file__).resolve().parent + out_dir.mkdir(parents=True, exist_ok=True) + + embeddings = embed(UNICODE_MD, EmbedOptions(chunking="section", backend=DeterministicBackend())) + + cv = pack( + pdf=make_blank_pdf(), + markdown=UNICODE_MD, + html=UNICODE_HTML, + embeddings=embeddings, + metadata={"primary_language": "fr", "generator": "cvfile-integrations/unicode-fixture"}, + ) + + out_path = out_dir / "unicode.cv" + out_path.write_bytes(cv) + print(f"Wrote {out_path} ({len(cv)} bytes)") + + file = extract(cv) + print(f" payloads: {[p.name for p in file.payloads]}") + report = validate(cv) + print(f" validate: ok={report.ok} issues={len(report.issues)}") + + +if __name__ == "__main__": + main() diff --git a/integrations/tests/fixtures/unicode.cv b/integrations/tests/fixtures/unicode.cv new file mode 100644 index 0000000..6f666e1 Binary files /dev/null and b/integrations/tests/fixtures/unicode.cv differ diff --git a/packages/embed-js/src/chunk.ts b/packages/embed-js/src/chunk.ts index e2855e3..abf9c32 100644 --- a/packages/embed-js/src/chunk.ts +++ b/packages/embed-js/src/chunk.ts @@ -5,8 +5,18 @@ * carries the byte offset and length into the original UTF-8 source so a * downstream consumer can map a vector hit back to the exact substring * without re-tokenising. Pre-heading content becomes a "preamble" chunk. + * + * Per spec §5.1, `textOffset`/`textLength` are UTF-8 *byte* offsets into the + * markdown source. We encode the document once with `TextEncoder`, track a + * byte cursor while iterating lines (counting the trailing `\n` byte), and + * derive each chunk's `text` by decoding the corresponding byte slice. This + * keeps the offsets in agreement with the Go and Python SDKs for any + * non-ASCII résumé. */ +const encoder = new TextEncoder(); +const decoder = new TextDecoder(); + export type ChunkingMode = 'document' | 'section' | 'paragraph'; export interface MarkdownChunk { @@ -22,26 +32,38 @@ export interface ChunkOptions { const HEADING = /^(#{1,6})\s+(.+?)\s*$/; +/** A source line plus its UTF-8 byte offset and byte length (including any trailing `\n`). */ +interface ByteLine { + text: string; + offset: number; + byteLength: number; +} + export function chunkMarkdown(markdown: string, opts: ChunkOptions = {}): MarkdownChunk[] { const mode = opts.mode ?? 'section'; + const bytes = encoder.encode(markdown); if (mode === 'document') { - return [{ id: 'document', textOffset: 0, textLength: markdown.length, text: markdown }]; + return [documentChunk(bytes)]; } if (mode === 'paragraph') { - return paragraphChunks(markdown); + return paragraphChunks(bytes); } - return sectionChunks(markdown); + return sectionChunks(bytes); +} + +function documentChunk(bytes: Uint8Array): MarkdownChunk { + return { id: 'document', textOffset: 0, textLength: bytes.byteLength, text: sliceText(bytes, 0, bytes.byteLength) }; } -function sectionChunks(markdown: string): MarkdownChunk[] { - const lines = splitWithOffsets(markdown); +function sectionChunks(bytes: Uint8Array): MarkdownChunk[] { + const lines = splitWithByteOffsets(bytes); const sections: MarkdownChunk[] = []; let current: { id: string; start: number; end: number } | null = null; const ids = new Set>>1;o >>24,h>>>=S,p-=S,S=w>>>16&255,S&16){if(C=w&65535,S&=15,p {const c=s.bits;let l=0,f=0,u=0,d=0,h=0,p=0,v=0,b=0,y=0,E=0,w,S,T,C,F,I=null,M;const z=new Uint16Array(Vn+1),O=new Uint16Array(Vn+1);let H=null,x,U,N;for(l=0;l<=Vn;l++)z[l]=0;for(f=0;f nh||r===oh&&y>ih)return 1;T=E&C,i[T]=h<<24|p<<16|F-a|0}}return E!==0&&(i[F+E]=l-v<<24|64<<16|0),s.bits=h,0};var ra=dx;const px=0,op=1,sp=2,{Z_FINISH:sh,Z_BLOCK:vx,Z_TREES:so,Z_OK:mn,Z_STREAM_END:gx,Z_NEED_DICT:yx,Z_STREAM_ERROR:Gt,Z_DATA_ERROR:lp,Z_MEM_ERROR:cp,Z_BUF_ERROR:mx,Z_DEFLATED:lh}=np,ws=16180,ch=16181,fh=16182,uh=16183,hh=16184,dh=16185,ph=16186,vh=16187,gh=16188,yh=16189,Zo=16190,xr=16191,fl=16192,mh=16193,ul=16194,bh=16195,xh=16196,wh=16197,Sh=16198,lo=16199,co=16200,kh=16201,Ah=16202,Th=16203,Fh=16204,Eh=16205,hl=16206,Ch=16207,Rh=16208,ze=16209,fp=16210,up=16211,bx=852,xx=592,wx=15,Sx=wx,Oh=r=>(r>>>24&255)+(r>>>8&65280)+((r&65280)<<8)+((r&255)<<24);function kx(){this.strm=null,this.mode=0,this.last=!1,this.wrap=0,this.havedict=!1,this.flags=0,this.dmax=0,this.check=0,this.total=0,this.head=null,this.wbits=0,this.wsize=0,this.whave=0,this.wnext=0,this.window=null,this.hold=0,this.bits=0,this.length=0,this.offset=0,this.extra=0,this.lencode=null,this.distcode=null,this.lenbits=0,this.distbits=0,this.ncode=0,this.nlen=0,this.ndist=0,this.have=0,this.next=null,this.lens=new Uint16Array(320),this.work=new Uint16Array(288),this.lendyn=null,this.distdyn=null,this.sane=0,this.back=0,this.was=0}const En=r=>{if(!r)return 1;const e=r.state;return!e||e.strm!==r||e.mode>1,l=e[s+1],f=c<<4|l,u=t-l,d=e[s]<>>15-t;n[p]=f,d++}},r.H.l=function(e,t){for(var n=r.H.m.r,i=15-t,a=0;a0){var s=o.lookup(0,ke,ee),c=o.lookupMaybe(1,ke,ee);n.push({value:s,display:c||s})}}return n}return[]},e}(fi),cs=function(r){q(e,r);function e(){return r!==null&&r.apply(this,arguments)||this}return e.fromDict=function(t,n){return new e(t,n)},e.create=function(t){var n=t.obj({FT:"Ch",Ff:Te.Combo,Kids:[]}),i=t.register(n);return new e(n,i)},e}(x0),Uo=function(r){q(e,r);function e(){return r!==null&&r.apply(this,arguments)||this}return e.prototype.addField=function(t){var n=this.normalizedEntries().Kids;n==null||n.push(t)},e.prototype.normalizedEntries=function(){var t=this.Kids();return t||(t=this.dict.context.obj([]),this.dict.set(g.of("Kids"),t)),{Kids:t}},e.fromDict=function(t,n){return new e(t,n)},e.create=function(t){var n=t.obj({}),i=t.register(n);return new e(n,i)},e}(b0),Ac=function(r){q(e,r);function e(){return r!==null&&r.apply(this,arguments)||this}return e.fromDict=function(t,n){return new e(t,n)},e}(fi),fs=function(r){q(e,r);function e(){return r!==null&&r.apply(this,arguments)||this}return e.prototype.MaxLen=function(){var t=this.dict.lookup(g.of("MaxLen"));if(t instanceof ne)return t},e.prototype.Q=function(){var t=this.dict.lookup(g.of("Q"));if(t instanceof ne)return t},e.prototype.setMaxLength=function(t){this.dict.set(g.of("MaxLen"),ne.of(t))},e.prototype.removeMaxLength=function(){this.dict.delete(g.of("MaxLen"))},e.prototype.getMaxLength=function(){var t;return(t=this.MaxLen())===null||t===void 0?void 0:t.asNumber()},e.prototype.setQuadding=function(t){this.dict.set(g.of("Q"),ne.of(t))},e.prototype.getQuadding=function(){var t;return(t=this.Q())===null||t===void 0?void 0:t.asNumber()},e.prototype.setValue=function(t){this.dict.set(g.of("V"),t)},e.prototype.removeValue=function(){this.dict.delete(g.of("V"))},e.prototype.getValue=function(){var t=this.V();if(t instanceof ke||t instanceof ee)return t},e.fromDict=function(t,n){return new e(t,n)},e.create=function(t){var n=t.obj({FT:"Tx",Kids:[]}),i=t.register(n);return new e(n,i)},e}(fi),us=function(r){q(e,r);function e(){return r!==null&&r.apply(this,arguments)||this}return e.fromDict=function(t,n){return new e(t,n)},e.create=function(t){var n=t.obj({FT:"Btn",Ff:It.PushButton,Kids:[]}),i=t.register(n);return new e(n,i)},e}(kc),hs=function(r){q(e,r);function e(){return r!==null&&r.apply(this,arguments)||this}return e.prototype.setValue=function(t){var n=this.getOnValues();if(!n.includes(t)&&t!==g.of("Off"))throw new mc;this.dict.set(g.of("V"),t);for(var i=this.getWidgets(),a=0,o=i.length;at[Bt(this._mapKey.has(i)?this._mapKey.get(i):i)]=n),t}mapDecode(e,t){let n=this.decode(e);if(this._keyMap)switch(n.constructor.name){case"Array":return n.map(i=>this.decodeKeys(i))}return n}decode(e,t){if(X)return Q0(()=>(Zl(),this?this.decode(e,t):ba.prototype.decode.call(Zu,e,t)));fn=t>-1?t:e.length,j=0,ya=0,Go=null,Je=null,X=e;try{ht=e.dataView||(e.dataView=new DataView(e.buffer,e.byteOffset,e.byteLength))}catch(n){throw X=null,e instanceof Uint8Array?n:new Error("Source must be a Uint8Array or Buffer but was a "+(e&&typeof e=="object"?e.constructor.name:typeof e))}if(this instanceof ba){if(ge=this,wt=this.sharedValues&&(this.pack?new Array(this.maxPrivatePackedValues||16).concat(this.sharedValues):this.sharedValues),this.structures)return Le=this.structures,io();(!Le||Le.length>0)&&(Le=[])}else ge=Zu,(!Le||Le.length>0)&&(Le=[]),wt=null;return io()}decodeMultiple(e,t){let n,i=0;try{let a=e.length;ma=!0;let o=this?this.decode(e,a):jc.decode(e,a);if(t){if(t(o)===!1)return;for(;j=Je.postBundlePosition){let e=new Error("Unexpected bundle position");throw e.incomplete=!0,e}j=Je.postBundlePosition,Je=null}if(j==fn)Le=null,X=null,Mt&&(Mt=null);else if(j>fn){let e=new Error("Unexpected end of CBOR data");throw e.incomplete=!0,e}else if(!ma)throw new Error("Data read, but end of buffer not reached");return r}catch(r){throw Zl(),(r instanceof RangeError||r.message.startsWith("Unexpected end of buffer"))&&(r.incomplete=!0),r}}function be(){let r=X[j++],e=r>>5;if(r=r&31,r>23)switch(r){case 24:r=X[j++];break;case 25:if(e==7)return k1();r=ht.getUint16(j),j+=2;break;case 26:if(e==7){let t=ht.getFloat32(j);if(ge.useFloat32>2){let n=Uc[(X[j]&127)<<1|X[j+1]>>7];return j+=4,(n*t+(t>0?.5:-.5)>>0)/n}return j+=4,t}if(r=ht.getUint32(j),j+=4,e===1)return-1-r;break;case 27:if(e==7){let t=ht.getFloat64(j);return j+=8,t}if(e>1){if(ht.getUint32(j)>0)throw new Error("JavaScript does not support arrays, maps, or strings with length over 4294967295");r=ht.getUint32(j+4)}else ge.int64AsNumber?(r=ht.getUint32(j)*4294967296,r+=ht.getUint32(j+4)):r=ht.getBigUint64(j);j+=8;break;case 31:switch(e){case 2:case 3:throw new Error("Indefinite length not supported for byte or text strings");case 4:let t=[],n,i=0;for(;(n=be())!=jn;){if(i>=Ei)throw new Error(`Array length exceeds ${Ei}`);t[i++]=n}return e==4?t:e==3?t.join(""):Buffer.concat(t);case 5:let a;if(ge.mapsAsObjects){let o={},s=0;if(ge.keyMap)for(;(a=be())!=jn;){if(s++>=br)throw new Error(`Property count exceeds ${br}`);o[Bt(ge.decodeKey(a))]=be()}else for(;(a=be())!=jn;){if(s++>=br)throw new Error(`Property count exceeds ${br}`);o[Bt(a)]=be()}return o}else{Vi&&(ge.mapsAsObjects=!0,Vi=!1);let o=new Map;if(ge.keyMap){let s=0;for(;(a=be())!=jn;){if(s++>=br)throw new Error(`Map size exceeds ${br}`);o.set(ge.decodeKey(a),be())}}else{let s=0;for(;(a=be())!=jn;){if(s++>=br)throw new Error(`Map size exceeds ${br}`);o.set(a,be())}}return o}case 7:return jn;default:throw new Error("Invalid major type for indefinite length "+e)}default:throw new Error("Unknown token "+r)}switch(e){case 0:return r;case 1:return~r;case 2:return S1(r);case 3:if(ya>=j)return Go.slice(j-Xo,(j+=r)-Xo);if(ya==0&&fn<140&&r<32){let i=r<16?Y0(r):w1(r);if(i!=null)return i}return x1(r);case 4:if(r>=Ei)throw new Error(`Array length exceeds ${Ei}`);let t=new Array(r);for(let i=0;i>>=S,p-=S),p<15&&(h+=M[n++]<c){e.msg="invalid distance too far back",O.mode=oo;break e}if(h>>>=S,p-=S,S=a-o,C>S){if(S=C-S,S>f&&O.sane){e.msg="invalid distance too far back",O.mode=oo;break e}if(F=0,I=d,u===0){if(F+=l-S,S>3,n-=T,p-=T<<3,h&=(1<