@@ -356,28 +356,10 @@ <h2>LLM in Your Browser</h2>
356356}
357357
/**
 * Send a model's bytes to the inference worker so it can load them.
 *
 * Loading finishes asynchronously: this function only posts the request.
 * The worker's 'loaded' message handler is what hides the loading overlay
 * on success, so every failure path here must hide it explicitly.
 *
 * @param {Uint8Array} bytes - View over the GGUF model data.
 * @param {string} name - Display name of the model, echoed to the worker.
 * @returns {void}
 */
function loadModelFromBytes(bytes, name) {
  if (!worker) {
    // Without a worker nothing would ever answer, so fail loudly instead of
    // leaving a spinner up forever.
    addMessage('system', 'Error: inference worker is not available.');
    hideLoading();
    return;
  }

  showLoading('Loading model into WASM...');
  try {
    // slice() makes a COPY of exactly the bytes covered by the view (the view
    // may be a window into a larger buffer). That copy is then listed as a
    // transferable, so it is moved to the worker rather than cloned a second
    // time, and the caller's own `bytes` stays usable.
    const buffer = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength);
    worker.postMessage({ type: 'load', bytes: buffer, name: name }, [buffer]);
  } catch (e) {
    // e.g. allocation failure for the copy, or a DataCloneError from postMessage.
    addMessage('system', `Error: ${e.message}`);
    hideLoading();
  }
}
382364
383365async function loadModel ( file ) {
@@ -402,8 +384,81 @@ <h2>LLM in Your Browser</h2>
402384 return `<|im_start|>user\n${ text } <|im_end|>\n<|im_start|>assistant\n` ;
403385}
404386
405- async function generate ( ) {
406- if ( ! modelLoaded || generating ) return ;
// ---- Web Worker inference engine (no ASYNCIFY overhead) ----
// State for the reply currently being streamed. generate() sets these before
// posting a 'generate' request; the worker message handlers below update them.
let worker = null;               // Worker created by initWorker()
let pendingAssistantDiv = null;  // chat element receiving the reply; null when idle
let pendingOutput = '';          // raw text accumulated from 'token' messages
let pendingTokenCount = 0;       // number of 'token' messages received so far
let pendingStartTime = 0;        // performance.now() value taken when generation started

/**
 * Mark the model as loaded: swap the dropzone for a summary, reveal the KV
 * badge, unlock the prompt and hide the loading overlay.
 *
 * The name is written with textContent rather than interpolated into
 * innerHTML, so markup characters in it are shown literally instead of being
 * parsed as HTML. (Presumably the name is the one posted with the 'load'
 * request, which may be a user-chosen file name — confirm against the worker.)
 *
 * @param {string} name - Model name reported by the worker.
 * @param {number} size - Model size in bytes reported by the worker.
 */
function showModelLoaded(name, size) {
  modelLoaded = true;

  const dropzone = document.getElementById('dropzone');
  dropzone.classList.add('loaded');

  const heading = document.createElement('h2');
  heading.textContent = `✓ ${name} (${(size / 1048576).toFixed(0)} MB)`;
  const note = document.createElement('p');
  note.style.color = '#6ee7b7';
  note.textContent = 'KV compression active — 3x longer context';

  dropzone.textContent = ''; // drop the previous dropzone contents
  dropzone.appendChild(heading);
  dropzone.appendChild(note);

  document.getElementById('kvBadge').style.display = '';
  const promptInput = document.getElementById('prompt');
  promptInput.disabled = false;
  document.getElementById('sendBtn').disabled = false;
  promptInput.focus();
  hideLoading();
}

/** Re-enable the prompt controls and clear the in-flight reply marker. */
function resetGenerationUi() {
  generating = false;
  document.getElementById('sendBtn').disabled = false;
  const promptInput = document.getElementById('prompt');
  promptInput.disabled = false;
  promptInput.focus();
  pendingAssistantDiv = null;
}

/** Append one streamed token to the pending reply and refresh the live stats. */
function appendToken(text) {
  pendingOutput += text;
  pendingTokenCount++;

  // Raw text while streaming, followed by a cursor glyph; formatting is
  // applied once, when 'done' arrives.
  pendingAssistantDiv.textContent = pendingOutput;
  const cursor = document.createElement('span');
  cursor.className = 'cursor';
  cursor.textContent = '▌';
  pendingAssistantDiv.appendChild(cursor);

  const chat = document.getElementById('chat');
  chat.scrollTop = chat.scrollHeight;

  const elapsed = (performance.now() - pendingStartTime) / 1000;
  if (elapsed > 0.1) {
    document.getElementById('statTokens').textContent = `${pendingTokenCount} tokens`;
    document.getElementById('statSpeed').textContent = `${(pendingTokenCount / elapsed).toFixed(1)} tok/s`;
  }
}

/** Render the final reply (or a placeholder), publish final stats, unlock the UI. */
function finishGeneration() {
  if (pendingAssistantDiv) {
    pendingAssistantDiv.innerHTML = pendingOutput
      ? formatText(pendingOutput)
      : '<em style="color:#666">No output generated. Try a longer prompt.</em>';

    const elapsed = (performance.now() - pendingStartTime) / 1000;
    // Guard elapsed as well as the count so a zero interval cannot yield "Infinity tok/s".
    const tps = pendingTokenCount > 0 && elapsed > 0 ? (pendingTokenCount / elapsed).toFixed(1) : '0';
    document.getElementById('statTokens').textContent = `${pendingTokenCount} tokens`;
    document.getElementById('statSpeed').textContent = `${tps} tok/s`;
  }
  resetGenerationUi();
}

/**
 * Report a worker failure and release whatever UI state was waiting on it.
 *
 * The loading overlay is always hidden. The prompt is only re-enabled when a
 * reply was in flight, so a failure during model load does not unlock a
 * prompt that has no model behind it.
 *
 * @param {string} text - Human-readable failure description.
 */
function onWorkerFailure(text) {
  addMessage('system', `Error: ${text}`);
  hideLoading();
  if (!pendingAssistantDiv) return;

  if (pendingOutput) {
    // Keep what was streamed so far; assigning textContent also removes the cursor.
    pendingAssistantDiv.textContent = pendingOutput;
  } else {
    pendingAssistantDiv.innerHTML = '<em style="color:#666">Generation failed.</em>';
  }
  resetGenerationUi();
}

/**
 * Dispatch one message received from the inference worker.
 *
 * @param {{type: string}} msg - Message payload (the event's `data`).
 */
function onWorkerMessage(msg) {
  switch (msg.type) {
    case 'ready':
      addMessage('system', 'Runtime ready. Choose a model or drop your own GGUF file.');
      break;

    case 'status':
      if (msg.msg === 'thinking' && pendingAssistantDiv) {
        pendingAssistantDiv.innerHTML = '<span class="thinking"><span class="spinner" style="display:inline-block;width:12px;height:12px;vertical-align:middle;margin-right:6px"></span>Thinking...</span>';
        document.getElementById('statTokens').textContent = 'Processing prompt...';
        document.getElementById('statSpeed').textContent = '';
      } else {
        addMessage('system', msg.msg);
      }
      break;

    case 'loaded':
      showModelLoaded(msg.name, msg.size);
      break;

    case 'token':
      // Tokens arriving with no reply in flight are dropped.
      if (pendingAssistantDiv) appendToken(msg.text);
      break;

    case 'done':
      finishGeneration();
      break;

    case 'error':
      // NOTE(review): assumes the worker reports failures as
      // { type: 'error', msg | message } — confirm against inference-worker.js.
      onWorkerFailure(msg.msg || msg.message || 'unknown worker error');
      break;

    default:
      break; // unknown message types are ignored
  }
}

/**
 * Create the inference worker and attach its message and error handlers.
 * Stores the instance in the shared `worker` variable.
 */
function initWorker() {
  worker = new Worker('inference-worker.js');
  worker.onmessage = function (e) {
    onWorkerMessage(e.data);
  };
  // Fires for uncaught errors in the worker, including a script that fails to
  // load; without it the loading overlay / disabled prompt would never recover.
  worker.onerror = function (e) {
    onWorkerFailure((e && e.message) || 'inference worker failed');
  };
}
459+
460+ function generate ( ) {
461+ if ( ! modelLoaded || generating || ! worker ) return ;
407462 const input = document . getElementById ( 'prompt' ) ;
408463 const text = input . value . trim ( ) ;
409464 if ( ! text ) return ;
@@ -414,84 +469,19 @@ <h2>LLM in Your Browser</h2>
414469 input . disabled = true ;
415470
416471 addMessage ( 'user' , text ) ;
417- const assistantDiv = addMessage ( 'assistant' , '' ) ;
418- // Show "thinking" indicator during prompt prefill (before first token)
419- assistantDiv . innerHTML = '<span class="thinking"><span class="spinner" style="display:inline-block;width:12px;height:12px;vertical-align:middle;margin-right:6px"></span>Thinking...</span>' ;
420- let output = '' ;
421- let tokenCount = 0 ;
422- const startTime = performance . now ( ) ;
423- document . getElementById ( 'statTokens' ) . textContent = 'Processing prompt...' ;
424- document . getElementById ( 'statSpeed' ) . textContent = '' ;
425-
426- // Set streaming token callback
427- Module . onToken = ( token ) => {
428- output += token ;
429- tokenCount ++ ;
430- // Update the assistant message with raw text + blinking cursor
431- assistantDiv . textContent = output ;
432- const cursor = document . createElement ( 'span' ) ;
433- cursor . className = 'cursor' ;
434- cursor . textContent = '▌' ;
435- assistantDiv . appendChild ( cursor ) ;
436- // Auto-scroll
437- const chat = document . getElementById ( 'chat' ) ;
438- chat . scrollTop = chat . scrollHeight ;
439- // Live stats
440- const elapsed = ( performance . now ( ) - startTime ) / 1000 ;
441- if ( elapsed > 0.1 ) {
442- document . getElementById ( 'statTokens' ) . textContent = `${ tokenCount } tokens` ;
443- document . getElementById ( 'statSpeed' ) . textContent = `${ ( tokenCount / elapsed ) . toFixed ( 1 ) } tok/s` ;
444- }
445- } ;
446-
447- Module . onDone = ( nTokens , elapsedMs ) => {
448- // Final render with markdown formatting
449- assistantDiv . innerHTML = formatText ( output ) ;
450- const tps = nTokens > 0 ? ( nTokens / ( elapsedMs / 1000 ) ) . toFixed ( 1 ) : '0' ;
451- document . getElementById ( 'statTokens' ) . textContent = `${ nTokens } tokens` ;
452- document . getElementById ( 'statSpeed' ) . textContent = `${ tps } tok/s` ;
453- generating = false ;
454- document . getElementById ( 'sendBtn' ) . disabled = false ;
455- input . disabled = false ;
456- input . focus ( ) ;
457- } ;
472+ pendingAssistantDiv = addMessage ( 'assistant' , '' ) ;
473+ pendingAssistantDiv . innerHTML = '<span class="thinking"><span class="spinner" style="display:inline-block;width:12px;height:12px;vertical-align:middle;margin-right:6px"></span>Thinking...</span>' ;
474+ pendingOutput = '' ;
475+ pendingTokenCount = 0 ;
476+ pendingStartTime = performance . now ( ) ;
458477
459478 const chatPrompt = getChatPrompt ( text ) ;
460-
461- // Use ASYNCIFY: _wasm_generate_async yields to browser between tokens
462- const promptPtr = Module . allocateUTF8 ( chatPrompt ) ;
463- try {
464- await Module . _wasm_generate_async ( promptPtr , 0.7 , 256 ) ;
465- } catch ( e ) {
466- // Fallback for non-ASYNCIFY builds
467- Module . _wasm_generate ( promptPtr , 0.7 , 256 ) ;
468- }
469- Module . _free ( promptPtr ) ;
470-
471- if ( ! output ) {
472- assistantDiv . innerHTML = '<em style="color:#666">No output generated. Try a longer prompt.</em>' ;
473- }
474- generating = false ;
475- document . getElementById ( 'sendBtn' ) . disabled = false ;
476- input . disabled = false ;
479+ worker . postMessage ( { type : 'generate' , prompt : chatPrompt , temperature : 0.7 , maxTokens : 256 } ) ;
477480}
478- </ script >
479481
480- <!-- Emscripten-generated JS will be loaded here -->
481- < script >
482- var Module = {
483- onToken : null ,
484- onDone : null ,
485- onStatus : null ,
486- print : function ( text ) { console . log ( text ) ; } ,
487- printErr : function ( text ) { console . warn ( text ) ; } ,
488- onRuntimeInitialized : function ( ) {
489- console . log ( 'quant.cpp WASM ready' ) ;
490- addMessage ( 'system' , 'Runtime ready. Choose a model or drop your own GGUF file.' ) ;
491- }
492- } ;
482+ // Initialize worker on page load
483+ initWorker ( ) ;
493484</ script >
494- < script src ="quant.js "> </ script >
495485
496486</ body >
497487</ html >
0 commit comments