quantumaikr
diff --git a/‎wasm/build.sh‎
Lines changed: 13 additions & 27 deletions b/‎wasm/build.sh‎
Lines changed: 13 additions & 27 deletions
diff --git a/‎wasm/index.html‎
Lines changed: 87 additions & 97 deletions b/‎wasm/index.html‎
Lines changed: 87 additions & 97 deletions
diff --git a/‎wasm/inference-worker.js‎
Lines changed: 67 additions & 0 deletions b/‎wasm/inference-worker.js‎
Lines changed: 67 additions & 0 deletions
diff --git a/‎wasm/quant.js‎
Lines changed: 1 addition & 1 deletion b/‎wasm/quant.js‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎wasm/quant.wasm‎
-73 KB b/‎wasm/quant.wasm‎
-73 KB
@@ -1,54 +1,45 @@
 #!/bin/bash
-# Build quant.cpp WASM demo (multi-threaded + SIMD)
+# Build quant.cpp WASM demo (multi-threaded + SIMD, no ASYNCIFY)
 # Requires: Emscripten SDK (emcc)
 #
-# Usage: cd wasm && bash build.sh
-# Then:  python3 -m http.server 8080
-# Open:  http://localhost:8080
-#
-# Multi-threading requires Cross-Origin-Isolation headers.
-# coi-serviceworker.js injects them on GitHub Pages / static hosts.
+# Architecture: inference runs in a Web Worker (inference-worker.js)
+# so the main thread stays responsive. No ASYNCIFY needed — the worker
+# blocks on quant_generate() while postMessage streams tokens.
 
 set -e
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
 
-echo "=== Building quant.cpp WASM (pthreads + SIMD) ==="
+echo "=== Building quant.cpp WASM (pthreads + SIMD, no ASYNCIFY) ==="
 
-# Check emcc
 if ! command -v emcc &>/dev/null; then
-    echo "Error: emcc not found. Install Emscripten:"
-    echo "  brew install emscripten"
-    echo "  # or: git clone https://github.com/emscripten-core/emsdk && ./emsdk install latest && ./emsdk activate latest"
+    echo "Error: emcc not found. Install Emscripten SDK."
     exit 1
 fi
 
 echo "emcc version: $(emcc --version | head -1)"
 
-# Build with pthreads + SIMD128 + ASYNCIFY
 emcc "$SCRIPT_DIR/quant_wasm.c" \
     -I"$PROJECT_DIR" \
     -o "$SCRIPT_DIR/quant.js" \
     -O3 \
     -msimd128 \
+    -mrelaxed-simd \
     -flto \
     -pthread \
     -s WASM=1 \
-    -s ALLOW_MEMORY_GROWTH=1 \
+    -s INITIAL_MEMORY=1GB \
     -s MAXIMUM_MEMORY=4GB \
-    -s INITIAL_MEMORY=256MB \
-    -s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_generate_async","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
+    -s ALLOW_MEMORY_GROWTH=0 \
+    -s EXPORTED_FUNCTIONS='["_main","_wasm_load_model","_wasm_generate","_wasm_model_info","_wasm_is_ready","_malloc","_free"]' \
     -s EXPORTED_RUNTIME_METHODS='["UTF8ToString","allocateUTF8","FS"]' \
     -s FORCE_FILESYSTEM=1 \
     -s MODULARIZE=0 \
     -s ENVIRONMENT='web,worker' \
     -s NO_EXIT_RUNTIME=1 \
     -s ASSERTIONS=0 \
     -s STACK_SIZE=1MB \
-    -s ASYNCIFY \
-    -s 'ASYNCIFY_IMPORTS=["emscripten_sleep"]' \
-    -s ASYNCIFY_STACK_SIZE=65536 \
     -s PTHREAD_POOL_SIZE=4 \
     -s PTHREAD_POOL_SIZE_STRICT=0 \
     -lm \
@@ -59,14 +50,9 @@ emcc "$SCRIPT_DIR/quant_wasm.c" \
 
 echo ""
 echo "=== Build complete ==="
-echo "Files:"
-for f in quant.js quant.wasm quant.worker.js; do
+for f in quant.js quant.wasm; do
     [ -f "$SCRIPT_DIR/$f" ] && echo "  $f ($(du -h "$SCRIPT_DIR/$f" | cut -f1))"
 done
 echo ""
-echo "To serve locally:"
-echo "  cd $SCRIPT_DIR && python3 -m http.server 8080"
-echo "  Open http://localhost:8080"
-echo ""
-echo "Note: Multi-threading requires Cross-Origin-Isolation."
-echo "coi-serviceworker.js handles this automatically on GitHub Pages."
+echo "  inference-worker.js — Web Worker wrapper (no ASYNCIFY overhead)"
+echo "  coi-serviceworker.js — COOP/COEP header injection for pthreads"
@@ -356,28 +356,10 @@ <h2>LLM in Your Browser</h2>
 }
 
 function loadModelFromBytes(bytes, name) {
-    try {
-        Module.FS.writeFile('/model.gguf', bytes);
-        showLoading('Initializing model...');
-        const rc = Module._wasm_load_model(Module.allocateUTF8('/model.gguf'));
-        if (rc === 0) {
-            modelLoaded = true;
-            const dropzone = document.getElementById('dropzone');
-            dropzone.classList.add('loaded');
-            dropzone.innerHTML = `<h2>✓ ${name} (${(bytes.length/1048576).toFixed(0)} MB)</h2>
-                <p style="color:#6ee7b7">KV compression active — 3x longer context</p>`;
-            document.getElementById('kvBadge').style.display = '';
-            document.getElementById('prompt').disabled = false;
-            document.getElementById('sendBtn').disabled = false;
-            document.getElementById('prompt').focus();
-            addMessage('system', `Model loaded! ${name} (${(bytes.length/1048576).toFixed(0)} MB). Ask anything.`);
-        } else {
-            addMessage('system', 'Failed to load model.');
-        }
-    } catch(e) {
-        addMessage('system', `Error: ${e.message}`);
-    }
-    hideLoading();
+    showLoading('Loading model into WASM...');
+    // slice() copies the view's bytes into a new ArrayBuffer; that copy is then transferred (moved, not cloned) to the worker
+    const buffer = bytes.buffer.slice(bytes.byteOffset, bytes.byteOffset + bytes.byteLength);
+    worker.postMessage({ type: 'load', bytes: buffer, name: name }, [buffer]);
 }
 
 async function loadModel(file) {
@@ -402,8 +384,81 @@ <h2>LLM in Your Browser</h2>
     return `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`;
 }
 
-async function generate() {
-    if (!modelLoaded || generating) return;
+// ---- Web Worker inference engine (no ASYNCIFY overhead) ----
+let worker = null;  // Worker instance; assigned by initWorker()
+let pendingAssistantDiv = null;  // assistant bubble currently being streamed into; null when idle
+let pendingOutput = '';  // text accumulated from 'token' messages for the current reply
+let pendingTokenCount = 0;  // number of 'token' messages received for the current reply
+let pendingStartTime = 0;  // performance.now() captured by generate() for the current request
+
+function initWorker() {
+    worker = new Worker('inference-worker.js');
+    worker.onmessage = function(e) {
+        const msg = e.data;
+
+        if (msg.type === 'ready') {
+            addMessage('system', 'Runtime ready. Choose a model or drop your own GGUF file.');
+        }
+        else if (msg.type === 'status') {
+            if (msg.msg === 'thinking' && pendingAssistantDiv) {
+                pendingAssistantDiv.innerHTML = '<span class="thinking"><span class="spinner" style="display:inline-block;width:12px;height:12px;vertical-align:middle;margin-right:6px"></span>Thinking...</span>';
+                document.getElementById('statTokens').textContent = 'Processing prompt...';
+                document.getElementById('statSpeed').textContent = '';
+            } else {
+                addMessage('system', msg.msg);
+            }
+        }
+        else if (msg.type === 'loaded') {
+            modelLoaded = true;
+            const dropzone = document.getElementById('dropzone');
+            dropzone.classList.add('loaded');
+            dropzone.innerHTML = `<h2>✓ ${msg.name} (${(msg.size/1048576).toFixed(0)} MB)</h2>
+                <p style="color:#6ee7b7">KV compression active — 3x longer context</p>`;
+            document.getElementById('kvBadge').style.display = '';
+            document.getElementById('prompt').disabled = false;
+            document.getElementById('sendBtn').disabled = false;
+            document.getElementById('prompt').focus();
+            hideLoading();
+        }
+        else if (msg.type === 'token' && pendingAssistantDiv) {
+            pendingOutput += msg.text;
+            pendingTokenCount++;
+            pendingAssistantDiv.textContent = pendingOutput;
+            const cursor = document.createElement('span');
+            cursor.className = 'cursor';
+            cursor.textContent = '▌';
+            pendingAssistantDiv.appendChild(cursor);
+            const chat = document.getElementById('chat');
+            chat.scrollTop = chat.scrollHeight;
+            const elapsed = (performance.now() - pendingStartTime) / 1000;
+            if (elapsed > 0.1) {
+                document.getElementById('statTokens').textContent = `${pendingTokenCount} tokens`;
+                document.getElementById('statSpeed').textContent = `${(pendingTokenCount / elapsed).toFixed(1)} tok/s`;
+            }
+        }
+        else if (msg.type === 'done') {
+            if (pendingAssistantDiv) {
+                if (pendingOutput) {
+                    pendingAssistantDiv.innerHTML = formatText(pendingOutput);
+                } else {
+                    pendingAssistantDiv.innerHTML = '<em style="color:#666">No output generated. Try a longer prompt.</em>';
+                }
+                const elapsed = (performance.now() - pendingStartTime) / 1000;
+                const tps = pendingTokenCount > 0 ? (pendingTokenCount / elapsed).toFixed(1) : '0';
+                document.getElementById('statTokens').textContent = `${pendingTokenCount} tokens`;
+                document.getElementById('statSpeed').textContent = `${tps} tok/s`;
+            }
+            generating = false;
+            document.getElementById('sendBtn').disabled = false;
+            document.getElementById('prompt').disabled = false;
+            document.getElementById('prompt').focus();
+            pendingAssistantDiv = null;
+        }
+    };
+}
+
+function generate() {
+    if (!modelLoaded || generating || !worker) return;
     const input = document.getElementById('prompt');
     const text = input.value.trim();
     if (!text) return;
@@ -414,84 +469,19 @@ <h2>LLM in Your Browser</h2>
     input.disabled = true;
 
     addMessage('user', text);
-    const assistantDiv = addMessage('assistant', '');
-    // Show "thinking" indicator during prompt prefill (before first token)
-    assistantDiv.innerHTML = '<span class="thinking"><span class="spinner" style="display:inline-block;width:12px;height:12px;vertical-align:middle;margin-right:6px"></span>Thinking...</span>';
-    let output = '';
-    let tokenCount = 0;
-    const startTime = performance.now();
-    document.getElementById('statTokens').textContent = 'Processing prompt...';
-    document.getElementById('statSpeed').textContent = '';
-
-    // Set streaming token callback
-    Module.onToken = (token) => {
-        output += token;
-        tokenCount++;
-        // Update the assistant message with raw text + blinking cursor
-        assistantDiv.textContent = output;
-        const cursor = document.createElement('span');
-        cursor.className = 'cursor';
-        cursor.textContent = '▌';
-        assistantDiv.appendChild(cursor);
-        // Auto-scroll
-        const chat = document.getElementById('chat');
-        chat.scrollTop = chat.scrollHeight;
-        // Live stats
-        const elapsed = (performance.now() - startTime) / 1000;
-        if (elapsed > 0.1) {
-            document.getElementById('statTokens').textContent = `${tokenCount} tokens`;
-            document.getElementById('statSpeed').textContent = `${(tokenCount / elapsed).toFixed(1)} tok/s`;
-        }
-    };
-
-    Module.onDone = (nTokens, elapsedMs) => {
-        // Final render with markdown formatting
-        assistantDiv.innerHTML = formatText(output);
-        const tps = nTokens > 0 ? (nTokens / (elapsedMs / 1000)).toFixed(1) : '0';
-        document.getElementById('statTokens').textContent = `${nTokens} tokens`;
-        document.getElementById('statSpeed').textContent = `${tps} tok/s`;
-        generating = false;
-        document.getElementById('sendBtn').disabled = false;
-        input.disabled = false;
-        input.focus();
-    };
+    pendingAssistantDiv = addMessage('assistant', '');
+    pendingAssistantDiv.innerHTML = '<span class="thinking"><span class="spinner" style="display:inline-block;width:12px;height:12px;vertical-align:middle;margin-right:6px"></span>Thinking...</span>';
+    pendingOutput = '';
+    pendingTokenCount = 0;
+    pendingStartTime = performance.now();
 
     const chatPrompt = getChatPrompt(text);
-
-    // Use ASYNCIFY: _wasm_generate_async yields to browser between tokens
-    const promptPtr = Module.allocateUTF8(chatPrompt);
-    try {
-        await Module._wasm_generate_async(promptPtr, 0.7, 256);
-    } catch(e) {
-        // Fallback for non-ASYNCIFY builds
-        Module._wasm_generate(promptPtr, 0.7, 256);
-    }
-    Module._free(promptPtr);
-
-    if (!output) {
-        assistantDiv.innerHTML = '<em style="color:#666">No output generated. Try a longer prompt.</em>';
-    }
-    generating = false;
-    document.getElementById('sendBtn').disabled = false;
-    input.disabled = false;
+    worker.postMessage({ type: 'generate', prompt: chatPrompt, temperature: 0.7, maxTokens: 256 });
 }
-</script>
 
-<!-- Emscripten-generated JS will be loaded here -->
-<script>
-var Module = {
-    onToken: null,
-    onDone: null,
-    onStatus: null,
-    print: function(text) { console.log(text); },
-    printErr: function(text) { console.warn(text); },
-    onRuntimeInitialized: function() {
-        console.log('quant.cpp WASM ready');
-        addMessage('system', 'Runtime ready. Choose a model or drop your own GGUF file.');
-    }
-};
+// Initialize worker on page load
+initWorker();
 </script>
-<script src="quant.js"></script>
 
 </body>
 </html>
@@ -0,0 +1,67 @@
+/**
+ * inference-worker.js — Web Worker that runs WASM inference off the main thread.
+ *
+ * Eliminates ASYNCIFY entirely: the worker can block on quant_generate()
+ * while the main thread stays responsive. Tokens stream via postMessage().
+ *
+ * Protocol:
+ *   Main → Worker: { type: 'load', bytes: ArrayBuffer }
+ *   Main → Worker: { type: 'generate', prompt: string, temperature: number, maxTokens: number }
+ *   Worker → Main: { type: 'status', msg: string }
+ *   Worker → Main: { type: 'token', text: string }
+ *   Worker → Main: { type: 'done', nTokens: number, elapsed: number }
+ *   Worker → Main: { type: 'ready' }
+ */
+
+/* Load the Emscripten glue code. Module is configured before loading. */
+var Module = {
+    onToken: null,
+    onDone: null,
+    onStatus: null,
+    print: function(text) { /* suppress stdout in worker */ },
+    printErr: function(text) { /* suppress stderr in worker */ },
+    onRuntimeInitialized: function() {
+        postMessage({ type: 'ready' });
+    }
+};
+
+importScripts('quant.js');
+
+onmessage = function(e) {
+    const msg = e.data;
+
+    if (msg.type === 'load') {
+        try {
+            const bytes = new Uint8Array(msg.bytes);
+            Module.FS.writeFile('/model.gguf', bytes);
+            const rc = Module._wasm_load_model(Module.allocateUTF8('/model.gguf'));
+            if (rc === 0) {
+                const info = Module._wasm_model_info();
+                postMessage({ type: 'status', msg: 'Model loaded! Ready to chat. (' + msg.name + ')' });
+                postMessage({ type: 'loaded', size: bytes.length, name: msg.name });
+            } else {
+                postMessage({ type: 'status', msg: 'Error: failed to load model' });
+            }
+        } catch (err) {
+            postMessage({ type: 'status', msg: 'Error: ' + err.message });
+        }
+        return;
+    }
+
+    if (msg.type === 'generate') {
+        postMessage({ type: 'status', msg: 'thinking' });
+
+        /* Set up per-token callback — posts each token to main thread */
+        Module.onToken = function(text) {
+            postMessage({ type: 'token', text: text });
+        };
+
+        const promptPtr = Module.allocateUTF8(msg.prompt);
+        /* This blocks the worker (not the main thread!) until generation completes */
+        Module._wasm_generate(promptPtr, msg.temperature || 0.7, msg.maxTokens || 256);
+        Module._free(promptPtr);
+
+        postMessage({ type: 'done' });
+        return;
+    }
+};