From 576ae946ec96fae0499bc9cf198a47eb9d6f0dcb Mon Sep 17 00:00:00 2001 From: Chaz Dinkle Date: Wed, 15 Apr 2026 15:04:19 -0400 Subject: [PATCH 1/9] feat: bidirectional voice pipeline + MCP shim + dashboard enhancements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrated from cog-workspace/apps/tts-mcp — consolidating development into the canonical mod3 repo. Voice Pipeline (5 phases): - Three-tier adaptive STT (Whisper Base 31ms + Large 470ms) - Speculative generation (agent thinks while human speaks) - Opacity-as-state rendering (transparent → solidifying → solid) - Barge-in context stitching (state snapshot on interrupt) - Self-barge draft revision (agent revises its own queued output) New files: - draft_queue.py: Thread-safe DraftQueue for speculative generation - mcp_shim.py: Lightweight MCP-to-HTTP proxy (no model loading) Modified: - agent_loop.py: Context stitching, speculative inference, self-barge - channels.py: Three-tier STT scheduler - modules/voice.py: decode_streaming(), Whisper Base loader, TTS validation - dashboard/index.html: Opacity CSS, solidification, partials, queue preview - dashboard/playback.js: Progress tracking for word-level solidification - server.py: Session-aware queue foundations Co-Authored-By: Claude Opus 4.6 (1M context) --- ARCHITECTURE.md | 372 +++++++++++---------- adaptive_player.py | 25 +- agent_loop.py | 305 ++++++++++++++++- channels.py | 185 ++++++++-- dashboard/index.html | 307 ++++++++++++++++- dashboard/playback.js | 77 ++++- dashboard/transport.js | 3 + draft_queue.py | 267 +++++++++++++++ http_api.py | 18 +- mcp.channel.json | 2 +- mcp_shim.py | 742 +++++++++++++++++++++++++++++++++++++++++ modules/voice.py | 192 ++++++++++- providers.py | 29 +- server.py | 202 ++++++++++- 14 files changed, 2449 insertions(+), 277 deletions(-) create mode 100644 draft_queue.py create mode 100644 mcp_shim.py diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 9def800..b037f9f 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,216 +1,218 @@ -# Mod3 Architecture: The Modality Bus +# Mod³ Dashboard — Process Architecture -The modality bus is the sensorimotor boundary between cognitive agents and physical signals. Agents think in cognitive events ("someone spoke", "say this"); the bus translates between those events and raw bytes (audio, text, future: vision, spatial). +## Intended Flow ``` - ModalityBus - ┌──────────────────────────────────────────────┐ - │ │ - │ ┌─────────┐ ┌─────────┐ ┌─────────┐ │ - │ │ Voice │ │ Text │ │ Vision* │ ... │ - │ │ Module │ │ Module │ │ Module │ │ - │ └────┬─────┘ └────┬────┘ └────┬────┘ │ - │ │ │ │ │ - │ ┌────┴─────────────┴────────────┴────┐ │ - │ │ Event Log + Listeners │ │ - │ └────┬─────────────┬────────────┬────┘ │ - │ │ │ │ │ - │ ┌────┴────┐ ┌─────┴─────┐ ┌──┴───┐ │ - │ │ Channel │ │ Channel │ │ ... │ │ - │ │ discord │ │ http-api │ │ │ │ - │ └─────────┘ └───────────┘ └──────┘ │ - └──────────────────────────────────────────────┘ - - * Vision/Spatial are defined in ModalityType but not yet implemented. 
+┌─────────────────────────────────────────────────────────────┐ +│ BROWSER │ +│ │ +│ ┌──────────┐ ┌──────────┐ ┌───────────┐ │ +│ │ Silero │ │ Text │ │ Audio │ │ +│ │ VAD v5 │ │ Input │ │ Playback │ │ +│ │ (ONNX) │ │ │ │ (Web Audio│ │ +│ └────┬─────┘ └────┬─────┘ └─────▲─────┘ │ +│ │ │ │ │ +│ │ onSpeechEnd │ sendControl │ enqueueWav │ +│ │ (Int16 PCM) │ (JSON) │ (base64 WAV) │ +│ ▼ ▼ │ │ +│ ┌────────────────────────────────────┐│ │ +│ │ VoiceTransport (WebSocket) ││ │ +│ │ binary frames ──► ──► JSON ││ │ +│ │ JSON frames ──► ◄── JSON/b64 ││ │ +│ └────────────────┬───────────────────┘│ │ +│ │ │ │ +└───────────────────┼────────────────────┼─────────────────────┘ + │ WebSocket /ws/chat │ + ▼ │ +┌───────────────────┼────────────────────┼─────────────────────┐ +│ │ MOD³ SERVER │ │ +│ │ (single process) │ │ +│ ▼ │ │ +│ ┌─────────────────────────────────────────┐ │ +│ │ BrowserChannel │ │ +│ │ │ │ +│ │ _handle_audio(pcm) → buffer │ │ +│ │ _handle_json(msg) → dispatch │ │ +│ │ _deliver_async() → send to browser │ │ +│ └──────┬─────────┬──────────▲──────────────┘ │ +│ │ │ │ │ +│ PCM audio text msg encoded output │ +│ │ │ │ │ +│ ▼ │ │ │ +│ ┌──────────┐ │ │ │ +│ │ STT │ │ │ │ +│ │ (mlx_ │ │ │ │ +│ │ whisper) │ │ │ │ +│ │ temp WAV │ │ │ │ +│ └────┬─────┘ │ │ │ +│ │ │ │ │ +│ │ transcript│ │ │ +│ ▼ ▼ │ │ +│ ┌─────────────────────┐ │ │ +│ │ CognitiveEvent │ │ │ +│ │ {content: "text"} │ │ │ +│ └──────────┬──────────┘ │ │ +│ │ │ │ +│ ▼ │ │ +│ ┌──────────────────────┐ │ │ +│ │ AgentLoop │ │ │ +│ │ │ │ │ +│ │ conversation[] │ │ │ +│ │ provider.chat() │ │ │ +│ │ → tool_calls │ │ │ +│ │ │ │ │ +│ │ DISPATCH: │ │ │ +│ │ speak(text) │ │ │ +│ │ → send_response_text ──────► channel (text to chat) │ +│ │ → bus.act(VOICE) │ │ │ +│ │ ▼ │ │ │ +│ │ send_text(text) │ │ │ +│ │ → send_response_text ──────► channel (text to chat) │ +│ │ │ │ │ +│ │ think(reasoning) │ │ │ +│ │ → (internal only) │ │ │ +│ └──────────┬────────────┘ │ │ +│ │ │ │ +│ bus.act(VOICE intent) │ │ +│ │ │ │ +│ ▼ │ │ +│ ┌──────────────────────┐ │ │ +│ │ ModalityBus │ │ │ +│ │ │ │ │ +│ │ OutputQueue │ │ │ +│ │ (per-channel FIFO) │ │ │ +│ │ │ │ │ │ +│ │ ▼ │ │ │ +│ │ VoiceEncoder │ │ │ +│ │ (Kokoro TTS) │ │ │ +│ │ → WAV bytes │ │ │ +│ │ │ │ │ │ +│ │ ch.deliver(output) ─────┘ │ +│ │ (base64 JSON) │ │ +│ └──────────────────────┘ │ +│ │ +│ ┌──────────────────────┐ │ +│ │ InferenceProvider │ │ +│ │ (mlx-lm / Ollama) │ │ +│ │ │ │ +│ │ model resident in │ │ +│ │ memory (in-process) │ │ +│ └──────────────────────┘ │ +│ │ +└──────────────────────────────────────────────────────────────┘ ``` -## Core Types (modality.py) - -### Cognitive Primitives - -The agent never touches raw bytes. It sees these: - -```python -@dataclass -class CognitiveEvent: # Input percept - modality: ModalityType # VOICE, TEXT, VISION, SPATIAL - content: str # The meaning (transcribed text, caption, etc.) - source_channel: str # Which channel it arrived on - confidence: float # Decoder certainty (0.0 - 1.0) - timestamp: float - metadata: dict[str, Any] - -@dataclass -class CognitiveIntent: # Output intent (not yet encoded) - modality: ModalityType | None # None = let the bus decide - content: str # What to communicate - target_channel: str # Specific channel, or "" for bus routing - priority: int # Higher = more urgent - metadata: dict[str, Any] # voice, speed, emotion, etc. - -@dataclass -class EncodedOutput: # Raw signal ready for delivery - modality: ModalityType - data: bytes # WAV, PNG, JSON, etc. - format: str # "wav", "png", "text", etc. 
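+
+The browser multiplexes both directions over a single WebSocket: binary
+frames carry raw Int16 PCM from the VAD, text frames carry JSON control
+messages. An illustrative sketch of the server-side dispatch (the real
+loop is `BrowserChannel.run()` in `channels.py`; this sketch assumes the
+Starlette `receive()` message shape):
+
+```python
+import json
+
+async def run(self) -> None:
+    """Sketch only: route incoming frames to the existing handlers."""
+    while self._active:
+        msg = await self.ws.receive()
+        if msg["type"] == "websocket.disconnect":
+            break
+        if msg.get("bytes") is not None:
+            # Hot path: raw Int16 PCM at 16kHz; buffer it, no decoding here.
+            self._handle_audio(msg["bytes"])
+        elif msg.get("text") is not None:
+            # Control path: text input, settings updates, interrupt.
+            await self._handle_json(json.loads(msg["text"]))
+    self._cleanup()
+```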
- duration_sec: float - metadata: dict[str, Any] -``` - -### Abstract Base Classes - -Every modality module implements three components: - -```python -class Gate(ABC): - def check(self, raw: bytes, **kwargs) -> GateResult: ... - -class Decoder(ABC): - def decode(self, raw: bytes, **kwargs) -> CognitiveEvent: ... +## Current Problems -class Encoder(ABC): - def encode(self, intent: CognitiveIntent) -> EncodedOutput: ... +### 1. Agent blocks on TTS delivery -class ModalityModule(ABC): - modality_type -> ModalityType # Which modality this handles - gate -> Gate | None # Input filter (None = pass all) - decoder -> Decoder | None # raw -> CognitiveEvent - encoder -> Encoder | None # CognitiveIntent -> EncodedOutput - state -> ModuleState # Live HUD state - health() -> dict # Diagnostics +``` +agent_loop._process(): + await send_response_text(text) # ← fast, JSON to browser + await asyncio.to_thread(bus.act) # ← BLOCKS until TTS generates + delivers + # agent can't process next event until TTS finishes ``` -`Gate` is optional. Text has no gate (all text passes). Voice uses VAD (Voice Activity Detection) to reject silence. - -## The Bus (bus.py) - -`ModalityBus` manages module registration, signal routing, and state tracking. +**Should be:** fire-and-forget the bus.act() intent, return immediately. +bus.act(blocking=False) already returns QueuedJob — just don't await the result. -### perceive() -- Input Path +### 2. Kokoro cold start blocks OutputQueue drain thread ``` -raw bytes ──→ Gate.check() ──→ Decoder.decode() ──→ CognitiveEvent - │ │ - (rejected?) (empty content?) - ↓ ↓ - None None (filtered) -``` - -```python -bus.perceive(raw: bytes, modality: str | ModalityType, channel: str = "", **kwargs) - -> CognitiveEvent | None +OutputQueue drain thread: + _do_encode() → VoiceEncoder.encode() → engine.synthesize() + → Kokoro first-time init: ~60s blocking + → All other queued jobs wait + → _deliver_sync timeout (10s) fires on older jobs ``` -1. Resolve the modality module from the registry -2. If the module has a gate, run `gate.check(raw)`. Emit a `modality.gate` bus event. Return `None` if rejected. -3. Run `decoder.decode(raw)`. If content is empty (e.g., hallucination filtered), emit `modality.filtered` and return `None`. -4. Stamp `source_channel`, emit `modality.input`, return the event. +**Should be:** pre-warm Kokoro on server startup (background thread). -### act() -- Output Path +### 3. WebSocket lifecycle fragility ``` -CognitiveIntent ──→ resolve modality ──→ Encoder.encode() ──→ EncodedOutput - │ - channel.deliver() -``` - -```python -bus.act(intent: CognitiveIntent, channel: str = "", blocking: bool = False) - -> QueuedJob | EncodedOutput +Browser page reload → new WebSocket → new BrowserChannel + Old channel's deliver callback still referenced by bus + Old OutputQueue drain thread still running + → sends to dead WebSocket → timeout → error cascade ``` -1. Resolve output modality: explicit on intent, or inferred from channel capabilities (prefers voice over text), or defaults to text. -2. Encode via the module's encoder. Emits `modality.encode_start` and `modality.output` bus events. -3. If the target channel has a `deliver` callback, call it with the encoded output. -4. If `blocking=True`, returns `EncodedOutput` directly. Otherwise queues via `OutputQueueManager` and returns a `QueuedJob`. +**Should be:** channel cleanup on disconnect should cancel all queued jobs +for that channel. -### hud() -- Agent Awareness +### 4. 
STT blocks the event loop context -```python -bus.hud() -> dict ``` - -Returns a live snapshot of all modules and channels: current status, active jobs, queue depths, recent events. Designed to be injected into the agent's context window so it knows what the body is doing. - -### Channels - -Channels declare which modalities they support. The bus auto-routes output based on channel capabilities. - -```python -bus.register_channel("discord-voice", [ModalityType.VOICE, ModalityType.TEXT], - deliver=send_to_discord) +_process_utterance(): + await asyncio.to_thread(_transcribe) # blocks a thread pool thread + → mlx_whisper.transcribe() # 1-2s CPU-bound + → blocks one thread pool slot ``` -### Bus Events - -Every boundary crossing is recorded as a `BusEvent` (type, modality, channel, timestamp, data). Listeners can subscribe via `bus.on_event(callback)` for ledger integration. The bus keeps the last 500 events in memory. - -## Current Modalities +This is fine for one user. But the thread pool is shared with bus.act(). -### Voice (modules/voice.py) +### 5. No separation between thinking and acting -| Component | Class | Implementation | -|-----------|-------|----------------| -| Gate | `VoiceGate` | Silero VAD via `vad.detect_speech()`. Threshold-configurable (default 0.5). Rejects audio with no detected speech. | -| Decoder | `WhisperDecoder` | `mlx_whisper` STT on Apple Silicon. Lazy-loads `mlx-community/whisper-turbo`. Applies `vad.is_hallucination()` filter to reject phantom transcripts. | -| Decoder (legacy) | `PlaceholderDecoder` | Accepts pre-transcribed text. Used by the MCP server for the `speak` tool path where text is already known. | -| Encoder | `VoiceEncoder` | Wraps `engine.synthesize()` (Kokoro, Voxtral, Chatterbox, Spark). Default voice: `bm_lewis` at 1.25x speed. Returns WAV bytes. | +The agent loop processes ONE event at a time (_processing flag). +If bus.act() blocks, no new events can be processed. +The agent should be able to think about the next input while +TTS is generating for the current one. -### Text (modules/text.py) +## Intended Architecture (what we should build toward) -| Component | Class | Implementation | -|-----------|-------|----------------| -| Gate | None | All text passes through. | -| Decoder | `TextDecoder` | Identity transform: `bytes.decode("utf-8")` -> `CognitiveEvent`. | -| Encoder | `TextEncoder` | Identity transform: `intent.content.encode("utf-8")` -> `EncodedOutput`. | - -Text exists so it is a first-class modality on the bus, not a special case. - -## Integration Points - -### MCP Server (server.py) - -The MCP server creates the bus singleton at module level: - -```python -_bus = _create_bus() # ModalityBus with VoiceModule(decoder=PlaceholderDecoder()) ``` - -MCP tools (`speak`, `diagnostics`, `vad_check`) use `_bus` for voice state tracking, health reports, and VAD. The `speak` tool resolves voices through the bus's voice module, sets encoder state, and uses the engine directly for synthesis (the adaptive player handles local playback). - -The `diagnostics` tool returns `_bus.health()` and `_bus.hud()`. 
- -### HTTP API (http_api.py) - -The HTTP API imports the bus singleton from the MCP server: - -```python -from server import _bus as _shared_bus # Shared instance when co-hosted -_bus = _shared_bus # Falls back to fresh ModalityBus if import fails +Browser ──WebSocket──► BrowserChannel + │ + ┌────▼────┐ + │ INPUT │ (fast, non-blocking) + │ QUEUE │ CognitiveEvents + └────┬────┘ + │ + ┌────▼────┐ + │ AGENT │ (owns conversation, calls LLM) + │ LOOP │ processes events sequentially + │ │ but NEVER blocks on output + └────┬────┘ + │ + tool calls (non-blocking) + │ + ┌────────────┼────────────┐ + │ │ │ + speak(text) send_text() think() + │ │ │ + ▼ ▼ │ + ┌──────────┐ ┌──────────┐ (log) + │ OUTPUT │ │ channel │ + │ QUEUE │ │ .deliver │ + │ (async) │ │ (JSON) │ + └────┬─────┘ └──────────┘ + │ + ┌────▼─────┐ + │ TTS │ (background thread) + │ Kokoro │ + └────┬─────┘ + │ + ch.deliver(base64 WAV) + │ + ▼ + Browser playback ``` -It ensures both Text and Voice modules are registered, then exposes the bus directly: - -| Endpoint | Bus Method | -|----------|------------| -| `GET /v1/bus/hud` | `_bus.hud()` | -| `GET /v1/bus/health` | `_bus.health()` | -| `POST /v1/bus/perceive` | `_bus.perceive(raw, modality, channel)` | -| `POST /v1/bus/act` | `_bus.act(intent, channel, blocking=True)` | -| `GET /health` | includes `_bus.health()` and `_bus.hud()` | - -When running with `--all`, both MCP and HTTP share the same bus instance and model cache. - -## Adding a New Modality - -1. **Create `modules/your_modality.py`** -- implement `Gate`, `Decoder`, `Encoder` (all optional), and a `ModalityModule` subclass that wires them together. See `modules/text.py` for the minimal case or `modules/voice.py` for the full pattern. - -2. **Add the modality type** to `ModalityType` in `modality.py` if needed. `VISION` and `SPATIAL` are already defined. - -3. **Register with the bus** where it is created (`server.py` and/or `http_api.py`): - ```python - bus.register(VisionModule()) - bus.register_channel("webcam-feed", [ModalityType.VISION]) - ``` - -4. **No routing changes needed.** The bus auto-routes `act()` based on channel capabilities. The HTTP API's `/v1/bus/perceive` and `/v1/bus/act` already accept any registered modality via the `modality` parameter. +Key principle: **the agent never waits for output delivery.** +speak() queues a TTS job and returns immediately. +The bus handles encoding and delivery asynchronously. 
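+
+A minimal sketch of that contract, assuming `bus.act(blocking=False)`
+returns a `QueuedJob` as bus.py already does (`dispatch_speak` is an
+illustrative helper, not existing code):
+
+```python
+from modality import CognitiveIntent, ModalityType
+
+def dispatch_speak(bus, channel_id: str, text: str):
+    """Sketch only: enqueue TTS and return without awaiting delivery."""
+    intent = CognitiveIntent(
+        modality=ModalityType.VOICE,
+        content=text,
+        target_channel=channel_id,
+        metadata={"voice": "bm_lewis", "speed": 1.25},
+    )
+    # blocking=False queues the job via OutputQueueManager and returns a
+    # QueuedJob handle; Kokoro encoding and delivery happen on the drain
+    # thread, so the agent loop is free to process the next event.
+    return bus.act(intent, blocking=False)
+```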
+ +## Files + +| File | Role | Lines | Status | +|------|------|-------|--------| +| `providers.py` | InferenceProvider: MLX, Ollama, CogOS | ~450 | Working | +| `channels.py` | BrowserChannel: WebSocket ↔ bus | ~260 | Working (fragile) | +| `agent_loop.py` | Event → LLM → tool dispatch | ~160 | Working (blocks on TTS) | +| `dashboard/index.html` | UI: chat, VAD, settings | ~700 | Working | +| `dashboard/transport.js` | WebSocket framing | ~100 | Working | +| `dashboard/playback.js` | Web Audio playback | ~113 | Working | +| `http_api.py` | WebSocket endpoint, static serving | +70 | Working | +| `server.py` | --dashboard startup mode | +12 | Working | +| `modules/voice.py` | VoiceGate, WhisperDecoder, VoiceEncoder | 309 | Working (not used for dashboard STT) | +| `bus.py` | ModalityBus: perceive/act, OutputQueue | 318 | Working | diff --git a/adaptive_player.py b/adaptive_player.py index 89b8062..7638437 100644 --- a/adaptive_player.py +++ b/adaptive_player.py @@ -263,13 +263,36 @@ def wait(self, timeout: float = 120.0) -> PlaybackMetrics: # Internal # ------------------------------------------------------------------ + def _resolve_device(self): + """Resolve the output device, falling back to system default if unavailable.""" + if self.device is None: + return None # sounddevice uses system default + + try: + devices = sd.query_devices() + if isinstance(self.device, int): + if self.device < len(devices): + info = devices[self.device] + if info["max_output_channels"] > 0: + return self.device + elif isinstance(self.device, str): + for i, d in enumerate(devices): + if self.device in d["name"] and d["max_output_channels"] > 0: + return i + except Exception: + pass + + # Device unavailable — fall back to system default. + return None + def _start_stream(self): self._stream_finished.clear() + resolved = self._resolve_device() self._stream = sd.OutputStream( samplerate=self.sample_rate, channels=1, dtype="float32", - device=self.device, + device=resolved, callback=self._callback, finished_callback=self._on_stream_finished, blocksize=self.buffer_size, diff --git a/agent_loop.py b/agent_loop.py index 449fcba..2f28aad 100644 --- a/agent_loop.py +++ b/agent_loop.py @@ -11,11 +11,12 @@ import logging import os import time -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import httpx from bus import ModalityBus +from draft_queue import DraftQueue from modality import CognitiveEvent, CognitiveIntent, ModalityType from pipeline_state import PipelineState from providers import AGENT_TOOLS, InferenceProvider @@ -82,10 +83,11 @@ def _fetch_kernel_context() -> str: interrupted = signal.get("interrupted") if interrupted: delivered = interrupted.get("delivered_text", "") + full = interrupted.get("full_text", "") pct = interrupted.get("spoken_pct", 0) parts.append( - f"[barge-in] Claude's speech was interrupted at {pct * 100:.0f}%. " - f'Delivered: "{delivered}". ' + f"[barge-in] Claude's speech was interrupted at {pct*100:.0f}%. " + f"Delivered: \"{delivered}\". " f"The user interrupted to say something — acknowledge and respond to them." 
) except Exception: @@ -122,7 +124,6 @@ def _log_exchange_to_bus(user_text: str, assistant_text: str, provider_name: str except Exception as e: logger.debug("Failed to log exchange to bus: %s", e) - MAX_HISTORY = 50 @@ -143,6 +144,9 @@ def __init__( self.conversation: list[dict[str, str]] = [] self._channel_ref: BrowserChannel | None = None self._processing = False + self.draft_queue = DraftQueue() + self._speculative_context: list[dict[str, str]] = [] # Context for speculative inference + self._human_speaking = False # Whether human is currently speaking async def handle_event(self, event: CognitiveEvent) -> None: """Called when a CognitiveEvent arrives from the channel.""" @@ -169,6 +173,13 @@ async def handle_event(self, event: CognitiveEvent) -> None: async def _process(self, event: CognitiveEvent) -> None: """Core: event → provider → tool dispatch.""" + # Context stitching: inject interrupt context from dashboard path + # This closes the barge-in loop — the agent knows what was spoken, + # what was unsaid, and what the user interrupted with. + interrupt_context = self._build_interrupt_context(event.content) + if interrupt_context: + self.conversation.append({"role": "system", "content": interrupt_context}) + self.conversation.append({"role": "user", "content": event.content}) self._trim_history() @@ -203,9 +214,7 @@ async def _process(self, event: CognitiveEvent) -> None: content=text, target_channel=self.channel_id, metadata={ - "voice": self._channel_ref.config.get("voice", "bm_lewis") - if self._channel_ref - else "bm_lewis", + "voice": self._channel_ref.config.get("voice", "bm_lewis") if self._channel_ref else "bm_lewis", "speed": self._channel_ref.config.get("speed", 1.25) if self._channel_ref else 1.25, }, ) @@ -240,12 +249,10 @@ async def _process(self, event: CognitiveEvent) -> None: # Update conversation history if assistant_parts: assistant_text = " ".join(assistant_parts) - self.conversation.append( - { - "role": "assistant", - "content": assistant_text, - } - ) + self.conversation.append({ + "role": "assistant", + "content": assistant_text, + }) # Log exchange to CogOS bus (observation channel — Claude can see this) _log_exchange_to_bus(event.content, assistant_text, self.provider.name) @@ -256,6 +263,278 @@ async def _process(self, event: CognitiveEvent) -> None: metrics={"llm_ms": round(t_llm, 1), "provider": self.provider.name} ) + async def speculative_infer(self, committed_text: str) -> None: + """D2: Speculative inference trigger. + + When T3 commits a sentence while the human is still speaking, + launch background inference with context-so-far. Store result + in the DraftQueue. Does NOT play — just buffers. 
+ """ + if not committed_text.strip(): + return + + logger.info("speculative_infer: '%s'", committed_text[:80]) + + # Build speculative conversation with committed text so far + spec_messages = list(self.conversation) + [ + {"role": "user", "content": committed_text}, + ] + + try: + t_start = time.perf_counter() + kernel_ctx = _fetch_kernel_context() + system_prompt = _BASE_SYSTEM_PROMPT + kernel_ctx + + response = await self.provider.chat( + messages=spec_messages, + tools=AGENT_TOOLS, + system=system_prompt, + ) + + t_ms = (time.perf_counter() - t_start) * 1000 + + # Extract response text + response_text = "" + for tc in response.tool_calls: + if tc.name == "speak": + response_text += tc.arguments.get("text", "") + " " + if not response_text and response.text: + response_text = response.text + + response_text = response_text.strip() + if not response_text: + return + + # Add to draft queue + import hashlib + ctx_hash = hashlib.md5(committed_text.encode()).hexdigest()[:8] + block = self.draft_queue.add_block( + text=response_text, + context_hash=ctx_hash, + generation_ms=t_ms, + ) + + logger.info( + "speculative block %s: '%s' (%.0fms)", + block.id, response_text[:60], t_ms, + ) + + # F2: Speculative TTS pre-synthesis + # Generate audio immediately but don't play + await self._presynthesise_block(block) + + # Notify dashboard of draft queue state + if self._channel_ref: + await self._channel_ref.ws.send_json({ + "type": "draft_queue", + "blocks": [b.to_dict() for b in self.draft_queue.get_pending()], + }) + + except Exception as e: + logger.debug("speculative_infer failed: %s", e) + + async def self_barge_snip(self, block_id: str) -> bool: + """E1: Remove a queued block that's no longer relevant.""" + result = self.draft_queue.snip(block_id) + if result: + logger.info("self-barge: snipped block %s", block_id) + await self._push_draft_queue_state() + return result + + async def self_barge_inject(self, position: int, text: str) -> None: + """E1: Insert a new block at position.""" + block = self.draft_queue.inject(position, text) + logger.info("self-barge: injected block %s at pos %d", block.id, position) + # Pre-synthesize the new block + await self._presynthesise_block(block) + await self._push_draft_queue_state() + + async def self_barge_revise(self, block_id: str, new_text: str) -> bool: + """E1: Replace a block's content and re-synthesize TTS.""" + result = self.draft_queue.revise(block_id, new_text) + if result: + logger.info("self-barge: revised block %s -> '%s'", block_id, new_text[:60]) + # Find the block and re-synthesize + for block in self.draft_queue.all_blocks: + if block.id == block_id: + await self._presynthesise_block(block) + break + await self._push_draft_queue_state() + return result + + async def _push_draft_queue_state(self) -> None: + """Push current draft queue state to the dashboard.""" + if self._channel_ref: + try: + await self._channel_ref.ws.send_json({ + "type": "draft_queue", + "blocks": [b.to_dict() for b in self.draft_queue.all_blocks], + }) + except Exception: + pass + + async def invalidate_stale_drafts(self, new_context: str) -> int: + """D3: Draft block invalidation. + + When a new T3 sentence arrives, check if existing draft blocks + are still valid given the updated context. Mark stale ones. + + Uses context hash comparison: if a block was generated with + different context than what we have now, it's potentially stale. + + Returns count of invalidated blocks. 
+ """ + import hashlib + + new_hash = hashlib.md5(new_context.encode()).hexdigest()[:8] + invalidated = 0 + + for block in self.draft_queue.get_pending(): + if block.context_hash and block.context_hash != new_hash: + self.draft_queue.invalidate(block.id) + invalidated += 1 + logger.info("invalidated stale draft block %s (context changed)", block.id) + + if invalidated > 0 and self._channel_ref: + try: + await self._channel_ref.ws.send_json({ + "type": "draft_queue", + "blocks": [b.to_dict() for b in self.draft_queue.all_blocks], + }) + except Exception: + pass + + return invalidated + + async def _presynthesise_block(self, block) -> None: + """F2: Pre-synthesize TTS audio for a draft block. + + Generates audio immediately and attaches it to the block. + Ready for instant playback when the human stops speaking. + """ + from modules.voice import VoiceEncoder, _encode_wav + + try: + voice = "bm_lewis" + speed = 1.25 + if self._channel_ref: + voice = self._channel_ref.config.get("voice", "bm_lewis") + speed = self._channel_ref.config.get("speed", 1.25) + + def _synth(): + from engine import synthesize + samples, sample_rate = synthesize( + block.text, + voice=voice, + speed=speed, + ) + wav_bytes = _encode_wav(samples, sample_rate) + duration = len(samples) / sample_rate + return wav_bytes, duration + + wav_bytes, duration = await asyncio.to_thread(_synth) + block.tts_audio = wav_bytes + block.tts_duration_sec = duration + logger.info("pre-synthesized block %s: %.1fs audio", block.id, duration) + + except Exception as e: + logger.debug("pre-synthesis failed for block %s: %s", block.id, e) + + async def background_validate_drafts(self, latest_user_text: str) -> None: + """E2: Background validation loop. + + After each new human sentence, re-evaluate all queued draft blocks. + Snips/revises if context has invalidated them. This runs between + TTS synthesis and playback — the revision window. + """ + pending = self.draft_queue.get_pending() + if not pending: + return + + logger.info("background_validate: checking %d pending blocks", len(pending)) + + # First, invalidate any blocks whose context is clearly stale + await self.invalidate_stale_drafts(latest_user_text) + + # Then re-evaluate remaining valid blocks + still_pending = self.draft_queue.get_pending() + if not still_pending: + return + + # Build context with latest human input + check_messages = list(self.conversation) + [ + {"role": "user", "content": latest_user_text}, + ] + + for block in still_pending: + try: + # Quick relevance check: ask the model if this block is still appropriate + check_prompt = ( + f"Given the user just said: \"{latest_user_text}\"\n" + f"Is this planned response still appropriate? " + f"Response: \"{block.text}\"\n" + f"Answer KEEP or REVISE in one word." + ) + + response = await self.provider.chat( + messages=[{"role": "user", "content": check_prompt}], + tools=[], + system="You are evaluating whether a planned response is still valid. Answer KEEP or REVISE.", + ) + + answer = (response.text or "").strip().upper() + if "REVISE" in answer: + logger.info("background_validate: block %s needs revision", block.id) + self.draft_queue.invalidate(block.id) + else: + logger.debug("background_validate: block %s still valid", block.id) + + except Exception as e: + logger.debug("background_validate error for block %s: %s", block.id, e) + + await self._push_draft_queue_state() + + def _build_interrupt_context(self, user_text: str) -> str | None: + """Build context stitch from pipeline_state.last_interrupt. 
+ + When the user barged in during TTS playback, captures what was + spoken vs unspoken and injects it as structured context for the + next inference call. Consumes the interrupt (clears it). + + Returns a context string, or None if no interrupt occurred. + """ + info = self.pipeline_state.last_interrupt + if info is None: + return None + + # Only use recent interrupts (within last 30 seconds) + if time.time() - info.timestamp > 30: + return None + + # Clear the interrupt so we don't re-inject it + with self.pipeline_state._lock: + self.pipeline_state._last_interrupt = None + + # Compute unspoken remainder + unspoken = "" + if info.full_text and info.delivered_text: + if info.full_text.startswith(info.delivered_text): + unspoken = info.full_text[len(info.delivered_text):].strip() + else: + # Fallback: everything after the delivered percentage + unspoken = info.full_text[len(info.delivered_text):].strip() + + parts = [] + parts.append("[Barge-in context — your previous response was interrupted]") + parts.append(f"spoken (user heard this): \"{info.delivered_text}\"") + if unspoken: + parts.append(f"unspoken (user did NOT hear this): \"{unspoken}\"") + parts.append(f"interrupted_at: {info.spoken_pct*100:.0f}%") + parts.append(f"user_said: \"{user_text}\"") + parts.append("Acknowledge what was interrupted and respond to the user's new input.") + + return "\n".join(parts) + def _trim_history(self) -> None: """Keep conversation within MAX_HISTORY messages.""" if len(self.conversation) > MAX_HISTORY: diff --git a/channels.py b/channels.py index d0ab8c8..8480953 100644 --- a/channels.py +++ b/channels.py @@ -3,6 +3,11 @@ Wraps a FastAPI WebSocket connection as a ChannelDescriptor on the bus. Knows the WebSocket protocol (binary PCM / JSON control frames), knows nothing about LLMs or agent logic. 
+ +Includes three-tier adaptive STT scheduler: + T1 (Whisper Base, ~31ms): per-chunk during speech + T2 (Whisper Large, ~470ms): on natural pause + T3 (Whisper Large, ~470ms): on end-of-utterance (final) """ from __future__ import annotations @@ -14,10 +19,12 @@ import uuid from typing import Any, Awaitable, Callable +import numpy as np from fastapi import WebSocket, WebSocketDisconnect from bus import ModalityBus from modality import CognitiveEvent, EncodedOutput, ModalityType +from modules.voice import WhisperDecoder from pipeline_state import PipelineState logger = logging.getLogger("mod3.channels") @@ -48,6 +55,16 @@ def __init__( self._audio_buffer = bytearray() self._active = True + # Three-tier STT state + self._streaming_decoder = WhisperDecoder(load_base=True) + self._streaming_audio = bytearray() # Growing buffer for streaming STT + self._last_t1_time = 0.0 # Last T1 transcription time + self._last_speech_time = 0.0 # Last time we received speech audio + self._t1_interval = 0.3 # Run T1 every 300ms + self._t2_pause_threshold = 0.6 # Run T2 after 600ms pause + self._is_speaking = False # Whether user is currently speaking + self._t2_scheduled = False # Whether T2 is already scheduled + # Register on the bus with a delivery callback bus.register_channel( self.channel_id, @@ -65,7 +82,9 @@ def _deliver_sync(self, output: EncodedOutput) -> None: if not self._active: return try: - future = asyncio.run_coroutine_threadsafe(self._deliver_async(output), self._loop) + future = asyncio.run_coroutine_threadsafe( + self._deliver_async(output), self._loop + ) future.result(timeout=10.0) except (WebSocketDisconnect, RuntimeError, TimeoutError): logger.debug("deliver failed (client disconnected?), deactivating channel") @@ -87,15 +106,13 @@ async def _deliver_async(self, output: EncodedOutput) -> None: # Send audio as base64 JSON (avoids binary frame issues) audio_b64 = base64.b64encode(output.data).decode("ascii") logger.info("deliver: sending base64 audio JSON (%d chars)", len(audio_b64)) - await self.ws.send_json( - { - "type": "audio", - "data": audio_b64, - "format": output.format or "wav", - "duration_sec": round(output.duration_sec, 2), - "sample_rate": output.metadata.get("sample_rate", 24000), - } - ) + await self.ws.send_json({ + "type": "audio", + "data": audio_b64, + "format": output.format or "wav", + "duration_sec": round(output.duration_sec, 2), + "sample_rate": output.metadata.get("sample_rate", 24000), + }) logger.info("deliver: audio sent OK") elif output.modality == ModalityType.TEXT: text = output.data.decode("utf-8") if isinstance(output.data, bytes) else str(output.data) @@ -128,8 +145,29 @@ async def run(self) -> None: self._cleanup() def _handle_audio(self, pcm_bytes: bytes) -> None: - """Binary frame: raw Int16 PCM at 16kHz from browser Silero VAD.""" + """Binary frame: raw Int16 PCM at 16kHz from browser Silero VAD. + + A5: Receives streaming audio during speech (from onFrameProcessed) + AND the final complete buffer (from onSpeechEnd). Both accumulate + for the final T3 utterance processing. + + During speech, audio also accumulates in _streaming_audio for T1/T2 + partial transcription. 
+ """ self._audio_buffer.extend(pcm_bytes) + self._streaming_audio.extend(pcm_bytes) + self._last_speech_time = time.monotonic() + self._is_speaking = True + + # T1: Fast Whisper Base transcription every _t1_interval + now = time.monotonic() + if now - self._last_t1_time >= self._t1_interval and len(self._streaming_audio) > 6400: + self._last_t1_time = now + asyncio.ensure_future(self._run_t1()) + + # Schedule T2 check on pause detection + if not self._t2_scheduled: + asyncio.ensure_future(self._schedule_t2_on_pause()) async def _handle_json(self, msg: dict) -> None: """JSON frame: control message dispatch.""" @@ -149,12 +187,92 @@ async def _handle_json(self, msg: dict) -> None: if key in msg: self.config[key] = msg[key] + # ------------------------------------------------------------------ + # Three-Tier STT + # ------------------------------------------------------------------ + + async def _run_t1(self) -> None: + """T1: Fast Whisper Base transcription on growing audio buffer (~31ms). + + Runs every ~300ms during speech. Emits partial_transcript with + confirmed/tentative text at 30% opacity. + """ + if not self._streaming_audio: + return + + pcm_data = bytes(self._streaming_audio) + + def _transcribe_t1(): + audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0 + if len(audio) < 4800: # <300ms + return None + return self._streaming_decoder.decode_streaming(audio, tier="t1") + + try: + result = await asyncio.to_thread(_transcribe_t1) + if result and result.get("changed") and not result.get("filtered"): + await self.ws.send_json({ + "type": "partial_transcript", + "confirmed": result["confirmed"], + "tentative": result["tentative"], + "tier": "t1", + "elapsed_ms": result["elapsed_ms"], + }) + except Exception as e: + logger.debug("T1 error: %s", e) + + async def _run_t2(self) -> None: + """T2: Large model transcription on natural pause (~470ms). + + Runs when speech pauses for >600ms but hasn't ended. Emits + partial_transcript with higher confidence (60% opacity). + """ + if not self._streaming_audio: + return + + pcm_data = bytes(self._streaming_audio) + + def _transcribe_t2(): + audio = np.frombuffer(pcm_data, dtype=np.int16).astype(np.float32) / 32768.0 + if len(audio) < 8000: # <500ms + return None + return self._streaming_decoder.decode_streaming(audio, tier="t2") + + try: + result = await asyncio.to_thread(_transcribe_t2) + if result and not result.get("filtered"): + await self.ws.send_json({ + "type": "partial_transcript", + "confirmed": result["confirmed"], + "tentative": result["tentative"], + "tier": "t2", + "elapsed_ms": result["elapsed_ms"], + }) + except Exception as e: + logger.debug("T2 error: %s", e) + finally: + self._t2_scheduled = False + + async def _schedule_t2_on_pause(self) -> None: + """Check if speech has paused long enough for T2.""" + await asyncio.sleep(self._t2_pause_threshold) + if not self._is_speaking: + return + # Check if there's been a pause since last audio + silence = time.monotonic() - self._last_speech_time + if silence >= self._t2_pause_threshold and not self._t2_scheduled: + self._t2_scheduled = True + await self._run_t2() + # ------------------------------------------------------------------ # Processing # ------------------------------------------------------------------ async def _process_utterance(self) -> None: - """PCM audio buffer → WhisperDecoder STT → CognitiveEvent → agent loop. + """T3: PCM audio buffer → WhisperDecoder STT → CognitiveEvent → agent loop. + + This is the final tier — end-of-utterance. 
Uses the Large model for + maximum accuracy. Everything is confirmed (100% opacity). Skips the server-side VoiceGate (Silero VAD) because the browser already ran Silero VAD client-side — no need to validate again, @@ -163,6 +281,11 @@ async def _process_utterance(self) -> None: pcm_data = bytes(self._audio_buffer) self._audio_buffer.clear() + # Reset streaming state + self._streaming_audio.clear() + self._streaming_decoder.reset_streaming() + self._is_speaking = False + if len(pcm_data) < 6400: # <200ms at 16kHz Int16 return @@ -185,7 +308,7 @@ def _transcribe(): # Skip silence if len(audio) < 16000 * 0.3: return None - rms = float(np.sqrt(np.mean(audio**2))) + rms = float(np.sqrt(np.mean(audio ** 2))) if rms < 0.005: return None @@ -234,14 +357,12 @@ def _transcribe(): if event and event.content: # Send transcript to browser - await self.ws.send_json( - { - "type": "transcript", - "text": event.content, - "stt_ms": round(stt_ms, 1), - "source": "voice", - } - ) + await self.ws.send_json({ + "type": "transcript", + "text": event.content, + "stt_ms": round(stt_ms, 1), + "source": "voice", + }) # Forward to agent loop event.metadata["stt_ms"] = stt_ms if self._on_event: @@ -255,13 +376,11 @@ async def _process_text(self, text: str) -> None: source_channel=self.channel_id, confidence=1.0, ) - await self.ws.send_json( - { - "type": "transcript", - "text": text, - "source": "text", - } - ) + await self.ws.send_json({ + "type": "transcript", + "text": text, + "source": "text", + }) if self._on_event: await self._on_event(event) @@ -288,12 +407,10 @@ async def send_response_complete(self, metrics: dict | None = None) -> None: """Signal response is complete.""" if self._active: try: - await self.ws.send_json( - { - "type": "response_complete", - "metrics": metrics or {}, - } - ) + await self.ws.send_json({ + "type": "response_complete", + "metrics": metrics or {}, + }) except Exception: self._active = False diff --git a/dashboard/index.html b/dashboard/index.html index 6b1f9f8..914a2b0 100644 --- a/dashboard/index.html +++ b/dashboard/index.html @@ -106,6 +106,71 @@ /* Headphone hint */ .hint { font-size: 0.7rem; color: var(--muted); padding: 4px 0; flex-shrink: 0; } + /* Opacity-as-state: three visual states for text blocks */ + .opacity-inflight { opacity: 0.3; transition: opacity 0.3s ease; } + .opacity-corrected { opacity: 0.6; transition: opacity 0.3s ease; } + .opacity-committed { opacity: 1.0; transition: opacity 0.3s ease; } + + /* Word-level solidification spans */ + .voice-word { + display: inline; + transition: opacity 0.2s ease; + } + .voice-word.spoken { opacity: 1.0; } + .voice-word.speaking { opacity: 0.85; color: var(--accent); } + .voice-word.unspoken { opacity: 0.3; } + + /* Draft queue preview blocks */ + .draft-preview { + opacity: 0.3; + padding: 6px 12px; + margin-top: 4px; + border-left: 2px solid var(--accent); + border-radius: 4px; + font-size: 0.85rem; + color: var(--muted); + transition: opacity 0.3s ease; + } + .draft-preview.validated { opacity: 0.6; } + .draft-preview.stale { + opacity: 0.15; + text-decoration: line-through; + border-left-color: var(--orange); + } + .draft-preview.revised { + border-left-color: var(--green); + animation: revision-flash 0.6s ease; + } + @keyframes revision-flash { + 0% { opacity: 0.8; border-left-color: var(--green); } + 100% { opacity: 0.3; border-left-color: var(--accent); } + } + + /* Partial transcript (assembling) */ + .partial-transcript { + font-size: 0.85rem; + color: var(--muted); + padding: 4px 0; + min-height: 1.3em; + } + 
.partial-confirmed { opacity: 0.6; color: var(--text); } + .partial-tentative { opacity: 0.3; color: var(--muted); font-style: italic; } + + /* Interruption marker */ + .interrupt-marker { + display: inline-block; + width: 2px; + height: 1em; + background: var(--red); + margin: 0 2px; + vertical-align: middle; + animation: blink 1s ease-in-out 3; + } + @keyframes blink { + 0%, 100% { opacity: 1; } + 50% { opacity: 0.3; } + } + /* Responsive */ @media (max-width: 700px) { .main { padding: 12px 16px; } @@ -214,9 +279,15 @@


+      <!-- Streaming STT: partial transcript (confirmed / tentative) -->
+      <div id="partial-transcript" class="partial-transcript" style="display: none"></div>
+
+      <!-- Speculative generation: draft queue preview -->
+      <div id="draft-queue-preview" style="display: none"></div>
+
@@ -379,6 +450,49 @@


} let _currentAssistantMsg = null; +let _currentAssistantText = ''; // Full text of current response for word tracking + +/** + * C2: Word-level timing estimation. + * Given total audio duration and full text, estimates the time position + * of each word assuming uniform speech rate. + * + * Returns array of { word, startSec, endSec, index } + */ +function estimateWordTimings(text, totalDurationSec) { + const words = text.split(/\s+/).filter(w => w.length > 0); + if (words.length === 0) return []; + + // Weight by character count (longer words take more time) + const totalChars = words.reduce((sum, w) => sum + w.length, 0); + let currentTime = 0; + + return words.map((word, index) => { + const fraction = word.length / totalChars; + const duration = fraction * totalDurationSec; + const entry = { + word, + startSec: currentTime, + endSec: currentTime + duration, + index, + }; + currentTime += duration; + return entry; + }); +} + +/** + * Get the word index being spoken at a given playback time. + */ +function getWordAtTime(timings, currentTimeSec) { + for (let i = timings.length - 1; i >= 0; i--) { + if (currentTimeSec >= timings[i].startSec) return i; + } + return 0; +} + +let _wordTimings = []; // Current word timings for solidification +let _solidificationActive = false; function appendAssistantStart(source) { const div = document.createElement('div'); @@ -393,6 +507,7 @@


chatArea.appendChild(div); chatArea.scrollTop = chatArea.scrollHeight; _currentAssistantMsg = div.querySelector('.msg-text'); + _currentAssistantText = ''; return div; } @@ -401,15 +516,72 @@


if (!_currentAssistantMsg) { appendAssistantStart(); } - _currentAssistantMsg.innerHTML += escapeHtml(text); + _currentAssistantText += text; + + // C4: Wrap each word in a span for solidification animation + const words = text.split(/(\s+)/); + words.forEach(segment => { + if (/^\s+$/.test(segment)) { + _currentAssistantMsg.appendChild(document.createTextNode(segment)); + } else if (segment) { + const span = document.createElement('span'); + span.className = 'voice-word unspoken'; + span.textContent = segment; + _currentAssistantMsg.appendChild(span); + } + }); + chatArea.scrollTop = chatArea.scrollHeight; } +function startSolidification(totalDurationSec) { + /** + * C4: Begin solidification animation — words solidify left-to-right + * tracking the audio playback position. Uses C1 progress + C2 timing. + */ + if (!_currentAssistantMsg || !_currentAssistantText) return; + + _wordTimings = estimateWordTimings(_currentAssistantText, totalDurationSec); + _solidificationActive = true; +} + +// Listen for playback progress to drive solidification +window.addEventListener('playback-progress', (e) => { + if (!_solidificationActive || !_currentAssistantMsg) return; + + const { progress } = e.detail; + const currentTime = progress * (_wordTimings.length > 0 ? + _wordTimings[_wordTimings.length - 1].endSec : 0); + + const wordSpans = _currentAssistantMsg.querySelectorAll('.voice-word'); + const currentWordIdx = getWordAtTime(_wordTimings, currentTime); + + wordSpans.forEach((span, i) => { + if (i < currentWordIdx) { + span.className = 'voice-word spoken'; + } else if (i === currentWordIdx) { + span.className = 'voice-word speaking'; + } else { + span.className = 'voice-word unspoken'; + } + }); +}); + function finalizeAssistant(suffix) { - if (_currentAssistantMsg && suffix) { - _currentAssistantMsg.innerHTML += ` ${escapeHtml(suffix)}`; + _solidificationActive = false; + _wordTimings = []; + + if (_currentAssistantMsg) { + // Mark all words as spoken (committed) + _currentAssistantMsg.querySelectorAll('.voice-word').forEach(span => { + span.className = 'voice-word spoken'; + }); + if (suffix) { + _currentAssistantMsg.innerHTML += ` ${escapeHtml(suffix)}`; + } } _currentAssistantMsg = null; + _currentAssistantText = ''; } async function sendTextMessage(text) { @@ -471,19 +643,51 @@


} }; + // Wire up playback progress for solidification + pb.onProgress = (samplesPlayed, totalSamples) => { + // Will be used by C4 solidification animation (Wave 2) + const pct = totalSamples > 0 ? samplesPlayed / totalSamples : 0; + // Emit custom event for word-level tracking + window.dispatchEvent(new CustomEvent('playback-progress', { + detail: { samplesPlayed, totalSamples, progress: pct } + })); + }; + const t = new VoiceTransport(wsUrl, { onAudio: (data) => { pb.enqueueWav(data); + // Start solidification when first audio arrives + if (_currentAssistantText && !_solidificationActive) { + // Estimate total duration from accumulated audio + // (will refine as more chunks arrive) + startSolidification(pb.totalDuration || 2.0); + } const el = document.getElementById('voice-status'); if (el) { el.textContent = 'Speaking...'; el.style.color = 'var(--accent)'; } }, onTranscript: (msg) => { if (msg.source !== 'text') appendMessage('user', msg.text, 'voice'); - // Don't create assistant bubble here — appendAssistantChunk lazy-creates it + // Hide partial transcript when final arrives + const ptEl = document.getElementById('partial-transcript'); + if (ptEl) { ptEl.style.display = 'none'; ptEl.innerHTML = ''; } const el = document.getElementById('voice-status'); if (el) { el.textContent = 'Thinking...'; el.style.color = 'var(--orange)'; } }, - onResponseText: (msg) => appendAssistantChunk(msg.text), + onPartialTranscript: (msg) => { + // Show partial transcript with confirmed/tentative styling + const ptEl = document.getElementById('partial-transcript'); + if (ptEl) { + ptEl.style.display = ''; + let html = ''; + if (msg.confirmed) html += `${escapeHtml(msg.confirmed)} `; + if (msg.tentative) html += `${escapeHtml(msg.tentative)}`; + ptEl.innerHTML = html || '...'; + } + }, + onResponseText: (msg) => { + pb.resetProgress(); // Reset progress for new response + appendAssistantChunk(msg.text); + }, onResponseComplete: (msg) => { finalizeAssistant(); if (msg.metrics) { @@ -500,6 +704,67 @@


pb.flush(); finalizeAssistant('[interrupted]'); }, + onDraftQueue: (msg) => { + /** + * E3 + D4: Self-barge visual feedback + Queue preview UI. + * + * When the agent revises queued output, the transparent text visibly + * changes in the dashboard. Each block gets a data-block-id attribute + * for targeted updates. Revised blocks flash briefly to draw attention. + */ + const dqEl = document.getElementById('draft-queue-preview'); + if (!dqEl) return; + if (msg.blocks && msg.blocks.length > 0) { + dqEl.style.display = ''; + + msg.blocks.forEach(b => { + const existing = dqEl.querySelector(`[data-block-id="${b.id}"]`); + if (existing) { + // Block exists — check if content changed (self-barge revision) + const oldText = existing.getAttribute('data-text') || ''; + if (oldText !== b.text) { + // E3: Content changed — flash animation to show revision + existing.innerHTML = escapeHtml(b.text); + existing.setAttribute('data-text', b.text); + existing.style.transition = 'none'; + existing.style.borderLeftColor = 'var(--green)'; + existing.style.opacity = '0.8'; + requestAnimationFrame(() => { + existing.style.transition = 'opacity 0.5s ease, border-left-color 0.5s ease'; + existing.style.opacity = b.status === 'stale' ? '0.15' : '0.3'; + existing.style.borderLeftColor = b.status === 'stale' ? 'var(--orange)' : 'var(--accent)'; + }); + } + // Update status class + existing.className = b.status === 'stale' ? 'draft-preview stale' : + b.status === 'valid' ? 'draft-preview' : 'draft-preview validated'; + } else { + // New block — create element + const div = document.createElement('div'); + div.className = b.status === 'stale' ? 'draft-preview stale' : + b.status === 'valid' ? 'draft-preview' : 'draft-preview validated'; + div.setAttribute('data-block-id', b.id); + div.setAttribute('data-text', b.text); + div.textContent = b.text; + dqEl.appendChild(div); + } + }); + + // Remove blocks that no longer exist + const currentIds = new Set(msg.blocks.map(b => b.id)); + dqEl.querySelectorAll('[data-block-id]').forEach(el => { + if (!currentIds.has(el.getAttribute('data-block-id'))) { + // Snipped block — fade out + el.style.transition = 'opacity 0.3s ease'; + el.style.opacity = '0'; + setTimeout(() => el.remove(), 300); + } + }); + } else { + dqEl.style.display = 'none'; + dqEl.innerHTML = ''; + } + }, onMetrics: (msg) => { if (msg.sample_rate && msg.sample_rate !== pb.sampleRate) pb.setSampleRate(msg.sample_rate); }, @@ -632,7 +897,27 @@


preSpeechPadFrames: 8, onSpeechStart: () => { console.log('[Silero VAD] Speech START'); + // B3: Barge-in visual state if (_playback && _playback.isPlaying) { + // Freeze unspoken words at 30% opacity with interrupt marker + if (_currentAssistantMsg && _solidificationActive) { + const wordSpans = _currentAssistantMsg.querySelectorAll('.voice-word'); + let interrupted = false; + wordSpans.forEach(span => { + if (span.classList.contains('unspoken') || span.classList.contains('speaking')) { + span.classList.remove('speaking'); + span.classList.add('unspoken'); + if (!interrupted) { + // Insert interrupt marker before the first unspoken word + const marker = document.createElement('span'); + marker.className = 'interrupt-marker'; + span.parentNode.insertBefore(marker, span); + interrupted = true; + } + } + }); + _solidificationActive = false; + } _playback.flush(); if (_transport) _transport.interrupt(); } @@ -653,9 +938,19 @@


onVADMisfire: () => { if (micDebug) micDebug.textContent = 'VAD: misfire (too short)'; }, - onFrameProcessed: (probs) => { + onFrameProcessed: (probs, audioFrame) => { if (levelBar) levelBar.style.width = Math.min(100, probs.isSpeech * 100) + '%'; if (micDebug) micDebug.textContent = `silero: ${probs.isSpeech.toFixed(3)} thr=${vadThreshold}`; + + // A5: Stream audio chunks during speech for server-side streaming STT + // Send frames when speech is detected (server accumulates for T1/T2) + if (probs.isSpeech > vadThreshold && audioFrame && _transport && _transport.connected) { + const int16 = new Int16Array(audioFrame.length); + for (let i = 0; i < audioFrame.length; i++) { + int16[i] = Math.max(-32768, Math.min(32767, audioFrame[i] * 32768)); + } + _transport.sendAudio(int16.buffer); + } }, }); capture.start(); diff --git a/dashboard/playback.js b/dashboard/playback.js index 82be279..9849b97 100644 --- a/dashboard/playback.js +++ b/dashboard/playback.js @@ -1,6 +1,8 @@ /** * Streaming audio playback engine. * Receives Int16 PCM chunks and plays them seamlessly via Web Audio API. + * Tracks playback progress (samplesPlayed/totalSamples) for word-level + * solidification animation. */ class AudioPlayback { constructor(sampleRate = 24000) { @@ -12,7 +14,32 @@ class AudioPlayback { this.nextStartTime = 0; this.onPlaybackStart = null; this.onPlaybackEnd = null; + this.onProgress = null; // (samplesPlayed, totalSamples) => void this.sinkId = undefined; // output device ID + + // Progress tracking + this.totalSamples = 0; // Total samples across all queued buffers + this.samplesPlayed = 0; // Samples played so far + this._chunkStartSample = 0; // Sample offset of current chunk + this._currentChunkSamples = 0; + this._playbackStartTime = 0; // audioContext.currentTime when chunk started + this._progressTimer = null; + } + + /** Current playback progress as 0.0-1.0 */ + get progress() { + if (this.totalSamples === 0) return 0; + return Math.min(1.0, this.samplesPlayed / this.totalSamples); + } + + /** Estimated current playback time in seconds */ + get currentTime() { + return this.samplesPlayed / this.sampleRate; + } + + /** Total duration in seconds of all queued audio */ + get totalDuration() { + return this.totalSamples / this.sampleRate; } _ensureContext() { @@ -48,6 +75,7 @@ class AudioPlayback { const buffer = this.audioContext.createBuffer(1, float32.length, this.sampleRate); buffer.getChannelData(0).set(float32); this.queue.push(buffer); + this.totalSamples += float32.length; if (!this.isPlaying) this._playNext(); } @@ -58,6 +86,7 @@ class AudioPlayback { try { const audioBuffer = await this.audioContext.decodeAudioData(wavArrayBuffer.slice(0)); this.queue.push(audioBuffer); + this.totalSamples += audioBuffer.length; if (!this.isPlaying) this._playNext(); } catch (err) { console.error("[AudioPlayback] Failed to decode WAV:", err); @@ -67,6 +96,9 @@ class AudioPlayback { _playNext() { if (this.queue.length === 0) { this.isPlaying = false; + this._stopProgressTimer(); + this.samplesPlayed = this.totalSamples; // Mark fully played + if (this.onProgress) this.onProgress(this.samplesPlayed, this.totalSamples); if (this.onPlaybackEnd) this.onPlaybackEnd(); return; } @@ -81,22 +113,65 @@ class AudioPlayback { const source = this.audioContext.createBufferSource(); source.buffer = buffer; source.connect(this.audioContext.destination); - source.onended = () => this._playNext(); + + // Track progress for this chunk + this._chunkStartSample = this.samplesPlayed; + this._currentChunkSamples = 
buffer.length; + + source.onended = () => { + // Mark chunk as fully played + this.samplesPlayed = this._chunkStartSample + this._currentChunkSamples; + if (this.onProgress) this.onProgress(this.samplesPlayed, this.totalSamples); + this._playNext(); + }; // Schedule this chunk right after the previous one for gapless playback const startTime = Math.max(this.nextStartTime, this.audioContext.currentTime); source.start(startTime); + this._playbackStartTime = startTime; this.nextStartTime = startTime + buffer.duration; this.currentSource = source; + + // Start progress timer for smooth updates during playback + this._startProgressTimer(); + } + + _startProgressTimer() { + this._stopProgressTimer(); + this._progressTimer = setInterval(() => { + if (!this.isPlaying || !this.audioContext) return; + const elapsed = this.audioContext.currentTime - this._playbackStartTime; + const chunkProgress = Math.min(elapsed * this.sampleRate, this._currentChunkSamples); + this.samplesPlayed = this._chunkStartSample + Math.floor(chunkProgress); + if (this.onProgress) this.onProgress(this.samplesPlayed, this.totalSamples); + }, 50); // 20 fps progress updates + } + + _stopProgressTimer() { + if (this._progressTimer) { + clearInterval(this._progressTimer); + this._progressTimer = null; + } } flush() { this.queue = []; + this._stopProgressTimer(); if (this.currentSource) { try { this.currentSource.stop(); } catch {} } this.isPlaying = false; this.nextStartTime = 0; + // Keep samplesPlayed/totalSamples for interrupt context + // (tells us how much was delivered before flush) + } + + /** Reset all progress counters (call when starting a new response) */ + resetProgress() { + this.totalSamples = 0; + this.samplesPlayed = 0; + this._chunkStartSample = 0; + this._currentChunkSamples = 0; } setSampleRate(rate) { diff --git a/dashboard/transport.js b/dashboard/transport.js index d0d02ad..ee8dc5c 100644 --- a/dashboard/transport.js +++ b/dashboard/transport.js @@ -63,9 +63,12 @@ class VoiceTransport { const handlerMap = { transcript: "onTranscript", + partial_transcript: "onPartialTranscript", response_text: "onResponseText", response_complete: "onResponseComplete", interrupted: "onInterrupted", + tts_progress: "onTtsProgress", + draft_queue: "onDraftQueue", metrics: "onMetrics", error: "onError", }; diff --git a/draft_queue.py b/draft_queue.py new file mode 100644 index 0000000..dccbc15 --- /dev/null +++ b/draft_queue.py @@ -0,0 +1,267 @@ +"""Draft Queue — speculative response blocks with status tracking. + +Holds draft response blocks generated speculatively while the human is +still speaking. Each block has a status lifecycle: + + valid → spoken (played aloud) + valid → stale (invalidated by new context) + valid → snipped (removed from queue by self-barge) + +Thread-safe. Used by the agent loop for speculative inference and +self-barge operations (snip, inject, revise). 
+""" + +from __future__ import annotations + +import threading +import time +import uuid +from dataclasses import dataclass, field +from enum import Enum +from typing import Any + + +class BlockStatus(Enum): + """Lifecycle states for a draft block.""" + VALID = "valid" # Generated, awaiting playback + STALE = "stale" # Invalidated by new context + SPOKEN = "spoken" # Successfully played aloud + SNIPPED = "snipped" # Removed by self-barge + SPEAKING = "speaking" # Currently being spoken + + +@dataclass +class DraftBlock: + """A single draft response block with metadata.""" + + id: str + text: str + status: BlockStatus = BlockStatus.VALID + created_at: float = field(default_factory=time.time) + context_hash: str = "" # Hash of context at generation time + generation_ms: float = 0.0 # How long inference took + tts_audio: bytes | None = None # Pre-synthesized audio (if available) + tts_duration_sec: float = 0.0 + metadata: dict[str, Any] = field(default_factory=dict) + + @property + def is_playable(self) -> bool: + """Whether this block can be played.""" + return self.status == BlockStatus.VALID + + @property + def is_active(self) -> bool: + """Whether this block is still relevant (not stale/snipped).""" + return self.status in (BlockStatus.VALID, BlockStatus.SPEAKING) + + def to_dict(self) -> dict[str, Any]: + return { + "id": self.id, + "text": self.text, + "status": self.status.value, + "created_at": self.created_at, + "has_audio": self.tts_audio is not None, + "tts_duration_sec": self.tts_duration_sec, + "generation_ms": self.generation_ms, + } + + +class DraftQueue: + """Thread-safe queue of speculative draft response blocks. + + The agent generates blocks speculatively while the human speaks. + Blocks are played in order when the human stops. Blocks can be + invalidated (stale), removed (snip), or replaced (revise) before + they're spoken. + + Operations: + add_block — append a new draft block + invalidate — mark a block as stale (context changed) + snip — remove a block from the queue + inject — insert a new block at a position + revise — replace a block's text (and optionally audio) + get_pending — get all valid blocks awaiting playback + mark_speaking — mark a block as currently being spoken + mark_spoken — mark a block as successfully spoken + clear — reset the queue + """ + + def __init__(self): + self._lock = threading.Lock() + self._blocks: list[DraftBlock] = [] + self._spoken_history: list[DraftBlock] = [] # Archive of spoken blocks + + # ------------------------------------------------------------------ + # Core operations + # ------------------------------------------------------------------ + + def add_block( + self, + text: str, + context_hash: str = "", + generation_ms: float = 0.0, + **metadata, + ) -> DraftBlock: + """Add a new draft block to the end of the queue.""" + block = DraftBlock( + id=uuid.uuid4().hex[:8], + text=text, + context_hash=context_hash, + generation_ms=generation_ms, + metadata=metadata, + ) + with self._lock: + self._blocks.append(block) + return block + + def invalidate(self, block_id: str) -> bool: + """Mark a block as stale. Returns True if found and invalidated.""" + with self._lock: + for block in self._blocks: + if block.id == block_id and block.is_active: + block.status = BlockStatus.STALE + return True + return False + + def invalidate_all(self) -> int: + """Mark all valid blocks as stale. 
Returns count invalidated.""" + count = 0 + with self._lock: + for block in self._blocks: + if block.status == BlockStatus.VALID: + block.status = BlockStatus.STALE + count += 1 + return count + + def snip(self, block_id: str) -> bool: + """Remove a block from the queue. Returns True if found.""" + with self._lock: + for i, block in enumerate(self._blocks): + if block.id == block_id: + block.status = BlockStatus.SNIPPED + self._blocks.pop(i) + return True + return False + + def inject( + self, + position: int, + text: str, + context_hash: str = "", + generation_ms: float = 0.0, + **metadata, + ) -> DraftBlock: + """Insert a new block at the given position.""" + block = DraftBlock( + id=uuid.uuid4().hex[:8], + text=text, + context_hash=context_hash, + generation_ms=generation_ms, + metadata=metadata, + ) + with self._lock: + self._blocks.insert(position, block) + return block + + def revise( + self, + block_id: str, + new_text: str, + new_audio: bytes | None = None, + new_duration: float = 0.0, + ) -> bool: + """Replace a block's content. Returns True if found and revised.""" + with self._lock: + for block in self._blocks: + if block.id == block_id and block.is_active: + block.text = new_text + if new_audio is not None: + block.tts_audio = new_audio + block.tts_duration_sec = new_duration + block.metadata["revised_at"] = time.time() + return True + return False + + # ------------------------------------------------------------------ + # Playback lifecycle + # ------------------------------------------------------------------ + + def get_pending(self) -> list[DraftBlock]: + """Get all valid blocks awaiting playback, in order.""" + with self._lock: + return [b for b in self._blocks if b.status == BlockStatus.VALID] + + def get_next(self) -> DraftBlock | None: + """Get the next valid block to play, or None.""" + with self._lock: + for block in self._blocks: + if block.status == BlockStatus.VALID: + return block + return None + + def mark_speaking(self, block_id: str) -> bool: + """Mark a block as currently being spoken.""" + with self._lock: + for block in self._blocks: + if block.id == block_id: + block.status = BlockStatus.SPEAKING + return True + return False + + def mark_spoken(self, block_id: str) -> bool: + """Mark a block as successfully spoken and archive it.""" + with self._lock: + for i, block in enumerate(self._blocks): + if block.id == block_id: + block.status = BlockStatus.SPOKEN + self._spoken_history.append(block) + self._blocks.pop(i) + return True + return False + + # ------------------------------------------------------------------ + # Query + # ------------------------------------------------------------------ + + @property + def depth(self) -> int: + """Number of blocks in the queue (all statuses).""" + with self._lock: + return len(self._blocks) + + @property + def pending_count(self) -> int: + """Number of valid (playable) blocks.""" + with self._lock: + return sum(1 for b in self._blocks if b.status == BlockStatus.VALID) + + @property + def all_blocks(self) -> list[DraftBlock]: + """Snapshot of all blocks in current queue.""" + with self._lock: + return list(self._blocks) + + @property + def spoken_text(self) -> str: + """All text that has been successfully spoken.""" + with self._lock: + return " ".join(b.text for b in self._spoken_history) + + def clear(self) -> int: + """Clear the queue. 
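+
+        Drops blocks regardless of status; the spoken-history archive is
+        left intact.
+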
Returns number of blocks removed.""" + with self._lock: + count = len(self._blocks) + self._blocks.clear() + return count + + def status(self) -> dict[str, Any]: + """Queue status snapshot.""" + with self._lock: + return { + "total": len(self._blocks), + "valid": sum(1 for b in self._blocks if b.status == BlockStatus.VALID), + "stale": sum(1 for b in self._blocks if b.status == BlockStatus.STALE), + "speaking": sum(1 for b in self._blocks if b.status == BlockStatus.SPEAKING), + "spoken_total": len(self._spoken_history), + "blocks": [b.to_dict() for b in self._blocks], + } diff --git a/http_api.py b/http_api.py index 45a7fb8..981d7fc 100644 --- a/http_api.py +++ b/http_api.py @@ -29,7 +29,7 @@ from threading import Lock from typing import Optional -from fastapi import FastAPI, Request, Response, UploadFile, WebSocket +from fastapi import FastAPI, Request, Response, UploadFile, WebSocket, WebSocketDisconnect from fastapi.responses import FileResponse, JSONResponse from fastapi.staticfiles import StaticFiles from pydantic import BaseModel, Field @@ -58,7 +58,6 @@ async def _warmup_kokoro(): def _do_warmup(): try: from engine import get_model - get_model("kokoro") logger.info("Kokoro TTS engine pre-warmed successfully") except Exception as e: @@ -572,7 +571,6 @@ def stop_speech(job_id: str = ""): """ try: from server import _speech_queue, pipeline_state - if job_id: cancelled = _speech_queue.cancel(job_id) return {"status": "ok", "message": f"Cancelled {job_id}" if cancelled else f"Job {job_id} not found"} @@ -679,16 +677,24 @@ async def _graceful_exit(): deadline = time.time() + timeout_sec while time.time() < deadline: with _jobs_lock: - active = sum(1 for j in _jobs.values() if j.get("status") in ("generating", "processing")) + active = sum( + 1 for j in _jobs.values() + if j.get("status") in ("generating", "processing") + ) if active == 0: break await asyncio.sleep(0.25) with _jobs_lock: - remaining = sum(1 for j in _jobs.values() if j.get("status") in ("generating", "processing")) + remaining = sum( + 1 for j in _jobs.values() + if j.get("status") in ("generating", "processing") + ) if remaining: - logger.warning("Shutdown timeout reached with %d active jobs — forcing exit", remaining) + logger.warning( + "Shutdown timeout reached with %d active jobs — forcing exit", remaining + ) else: logger.info("All jobs drained — exiting cleanly") diff --git a/mcp.channel.json b/mcp.channel.json index 53ce5bf..ccd2bf4 100644 --- a/mcp.channel.json +++ b/mcp.channel.json @@ -2,7 +2,7 @@ "mcpServers": { "mod3-voice": { "command": "python3", - "args": ["${MOD3_ROOT}/server.py", "--channel"] + "args": ["/Users/slowbro/workspaces/mod3/server.py", "--channel"] } } } diff --git a/mcp_shim.py b/mcp_shim.py new file mode 100644 index 0000000..c99ee73 --- /dev/null +++ b/mcp_shim.py @@ -0,0 +1,742 @@ +#!/usr/bin/env python3 +"""Mod³ MCP shim — thin stdio proxy to a running Mod³ HTTP service. + +Instead of spawning a full server.py (which loads TTS models, ~4GB VRAM), +this shim implements the MCP stdio protocol and forwards tool calls to +the Mod³ HTTP API at localhost:7860. + +Tools that are purely local (set_output_device, await_voice_input) are +handled in-process without touching the HTTP service. + +For `speak`, the shim posts to /v1/synthesize for audio generation, then +plays the returned WAV bytes locally via sounddevice. 
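+
+The wire protocol is newline-delimited JSON-RPC 2.0 over stdio. A typical
+exchange looks like this (values illustrative):
+
+    → {"jsonrpc": "2.0", "id": 1, "method": "initialize", ...}
+    ← {"jsonrpc": "2.0", "id": 1, "result": {"protocolVersion": "2024-11-05", ...}}
+    → {"jsonrpc": "2.0", "id": 2, "method": "tools/call",
+       "params": {"name": "speak", "arguments": {"text": "hello"}}}
+    ← {"jsonrpc": "2.0", "id": 2, "result": {"content": [{"type": "text",
+       "text": "{\"status\": \"speaking\", \"job_id\": \"shim-...\"}"}]}}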
+ +Usage: + python mcp_shim.py # normal MCP stdio mode + python mcp_shim.py --test # connectivity check, then exit +""" + +import io +import json +import logging +import os +import struct +import sys +import threading +import time +import urllib.error +import urllib.request +import wave +from collections import OrderedDict +from typing import Any + +logger = logging.getLogger("mod3.shim") + +MOD3_BASE = os.environ.get("MOD3_URL", "http://localhost:7860") + +# --------------------------------------------------------------------------- +# Lightweight audio playback (only needs sounddevice, not full TTS stack) +# --------------------------------------------------------------------------- + +_output_device: Any = None +_current_player_lock = threading.Lock() +_current_sd_stream = None +_playback_interrupt = threading.Event() + +# Job tracking (lightweight — just for speak/stop/status) +_jobs: OrderedDict = OrderedDict() +_jobs_lock = threading.Lock() +_MAX_JOBS = 50 + +# Barge-in signal file (same as server.py) +_BARGEIN_SIGNAL = os.path.expanduser("~/.mod3_bargein_signal.json") + + +def _http_request(method: str, path: str, body: dict | None = None, + timeout: float = 30.0) -> tuple[int, dict | bytes]: + """Make an HTTP request to the Mod3 service. Returns (status_code, parsed_json_or_bytes).""" + url = f"{MOD3_BASE}{path}" + headers = {"Content-Type": "application/json"} if body is not None else {} + data = json.dumps(body).encode() if body is not None else None + + req = urllib.request.Request(url, data=data, headers=headers, method=method) + try: + with urllib.request.urlopen(req, timeout=timeout) as resp: + content_type = resp.headers.get("Content-Type", "") + raw = resp.read() + if "application/json" in content_type: + return resp.status, json.loads(raw) + elif "audio/" in content_type: + return resp.status, raw + else: + try: + return resp.status, json.loads(raw) + except (json.JSONDecodeError, ValueError): + return resp.status, raw + except urllib.error.HTTPError as e: + try: + body_bytes = e.read() + return e.code, json.loads(body_bytes) + except Exception: + return e.code, {"error": str(e)} + except urllib.error.URLError as e: + return 0, {"error": f"Mod3 service unreachable: {e.reason}"} + except Exception as e: + return 0, {"error": f"Request failed: {e}"} + + +def _play_wav_bytes(wav_bytes: bytes, job_id: str): + """Play WAV audio bytes through speakers via sounddevice.""" + global _current_sd_stream + try: + import numpy as np + import sounddevice as sd + except ImportError: + logger.error("sounddevice/numpy not available — cannot play audio") + with _jobs_lock: + if job_id in _jobs: + _jobs[job_id]["status"] = "error" + _jobs[job_id]["error"] = "sounddevice not installed" + return + + try: + buf = io.BytesIO(wav_bytes) + with wave.open(buf, "rb") as wf: + sr = wf.getframerate() + ch = wf.getnchannels() + sw = wf.getsampwidth() + frames = wf.readframes(wf.getnframes()) + + if sw == 2: + audio = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32767.0 + elif sw == 4: + audio = np.frombuffer(frames, dtype=np.int32).astype(np.float32) / 2147483647.0 + else: + audio = np.frombuffer(frames, dtype=np.float32) + + if ch > 1: + audio = audio.reshape(-1, ch)[:, 0] # mono mixdown + + duration = len(audio) / sr + with _jobs_lock: + if job_id in _jobs: + _jobs[job_id]["status"] = "speaking" + _jobs[job_id]["start_time"] = time.time() + _jobs[job_id]["duration_sec"] = round(duration, 2) + + _playback_interrupt.clear() + device = _output_device + with _current_player_lock: + 
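+            # Record which job currently owns the output stream; cleared
+            # again when playback finishes or errors out.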
_current_sd_stream = job_id + + sd.play(audio, samplerate=sr, device=device, blocking=True) + + with _current_player_lock: + _current_sd_stream = None + + if not _playback_interrupt.is_set(): + with _jobs_lock: + if job_id in _jobs: + _jobs[job_id]["status"] = "done" + _jobs[job_id]["metrics"] = { + "audio_duration_sec": round(duration, 2), + "sample_rate": sr, + } + except Exception as e: + logger.error("Playback error: %s", e) + with _current_player_lock: + _current_sd_stream = None + with _jobs_lock: + if job_id in _jobs: + _jobs[job_id]["status"] = "error" + _jobs[job_id]["error"] = str(e) + + +def _estimate_duration(text: str, speed: float) -> float: + words = len(text.split()) + base_wpm = 160 * speed + return (words / base_wpm) * 60 + + +# --------------------------------------------------------------------------- +# Tool implementations +# --------------------------------------------------------------------------- + +def tool_speak(text: str, voice: str = "bm_lewis", stream: bool = True, + speed: float = 1.25, emotion: float = 0.5) -> str: + """Synthesize via HTTP, play locally.""" + if not text.strip(): + return json.dumps({"status": "error", "error": "Nothing to say"}) + + # Check barge-in + try: + if os.path.exists(_BARGEIN_SIGNAL): + with open(_BARGEIN_SIGNAL) as f: + sig = json.load(f) + if sig.get("event") == "user_speaking_start": + return json.dumps({ + "status": "held", + "reason": "User is currently speaking — re-send after user finishes.", + "user_state": "recording", + "estimated_duration_sec": round(_estimate_duration(text, speed), 1), + }) + except Exception: + pass + + # Request synthesis from HTTP service + status, resp = _http_request("POST", "/v1/synthesize", { + "text": text, "voice": voice, "speed": speed, "emotion": emotion, + "format": "wav", + }, timeout=60.0) + + if status == 0: + return json.dumps({"status": "error", "error": resp.get("error", "Service unreachable")}) + if status != 200: + err = resp.get("error", f"HTTP {status}") if isinstance(resp, dict) else f"HTTP {status}" + return json.dumps({"status": "error", "error": err}) + if not isinstance(resp, bytes): + return json.dumps({"status": "error", "error": "Expected audio bytes from synthesize"}) + + # Create job and play in background + job_id = f"shim-{int(time.time()*1000)}" + with _jobs_lock: + _jobs[job_id] = { + "status": "generating", + "text": text[:100], + "voice": voice, + "created": time.time(), + } + while len(_jobs) > _MAX_JOBS: + _jobs.popitem(last=False) + + t = threading.Thread(target=_play_wav_bytes, args=(resp, job_id), daemon=True) + t.start() + + return json.dumps({"status": "speaking", "job_id": job_id}) + + +def tool_stop(job_id: str = "") -> str: + """Stop playback.""" + try: + import sounddevice as sd + except ImportError: + pass + + if job_id: + with _jobs_lock: + job = _jobs.get(job_id) + if not job: + return json.dumps({"status": "error", "error": f"Unknown job '{job_id}'"}) + if job["status"] == "speaking": + _playback_interrupt.set() + try: + import sounddevice as sd + sd.stop() + except Exception: + pass + with _jobs_lock: + _jobs[job_id]["status"] = "interrupted" + return json.dumps({"status": "ok", "message": f"Interrupted '{job_id}'"}) + return json.dumps({"status": "ok", "message": f"Job '{job_id}' status: {job['status']}"}) + + # Stop all + _playback_interrupt.set() + try: + import sounddevice as sd + sd.stop() + except Exception: + pass + with _jobs_lock: + for j in _jobs.values(): + if j["status"] in ("speaking", "generating"): + j["status"] = "interrupted" + 
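+    # Queued jobs flip to "interrupted" as well, so a later speech_status()
+    # call sees the flush rather than a stuck queue.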
return json.dumps({"status": "ok", "message": "Stopped all playback"}) + + +def tool_speech_status(job_id: str = "", verbose: bool = False) -> str: + """Check job status.""" + with _jobs_lock: + if not job_id: + if not _jobs: + return json.dumps({"status": "idle", "message": "No speech jobs", "queue_depth": 0}) + job_id = next(reversed(_jobs)) + job = _jobs.get(job_id) + + if not job: + return json.dumps({"status": "error", "error": f"Unknown job '{job_id}'"}) + + result = {"job_id": job_id, "status": job["status"]} + if job["status"] == "speaking" and "start_time" in job: + result["elapsed_sec"] = round(time.time() - job["start_time"], 1) + if job.get("metrics"): + result["metrics"] = job["metrics"] + if job.get("error"): + result["error"] = job["error"] + + # Queue state + with _jobs_lock: + speaking = sum(1 for j in _jobs.values() if j["status"] == "speaking") + result["queue"] = {"depth": speaking, "currently_playing": None} + + return json.dumps(result) + + +def tool_list_voices() -> str: + """List voices via HTTP.""" + status, resp = _http_request("GET", "/v1/voices") + if status != 200: + return json.dumps({"status": "error", "error": "Could not reach Mod3 service"}) + + engines = resp.get("engines", {}) + lines = [] + for engine, cfg in engines.items(): + supports = cfg.get("supports", []) + tag = f" ({', '.join(supports)})" if supports else "" + voices = cfg.get("voices", []) + lines.append(f" {engine}{tag}: {', '.join(voices)}") + return "Available voices:\n" + "\n".join(lines) + + +def tool_diagnostics() -> str: + """Diagnostics via HTTP.""" + status, resp = _http_request("GET", "/diagnostics") + if status != 200: + return json.dumps({"status": "error", "error": "Could not reach Mod3 service"}) + return json.dumps(resp, indent=2) + + +def tool_set_output_device(device: str = "") -> str: + """List or set audio output device (local only).""" + global _output_device + try: + import sounddevice as sd + except ImportError: + return json.dumps({"status": "error", "error": "sounddevice not installed"}) + + outputs = [] + for i, d in enumerate(sd.query_devices()): + if d["max_output_channels"] > 0: + is_default = i == sd.default.device[1] + is_active = ( + (_output_device is None and is_default) + or _output_device == i + or (isinstance(_output_device, str) and _output_device in d["name"]) + ) + outputs.append({"index": i, "name": d["name"], "active": is_active, "default": is_default}) + + if not device: + return json.dumps({"devices": outputs}) + + if device == "default": + _output_device = None + return json.dumps({"status": "ok", "message": "Tracking system default"}) + + # Try numeric index + try: + idx = int(device) + for d in outputs: + if d["index"] == idx: + _output_device = idx + return json.dumps({"status": "ok", "device": d["name"], "index": idx}) + return json.dumps({"status": "error", "error": f"No output device at index {idx}"}) + except ValueError: + pass + + # Try name substring + for d in outputs: + if device.lower() in d["name"].lower(): + _output_device = d["index"] + return json.dumps({"status": "ok", "device": d["name"], "index": d["index"]}) + + return json.dumps({"status": "error", "error": f"No device matching '{device}'"}) + + +def tool_await_voice_input(timeout_sec: float = 180.0) -> str: + """Block until SuperWhisper recording finishes (local only).""" + _rec_dir = os.path.expanduser("~/Documents/superwhisper/recordings") + + start = time.time() + while time.time() - start < timeout_sec: + try: + if os.path.exists(_BARGEIN_SIGNAL): + with open(_BARGEIN_SIGNAL) 
as f: + signal = json.load(f) + if signal.get("event") == "user_speaking_end": + break + except (OSError, json.JSONDecodeError): + pass + time.sleep(0.2) + else: + return json.dumps({"status": "timeout", "error": f"No recording completed within {timeout_sec}s"}) + + # Find latest transcript + try: + folders = sorted( + [d for d in os.listdir(_rec_dir) if d.isdigit()], + key=int, reverse=True, + ) + if folders: + meta_path = os.path.join(_rec_dir, folders[0], "meta.json") + if os.path.exists(meta_path): + with open(meta_path) as f: + meta = json.load(f) + raw = meta.get("rawResult", "").strip() + result = meta.get("result", raw).strip() + duration_ms = meta.get("duration", 0) + return json.dumps({ + "status": "ok", + "transcript": result if result else raw, + "raw_transcript": raw, + "duration_sec": round(duration_ms / 1000, 1), + "folder": folders[0], + "source": "superwhisper", + }) + except Exception as e: + logger.warning("await_voice_input error: %s", e) + + return json.dumps({"status": "error", "error": "Could not retrieve transcript"}) + + +def tool_vad_check(file_path: str, threshold: float = 0.5) -> str: + """VAD check via HTTP.""" + if not os.path.exists(file_path): + return json.dumps({"status": "error", "error": f"File not found: {file_path}"}) + + # Read WAV and send to HTTP endpoint + try: + with open(file_path, "rb") as f: + wav_data = f.read() + except Exception as e: + return json.dumps({"status": "error", "error": str(e)}) + + # The HTTP API expects multipart file upload, use urllib + import mimetypes + boundary = "----Mod3ShimBoundary" + body = ( + f"--{boundary}\r\n" + f'Content-Disposition: form-data; name="file"; filename="{os.path.basename(file_path)}"\r\n' + f"Content-Type: audio/wav\r\n\r\n" + ).encode() + wav_data + f"\r\n--{boundary}--\r\n".encode() + + url = f"{MOD3_BASE}/v1/vad" + if threshold != 0.5: + url += f"?threshold={threshold}" + req = urllib.request.Request( + url, data=body, method="POST", + headers={"Content-Type": f"multipart/form-data; boundary={boundary}"}, + ) + try: + with urllib.request.urlopen(req, timeout=30) as resp: + return json.dumps(json.loads(resp.read())) + except Exception as e: + return json.dumps({"status": "error", "error": f"VAD request failed: {e}"}) + + +# --------------------------------------------------------------------------- +# Tool registry (matches server.py exactly) +# --------------------------------------------------------------------------- + +TOOLS = [ + { + "name": "speak", + "description": ( + "Synthesize text to speech and play it through the user's speakers.\n\n" + "Non-blocking: returns immediately with a job ID while audio plays or is\n" + "queued. If nothing is playing, starts immediately. If audio is already\n" + "playing, the new request is queued and will play automatically when the\n" + "current item finishes.\n\n" + "The response always includes the current queue state so the agent knows\n" + "exactly what's happening on the output channel without a separate status call.\n\n" + "Args:\n" + " text: The text to speak aloud. Keep it conversational.\n" + " voice: Voice preset. Use list_voices() to see options.\n" + ' Defaults to "bm_lewis" (Kokoro).\n' + " stream: If True, plays audio chunks as they generate (lower latency).\n" + " If False, generates all audio first then plays (better prosody).\n" + " speed: Speed multiplier (engines with speed support). Default 1.25.\n" + " emotion: Emotion/exaggeration intensity 0.0-1.0 (Chatterbox only). Default 0.5." 
+ ), + "inputSchema": { + "type": "object", + "properties": { + "text": {"type": "string", "description": "The text to speak aloud. Keep it conversational."}, + "voice": {"type": "string", "default": "bm_lewis", "description": "Voice preset. Use list_voices() to see options. Defaults to \"bm_lewis\" (Kokoro)."}, + "stream": {"type": "boolean", "default": True, "description": "If True, plays audio chunks as they generate (lower latency)."}, + "speed": {"type": "number", "default": 1.25, "description": "Speed multiplier (engines with speed support). Default 1.25."}, + "emotion": {"type": "number", "default": 0.5, "description": "Emotion/exaggeration intensity 0.0-1.0 (Chatterbox only). Default 0.5."}, + }, + "required": ["text"], + }, + }, + { + "name": "speech_status", + "description": ( + "Check status of a speech job, or get the most recent result.\n\n" + "Always includes queue state so the agent has full output channel awareness.\n\n" + "Args:\n" + " job_id: The job ID returned by speak(). If empty, returns the latest job.\n" + " verbose: If True, include per-chunk metrics. Default False (summary only)." + ), + "inputSchema": { + "type": "object", + "properties": { + "job_id": {"type": "string", "default": "", "description": "The job ID returned by speak(). If empty, returns the latest job."}, + "verbose": {"type": "boolean", "default": False, "description": "If True, include per-chunk metrics. Default False (summary only)."}, + }, + }, + }, + { + "name": "stop", + "description": ( + "Stop current speech or cancel a specific queued item.\n\n" + "Args:\n" + " job_id: If provided, cancels that specific queued job (not yet playing).\n" + " If the job_id is the currently playing job, interrupts playback.\n" + " If empty, interrupts current playback AND clears the entire queue." + ), + "inputSchema": { + "type": "object", + "properties": { + "job_id": {"type": "string", "default": "", "description": "If provided, cancels that specific job. If empty, stops everything."}, + }, + }, + }, + { + "name": "list_voices", + "description": "List all available voice presets grouped by engine.", + "inputSchema": {"type": "object", "properties": {}}, + }, + { + "name": "await_voice_input", + "description": ( + "Block until the user finishes a SuperWhisper recording, then return the transcript.\n\n" + "This closes the voice input loop: instead of waiting for the user to paste\n" + "their transcribed text, you can directly receive what they said. Use this\n" + "when speak() returns \"held\" (user is recording) or when you want to listen\n" + "for the next voice input.\n\n" + "Polls the barge-in signal file for user_speaking_end, then reads the\n" + "transcript from SuperWhisper's recordings directory.\n\n" + "Args:\n" + " timeout_sec: Maximum seconds to wait for recording to finish. Default 180 (3 minutes)." + ), + "inputSchema": { + "type": "object", + "properties": { + "timeout_sec": {"type": "number", "default": 180, "description": "Maximum seconds to wait for recording to finish. Default 180 (3 minutes)."}, + }, + }, + }, + { + "name": "diagnostics", + "description": "Return engine state and last generation metrics for debugging.", + "inputSchema": {"type": "object", "properties": {}}, + }, + { + "name": "set_output_device", + "description": ( + "List audio output devices, or set the active one.\n\n" + "Args:\n" + " device: Device index (e.g. \"3\"), name substring (e.g. 
\"AirPods\"),\n" + " or \"default\" to track the system default automatically.\n" + " If empty, lists available devices without changing anything." + ), + "inputSchema": { + "type": "object", + "properties": { + "device": {"type": "string", "default": "", "description": "Device index, name substring, or 'default'. If empty, lists devices."}, + }, + }, + }, + { + "name": "vad_check", + "description": ( + "Check if an audio file contains speech using Silero VAD.\n\n" + "Use this before transcription to avoid Whisper hallucinations on\n" + "silence or ambient noise.\n\n" + "Args:\n" + " file_path: Path to a WAV audio file.\n" + " threshold: Speech probability threshold 0-1 (default 0.5). Higher = stricter." + ), + "inputSchema": { + "type": "object", + "properties": { + "file_path": {"type": "string", "description": "Path to a WAV audio file."}, + "threshold": {"type": "number", "default": 0.5, "description": "Speech probability threshold 0-1 (default 0.5). Higher = stricter."}, + }, + "required": ["file_path"], + }, + }, +] + +TOOL_DISPATCH = { + "speak": lambda args: tool_speak( + args["text"], + voice=args.get("voice", "bm_lewis"), + stream=args.get("stream", True), + speed=args.get("speed", 1.25), + emotion=args.get("emotion", 0.5), + ), + "speech_status": lambda args: tool_speech_status( + job_id=args.get("job_id", ""), + verbose=args.get("verbose", False), + ), + "stop": lambda args: tool_stop(job_id=args.get("job_id", "")), + "list_voices": lambda args: tool_list_voices(), + "await_voice_input": lambda args: tool_await_voice_input( + timeout_sec=args.get("timeout_sec", 180.0), + ), + "diagnostics": lambda args: tool_diagnostics(), + "set_output_device": lambda args: tool_set_output_device( + device=args.get("device", ""), + ), + "vad_check": lambda args: tool_vad_check( + file_path=args["file_path"], + threshold=args.get("threshold", 0.5), + ), +} + + +# --------------------------------------------------------------------------- +# MCP stdio protocol +# --------------------------------------------------------------------------- + +SERVER_INFO = { + "name": "mod3", + "version": "0.3.0-shim", +} + +CAPABILITIES = { + "tools": {}, +} + + +def _read_message() -> dict | None: + """Read a JSON-RPC message from stdin (newline-delimited).""" + try: + line = sys.stdin.readline() + if not line: + return None + return json.loads(line.strip()) + except (json.JSONDecodeError, ValueError): + return None + + +def _write_message(msg: dict): + """Write a JSON-RPC message to stdout.""" + sys.stdout.write(json.dumps(msg) + "\n") + sys.stdout.flush() + + +def _jsonrpc_response(id: Any, result: Any) -> dict: + return {"jsonrpc": "2.0", "id": id, "result": result} + + +def _jsonrpc_error(id: Any, code: int, message: str) -> dict: + return {"jsonrpc": "2.0", "id": id, "error": {"code": code, "message": message}} + + +def handle_initialize(msg: dict) -> dict: + return _jsonrpc_response(msg["id"], { + "protocolVersion": "2024-11-05", + "serverInfo": SERVER_INFO, + "capabilities": CAPABILITIES, + }) + + +def handle_tools_list(msg: dict) -> dict: + return _jsonrpc_response(msg["id"], {"tools": TOOLS}) + + +def handle_tools_call(msg: dict) -> dict: + params = msg.get("params", {}) + tool_name = params.get("name", "") + arguments = params.get("arguments", {}) + + handler = TOOL_DISPATCH.get(tool_name) + if not handler: + return _jsonrpc_error(msg["id"], -32602, f"Unknown tool: {tool_name}") + + try: + result_text = handler(arguments) + except Exception as e: + result_text = json.dumps({"status": "error", "error": 
str(e)}) + + return _jsonrpc_response(msg["id"], { + "content": [{"type": "text", "text": result_text}], + }) + + +def handle_notifications_initialized(msg: dict): + """Client sends this after initialize — no response needed.""" + pass + + +METHOD_HANDLERS = { + "initialize": handle_initialize, + "tools/list": handle_tools_list, + "tools/call": handle_tools_call, + "notifications/initialized": handle_notifications_initialized, + "ping": lambda msg: _jsonrpc_response(msg["id"], {}), +} + + +def run_stdio(): + """Main MCP stdio loop.""" + logging.basicConfig(level=logging.WARNING, stream=sys.stderr) + + while True: + msg = _read_message() + if msg is None: + break # EOF + + method = msg.get("method", "") + handler = METHOD_HANDLERS.get(method) + + if handler is None: + # Unknown method — if it has an id, return error; if notification, ignore + if "id" in msg: + _write_message(_jsonrpc_error(msg["id"], -32601, f"Method not found: {method}")) + continue + + result = handler(msg) + if result is not None: + _write_message(result) + + +# --------------------------------------------------------------------------- +# Self-test +# --------------------------------------------------------------------------- + +def self_test(): + """Quick connectivity check.""" + print(f"Mod3 shim — testing connection to {MOD3_BASE}") + + status, resp = _http_request("GET", "/health") + if status == 200: + engines = resp.get("engines", {}) + loaded = [k for k, v in engines.items() if v == "loaded"] + print(f" OK: Mod3 service healthy — {len(loaded)} engine(s) loaded: {', '.join(loaded) or 'none'}") + elif status == 0: + print(f" WARN: Mod3 service not reachable at {MOD3_BASE}") + print(" Tools will return errors until the service starts.") + else: + print(f" WARN: Unexpected status {status} from /health") + + # Check sounddevice + try: + import sounddevice as sd + default_out = sd.query_devices(sd.default.device[1]) + print(f" OK: sounddevice available — default output: {default_out['name']}") + except ImportError: + print(" WARN: sounddevice not installed — speak/stop will fail") + except Exception as e: + print(f" WARN: sounddevice error: {e}") + + print(" Shim ready.") + + +if __name__ == "__main__": + if "--test" in sys.argv: + self_test() + else: + run_stdio() diff --git a/modules/voice.py b/modules/voice.py index 6cacce3..aa03263 100644 --- a/modules/voice.py +++ b/modules/voice.py @@ -82,22 +82,28 @@ class WhisperDecoder(Decoder): Accepts PCM float32 bytes at 16kHz or a numpy float32 array directly. Lazy-loads the model on first call; subsequent calls reuse it. Applies BoH hallucination filter to transcripts. + + Supports two models: + - Large (whisper-large-v3-turbo): high-quality, used for T2/T3 tiers (~470ms) + - Base (whisper-base-mlx): fast, used for T1 tier (~31ms) """ - DEFAULT_MODEL = "mlx-community/whisper-turbo" + DEFAULT_MODEL = "mlx-community/whisper-large-v3-turbo" + BASE_MODEL = "mlx-community/whisper-base-mlx" - def __init__(self, model: str | None = None): + def __init__(self, model: str | None = None, load_base: bool = True): self._model = model or self.DEFAULT_MODEL self._loaded = False + self._base_loaded = False + self._load_base = load_base + # Streaming state: last transcript for diff-based partial detection + self._last_streaming_text: str = "" def _ensure_model(self) -> None: """Trigger model download/load on first use.""" if not self._loaded: import mlx_whisper - # A dry-run transcribe forces the model to download & cache. 
- # mlx_whisper handles caching internally — subsequent calls - # with the same path_or_hf_repo are fast. logger.info("WhisperDecoder: loading model %s (first call)", self._model) mlx_whisper.transcribe( np.zeros(16000, dtype=np.float32), # 1 s of silence @@ -106,6 +112,182 @@ def _ensure_model(self) -> None: self._loaded = True logger.info("WhisperDecoder: model ready") + def _ensure_base_model(self) -> None: + """Load Whisper Base model for T1 fast transcription.""" + if not self._base_loaded: + import mlx_whisper + + logger.info("WhisperDecoder: loading base model %s", self.BASE_MODEL) + mlx_whisper.transcribe( + np.zeros(16000, dtype=np.float32), + path_or_hf_repo=self.BASE_MODEL, + ) + self._base_loaded = True + logger.info("WhisperDecoder: base model ready") + + def decode_streaming( + self, + audio: np.ndarray, + tier: str = "t1", + **kwargs, + ) -> dict: + """Chunked re-transcription with LocalAgreement-2 diff. + + Re-runs mlx_whisper.transcribe() on the growing audio buffer, + diffs consecutive outputs to produce confirmed vs tentative text. + + Args: + audio: Growing float32 audio buffer at 16kHz. + tier: "t1" (Base, fast), "t2" (Large, on pause), "t3" (Large, final). + + Returns: + dict with keys: + - confirmed: str — text stable across 2+ consecutive runs + - tentative: str — new text not yet confirmed + - full_text: str — complete transcript from this run + - tier: str — which tier was used + - changed: bool — whether output differs from last run + """ + import mlx_whisper + + from vad import is_hallucination + + # Select model based on tier + if tier == "t1": + self._ensure_base_model() + model_path = self.BASE_MODEL + else: + self._ensure_model() + model_path = self._model + + t0 = time.time() + result = mlx_whisper.transcribe( + audio, + path_or_hf_repo=model_path, + language="en", + ) + elapsed_ms = (time.time() - t0) * 1000 + + transcript: str = result.get("text", "").strip() + + if is_hallucination(transcript): + return { + "confirmed": "", + "tentative": "", + "full_text": "", + "tier": tier, + "changed": False, + "elapsed_ms": round(elapsed_ms, 1), + "filtered": True, + } + + # LocalAgreement-2 diff: find longest common prefix with last run + prev = self._last_streaming_text + changed = transcript != prev + + # Confirmed = common prefix (stable across consecutive runs) + confirmed = "" + min_len = min(len(prev), len(transcript)) + for i in range(min_len): + if prev[i] == transcript[i]: + confirmed = transcript[: i + 1] + else: + break + + # Snap to word boundary + if confirmed and not confirmed.endswith(" "): + last_space = confirmed.rfind(" ") + if last_space > 0: + confirmed = confirmed[:last_space] + + # Tentative = remainder after confirmed prefix + tentative = transcript[len(confirmed):].strip() + + # T3 = end-of-utterance, everything is confirmed + if tier == "t3": + confirmed = transcript + tentative = "" + + self._last_streaming_text = transcript + + return { + "confirmed": confirmed.strip(), + "tentative": tentative, + "full_text": transcript, + "tier": tier, + "changed": changed, + "elapsed_ms": round(elapsed_ms, 1), + } + + def reset_streaming(self) -> None: + """Reset streaming state between utterances.""" + self._last_streaming_text = "" + + def validate_tts_output(self, audio_samples: np.ndarray, source_text: str, sample_rate: int = 24000) -> dict: + """Whisper validation loop: run TTS audio through Whisper Base and compare. + + After TTS generates an audio chunk, run it through Whisper Base (~31ms) + and compare transcript to source text. 
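+        A word-overlap similarity of at least 0.7 against the source counts
+        as a match.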
Flag mismatches. + + Args: + audio_samples: Float32 audio samples from TTS. + source_text: The original text that was synthesized. + sample_rate: Sample rate of the TTS audio. + + Returns: + dict with keys: + - match: bool — whether transcript matches source + - transcript: str — what Whisper heard + - source: str — original text + - similarity: float — 0.0-1.0 word overlap ratio + - elapsed_ms: float + """ + import mlx_whisper + + self._ensure_base_model() + + # Resample to 16kHz if needed (Whisper expects 16kHz) + if sample_rate != 16000: + # Simple linear resampling + ratio = 16000 / sample_rate + new_len = int(len(audio_samples) * ratio) + indices = np.linspace(0, len(audio_samples) - 1, new_len) + audio_16k = np.interp(indices, np.arange(len(audio_samples)), audio_samples).astype(np.float32) + else: + audio_16k = audio_samples + + t0 = time.time() + result = mlx_whisper.transcribe( + audio_16k, + path_or_hf_repo=self.BASE_MODEL, + language="en", + ) + elapsed_ms = (time.time() - t0) * 1000 + + transcript = result.get("text", "").strip().lower() + source_clean = source_text.strip().lower() + + # Word-level similarity + source_words = set(source_clean.split()) + transcript_words = set(transcript.split()) + + if source_words: + overlap = len(source_words & transcript_words) + similarity = overlap / len(source_words) + else: + similarity = 1.0 if not transcript_words else 0.0 + + # Match if similarity >= 0.7 (TTS output may have minor variations) + match = similarity >= 0.7 + + return { + "match": match, + "transcript": transcript, + "source": source_text, + "similarity": round(similarity, 3), + "elapsed_ms": round(elapsed_ms, 1), + } + def decode(self, raw: bytes, **kwargs) -> CognitiveEvent: import mlx_whisper diff --git a/providers.py b/providers.py index cc551c4..9068733 100644 --- a/providers.py +++ b/providers.py @@ -121,7 +121,8 @@ def _format_tools_for_prompt(tools: list[dict]) -> str: for pname, pinfo in props.items(): req_marker = " (required)" if pname in required else "" lines.append( - f" - {pname} ({pinfo.get('type', 'string')}): {pinfo.get('description', '')}{req_marker}" + f" - {pname} ({pinfo.get('type', 'string')}): " + f"{pinfo.get('description', '')}{req_marker}" ) lines.append( "\nTo call a tool, output exactly:\n" @@ -134,7 +135,9 @@ def _format_tools_for_prompt(tools: list[dict]) -> str: return "\n".join(lines) -_TOOL_CALL_RE = re.compile(r"\s*(\{.*?\})\s*", re.DOTALL) +_TOOL_CALL_RE = re.compile( + r"\s*(\{.*?\})\s*", re.DOTALL +) def _parse_tool_calls(text: str) -> list[ToolCall]: @@ -165,7 +168,9 @@ class MlxProvider: """ def __init__(self, model_id: str | None = None): - self._model_id = model_id or os.environ.get("MLX_MODEL", "mlx-community/gemma-3-4b-it-4bit") + self._model_id = model_id or os.environ.get( + "MLX_MODEL", "mlx-community/gemma-3-4b-it-4bit" + ) self._model = None self._tokenizer = None @@ -205,7 +210,9 @@ def _generate_sync( msgs = [{"role": "system", "content": "\n\n".join(system_parts)}] + msgs # Apply chat template - prompt = self._tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False) + prompt = self._tokenizer.apply_chat_template( + msgs, add_generation_prompt=True, tokenize=False + ) max_tokens = int(os.environ.get("MLX_MAX_TOKENS", "512")) raw_output = generate( @@ -233,7 +240,9 @@ async def chat( tools: list[dict] | None = None, system: str = "", ) -> ProviderResponse: - return await asyncio.to_thread(self._generate_sync, messages, tools, system) + return await asyncio.to_thread( + self._generate_sync, messages, 
tools, system + ) # --------------------------------------------------------------------------- @@ -249,7 +258,9 @@ def __init__( endpoint: str | None = None, model: str | None = None, ): - self._endpoint = endpoint or os.environ.get("OLLAMA_ENDPOINT", "http://localhost:11434") + self._endpoint = endpoint or os.environ.get( + "OLLAMA_ENDPOINT", "http://localhost:11434" + ) self._model = model or os.environ.get("OLLAMA_MODEL", "gemma4:e4b") @property @@ -311,7 +322,9 @@ class CogOSProvider: """CogOS kernel — OpenAI-compatible chat/completions with tool support.""" def __init__(self, endpoint: str | None = None): - self._endpoint = endpoint or os.environ.get("COGOS_ENDPOINT", "http://localhost:5100") + self._endpoint = endpoint or os.environ.get( + "COGOS_ENDPOINT", "http://localhost:5100" + ) @property def name(self) -> str: @@ -430,7 +443,7 @@ def auto_detect_provider() -> InferenceProvider: return MlxProvider() try: - asyncio.get_running_loop() + loop = asyncio.get_running_loop() except RuntimeError: return asyncio.run(auto_detect_provider_async()) diff --git a/server.py b/server.py index 4d18002..8c79bcb 100644 --- a/server.py +++ b/server.py @@ -365,18 +365,60 @@ async def _filter_read_stream(): # --------------------------------------------------------------------------- _BARGEIN_SIGNAL = "/tmp/mod3-barge-in.json" +_SPEAKING_LOCK = "/tmp/mod3-speaking.json" _bargein_last_mtime: float = 0.0 +def _acquire_speaking_lock(job_id: str, text: str): + """Write cross-process speaking lock so the barge-in watcher knows ANY Mod³ is speaking.""" + try: + payload = { + "speaking": True, + "job_id": job_id, + "text": text, + "pid": os.getpid(), + "timestamp": time.time(), + } + tmp = _SPEAKING_LOCK + ".tmp" + with open(tmp, "w") as f: + json.dump(payload, f) + os.replace(tmp, _SPEAKING_LOCK) + except OSError: + pass + + +def _release_speaking_lock(): + """Clear the cross-process speaking lock.""" + try: + if os.path.exists(_SPEAKING_LOCK): + os.remove(_SPEAKING_LOCK) + except OSError: + pass + + +def _is_any_process_speaking() -> dict | None: + """Check if ANY Mod³ process is currently speaking (cross-process).""" + try: + if not os.path.exists(_SPEAKING_LOCK): + return None + with open(_SPEAKING_LOCK) as f: + lock = json.load(f) + # Stale lock check: if older than 60s, ignore it (crashed process) + if time.time() - lock.get("timestamp", 0) > 60: + os.remove(_SPEAKING_LOCK) + return None + return lock + except (OSError, json.JSONDecodeError): + return None + + def _bargein_watcher(): """Background thread that watches for barge-in signal file changes.""" global _bargein_last_mtime import json as _json - while True: try: import os - if os.path.exists(_BARGEIN_SIGNAL): mtime = os.path.getmtime(_BARGEIN_SIGNAL) if mtime > _bargein_last_mtime: @@ -384,10 +426,10 @@ def _bargein_watcher(): with open(_BARGEIN_SIGNAL) as f: signal = _json.load(f) if signal.get("event") == "user_speaking_start": + # Check local pipeline state first (same process) if pipeline_state.is_speaking: info = pipeline_state.interrupt(reason="barge_in") if info: - # Write interrupt context back to signal file signal["interrupted"] = { "spoken_pct": info.spoken_pct, "delivered_text": info.delivered_text, @@ -395,9 +437,25 @@ def _bargein_watcher(): } with open(_BARGEIN_SIGNAL, "w") as f: _json.dump(signal, f, indent=2) - logging.info( - "Barge-in: paused playback (%.0f%% delivered)", info.spoken_pct * 100 if info else 0 - ) + logging.info("Barge-in: paused local playback (%.0f%% delivered)", info.spoken_pct * 100 if info else 0) + 
else: + # Check cross-process lock (another Mod³ process may be speaking) + lock = _is_any_process_speaking() + if lock: + # We can't interrupt another process's pipeline_state, + # but we CAN write the interrupt context from the lock data + signal["interrupted"] = { + "spoken_pct": 0.0, # Unknown from cross-process + "delivered_text": "", + "full_text": lock.get("text", ""), + "cross_process": True, + "source_pid": lock.get("pid"), + } + with open(_BARGEIN_SIGNAL, "w") as f: + _json.dump(signal, f, indent=2) + # Clear the speaking lock to signal the other process + _release_speaking_lock() + logging.info("Barge-in: cross-process interrupt (pid=%s)", lock.get("pid")) except Exception as e: logging.debug("Barge-in watcher error: %s", e) time.sleep(0.1) # 100ms poll @@ -591,6 +649,7 @@ def _run_speech_job(entry: dict) -> None: # Register with the reflex arc so inbound VAD can interrupt us pipeline_state.start_speaking(text, player) + _acquire_speaking_lock(job_id, text) try: for chunk in engine_module.generate_audio( text, @@ -600,6 +659,11 @@ def _run_speech_job(entry: dict) -> None: speed=speed, emotion=emotion, ): + # Check if barge-in cleared our speaking lock (cross-process interrupt) + if not os.path.exists(_SPEAKING_LOCK): + logging.info("Speaking lock cleared by barge-in watcher — stopping generation") + player.stop() + break player.queue_audio(chunk.samples, chunk_meta=chunk.metadata if chunk.metadata else None) _set_bus_voice_state( status=ModuleStatus.ENCODING, @@ -617,6 +681,7 @@ def _run_speech_job(entry: dict) -> None: # Final position update and clear speaking state pipeline_state.update_position(*player.get_progress()) pipeline_state.stop_speaking() + _release_speaking_lock() result = metrics.to_dict() result["engine"] = engine @@ -769,14 +834,12 @@ def speak( # can't be cleared by stop(). if user_state == "recording": est_duration = _estimate_duration_sec(text, speed) - return json.dumps( - { - "status": "held", - "reason": "User is currently speaking — re-send this speak() call after user finishes.", - "user_state": "recording", - "estimated_duration_sec": round(est_duration, 1), - } - ) + return json.dumps({ + "status": "held", + "reason": "User is currently speaking — re-send this speak() call after user finishes.", + "user_state": "recording", + "estimated_duration_sec": round(est_duration, 1), + }) try: job_id, position = _start_speech(text, voice, stream=stream, speed=speed, emotion=emotion) @@ -1075,6 +1138,106 @@ def list_voices() -> str: return "Available voices:\n" + "\n".join(lines) +@mcp.tool( + annotations={ + "readOnlyHint": True, + "destructiveHint": False, + "idempotentHint": False, + "openWorldHint": True, + } +) +def await_voice_input(timeout_sec: float = 180.0) -> str: + """Block until the user finishes a SuperWhisper recording, then return the transcript. + + This closes the voice input loop: instead of waiting for the user to paste + their transcribed text, you can directly receive what they said. Use this + when speak() returns "held" (user is recording) or when you want to listen + for the next voice input. + + Polls the barge-in signal file for user_speaking_end, then reads the + transcript from SuperWhisper's recordings directory. + + Args: + timeout_sec: Maximum seconds to wait for recording to finish. Default 180 (3 minutes). 
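+
+    Returns (JSON string; shapes mirror the fallbacks below):
+        {"status": "ok", "transcript": ..., "raw_transcript": ...,
+         "duration_sec": ..., "folder": ..., "source": "superwhisper"|"superwhisper_db"}
+        {"status": "timeout", "error": "No recording completed within <timeout_sec>s"}
+        {"status": "error", "error": "Could not retrieve transcript"}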
+ """ + import sqlite3 as _sqlite3 + + _sw_db = os.path.expanduser( + "~/Library/Application Support/SuperWhisper/database/superwhisper.sqlite" + ) + _rec_dir = os.path.expanduser("~/Documents/superwhisper/recordings") + + start = time.time() + # If user is currently recording, wait for them to finish + while time.time() - start < timeout_sec: + try: + if os.path.exists(_BARGEIN_SIGNAL): + with open(_BARGEIN_SIGNAL) as f: + signal = json.load(f) + if signal.get("event") == "user_speaking_end": + break + except (OSError, json.JSONDecodeError): + pass + time.sleep(0.2) + else: + return json.dumps({"status": "timeout", "error": f"No recording completed within {timeout_sec}s"}) + + # Recording finished — find the latest transcript + # Method 1: Check the most recent recording folder's meta.json + try: + folders = sorted( + [d for d in os.listdir(_rec_dir) if d.isdigit()], + key=int, + reverse=True, + ) + if folders: + meta_path = os.path.join(_rec_dir, folders[0], "meta.json") + if os.path.exists(meta_path): + with open(meta_path) as f: + meta = json.load(f) + raw = meta.get("rawResult", "").strip() + result = meta.get("result", raw).strip() + duration_ms = meta.get("duration", 0) + return json.dumps({ + "status": "ok", + "transcript": result if result else raw, + "raw_transcript": raw, + "duration_sec": round(duration_ms / 1000, 1), + "folder": folders[0], + "source": "superwhisper", + }) + except Exception as e: + logger.warning("await_voice_input meta.json fallback failed: %s", e) + + # Method 2: Query SuperWhisper SQLite DB + try: + conn = _sqlite3.connect(f"file:{_sw_db}?mode=ro", uri=True, timeout=2.0) + row = conn.execute( + "SELECT folderName, duration FROM recording ORDER BY datetime DESC LIMIT 1" + ).fetchone() + conn.close() + if row: + folder_name, duration = row + meta_path = os.path.join(_rec_dir, folder_name, "meta.json") + if os.path.exists(meta_path): + with open(meta_path) as f: + meta = json.load(f) + raw = meta.get("rawResult", "").strip() + result = meta.get("result", raw).strip() + return json.dumps({ + "status": "ok", + "transcript": result if result else raw, + "raw_transcript": raw, + "duration_sec": round(duration / 1000, 1), + "folder": folder_name, + "source": "superwhisper_db", + }) + except Exception as e: + logger.warning("await_voice_input DB fallback failed: %s", e) + + return json.dumps({"status": "error", "error": "Could not retrieve transcript"}) + + @mcp.tool( annotations={ "readOnlyHint": True, @@ -1125,7 +1288,8 @@ def set_output_device(device: str = "") -> str: """List audio output devices, or set the active one. Args: - device: Device index (e.g. "3") or name substring (e.g. "AirPods"). + device: Device index (e.g. "3"), name substring (e.g. "AirPods"), + or "default" to track the system default automatically. If empty, lists available devices without changing anything. 
""" import sounddevice as sd @@ -1141,12 +1305,16 @@ def set_output_device(device: str = "") -> str: or _output_device == i or (isinstance(_output_device, str) and _output_device in d["name"]) ) - outputs.append({"index": i, "name": d["name"], "active": is_active}) + outputs.append({"index": i, "name": d["name"], "active": is_active, "default": is_default}) if not device: - lines = [f" [{'*' if d['active'] else ' '}] {d['index']}: {d['name']}" for d in outputs] + lines = [f" [{'*' if d['active'] else ' '}] {d['index']}: {d['name']}{' (system default)' if d['default'] else ''}" for d in outputs] return "Audio output devices (* = active):\n" + "\n".join(lines) + if device.lower() == "default": + _output_device = None + return json.dumps({"status": "ok", "device": "system_default", "note": "Now tracking system default output device"}) + if device.isdigit(): _output_device = int(device) else: From df8b3799d5f36e4049dda352981fed647c9d7fbc Mon Sep 17 00:00:00 2001 From: Chaz Dinkle Date: Wed, 15 Apr 2026 15:34:01 -0400 Subject: [PATCH 2/9] fix: resolve 9 ruff lint errors (unused imports, missing asyncio, formatting) - Remove unused imports: typing.Any, VoiceEncoder, WebSocketDisconnect, struct, mimetypes - Add missing asyncio import for to_thread() in speculative TTS - Prefix unused variables with _ (full, check_messages, loop) - Auto-fixed by ruff --fix + manual corrections Co-Authored-By: Claude Opus 4.6 (1M context) --- agent_loop.py | 9 +++++---- http_api.py | 2 +- mcp_shim.py | 2 -- providers.py | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/agent_loop.py b/agent_loop.py index 2f28aad..afdca23 100644 --- a/agent_loop.py +++ b/agent_loop.py @@ -7,11 +7,12 @@ from __future__ import annotations +import asyncio import json as _json import logging import os import time -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING import httpx @@ -83,7 +84,7 @@ def _fetch_kernel_context() -> str: interrupted = signal.get("interrupted") if interrupted: delivered = interrupted.get("delivered_text", "") - full = interrupted.get("full_text", "") + _full = interrupted.get("full_text", "") pct = interrupted.get("spoken_pct", 0) parts.append( f"[barge-in] Claude's speech was interrupted at {pct*100:.0f}%. " @@ -412,7 +413,7 @@ async def _presynthesise_block(self, block) -> None: Generates audio immediately and attaches it to the block. Ready for instant playback when the human stops speaking. 
""" - from modules.voice import VoiceEncoder, _encode_wav + from modules.voice import _encode_wav try: voice = "bm_lewis" @@ -462,7 +463,7 @@ async def background_validate_drafts(self, latest_user_text: str) -> None: return # Build context with latest human input - check_messages = list(self.conversation) + [ + _check_messages = list(self.conversation) + [ {"role": "user", "content": latest_user_text}, ] diff --git a/http_api.py b/http_api.py index 981d7fc..0a6396f 100644 --- a/http_api.py +++ b/http_api.py @@ -29,7 +29,7 @@ from threading import Lock from typing import Optional -from fastapi import FastAPI, Request, Response, UploadFile, WebSocket, WebSocketDisconnect +from fastapi import FastAPI, Request, Response, UploadFile, WebSocket from fastapi.responses import FileResponse, JSONResponse from fastapi.staticfiles import StaticFiles from pydantic import BaseModel, Field diff --git a/mcp_shim.py b/mcp_shim.py index c99ee73..83831fb 100644 --- a/mcp_shim.py +++ b/mcp_shim.py @@ -20,7 +20,6 @@ import json import logging import os -import struct import sys import threading import time @@ -410,7 +409,6 @@ def tool_vad_check(file_path: str, threshold: float = 0.5) -> str: return json.dumps({"status": "error", "error": str(e)}) # The HTTP API expects multipart file upload, use urllib - import mimetypes boundary = "----Mod3ShimBoundary" body = ( f"--{boundary}\r\n" diff --git a/providers.py b/providers.py index 9068733..a8034b9 100644 --- a/providers.py +++ b/providers.py @@ -443,7 +443,7 @@ def auto_detect_provider() -> InferenceProvider: return MlxProvider() try: - loop = asyncio.get_running_loop() + _loop = asyncio.get_running_loop() except RuntimeError: return asyncio.run(auto_detect_provider_async()) From da06193a9e632718042915cc846d277efc0cc582 Mon Sep 17 00:00:00 2001 From: Chaz Dinkle Date: Wed, 15 Apr 2026 15:36:48 -0400 Subject: [PATCH 3/9] style: apply ruff formatting to pass CI format check Co-Authored-By: Claude Opus 4.6 (1M context) --- agent_loop.py | 71 ++++++++++++-------- channels.py | 90 ++++++++++++++----------- draft_queue.py | 13 ++-- http_api.py | 16 ++--- mcp_shim.py | 170 +++++++++++++++++++++++++++++++++-------------- modules/voice.py | 2 +- providers.py | 27 ++------ server.py | 74 ++++++++++++--------- 8 files changed, 276 insertions(+), 187 deletions(-) diff --git a/agent_loop.py b/agent_loop.py index afdca23..dd8f548 100644 --- a/agent_loop.py +++ b/agent_loop.py @@ -87,8 +87,8 @@ def _fetch_kernel_context() -> str: _full = interrupted.get("full_text", "") pct = interrupted.get("spoken_pct", 0) parts.append( - f"[barge-in] Claude's speech was interrupted at {pct*100:.0f}%. " - f"Delivered: \"{delivered}\". " + f"[barge-in] Claude's speech was interrupted at {pct * 100:.0f}%. " + f'Delivered: "{delivered}". ' f"The user interrupted to say something — acknowledge and respond to them." 
) except Exception: @@ -125,6 +125,7 @@ def _log_exchange_to_bus(user_text: str, assistant_text: str, provider_name: str except Exception as e: logger.debug("Failed to log exchange to bus: %s", e) + MAX_HISTORY = 50 @@ -215,7 +216,9 @@ async def _process(self, event: CognitiveEvent) -> None: content=text, target_channel=self.channel_id, metadata={ - "voice": self._channel_ref.config.get("voice", "bm_lewis") if self._channel_ref else "bm_lewis", + "voice": self._channel_ref.config.get("voice", "bm_lewis") + if self._channel_ref + else "bm_lewis", "speed": self._channel_ref.config.get("speed", 1.25) if self._channel_ref else 1.25, }, ) @@ -250,10 +253,12 @@ async def _process(self, event: CognitiveEvent) -> None: # Update conversation history if assistant_parts: assistant_text = " ".join(assistant_parts) - self.conversation.append({ - "role": "assistant", - "content": assistant_text, - }) + self.conversation.append( + { + "role": "assistant", + "content": assistant_text, + } + ) # Log exchange to CogOS bus (observation channel — Claude can see this) _log_exchange_to_bus(event.content, assistant_text, self.provider.name) @@ -308,6 +313,7 @@ async def speculative_infer(self, committed_text: str) -> None: # Add to draft queue import hashlib + ctx_hash = hashlib.md5(committed_text.encode()).hexdigest()[:8] block = self.draft_queue.add_block( text=response_text, @@ -317,7 +323,9 @@ async def speculative_infer(self, committed_text: str) -> None: logger.info( "speculative block %s: '%s' (%.0fms)", - block.id, response_text[:60], t_ms, + block.id, + response_text[:60], + t_ms, ) # F2: Speculative TTS pre-synthesis @@ -326,10 +334,12 @@ async def speculative_infer(self, committed_text: str) -> None: # Notify dashboard of draft queue state if self._channel_ref: - await self._channel_ref.ws.send_json({ - "type": "draft_queue", - "blocks": [b.to_dict() for b in self.draft_queue.get_pending()], - }) + await self._channel_ref.ws.send_json( + { + "type": "draft_queue", + "blocks": [b.to_dict() for b in self.draft_queue.get_pending()], + } + ) except Exception as e: logger.debug("speculative_infer failed: %s", e) @@ -367,10 +377,12 @@ async def _push_draft_queue_state(self) -> None: """Push current draft queue state to the dashboard.""" if self._channel_ref: try: - await self._channel_ref.ws.send_json({ - "type": "draft_queue", - "blocks": [b.to_dict() for b in self.draft_queue.all_blocks], - }) + await self._channel_ref.ws.send_json( + { + "type": "draft_queue", + "blocks": [b.to_dict() for b in self.draft_queue.all_blocks], + } + ) except Exception: pass @@ -398,10 +410,12 @@ async def invalidate_stale_drafts(self, new_context: str) -> int: if invalidated > 0 and self._channel_ref: try: - await self._channel_ref.ws.send_json({ - "type": "draft_queue", - "blocks": [b.to_dict() for b in self.draft_queue.all_blocks], - }) + await self._channel_ref.ws.send_json( + { + "type": "draft_queue", + "blocks": [b.to_dict() for b in self.draft_queue.all_blocks], + } + ) except Exception: pass @@ -424,6 +438,7 @@ async def _presynthesise_block(self, block) -> None: def _synth(): from engine import synthesize + samples, sample_rate = synthesize( block.text, voice=voice, @@ -471,9 +486,9 @@ async def background_validate_drafts(self, latest_user_text: str) -> None: try: # Quick relevance check: ask the model if this block is still appropriate check_prompt = ( - f"Given the user just said: \"{latest_user_text}\"\n" + f'Given the user just said: "{latest_user_text}"\n' f"Is this planned response still appropriate? 
" - f"Response: \"{block.text}\"\n" + f'Response: "{block.text}"\n' f"Answer KEEP or REVISE in one word." ) @@ -520,18 +535,18 @@ def _build_interrupt_context(self, user_text: str) -> str | None: unspoken = "" if info.full_text and info.delivered_text: if info.full_text.startswith(info.delivered_text): - unspoken = info.full_text[len(info.delivered_text):].strip() + unspoken = info.full_text[len(info.delivered_text) :].strip() else: # Fallback: everything after the delivered percentage - unspoken = info.full_text[len(info.delivered_text):].strip() + unspoken = info.full_text[len(info.delivered_text) :].strip() parts = [] parts.append("[Barge-in context — your previous response was interrupted]") - parts.append(f"spoken (user heard this): \"{info.delivered_text}\"") + parts.append(f'spoken (user heard this): "{info.delivered_text}"') if unspoken: - parts.append(f"unspoken (user did NOT hear this): \"{unspoken}\"") - parts.append(f"interrupted_at: {info.spoken_pct*100:.0f}%") - parts.append(f"user_said: \"{user_text}\"") + parts.append(f'unspoken (user did NOT hear this): "{unspoken}"') + parts.append(f"interrupted_at: {info.spoken_pct * 100:.0f}%") + parts.append(f'user_said: "{user_text}"') parts.append("Acknowledge what was interrupted and respond to the user's new input.") return "\n".join(parts) diff --git a/channels.py b/channels.py index 8480953..c026e64 100644 --- a/channels.py +++ b/channels.py @@ -82,9 +82,7 @@ def _deliver_sync(self, output: EncodedOutput) -> None: if not self._active: return try: - future = asyncio.run_coroutine_threadsafe( - self._deliver_async(output), self._loop - ) + future = asyncio.run_coroutine_threadsafe(self._deliver_async(output), self._loop) future.result(timeout=10.0) except (WebSocketDisconnect, RuntimeError, TimeoutError): logger.debug("deliver failed (client disconnected?), deactivating channel") @@ -106,13 +104,15 @@ async def _deliver_async(self, output: EncodedOutput) -> None: # Send audio as base64 JSON (avoids binary frame issues) audio_b64 = base64.b64encode(output.data).decode("ascii") logger.info("deliver: sending base64 audio JSON (%d chars)", len(audio_b64)) - await self.ws.send_json({ - "type": "audio", - "data": audio_b64, - "format": output.format or "wav", - "duration_sec": round(output.duration_sec, 2), - "sample_rate": output.metadata.get("sample_rate", 24000), - }) + await self.ws.send_json( + { + "type": "audio", + "data": audio_b64, + "format": output.format or "wav", + "duration_sec": round(output.duration_sec, 2), + "sample_rate": output.metadata.get("sample_rate", 24000), + } + ) logger.info("deliver: audio sent OK") elif output.modality == ModalityType.TEXT: text = output.data.decode("utf-8") if isinstance(output.data, bytes) else str(output.data) @@ -211,13 +211,15 @@ def _transcribe_t1(): try: result = await asyncio.to_thread(_transcribe_t1) if result and result.get("changed") and not result.get("filtered"): - await self.ws.send_json({ - "type": "partial_transcript", - "confirmed": result["confirmed"], - "tentative": result["tentative"], - "tier": "t1", - "elapsed_ms": result["elapsed_ms"], - }) + await self.ws.send_json( + { + "type": "partial_transcript", + "confirmed": result["confirmed"], + "tentative": result["tentative"], + "tier": "t1", + "elapsed_ms": result["elapsed_ms"], + } + ) except Exception as e: logger.debug("T1 error: %s", e) @@ -241,13 +243,15 @@ def _transcribe_t2(): try: result = await asyncio.to_thread(_transcribe_t2) if result and not result.get("filtered"): - await self.ws.send_json({ - "type": 
"partial_transcript", - "confirmed": result["confirmed"], - "tentative": result["tentative"], - "tier": "t2", - "elapsed_ms": result["elapsed_ms"], - }) + await self.ws.send_json( + { + "type": "partial_transcript", + "confirmed": result["confirmed"], + "tentative": result["tentative"], + "tier": "t2", + "elapsed_ms": result["elapsed_ms"], + } + ) except Exception as e: logger.debug("T2 error: %s", e) finally: @@ -308,7 +312,7 @@ def _transcribe(): # Skip silence if len(audio) < 16000 * 0.3: return None - rms = float(np.sqrt(np.mean(audio ** 2))) + rms = float(np.sqrt(np.mean(audio**2))) if rms < 0.005: return None @@ -357,12 +361,14 @@ def _transcribe(): if event and event.content: # Send transcript to browser - await self.ws.send_json({ - "type": "transcript", - "text": event.content, - "stt_ms": round(stt_ms, 1), - "source": "voice", - }) + await self.ws.send_json( + { + "type": "transcript", + "text": event.content, + "stt_ms": round(stt_ms, 1), + "source": "voice", + } + ) # Forward to agent loop event.metadata["stt_ms"] = stt_ms if self._on_event: @@ -376,11 +382,13 @@ async def _process_text(self, text: str) -> None: source_channel=self.channel_id, confidence=1.0, ) - await self.ws.send_json({ - "type": "transcript", - "text": text, - "source": "text", - }) + await self.ws.send_json( + { + "type": "transcript", + "text": text, + "source": "text", + } + ) if self._on_event: await self._on_event(event) @@ -407,10 +415,12 @@ async def send_response_complete(self, metrics: dict | None = None) -> None: """Signal response is complete.""" if self._active: try: - await self.ws.send_json({ - "type": "response_complete", - "metrics": metrics or {}, - }) + await self.ws.send_json( + { + "type": "response_complete", + "metrics": metrics or {}, + } + ) except Exception: self._active = False diff --git a/draft_queue.py b/draft_queue.py index dccbc15..3a7c630 100644 --- a/draft_queue.py +++ b/draft_queue.py @@ -23,10 +23,11 @@ class BlockStatus(Enum): """Lifecycle states for a draft block.""" - VALID = "valid" # Generated, awaiting playback - STALE = "stale" # Invalidated by new context - SPOKEN = "spoken" # Successfully played aloud - SNIPPED = "snipped" # Removed by self-barge + + VALID = "valid" # Generated, awaiting playback + STALE = "stale" # Invalidated by new context + SPOKEN = "spoken" # Successfully played aloud + SNIPPED = "snipped" # Removed by self-barge SPEAKING = "speaking" # Currently being spoken @@ -38,8 +39,8 @@ class DraftBlock: text: str status: BlockStatus = BlockStatus.VALID created_at: float = field(default_factory=time.time) - context_hash: str = "" # Hash of context at generation time - generation_ms: float = 0.0 # How long inference took + context_hash: str = "" # Hash of context at generation time + generation_ms: float = 0.0 # How long inference took tts_audio: bytes | None = None # Pre-synthesized audio (if available) tts_duration_sec: float = 0.0 metadata: dict[str, Any] = field(default_factory=dict) diff --git a/http_api.py b/http_api.py index 0a6396f..45a7fb8 100644 --- a/http_api.py +++ b/http_api.py @@ -58,6 +58,7 @@ async def _warmup_kokoro(): def _do_warmup(): try: from engine import get_model + get_model("kokoro") logger.info("Kokoro TTS engine pre-warmed successfully") except Exception as e: @@ -571,6 +572,7 @@ def stop_speech(job_id: str = ""): """ try: from server import _speech_queue, pipeline_state + if job_id: cancelled = _speech_queue.cancel(job_id) return {"status": "ok", "message": f"Cancelled {job_id}" if cancelled else f"Job {job_id} not found"} @@ 
-677,24 +679,16 @@ async def _graceful_exit(): deadline = time.time() + timeout_sec while time.time() < deadline: with _jobs_lock: - active = sum( - 1 for j in _jobs.values() - if j.get("status") in ("generating", "processing") - ) + active = sum(1 for j in _jobs.values() if j.get("status") in ("generating", "processing")) if active == 0: break await asyncio.sleep(0.25) with _jobs_lock: - remaining = sum( - 1 for j in _jobs.values() - if j.get("status") in ("generating", "processing") - ) + remaining = sum(1 for j in _jobs.values() if j.get("status") in ("generating", "processing")) if remaining: - logger.warning( - "Shutdown timeout reached with %d active jobs — forcing exit", remaining - ) + logger.warning("Shutdown timeout reached with %d active jobs — forcing exit", remaining) else: logger.info("All jobs drained — exiting cleanly") diff --git a/mcp_shim.py b/mcp_shim.py index 83831fb..a34ca35 100644 --- a/mcp_shim.py +++ b/mcp_shim.py @@ -51,8 +51,7 @@ _BARGEIN_SIGNAL = os.path.expanduser("~/.mod3_bargein_signal.json") -def _http_request(method: str, path: str, body: dict | None = None, - timeout: float = 30.0) -> tuple[int, dict | bytes]: +def _http_request(method: str, path: str, body: dict | None = None, timeout: float = 30.0) -> tuple[int, dict | bytes]: """Make an HTTP request to the Mod3 service. Returns (status_code, parsed_json_or_bytes).""" url = f"{MOD3_BASE}{path}" headers = {"Content-Type": "application/json"} if body is not None else {} @@ -161,8 +160,10 @@ def _estimate_duration(text: str, speed: float) -> float: # Tool implementations # --------------------------------------------------------------------------- -def tool_speak(text: str, voice: str = "bm_lewis", stream: bool = True, - speed: float = 1.25, emotion: float = 0.5) -> str: + +def tool_speak( + text: str, voice: str = "bm_lewis", stream: bool = True, speed: float = 1.25, emotion: float = 0.5 +) -> str: """Synthesize via HTTP, play locally.""" if not text.strip(): return json.dumps({"status": "error", "error": "Nothing to say"}) @@ -173,20 +174,30 @@ def tool_speak(text: str, voice: str = "bm_lewis", stream: bool = True, with open(_BARGEIN_SIGNAL) as f: sig = json.load(f) if sig.get("event") == "user_speaking_start": - return json.dumps({ - "status": "held", - "reason": "User is currently speaking — re-send after user finishes.", - "user_state": "recording", - "estimated_duration_sec": round(_estimate_duration(text, speed), 1), - }) + return json.dumps( + { + "status": "held", + "reason": "User is currently speaking — re-send after user finishes.", + "user_state": "recording", + "estimated_duration_sec": round(_estimate_duration(text, speed), 1), + } + ) except Exception: pass # Request synthesis from HTTP service - status, resp = _http_request("POST", "/v1/synthesize", { - "text": text, "voice": voice, "speed": speed, "emotion": emotion, - "format": "wav", - }, timeout=60.0) + status, resp = _http_request( + "POST", + "/v1/synthesize", + { + "text": text, + "voice": voice, + "speed": speed, + "emotion": emotion, + "format": "wav", + }, + timeout=60.0, + ) if status == 0: return json.dumps({"status": "error", "error": resp.get("error", "Service unreachable")}) @@ -197,7 +208,7 @@ def tool_speak(text: str, voice: str = "bm_lewis", stream: bool = True, return json.dumps({"status": "error", "error": "Expected audio bytes from synthesize"}) # Create job and play in background - job_id = f"shim-{int(time.time()*1000)}" + job_id = f"shim-{int(time.time() * 1000)}" with _jobs_lock: _jobs[job_id] = { "status": 
"generating", @@ -230,6 +241,7 @@ def tool_stop(job_id: str = "") -> str: _playback_interrupt.set() try: import sounddevice as sd + sd.stop() except Exception: pass @@ -242,6 +254,7 @@ def tool_stop(job_id: str = "") -> str: _playback_interrupt.set() try: import sounddevice as sd + sd.stop() except Exception: pass @@ -372,7 +385,8 @@ def tool_await_voice_input(timeout_sec: float = 180.0) -> str: try: folders = sorted( [d for d in os.listdir(_rec_dir) if d.isdigit()], - key=int, reverse=True, + key=int, + reverse=True, ) if folders: meta_path = os.path.join(_rec_dir, folders[0], "meta.json") @@ -382,14 +396,16 @@ def tool_await_voice_input(timeout_sec: float = 180.0) -> str: raw = meta.get("rawResult", "").strip() result = meta.get("result", raw).strip() duration_ms = meta.get("duration", 0) - return json.dumps({ - "status": "ok", - "transcript": result if result else raw, - "raw_transcript": raw, - "duration_sec": round(duration_ms / 1000, 1), - "folder": folders[0], - "source": "superwhisper", - }) + return json.dumps( + { + "status": "ok", + "transcript": result if result else raw, + "raw_transcript": raw, + "duration_sec": round(duration_ms / 1000, 1), + "folder": folders[0], + "source": "superwhisper", + } + ) except Exception as e: logger.warning("await_voice_input error: %s", e) @@ -411,16 +427,22 @@ def tool_vad_check(file_path: str, threshold: float = 0.5) -> str: # The HTTP API expects multipart file upload, use urllib boundary = "----Mod3ShimBoundary" body = ( - f"--{boundary}\r\n" - f'Content-Disposition: form-data; name="file"; filename="{os.path.basename(file_path)}"\r\n' - f"Content-Type: audio/wav\r\n\r\n" - ).encode() + wav_data + f"\r\n--{boundary}--\r\n".encode() + ( + f"--{boundary}\r\n" + f'Content-Disposition: form-data; name="file"; filename="{os.path.basename(file_path)}"\r\n' + f"Content-Type: audio/wav\r\n\r\n" + ).encode() + + wav_data + + f"\r\n--{boundary}--\r\n".encode() + ) url = f"{MOD3_BASE}/v1/vad" if threshold != 0.5: url += f"?threshold={threshold}" req = urllib.request.Request( - url, data=body, method="POST", + url, + data=body, + method="POST", headers={"Content-Type": f"multipart/form-data; boundary={boundary}"}, ) try: @@ -458,10 +480,26 @@ def tool_vad_check(file_path: str, threshold: float = 0.5) -> str: "type": "object", "properties": { "text": {"type": "string", "description": "The text to speak aloud. Keep it conversational."}, - "voice": {"type": "string", "default": "bm_lewis", "description": "Voice preset. Use list_voices() to see options. Defaults to \"bm_lewis\" (Kokoro)."}, - "stream": {"type": "boolean", "default": True, "description": "If True, plays audio chunks as they generate (lower latency)."}, - "speed": {"type": "number", "default": 1.25, "description": "Speed multiplier (engines with speed support). Default 1.25."}, - "emotion": {"type": "number", "default": 0.5, "description": "Emotion/exaggeration intensity 0.0-1.0 (Chatterbox only). Default 0.5."}, + "voice": { + "type": "string", + "default": "bm_lewis", + "description": 'Voice preset. Use list_voices() to see options. Defaults to "bm_lewis" (Kokoro).', + }, + "stream": { + "type": "boolean", + "default": True, + "description": "If True, plays audio chunks as they generate (lower latency).", + }, + "speed": { + "type": "number", + "default": 1.25, + "description": "Speed multiplier (engines with speed support). Default 1.25.", + }, + "emotion": { + "type": "number", + "default": 0.5, + "description": "Emotion/exaggeration intensity 0.0-1.0 (Chatterbox only). 
Default 0.5.", + }, }, "required": ["text"], }, @@ -478,8 +516,16 @@ def tool_vad_check(file_path: str, threshold: float = 0.5) -> str: "inputSchema": { "type": "object", "properties": { - "job_id": {"type": "string", "default": "", "description": "The job ID returned by speak(). If empty, returns the latest job."}, - "verbose": {"type": "boolean", "default": False, "description": "If True, include per-chunk metrics. Default False (summary only)."}, + "job_id": { + "type": "string", + "default": "", + "description": "The job ID returned by speak(). If empty, returns the latest job.", + }, + "verbose": { + "type": "boolean", + "default": False, + "description": "If True, include per-chunk metrics. Default False (summary only).", + }, }, }, }, @@ -495,7 +541,11 @@ def tool_vad_check(file_path: str, threshold: float = 0.5) -> str: "inputSchema": { "type": "object", "properties": { - "job_id": {"type": "string", "default": "", "description": "If provided, cancels that specific job. If empty, stops everything."}, + "job_id": { + "type": "string", + "default": "", + "description": "If provided, cancels that specific job. If empty, stops everything.", + }, }, }, }, @@ -510,7 +560,7 @@ def tool_vad_check(file_path: str, threshold: float = 0.5) -> str: "Block until the user finishes a SuperWhisper recording, then return the transcript.\n\n" "This closes the voice input loop: instead of waiting for the user to paste\n" "their transcribed text, you can directly receive what they said. Use this\n" - "when speak() returns \"held\" (user is recording) or when you want to listen\n" + 'when speak() returns "held" (user is recording) or when you want to listen\n' "for the next voice input.\n\n" "Polls the barge-in signal file for user_speaking_end, then reads the\n" "transcript from SuperWhisper's recordings directory.\n\n" @@ -520,7 +570,11 @@ def tool_vad_check(file_path: str, threshold: float = 0.5) -> str: "inputSchema": { "type": "object", "properties": { - "timeout_sec": {"type": "number", "default": 180, "description": "Maximum seconds to wait for recording to finish. Default 180 (3 minutes)."}, + "timeout_sec": { + "type": "number", + "default": 180, + "description": "Maximum seconds to wait for recording to finish. Default 180 (3 minutes).", + }, }, }, }, @@ -534,14 +588,18 @@ def tool_vad_check(file_path: str, threshold: float = 0.5) -> str: "description": ( "List audio output devices, or set the active one.\n\n" "Args:\n" - " device: Device index (e.g. \"3\"), name substring (e.g. \"AirPods\"),\n" - " or \"default\" to track the system default automatically.\n" + ' device: Device index (e.g. "3"), name substring (e.g. "AirPods"),\n' + ' or "default" to track the system default automatically.\n' " If empty, lists available devices without changing anything." ), "inputSchema": { "type": "object", "properties": { - "device": {"type": "string", "default": "", "description": "Device index, name substring, or 'default'. If empty, lists devices."}, + "device": { + "type": "string", + "default": "", + "description": "Device index, name substring, or 'default'. If empty, lists devices.", + }, }, }, }, @@ -559,7 +617,11 @@ def tool_vad_check(file_path: str, threshold: float = 0.5) -> str: "type": "object", "properties": { "file_path": {"type": "string", "description": "Path to a WAV audio file."}, - "threshold": {"type": "number", "default": 0.5, "description": "Speech probability threshold 0-1 (default 0.5). 
Higher = stricter."}, + "threshold": { + "type": "number", + "default": 0.5, + "description": "Speech probability threshold 0-1 (default 0.5). Higher = stricter.", + }, }, "required": ["file_path"], }, @@ -634,11 +696,14 @@ def _jsonrpc_error(id: Any, code: int, message: str) -> dict: def handle_initialize(msg: dict) -> dict: - return _jsonrpc_response(msg["id"], { - "protocolVersion": "2024-11-05", - "serverInfo": SERVER_INFO, - "capabilities": CAPABILITIES, - }) + return _jsonrpc_response( + msg["id"], + { + "protocolVersion": "2024-11-05", + "serverInfo": SERVER_INFO, + "capabilities": CAPABILITIES, + }, + ) def handle_tools_list(msg: dict) -> dict: @@ -659,9 +724,12 @@ def handle_tools_call(msg: dict) -> dict: except Exception as e: result_text = json.dumps({"status": "error", "error": str(e)}) - return _jsonrpc_response(msg["id"], { - "content": [{"type": "text", "text": result_text}], - }) + return _jsonrpc_response( + msg["id"], + { + "content": [{"type": "text", "text": result_text}], + }, + ) def handle_notifications_initialized(msg: dict): @@ -705,6 +773,7 @@ def run_stdio(): # Self-test # --------------------------------------------------------------------------- + def self_test(): """Quick connectivity check.""" print(f"Mod3 shim — testing connection to {MOD3_BASE}") @@ -723,6 +792,7 @@ def self_test(): # Check sounddevice try: import sounddevice as sd + default_out = sd.query_devices(sd.default.device[1]) print(f" OK: sounddevice available — default output: {default_out['name']}") except ImportError: diff --git a/modules/voice.py b/modules/voice.py index aa03263..20e02e1 100644 --- a/modules/voice.py +++ b/modules/voice.py @@ -201,7 +201,7 @@ def decode_streaming( confirmed = confirmed[:last_space] # Tentative = remainder after confirmed prefix - tentative = transcript[len(confirmed):].strip() + tentative = transcript[len(confirmed) :].strip() # T3 = end-of-utterance, everything is confirmed if tier == "t3": diff --git a/providers.py b/providers.py index a8034b9..259361b 100644 --- a/providers.py +++ b/providers.py @@ -121,8 +121,7 @@ def _format_tools_for_prompt(tools: list[dict]) -> str: for pname, pinfo in props.items(): req_marker = " (required)" if pname in required else "" lines.append( - f" - {pname} ({pinfo.get('type', 'string')}): " - f"{pinfo.get('description', '')}{req_marker}" + f" - {pname} ({pinfo.get('type', 'string')}): {pinfo.get('description', '')}{req_marker}" ) lines.append( "\nTo call a tool, output exactly:\n" @@ -135,9 +134,7 @@ def _format_tools_for_prompt(tools: list[dict]) -> str: return "\n".join(lines) -_TOOL_CALL_RE = re.compile( - r"\s*(\{.*?\})\s*", re.DOTALL -) +_TOOL_CALL_RE = re.compile(r"\s*(\{.*?\})\s*", re.DOTALL) def _parse_tool_calls(text: str) -> list[ToolCall]: @@ -168,9 +165,7 @@ class MlxProvider: """ def __init__(self, model_id: str | None = None): - self._model_id = model_id or os.environ.get( - "MLX_MODEL", "mlx-community/gemma-3-4b-it-4bit" - ) + self._model_id = model_id or os.environ.get("MLX_MODEL", "mlx-community/gemma-3-4b-it-4bit") self._model = None self._tokenizer = None @@ -210,9 +205,7 @@ def _generate_sync( msgs = [{"role": "system", "content": "\n\n".join(system_parts)}] + msgs # Apply chat template - prompt = self._tokenizer.apply_chat_template( - msgs, add_generation_prompt=True, tokenize=False - ) + prompt = self._tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False) max_tokens = int(os.environ.get("MLX_MAX_TOKENS", "512")) raw_output = generate( @@ -240,9 +233,7 @@ async def chat( tools: 
list[dict] | None = None, system: str = "", ) -> ProviderResponse: - return await asyncio.to_thread( - self._generate_sync, messages, tools, system - ) + return await asyncio.to_thread(self._generate_sync, messages, tools, system) # --------------------------------------------------------------------------- @@ -258,9 +249,7 @@ def __init__( endpoint: str | None = None, model: str | None = None, ): - self._endpoint = endpoint or os.environ.get( - "OLLAMA_ENDPOINT", "http://localhost:11434" - ) + self._endpoint = endpoint or os.environ.get("OLLAMA_ENDPOINT", "http://localhost:11434") self._model = model or os.environ.get("OLLAMA_MODEL", "gemma4:e4b") @property @@ -322,9 +311,7 @@ class CogOSProvider: """CogOS kernel — OpenAI-compatible chat/completions with tool support.""" def __init__(self, endpoint: str | None = None): - self._endpoint = endpoint or os.environ.get( - "COGOS_ENDPOINT", "http://localhost:5100" - ) + self._endpoint = endpoint or os.environ.get("COGOS_ENDPOINT", "http://localhost:5100") @property def name(self) -> str: diff --git a/server.py b/server.py index 8c79bcb..cc9016f 100644 --- a/server.py +++ b/server.py @@ -416,9 +416,11 @@ def _bargein_watcher(): """Background thread that watches for barge-in signal file changes.""" global _bargein_last_mtime import json as _json + while True: try: import os + if os.path.exists(_BARGEIN_SIGNAL): mtime = os.path.getmtime(_BARGEIN_SIGNAL) if mtime > _bargein_last_mtime: @@ -437,7 +439,10 @@ def _bargein_watcher(): } with open(_BARGEIN_SIGNAL, "w") as f: _json.dump(signal, f, indent=2) - logging.info("Barge-in: paused local playback (%.0f%% delivered)", info.spoken_pct * 100 if info else 0) + logging.info( + "Barge-in: paused local playback (%.0f%% delivered)", + info.spoken_pct * 100 if info else 0, + ) else: # Check cross-process lock (another Mod³ process may be speaking) lock = _is_any_process_speaking() @@ -834,12 +839,14 @@ def speak( # can't be cleared by stop(). 
if user_state == "recording": est_duration = _estimate_duration_sec(text, speed) - return json.dumps({ - "status": "held", - "reason": "User is currently speaking — re-send this speak() call after user finishes.", - "user_state": "recording", - "estimated_duration_sec": round(est_duration, 1), - }) + return json.dumps( + { + "status": "held", + "reason": "User is currently speaking — re-send this speak() call after user finishes.", + "user_state": "recording", + "estimated_duration_sec": round(est_duration, 1), + } + ) try: job_id, position = _start_speech(text, voice, stream=stream, speed=speed, emotion=emotion) @@ -1162,9 +1169,7 @@ def await_voice_input(timeout_sec: float = 180.0) -> str: """ import sqlite3 as _sqlite3 - _sw_db = os.path.expanduser( - "~/Library/Application Support/SuperWhisper/database/superwhisper.sqlite" - ) + _sw_db = os.path.expanduser("~/Library/Application Support/SuperWhisper/database/superwhisper.sqlite") _rec_dir = os.path.expanduser("~/Documents/superwhisper/recordings") start = time.time() @@ -1198,23 +1203,23 @@ def await_voice_input(timeout_sec: float = 180.0) -> str: raw = meta.get("rawResult", "").strip() result = meta.get("result", raw).strip() duration_ms = meta.get("duration", 0) - return json.dumps({ - "status": "ok", - "transcript": result if result else raw, - "raw_transcript": raw, - "duration_sec": round(duration_ms / 1000, 1), - "folder": folders[0], - "source": "superwhisper", - }) + return json.dumps( + { + "status": "ok", + "transcript": result if result else raw, + "raw_transcript": raw, + "duration_sec": round(duration_ms / 1000, 1), + "folder": folders[0], + "source": "superwhisper", + } + ) except Exception as e: logger.warning("await_voice_input meta.json fallback failed: %s", e) # Method 2: Query SuperWhisper SQLite DB try: conn = _sqlite3.connect(f"file:{_sw_db}?mode=ro", uri=True, timeout=2.0) - row = conn.execute( - "SELECT folderName, duration FROM recording ORDER BY datetime DESC LIMIT 1" - ).fetchone() + row = conn.execute("SELECT folderName, duration FROM recording ORDER BY datetime DESC LIMIT 1").fetchone() conn.close() if row: folder_name, duration = row @@ -1224,14 +1229,16 @@ def await_voice_input(timeout_sec: float = 180.0) -> str: meta = json.load(f) raw = meta.get("rawResult", "").strip() result = meta.get("result", raw).strip() - return json.dumps({ - "status": "ok", - "transcript": result if result else raw, - "raw_transcript": raw, - "duration_sec": round(duration / 1000, 1), - "folder": folder_name, - "source": "superwhisper_db", - }) + return json.dumps( + { + "status": "ok", + "transcript": result if result else raw, + "raw_transcript": raw, + "duration_sec": round(duration / 1000, 1), + "folder": folder_name, + "source": "superwhisper_db", + } + ) except Exception as e: logger.warning("await_voice_input DB fallback failed: %s", e) @@ -1308,12 +1315,17 @@ def set_output_device(device: str = "") -> str: outputs.append({"index": i, "name": d["name"], "active": is_active, "default": is_default}) if not device: - lines = [f" [{'*' if d['active'] else ' '}] {d['index']}: {d['name']}{' (system default)' if d['default'] else ''}" for d in outputs] + lines = [ + f" [{'*' if d['active'] else ' '}] {d['index']}: {d['name']}{' (system default)' if d['default'] else ''}" + for d in outputs + ] return "Audio output devices (* = active):\n" + "\n".join(lines) if device.lower() == "default": _output_device = None - return json.dumps({"status": "ok", "device": "system_default", "note": "Now tracking system default output device"}) + 
return json.dumps( + {"status": "ok", "device": "system_default", "note": "Now tracking system default output device"} + ) if device.isdigit(): _output_device = int(device) From f7f0df9ab6abd5889b0936951cc063caeb50d0c7 Mon Sep 17 00:00:00 2001 From: Chaz Dinkle Date: Wed, 15 Apr 2026 15:40:57 -0400 Subject: [PATCH 4/9] =?UTF-8?q?fix:=20player.stop()=20=E2=86=92=20player.f?= =?UTF-8?q?lush()=20for=20barge-in=20interrupt=20(pyright=20type=20error)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 (1M context) --- server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server.py b/server.py index cc9016f..0c0e4e5 100644 --- a/server.py +++ b/server.py @@ -667,7 +667,7 @@ def _run_speech_job(entry: dict) -> None: # Check if barge-in cleared our speaking lock (cross-process interrupt) if not os.path.exists(_SPEAKING_LOCK): logging.info("Speaking lock cleared by barge-in watcher — stopping generation") - player.stop() + player.flush() break player.queue_audio(chunk.samples, chunk_meta=chunk.metadata if chunk.metadata else None) _set_bus_voice_state( From 503057e1ca7ddf77e5941569a6a73e893b0dfa80 Mon Sep 17 00:00:00 2001 From: Chaz Dinkle Date: Fri, 17 Apr 2026 23:47:24 -0400 Subject: [PATCH 5/9] =?UTF-8?q?feat:=20bus-mediated=20dashboard=20chat=20?= =?UTF-8?q?=E2=80=94=20cogos=20kernel=20as=20inference=20backend?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wires Mod³'s dashboard chat to route user messages through the cogos kernel's running metabolic-cycle agent instead of the local MLX Gemma provider. When MOD3_USE_COGOS_AGENT=1, user turns flow as bus events (bus_dashboard_chat → kernel inlet → harness observation → respond tool → bus_dashboard_response) and render back in the dashboard as response_text frames. Voice and text now share a single conversation through the same metabolic cycle. Also lands bidirectional barge-in context stitching on the WebSocket path: BargeinContext schema + agent_loop injection into next-turn system prompt. Fixes the gap where dashboard interruptions halted TTS but didn't surface structured context to the agent (previously only the MCP/SuperWhisper file-signal path injected it). 6 new bargein tests; 2 new bus-bridge tests; 5 new cogos-agent bridge tests. 47 pytest collect total. Dashboard: live Cycle Trace drawer consuming bus_cycle_trace via SSE subscriber. Bottom-drawer UI, 100-entry rolling window, collapsible with localStorage. ort.min.js + WASM for VAD runtime. Whisper default pinned to whisper-base-mlx to reduce concurrent MLX Metal pressure (Gemma + Kokoro + Whisper segfault). Large-v3-turbo restoration is a separate MLX-stability fix; voice-input path still crashes on mic due to underlying MLX concurrency issue (known, tracked separately). 
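For reference, a forwarded turn reaches the kernel as a flat bus/send
body with the user event JSON-encoded into `message` (shape per
cogos_agent_bridge.post_user_message in this patch; field values here
are illustrative):

    {"bus_id": "bus_dashboard_chat", "from": "mod3-dashboard",
     "type": "user_message",
     "message": "{\"type\": \"user_message\", \"text\": \"hi\", ...}"}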
Co-Authored-By: Claude Opus 4.7 (1M context) --- agent_loop.py | 112 +- bus_bridge.py | 281 +++ bus_bridge_runner.py | 144 ++ channels.py | 70 + cogos_agent_bridge.py | 240 +++ dashboard/index.html | 53 + dashboard/trace.js | 179 ++ dashboard/transport.js | 10 + dashboard/vad/ort.min.js | 2869 ++++++++++++++++++++++++++++++ demo/e2e_audio_trace_demo.py | 140 ++ demo/e2e_dashboard_harness.py | 176 ++ http_api.py | 52 + modules/voice.py | 4 +- schemas/__init__.py | 1 + schemas/bargein.py | 70 + tests/test_bargein_context.py | 200 +++ tests/test_bus_bridge_runner.py | 74 + tests/test_cogos_agent_bridge.py | 114 ++ 18 files changed, 4754 insertions(+), 35 deletions(-) create mode 100644 bus_bridge.py create mode 100644 bus_bridge_runner.py create mode 100644 cogos_agent_bridge.py create mode 100644 dashboard/trace.js create mode 100644 dashboard/vad/ort.min.js create mode 100644 demo/e2e_audio_trace_demo.py create mode 100644 demo/e2e_dashboard_harness.py create mode 100644 schemas/__init__.py create mode 100644 schemas/bargein.py create mode 100644 tests/test_bargein_context.py create mode 100644 tests/test_bus_bridge_runner.py create mode 100644 tests/test_cogos_agent_bridge.py diff --git a/agent_loop.py b/agent_loop.py index dd8f548..63b9ea3 100644 --- a/agent_loop.py +++ b/agent_loop.py @@ -21,6 +21,7 @@ from modality import CognitiveEvent, CognitiveIntent, ModalityType from pipeline_state import PipelineState from providers import AGENT_TOOLS, InferenceProvider +from schemas.bargein import BargeinContext if TYPE_CHECKING: from channels import BrowserChannel @@ -149,6 +150,9 @@ def __init__( self.draft_queue = DraftQueue() self._speculative_context: list[dict[str, str]] = [] # Context for speculative inference self._human_speaking = False # Whether human is currently speaking + # A2: typed barge-in context prepared before the next turn, consumed by A3 + # for prompt injection. Set by _prepare_bargein_context() on the WS path. + self._pending_bargein: BargeinContext | None = None async def handle_event(self, event: CognitiveEvent) -> None: """Called when a CognitiveEvent arrives from the channel.""" @@ -175,12 +179,46 @@ async def handle_event(self, event: CognitiveEvent) -> None: async def _process(self, event: CognitiveEvent) -> None: """Core: event → provider → tool dispatch.""" - # Context stitching: inject interrupt context from dashboard path - # This closes the barge-in loop — the agent knows what was spoken, - # what was unsaid, and what the user interrupted with. - interrupt_context = self._build_interrupt_context(event.content) - if interrupt_context: - self.conversation.append({"role": "system", "content": interrupt_context}) + # A2: build typed BargeinContext from pipeline_state.last_interrupt (if any) + # and stash on self._pending_bargein. A3 will consume it for prompt injection. + self._prepare_bargein_context(user_text=event.content) + + # MOD3_USE_COGOS_AGENT fork: forward user turn to kernel bus instead of + # calling local provider. Response arrives asynchronously via the + # cogos_agent_bridge → BrowserChannel.broadcast_response_text path. + from cogos_agent_bridge import is_enabled as _cogos_agent_enabled + from cogos_agent_bridge import post_user_message as _post_user_message + + if _cogos_agent_enabled(): + session_id = f"mod3:{self.channel_id or 'unknown'}" + # Fold any pending barge-in context into the forwarded text so the + # kernel cycle sees it. A full structured payload will come in a + # later iteration; for v1 we prepend the terse prompt renderer. 
+ forwarded_text = event.content + pending = self._pending_bargein + if pending is not None: + self._pending_bargein = None + forwarded_text = ( + "[interrupted earlier] " + + pending.format_for_prompt() + + "\n" + + forwarded_text + ) + ok = await _post_user_message(forwarded_text, session_id=session_id) + if not ok and self._channel_ref: + try: + await self._channel_ref.send_response_text( + "[cogos-agent unreachable — check kernel]" + ) + await self._channel_ref.send_response_complete( + metrics={"provider": "cogos-agent", "error": "unreachable"} + ) + except Exception: + pass + # Track the user turn in history so subsequent turns carry it. + self.conversation.append({"role": "user", "content": event.content}) + self._trim_history() + return self.conversation.append({"role": "user", "content": event.content}) self._trim_history() @@ -190,6 +228,7 @@ async def _process(self, event: CognitiveEvent) -> None: # Assemble system prompt with kernel context (afferent path) kernel_ctx = _fetch_kernel_context() system_prompt = _BASE_SYSTEM_PROMPT + kernel_ctx + system_prompt = self._inject_pending_bargein(system_prompt) response = await self.provider.chat( messages=self.conversation, @@ -510,46 +549,51 @@ async def background_validate_drafts(self, latest_user_text: str) -> None: await self._push_draft_queue_state() - def _build_interrupt_context(self, user_text: str) -> str | None: - """Build context stitch from pipeline_state.last_interrupt. - - When the user barged in during TTS playback, captures what was - spoken vs unspoken and injects it as structured context for the - next inference call. Consumes the interrupt (clears it). + def _prepare_bargein_context(self, user_text: str | None) -> None: + """Read pipeline_state.last_interrupt and stash a typed BargeinContext. - Returns a context string, or None if no interrupt occurred. + Called at the top of each WS turn. If the previous assistant reply was + interrupted (and the interrupt is still fresh, < 30s), build a + BargeinContext via the A1 schema and store it on ``self._pending_bargein`` + for A3 to pick up during prompt construction. Clears last_interrupt so + the next turn does not re-consume a stale record. """ info = self.pipeline_state.last_interrupt if info is None: - return None + self._pending_bargein = None + return # Only use recent interrupts (within last 30 seconds) if time.time() - info.timestamp > 30: - return None + # Stale — clear and skip. + with self.pipeline_state._lock: + self.pipeline_state._last_interrupt = None + self._pending_bargein = None + return - # Clear the interrupt so we don't re-inject it + # Consume the interrupt so we don't re-inject it on subsequent turns. + # pipeline_state has no public consume helper yet; clear the private + # slot under its lock (matches the pre-existing pattern on this path). 
with self.pipeline_state._lock: self.pipeline_state._last_interrupt = None - # Compute unspoken remainder - unspoken = "" - if info.full_text and info.delivered_text: - if info.full_text.startswith(info.delivered_text): - unspoken = info.full_text[len(info.delivered_text) :].strip() - else: - # Fallback: everything after the delivered percentage - unspoken = info.full_text[len(info.delivered_text) :].strip() + self._pending_bargein = BargeinContext.from_interrupt_info( + info, + source="browser_vad", + user_said=user_text or None, + ) - parts = [] - parts.append("[Barge-in context — your previous response was interrupted]") - parts.append(f'spoken (user heard this): "{info.delivered_text}"') - if unspoken: - parts.append(f'unspoken (user did NOT hear this): "{unspoken}"') - parts.append(f"interrupted_at: {info.spoken_pct * 100:.0f}%") - parts.append(f'user_said: "{user_text}"') - parts.append("Acknowledge what was interrupted and respond to the user's new input.") - - return "\n".join(parts) + def _inject_pending_bargein(self, system_prompt: str) -> str: + """Append the pending BargeinContext (if any) to the system prompt. + + Consumes ``self._pending_bargein`` so it does not leak into subsequent + turns. Returns the prompt unchanged if no barge-in is pending. + """ + pending = self._pending_bargein + if pending is None: + return system_prompt + self._pending_bargein = None + return system_prompt + "\n\n" + pending.format_for_prompt() def _trim_history(self) -> None: """Keep conversation within MAX_HISTORY messages.""" diff --git a/bus_bridge.py b/bus_bridge.py new file mode 100644 index 0000000..27745c1 --- /dev/null +++ b/bus_bridge.py @@ -0,0 +1,281 @@ +"""Kernel-bus SSE subscriber. + +Consumes http://localhost:6931/v1/events/stream and yields parsed bus events. +Reconnects on disconnect with exponential backoff. Tolerates unknown event kinds +per ADR-083 (cycle-trace event contract). + +C3 will consume this to broadcast CycleEvents to dashboard WebSocket clients. + +The kernel (see apps/cogos/bus_stream.go) emits SSE frames of the form: + + data: {"id":"live_*_42","type":"bus.event","timestamp":"...","data":{}}\\n\\n + +Heartbeats arrive as SSE comment lines: + + : keep-alive\\n\\n + +An initial frame of {"type":"connected","bus_id":"*","timestamp":"..."} is +sent on subscribe — we surface that as a BusEnvelope with kind="connected". +""" + +from __future__ import annotations + +import asyncio +import json +import logging +from dataclasses import dataclass, field +from typing import Any, AsyncIterator, Optional + +import httpx + +logger = logging.getLogger("mod3.bus_bridge") + +KERNEL_BUS_STREAM_URL = "http://localhost:6931/v1/events/stream" + + +@dataclass +class BusEnvelope: + """Raw bus-envelope record as received from the kernel SSE stream. + + `raw` is the full outer JSON (the bus.event envelope). `payload` is the + inner CogBlock dict (envelope["data"]) — may be {} for non-bus.event + frames (e.g. the initial "connected" frame). `kind` is the best-effort + event-kind string: preferring payload["kind"] (ADR-083 CycleEvent), then + payload["type"], then envelope["type"]. Consumers MUST tolerate unknown + kinds. 
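+
+    Illustrative instance (field values invented for this example; the
+    shapes follow the module docstring above)::
+
+        BusEnvelope(
+            raw={"id": "live_bus_cycle_trace_42", "type": "bus.event",
+                 "timestamp": "2026-04-17T23:40:00Z",
+                 "data": {"kind": "state_transition", "cycle_id": "c7"}},
+            kind="state_transition",
+            payload={"kind": "state_transition", "cycle_id": "c7"},
+            ts="2026-04-17T23:40:00Z",
+            event_id="live_bus_cycle_trace_42",
+        )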
+ """ + + raw: dict + kind: str + payload: dict = field(default_factory=dict) + ts: Optional[str] = None + event_id: Optional[str] = None + + +def _extract_kind(envelope: dict, payload: dict) -> str: + for src in (payload, envelope): + for key in ("kind", "type"): + val = src.get(key) if isinstance(src, dict) else None + if isinstance(val, str) and val: + return val + return "unknown" + + +class KernelBusSubscriber: + """Async SSE subscriber for the cogos kernel bus stream. + + Usage:: + + sub = KernelBusSubscriber() + async for env in sub.stream(): + handle(env) + + `stream()` yields indefinitely; on any transport error it reconnects + with exponential backoff clamped to [reconnect_min_s, reconnect_max_s]. + Call `close()` (or cancel the consuming task) to stop. + """ + + def __init__( + self, + url: str = KERNEL_BUS_STREAM_URL, + *, + bus_filter: str = "*", + consumer_id: Optional[str] = None, + reconnect_min_s: float = 1.0, + reconnect_max_s: float = 30.0, + request_timeout_s: float = 10.0, + ) -> None: + self._url = url + self._bus_filter = bus_filter + self._consumer_id = consumer_id + self._min_backoff = reconnect_min_s + self._max_backoff = reconnect_max_s + self._request_timeout = request_timeout_s + self._last_event_id: Optional[str] = None + self._closed = asyncio.Event() + self._client: Optional[httpx.AsyncClient] = None + + async def close(self) -> None: + self._closed.set() + if self._client is not None: + try: + await self._client.aclose() + except Exception: # pragma: no cover - best-effort + pass + self._client = None + + def _build_params(self) -> dict[str, str]: + params: dict[str, str] = {} + if self._bus_filter and self._bus_filter != "*": + params["bus_id"] = self._bus_filter + if self._consumer_id: + params["consumer"] = self._consumer_id + return params + + def _build_headers(self) -> dict[str, str]: + headers = {"Accept": "text/event-stream", "Cache-Control": "no-cache"} + if self._last_event_id: + # Harmless if the kernel doesn't honor it today; future protocol + # bump may use it for resume. + headers["Last-Event-ID"] = self._last_event_id + return headers + + async def stream(self) -> AsyncIterator[BusEnvelope]: + backoff = self._min_backoff + # Generous read timeout — SSE is long-lived with 30s heartbeats. 
+ timeout = httpx.Timeout(self._request_timeout, read=None) + while not self._closed.is_set(): + self._client = httpx.AsyncClient(timeout=timeout) + try: + async with self._client.stream( + "GET", + self._url, + params=self._build_params(), + headers=self._build_headers(), + ) as resp: + if resp.status_code != 200: + logger.info( + "bus-bridge: non-200 from %s: %s — backing off %.1fs", + self._url, resp.status_code, backoff, + ) + await self._sleep_or_close(backoff) + backoff = min(self._max_backoff, max(self._min_backoff, backoff * 2)) + continue + + logger.info("bus-bridge: connected to %s", self._url) + backoff = self._min_backoff # reset on successful connect + + async for envelope in self._iter_sse(resp): + yield envelope + except (httpx.HTTPError, asyncio.TimeoutError, ConnectionError) as e: + logger.info( + "bus-bridge: transport error (%s); reconnecting in %.1fs", + e.__class__.__name__, backoff, + ) + await self._sleep_or_close(backoff) + backoff = min(self._max_backoff, max(self._min_backoff, backoff * 2)) + except asyncio.CancelledError: + await self.close() + raise + finally: + if self._client is not None: + try: + await self._client.aclose() + except Exception: # pragma: no cover + pass + self._client = None + + async def _sleep_or_close(self, seconds: float) -> None: + try: + await asyncio.wait_for(self._closed.wait(), timeout=seconds) + except asyncio.TimeoutError: + return + + async def _iter_sse(self, resp: httpx.Response) -> AsyncIterator[BusEnvelope]: + """Parse the SSE byte stream into BusEnvelope records. + + Minimal SSE parser: we accumulate field lines into the current event, + dispatch on blank-line boundaries, silently skip comment lines + (`: heartbeat`), and honor `data:`, `event:`, `id:` fields. + """ + event_name: Optional[str] = None + data_lines: list[str] = [] + event_id: Optional[str] = None + + async for raw_line in resp.aiter_lines(): + if self._closed.is_set(): + return + # httpx strips the trailing \n but preserves empty lines. + if raw_line == "": + # Dispatch boundary. + if data_lines: + env = self._parse_event(event_name, "\n".join(data_lines), event_id) + if env is not None: + yield env + event_name = None + data_lines = [] + event_id = None + continue + if raw_line.startswith(":"): + # Comment line / heartbeat. + continue + field, _, value = raw_line.partition(":") + if value.startswith(" "): + value = value[1:] + if field == "data": + data_lines.append(value) + elif field == "event": + event_name = value + elif field == "id": + event_id = value + self._last_event_id = value + # retry / unknown fields: ignore + + def _parse_event( + self, event_name: Optional[str], data: str, event_id: Optional[str] + ) -> Optional[BusEnvelope]: + try: + envelope: Any = json.loads(data) + except json.JSONDecodeError: + logger.debug("bus-bridge: non-JSON data frame dropped: %r", data[:200]) + return None + if not isinstance(envelope, dict): + logger.debug("bus-bridge: non-object data frame dropped: %r", envelope) + return None + + inner = envelope.get("data") + payload: dict = inner if isinstance(inner, dict) else {} + kind = _extract_kind(envelope, payload) + ts = envelope.get("timestamp") or payload.get("ts") or payload.get("timestamp") + eid = event_id or envelope.get("id") + if eid and not self._last_event_id: + self._last_event_id = eid + + if kind not in ("state_transition", "tool_dispatch", "assessment", "bus.event", "connected"): + # Tolerate unknowns — just log and forward. 
+ logger.debug("bus-bridge: forwarding unknown event kind=%r", kind) + + return BusEnvelope( + raw=envelope, + kind=kind, + payload=payload, + ts=ts if isinstance(ts, str) else None, + event_id=eid if isinstance(eid, str) else None, + ) + + +# --------------------------------------------------------------------------- +# Manual validation entry point +# --------------------------------------------------------------------------- + + +async def _main() -> None: + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s %(name)s: %(message)s", + ) + sub = KernelBusSubscriber() + print(f"bus-bridge: subscribing to {sub._url} (Ctrl-C to stop)") + try: + async for env in sub.stream(): + print( + json.dumps( + { + "kind": env.kind, + "ts": env.ts, + "id": env.event_id, + "payload_keys": sorted(env.payload.keys())[:12], + } + ) + ) + except (KeyboardInterrupt, asyncio.CancelledError): + pass + finally: + await sub.close() + + +if __name__ == "__main__": + try: + asyncio.run(_main()) + except KeyboardInterrupt: + pass diff --git a/bus_bridge_runner.py b/bus_bridge_runner.py new file mode 100644 index 0000000..5747ec2 --- /dev/null +++ b/bus_bridge_runner.py @@ -0,0 +1,144 @@ +"""Kernel-bus → dashboard bridge runner. + +Consumes `KernelBusSubscriber.stream()` (see `bus_bridge.py`) and fans the +ADR-083 cycle-trace events out to every connected dashboard WebSocket via +`BrowserChannel.broadcast_trace_event()` (see `channels.py`). + +Wiring: + + kernel (bus_cycle_trace) + └─► SSE /v1/events/stream?bus_id=bus_cycle_trace + └─► KernelBusSubscriber.stream() [C1] + └─► run_bridge() filter + forward + └─► BrowserChannel.broadcast_trace_event() [C2] + +The subscriber does its own reconnect with exponential backoff, so a kernel +that is temporarily unreachable does not affect server startup. Disable the +bridge entirely at process boot by setting env `MOD3_BUS_BRIDGE_DISABLED=1`. +""" + +from __future__ import annotations + +import asyncio +import logging +import os +from typing import Optional + +from bus_bridge import KERNEL_BUS_STREAM_URL, BusEnvelope, KernelBusSubscriber +from channels import BrowserChannel + +logger = logging.getLogger("mod3.bus_bridge") + +# ADR-083 kinds the dashboard trace panel cares about. Kept as a module-level +# constant so tests and the lifespan wiring share one definition. +ADR083_KINDS: frozenset[str] = frozenset({"state_transition", "tool_dispatch", "assessment"}) + +# Kernel-side bus name (see apps/cogos/trace_emit.go:const traceBusID). +TRACE_BUS_ID = "bus_cycle_trace" + +# Env flag consulted at startup. +DISABLE_ENV = "MOD3_BUS_BRIDGE_DISABLED" + + +def is_disabled() -> bool: + """True when MOD3_BUS_BRIDGE_DISABLED is set to a truthy value.""" + v = os.environ.get(DISABLE_ENV, "").strip().lower() + return v in ("1", "true", "yes", "on") + + +async def run_bridge( + subscriber: KernelBusSubscriber, + *, + filter_kinds: Optional[set[str]] = None, +) -> None: + """Consume `subscriber` and broadcast cycle-trace events to dashboard clients. + + `filter_kinds`: + - `None`: forward everything (dev mode — useful when inspecting the raw + stream through a dashboard). + - set of kind strings: only forward envelopes whose `BusEnvelope.kind` + is in the set. Unknown kinds are tolerated per ADR-083 — they simply + won't pass this filter. + + `BrowserChannel.broadcast_trace_event()` is thread-safe and non-blocking: + it dispatches each WS send via `run_coroutine_threadsafe`. We call it + directly (no await). 
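+
+    Wiring sketch (standalone form; `start_bridge()` below does the
+    equivalent and parks the task on app.state)::
+
+        sub = KernelBusSubscriber(bus_filter=TRACE_BUS_ID)
+        asyncio.create_task(run_bridge(sub, filter_kinds=set(ADR083_KINDS)))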
+ """ + first_event_logged = False + forwarded = 0 + async for env in subscriber.stream(): + if filter_kinds is not None and env.kind not in filter_kinds: + continue + # The "connected" bootstrap frame has an empty payload; skip silently. + if env.kind == "connected": + continue + if not first_event_logged: + logger.info( + "bridge: first event forwarded kind=%s event_id=%s", + env.kind, env.event_id, + ) + first_event_logged = True + try: + BrowserChannel.broadcast_trace_event(env.payload) + forwarded += 1 + logger.debug( + "bridge: forwarded kind=%s event_id=%s (total=%d)", + env.kind, env.event_id, forwarded, + ) + except Exception as exc: # noqa: BLE001 — broadcaster is best-effort + logger.debug("bridge: broadcast failed: %s", exc) + + +async def start_bridge( + app_state: object, + *, + url: str = KERNEL_BUS_STREAM_URL, + bus_filter: str = TRACE_BUS_ID, + filter_kinds: Optional[set[str]] = frozenset(ADR083_KINDS), +) -> None: + """Construct the subscriber + bridge task and store them on `app_state`. + + Startup is non-blocking: we don't await the task or probe the kernel. + The subscriber's own backoff loop handles reconnects. Logs a disabled + notice and returns cleanly when `MOD3_BUS_BRIDGE_DISABLED` is set. + """ + if is_disabled(): + logger.info("bridge: disabled via %s=1", DISABLE_ENV) + setattr(app_state, "bus_bridge_subscriber", None) + setattr(app_state, "bus_bridge_task", None) + return + + subscriber = KernelBusSubscriber(url=url, bus_filter=bus_filter, consumer_id="mod3-dashboard") + task = asyncio.create_task( + run_bridge(subscriber, filter_kinds=set(filter_kinds) if filter_kinds else None), + name="mod3-bus-bridge", + ) + setattr(app_state, "bus_bridge_subscriber", subscriber) + setattr(app_state, "bus_bridge_task", task) + logger.info( + "bridge: started, target=%s bus_id=%s filter=%s", + url, bus_filter, sorted(filter_kinds) if filter_kinds else "*", + ) + + +async def stop_bridge(app_state: object, *, timeout_s: float = 2.0) -> None: + """Gracefully stop the bridge: close subscriber, await task, cancel on timeout.""" + subscriber: Optional[KernelBusSubscriber] = getattr(app_state, "bus_bridge_subscriber", None) + task: Optional[asyncio.Task] = getattr(app_state, "bus_bridge_task", None) + if subscriber is None and task is None: + return + if subscriber is not None: + try: + await subscriber.close() + except Exception: # pragma: no cover - best-effort + pass + if task is not None: + try: + await asyncio.wait_for(task, timeout=timeout_s) + except (asyncio.TimeoutError, asyncio.CancelledError): + task.cancel() + try: + await task + except (asyncio.CancelledError, Exception): # pragma: no cover + pass + logger.info("bridge: stopped") diff --git a/channels.py b/channels.py index c026e64..702dbc8 100644 --- a/channels.py +++ b/channels.py @@ -8,6 +8,12 @@ T1 (Whisper Base, ~31ms): per-chunk during speech T2 (Whisper Large, ~470ms): on natural pause T3 (Whisper Large, ~470ms): on end-of-utterance (final) + +Server→client WebSocket message types: + audio, response_text, response_complete, interrupted, + partial_transcript, transcript, + trace_event — kernel cycle-trace events (ADR-083), fanned out via + BrowserChannel.broadcast_trace_event(). """ from __future__ import annotations @@ -33,6 +39,12 @@ class BrowserChannel: """WebSocket-backed channel for the browser dashboard.""" + # Registry of currently-active dashboard channels. Used by + # broadcast_trace_event() to fan kernel cycle-trace events out to every + # connected dashboard client (see ADR-083). 
Populated in __init__,
+    # pruned in _cleanup.
+    _active_channels: "set[BrowserChannel]" = set()
+
     def __init__(
         self,
         ws: WebSocket,
@@ -71,6 +83,7 @@ def __init__(
             modalities=[ModalityType.VOICE, ModalityType.TEXT],
             deliver=self._deliver_sync,
         )
+        BrowserChannel._active_channels.add(self)
         logger.info("BrowserChannel registered: %s", self.channel_id)
 
     # ------------------------------------------------------------------
@@ -424,6 +437,62 @@ async def send_response_complete(self, metrics: dict | None = None) -> None:
             except Exception:
                 self._active = False
 
+    # ------------------------------------------------------------------
+    # Trace event broadcast (kernel cycle-trace → dashboards)
+    # ------------------------------------------------------------------
+
+    @classmethod
+    def broadcast_trace_event(cls, event: dict) -> None:
+        """Fan a kernel cycle-trace event out to every connected dashboard.
+
+        Per ADR-083, `event` is a pre-parsed CycleEvent dict
+        (id, ts, source, cycle_id, kind, payload). Wrapped in the
+        `{"type": "trace_event", "event": ...}` envelope and sent to each
+        active BrowserChannel's WebSocket. Clients whose send fails are
+        skipped silently (they will be pruned by their own disconnect path).
+        """
+        frame = {"type": "trace_event", "event": event}
+        for ch in list(cls._active_channels):
+            if not ch._active:
+                continue
+            try:
+                asyncio.run_coroutine_threadsafe(ch.ws.send_json(frame), ch._loop)
+            except Exception as exc:  # noqa: BLE001 — disconnected clients are expected
+                logger.debug("trace_event send failed for %s: %s", ch.channel_id, exc)
+
+    @classmethod
+    def broadcast_response_text(cls, text: str, session_id: str | None = None) -> None:
+        """Push an agent-reply text frame to dashboard WebSocket clients.
+
+        Used by the MOD3_USE_COGOS_AGENT response bridge (see
+        `cogos_agent_bridge.run_response_bridge`). The frame matches the
+        existing text-response shape emitted by `_deliver_async` and
+        `send_response_text`: `{"type": "response_text", "text": <text>}`.
+
+        If `session_id` is None (default) the frame is broadcast to every
+        active dashboard channel. When provided, only channels whose
+        `channel_id` matches the `mod3:<channel_id>` convention from
+        `cogos_agent_bridge.post_user_message` receive the frame — this is
+        how future multi-user routing will land, but for v1 a None
+        broadcast is the common case (only one dashboard attached).
+
+        Thread-safe: dispatches each WS send via `run_coroutine_threadsafe`
+        on the channel's own loop, matching `broadcast_trace_event`.
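+
+        Routing sketch (the channel id is invented for illustration)::
+
+            # Targeted: only the dashboard whose channel_id == "abc123"
+            BrowserChannel.broadcast_response_text("On it.", session_id="mod3:abc123")
+            # Broadcast: session_id=None reaches every active dashboard
+            BrowserChannel.broadcast_response_text("On it.")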
+ """ + frame = {"type": "response_text", "text": text} + expected_channel = None + if session_id and session_id.startswith("mod3:"): + expected_channel = session_id[len("mod3:"):] + for ch in list(cls._active_channels): + if not ch._active: + continue + if expected_channel and ch.channel_id != expected_channel: + continue + try: + asyncio.run_coroutine_threadsafe(ch.ws.send_json(frame), ch._loop) + except Exception as exc: # noqa: BLE001 — disconnected clients are expected + logger.debug("response_text send failed for %s: %s", ch.channel_id, exc) + # ------------------------------------------------------------------ # Cleanup # ------------------------------------------------------------------ @@ -431,6 +500,7 @@ async def send_response_complete(self, metrics: dict | None = None) -> None: def _cleanup(self) -> None: """Deactivate channel and cancel pending TTS jobs on disconnect.""" self._active = False + BrowserChannel._active_channels.discard(self) ch = self.bus._channels.get(self.channel_id) if ch: ch.active = False diff --git a/cogos_agent_bridge.py b/cogos_agent_bridge.py new file mode 100644 index 0000000..0137b8a --- /dev/null +++ b/cogos_agent_bridge.py @@ -0,0 +1,240 @@ +"""CogOS kernel agent bridge (MOD3_USE_COGOS_AGENT=1). + +When the env flag is set, Mod³'s agent loop forwards user turns to the +cogos kernel's metabolic cycle instead of the local inference provider: + + browser → WS turn → post_user_message() ─POST /v1/bus/send─► kernel + │ + ▼ + bus_dashboard_chat + │ + ▼ + kernel cycle → `respond` tool + │ + ▼ + bus_dashboard_response + │ + SSE /v1/events/stream + │ + ▼ + KernelBusSubscriber.stream() + │ + ▼ + run_response_bridge() + │ + ▼ + BrowserChannel.broadcast_response_text() + +The subscriber does its own reconnect with exponential backoff (see +`bus_bridge.py`). Disable the whole fork by leaving `MOD3_USE_COGOS_AGENT` +unset (default). + +Note: the kernel's `POST /v1/bus/send` takes a flat `{bus_id, from, to, +message, type}` body — the inner JSON event is serialised into `message` +(matches the pattern used by other cogos producers). +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import os +from datetime import datetime, timezone +from typing import Optional + +import httpx + +from bus_bridge import KERNEL_BUS_STREAM_URL, KernelBusSubscriber +from channels import BrowserChannel + +logger = logging.getLogger("mod3.cogos_agent") + +# Bus names — contract with the kernel side (see ADR / c-agent subagent). +CHAT_BUS_ID = "bus_dashboard_chat" +RESPONSE_BUS_ID = "bus_dashboard_response" + +# Kernel endpoints. +_DEFAULT_KERNEL_BASE = os.environ.get("COGOS_ENDPOINT", "http://localhost:6931") +BUS_SEND_URL = f"{_DEFAULT_KERNEL_BASE}/v1/bus/send" + +# Env gate. +ENABLE_ENV = "MOD3_USE_COGOS_AGENT" + +_POST_TIMEOUT_S = 5.0 + + +def is_enabled() -> bool: + """True when MOD3_USE_COGOS_AGENT is set to a truthy value.""" + v = os.environ.get(ENABLE_ENV, "").strip().lower() + return v in ("1", "true", "yes", "on") + + +def _now_rfc3339() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +async def post_user_message(text: str, session_id: str) -> bool: + """POST a user turn to the kernel's `bus_dashboard_chat` bus. + + Returns True if the send succeeded (kernel replied 2xx), False otherwise. + Logs at warning-level on failure but never raises — callers use graceful + degradation (e.g. show an error response frame to the dashboard). 
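+
+    Call-site sketch (session-id convention from agent_loop._process; the
+    channel id is illustrative)::
+
+        ok = await post_user_message("hello", session_id="mod3:abc123")
+        if not ok:
+            ...  # degrade gracefully; see the caller's error frame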
+
+    The kernel's handleBusSend (see apps/cogos/bus_api.go) accepts
+    `{bus_id, from, to, message, type}` — we JSON-encode the full event dict
+    into `message` so the kernel's cycle receives the structured payload.
+    """
+    event = {
+        "type": "user_message",
+        "text": text,
+        "session_id": session_id,
+        "ts": _now_rfc3339(),
+    }
+    body = {
+        "bus_id": CHAT_BUS_ID,
+        "from": "mod3-dashboard",
+        "type": "user_message",
+        "message": json.dumps(event, separators=(",", ":")),
+    }
+    try:
+        async with httpx.AsyncClient(timeout=_POST_TIMEOUT_S) as client:
+            resp = await client.post(BUS_SEND_URL, json=body)
+    except httpx.HTTPError as exc:
+        logger.warning("cogos-agent: post to %s failed: %s", BUS_SEND_URL, exc)
+        return False
+    if resp.status_code // 100 != 2:
+        logger.warning(
+            "cogos-agent: post non-2xx: %d body=%r",
+            resp.status_code, resp.text[:200],
+        )
+        return False
+    logger.info(
+        "cogos-agent: forwarded user turn to kernel bus (session=%s)",
+        session_id,
+    )
+    return True
+
+
+def _extract_response_text(payload: dict) -> Optional[str]:
+    """Dig the assistant reply out of the bus event payload.
+
+    The kernel's `handleBusSend` wraps the sent `message` string inside a
+    `{"content": "<message>"}` map. On SSE delivery, the envelope's `data`
+    field is that map. We look first for structured keys (`text`, direct
+    agent_response shape), then fall through to parsing `content` as JSON.
+    """
+    if not isinstance(payload, dict):
+        return None
+    # Direct shape (if an upstream producer wrote the event dict at the top level).
+    for key in ("text", "reply", "response"):
+        val = payload.get(key)
+        if isinstance(val, str) and val:
+            return val
+    # Standard bus envelope: payload = {"content": "<json string>"}
+    content = payload.get("content")
+    if isinstance(content, str) and content:
+        try:
+            inner = json.loads(content)
+        except (TypeError, ValueError):
+            # Free-form string — treat the whole thing as the reply.
+            return content
+        if isinstance(inner, dict):
+            for key in ("text", "reply", "response"):
+                val = inner.get(key)
+                if isinstance(val, str) and val:
+                    return val
+        elif isinstance(inner, str) and inner:
+            return inner
+    return None
+
+
+async def run_response_bridge(subscriber: KernelBusSubscriber) -> None:
+    """Consume `subscriber` and broadcast agent replies to dashboard clients.
+
+    `BrowserChannel.broadcast_response_text()` is thread-safe via
+    `run_coroutine_threadsafe`, matching the existing trace-event pattern.
+    Malformed events (no recoverable text) are logged at debug and skipped.
+    """
+    first_event_logged = False
+    forwarded = 0
+    async for env in subscriber.stream():
+        if env.kind == "connected":
+            continue
+        text = _extract_response_text(env.payload)
+        if not text:
+            logger.debug(
+                "cogos-agent: skip event with no text kind=%s id=%s",
+                env.kind, env.event_id,
+            )
+            continue
+        if not first_event_logged:
+            logger.info(
+                "cogos-agent: first response forwarded kind=%s event_id=%s",
+                env.kind, env.event_id,
+            )
+            first_event_logged = True
+        try:
+            BrowserChannel.broadcast_response_text(text)
+            forwarded += 1
+            logger.debug(
+                "cogos-agent: forwarded response event_id=%s (total=%d)",
+                env.event_id, forwarded,
+            )
+        except Exception as exc:  # noqa: BLE001 — best-effort fan-out
+            logger.debug("cogos-agent: broadcast failed: %s", exc)
+
+
+async def start_response_bridge(
+    app_state: object,
+    *,
+    url: str = KERNEL_BUS_STREAM_URL,
+) -> None:
+    """Construct the response subscriber + bridge task and store on `app_state`.
+
+    No-op (logs once) when `MOD3_USE_COGOS_AGENT` is unset.
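+
+    Intended wiring, as a sketch (the FastAPI `app.state` spelling and the
+    startup/shutdown placement are assumptions):
+
+        await start_response_bridge(app.state)  # on startup
+        ...
+        await stop_response_bridge(app.state)   # on shutdown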
+    """
+    if not is_enabled():
+        logger.debug("cogos-agent: response bridge disabled (%s unset)", ENABLE_ENV)
+        setattr(app_state, "cogos_agent_subscriber", None)
+        setattr(app_state, "cogos_agent_task", None)
+        return
+
+    subscriber = KernelBusSubscriber(
+        url=url,
+        bus_filter=RESPONSE_BUS_ID,
+        consumer_id="mod3-dashboard-agent",
+    )
+    task = asyncio.create_task(
+        run_response_bridge(subscriber),
+        name="mod3-cogos-agent-bridge",
+    )
+    setattr(app_state, "cogos_agent_subscriber", subscriber)
+    setattr(app_state, "cogos_agent_task", task)
+    logger.info(
+        "cogos-agent: response bridge started, target=%s bus_id=%s",
+        url, RESPONSE_BUS_ID,
+    )
+
+
+async def stop_response_bridge(app_state: object, *, timeout_s: float = 2.0) -> None:
+    """Gracefully stop the response bridge: close subscriber, await task, cancel on timeout."""
+    subscriber: Optional[KernelBusSubscriber] = getattr(app_state, "cogos_agent_subscriber", None)
+    task: Optional[asyncio.Task] = getattr(app_state, "cogos_agent_task", None)
+    if subscriber is None and task is None:
+        return
+    if subscriber is not None:
+        try:
+            await subscriber.close()
+        except Exception:  # pragma: no cover - best-effort
+            pass
+    if task is not None:
+        try:
+            await asyncio.wait_for(task, timeout=timeout_s)
+        except (asyncio.TimeoutError, asyncio.CancelledError):
+            task.cancel()
+            try:
+                await task
+            except (asyncio.CancelledError, Exception):  # pragma: no cover
+                pass
+    logger.info("cogos-agent: response bridge stopped")
diff --git a/dashboard/index.html b/dashboard/index.html
index 914a2b0..ae166c5 100644
--- a/dashboard/index.html
+++ b/dashboard/index.html
@@ -171,6 +171,49 @@
       50% { opacity: 0.3; }
     }

+    /* Cycle Trace panel (bottom drawer) */
+    #trace-panel {
+      position: fixed; left: 0; right: 0; bottom: 0;
+      background: var(--surface); border-top: 1px solid var(--border);
+      font-family: ui-monospace, SFMono-Regular, Menlo, monospace;
+      z-index: 20; max-height: 40vh; display: flex; flex-direction: column;
+      transition: max-height 0.2s ease;
+    }
+    #trace-panel.collapsed { max-height: 32px; }
+    #trace-panel .trace-header {
+      display: flex; align-items: center; gap: 8px;
+      padding: 6px 16px; border-bottom: 1px solid var(--border);
+      background: var(--bg); cursor: pointer; user-select: none;
+      font-size: 0.75rem; color: var(--muted); text-transform: uppercase;
+      letter-spacing: 0.5px; flex-shrink: 0; height: 32px;
+    }
+    #trace-panel .trace-header .trace-title { font-weight: 600; }
+    #trace-panel .trace-header .trace-toggle {
+      margin-left: auto; font-size: 0.7rem; color: var(--muted);
+    }
+    #trace-panel.collapsed #trace-entries { display: none; }
+    #trace-entries {
+      overflow-y: auto; padding: 4px 0; flex: 1; min-height: 0;
+      font-size: 0.75rem; line-height: 1.4;
+    }
+    .trace-entry {
+      display: flex; gap: 8px; align-items: baseline;
+      padding: 2px 16px; border-bottom: 1px solid rgba(48,54,61,0.3);
+      white-space: nowrap; overflow: hidden;
+    }
+    .trace-entry:hover { background: rgba(88,166,255,0.05); }
+    .trace-time { color: var(--muted); flex-shrink: 0; font-variant-numeric: tabular-nums; }
+    .trace-source { color: var(--muted); flex-shrink: 0; font-size: 0.7rem; }
+    .trace-kind {
+      flex-shrink: 0; font-size: 0.65rem; padding: 1px 6px;
+      border: 1px solid var(--border); border-radius: 3px;
+      text-transform: uppercase; letter-spacing: 0.5px;
+    }
+    .trace-summary { color: var(--text); overflow: hidden; text-overflow: ellipsis; }
+
+    /* Leave room at the bottom so the drawer doesn't cover the input */
+    body { padding-bottom: 32px; }
+
     /* Responsive */
    @media (max-width: 700px) {
      .main { padding: 12px 16px; }
@@ -300,6 +343,15 @@

     <h1>Mod³</h1>

     <p class="hint">For voice, use headphones. Speak naturally — the system detects when you start and stop. Speak during playback to interrupt.</p>
+    <!-- Cycle Trace drawer (see #trace-panel styles above) -->
+    <div id="trace-panel" class="collapsed">
+      <div class="trace-header">
+        <span class="trace-title">Cycle Trace</span>